def create_COG(mode, mot):
    """Build one synthetic set of promoter sequences for a COG.

    A background set of 100 sequences of length 300 is requested from
    MGlib.random_DNA (base frequencies A/T = 0.3, C/G = 0.2). When mode is
    "positive", the leading characters of every sequence are overwritten
    with a run of pseudo-sites picked at random from the pool returned by
    MGlib.sample_motif; the run is capped at 300 characters. For any other
    mode the background set is returned untouched.

    mode -- "positive" enables site insertion; any other value disables it
    mot  -- motif object passed straight to MGlib.sample_motif

    Returns the sequence collection produced by MGlib.random_DNA, edited in
    place when mode is "positive".
    """
    n_seqs = 100
    seq_len = 300
    base_freqs = {'A': 0.3, 'C': 0.2, 'G': 0.2, 'T': 0.3}

    # per-sequence number of sites to insert
    # (presumably scipy.stats.geom; loc=-1 shifts the draws down by one,
    #  so a sequence may receive zero sites -- confirm against the import)
    # sym2=0.75, sym3=.7. Originally 0.5
    site_counts = geom.rvs(0.75, size=n_seqs, loc=-1)

    # random background sequences
    background = MGlib.random_DNA(seq_len, base_freqs, n_seqs)

    # pool of sampled sites; drawn even when no insertion takes place
    site_pool = MGlib.sample_motif(mot, n_seqs)

    # nothing to insert unless the positive mode was requested
    if mode != "positive":
        return background

    for idx in range(len(background)):
        # concatenate as many randomly chosen sites as the draw dictates
        prefix = ""
        for _ in range(site_counts[idx]):
            prefix = prefix + random.choice(site_pool)
        # never let the inserted run exceed the sequence length
        prefix = prefix[:seq_len]
        # overwrite the head of the sequence, keeping its tail
        background[idx] = prefix + background[idx][len(prefix):]

    return background
def create_COG(mode, mot):
    """Return a synthetic promoter set, optionally seeded with motif sites.

    The function always asks MGlib.random_DNA for 100 background sequences
    of length 300 and MGlib.sample_motif for 100 sampled sites. Only when
    mode equals "positive" are the sequences modified: each one has its
    first characters replaced by a concatenation of randomly chosen sampled
    sites, truncated to 300 characters if it grows longer than that.

    mode -- string selector; "positive" is the only value that edits the set
    mot  -- motif forwarded to MGlib.sample_motif

    Returns the collection obtained from MGlib.random_DNA (modified in place
    for the positive mode).
    """
    set_size, seq_length = 100, 300

    # how many sites each sequence receives
    # sym2=0.75, sym3=.7. Originally 0.5
    # NOTE(review): geom is presumably scipy.stats.geom -- confirm import
    counts = geom.rvs(0.75, size=set_size, loc=-1)

    frequencies = dict(A=0.3, C=0.2, G=0.2, T=0.3)
    seqs = MGlib.random_DNA(seq_length, frequencies, set_size)

    # sample large number of sites from motif
    candidates = MGlib.sample_motif(mot, set_size)

    if mode == "positive":
        for pos, current in enumerate(seqs):
            # build the block of sites for this sequence
            sites = ""
            for _ in range(0, counts[pos]):
                sites += random.choice(candidates)
            # cap the block at the sequence length
            if len(sites) > seq_length:
                sites = sites[:seq_length]
            # the block replaces an equally long stretch at the start
            tail = current[len(sites):]
            seqs[pos] = sites + tail

    return seqs
# Example #3
#all sites are inserted at the first position of the sequence
#get individual sequence posteriors and write them together with the
#score of the site inserted
#first sequence set includes randomly distributed sites
#second sequence set includes only one site
#experiment is repeated 100 times

#write csv header

#loop experiments
#Repeat the experiment 100 times; every repetition builds a fresh background
#set, fresh score distributions and fresh pseudo-site samples.
#NOTE(review): pssm, rpssm and mot are used below but are not defined in the
#visible code -- they must exist before this loop runs.
for cnt in range(0, 100):
    print "Experiment: ", cnt
    #create background sequence set: 100 seqs 283 bp long
    set1 = MGlib.random_DNA(283, {'A': 0.3, 'C': 0.2, 'G': 0.2, 'T': 0.3}, 100)
    #slice copy of set1: starting point for the second sequence set
    set2 = set1[:]
    #compute softmax scores for background sequences in dataset
    gscr = MGlib.esfmax_score_seqs(set1, pssm, rpssm)
    #compute softmax scores for motif sequences
    mscr = MGlib.esfmax_score_seqs(mot.instances, pssm, rpssm)

    #get normal distributions for background and motif, parameterised by the
    #mean and standard deviation of the respective score lists
    #NOTE(review): norm/mean/std are presumably scipy.stats.norm and the numpy
    #functions -- confirm against the file's imports
    n_g = norm(mean(gscr), std(gscr))
    n_m = norm(mean(mscr), std(mscr))

    #create motif instances: 100 sampled sites for pmot1, a single one for pmot2
    pmot1 = MGlib.sample_motif(mot, 100)
    pmot2 = MGlib.sample_motif(mot, 1)

    #insert sites in sequences
    #NOTE(review): the excerpt stops here; the insertion code is not visible
#Experiment 0:
#100 sequences, all with sampled pseudo-sites inserted
#all sites are inserted at the first position of the sequence
#get individual sequence posteriors and write them together with the
#score of the site inserted
#repeat 100 times

#write csv header

#loop experiments
#Repeat the experiment 100 times; every repetition builds a fresh background
#set, fresh score distributions and a fresh sample of pseudo-sites.
#NOTE(review): pssm, rpssm and mot are used below but are not defined in the
#visible code -- they must exist before this loop runs.
for cnt in range(0,100):
    print "Experiment: ", cnt
    #create background sequence set: 100 seqs 283 bp long
    set1 = MGlib.random_DNA(283,{'A': 0.3,'C': 0.2,'G': 0.2,'T': 0.3},100)
    #compute softmax scores for background sequences in dataset
    gscr = MGlib.esfmax_score_seqs(set1,pssm,rpssm)
    #compute softmax scores for motif sequences
    mscr = MGlib.esfmax_score_seqs(mot.instances,pssm,rpssm)
    #get normal distributions for background and motif, parameterised by the
    #mean and standard deviation of the respective score lists
    #NOTE(review): norm/mean/std are presumably scipy.stats.norm and the numpy
    #functions -- confirm against the file's imports
    n_g=norm(mean(gscr), std(gscr))
    n_m=norm(mean(mscr), std(mscr))
    #create motif instances: 100 sites sampled from the motif
    pmot1 = MGlib.sample_motif(mot,100)
    #insert sites in sequences
    #NOTE(review): e is never initialised in the visible code and the body of
    #this loop is missing from the excerpt
    while (e<len(set1)):
def main():
    """Gets a motif from file and reads it. It then generates synthetic data
       to represent a set of promoters (100) mapping to a particular eggNOG/COG,
       inserts into these sequences pseudosites (generated from the 
       distribution implicit in the PSSM). It then calls the PSSM evaluation
       function to score the sites using the softmax function and then the
       different functions to compute the likelihoods and the posterior
       MG_synth -M <Motif file> -O <out file prefix> -E <experiment> \
                -A <alpha mix ratio> -P <Regulation prior> -T <theta> \
                -V <verbose mode>
       Note: motifs are assumed to be in FASTA or 1-per-line text format
    #set default parameters
    motif_filename="CsoR.txt"   #input file
    out_filename="_o"           #o prefix for output
    verbose=0                   #verbose mode
    alpha=1.0/300.0             #mixing ratio for regulated model
    rproms=3.0                  #number of regulated promoters [prior]
    tproms=1811.0               #total number of promoters in genome [prior]
    experiment=2                #the experiment number
    #get cmd parameters
        opts, args=getopt.getopt(sys.argv[1:],"I:O:V")
    except getopt.GetoptError:
        print 'MG_synth -M <Motif file> -O <out file prefix> -E <experiment> \
                -A <alpha mix ratio> -P <Regulation prior> -T <theta> \
                -V <verbose mode>'
    #assign parameters
    for opt, arg in opts:
        if opt == '-M':
        elif opt == '-O':
        elif opt == '-E':
        elif opt == '-A':
        elif opt == '-P':
        elif opt == '-T':
        elif opt == '-V':
        elif opt == '-askme':
            motif_filename = raw_input('Enter the motif file name\n')
            out_filename = raw_input('Enter the output file name prefix\n')
            experiment = raw_input('Enter the experiment number\n')
            alpha = raw_input('Enter the alpha mixing ratio\n')
            PR = raw_input('Enter the prior probability for regulation\n')
            theta = raw_input('Enter the theta sensitivity threshold\n')
            verbose = raw_input('Enter verbose mode (1/0)\n')
    out_filename=motif_filename.split(".")[0] + out_filename + str(experiment)
    if verbose: print "Using: ", motif_filename, " as input"
    if verbose: print "Writing to (suffix): ", "[void]" if out_filename==""\
    else out_filename
    #open files for output
        out_file = open(out_filename + ".csv","w")
    except (IOError, OSError) as file_open_exception:
        print "*** Something went wrong while opening the output file"
        print "*** Error: ", file_open_exception.errno, " - ",\
    #open file for error recording
        err_file = open(out_filename+".err","w")
    except (IOError, OSError) as file_open_exception:
        print "Something went wrong while opening the error file"

    #compute priors
    PR=rproms/tproms               #prior probability of regulation
    PB=1.0-PR                      #prior probability of non-regulation
    PPR=PB/PR                      #prior probability ratio
    #read motif and assign 0.25 pseudocounts to PSWM
    #also assign background uniform distribution for the PSSM (default)
    mot = MGlib.read_motif(motif_filename)
    #save the pssm for the motif and the reverse complement
    #(so that they are not recalculated everytime we invoke motif.pssm)
    pssm = mot.pssm
    rpssm = pssm.reverse_complement()

    if (experiment==0):
        #Experiment 0:
        #10000 sequences, with 100% on average having random pseudo-sites inserted
        #all sites are inserted at the first position of the sequence
        #get individual sequence posteriors and write them together with the
        #score of the site inserted
        #create background sequence set: 10000 seqs 283 bp long
        set1 = MGlib.random_DNA(283,{'A': 0.3,'C': 0.2,'G': 0.2,'T': 0.3},10000)
        #compute softmax scores for background sequences in dataset
        gscr = MGlib.sfmax_score_seqs(set1,pssm,rpssm)
        #get normal distributions for background and motif
        n_g=norm(mean(gscr), std(gscr))
        n_m=norm(pssm.mean(), pssm.std())
        #create motif instances
        pmot1 = MGlib.sample_motif(mot,1000)

        #insert sites in sequences
        while (e<len(set1)):
            r = random.random()
            #determine if site is to be inserted and insert random site
            if (r<1): 
                set1[e] = random.choice(pmot1) + set1[e]
            #otherwise insert random sequence from own sequence start
            else :
                set1[e] = set1[e][:17] + set1[e]
            e = e+1

        #compute softmax scores for sequences in dataset
        #get log-likelihoods for sequences in dataset        
        #get per-sequence posterior for the sequences in dataset

        #write results to file
        while (e<len(set1)):

        return 0
    elif (experiment==1):
        #Experiment 1:
        #2x100 sequences, with the first 10 having random pseudo-sites inserted
        #all sites are inserted at the first position of the sequence
        #get individual sequence posteriors and write them together with the
        #score of the site inserted
        #first sequence set includes randomly distributed sites
        #second sequence set includes only one site
        #experiment is repeated 100 times
        for cnt in range(0,100):
            #create background sequence set: 100 seqs 283 bp long
            set1 = MGlib.random_DNA(283,{'A': 0.3,'C': 0.2,'G': 0.2,'T': 0.3},100)
            set2 = set1[:]
            #compute softmax scores for background sequences in dataset
            gscr = MGlib.sfmax_score_seqs(set1,pssm,rpssm)
            #get normal distributions for background and motif
            n_g=norm(mean(gscr), std(gscr))
            n_m=norm(pssm.mean(), pssm.std())
            #create motif instances
            pmot1 = MGlib.sample_motif(mot,100)
            pmot2 = MGlib.sample_motif(mot,1)
            print cnt
            #insert sites in sequences
            while (e<len(set1)):
                r = random.random()
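                #insert random site in the leading sequences
                #NOTE(review): the header says "first 10", but e<11 selects
                #indices 0-10 (11 sequences) if e starts at 0 -- confirm count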
                if (e<11): 
                    set1[e] = random.choice(pmot1) + set1[e]
                    set2[e] = random.choice(pmot2) + set2[e]
                #otherwise insert random sequence from own sequence start
                else :
                    set1[e] = set1[e][:17] + set1[e]
                    set2[e] = set2[e][:17] + set2[e]
                e = e+1
            #compute softmax scores for sequences in dataset
            if verbose: print "varied"
            if verbose: 
                for s in scrs1: print s[0]
            if verbose: print "loners"
            if verbose: 
                for s in scrs2: print s[0]
            #get log-likelihoods for sequences in dataset        
            #get overall posterior for the sequences in dataset
            #write results to file
        return 0