Esempio n. 1
0
 def check(self, seqs_as_strs, alpha):
     """Create a motif from Seq objects and from plain strings, drawing a logo for each.

     Both logos are written to ``os.devnull``; the method returns nothing.
     """
     variants = (
         [Seq(text) for text in seqs_as_strs],  # Seq objects
         seqs_as_strs,  # plain strings
     )
     for instances in variants:
         motif = motifs.create(instances, alpha)
         motif.weblogo(os.devnull)
Esempio n. 2
0
def sequence_content_analysis(metrics, whitelist, barcode="r1", head=5000000):
    """Plot per-position letter frequencies of whitelisted vs. other barcodes.

    The first ``head`` rows of ``metrics`` are used; barcodes are read from
    the index level named ``barcode``. Three heatmaps (whitelisted barcodes,
    remaining barcodes, log2 ratio of the two) are saved as an SVG whose path
    starts with the module-level ``args.output_prefix``.
    """
    from Bio import motifs
    from Bio.Seq import Seq

    barcodes = metrics.head(head).index.get_level_values(barcode).to_series()
    is_whitelisted = barcodes.isin(whitelist.tolist())

    # Sequences containing "N" are left out of both motifs.
    motif_right = motifs.create(
        [Seq(code) for code in barcodes[is_whitelisted] if "N" not in code]
    )
    motif_wrong = motifs.create(
        [Seq(code) for code in barcodes[~is_whitelisted] if "N" not in code]
    )

    correct = pd.DataFrame(motif_right.pwm)
    wrong = pd.DataFrame(motif_wrong.pwm)
    log_ratio = np.log2(wrong / correct)

    fig, axis = plt.subplots(1, 3, figsize=(3 * 3, 3))
    plain = {"square": True}
    diverging = {"square": True, "cmap": "RdBu_r", "center": 0, "robust": True}
    panels = (
        (correct, "Correct", plain),
        (wrong, "Wrong", plain),
        (log_ratio, "Difference", diverging),
    )
    for ax, (frame, title, options) in zip(axis, panels):
        sns.heatmap(frame.T, ax=ax, **options)
        ax.set_title(title)
    fig.savefig(args.output_prefix + f"barcode_{barcode}.sequence_content.svg", bbox_inches="tight", dpi=300)
Esempio n. 3
0
 def check(self, seqs_as_strs, alpha):
     """Create a motif in three ways and draw a logo to ``os.devnull`` for each."""

     def as_seqs():
         return [Seq(text, alpha) for text in seqs_as_strs]

     # Seq objects, with exactly the same alphabet passed to create().
     motif = motifs.create(as_seqs(), alpha)
     motif.weblogo(os.devnull)
     # Seq objects, without passing an alphabet to create().
     motif = motifs.create(as_seqs())
     motif.weblogo(os.devnull)
     # Plain strings, with the alphabet passed to create().
     motif = motifs.create(seqs_as_strs, alpha)
     motif.weblogo(os.devnull)
 def check(self, seqs_as_strs, alpha):
     """Create a motif in three ways and draw a logo to ``os.devnull`` for each."""

     def as_seqs():
         return [Seq(text, alpha) for text in seqs_as_strs]

     # Seq objects, with exactly the same alphabet passed to create().
     motif = motifs.create(as_seqs(), alpha)
     motif.weblogo(os.devnull)
     # Seq objects, without passing an alphabet to create().
     motif = motifs.create(as_seqs())
     motif.weblogo(os.devnull)
     # Plain strings, with the alphabet passed to create().
     motif = motifs.create(seqs_as_strs, alpha)
     motif.weblogo(os.devnull)
def createMotif(sequences, fname):
    '''
    Creates and saves a motif (logo) from a list of input sequences.
    Input: list of strings with the sequences; file path to save the logo.
    Output: an image file with the logo.
    '''
    print 'Generating motif ' + timeStamp()
    try:
        os.makedirs('figure')
    except OSError:
        if not os.path.isdir('figure'):
         raise
    from Bio.Seq import Seq
    from Bio import motifs
    from Bio.Alphabet import IUPAC
    import urllib2
    # m = motif.motif(alphabet=IUPAC.unambiguous_dna) # initialize motif
    instances = []
    for sequence in sequences:
        if len(sequence) < 40:
            print sequence
        instances.append(Seq(sequence, alphabet=IUPAC.ambiguous_dna))
    m = motifs.create(instances)
    flogo = 'figure/' + fname
    while True:
        # source: http://stackoverflow.com/a/9986206/1274242
        try:
            m.weblogo(flogo, format='SVG')
            break
        except urllib2.HTTPError, detail:
            if detail.errno == 500:
                time.sleep(5)
                continue
            else:
                raise
Esempio n. 6
0
def create_motif_from_fasta_file(fasta_filename,
                                 out_filename,
                                 generate_pssm=False):
    """Build a protein motif from a FASTA file and save its logo as PDF.

    Every sequence in ``fasta_filename`` becomes one motif instance. The logo
    is written to ``out_filename``. When ``generate_pssm`` is true, the
    motif's ``pwm`` values are also written, tab-separated with one matrix
    row per line, to ``out_filename`` minus its last 10 characters plus
    ``"_pssm.txt"``.
    """
    instances = []
    with open(fasta_filename) as in_handle:
        for title, seq in SimpleFastaParser(in_handle):
            instances.append(Seq(seq, IUPAC.protein))

    m = motifs.create(instances, IUPAC.protein)
    m.weblogo(out_filename,
              show_xaxis=False,
              show_yaxis=False,
              show_errorbars=False,
              unit_name='',
              show_fineprint=False,
              format='pdf')

    if generate_pssm:
        # NOTE(review): the [:-10] slice presumably strips a fixed-length
        # suffix from out_filename -- confirm against the callers.
        pssm_path = out_filename[:-10] + "_pssm.txt"
        # The context manager closes the file even if a write fails.
        with open(pssm_path, "w+") as pssm_file:
            for i in range(len(m.pwm)):
                for j in range(len(m.pwm[i])):
                    pssm_file.write(str(m.pwm[i][j]))
                    pssm_file.write("\t")

                pssm_file.write("\n")
Esempio n. 7
0
 def __init__(self, sites, TF, name, pseudocounts=1):
     """Store the TF and name, then build the motif from ``sites``.

     The motif receives the given ``pseudocounts`` and is named
     ``<TF accession number>(<name>)``.
     """
     self._TF, self._name = TF, name
     self._motif = motifs.create([Seq(site, unambiguous_dna) for site in sites])
     self._motif.pseudocounts = pseudocounts
     accession = self.TF.accession_number
     self._motif.name = accession + '(%s)' % self.name
Esempio n. 8
0
def create_motifs(tf_instance_id, collect_tf_motifs):
    '''
    Build a motif for one TF instance from the collected motif records.

    Parameters
    ----------
    tf_instance_id: String
        Value compared against each record's 'tf_instance' entry.
    collect_tf_motifs: mapping
        Its 'all_motifs' entry is the list of records that is searched.

    Returns
    -------
    motif: motifs object
        Motif created from the 'aligned_binding_sites' of the first matching
        record; its name is set to that record's 'tf_instance'.
    -1: Int
        Returned when no record matches.
    '''
    matches = [
        record for record in collect_tf_motifs['all_motifs']
        if record['tf_instance'] == tf_instance_id
    ]
    if not matches:
        return -1
    first = matches[0]
    motif = motifs.create(first['aligned_binding_sites'])
    motif.name = first['tf_instance']
    return motif
Esempio n. 9
0
def compute_positional_weight_matrix(seqs, length=None, pseudocounts=0.5):
    """Return the normalized count matrix of ``seqs`` cut to a common length.

    When ``length`` is falsy, the length of the shortest sequence is used.
    Sequences are treated as IUPAC protein.
    """
    if not length:
        length = min(map(len, seqs))
    protein = IUPAC.protein
    trimmed = [Seq(text[:length], alphabet=protein) for text in seqs]
    motif = motifs.create(trimmed, alphabet=protein)
    return motif.counts.normalize(pseudocounts=pseudocounts)
Esempio n. 10
0
def createlogo(infiles, outfile, title, scale):
    """
    Creates a sequence logo from aligned sequences in flat format.

    Arguments:
      - infiles   - iterable of line iterables (e.g. open files), one
                    aligned sequence per line.
      - outfile   - name of the logo file.
      - title     - title for the logo.
      - scale     - yaxis scale, usually 1.0
    return: None

    Raises ValueError when no non-blank sequence line is found.
    """
    instances = []
    for fil in infiles:
        # NOTE(review): as in the original code, each input replaces the
        # sequences of the previous one, so only the last input contributes
        # to the logo -- confirm whether pooling all inputs was intended.
        stripped = (line.strip() for line in fil)
        # Blank lines are skipped explicitly. The old code dropped the last
        # entry unconditionally, which lost a real sequence whenever the
        # input did not end with a blank line.
        instances = [
            Seq(text, alphabet=IUPAC.ambiguous_dna) for text in stripped if text
        ]
    if not instances:
        raise ValueError("no sequences found to build the logo from")
    m = motifs.create(instances, alphabet=IUPAC.ambiguous_dna)
    # create the sequence logo
    m.weblogo(outfile,
              logo_title=title,
              yaxis_scale=scale,
              stack_width="large")
Esempio n. 11
0
def createMotif(sequences, fname):
    '''
    Creates and saves a motif (logo) from a list of input sequences.
    Input: list of strings with the sequences; file path to save the logo.
    Output: an image file with the logo.
    '''
    print 'Generating motif ' + timeStamp()
    try:
        os.makedirs('figure')
    except OSError:
        if not os.path.isdir('figure'):
         raise
    from Bio.Seq import Seq
    from Bio import motifs
    from Bio.Alphabet import IUPAC
    import urllib2
    # m = motif.motif(alphabet=IUPAC.unambiguous_dna) # initialize motif
    instances = []
    for sequence in sequences:
        if len(sequence) < 40:
            print sequence
        instances.append(Seq(sequence, alphabet=IUPAC.ambiguous_dna))
    m = motifs.create(instances)
    flogo = 'figure/' + fname
    while True:
        # source: http://stackoverflow.com/a/9986206/1274242
        try:
            m.weblogo(flogo, format='SVG')
            break
        except urllib2.HTTPError, detail:
            if detail.errno == 500:
                time.sleep(5)
                continue
            else:
                raise
Esempio n. 12
0
def createPSSM():
    print "Start PSSM"

    #sequencelist = sequencelist.replace("-", ".")
    list = []

    for seq_record in SeqIO.parse("fastatmp", "fasta", IUPAC.unambiguous_dna):
        list.append(str(seq_record.seq))

    #Blast typical sequence
    result_handle = NCBIWWW.qblast("blastn", "nt", list[0])
    save_file = open("my_blast.xml", "w")
    save_file.write(result_handle.read())
    save_file.close()
    result_handle.close()

    #motifs.create(test, alphabet=Gapped(IUPAC.unambiguous_dna))
    m = motifs.create(list, alphabet=Gapped(IUPAC.unambiguous_dna))
    print "motif created"


    pwm = m.counts.normalize(pseudocounts=0.25)
    print "PWM done"
    pssm = pwm.log_odds()
    print "PSSM done"
    print pssm
    return pssm
def main(*args, **kwargs):
    fpath = os.path.join(os.getcwd(),args[-1])
    instances = list()
    for record in SeqIO.parse(str(fpath),'fasta'):
        instances.append(record.seq)
    m = motifs.create(instances)

    consensus = m.consensus
    print consensus

    profile = m.counts
    print 'A:',
    for elem in profile['A']:
        print elem,

    print '\nC:',
    for elem in profile['C']:
        print elem,

    print '\nG:',
    for elem in profile['G']:
        print elem,

    print '\nT:',
    for elem in profile['T']:
        print elem,
Esempio n. 14
0
def forcemotif(sequences):
    """Return a motif created from ``sequences``, or ``[]`` when it is empty."""
    if len(sequences) == 0:
        return []
    # Imported only on the path that needs it; the previously imported
    # Bio.Seq.Seq was never used.
    from Bio import motifs
    return motifs.create(sequences)
Esempio n. 15
0
def printAlignmentInfo(alignment, alphabet):
    """Print an alignment with two consensus sequences and two matrices.

    One consensus and matrix come from a motif built over the alignment's
    sequences, the other pair from ``AlignInfo.SummaryInfo``. Returns None.
    """
    seqlist = [record.seq for record in alignment]

    pwm = motifs.create(seqlist, alphabet).counts.normalize()
    motif_consensus = pwm.consensus

    summary = AlignInfo.SummaryInfo(alignment)

    summary_consensus = summary.dumb_consensus()
    # The position specific score matrix is computed along the motif consensus.
    summary_pssm = summary.pos_specific_score_matrix(motif_consensus,
                                                     chars_to_ignore=['N'])

    print(alignment)

    first = alignment[0]
    print('first description: %s' % first.description)
    print('first sequence: %s' % alignment[0].seq)
    print('length %i' % alignment.get_alignment_length())

    print('matrix pwm %s' % pwm)
    print('consensus (motifs) %s' % motif_consensus)

    print('matrix pssm %s' % summary_pssm)
    print('consensus (AlignInfo.SummaryInfo) %s' % summary_consensus)
Esempio n. 16
0
def createPSSM():
    print "Start PSSM"

    #sequencelist = sequencelist.replace("-", ".")
    list = []

    for seq_record in SeqIO.parse("fastatmp", "fasta", IUPAC.unambiguous_dna):
        list.append(str(seq_record.seq))

    #Blast typical sequence
    result_handle = NCBIWWW.qblast("blastn", "nt", list[0])
    save_file = open("my_blast.xml", "w")
    save_file.write(result_handle.read())
    save_file.close()
    result_handle.close()

    #motifs.create(test, alphabet=Gapped(IUPAC.unambiguous_dna))
    m = motifs.create(list, alphabet=Gapped(IUPAC.unambiguous_dna))
    print "motif created"

    pwm = m.counts.normalize(pseudocounts=0.25)
    print "PWM done"
    pssm = pwm.log_odds()
    print "PSSM done"
    print pssm
    return pssm
Esempio n. 17
0
def motif2bed(motif, fasta, reverse_strand=True, output=()):
    """Locate occurrences of ``motif`` in a FASTA file as BED-like rows.

    Each row is ``[record id, start, stop, motif, ".", strand]`` with every
    field converted to ``str``; ``start`` is the position reported by the
    motif search and ``stop`` is ``start`` plus the match length. For each
    record the forward-strand hits ("+") come first, followed by the hits of
    the reverse-complemented motif ("-") when ``reverse_strand`` is true.

    :param motif: motif sequence to search for
    :param fasta: path of the FASTA file to scan
    :param reverse_strand: also search with the reverse-complemented motif
    :param output: when truthy, path of a tab-separated file to write the
                   rows to; the function then returns None
    :returns: the list of rows when ``output`` is falsy
    """
    motif_text = str(motif)
    m = motifs.create([Seq(motif)])

    # The reverse complement does not depend on the record, so it is built
    # once instead of once per record.
    searches = [("+", m.instances)]
    if reverse_strand:
        searches.append(("-", m.reverse_complement().instances))

    motif_bed = []
    # Passing the path lets SeqIO open and close the file itself; the old
    # code opened a handle that was never closed.
    for record in SeqIO.parse(fasta, "fasta"):
        chrom = str(record.id)
        sequence = str(record.seq)
        for strand, instances in searches:
            for pos, seq in instances.search(sequence):
                # append() instead of rebuilding the whole list for every hit
                motif_bed.append([
                    chrom,
                    str(pos),
                    str(pos + len(seq)),
                    motif_text, ".", strand
                ])
    if output:
        with open(output, 'w') as file:
            file.writelines('\t'.join(i) + '\n' for i in motif_bed)
    else:
        return (motif_bed)
Esempio n. 18
0
def create_nucleotide_frequency_from_fasta(path_to_fasta, sequence_length=None, verbose=False):
    """Create a pandas.DataFrame with the frequency of each nucleotide
    on each position. Sequences are converted to uppercase DNA (U replaced
    by T) and stored as IUPAC.ambiguous_dna, however only the columns for
    A, C, T and G are returned.

    :param path_to_fasta: path to the fasta file
    :param sequence_length: assumed length of the input sequences. If not
                            provided, the length of the first sequence is
                            used. Sequences of any other length are ignored.
    :param verbose: report every ignored sequence on stderr
    :returns: 2-tuple: pandas.DataFrame, Bio.motifs.Motif
    :raises ValueError: if no length was given and the file has no sequences

    """
    seqs = []
    number_of_ignored = 0
    seq_counter = 0
    if not sequence_length:
        first_record = next(iter(SeqIO.parse(path_to_fasta, 'fasta')), None)
        if first_record is None:
            # Previously this case failed with a TypeError while formatting
            # the message below, because the length was still None.
            raise ValueError("No sequences found in %s; cannot infer the sequence length." % path_to_fasta)
        sequence_length = len(first_record.seq)
        sys.stderr.write("Presumed sequence length was not provided. First encounterd sequence length" +
                         " will be used i.e. %i\n" % sequence_length)
    for rec in SeqIO.parse(path_to_fasta, 'fasta'):
        seq_counter += 1
        if len(rec.seq) != sequence_length:
            if verbose:
                sys.stderr.write("%s has wrong length (%i)." % (rec.id, len(rec.seq)) +
                                 " It will be ignored.\n")
            number_of_ignored += 1
        else:
            seqs.append(Seq.Seq(str(rec.seq).upper().replace("U", "T"), IUPAC.ambiguous_dna))
    motifs_obj = motifs.create(seqs, IUPAC.ambiguous_dna)
    frequency_df = DataFrame(motifs_obj.pwm)[["A", "C", "T", "G"]]
    sys.stderr.write("%i sequences out of %i was ignored because of the length issue.\n" % (number_of_ignored, seq_counter))
    return frequency_df, motifs_obj
Esempio n. 19
0
 def setUp(self):
     """Create the seven-site RNA motif used by the tests as ``self.m``."""
     sites = ("UACAA", "UACGC", "UACAC", "UACCC", "AACCC", "AAUGC", "AAUGC")
     self.m = motifs.create(
         [Seq(site) for site in sites],
         alphabet=IUPAC.unambiguous_rna,
     )
Esempio n. 20
0
 def setUp(self):
     """Create the seven-site motif over the extended DNA alphabet as ``self.m``."""
     sites = ("TACAA", "TACGC", "TACAC", "TACCC", "AACCC", "AATGC", "AATGC")
     self.m = motifs.create(
         [Seq(site) for site in sites],
         alphabet=IUPAC.extended_dna,
     )
Esempio n. 21
0
def recursive_random(instances, motiflength, records):
    """
    Main step of the sampler: repeatedly leave one instance out, rebuild the
    profile from the remaining ones and replace the left-out instance with
    the best match found in its original sequence. The function calls itself
    again as long as the total score keeps decreasing.

    :param instances: the current motif instances, one per sequence; the
                      list is modified in place
    :param motiflength: the motif length you are searching for
    :param records: the original sequences
    :return: tuple ``(motif, total, gapSize)`` once the score no longer
             decreases
    """
    global gapSize
    # Score of the instances before this round; compared with the new score below.
    old_total = check_solution(instances)

    # One pass per instance; note that idx and instance themselves are not
    # used -- the instance to leave out is drawn at random every time.
    for idx, instance in enumerate(instances):
        train_instances = copy.deepcopy(instances)
        leave_out = random.choice(train_instances)
        # index() returns the first equal entry, so duplicates map to the first one.
        seq_index = instances.index(leave_out)
        # NOTE(review): 0 and 8 look like "no gap to display" markers in gapList -- confirm.
        if Config.max_gapsize == 0 or (gapList[seq_index] == 0 or gapList[seq_index] == 8):
            print("Leaving out %s" % leave_out)
        else:
            print("Leaving out %s" % leave_out[0:gapList[seq_index]] + "-" + leave_out[gapList[seq_index]:])
        train_instances.remove(leave_out)

        # Profile built from every instance except the one left out.
        train_motifs = motifs.create(train_instances)
        profile = create_pssm(train_motifs)

        # Search only the original sequence the left-out instance belongs to.
        leftseqs = [records[seq_index]]
        new_instances = get_best_matches(leftseqs, profile, motiflength, seq_index)
        print("new best instance:")
        for new_instance in new_instances:
            if Config.max_gapsize == 0 or (gapList[seq_index] == 0 or gapList[seq_index] == 8):
                print(new_instance)
            else:
                print("gapsize: " + str(gapSize))
                print((new_instance[0:gapList[seq_index]] + "-" + new_instance[gapList[seq_index]:]))
            # The last element of new_instances ends up stored for this sequence.
            instances[seq_index] = new_instance
    # printing the result from this iteration
    total = check_solution(instances)
    profile = create_pssm(motifs.create(instances))
    print("new solution: %d" % total)
    print("new profile: ")
    print(profile)
    # A strictly lower total triggers another round; otherwise the current
    # instances are turned into the final motif.
    if total < old_total:
        return recursive_random(instances, motiflength, records)
    else:
        motif = motifs.create(instances)
        return (motif, total, gapSize)
def findmotif(s, t):
    """Print the 1-based start of every match of ``t`` found in ``s``.

    A single-instance motif is built from ``t`` and searched for in ``s``;
    positions are printed on one line, each followed by a space.
    """
    searched = Seq(s)
    pattern = motifs.create([Seq(t)])

    for start, _match in pattern.instances.search(searched):
        print(str(start + 1), end=" ")
Esempio n. 23
0
 def setUp(self):
     """Create the seven-site motif over the extended protein alphabet as ``self.m``."""
     sites = ("ACDEG", "AYCRN", "HYLID", "AYHEL", "ACDEH", "AYYRN", "HYIID")
     self.m = motifs.create(
         [Seq(site) for site in sites],
         alphabet=IUPAC.extended_protein,
     )
Esempio n. 24
0
def main():
    """Print the consensus of 'cons.fna' followed by one genCount line per base."""
    seqs = [record.seq for record in SeqIO.parse('cons.fna', 'fasta')]
    m = motifs.create(seqs)
    print(m.consensus)
    for base in ('A', 'C', 'G', 'T'):
        print(genCount(base, seqs, len(m)))
Esempio n. 25
0
    def out():
        """Build a motif from the truncated FASTA sequences, then close the window.

        The file name is read from ``e1``; each sequence is cut to its first
        ``int(lens)`` letters.
        """
        infile = e1.get()

        trimmed = [
            record.seq[0:int(lens)] for record in SeqIO.parse(str(infile), "fasta")
        ]
        # The motif is not used afterwards; it is still created as before.
        m = motifs.create(trimmed)

        win.destroy()
Esempio n. 26
0
def get_consenus(data):
	"""Print the consensus and the per-letter counts of the sequences in ``data``.

	``data`` is iterated and indexed with each item it yields.
	"""
	inst = [Seq(data[key]) for key in data]
	m = motifs.create(inst)
	print(m.consensus)
	for letter, counts in m.counts.items():
		print(letter + ': ' + ' '.join(map(str, counts)))
def sym_permute_motif(cur_motif_sites):
    """Return a motif created from the permuted version of ``cur_motif_sites``.

    The sites come from ``sym_permute_sites``; the motif gets a pseudocount
    of 1 and its background is set to None.
    """
    permuted = [
        Seq(site, IUPAC.unambiguous_dna)
        for site in sym_permute_sites(cur_motif_sites)
    ]
    new_motif = motifs.create(permuted)
    new_motif.pseudocounts = 1
    new_motif.background = None
    return new_motif
def sym_permute_motif(cur_motif_sites):
    """Return a motif created from the permuted version of ``cur_motif_sites``.

    The sites come from ``sym_permute_sites``; the motif gets a pseudocount
    of 1 and its background is set to None.
    """
    permuted = [
        Seq(site, IUPAC.unambiguous_dna)
        for site in sym_permute_sites(cur_motif_sites)
    ]
    new_motif = motifs.create(permuted)
    new_motif.pseudocounts = 1
    new_motif.background = None
    return new_motif
Esempio n. 29
0
def create_logo_from_fasta(fasta_path, logo_path):
    sequences=[]
    for record in SeqIO.parse(fasta_path, "fasta"):
        seq = record.seq.upper()
        if "N" not in str(seq) and "Y" not in str(seq):
            sequences.append(seq)
    motif = motifs.create(sequences)
    motif.weblogo(logo_path)
    print "Logo is available in " + logo_path
    return 0
Esempio n. 30
0
 def setUp(self):
     """Create the seven-site DNA motif used by the tests as ``self.m``."""
     sites = ("TACAA", "TACGC", "TACAC", "TACCC", "AACCC", "AATGC", "AATGC")
     self.m = motifs.create([Seq(site) for site in sites])
Esempio n. 31
0
 def set_motifs(self, ppath):
     """Store the A, C, G and T rows of the motif's ``pwm`` in ``self.mot``.

     The sequences are obtained from ``self.get_sequences_from_fasta(ppath)``.
     """
     from Bio import motifs
     built = motifs.create(self.get_sequences_from_fasta(ppath))
     self.mot = {base: built.pwm[base] for base in ('A', 'C', 'G', 'T')}
Esempio n. 32
0
def getMotif(new_rnglist):
    """Return a DNA motif built from element 4 of every entry in ``new_rnglist``.

    Sequences containing an upper-case 'N' are dropped (the check happens
    before upper-casing); the rest are upper-cased. The motif's pseudocounts
    are set to 0.5.
    """
    candidates = [entry[4] for entry in new_rnglist]
    usable = [site.upper() for site in candidates if 'N' not in site]
    clip_motif = motifs.create(usable, alphabet=IUPAC.unambiguous_dna)
    clip_motif.pseudocounts = 0.5
    return clip_motif
def create_PWMdict(clusterdict):
    """Map every key of ``clusterdict`` to a one-element list with its PWM.

    The value of each key is converted with ``trans_motif``, turned into a
    motif, and its counts are normalized without pseudocounts.
    """
    PWMdict = {}
    for key, value in clusterdict.items():
        # Start an empty list for keys that are not present yet.
        matrices = PWMdict.setdefault(key, [])
        cluster_motif = motifs.create(trans_motif(value))
        matrices.append(cluster_motif.counts.normalize(pseudocounts=0))
    return PWMdict
Esempio n. 34
0
def main(args):
    """Print the consensus and the A/C/G/T count rows of ``args.dataset``."""
    records = SeqIO.parse(args.dataset, 'fasta', generic_dna)
    seqs = [record.seq for record in records]
    profile = motifs.create(seqs)
    print(profile.consensus)
    for base in 'ACGT':
        row = ' '.join(str(count) for count in profile.counts[base])
        print(base + ':', row)
Esempio n. 35
0
def visSeq(entry,degree,name):
    '''
    Draw one PNG logo per sequence group of a single entry.

    The entry has the format (count,seqes[DEGREE],gaps,dirs); group i of
    entry[1] is written to name + str(i) + '.png'.
    '''
    logo_options = dict(format='PNG', stack_width='large',
                        unit_name='probability', resolution='300',
                        color_scheme='color_classic')
    for i in range(degree):
        instances = [Seq(seq) for seq in entry[1][i]]
        group_motif = motifs.create(instances)
        group_motif.weblogo(name+str(i)+'.png', **logo_options)
Esempio n. 36
0
def check_solution(instances):
    """
    Score a list of motif instances against their own PSSM.

    :param instances: the motifs
    :return: the sum of the PSSM scores of all instances; the lower this
             score the better
    """
    pssm = create_pssm(motifs.create(instances))
    return sum(pssm.calculate(instance) for instance in instances)
def find(dic_all, ik):
    """
    Search each entry's 'piRNA' value for reverse-complemented k-mers of its
    'target' value and collect the 1-based hit positions.

    dic_all is expected to come from read_fasta(): a mapping whose values
    are mappings containing 'piRNA' and 'target' items
    (NOTE(review): structure inferred from the loops below -- confirm).
    ik is the k-mer length.

    Returns a DataFrame indexed by the keys under which hits were stored,
    with a single column named '<ik>-mer'.
    """
    dic = dic_all
    dic_instances = {}
    #it=list(range(2,14,4))
    # NOTE(review): `it` is not used anywhere below.
    it = [2, 8, 14]
    #idxe=list(range(0,33))
    ttemp = pd.DataFrame()
    #ttemp=ttemp.reindex(idxe)
    # Number of k-mers collected over all targets.
    count = 0
    for key, value in dic.items():
        #print (key, value)
        for t, p in value.items():
            #print(t, p)
            if t == 'piRNA':

                # Remember the piRNA; it is the sequence searched further down.
                # It must be seen before the matching 'target' item.
                temp = p
            if t == 'target':
                instances = []
                for i in range(len(p)):
                    # Take the window starting at i unless it is too close to
                    # the end or starts with 'N'.
                    if i < len(p) - (ik + 1):
                        if p[i] == 'N':
                            pass
                        else:
                            kk = i + ik

                            count += 1

                            instances.append(Seq(p[i:kk]))

                    # NOTE(review): the motif is rebuilt and searched on every
                    # index i (this code sits inside the loop), not once per
                    # target -- confirm that this is intended.
                    m = motifs.create(instances)
                    r = m.reverse_complement()  #reverse

                    ll = 0
                    for pos, seq in r.instances.search(temp):
                        # The first hit is stored under the bare key; later
                        # ones get a numeric suffix (suffixes restart at 0 for
                        # every search, so entries can be overwritten).
                        if key not in dic_instances.keys():
                            dic_instances[key] = pos + 1
                            #print(dic_instances)
                        else:
                            dic_instances[key + str(ll)] = pos + 1
                            ll += 1
    # One row per stored hit; the positional column 0 is renamed to '<ik>-mer'.
    tt = pd.DataFrame.from_dict(dic_instances, orient='index')  #.T
    tt['{}-mer'.format(ik)] = tt[0]
    tt = tt.drop(axis=1, labels=0)
    ttemp = pd.concat([ttemp, tt], axis=1, sort=False)

    print('instances:', count)
    print(len(dic_instances))
    return ttemp
def rename_tree2(path):
    """Rename clades of ``tree2`` after the degenerate consensus of their leaves.

    For every clade name in ``path`` the clade with that name is looked up
    in the module-level ``tree``; the part after the first underscore of
    each terminal's name is used as an RNA sequence, and the degenerate
    consensus of those sequences becomes the name of the clade in ``tree2``
    that carries the same name (if there is one).
    """
    # Snapshot the names first, because clades are renamed inside the loop.
    names = [x.name for x in path]
    for name in names:
        c = tree.common_ancestor({"name": "%s" % name})
        sequences = [leaf.name.split('_')[1] for leaf in c.get_terminals()]
        # A previous loop built Seq objects here and threw them away; the
        # motif is created from the plain strings, so it had no effect.
        m = motifs.create(sequences, alphabet=IUPAC.unambiguous_rna)
        newname = tree2.find_any({"name": "%s" % c.name})
        if newname is not None:
            newname.name = "%s" % m.degenerate_consensus
Esempio n. 39
0
File: pssm.py Progetto: FUUbi/PSSM
 def calcutaltePssm(self, filePath):
     instance = list()
     for seq_record in SeqIO.parse(filePath, "fasta"):
         dna_seq = Seq(str(seq_record.seq).upper())
         instance.append(dna_seq)
     # wmm
     m = motifs.create(instance)
     m.weblogo(os.path.splitext(os.path.basename(filePath))[0]+"Logo.png")
     pwm = m.counts.normalize(pseudocounts=0)
     pssm = pwm.log_odds()       # pssm
     print pssm
     return pssm
Esempio n. 40
0
def run(records, motiflength):
    """Start the sampler from random instances and return its result.

    Prints the initial random instances and their profile, then returns
    whatever ``recursive_random`` returns for them.
    """
    # Random starting instances of the requested motif size.
    start_instances = get_random_instances(records, motiflength)
    print("found %d sequences" % len(start_instances))
    print("Got random instances:")
    start_motif = motifs.create(start_instances)
    print(start_motif)
    print("Starting profile")
    print(create_pssm(start_motif))
    return recursive_random(start_instances, motiflength, records)
Esempio n. 41
0
    def createMotif(self, file_handle):
        """Print the counts of the FASTA records and save their logo.

        The logo goes to ``self.file_session + "_logo.png"``. As a side
        effect ``self.sequence`` is set to each record's sequence in turn.
        """
        logofile = self.file_session + "_logo.png"
        seqs_motif = []

        for record in parse(file_handle, "fasta"):
            self.sequence = Seq(str(record.seq))
            seqs_motif.append(self.sequence)
        built = motifs.create(seqs_motif)
        print(built.counts)
        built.weblogo(logofile)
        print("Weblogo saved.")
def motif_find(dna, motif_to_search):
    """Find all instances of a motif in a sequence.

    Every hit is printed as "<1-based position>, <matched sequence>",
    followed by one line with all start positions.

    Returns the start positions as a string, each position followed by a
    single space.
    """

    instances = [Seq(motif_to_search)]
    m = motifs.create(instances)

    listOfStartPostitions = ""

    for pos, seq in m.instances.search(dna):
        print("{0}, {1}".format(pos+1, seq))
        listOfStartPostitions +="{0} ".format(pos+1)

    # `return print(...)` always returned None; print first, then return the
    # string the docstring promises.
    print(listOfStartPostitions)
    return listOfStartPostitions
Esempio n. 43
0
    def create_pwm_arrays_from_grads(self,
                                     result_dir,
                                     data_dir,
                                     window_size=12,
                                     batch_size=128):
        """Write a PWM built from the most salient window of each sequence.

        Reads ``saliency.seqs`` (first tab-separated column = sequence) and
        ``saliency.scores`` (one tab-separated row of numbers per sequence)
        from ``<result_dir>/best_config``. For every sequence the window of
        ``window_size`` positions with the largest summed absolute score is
        extracted; windows whose score reaches the 80th percentile are
        turned into a motif, and its A, C, G and T ``pwm`` rows are written
        space-separated to ``numpy.pwm`` in the same directory.

        ``data_dir`` and ``batch_size`` are not used by this method.
        """

        from Bio import motifs
        from Bio.Seq import Seq

        config_dir = os.path.join(result_dir, 'best_config')

        # load sequences from file
        with open(os.path.join(config_dir, 'saliency.seqs'),
                  'r') as f:
            reader = csv.reader(f, delimiter='\t')
            seqs = [line[0] for line in reader]

        # identify windows of highest saliency for each sequence & extract subsequences
        salient_seqs = []
        max_scores = []
        window = np.ones(window_size, dtype=int)

        with open(os.path.join(config_dir, 'saliency.scores'),
                  'r') as f:
            reader = csv.reader(f,
                                delimiter='\t',
                                quoting=csv.QUOTE_NONNUMERIC)
            for seq in seqs:
                # next(reader) works on Python 2 and 3; reader.next() exists
                # only on Python 2.
                sal_scores = abs(np.array(next(reader)))
                window_scores = np.convolve(sal_scores, window, 'valid')
                max_ind = np.argmax(window_scores)
                salient_seqs.append(Seq(seq[max_ind:max_ind + window_size]))
                max_scores.append(np.max(window_scores))

        # keep only the windows scoring at or above the 80th percentile
        threshold = np.percentile(max_scores, 80)
        print('Threshold:' + str(threshold),
              'Median: ' + str(np.median(max_scores)))
        salient_seqs = [
            seq for seq, score in zip(salient_seqs, max_scores)
            if score >= threshold
        ]

        # create motif from subsequences using BioPython
        with open(os.path.join(config_dir, 'numpy.pwm'),
                  'w') as f:
            writer = csv.writer(f, delimiter=' ')
            motif = motifs.create(salient_seqs)
            for nuc in ['A', 'C', 'G', 'T']:
                writer.writerow(motif.pwm[nuc])
def read_motif(motif_filename, verb=0):
    """Reads a motif as a collection of sites from a file

    Reads a motif and uses the biopython.motifs class to store it. If the motif
    is in FASTA format, it uses the parser directly. Otherwise, it loads and 
    reads a concatenated text file and creates the motif.
    
    File type is determined by extension: 
        * FASTA for .fas, .fasta and .fa files
        * One-per-line text file otherwise
    
    Input:
        * The motif filename; required
        * Verbose mode (default=0)
    Returns:
        * the read motif
    """


    #create file handler for reading file
    try:
        motif_file = open(motif_filename,"r")
    except (IOError, OSError) as file_open_exception:
        print "*** The file name provided:", motif_filename, " does not exist"
        print "*** Error: ", file_open_exception.errno, " - ",\
                             file_open_exception.strerror       
        sys.exit()

    #Figure out file type based on extension, read sites and create motif
    extension = motif_filename.split('.')[-1]

    if extension not in ['fas', 'fasta', 'fa']:
        if verb: print 'Reading motif... raw text sequence mode assumed \
                        (one site per line, not FASTA parsing)'
        sites = []
        for line in motif_file:
            sites.append(Seq(line.rstrip('\n\r'),IUPAC.unambiguous_dna))
        mot = motifs.create(sites)
        if verb: print mot.degenerate_consensus
    else:
        if verb: print 'Reading motif... attempting to parse FASTA file'
        mot = motifs.read(motif_file,'sites')
        
    motif_file.close()
    return mot
Esempio n. 45
0
    def _get_pwm(self, input_motif=list()):
        """Return the normalized count matrix of the given motif instances.

        ``input_motif`` holds (header, sequence) pairs. The alphabet is
        chosen from ``self.alphabet`` ('protein', 'rna', anything else means
        DNA) and is gapped with "-" when ``self.gap_in_alphabet`` is True.
        Counts are normalized with ``self.pseudocounts``.
        """
        # zip(*...) separates the headers from the sequences.
        _headers, instances = [list(column) for column in zip(*input_motif)]

        kind = self.alphabet
        if kind == 'protein':
            alphabet = IUPAC.protein
        elif kind == 'rna':
            alphabet = IUPAC.unambiguous_rna
        else:
            alphabet = IUPAC.unambiguous_dna

        if self.gap_in_alphabet is True:
            alphabet = Gapped(alphabet, "-")

        # Every instance becomes a Bio.Seq object over the chosen alphabet.
        motif_seq = [Seq(text, alphabet) for text in instances]

        return motifs.create(motif_seq).counts.normalize(self.pseudocounts)
 def __init__(self, binding_sites, name="", pseudocounts=1):
     """ Creates a PSSM scorer object. Accepts binding sites in the form of
     a path to a text file containing one site per line, or a list of
     Biopython Bio.Seq objects.

     When a path is given, the scorer is named after the file (base name
     without extension). If the name is still empty afterwards, a default
     "pssm_<length>bp_<count>seqs" name is generated.

     Raises TypeError if binding_sites is neither a string nor a list. """

     self.name = name
     self.alphabet = Alphabet.IUPAC.unambiguous_dna
     self.path = None

     if isinstance(binding_sites, str):
         # The context manager closes the sites file after reading.
         with open(binding_sites) as sites_file:
             self.seqs = [Seq.Seq(site.strip(), self.alphabet) for site in sites_file]
         self.name = os.path.splitext(os.path.basename(binding_sites))[0]
         self.path = binding_sites
     elif isinstance(binding_sites, list):
         self.seqs = binding_sites
     else:
         # Previously this case failed later with an AttributeError on self.seqs.
         raise TypeError("binding_sites must be a file path or a list of sequences")

     self.n = len(self.seqs)

     # Construct motif
     self.motif = motifs.create(self.seqs)
     self.motif.pseudocounts = pseudocounts

     # Construct PSSM and reverse PSSM
     self.pssm = self.motif.pssm
     self.pssm_r = self.pssm.reverse_complement()
     self.m = self.pssm.length
     self.w = self.pssm.length
     self.length = self.pssm.length

     # Default name. It needs self.m, so it is built only after the PSSM
     # exists; the old code read self.m before it had been assigned.
     if len(self.name) == 0:
         self.name = "pssm_%dbp_%dseqs" % (self.m, len(self.seqs))

     # Fast score primitives
     self.dict_pssm = dict(self.pssm)
     self.dict_pssm_r = dict(self.pssm_r)

     # Bayesian estimator
     self.estimator_initialized = False
Esempio n. 47
0
def createlogo(infiles, outfile, title, scale):
	"""
	Creates a sequence logo in PNG format from text files of
	aligned sequences in flat format (one sequence per line).

	Arguments:
	  - infiles   - iterable of line sources; each element is iterated
	                line by line (e.g. open file objects).
	  - outfile   - name of the png logo file.
	  - title     - title for the logo.
	  - scale     - yaxis scale, usually 1.0
	return: None
	raises: ValueError if no sequence is found in any input.
	"""
	# Collect the sequences of ALL inputs; the list is created once so that
	# earlier files are not discarded and an empty `infiles` is handled.
	instances = []
	for fil in infiles:
		for line in fil:
			sequence = line.strip()
			# Skip blank lines explicitly instead of assuming that exactly
			# the last line is empty.
			if sequence:
				instances.append(Seq(sequence, alphabet=IUPAC.ambiguous_dna))
	if not instances:
		raise ValueError("no sequences found in the input files")
	# create a motif-object from every collected sequence
	m = motifs.create(instances, alphabet=IUPAC.ambiguous_dna)
	# create the sequence logo
	m.weblogo(outfile, logo_title=title, yaxis_scale=scale, stack_width="large")
Esempio n. 48
0
import pickle

import numpy as np
from Bio import SeqIO
from Bio import motifs

# Read every FASTA record; the with-block closes the file, and plain "r"
# replaces the legacy "rU" mode that Python 3.11+ rejects.
with open("/home/xc406/data/hg19randomnorepeat.fa", "r") as handle:
    records = list(SeqIO.parse(handle, "fasta"))

instances = [record.seq for record in records]
m = motifs.create(instances)
# Normalise the counts using the given per-base pseudocounts.
pwm = m.counts.normalize(pseudocounts={'A':0.2953, 'C': 0.2047, 'G': 0.2047, 'T': 0.2953})

# One tab-separated row of matrix values per base.
na = '\t'.join(map(str, pwm['A']))
nc = '\t'.join(map(str, pwm['C']))
ng = '\t'.join(map(str, pwm['G']))
nt = '\t'.join(map(str, pwm['T']))
r1 = '\t'.join(['A:', na])
r2 = '\t'.join(['C:', nc])
r3 = '\t'.join(['G:', ng])
r4 = '\t'.join(['T:', nt])
r = '\n'.join(['M0000_0.80', r1, r2, r3, r4])

with open('randompwm.txt', 'w') as f:
    f.write(r)

# Pickle data is binary, so the file is opened in 'rb'.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
with open('hg19chromdict.txt', 'rb') as f:
    chromdict = pickle.load(f)

r = np.random.rand(10,4)
Esempio n. 49
0
    def add_attI(self):
        """
        Look for attI sites and add them to this integron.

        The three reference attI sequences (attI1, attI2, attI3) are searched
        as exact motif instances in the replicon region covering the element
        plus ``dist_atti`` bp on each side. Every hit is appended as one row
        to ``self.attI``. The strand(s) searched follow the attC strand, or
        both strands for an "In0" element.
        """
        dist_atti = 500  # bp searched beyond each edge of the element

        reference_sites = (
            ("attI1", 'TGATGTTATGGAGCAGCAACGATGTTACGCAGCAGGGCAGTCGCCCTAAAACAAAGTT'),
            ("attI2", 'TTAATTAACGGTAAGCATCAGCGGGTGACAAAACGAGCATGCTTACTAATAAAATGTT'),
            ("attI3", 'CTTTGTTTAACGACCACGGTTGTGGGTATCCGGTGTTTGGTCAGATAAACCACAAGTT'),
        )
        motif_attI = []
        for site_name, site_seq in reference_sites:
            site_motif = motifs.create([Seq.Seq(site_seq)])
            site_motif.name = site_name
            motif_attI.append(site_motif)

        integron_type = self.type()
        if integron_type == "complete":
            if ((self.attC.pos_beg.values[0] - self.integrase.pos_end.values[0]) % self.replicon_size >
                    (self.integrase.pos_beg.values[0] - self.attC.pos_end.values[-1]) % self.replicon_size):
                # if integrase after attcs (on the right)
                left = int(self.attC.pos_end.values[-1])
                right = int(self.integrase.pos_beg.values[0])
            else:
                left = int(self.integrase.pos_end.values[-1])
                right = int(self.attC.pos_beg.values[0])
            strand_array = self.attC.strand.unique()[0]

        elif integron_type == "In0":
            # Index the underlying arrays (as add_promoter does) instead of
            # calling int() on a whole Series.
            left = int(self.integrase.pos_beg.values[0])
            right = int(self.integrase.pos_end.values[-1])
            strand_array = "both"
        elif integron_type == "CALIN":
            left = int(self.attC.pos_beg.values[0])
            right = int(self.attC.pos_end.values[-1])
            strand_array = self.attC.strand.unique()[0]

        # NOTE(review): when left < dist_atti the slice start is negative and
        # is then interpreted relative to the end of the sequence - confirm
        # whether callers guarantee left >= dist_atti.
        if left < right:
            seq_attI = self.replicon.seq[left - dist_atti:right + dist_atti]
        else:
            # The region wraps around the end of the replicon.
            seq_attI1 = self.replicon.seq[left - dist_atti:self.replicon_size]
            seq_attI2 = self.replicon.seq[:right + dist_atti]
            seq_attI = seq_attI1 + seq_attI2

        for m in motif_attI:

            if strand_array == 1:
                mot = [m]
            elif strand_array == "both":
                mot = [m.reverse_complement(), m]
            else:
                mot = [m.reverse_complement()]

            for sa, mo in enumerate(mot):
                for pos, s in mo.instances.search(seq_attI):
                    tmp_df = pd.DataFrame(columns=self._columns)
                    tmp_df = tmp_df.astype(dtype=self._dtype)
                    tmp_df["pos_beg"] = [(left - dist_atti + pos) % self.replicon_size]
                    tmp_df["pos_end"] = [(left - dist_atti + pos + len(s)) % self.replicon_size]
                    # With "both", index 0 is the reverse complement (-1) and index 1 the forward motif (+1).
                    tmp_df["strand"] = [strand_array] if strand_array != "both" else [sa * 2 - 1]
                    tmp_df["evalue"] = [np.nan]
                    tmp_df["type_elt"] = "attI"
                    tmp_df["annotation"] = "attI_%s" % (m.name[-1])
                    tmp_df["model"] = "NA"
                    tmp_df.index = [m.name]
                    tmp_df["distance_2attC"] = [np.nan]
                    # DataFrame.append was removed in pandas 2.0; concat is the equivalent call.
                    self.attI = pd.concat([self.attI, tmp_df])
Esempio n. 50
0
    def add_promoter(self):
        """
        Look for known promoters within the integron element and append
        every hit as one row to ``self.promoter``.

        Two groups of motifs are searched as exact instances:

        * the integrase promoter (P_intI1), only when the element has an
          integrase, within ``dist_prom`` bp next to the integrase;
        * the cassette promoters (Pc_int1 variants read from
          "variants_Pc_intI1.fst" in ``self.cfg.model_dir``, and Pc_int3),
          in the region covering the element plus ``dist_prom`` bp on each
          side.

        The original author notes it takes about 1s for 13kb.
        """
        dist_prom = 500  # pb distance from edge of the element for which we seek promoter

        def promoter_row(motif, prefix, pos_beg, pos_end, strand):
            # Build the single-row frame describing one promoter hit.
            tmp_df = pd.DataFrame(columns=self._columns)
            tmp_df = tmp_df.astype(dtype=self._dtype)
            tmp_df["pos_beg"] = [pos_beg]
            tmp_df["pos_end"] = [pos_end]
            tmp_df["strand"] = [strand]
            tmp_df["evalue"] = [np.nan]
            tmp_df["type_elt"] = "Promoter"
            tmp_df["annotation"] = "%s_%s" % (prefix, motif.name[-1])
            tmp_df["model"] = "NA"
            tmp_df.index = [motif.name]
            tmp_df["distance_2attC"] = [np.nan]
            return tmp_df

        ######## Promoter of integrase #########

        if self.has_integrase():
            # PintI1 (PintI2 and PintI3 are not known)
            p_intI1 = motifs.create([Seq.Seq("TTGCTGCTTGGATGCCCGAGGCATAGACTGTACA")])
            p_intI1.name = "P_intI1"

            motifs_Pint = [p_intI1]

            # NOTE(review): a negative slice start (integrase closer than
            # dist_prom to position 0) is interpreted relative to the end of
            # the sequence - confirm this cannot happen upstream.
            seq_p_int = self.replicon.seq[int(self.integrase.pos_beg.min()) - dist_prom:
                                          int(self.integrase.pos_end.max()) + dist_prom]
            integrase_strand = self.integrase.strand.values[0]

            for m in motifs_Pint:
                if integrase_strand == 1:
                    # Search the first dist_prom bases of the extracted region.
                    offset = self.integrase.pos_beg.values[0] - dist_prom
                    generator_motifs = m.instances.search(seq_p_int[:dist_prom])
                else:
                    # Search the last dist_prom bases with the reverse-complemented instances.
                    offset = self.integrase.pos_end.max()
                    generator_motifs = m.instances.reverse_complement().search(seq_p_int[-dist_prom:])

                for pos, s in generator_motifs:
                    tmp_df = promoter_row(m, "Pint", offset + pos, offset + pos + len(s),
                                          integrase_strand)
                    # DataFrame.append was removed in pandas 2.0; concat is the equivalent call.
                    self.promoter = pd.concat([self.promoter, tmp_df])

        ######## Promoter of K7 #########

        # Pc-int1: one motif per distinct variant length, since all
        # instances passed to a single motifs.create call share one length group here.
        motifs_Pc = []

        pc = SeqIO.parse(os.path.join(self.cfg.model_dir, "variants_Pc_intI1.fst"), "fasta")
        variants_by_length = {}
        for record in pc:
            variants_by_length.setdefault(len(record), []).append(record.seq.upper())
        for variants in variants_by_length.values():
            pc_intI1 = motifs.create(variants)
            pc_intI1.name = "Pc_int1"
            motifs_Pc.append(pc_intI1)

        # Pc-int2
        # Not known

        # Pc-int3
        pc_intI3 = motifs.create([Seq.Seq("TAGACATAAGCTTTCTCGGTCTGTAGGCTGTAATG"),
                                  Seq.Seq("TAGACATAAGCTTTCTCGGTCTGTAGGATGTAATG")])
        pc_intI3.name = "Pc_int3"
        motifs_Pc.append(pc_intI3)

        integron_type = self.type()
        if integron_type == "complete":

            if ((self.attC.pos_beg.values[0] - self.integrase.pos_end.values[0]) % self.replicon_size >
                    (self.integrase.pos_beg.values[0] - self.attC.pos_end.values[-1]) % self.replicon_size):
                # if integrase after attcs (on the right)
                left = int(self.attC.pos_end.values[-1])
                right = int(self.integrase.pos_beg.values[0])
            else:
                left = int(self.integrase.pos_end.values[-1])
                right = int(self.attC.pos_beg.values[0])

            strand_array = self.attC.strand.unique()[0]

        elif integron_type == "In0":
            left = int(self.integrase.pos_beg.values[0])
            right = int(self.integrase.pos_end.values[-1])
            strand_array = "both"

        elif integron_type == "CALIN":
            left = int(self.attC.pos_beg.values[0])
            right = int(self.attC.pos_end.values[-1])
            strand_array = self.attC.strand.unique()[0]

        if left < right:
            seq_Pc = self.replicon.seq[left - dist_prom:right + dist_prom]
        else:
            # The region wraps around the end of the replicon.
            seq_Pc1 = self.replicon.seq[left - dist_prom:self.replicon_size]
            seq_Pc2 = self.replicon.seq[:right + dist_prom]
            seq_Pc = seq_Pc1 + seq_Pc2

        for m in motifs_Pc:
            if strand_array == 1:
                mot = [m]
            elif strand_array == "both":
                mot = [m.reverse_complement(), m]
            else:
                mot = [m.reverse_complement()]

            for sa, mo in enumerate(mot):
                for pos, s in mo.instances.search(seq_Pc):
                    # With "both", index 0 is the reverse complement (-1) and index 1 the forward motif (+1).
                    strand = strand_array if strand_array != "both" else sa * 2 - 1
                    tmp_df = promoter_row(m, "Pc",
                                          (left - dist_prom + pos) % self.replicon_size,
                                          (left - dist_prom + pos + len(s)) % self.replicon_size,
                                          strand)
                    self.promoter = pd.concat([self.promoter, tmp_df])
Esempio n. 51
0
def subs(s, t):
    """Print the 1-based start positions of every exact occurrence of t in s.

    The positions are printed on one line, separated by single spaces.
    """
    m = motifs.create([Seq(t)])
    # A parenthesised single argument prints identically on Python 2 and 3.
    print(' '.join([str(pos + 1) for pos, seq in m.instances.search(Seq(s))]))
Esempio n. 52
0
def run(argv=None):
	"""
	Plot per-position statistics of fused reads and optionally build a logo.

	Steps performed:
	  - parse the command line (argv is forwarded to process_command_line);
	  - read the fragments and their position scores from
	    settings.reads_fusion;
	  - plot the average score per position and, when
	    settings.score_diagram is set, save it there;
	  - plot the count per position and, when settings.count_diagram is
	    set, save it there;
	  - when settings.logo_file is set, build a motif from the merged
	    fragments, print it, write the "logo_data" file and run the
	    external weblogo program to render the PNG logo.

	Arguments:
	  - argv      - command-line arguments handed to process_command_line.
	return: None
	"""
	import subprocess

	# Forward the caller's argv instead of always passing None.
	settings, args = process_command_line(argv)

	fragment_as_seq_list, negative_pos_score_list, positive_pos_score_list = \
		format_file_input(settings.reads_fusion)

	# Score diagram
	pos_sum_dct, pos_count_dct = get_position_statistics(fragment_as_seq_list,
		negative_pos_score_list,
		positive_pos_score_list)

	stats_dict = {position: float(pos_sum_dct[position]) / float(pos_count_dct[position])
		for position in pos_sum_dct.keys()}

	ligation_index = abs(min(stats_dict.keys()))
	lower_bound = min(stats_dict.keys())
	upper_bound = max(stats_dict.keys())

	if settings.distance:
		lower_bound = max(lower_bound, -settings.distance)
		upper_bound = min(upper_bound, settings.distance)

	# Positions run over the negative and the positive range; 0 is skipped.
	positions = list(range(lower_bound, 0)) + list(range(1, upper_bound + 1))

	# Generate average base score plot
	points = [[i, stats_dict[i]] for i in positions]

	plt.rcParams.update({"font.size": 18})
	plt.xlim(lower_bound - 1, upper_bound + 1)
	plt.ylim(0, 1)

	# plt.hold() was removed in matplotlib 3.0; successive plot() calls
	# already accumulate on the current axes by default.
	for pt in points:
		plt.plot([pt[0], pt[0]], [0, pt[1]], "b")

	if settings.score_diagram is not None:
		plt.savefig(settings.score_diagram)

	# Start the second diagram from an empty figure so the score lines drawn
	# above do not end up in the count diagram.
	plt.clf()

	# Generate count per position plot
	points = [[i, pos_count_dct[i]] for i in positions]

	plt.xlim(lower_bound - 1, upper_bound + 1)
	plt.ylim(0, max(pos_count_dct.values()))

	for pt in points:
		plt.plot([pt[0], pt[0]], [0, pt[1]], "r")

	if settings.count_diagram is not None:
		plt.savefig(settings.count_diagram)

	# Generate logo
	if settings.logo_file is not None:
		padded_fragments_list = pad_sequences(fragment_as_seq_list)

		merged_fragments_list = merge_frgaments(padded_fragments_list)

		chimera_motifs = motifs.create(merged_fragments_list, ALPHABET)

		print(chimera_motifs)
		print(chimera_motifs.counts)

		create_sequnce_file("logo_data", merged_fragments_list, lower_bound, upper_bound, ligation_index)

		# Run weblogo without a shell: the arguments are passed as a list and
		# the redirections are done with file objects, so a logo path with
		# spaces or shell metacharacters cannot alter the command.
		weblogo_cmd = ["/home/users/amirbar/.local/bin/weblogo",
			"-A", "rna",
			"-c", "classic",
			"--resolution", "600",
			"--errorbars", "NO",
			"-i", str(lower_bound),
			"--format", "PNG"]
		with open("logo_data", "rb") as logo_in:
			with open(settings.logo_file, "wb") as logo_out:
				subprocess.call(weblogo_cmd, stdin=logo_in, stdout=logo_out)
alphabet = Gapped(IUPAC.ambiguous_dna)


sequences = []
# The with-block closes the FASTA file; plain "r" replaces the legacy "rU"
# mode, which Python 3.11+ rejects.
with open("dataset/test.fasta", "r") as handle:
	for record in SeqIO.parse(handle, "fasta"):
		# NOTE(review): list(record.seq) splits the record into single
		# letters, so every motif is built from one-letter instances of a
		# single record - confirm whether collecting all record.seq values
		# into one motif was intended instead.
		sequences = list(record.seq)
		m = motifs.create(sequences)

		print (sequences)
		print (m)

Esempio n. 54
0
def cons(stdin):
    """Print the consensus and per-base counts of the FASTA records in stdin.

    The first printed line is the motif consensus; it is followed by one line
    per base (A, C, G, T) holding "<base>: " and the space-separated counts
    for each position.
    """
    m = motifs.create([record.seq for record in SeqIO.parse(stdin, 'fasta')])
    # Parenthesised single-argument prints behave the same on Python 2 and 3.
    print(m.consensus)
    for c in 'ACGT':
        print(c + ': ' + ' '.join(map(str, m.counts[c])))
Esempio n. 55
0
def build_motif(sites):
    """Return a Biopython motif created from ``sites`` with pseudocounts 0.8.

    The pseudocount value follows
    http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2647310/
    """
    built = motifs.create(sites)
    built.pseudocounts = 0.8
    return built
Esempio n. 56
0

def seq_iter(file):
    """Yield sequences from a FASTA/FASTQ file or, without a file, from stdin.

    Args:
        file: path whose name ends in fasta, fa, fastq or fq (any letter
            case), optionally followed by ".gz" for gzip-compressed input.
            When falsy, every line of standard input is yielded as a Seq.

    Yields:
        The ``seq`` of each parsed record, or one Seq per stdin line.

    Exits the program with status 1 (after a message on stderr) when the file
    name has no recognised suffix.
    """
    if not file:
        for line in sys.stdin:
            yield Seq(line.strip())
        return

    # The dot of ".gz" is escaped so that only a literal ".gz" suffix counts.
    found = re.search(r'(?i)(fasta|fa|fastq|fq)(\.gz)?$', file)
    if not found:
        print("invalid file name suffix.\nfile name should look like this: infile.[fasta|fa|fastq|fq][.gz]", file=sys.stderr)
        sys.exit(1)
    seq_format, is_gz = found.groups()
    # The suffix is matched case-insensitively, so normalise it before it is
    # compared and passed on as the parser's format name.
    seq_format = seq_format.lower()
    seq_format = {'fa': 'fasta', 'fq': 'fastq'}.get(seq_format, seq_format)

    # The with-block closes the handle even if the consumer stops early.
    with (gzip.open(file, 'rt') if is_gz else open(file, 'r')) as fh:
        for record in SeqIO.parse(fh, seq_format):
            yield record.seq


if __name__ == '__main__':
    # Build a motif from every input sequence that contains no 'N' and print
    # its position weight matrix.
    args = parse_args()
    seqs = seq_iter(args.infile)
    seqs2 = []
    for seq in seqs:
        if 'N' in seq:
            continue
        seqs2.append(seq)
    m = motifs.create(seqs2)
    print(m.pwm)
    # print(m.pssm)
    # m.weblogo("motif.png")
Esempio n. 57
0
hba1 = "------------------------------------------------------CATAAACCCTGGC------------------------------------------------------------------------------GCGCTCGCGGCCCGGCACTCTTCTGGTCCCCA-CAGACTCAGAGAGAACCCACCATGGTGC---TGTCTCCTGCCGACAAGACCAACGTCAAGGCCGCCTGGGGTAAGGTCGGCGCGCACGCTGGCGAGTATGGTGCGGAGGCCCTGGAGAGGATGTTCCTGTCCTTCCCCACCACCAAGACCTACTTCCCGCACTTCG---ACCTGAGCCACGGCTCTG---------------CCCAGGTTAAGGGCCACGGCAAGAAGGTGGCCGACGCGCTGACCAACGCCGTGGCGCACGTGGACGACATGCCCAACGCGCTGTCCGCCCTGAGCGACCTGCACGCGCACAAGCTTCGGGTGGACCCGGTCAACTTCAAGCTCCTAAGCCACTGCCTGCTGGTGACCCTGGCCGCCCACCTCCCCGCCGAGTTCACCCCTGCGGTGCACGCCTCCCTGGACAAGTTCCTGGCTTCTGTGAGCACCGTGCTGACCTCCAAATACCGTTAAGCTGGAGCCTCGGTGGCCATGCTTCTTGCCCCTTGGGCCTCCCCCCAGCCCCTCCTCCCCTTCCTGCACCCGTACCCCCGTGGTC-TTTGAATAAAGTCTGAGTGGGCGGCAAAAAAAAAAAAAAAAAAAAAA----------"

hbb = "-------------------------------------------------------------------------------------------------------------------------------------------------ACATTTGC-TTCTGACACAACTGTGTTCACTAGCAACCTCAAA---CAGACACCATGGTGCATCTGACTCCTGAGGAGAAGTCTGCCGTTACTGCCCTGTGGGGCAAGGTGAACGTGGATGAAG------TTGGTGGTGAGGCCCTGGGCAGGCTGCTGGTGGTCTACCCTTGGACCCAGAGGTTCTTTGAGTCCTTTGGGGATCTGTCCACTCCTGATGCTGTTATGGGCAACCCTAAGGTGAAGGCTCATGGCAAGAAAGTGCTCGGTGCCTTTAGTGATGGCCTGGCTCACCTGGACAACCTCAAGGGCACCTTTGCCACACTGAGTGAGCTGCACTGTGACAAGCTGCACGTGGATCCTGAGAACTTCAGGCTCCTGGGCAACGTGCTGGTCTGTGTGCTGGCCCATCACTTTGGCAAAGAATTCACCCCACCAGTGCAGGCTGCCTATCAGAAAGTGGTGGCTGGTGTGGCTAATGCCCTGGCCCACAAGTATCACTAAGCTCG------CTTTCTTGCTGTCCAATTTCTATTAAAGGTTCCTTTGTTCCCTAAGTCCAACTACTAAACTGGGGGATATTATGAAGGGCCTTGAGCATCTGGATTCTGCCTAATAAAAAACATTTATTTTCATTGC-----"

hbd = "AGGGCAAGTTAAGGGAATAGTGGAATGAAGGTTCATTTTTCATTCTCACAAACTAATGAAACCCTGCTTATCTTAAACCAACCTGCTCACTGGAGCAGGGAGGACAGGACCAGCATAAAAGGCAGGGCAGAGTCGACTGTTGCTTACACTTTC-TTCTGACATAACAGTGTTCACTAGCAACCTCAAA---CAGACACCATGGTGCATCTGACTCCTGAGGAGAAGACTGCTGTCAATGCCCTGTGGGGCAAAGTGAACGTGGATGCAG------TTGGTGGTGAGGCCCTGGGCAGATTACTGGTGGTCTACCCTTGGACCCAGAGGTTCTTTGAGTCCTTTGGGGATCTGTCCTCTCCTGATGCTGTTATGGGCAACCCTAAGGTGAAGGCTCATGGCAAGAAGGTGCTAGGTGCCTTTAGTGATGGCCTGGCTCACCTGGACAACCTCAAGGGCACTTTTTCTCAGCTGAGTGAGCTGCACTGTGACAAGCTGCACGTGGATCCTGAGAACTTCAGGCTCTTGGGCAATGTGCTGGTGTGTGTGCTGGCCCGCAACTTTGGCAAGGAATTCACCCCACAAATGCAGGCTGCCTATCAGAAGGTGGTGGCTGGTGTGGCTAATGCCCTGGCTCACAAGTACCATTGAGATC-------CTGGACTGTTTCCTGATAACCATAAGAAGACCCTATTTCCCTAGATTCTATTTTCTGAACTTGGGAACACAATG-CCTACTTCAAGGGTATGGCTTCTGCCTAATAAAGAATGTTCAGCTCAACTTCCTGAT"

hbg1 = "-------------------------------------------------------------------------------------------------------------------------------------------------ACACTCGC-TTCTGGAACGTCTGAGGTTATCAATAAGCTCCTAGTCCAGACGCCATGGGTCATTTCACAGAGGAGGACAAGGCTACTATCACAAGCCTGTGGGGCAAGGTGAATGTGGAAGATG------CTGGAGGAGAAACCCTGGGAAGGCTCCTGGTTGTCTACCCATGGACCCAGAGGTTCTTTGACAGCTTTGGCAACCTGTCCTCTGCCTCTGCCATCATGGGCAACCCCAAAGTCAAGGCACATGGCAAGAAGGTGCTGACTTCCTTGGGAGATGCCACAAAGCACCTGGATGATCTCAAGGGCACCTTTGCCCAGCTGAGTGAACTGCACTGTGACAAGCTGCATGTGGATCCTGAGAACTTCAAGCTCCTGGGAAATGTGCTGGTGACCGTTTTGGCAATCCATTTCGGCAAAGAATTCACCCCTGAGGTGCAGGCTTCCTGGCAGAAGATGGTGACTGCAGTGGCCAGTGCCCTGTCCTCCAGATACCACTGAGCTC-------ACTGCCCATGATTCAGAGCTTTCAAGGATAGGCTTTATTCTGCAAGC----------------------------AATACAAATAATAAATCTATTCTGCTGAGAGATCAC---------------------"

ins = "------------------AGCCCTCCAGGACAGGCTGC-ATCAGAAGAGGCCATCAAGCA-GATCACTGTCCTTCTGCCATGGCCCTGTGGATGCGCCTCCTGCCCCTGCTGGCGCTGCTGGCCCTCTGGGGACCTGACCCAGCCGCAGCCTTTGTGAACCAACACCTGTGCGGCTCACACCTGGTGGAAGCTCTCTACCTAGTGTGCGGGGAACGAGGCTTCTTCTACACACCCAAGACCCGCCGGGAGGCAGAGGACCTGCAGGTGGGGCAGGTGGAGCTGGGCGGGGGCCCTGGTGCAGGCAGCCTGCAGCCCTTGGCCCTGGAGGGGTCCCTGCAGAAGCGTGGCATTGTGGAACAATGCTGTACCAGCATCTGCTCCCTCTACCAGCTGGAGAACTACTGCAACTAGACGCAGCCCGCAGGCAGCCCCACACCCGCCGCCTCCTGCACCGAGAGAGATGGAATAAAGCCCTTGAACCAGC----AAAA"

ins2= "GGGGACCCAGTAACCACCAGCCCTAAGTGATCCGCTACAATCAAAAACCATCAGCAAGCAGGAAGGTTATTGTTTCAACATGGCCCTGTGGATGCGCTTCCTGCCCCTGCTGGCCCTGCTCTTCCTCTGGGAGTCCCACCCCACCCAGGCTTTTGTCAAGCAGCACCTTTGTGGTTCCCACCTGGTGGAGGCTCTCTACCTGGTGTGTGGGGAGCGTGGCTTCTTCTACACACCCATGTCCCGCCGTGAAGTGGAGGACCCACAAGTGGCACAACTGGAGCTGGGTGGAGGCCCGGGAGCAGGTGACCTTCAGACCTTGGCACTGGAGGTGGCCCAGCAGAAGCGTGGCATTGTAGATCAGTGCTGCACCAGCATCTGCTCCCTCTACCAGCTGGAGAACTACTGCAACTAGACCCA--CCACTACCCAGCCTAC--------CCCTCTGCAAT----------GAATAAAACCTTTGAATGAGCACAAAAAA"


# Six 6-letter instances from which the exercise-5 motif is built.
exercise5instances = [
    Seq(site)
    for site in ("AGATAA", "TGATAA", "AGATAG", "TGATAG", "TGATCA", "TTATCA")
]

ex5motif = motifs.create(exercise5instances)

# Position weight matrix as exposed by the motif object.
ex5PWM = ex5motif.pwm

# Counts normalised with a pseudocount of 1.0 ...
ex5logo_heights = ex5motif.counts.normalize(pseudocounts=1.0)

# ... and the log-odds matrix derived from that normalised matrix.
ex5PSSM = ex5logo_heights.log_odds()
Esempio n. 58
0
# from Bio.Seq import Seq

# Read the whole report, split it on 'E-value' and skip the first two chunks.
with open(argv[1], 'r') as input_file:
    contents = input_file.read().split('E-value')[2:]

lines = '-' * 80
stars = '*' * 80

for item in contents:
    # The E-value is the text after '=' in the part preceding the row of stars.
    evalue = float(item.split(stars)[0].strip().split('=')[1].strip())
    if evalue <= 0.05:
        list_of_motifs = item.split('sites sorted by position p-value')[
            1].split(lines)[1].split('Site')[1].strip()
        # A single pre-formatted string prints the same on Python 2 and 3.
        print(">matrix %s" % evalue)

        motif_list = []
        for line in list_of_motifs.split('\n')[1:]:
            # List comprehension instead of filter(): filter() returns an
            # iterator on Python 3 and cannot be indexed.
            line_list = [field for field in line.split(' ') if field]
            # The site is the second-to-last space-separated field.
            motif_list.append(line_list[-2])

        m = motifs.create(motif_list)
        trans = m.format("transfac")

        # Drop the first line and the last three lines of the TRANSFAC text,
        # then keep each row without its first and last field.
        for l in trans.split('\n')[1:-3]:
            numbers = [field for field in l.split(' ') if field][1:-1]
            print(' '.join(numbers))
Esempio n. 59
0
print( 'species peak min max consensus')

for sp in speciesA:
    fname = os.path.join(base, sp + '.iesdb')
    for lbi in range(0, len(lb)): # for each length bin
        peakNo = str("%0.3d" %lbi)
        figName = os.path.join(baseOut, sp + '.' + peakNo + '.png')
        fbA = []
        # The with-block closes the file even when a row fails to parse.
        with open(fname, "r") as f:
            f.readline() # header
            for line in f:
                # Strip only the line terminator: str.strip() returns a new
                # string, and stripping tabs would drop empty trailing fields.
                line = line.rstrip("\r\n")
                (idv, scaffold, altSeqNo, startLocs, endLocs, upstreamFlank, downstreamFlank,length, isFloating, merged, inCDS, inInter, inIntron, front, back, seq) = line.split("\t")
                l = int(length)
                if isFloating == '0' and merged == '0' and l >= lengthMin[lbi] and l < lengthMax[lbi]:
                    # exclude if have an N
                    if 'N' in front:
                        continue
                    # keep the first five characters of the front field
                    fbA.append(Seq(front[0:5]))

        fbm = motifs.create(fbA)
        title = sp + ': ' + str(lb[lbi]) + '[' + str(lengthMin[lbi]) + '-' + str(lengthMax[lbi]) + ']'
        print( sp + ' ' + str(lb[lbi]) + ' ' + str(lengthMin[lbi]) + ' ' + str(lengthMax[lbi]) + ' ' + fbm.consensus)
        if os.path.exists(figName):
            continue # do not redo figures
#        print(title)
#        fbm.weblogo(figName, logo_title = title)
#        quit()