Example #1
def probabilities_blosum_62_2():
    seqs = sequence.readFastaFile("./files/simple_seqs/simple_2.fasta")

    profile1 = aln_profile.AlignmentProfile([seqs[0]])
    profile2 = aln_profile.AlignmentProfile([seqs[1]])

    phmm = align.load_params(params.basic_params, [profile1, profile2],
                             sub_matrix.blosum62LatestProbs,
                             log_transform=True)

    return phmm
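
A minimal usage sketch for the pair HMM returned above, assuming the performViterbiAlignment/get_alignment API that Example #12 uses:

# Hypothetical driver; the po flag and type_to_get value are taken from Example #12.
phmm = probabilities_blosum_62_2()
phmm.performViterbiAlignment(po=False)
print(phmm.get_alignment(type_to_get='viterbi'))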
Example #2
def durbin_blosum_50_2():

    seqs = sequence.readFastaFile("./files/simple_seqs/durbin_2.fasta")
    profile1 = aln_profile.AlignmentProfile([seqs[0]])
    profile2 = aln_profile.AlignmentProfile([seqs[1]])

    phmm = align.load_params(params.basic_params, [profile1, profile2],
                             sub_matrix.blosum50,
                             log_transform=True)

    return phmm
Example #3
def annotateThis(args):

    outputfile = output(args)

    if '.fa' in args.annotateFile or '.fasta' in args.annotateFile:
        outputfile = outputfile + '.csv'
        print("this is a FASTA file")
        annot = sequence.readFastaFile(args.annotateFile,
                                       sequence.Protein_Alphabet,
                                       ignore=True,
                                       parse_defline=False)

        with open(outputfile, 'w', newline='') as f:
            fieldnames = ['Name', 'Sequence', 'Annots']
            thewriter = csv.DictWriter(f, fieldnames=fieldnames)

            thewriter.writeheader()
            for seq in annot:
                s = ''.join(seq.sequence)
                thewriter.writerow({
                    'Name': seq.name,
                    'Sequence': s,
                    'Annots': args.annotateKeyword
                })

    elif '.csv' in args.annotateFile:
        print("this is a CSV file")
        outputfile = outputfile + '.csv'
        with open(args.annotateFile, newline='') as f:
            reader = csv.reader(f)
            header = next(reader)
            nameCol = 0
            seqCol = 0
            dictionary = {}

            for h in range(len(header)):
                if header[h] == 'Name':
                    nameCol = h
                elif header[h] == 'Sequence':
                    seqCol = h

            for row in reader:
                dictionary[row[nameCol]] = row[seqCol]

        with open(outputfile, 'w', newline='') as f:
            fieldnames = ['Name', 'Sequence', 'Annots']
            thewriter = csv.DictWriter(f, fieldnames=fieldnames)
            thewriter.writeheader()
            for name, seq in dictionary.items():
                thewriter.writerow({
                    'Name': name,
                    'Sequence': seq,
                    'Annots': args.annotateKeyword
                })
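
A hedged driver sketch for annotateThis: the attributes below are the ones the function itself reads, but the real argparse setup and the output() helper (assumed to derive a base output path from the arguments) live elsewhere in this project, so treat this as an illustration only.

from argparse import Namespace

# Hypothetical arguments; file name and keyword are placeholders.
args = Namespace(annotateFile='my_seqs.fasta', annotateKeyword='kinase')
annotateThis(args)  # writes Name/Sequence/Annots rows to <base>.csv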
Example #4
def two_col_62_2():

    seqs = sequence.readFastaFile("./files/custom_seqs/2_col.fasta")
    profile1 = aln_profile.AlignmentProfile([seqs[0]])
    profile2 = aln_profile.AlignmentProfile([seqs[1]])

    phmm = align.load_params(params.basic_params, [profile1, profile2],
                             sub_matrix.blosum62,
                             log_transform=True)

    return phmm
Example #5
def borodovsky_blosum_50_2():

    seqs = sequence.readFastaFile("./files/simple_seqs/borodovsky.fasta")
    profile1 = aln_profile.AlignmentProfile([seqs[0]])
    profile2 = aln_profile.AlignmentProfile([seqs[1]])

    phmm = align.load_params(params.borodovsky_4_7, [profile1, profile2],
                             sub_matrix.blosum62LatestProbs,
                             log_transform=False)

    return phmm
Example #6
def ox_104t17_1():

    seqs = sequence.readFastaFile(
        "./files/qscore_corrections/ox_104t17_1.fasta")
    profile1 = aln_profile.AlignmentProfile([seqs[0]])
    profile2 = aln_profile.AlignmentProfile([seqs[1]])

    phmm = align.load_params(params.qscore_params, [profile1, profile2],
                             sub_matrix.blosum62EstimatedWithX,
                             log_transform=True)

    return phmm
Example #7
def read(args):

    outputfile = output(args)

    orig_dict = {}

    if '.csv' in args.input:
        print("this is a CSV file")
        outputfile = outputfile + '.fa'
        with open(args.input, newline='') as f:
            reader = csv.reader(f)
            for row in reader:
                orig_dict[row[0]] = row[1]
        seq_list = [
            sequence.Sequence(sequence=seq, name=seqname)
            for seqname, seq in orig_dict.items()
        ]
        sequence.writeFastaFile(outputfile, seq_list)

    elif '.tab' in args.input or '.tsv' in args.input:
        print("this is a TAB/TSV file")
        outputfile = outputfile + '.fa'
        with open(args.input) as tsv:
            for line in csv.reader(tsv, dialect="excel-tab"):
                orig_dict[line[0]] = line[1]

        seq_list = [
            sequence.Sequence(sequence=seq, name=seqname)
            for seqname, seq in orig_dict.items()
        ]
        sequence.writeFastaFile(outputfile, seq_list)

    elif '.fa' in args.input or '.fasta' in args.input:
        print("this is a FASTA file")
        outputfile = outputfile + '.csv'
        db100 = sequence.readFastaFile(args.input,
                                       sequence.Protein_Alphabet,
                                       ignore=True,
                                       parse_defline=False)

        with open(outputfile, 'w', newline='') as f:
            fieldnames = ['Name', 'Sequence']
            thewriter = csv.DictWriter(f, fieldnames=fieldnames)

            thewriter.writeheader()
            for seq in db100:
                s = ''.join(seq.sequence)
                thewriter.writerow({'Name': seq.name, 'Sequence': s})
Example #8
'''
Created on 29/03/2014

@author: jacekrad
'''
import sequence as seq


ex5_filename = "sigpep_at.fa"
ex6_filename = "lipmet_at.fa"
ex7_filename = "ex7.fa"

""" read sequences from questions 5 & 6 into corresponding lists """ 
sequences_q5 = seq.readFastaFile(ex5_filename)
sequences_q6 = seq.readFastaFile(ex6_filename)

print "Q5 sequence has ", len(sequences_q5), " entries"
print "Q6 sequence has ", len(sequences_q6), " entries"

ids_q5 = []
ids_q6 = []
for sequence in sequences_q5:
    ids_q5.append(sequence.name)
for sequence in sequences_q6:
    ids_q6.append(sequence.name)
    
common_ids = set(ids_q5).intersection(set(ids_q6))

print(len(common_ids), "common matches found")

""" save the common entries into a FASTA file as well as a dictionary
Example #9
    for i in range(len(calls)):  # go through each position
        supported.append(calls[i] and diff[i] > 0)
    return supported
    
def getScores(seq, index=0):
    """ Create a score list for a sequence by referencing the Chou-Fasman table.
    """
    return [cf_dict[s.upper()][index] for s in seq]
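
# Illustration only (values and column ordering are assumptions; the real
# Chou-Fasman table cf_dict is defined elsewhere in this module):
#   cf_dict = {'A': [142, 83, 66], 'G': [57, 75, 156], ...}
#   getScores('AGA', index=0)  ->  [142, 57, 142]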


""" -------------------------------------------
Below is test code 
------------------------------------------- """

# Read some protein sequence data
prot = sequence.readFastaFile('prot2.fa', symbol.Protein_Alphabet)
# read the secondary structure data for the proteins above (indices should agree)
sstr = sequence.readFastaFile('sstr3.fa', symbol.DSSP3_Alphabet)

#prot = [sequence.Sequence('PNKRKGFSEGLWEIENNPTVKASGY', symbol.Protein_Alphabet, '2NLU_r76')]
#sstr = [sequence.Sequence('CCCCHHHHHHHHHHHCCCCCCCCCC', symbol.DSSP3_Alphabet, '2NLU_s76')]

#prot = [sequence.Sequence("SEQSICQARAAVMVYDDANKKWVPAGGSTGFSRVHIYHHTGNNTFRVVGRKIQDHQVVIN" +\
#    "CAIPKGLKYNQATQTFHQWRDARQVYGLNFGSKEDANVFASAMMHALEVLN", symbol.Protein_Alphabet, "1EVH")]
#sstr = [sequence.Sequence("CEEEEEEEEEEEEEEECCCCEEEEHHHCCCCEEEEEEEECCCCEEEEEEEECCCCCEEEEEEE" +\
#    "CCCCCCECCCCCEEEEECCCCEEEEEECCHHHHHHHHHHHHHHHHHHC", symbol.DSSP3_Alphabet, "1EVH")]

tp = 0  # number of true positives (correctly identified calls)
tn = 0  # number of true negatives (correctly missed no-calls)
fp = 0  # number of false positives (incorrectly identified no-calls)
fn = 0  # number of false negatives (incorrectly missed calls)
Example #10
'''
Created on 30/03/2014

Assessment question 5
Exercises 8 & 9

@author: jacekrad
'''

import sequence as seq
from collections import Counter
from webservice import *

sequences = seq.readFastaFile("mystery1.fa")

all_ids = []

""" for all the IDs in the sequences found in mystery1.fa get a ID mapping
    from P_REFSEQ_AC to ACC and for each IS in the map (dictionary) add it to
    the list of all IDs
"""
for sequence in sequences:
    ids = idmap(sequence.name, 'P_REFSEQ_AC', 'ACC')
    for value in ids.values():
        all_ids.append(value)


""" get a list of unique IDs from all the IDs """
unique_ids = list(set(all_ids))

combined_GOterms = []
Example #11
                input[_onehotIndex(self.inp_alpha, subseqs[i])] = 1
                outvec = self.nn1.feedforward(input)
                d = prob.Distrib(self.outp_alpha)
                for k in range(len(outvec)):
                    d.observe(self.outp_alpha[k], outvec[k])
                predsyms[i + W // 2] = d.getmax()    # use the symbol with the highest probability
            return sequence.Sequence(predsyms, self.outp_alpha)

##################################################################################################################
#      Example applications of ML methods including NNs and Naive Bayes for secondary structure prediction.      #
##################################################################################################################

if __name__=='__main__': # examples to run unless this module is merely "imported"
    import os, time
    os.chdir('/Users/mikael/workspace/binf/data')  # Note you will need to change this to find your directory of choice
    prot = sequence.readFastaFile('prot2.fa', symbol.Protein_Alphabet)  # proteins
    sstr = sequence.readFastaFile('sstr3.fa', symbol.DSSP3_Alphabet)    # secondary structure of prot
    # separate training and test data
    prot_trn = prot[0::2] # even-numbered indices
    prot_tst = prot[1::2] # odd-numbered indices
    sstr_trn = sstr[0::2] # even-numbered indices
    sstr_tst = sstr[1::2] # odd-numbered indices
    W = 15

if __name__=='__main__':   # NN (should read "__main__" for it to be executed on "Run")  
    nHid = 30
    nn = SeqNN(W, symbol.Protein_Alphabet, symbol.DSSP3_Alphabet, nHid, cascade = W)
    #nn.nn = ml.readNNFile('sstr3.nn')
    #print "Successfully loaded network"
    start = time.time()
    print(nn.observeAll(prot_trn, sstr_trn, eta=0.01, niter=20))
Example #12
def align_seqs(inpath,
               outpath,
               aln_type,
               params=parameters.basic_params,
               subsmat=sub_matrix.blosum62EstimatedWithX_dict,
               log_transform=True):

    print("params are")
    print(params)

    # Read sequences in
    seqs = sequence.readFastaFile(inpath, alphabet=Protein_Alphabet_wB_X_Z)

    print(len(seqs))

    if len(seqs) == 2:
        aln_order = [("N0", [seqs[0].name, seqs[1].name])]

    else:

        # Calculate guide tree
        guide_tree = gt.get_guide_tree(seqs, random=False)
        print(guide_tree.ascii_art())

        # Get the alignment order
        aln_order = gt.get_aln_order(guide_tree)
        # print (aln_order)

    print(aln_order)

    seq_dict = {x.name: x for x in seqs}

    # Predecessors start off blank
    predecessors = [{}, {}]

    # Create alignment in order from guide tree
    for node in aln_order:

        # Get the current node name and list of sequences under that node
        curr_node = node[0]
        curr_seqs = node[1]

        # List to store the aligned sequences in
        aligned = []

        # While the node has sequences underneath yet to be aligned
        while curr_seqs:

            # Get a sequence
            seq = curr_seqs.pop()

            # Make it into a profile if it isn't one already
            if type(seq_dict[seq]) != aln_profile.AlignmentProfile:
                profile = aln_profile.AlignmentProfile([seq_dict[seq]])
            else:
                profile = seq_dict[seq]

            # Add sequence to the aligned list
            aligned.append(profile)

            # if len(alns) > 1:
            #     new_align = "-align-".join(alns)
            #     alns = []
            #     alns.append(new_align)

            # If we have two profiles it is time to align
            if len(aligned) > 1:

                pair_hmm = load_params(params, aligned, subsmat, log_transform,
                                       predecessors)

                if aln_type == 'viterbi':

                    pair_hmm.performViterbiAlignment(po=False)
                    aligned_profile = pair_hmm.get_alignment(
                        type_to_get='viterbi')

                elif aln_type == 'poviterbi':

                    pair_hmm.performViterbiAlignment(po=True)
                    aligned_profile = pair_hmm.get_alignment(
                        type_to_get='viterbi')

                elif aln_type == 'mea':

                    pair_hmm.performMEAAlignment(po=False)
                    aligned_profile = pair_hmm.get_alignment(type_to_get='mea')

                elif aln_type == 'pomea':

                    pair_hmm.performMEAAlignment(po=True)
                    aligned_profile = pair_hmm.get_alignment(type_to_get='mea')

                # Clear the previous unaligned sequences
                aligned = []

                # Add the aligned sequences
                aligned.append(aligned_profile)

        # print ('wowza')
        # print (aligned[0])
        # print(aligned[0].predecessors)

        seq_dict[curr_node] = aligned[0]

        # print('alignment is ')
        # print(aligned_profile)

    with open(outpath, 'w') as outfile:
        outfile.write(str(aligned_profile))

    return aligned_profile
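
A minimal usage sketch, relying on the defaults in the signature above; the input path reuses the two-sequence FASTA file from Example #2 and the output path is hypothetical:

aligned = align_seqs("./files/simple_seqs/durbin_2.fasta",
                     "./out/durbin_2.aln",
                     aln_type='viterbi')   # or 'poviterbi', 'mea', 'pomea'
print(aligned)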
Example #13
        sys.exit(2)
    FILENAME =      None
    DISCOVER_MODE = False
    SCAN_MODE =     False
    WORD_WIDTH =    8
    PEAK_WIDTH =    100
    PEAK_MARGIN =   100
    MOTIF_ID =      'MA0112.2'
    JASPAR_FILE =   'JASPAR_matrices.txt'
    for o, a in optlst:
        if   o == '-h': usage(sys.argv[0])
        elif o == '-f': FILENAME = a
        elif o == '-d': DISCOVER_MODE = True
        elif o == '-w': WORD_WIDTH = int(a)
        elif o == '-p': PEAK_WIDTH = int(a)
        elif o == '-m': PEAK_MARGIN = int(a)
        elif o == '-s': SCAN_MODE = True; MOTIF_ID = a
        elif o == '-j': JASPAR_FILE = a
    if FILENAME is None:
        usage(sys.argv[0], "Filename not specified")
        sys.exit(3)
    seqs = sequence.readFastaFile(FILENAME, sym.DNA_Alphabet_wN)
    if DISCOVER_MODE:
        print("Discover (f=%s; w=%d; p=%d; m=%d)" % (FILENAME, WORD_WIDTH, PEAK_WIDTH, PEAK_MARGIN))
        countWordsReport(seqs, WORD_WIDTH, PEAK_WIDTH, PEAK_MARGIN)
    elif SCAN_MODE:
        scanMotifReport(seqs, MOTIF_ID)
    else:
        usage(sys.argv[0], "No run mode selected")

Example #14
def motifSearch(args, motifs):
    names = dict()
    # Create a dictionary for each motif,
    # which associates columns with a match to the number of sequences with that match
    for s in args.input:
        if (args.type == 'a'):
            aln = sequence.readFastaFile(s,
                                         sequence.Protein_Alphabet,
                                         gappy=True,
                                         ignore=True,
                                         parse_defline=False)
            ali = sequence.Alignment(aln)
        else:
            ali = sequence.readFastaFile(s,
                                         sequence.Protein_Alphabet,
                                         ignore=True,
                                         parse_defline=False)

        dictionary = dict()

        for a in ali:
            T1 = False
            if 'T1' in a.info:
                seqName = a.name + "+T1=yes"
                T1 = True
            if 'T2' in a.info:
                seqName = a.name + "+T2=yes"
            elif T1 == False:
                seqName = a.name
            #print(seqName)
            #seqName = str(a).split(":")[0]
            seqSequence = str(a).split(":")[1].strip()
            thisset = set()

            for m in motifs:
                number = 0
                for i in range(len(args.motif)):
                    if str(m) == args.motif[i]:
                        number = i
                if (args.type == 'a'):
                    result = m.search(a, gappy=True)
                else:
                    result = m.search(a)

                if (len(result) > 1):
                    for r in result:  #position, matched string, score
                        motifStart, foundMotif, n = r
                        addThis = ('motif' + str(number + 1) + ',' +
                                   str(motifStart) + ',' + str(foundMotif) +
                                   ',' + str(len(foundMotif) + motifStart))
                        thisset.add(addThis)
                elif (len(result) == 1):
                    motifStart, foundMotif, n = str(result).split(",")
                    addThis = ('motif' + str(number + 1) + ',' +
                               str(motifStart[2:]) + ',' + str(foundMotif) +
                               ',' +
                               str(len(foundMotif) + int(motifStart[2:])))
                    thisset.add(addThis)
                else:
                    pass
            thisset.add('Sequence,' + seqSequence + ', ' + ', ')
            dictionary[seqName] = thisset
        names[s] = dictionary
    #print(names)
    return (names, dictionary)
Example #15
        self.alignment = a
        return q

    def getForeground(self):
        """ Return the probability distributions for columns in the discovered alignment. """
        return self.q
    
    def getBackground(self):
        """ Return the probability distributions for the background used in the discovery. """
        return self.p

# Example 1: Find the peroxisome targeting signal
if __name__=='__main__0':
    import os
    os.chdir('/Users/mikael/workspace/binf/data/')  # set to the directory where you keep your files 
    seqs = sequence.readFastaFile('pex2.fa', symbol.Protein_Alphabet)
    W = 3
    pseudo = prob.readDistrib('blosum62.distrib')
    gibbs = GibbsMotif(seqs, W)
    q = gibbs.discover(pseudo)
    p = gibbs.getBackground()
    
    # Let's display the results, i.e. the best matches to the found motif
    a = getAlignment(seqs, q, p)
    k = 0
    for seq in seqs:
        print "%s \t%d \t%s" % (str(seq), a[k], seq[a[k]:a[k]+W])
        k += 1

    # save the motif in two files: one for the foreground distributions and one with the background
    prob.writeDistribs(q, 'pex2q.distrib')
Example #16
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i",
                        "--input",
                        help="Input FASTA file",
                        required=True)
    parser.add_argument("-db",
                        "--database",
                        help="Database output file name",
                        required=True)
    parser.add_argument("-r",
                        "--redundancy",
                        nargs='*',
                        help="List of redundancy levels",
                        default=[90, 80, 70])
    parser.add_argument("-t1", "--tier1", help="User's Tier1 sequences")
    parser.add_argument("-t2", "--tier2", help="User's Tier2 sequences")
    parser.add_argument("-ml",
                        "--maxlength",
                        help="Max length that the sequence can be",
                        default=800)
    parser.add_argument("-e",
                        "--eval",
                        nargs='*',
                        help="List of evalues",
                        default=[1e-100, 1e-75, 1e-50, 1e-20, 1e-10, 1e-5])
    args = parser.parse_args()

    tier2 = {}
    tier2_short = {}
    tier2_annots = {}  # annotations that we want to include in the final dataset
    tier1_list = {}  # map from tier-1 "long" name to sequence; stays empty unless --tier1 is given

    if args.tier2:
        print("tier2 sequences have been provided")

        if '.fa' in args.tier2 or '.fasta' in args.tier2:
            print("tier2 sequences are FASTA file")
            tier2db = sequence.readFastaFile(args.tier2,
                                             sequence.Protein_Alphabet,
                                             ignore=True,
                                             parse_defline=False)
            print(str(len(tier2db)) + " sequences in tier2")
            tier2_list = {}  # map from "long" name to actual entry
            tier2_map_short = {}  # map from "short" name to entry
            for s in tier2db:
                tier2_list[s.name] = s
                tier2_map_short[sequence.parseDefline(s.name)[0]] = s
        else:
            print("Please provide FASTA file for tier-2")

    if args.tier1:
        tier1 = {}
        tier1_annots = {}  # annotations that we want to include in the final dataset
        print("Tier-1 sequences have been provided")
        if '.fa' in args.tier1 or '.fasta' in args.tier1:

            print("Tier-1 sequences are provided as a FASTA file")
            tier1db = sequence.readFastaFile(args.tier1,
                                             sequence.Protein_Alphabet,
                                             ignore=True,
                                             parse_defline=False)
            tier1_list = {}
            for s in tier1db:
                tier1_list[s.name] = "".join(s.sequence)
            print("Tier-1 has " + str(len(tier1_list)) + " sequences")

        else:
            print("Please provide FASTA file for tier-1")

    db100 = sequence.readFastaFile(args.input,
                                   sequence.Protein_Alphabet,
                                   ignore=True,
                                   parse_defline=False)
    db100_map = {}  # map from "long" name to actual entry
    db100_map_short = {}  # map from "short" name to entry
    for s in db100:
        db100_map[s.name] = s
        db100_map_short[sequence.parseDefline(s.name)[0]] = s
    print("Database has " + str(len(db100_map)) + " sequences")

    for rr in args.redundancy:
        rs = str(rr)
        os.system('cd-hit -i ' + args.input + ' -c 0.' + rs + ' -T 5 -o db' +
                  rs + ' -d 0')

    selected = {}
    for rr in args.redundancy:
        selected[rr] = []
        filename = 'db' + str(rr) + '.clstr'
        clusters = readCDHIT(filename)
        for c in clusters:
            picked_one = False
            shortest = None
            reviewed = None
            for name in clusters[c]:
                if name in db100_map:
                    seq = db100_map[name]
                    if shortest:
                        if len(seq) < len(shortest) and not disqualified(
                                seq, args):
                            shortest = seq
                    elif not disqualified(seq, args):
                        shortest = seq
                    if seq.name.startswith('sp|') and not disqualified(
                            seq, args):
                        reviewed = seq
                    if name in tier1_list:
                        #print("this one orig" + str(seq))
                        selected[rr].append(seq)
                        picked_one = True
                else:
                    pass
                    #print('Did not find', name)
            # If no Tier-1, prefer "reviewed", then shortest length
            if not picked_one and reviewed:
                selected[rr].append(reviewed)
            elif not picked_one and shortest:
                selected[rr].append(shortest)

    for rr in args.redundancy:
        filename = 'db' + str(rr) + '.fa'
        sequence.writeFastaFile(filename, selected[rr])

    for rr in args.redundancy:
        os.system('makeblastdb -dbtype prot -in db' + str(rr) +
                  '.fa -out db-' + str(rr))

    # for rr in args.redundancy:
    #     for evalue in args.evalue:
    #         result_file = "dataset-" + str(rr) + '-'+ str(evalue)
    #         cmd1 = "blastp -db db-" + str(rr) + " -outfmt 3 -num_descriptions 20000 -num_alignments 0 -num_threads 5 -query " + args.tier1 + " -out " + result_file + ".txt -evalue " + str(evalue)
    #         print(cmd1)
    #         os.system(cmd1)

    grab = False

    for rr in args.redundancy:
        for evalue in args.eval:
            c = 0
            tpsIdentifier = set([])
            seqs = []
            result_file = "dataset-" + str(rr) + '-' + str(evalue)
            f = open(result_file + '.txt', 'rt')
            for row in f:
                if row.startswith('Sequences'):
                    grab = True
                    continue
                if grab == True:
                    if row.startswith('Lambda'):
                        grab = False
                    if not row.strip() == "":
                        identifier = row.split(' ')[0]
                        if identifier != "Lambda":
                            tpsIdentifier.add(identifier)

            for name in tpsIdentifier:
                try:
                    seq = db100_map[name]
                    info = ''
                    seqs.append(
                        sequence.Sequence(seq.sequence, seq.alphabet, seq.name,
                                          info))
                except:
                    pass
            sequence.writeFastaFile(result_file + ".fa", seqs)
            print(result_file + " has " + str(len(seqs)) + "sequences")

    print('Done')

    totalSeqCount = []
    c = 0
    for evalue in args.eval:
        for rr in args.redundancy:
            output = []
            ev = str(evalue)
            ev = ev[1:]
            red = str(rr)
            result_file = "dataset-" + str(rr) + '-' + str(evalue)
            a = sequence.readFastaFile(result_file + '.fa',
                                       sequence.Protein_Alphabet,
                                       ignore=True,
                                       parse_defline=False)

            names = set([])
            for s in a:
                names.add(s.name)
            tier1_cnt = 0
            tier2_cnt = 0
            seqs = []
            for name in names:
                try:
                    seq = db100_map[name]
                    info = ''
                    if name in tier1_list:
                        tier1_cnt += 1
                        #info = seq.info + ' ' + tier1_annots[name]
                    elif name in tier2:
                        tier2_cnt += 1
                        #info = seq.info + ' ' + tier2_annots[name]
                    seqs.append(
                        sequence.Sequence(seq.sequence, seq.alphabet, seq.name,
                                          info))
                except:
                    pass
                #print('Did not find', name)
            print('Processed', len(seqs), 'for', result_file, ' Tier-1:',
                  tier1_cnt, ' Tier-2:', tier2_cnt)
            output = [ev, red, len(seqs)]
            totalSeqCount.append(output)

    plotSeqs(totalSeqCount)
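
An illustrative invocation of this pipeline (hypothetical script and file names; the os.system calls above also assume cd-hit, makeblastdb and blastp are on the PATH):

# python build_dataset.py -i all_seqs.fasta -db mydb -t1 tier1.fasta -r 90 80 70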
Example #17
'''
Created on 30/03/2014

@author: jacekrad
'''

import sequence as seq
import util

q6b_filename="q6b.fasta"

util.searchAndSave("surface+protein+AND+organism:1280", q6b_filename)

sequences = seq.readFastaFile(q6b_filename)

print(len(sequences), "total sequences")

matched_sequences = []
for sequence in sequences:
    if "RAFKPS" in str(sequence.sequence):
        matched_sequences.append(sequence)

""" print the final results """        
print len(matched_sequences), " matched sequences:"
for sequence in matched_sequences:
    print sequence
Example #18
    f.write('\n')
    f.close()


if __name__ == '__main__':
    if len(sys.argv) != 4:
        print('Usage: evodiv <tree> <alignment> <nodes> where', \
        "\n\t<tree> is a completely labelled Newick file of a phylogenetic tree", \
        "\n\t<alignment> is a FASTA or Clustal file with a sequence for each label in tree", \
        "\n\t<nodes> is a FASTA file with a sequence entry for each label for which a variability is determined", \
        "\n\tVariability is saved to file with the sequence entry's name .txt")
        sys.exit(1)
    tree = phylo.readNewick(sys.argv[1])
    try:
        seqs = sequence.readFastaFile(sys.argv[2],
                                      alphabet=sequence.Protein_wX,
                                      gappy=True)
        aln = sequence.Alignment(seqs)
    except:
        aln = sequence.readClustalFile(sys.argv[2],
                                       sequence.Protein_Alphabet_wX)
    tree.putAlignment(aln)
    select = sequence.readFastaFile(sys.argv[3],
                                    alphabet=sequence.Protein_wX,
                                    gappy=True)
    nodes = []
    for selected in select:
        nodename = selected.name
        nodes.append(nodename)
    for nodename in nodes:
        node = tree.findLabel(nodename)
Example #19
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument("-i",
                        "--input",
                        help="FASTA file to query from",
                        required=True)
    parser.add_argument("-q",
                        "--query",
                        help="Query FASTA file",
                        required=True)
    parser.add_argument("-db",
                        "--database",
                        help="Database output file name",
                        required=True)
    parser.add_argument("-r",
                        "--reference",
                        help="Reference database ",
                        default="uniprotkb")
    parser.add_argument("-o",
                        "--output",
                        help="Output path",
                        default="matchmyseqs")

    args = parser.parse_args()

    seqDict = {}
    tier1seq = ''
    representative = ''
    fasta = {}
    seqsforCSV = {}
    progress = 0
    tier1 = {}
    tier1_annots = {}  # annotations that we want to include in the final dataset

    os.system('makeblastdb -dbtype prot -in ' + args.input + ' -out ' +
              args.database)

    db = sequence.readFastaFile(args.input,
                                sequence.Protein_Alphabet,
                                ignore=True,
                                parse_defline=False)
    db_map = {}  # map from "long" name to actual entry
    db_map_short = {}  # map from "short" name to entry
    for s in db:
        db_map[s.name] = s
        db_map_short[sequence.parseDefline(s.name)[0]] = s
    print("Database size is " + str(len(db_map)))

    print(
        "Blast started, this might take a bit depending on your dataset size")
    os.system("blastp -db " + args.database +
              " -outfmt 3 -num_descriptions 1 -num_alignments 0 -query " +
              args.query + " -out query.txt")

    if args.reference == 'uniprotkb':
        os.system(
            "grep -e \"^[st][pr]|\" query.txt | cut -d\' \' -f1 > UniProt_query.tab"
        )

        # Extract the resulting sequence identifiers
        repSeqNames = set([])
        f = open('UniProt_query.tab', 'rt')
        for row in f:
            repSeqNames.add(sequence.parseDefline(row.strip())[0])
        f.close()
        print(str(len(repSeqNames)),
              " representative sequences have been found")

        #Annot the representative sequences
        notfound = []
        for name in repSeqNames:
            if name in db_map_short:
                s = db_map_short[name]
                seqsforCSV[s.name] = "".join(s)
            else:
                notfound.append(name)
        print('Matched',
              len(repSeqNames) - len(notfound), 'of', len(repSeqNames))

        with open("query.txt", newline='') as f:
            reader = csv.reader(f)
            for row in reader:
                if len(row) > 0 and row[0].startswith('Query'):
                    querySeq = (str(row).split("=")[1][:-2].strip())
                elif len(row) > 0 and (row[0].startswith('tr|')
                                       or row[0].startswith('sp|')):
                    representative = (str(row).split(" ")[0][2:].strip())
                    seqDict[querySeq] = representative

    elif args.reference == 'refseq':
        grab = False
        repSeqNames = set([])

        with open("query.txt", newline='') as f:
            reader = csv.reader(f)
            for row in reader:
                if len(row) > 0 and row[0].startswith('Query'):
                    querySeq = (str(
                        row[0]).split("=")[1][:-2].strip().split(" ")[0])
                elif len(row) > 0 and row[0].startswith('Sequences'):
                    grab = True
                    continue
                elif grab == True:
                    if len(row) > 0 and not row[0].strip() == "":
                        representative = (row[0].split('.')[0] + "." +
                                          row[0].split('.')[1].split(" ")[0])
                        repSeqNames.add(representative)
                        seqDict[querySeq] = representative
                        grab = False
            #print(len(repSeqNames))

            notfound = []

            for name in repSeqNames:
                if name in db_map_short:
                    s = db_map_short[name]
                    seqsforCSV[s.name] = "".join(s)
                else:
                    notfound.append(name)
            print('Matched',
                  len(repSeqNames) - len(notfound), 'of', len(repSeqNames))

            print(len(repSeqNames),
                  " representative sequences found for " + args.query)

    # done25 = False
    # done50 = False
    # done75 = False
    # for s,rep in seqDict.items():
    # 	total = (len(seqDict))
    # 	seq = (sequence.getSequence(rep,'uniprotkb'))
    # 	seqsforCSV[rep] = str(seq).split(":")[1].strip()
    # 	elem = rep + str(seq)
    # 	progress+=1
    # 	if (progress/total)*100 > 25 and not done25:
    # 		print("25% done")
    # 		done25 = True
    # 	elif (progress/total)*100 > 50 and not done50:
    # 		print("50% done")
    # 		done50 = True
    # 	elif (progress/total)*100 > 75 and not done75:
    # 		print("75% done")
    # 		done75 = True

    faOut = args.output + '.fa'

    seq_list = [
        sequence.Sequence(sequence=seq, name=seqname)
        for seqname, seq in seqsforCSV.items()
    ]

    sequence.writeFastaFile(faOut, seq_list)

    csvOut = args.output + '.csv'

    with open(csvOut, 'w', newline='') as f:
        fieldnames = ['Name', 'Representative', 'Sequence']
        thewriter = csv.DictWriter(f, fieldnames=fieldnames)

        thewriter.writeheader()
        for given, rep in seqDict.items():
            thewriter.writerow({
                'Name': given,
                'Representative': rep,
                'Sequence': seqsforCSV[rep]
            })
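
An illustrative invocation (hypothetical script and file names; makeblastdb and blastp must be on the PATH, as the os.system calls above assume):

# python matchmyseqs.py -i database.fasta -q queries.fasta -db blastdb -r uniprotkb

The default --output prefix is matchmyseqs, so this writes matchmyseqs.fa and matchmyseqs.csv.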
Example #20
    JASPAR_FILE = "JASPAR_matrices.txt"
    for o, a in optlst:
        if o == "-h":
            usage(sys.argv[0])
        elif o == "-f":
            FILENAME = a
        elif o == "-d":
            DISCOVER_MODE = True
        elif o == "-w":
            WORD_WIDTH = int(a)
        elif o == "-p":
            PEAK_WIDTH = int(a)
        elif o == "-m":
            PEAK_MARGIN = int(a)
        elif o == "-s":
            SCAN_MODE = True
            MOTIF_ID = a
        elif o == "-j":
            JASPAR_FILE = a
    if FILENAME is None:
        usage(sys.argv[0], "Filename not specified")
        sys.exit(3)
    seqs = sequence.readFastaFile(FILENAME, sym.DNA_Alphabet_wN)
    if DISCOVER_MODE:
        print "Discover (f=%s; w=%d; p=%d; m=%d)" % (FILENAME, WORD_WIDTH, PEAK_WIDTH, PEAK_MARGIN)
        countWordsReport(seqs, WORD_WIDTH, PEAK_WIDTH, PEAK_MARGIN)
    elif SCAN_MODE:
        scanMotifReport(seqs, MOTIF_ID)
    else:
        usage(sys.argv[0], "No run mode selected")
Example #21
def run_qscore(name,
               aln_type,
               parameters,
               specific_files=None,
               save=False,
               outpath=""):
    base_dir = "./bench1.0/" + name

    in_dir = base_dir + "/in/"
    ref_dir = base_dir + "/ref/"
    out_dir = "./qscore_alignments/" + aln_type + "_" + name

    qscore_dict = defaultdict(dict)

    files = os.listdir(in_dir)

    file_count = 0

    start_time = timeit.default_timer()

    now = datetime.now()

    dt_string = now.strftime("%Y/%m/%d_%H:%M")

    # Add trailing slash to output directory if it isn't there
    outpath = outpath + "/" if outpath and outpath[-1] != "/" else outpath

    param_name = f"t={parameters['tau']}e={parameters['epsilon']}d={parameters['delta']}x={parameters['emissionX']}y={parameters['emissionY']}"

    output_file = "./qscore_alignments/" + aln_type + "_" + name + param_name + ".csv"

    if os.path.exists(outpath + name + ".p"):
        curr_dict = pickle.load(open(outpath + name + ".p", "rb"))
    else:
        curr_dict = {param_name: {}}

    if os.path.exists(outpath + name + "_best.p"):
        best_dict = pickle.load(open(outpath + name + "_best.p", "rb"))
    else:
        best_dict = {}

    if os.path.exists(outpath + "time.p"):
        time_dict = pickle.load(open(outpath + "time.p", "rb"))
    else:
        time_dict = {}

    failures = []

    with open(output_file, 'w+') as output:

        writer = csv.writer(output,
                            delimiter=',',
                            quotechar='"',
                            quoting=csv.QUOTE_MINIMAL)
        writer.writerow(['Tool', 'Dataset', 'Name', 'Q', 'TC', 'M', 'C'])

        # If we don't already have a directory created to save the alignments, lets make one
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

        for file in files:

            failed = False

            if file != ".DS_Store":

                seqs = sequence.readFastaFile(in_dir + file,
                                              alphabet=Protein_Alphabet_wB_X_Z)

                for seq in seqs:
                    if any(skip in seq.sequence for skip in aa_skip):
                        print("failed on " + seq.name)
                        failures.append(file)
                        failed = True

                if not failed:

                    qscore_dict[file] = defaultdict(dict)

                    if not specific_files or file in specific_files:

                        if param_name not in curr_dict:
                            curr_dict[param_name] = {}

                        # print (curr_dict)
                        file_count += 1

                        single_time = timeit.default_timer()

                        print(file)

                        # change_params = {'tau': 0.000002, 'epsilon': 0.0001, 'delta': 0.0002, 'emissionX': 0.2, 'emissionY':
                        #     0.2}
                        # change_params = {'tau': 0.00000000002, 'epsilon': 0.000175, 'delta': 0.00031, 'emissionX':
                        #     0.002,
                        #                  'emissionY':
                        #     0.002}
                        #
                        # change_params = {'tau': 0.1, 'epsilon': 0.02, 'delta': 0.01, 'emissionX':
                        #     0.5,
                        #                  'emissionY':
                        #     0.5}
                        # Update parameters using Baum Welch
                        for seq_order in list(itertools.combinations(seqs, 2)):
                            profiles = [
                                aln_profile.AlignmentProfile([x])
                                for x in seq_order
                            ]

                            # change_params = bw.runBaumWelch(parameters, profiles, aln_type)

                        print(parameters)
                        # print (change_params)

                        aligned_profile = align.align_seqs(
                            in_dir + file,
                            out_dir + "/" + file + ".aln",
                            aln_type=aln_type,
                            params=parameters,
                            subsmat=sub_matrix.blosum62EstimatedWithX_dict,
                            log_transform=log_transform)

                        process = subprocess.Popen(
                            "qscore -test %s -ref %s -cline -modeler" %
                            (out_dir + "/" + file + ".aln", ref_dir + file),
                            stderr=subprocess.PIPE,
                            stdout=subprocess.PIPE,
                            shell=True)

                        out = process.communicate()[0]
                        errcode = process.returncode

                        print('running')
                        print(errcode)

                        scores = [
                            x.strip()
                            for x in out.decode('utf-8').split(";")[2:]
                        ]

                        # scores = [x.split("=")[1] for x in scores]

                        # print (aligned_profile)
                        print(file)

                        print('\nScores be')
                        print(scores)

                        for score in scores:
                            score_type = score.split("=")[0].strip()
                            score_value = score.split("=")[1].strip()
                            qscore_dict[file][score_type] = score_value

                        curr_dict[param_name][file] = (scores, aligned_profile)

                        update_best_dict(best_dict, file, scores, param_name)

                        if scores and "=" in scores[0]:
                            writer.writerow([
                                aln_type + "_" + param_name + "_log=" +
                                str(log_transform), name, file,
                                scores[0].split("=")[1],
                                scores[1].split("=")[1],
                                scores[2].split("=")[1],
                                scores[3].split("=")[1]
                            ])

                        else:
                            failures.append(file)

                        # if file not in curr_dict[param_name].keys():
                        #     curr_dict[param_name][file] = (scores, aligned_profile)
                        # else:
                        #     curr_dict[param_name][file] = (scores, aligned_profile)
                        #

                        total_seconds = timeit.default_timer() - start_time
                        single_seconds = timeit.default_timer() - single_time

                        if save:

                            pickle.dump(
                                curr_dict,
                                open(outpath + aln_type + "_" + name + ".p",
                                     "wb"))
                            pickle.dump(
                                best_dict,
                                open(
                                    outpath + aln_type + "_" + name +
                                    "_best.p", "wb"))

                    if save:

                        if name in time_dict:
                            if total_seconds < time_dict[name][0]:
                                time_dict[name] = (total_seconds, dt_string)
                                print("New best time - " +
                                      utilities.format_time(total_seconds))
                        else:
                            time_dict[name] = (total_seconds, dt_string)
                            print("New best time - " +
                                  utilities.format_time(total_seconds))

                    pickle.dump(
                        time_dict,
                        open(outpath + aln_type + "_" + "time.p", "wb"))
    print('These files failed ')
    print(failures)
    return qscore_dict
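
A minimal usage sketch; the benchmark name is a placeholder for a directory under ./bench1.0/, and the parameter values are borrowed from Example #22:

# Hypothetical call; run_qscore expects tau/epsilon/delta/emissionX/emissionY keys.
params = {'tau': 0.002, 'epsilon': 0.05, 'delta': 0.02,
          'emissionX': 0.5, 'emissionY': 0.5}
scores = run_qscore('balibase', 'viterbi', params, save=False, outpath='./results/')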
Example #22
import mea_poa.parameters as parameters
import mea_poa.sub_matrix as sub_matrix
import itertools
import mea_poa.alignment_profile as aln_profile
import mea_poa.baum_welch as bw
import sequence
from sym import Alphabet

Protein_Alphabet_wB_X_Z = Alphabet('ABCDEFGHIKLMNPQRSTVWYXZ')
alignment = 'Not calculated'
po_alignment = 'Not calculated'
# seq = "../../tests/files/simple_seqs/borodovsky.fasta"

#
seq = "../../tests/files/custom_seqs/tree_check.fasta"
seqs = sequence.readFastaFile(seq, alphabet=Protein_Alphabet_wB_X_Z)

change_params = {
    'tau': 0.02,
    'epsilon': 0.05,
    'delta': 0.02,
    'emissionX': 0.92,
    'emissionY': 0.2
}

change_params = {
    'tau': 0.002,
    'epsilon': 0.05,
    'delta': 0.02,
    'emissionX': 0.5,
    'emissionY': 0.5
}