def _percent_identity(align):
    """Return the percent identity of *align* over its gap-free columns.

    A column counts as a match when it contains no "-" and its slices
    ``column[1:]`` and ``column[:-1]`` are equal. Columns containing a gap
    are excluded from the denominator. Returns 0.0 when every column is
    gapped instead of dividing by zero.
    """
    length = len(align[0])
    matches = 0
    gapped = 0
    for col in range(length):
        column = align[:, col]
        if "-" in column:
            gapped += 1
        elif column[1:] == column[:-1]:
            matches += 1
    ungapped = length - gapped
    if ungapped == 0:
        return 0.0
    return 100 * (matches / float(ungapped))


def _set_decision(cursor, decision, gis, chunk_size=500):
    """Set ``blast.Decision`` to *decision* for every GI in *gis*.

    Uses bound parameters instead of string-built SQL and works in chunks so
    the number of placeholders stays below SQLite's host-parameter limit.
    Does nothing when *gis* is empty.
    """
    gis = list(gis)
    for begin in range(0, len(gis), chunk_size):
        chunk = gis[begin:begin + chunk_size]
        placeholders = ",".join("?" * len(chunk))
        cursor.execute(
            "UPDATE blast SET Decision = ? WHERE GI IN (" + placeholders +
            ")", [decision] + chunk)


def cluster(blastdb, taxdb):
    """Resolve 'To cluster analysis/Chosen' rows of the blast table.

    For every gene, the GIs flagged 'To cluster analysis/Chosen' are grouped
    by tc_id. Groups of more than two GIs are aligned, a gap consensus is
    built, and each sequence is aligned against that consensus; the GI with
    the single highest identity is marked as closest to consensus and the
    rest as further from it. When several GIs tie for the highest identity,
    the tied ones are marked 'Pick one randomly/Chosen' and the rest
    'Pick one randomly/Not chosen'. Groups of exactly two GIs are both
    marked 'Pick one randomly/Chosen'.

    Args:
        blastdb: path of the SQLite database containing the ``blast`` table.
        taxdb: path of the SQLite database attached under the name ``tax``.

    All updates are committed once at the end; the connection is closed
    even if an error occurs (in which case nothing is committed).
    """
    conn = sqlite3.connect(blastdb)
    try:
        c = conn.cursor()
        c.execute("ATTACH DATABASE ? AS tax", (taxdb,))
        muscle_cline = MuscleCommandline(clwstrict=True)

        genes = [
            row[0] for row in c.execute(
                "SELECT Gene_name from blast WHERE decision = 'To cluster analysis/Chosen' GROUP BY Gene_name"
            ).fetchall()
        ]

        for gene in genes:
            # tc_id -> list of GIs (as strings) awaiting cluster analysis
            input_dic = {}
            rows = c.execute(
                "SELECT tc_id, GI from blast where decision = 'To cluster analysis/Chosen' AND Gene_name = ? order by tc_id",
                (gene,)).fetchall()
            for tc_id, gi in rows:
                input_dic.setdefault(tc_id, []).append(str(gi))

            multiple_dic = {}
            two_dic = {}
            for tc_id, gis in input_dic.items():
                if len(gis) > 2:
                    multiple_dic[tc_id] = gis
                elif len(gis) == 2:
                    two_dic[tc_id] = gis

            for tc_id, gis in multiple_dic.items():
                # get consensus of all
                align = alignment_reg(gis, blastdb, False, gene, c)
                summary_align = AlignInfo.SummaryInfo(align)
                consensus = summary_align.gap_consensus(threshold=.5,
                                                        ambiguous='N')
                consensus_record = SeqRecord(consensus, id="Consensus_all")

                identities = []
                for gi in gis:
                    # align each sequence to the consensus
                    seqs = list(
                        get_seqs_from_sqldb_GI_no_gene([gi], "hseq", blastdb,
                                                       c))
                    handle_string = StringIO()
                    SeqIO.write(seqs, handle_string, "fasta")
                    SeqIO.write(consensus_record, handle_string, "fasta")
                    stdout, stderr = muscle_cline(
                        stdin=handle_string.getvalue())
                    pairwise = AlignIO.read(StringIO(stdout), "clustal")
                    identities.append(_percent_identity(pairwise))

                best = max(identities)
                closest = [
                    gi for gi, iden in zip(gis, identities) if iden == best
                ]
                others = set(gis) - set(closest)
                if len(closest) == 1:
                    _set_decision(
                        c, 'Closest to consensus in cluster analysis/Chosen',
                        closest)
                    _set_decision(
                        c,
                        'Further from consensus in cluster analysis/Not chosen',
                        others)
                else:
                    # several sequences tie for the highest identity
                    _set_decision(c, 'Pick one randomly/Chosen', closest)
                    _set_decision(c, 'Pick one randomly/Not chosen', others)

            for tc_id, gis in two_dic.items():
                _set_decision(c, 'Pick one randomly/Chosen', gis)

        conn.commit()
    finally:
        conn.close()
Esempio n. 2
0
def test_resolved_seqs(infile, blastdb, taxdb):
    """Reciprocal-BLAST the chosen sequences and write out the resolved GIs.

    The FASTA file *infile* is blasted against NCBI ``nt``; the XML result is
    saved as ``infile + ".xml"`` and parsed. A query whose best hits (or, as
    a fallback, the hits carrying the five highest identities) include its
    own tc_id is accepted. The remaining queries are re-examined by pairwise
    alignment, tiling and, if needed, a taxonomy BLAST.

    Args:
        infile: path of the FASTA file holding the query sequences.
        blastdb: path of the SQLite database containing the ``blast`` table.
        taxdb: path of the SQLite database attached under the name ``tax``.

    Side effects: appends to ``final_GIs.txt``, ``multiple_gene_choices.txt``,
    ``these_seqs_overlap.txt`` and ``seqs_to_be_blasted.txt`` in the current
    directory. The database connection is always closed.
    """
    import sqlite3
    import time
    from Bio.Blast import NCBIWWW, NCBIXML
    from blastlib.clean_seq_funcs import resolve_seqs, alignment_comp, alignment_reg, alignment_rev_comp, blast, blast_all, identity_calc, tiling

    conn = sqlite3.connect(blastdb)
    try:
        c = conn.cursor()
        c.execute("ATTACH DATABASE ? AS tax", (taxdb,))

        def last_value(sql, params):
            """Return str() of column 0 of the last matching row, or None."""
            rows = c.execute(sql, params).fetchall()
            return str(rows[-1][0]) if rows else None

        def tc_ids_for(gis):
            """Return the set of tc_ids (as strings) stored for *gis*."""
            found = set()
            for gi in gis:
                for row in c.execute("SELECT tc_id FROM blast WHERE GI = ?",
                                     (gi,)).fetchall():
                    found.add(str(row[0]))
            return found

        error_dic = {}  # tc_id -> query GI that failed the reciprocal check
        blast_dic_nums = {}
        blast_dic_tcids = {}
        seqs_to_blast = []
        finalseqs = set()
        multseqs = []

        # do self blast and print to file
        with open(infile) as fasta_file:
            print("Blasting " + infile)
            records = fasta_file.read()
        numqueries = records.count('>')
        while True:
            try:
                result_handle = NCBIWWW.qblast(
                    "blastn",
                    "nt",
                    records,
                    entrez_query=
                    '((Papilionoidea[Organism]) OR Hedylidae[Organism]) OR Hesperiidae[Organism]',
                    word_size=28,
                    hitlist_size=100)
                break
            except Exception as err:
                # keep retrying as before, but pause between attempts and
                # let KeyboardInterrupt/SystemExit through
                print("qblast failed (" + str(err) + "), trying again")
                time.sleep(5)
        # get rid of this extra step of printing xml using NCBIXML
        with open(infile + ".xml", "w") as save_file:
            save_file.write(result_handle.read())
            result_handle.close()

        # open self blast file for parsing
        # collect query species -> query GI for queries whose top hits do
        # not include the same species
        with open(infile + ".xml") as p:
            print("Parsing blast output")
            count = 0
            for rec in NCBIXML.parse(p):
                count += 1
                print(
                    str(round(float(count) / float(numqueries) * 100, 2)) +
                    "%")
                # figure out a new way to do this
                queryAcc = str(rec.query.split()[0])
                queryGI = last_value(
                    "SELECT GI FROM blast WHERE accession = ?", (queryAcc,))
                if queryGI is None:
                    # previously the GI of the preceding record was reused
                    print("No GI found for accession " + queryAcc +
                          ", skipping")
                    continue

                hitdic = {}  # hit GI -> identity of that hit
                for alignment in rec.alignments:
                    # the identity of the last HSP is the one recorded
                    for hsp in alignment.hsps:
                        identity = float(hsp.identities) / float(
                            hsp.align_length)
                    hit_gi = str(alignment.title.split("|")[1])
                    if hit_gi != queryGI:
                        hitdic[hit_gi] = identity
                if not hitdic:
                    # nothing but the query itself came back
                    print('No matching hits from blast')
                    continue

                querySp = last_value(
                    "SELECT tc_id FROM blast WHERE GI = ?", (queryGI,))
                if querySp is None:
                    print("No tc_id found for GI " + queryGI + ", skipping")
                    continue

                maxiden = max(hitdic.values())
                hitGIs = [GI for GI, iden in hitdic.items() if iden == maxiden]
                if querySp in tc_ids_for(hitGIs):
                    finalseqs.add(queryGI)
                    continue

                # only look at top 5 if it doesnt hit the top hit - faster
                if len(hitdic) > 5:
                    top_idens = sorted(hitdic.values(), reverse=True)[0:5]
                else:
                    top_idens = list(hitdic.values())
                hitGIs = [
                    GI for GI, iden in hitdic.items() if iden in top_idens
                ]
                if querySp in tc_ids_for(hitGIs):
                    finalseqs.add(queryGI)
                else:
                    error_dic[querySp] = queryGI

        # go through error dictionary and align the 'same' gene/species to
        # see if they're weird looking
        count = 0
        print("Checking nonmatching sequences")
        for tc_id in error_dic:
            count += 1
            print(
                str(round(float(count) / float(len(error_dic)) * 100, 2)) +
                "%")
            gene_name = last_value(
                "SELECT Gene_name FROM blast WHERE GI = ?",
                (error_dic[tc_id],))
            list_of_GIs = [
                str(row[0]) for row in c.execute(
                    "SELECT GI FROM blast WHERE tc_id = ? and Gene_name = ?",
                    (tc_id, gene_name)).fetchall()
            ]
            if not list_of_GIs:
                print("No sequences found for tc_id " + str(tc_id) +
                      ", skipping")
                continue
            first_GI = list_of_GIs[0]
            pair_check = []
            # align each seq to the first one - first one acts as the
            # 'default' direction; try plain, reverse complement, then
            # complement and accept the pair at the first identity >= 90
            for GI in list_of_GIs[1:]:
                GI_pair = [first_GI, GI]
                pair_ok = 0
                for aligner in (alignment_reg, alignment_rev_comp,
                                alignment_comp):
                    iden = identity_calc(aligner(GI_pair))
                    if iden < 90:
                        continue
                    pair_ok = 1
                    break
                pair_check.append(pair_ok)

            if all(flag == 1 for flag in pair_check):
                finalseqs.add(error_dic[tc_id])
                continue

            idens, start_stop = tiling(list_of_GIs, gene_name)
            if all(i > 70 for i in idens):
                # merge overlapping (start, stop) intervals; if nothing was
                # merged, the sequences cover different regions
                current_start = -1
                current_stop = -1
                result = []
                for start, stop in sorted(start_stop):
                    if start > current_stop:
                        result.append((start, stop))
                        current_start, current_stop = start, stop
                    else:
                        current_stop = max(current_stop, stop)
                        result[-1] = (current_start, current_stop)
                if len(result) == len(start_stop):
                    # seqs align to different regions of probe, choose all
                    finalseqs.update(list_of_GIs)
                else:
                    # seqs overlap: print to file for hand checking
                    with open('these_seqs_overlap.txt', 'a') as a:
                        a.write(str(list_of_GIs) + '\n')
            else:
                # something is up with a sequence - queue it for blasting
                blast_dic_nums[list_of_GIs[0]] = len(list_of_GIs)
                blast_dic_tcids[list_of_GIs[0]] = tc_id
                seqs_to_blast.append(list_of_GIs)
                with open('seqs_to_be_blasted.txt', 'a') as a:
                    a.write(str(list_of_GIs) + '\n')

        if len(seqs_to_blast) > 0:
            print(
                "Blasting error seqeuences (seqs don't align together and one doesn't align to whole)"
            )
            seqs_to_blast_flat = [
                item for sublist in seqs_to_blast for item in sublist
            ]
            try:
                hits_all = blast_all(seqs_to_blast_flat, blast_dic_nums,
                                     blast_dic_tcids, c)
            except Exception:
                # one retry after a short pause
                time.sleep(5)
                hits_all = blast_all(seqs_to_blast_flat, blast_dic_nums,
                                     blast_dic_tcids, c)
            print("Parsing taxonomy for error sequences")
            for x, list_of_GIs in enumerate(seqs_to_blast):
                hits = hits_all[x]
                tc_id = blast_dic_tcids[list_of_GIs[0]]
                lowest = min(hits)
                if hits.count(lowest) == 1:
                    # a single lowest taxonomy hit wins, whether or not it
                    # is the sequence that was chosen before
                    finalseqs.add(list_of_GIs[hits.index(lowest)])
                    continue
                # there are multiple lowest taxonomy hits
                mult_GIs = [
                    list_of_GIs[pos] for pos, level in enumerate(hits)
                    if level == lowest
                ]
                GI_to_pick = resolve_seqs(mult_GIs)
                if len(GI_to_pick) == 1:
                    finalseqs.add(GI_to_pick[0])
                else:
                    # this only happens if the originally picked one is
                    # crappy and the rest are the same: go to cluster
                    # analysis
                    multseqs.append(error_dic[tc_id])

        print('length of resolved=' + str(len(finalseqs)))
        print('length of not resolved = ' + str(len(multseqs)))

        with open("final_GIs.txt", "a") as o:
            for m in finalseqs:
                o.write(str(m) + "\n")

        # to cluster
        with open("multiple_gene_choices.txt", "a") as o:
            for m in multseqs:
                o.write(str(m) + "\n")
    finally:
        conn.close()
Esempio n. 3
0
        print(identities)
        print('Multiple closest')
        problem_dic[i] = multiple_dic[i]
        GI_to_pick = [
            multiple_dic[i][m] for m, x in enumerate(identities)
            if x == max(identities)
        ]
        print(GI_to_pick)
        multfinalseqs.append(GI_to_pick)

# Check every two-sequence entry of two_dic: align the pair and fall back to
# other orientations while the identity stays below 95.
# NOTE(review): this snippet is an excerpt; the handling of the tiling result
# continues beyond what is shown here.
for i in two_dic:
    print(i)
    # align the two seqs as they are
    list_of_GIs = two_dic[i]
    print(list_of_GIs)
    alignment = alignment_reg(list_of_GIs)
    iden = identity_calc(alignment)
    if iden < 95:
        print("Low Aligned Identity: " + str(iden))
        # second attempt: alignment_rev_comp
        alignment = alignment_rev_comp(list_of_GIs)
        iden = identity_calc(alignment)
        if iden < 95:
            # third attempt: alignment_comp
            print("Low Reverse Complement Aligned Identity: " + str(iden))
            alignment = alignment_comp(list_of_GIs)
            iden = identity_calc(alignment)
            if iden < 95:
                print("Low Complement Aligned Identity: " + str(iden))
                # no orientation reached 95: hand the pair to tiling();
                # the gene name is everything after the first "_" of the key
                gene_name = '_'.join(i.split('_')[1:])
                idens, start_stop = tiling(list_of_GIs, gene_name)
Esempio n. 4
0
    genes.add(re.split('_|\|', i)[1])

count = 0
# Sort the COI entries of each key into length classes and, when a
# mitogenome-sized entry exists, align it to GU365907.
# NOTE(review): this snippet is an excerpt; the computation that follows the
# second "span = 0" is not shown here.
for i in dic_COI:
    count += 1
    # progress as a fraction of the keys processed so far
    print(float(count) / float(len(dic_COI)))
    # each entry is split on "_"; the second field is read as the length
    lengths = [int(m.split('_')[1]) for m in dic_COI[i]]
    # length classes: < 2000, between 2000 and 3000 (exclusive), > 3000;
    # lengths of exactly 2000 or 3000 fall into none of the three lists
    individual = [dic_COI[i][x] for x, l in enumerate(lengths) if l < 2000]
    whole = [
        dic_COI[i][x] for x, l in enumerate(lengths) if l > 2000 and l < 3000
    ]
    mito = [dic_COI[i][x] for x, l in enumerate(lengths) if l > 3000]
    if len(mito) > 0:
        # align the first > 3000 entry (its part before "_") to GU365907
        GIs_to_align = [mito[0].split("_")[0], 'GU365907']
        alignment = alignment_reg(GIs_to_align)
        iden = identity_calc(alignment)
        if iden > 80:
            #have to do span to account for random small blocks that dont align
            span = 0
            # get start: scan columns until 10 consecutive gap-free columns
            # have been seen; a gapped column resets the run
            for l in range(len(alignment[0])):
                col = alignment[:, l]
                if '-' not in col:
                    span += 1
                if span == 10:
                    break
                elif span > 0 and '-' in col:
                    span = 0
            # NOTE(review): the 10-column run ending at l begins at l - 9;
            # l - 8 presumably gives a 1-based start - confirm
            start = l - 8
            span = 0
Esempio n. 5
0
def _percent_identity(align):
    """Return the percent identity of *align* over its gap-free columns.

    A column counts as a match when it contains no "-" and its slices
    ``column[1:]`` and ``column[:-1]`` are equal. Columns containing a gap
    are excluded from the denominator. Returns 0.0 when every column is
    gapped instead of dividing by zero.
    """
    length = len(align[0])
    matches = 0
    gapped = 0
    for col in range(length):
        column = align[:, col]
        if "-" in column:
            gapped += 1
        elif column[1:] == column[:-1]:
            matches += 1
    ungapped = length - gapped
    if ungapped == 0:
        return 0.0
    return 100 * (matches / float(ungapped))


def _efetch_fasta(ids, delay=10):
    """Fetch FASTA text for *ids* from Entrez, retrying until it succeeds.

    Sleeps *delay* seconds between attempts. Only ``Exception`` subclasses
    are retried, so KeyboardInterrupt still stops the program.
    """
    while True:
        try:
            return Entrez.efetch(db="nucleotide",
                                 rettype="fasta",
                                 retmode="text",
                                 id=ids)
        except Exception:
            print('Error, trying again')
            time.sleep(delay)


def cluster(blastdb, taxdb):
    """Resolve the entries listed in ``multiple_gene_choices.txt``.

    Each line of the file is ``key<TAB>[GI, GI, ...]``. Entries with more
    than two GIs are aligned, a gap consensus is built, and the GI closest
    to the consensus is chosen; ties are kept as "choose multiple". Entries
    with exactly two GIs are aligned pairwise (plain, reverse complement,
    complement); pairs that never reach 95 identity go through tiling and,
    failing that, a taxonomy BLAST.

    Args:
        blastdb: path of the SQLite database to open.
        taxdb: path of the SQLite database attached under the name ``tax``.

    Side effects: appends to ``final_GIs.txt``, ``choose_mult.txt`` and
    ``these_seqs_overlap_cluster.txt`` in the current directory. The database
    connection is always closed.
    """
    Entrez.email = "*****@*****.**"
    conn = sqlite3.connect(blastdb)
    try:
        c = conn.cursor()
        c.execute("ATTACH DATABASE ? AS tax", (taxdb,))
        muscle_cline = MuscleCommandline(clwstrict=True)
        input_dic = {}
        multiple_dic = {}
        two_dic = {}
        finalseqs = set()
        multfinalseqs = []
        unresolved = []

        with open("multiple_gene_choices.txt") as o:
            for line in o:
                if not line.strip():
                    # a blank line used to raise IndexError
                    continue
                key = line.split("\t")[0]
                input_dic[key] = line.strip().split("\t")[1].replace(
                    "[", "").replace("]", "").replace("'", "")

        for i in input_dic:
            GIs_list = input_dic[i].split(", ")
            if len(GIs_list) > 2:
                multiple_dic[i] = GIs_list
            elif len(GIs_list) == 2:
                two_dic[i] = GIs_list

        for i in multiple_dic:
            gis = multiple_dic[i]
            # build the consensus of all sequences of this entry
            handle = _efetch_fasta(",".join(gis))
            seqs = SeqIO.parse(handle, "fasta")
            handle_string = StringIO()
            SeqIO.write(seqs, handle_string, "fasta")
            stdout, stderr = muscle_cline(stdin=handle_string.getvalue())
            align = AlignIO.read(StringIO(stdout), "clustal")
            summary_align = AlignInfo.SummaryInfo(align)
            consensus = summary_align.gap_consensus(threshold=.5,
                                                    ambiguous='N')
            consensus_record = SeqRecord(consensus, id="Consensus_all")

            identities = []
            for m in gis:
                # align each sequence to the consensus
                handle = _efetch_fasta(m)
                record = SeqIO.read(handle, "fasta")
                handle_string = StringIO()
                SeqIO.write(record, handle_string, "fasta")
                SeqIO.write(consensus_record, handle_string, "fasta")
                stdout, stderr = muscle_cline(stdin=handle_string.getvalue())
                pairwise = AlignIO.read(StringIO(stdout), "clustal")
                identities.append(_percent_identity(pairwise))

            best = max(identities)
            closest = [gi for gi, iden in zip(gis, identities) if iden == best]
            if len(closest) == 1:
                finalseqs.add(closest[0])
            else:
                multfinalseqs.append(closest)

        for i in two_dic:
            list_of_GIs = two_dic[i]
            # align the two seqs: plain, reverse complement, then
            # complement; the pair is fine at the first identity >= 95
            pair_ok = False
            for aligner in (alignment_reg, alignment_rev_comp,
                            alignment_comp):
                iden = identity_calc(aligner(list_of_GIs))
                if iden < 95:
                    continue
                pair_ok = True
                break
            if pair_ok:
                multfinalseqs.append(list_of_GIs)
                continue

            # add tiling thing
            gene_name = '_'.join(i.split('_')[1:])
            idens, start_stop = tiling(list_of_GIs, gene_name)
            if all(m > 70 for m in idens):
                # merge overlapping (start, stop) intervals; if nothing was
                # merged, the sequences cover different regions
                current_start = -1
                current_stop = -1
                result = []
                for start, stop in sorted(start_stop):
                    if start > current_stop:
                        result.append((start, stop))
                        current_start, current_stop = start, stop
                    else:
                        current_stop = max(current_stop, stop)
                        result[-1] = (current_start, current_stop)
                if len(result) == len(start_stop):
                    # seqs align to different regions of probe, choose all
                    multfinalseqs.append(list_of_GIs)
                else:
                    # seqs overlap: print to file for hand checking
                    with open('these_seqs_overlap_cluster.txt', 'a') as a:
                        unresolved.append(list_of_GIs)
                        a.write(str(list_of_GIs) + '\n')
            else:
                # get taxonomy for query(main species)
                print("Parsing taxonomy for error sequences")
                hits = blast(i, list_of_GIs, c)
                lowest = min(hits)
                if hits.count(lowest) == 1:
                    # only one lowest taxonomy hit: take that sequence
                    # (the index comes from hits; the name used here
                    # before, hit_levels, was never defined)
                    finalseqs.add(str(list_of_GIs[hits.index(lowest)]))
                else:
                    # there are multiple lowest taxonomy hits
                    multfinalseqs.append(list_of_GIs)

        print("length of resolved = " + str(len(finalseqs)))
        print("length of choose multiple = " + str(len(multfinalseqs)))
        print("length of unresolved = " + str(len(unresolved)))

        with open("final_GIs.txt", "a") as o:
            for m in finalseqs:
                o.write(str(m) + "\n")

        with open("choose_mult.txt", "a") as o:
            for m in [num for pair in multfinalseqs for num in pair]:
                o.write(str(m) + "\n")
    finally:
        conn.close()
Esempio n. 6
0
 print(tc_id)
 list_of_GIs = []
 for iter in c.execute("SELECT Gene_name FROM blast WHERE GI ='" +
                       error_dic[tc_id] + "'"):
     gene_name = str(iter[0])
 for iter in c.execute("SELECT GI FROM blast WHERE tc_id = '" + tc_id +
                       "' and Gene_name = '" + gene_name + "'"):
     list_of_GIs.append(str(iter[0]))
 first_GI = list_of_GIs[0]
 other_GIs = list_of_GIs[1:]
 pair_check = []
 #align each seq to the first one - first one acts as the 'default' direction
 for x, GI in enumerate(other_GIs):
     print("Pair #" + str(x + 1))
     GI_pair = [first_GI, GI]
     alignment = alignment_reg(GI_pair)
     iden = identity_calc(alignment)
     if iden < 90:
         print("Low Aligned Identity: " + str(iden))
         alignment = alignment_rev_comp(GI_pair)
         iden = identity_calc(alignment)
         if iden < 90:
             print("Low Reverse Complement Aligned Identity: " + str(iden))
             alignment = alignment_comp(GI_pair)
             iden = identity_calc(alignment)
             if iden < 90:
                 print("Low Complement Aligned Identity: " + str(iden))
                 pair_check.append(0)
             else:
                 pair_check.append(1)
                 print("Complement iden: " + str(iden) + " so pair is fine")
def test_resolved_seqs(infile, blastdb, taxdb):
    ent_query = get_blast_query(taxdb)
    import sqlite3, sys, time, subprocess
    from Bio.Blast import NCBIWWW, NCBIXML
    from blastlib.clean_seq_funcs import resolve_seqs, alignment_comp, alignment_reg, alignment_rev_comp, blast, blast_all, identity_calc, tiling
    from cleanlib.databasing import get_seqs_from_sqldb, export_fasta, create_blast_db, local_blast
    conn = sqlite3.connect(blastdb)
    c = conn.cursor()
    c.execute("ATTACH '" + taxdb + "' as 'tax'")
    error_dic = {}
    blast_dic_nums = {}
    blast_dic_tcids = {}
    seqs_to_blast = []
    finalseqs = set()
    multseqs = []
    #open self blast file for parsing
    #make dictionary of query species: query GI of those that don't have the top hit as the same species
    
    with open(infile) as p:
        print("Parsing reciprocal blast output from " + infile)
        blast_recs=NCBIXML.parse(p)
        count = 0
        numqueries = subprocess.check_output("grep -c '>' "+ infile.split(".xml")[0], shell=True)
        for rec in blast_recs:
            count += 1
            print(str(round(float(count)/float(numqueries)*100, 2))+ "%")
            queryAcc = str(rec.query.split()[0])
            #this broke when ncbi updated
            for iter in c.execute("SELECT GI FROM blast WHERE accession='" + queryAcc + "';"):
                queryGI = (str(iter[0]))
            #print(queryGI)
            #hitdic is GIs and idens of 20 in .xml for each rec in blast_rec
            hitdic = {}
            hitSp = set()
            count1 = 0
            for alignment in rec.alignments:
                    for hsp in alignment.hsps:
                        identity=float(hsp.identities)/float(hsp.align_length)
                        #print(identity)
                    if alignment.hit_def == queryAcc:
                        pass
                    else:
                        for iter in c.execute("SELECT GI FROM blast WHERE accession='" + alignment.hit_def + "'"):
                            hitGI = (str(iter[0]))
                        count1 += 1
                        hitdic[str(hitGI)] = identity
                        if count1 >= 20:
                            break
                # if alignment.title.split("|")[1] == queryGI:
                #     pass
                # else:
                #     hitdic[str(alignment.title.split("|")[1])] = identity
            
            if len(hitdic.values()) == 0:
                #####################chose next one#######################
                print('No matching hits from blast')
                c.execute("UPDATE blast SET Decision='Chosen, but then did not blast to anything/Not chosen' WHERE GI = '" + queryGI + "';")
                pass
            else:
                ##first looks at just the top iden value
                maxiden = max(hitdic.values())
                try:
                    hitGIs = [GI for GI, iden in hitdic.iteritems() if iden == maxiden] #python2
                except:
                    hitGIs = [GI for GI, iden in hitdic.items() if iden == maxiden] #python3
                for iter in c.execute("SELECT tc_id FROM blast WHERE GI='" + queryGI + "'"):
                    querySp = (str(iter[0]))
                for i in hitGIs:
                    for iter in c.execute("SELECT tc_id FROM blast WHERE GI='" + i + "'"):
                        hitSp.add(str(iter[0]))
                ##if it doesn't work, look at top 5 idens - ignore top hit b/c already looked at
                if querySp not in hitSp:
                    hitSp = set()
                    if len(hitdic.values()) > 5:
                        maxiden = sorted(set(hitdic.values()), reverse = True)[1:5]
                    else:
                        maxiden = hitdic.values()
                    try:
                        hitGIs = [GI for GI, iden in hitdic.iteritems() if iden in maxiden] #python2
                    except:
                        hitGIs = [GI for GI, iden in hitdic.items() if iden in maxiden] #python3
                    #get hit species (actually tc_ids)
                    for i in hitGIs:
                        for iter in c.execute("SELECT tc_id FROM blast WHERE GI='" + i + "'"):
                            hitSp.add(str(iter[0]))
                    #if species of query not in the list of hit species
                    if querySp not in hitSp:
                        #will overwrite tiling info
                        #c.execute("UPDATE blast SET decision='Chosen, but does not reciprocal blast' WHERE GI='" + queryGI + "';")
                        if querySp not in error_dic:
                            error_dic[querySp] = [queryGI]
                        else:
                            output = error_dic[querySp]
                            output.append(querySp)
                            error_dic[querySp] = output
                        
                    else:
                        c.execute("UPDATE blast SET decision='Longest or most info, good top hit/chosen' WHERE GI='" + queryGI + "';")
                else:
                    c.execute("UPDATE blast SET decision='Longest or most info, good top hit/chosen' WHERE GI='" + queryGI + "';")
    count = 0
    #error_dic is dictionary that has species = GI that doesn't match species when self blast
    # error_dic = {'81': [908332352], '56': '227435787', '60': '224939202', '39': '497153612', '157': '224939198', '87': '695134008', '628': '695134046', '184': '1174533403', '116': '558477133', '121': '316994106', '206': '545690375', '429': '1160414548', '431': '227436029', '480': '695134034', '105': '545690685', '651': '695134010', '413': '529377043', '373': '443611395', '252': '529388651', '354': '529386663', '335': '443611449', '584': '295078734', '375': '1559462858', '523': '1051545064', '350': '443611513', '410': '302487953', '527': '316994100', '244': '575502624', '355': '443611465', '595': '317467717', '380': '443611323', '357': '529374537', '577': '317467813', '363': '675401803', '406': '443611367', '372': '73533766', '656': '299833006'}
    ##go through error dictionary and align the 'same' gene/species to see if they're weird looking
    print("Aligning sequences where chosen does not reciprocal blast to another")
    for tc_id in error_dic:
        for tile in error_dic[tc_id]:
            count += 1
            print(str(round(float(count)/float(len(error_dic))*100, 2))+"%")
            list_of_GIs = [tile]
            for iter in c.execute("SELECT Gene_name, Decision FROM blast WHERE GI ='" + tile + "'"):
                gene = str(iter[0])
                tilenum = str(iter[1].split()[7])    
            for iter in c.execute("SELECT GI FROM blast WHERE Decision = 'Short or less info, tile "+tilenum+"/Not chosen' and tc_id = '"+tc_id+"' and Gene_name = '"+gene+"'"):
                list_of_GIs.append(str(iter[0]))
            #first is always querygi
            first_GI = list_of_GIs[0]
            other_GIs = list_of_GIs[1:]
            pair_check = []
            #align each seq to the first one - first one acts as the 'default' direction
            for x, GI in enumerate(other_GIs):
                GI_pair = [first_GI, GI]
                alignment = alignment_reg(GI_pair, blastdb, False, gene, c)
                iden = identity_calc(alignment)
                if iden < 90:
    #                print("Low Aligned Identity: " + str(iden))
                    alignment = alignment_rev_comp(GI_pair, blastdb, False, gene, c)
                    iden = identity_calc(alignment)
                    if iden < 90: 
    #                    print("Low Reverse Complement Aligned Identity: " + str(iden))
                        alignment = alignment_comp(GI_pair, blastdb, False, gene, c)
                        iden = identity_calc(alignment)
                        if iden < 90:
    #                        print("Low Complement Aligned Identity: " + str(iden))
                            pair_check.append(0)
                        else:
                            pair_check.append(1)
    #                        print("Complement iden: " + str(iden) + " so pair is fine")
                    else:
                        pair_check.append(1)
    #                    print("Reverse Complement iden: " + str(iden) + " so pair is fine")
                else:
                    pair_check.append(1)
    #                print("High Aligned Identity: " + str(iden) + " so pair is fine")
            #print(pair_check)
            #1 is when the GI aligned to chosen one just fine, 0 is when it doesn't
            if all(i == 1 for i in pair_check):
                c.execute("UPDATE blast SET decision='Sequence did not have same top blast species, but all aligned correctly, tile "+tilenum+"/Chosen' WHERE GI='" + tile + "';")
            else:
                ##either all are 0s if first one is wrong or one is zero where one is wrong
    #           print('Somethings up with a sequence - printing to check - will blast')
                seqs_to_blast.append(list_of_GIs)
                list_of_GIs_str = str(list_of_GIs).replace("[", "(").replace("]", ")")
                c.execute("UPDATE blast SET Decision='Chosen, but does not reciprocal blast and all do not align' WHERE GI IN " + list_of_GIs_str + ";") 
    # seqs_to_blast = [['695134046', '317467897'], ['316994106', '295069424', '414303142'], ['695134034', '71836048'], ['302487953', '73533700', '66096804'], ['575502624', '1723904125', '1723904119'], ['73533766', '443611441', '443611439'], ['296792260', '1531247074', '1531246852', '1531245174', '1531244766', '1531244542', '1531244412', '870902494', '520760133', '520760131', '331691381', '331690706', '331690580', '331690574', '331690568', '296467088', '296467084', '331691379', '331690792', '974999640', '974999638', '1531247630', '1531247410', '1531247308', '1531247180', '1531247130', '1531246686', '1531246650', '1531246450', '1531246148', '1531246132', '1531245926', '1531245702', '1531245624', '1531245592', '1531245462', '1531245454', '1531245440', '1531245354', '1531244888', '1531244834', '1531244804', '1531244664', '1531244614', '1531244586', '1531244178', '1531244132', '1531244016', '520760163', '520760159', '520760149', '331691355', '331690624', '331690576', '331690564', '331690544', '307641798', '296469132', '296468960', '296467194', '296466760', '304270861', '331690704', '63030162', '974999636', '227436371', '1532637164', '1532637082', '1532637024', '1531247684', '1531247306', '1531247040', '1531246466', '1531246208', '1531246108', '1531245848', '1531245842', '1531245638', '1531245614', '1531245574', '1531245376', '1531244734', '1531244658', '1531244080', '1531244012', '1531243910', '633896164', '520760155', '520759941', '331691337', '331690536', '300203102', '156619418', '156619410', '974999630', '296467200', '304270883', '1532637150', '1532637144', '1532637064', '1532637028', '1532637018', '730837173', '633896166', '520760157', '520760151', '331691341', '331691339', '331690582', '331690562', '1557923373', '1557923370', '156619416', '156619412', '1532637074', '1532637022', '1532637016', '156619414', '156619408', '730837157', '633896160', '1532637100', '1532637090', '1532637072', '1532637038', '633896162', '1042273875', '520760209', '313173207', '296468852', 
'88604889', '1532637050', '520760225', '520760219', '520760213', '296468054', '1532637044', '520760221', '520760193', '296792296', '312928999', '1557923367', '1557923364', '156619420', '331690820', '156619406', '520760211', '227436369', '296463378', '296463050', '227436365', '227436367', '296463374', '870901906', '870902242', '299833006', '76008949', '296464328', '296465368', '296464998', '296465182', '296464956', '296465198', '296463914']]
    ##sequences don't align together and chosen hit doesn't blast to same species (one area of tiling)
    if len(seqs_to_blast) > 0:
        for iter in c.execute("SELECT Gene_name FROM blast WHERE GI ='" + seqs_to_blast[0][0] + "'"):
            gene = str(iter[0])
        seqs_to_blast_flat = [item for sublist in seqs_to_blast for item in sublist]
        #queryGI: nearest tax rank of hit
        nearest_hit_taxo_dic = blast_all(seqs_to_blast_flat, c, gene, taxdb, blastdb)
        print("Parsing taxonomy for error sequences")
        for i in seqs_to_blast:
            hit_taxonomies = [nearest_hit_taxo_dic[x] for x in i]
            #want max tcid - that is closest to query
            maxtcid = max(hit_taxonomies)
            GI_to_pick = [GI for GI in enumerate(hit_taxonomies) if iden == maxtcid]
            if len(GI_to_pick) == 1:
                c.execute("UPDATE blast SET decision='Blast hits have closest taxonomy/Chosen' WHERE GI='" + GI_to_pick[0] + "';")
            else:
                GI_to_pick_str = str(GI_to_pick).replace("[", "(").replace("]", ")")
                c.execute("UPDATE blast SET Decision='To cluster analysis/Chosen' WHERE GI IN " + GI_to_pick_str + ";")    
                    
                    
    conn.commit()
    conn.close()
# Example no. 8
# 0
def _merged_span_length(ranges):
    """Return how many positions a collection of inclusive ranges covers.

    ``ranges`` is any iterable of ``(start, stop)`` pairs.  The pairs are
    visited in sorted order and a pair whose start is at or before the stop
    of the previous span is folded into it, so overlapping positions are
    counted only once.  An empty iterable yields 0.
    """
    merged = []
    for start, stop in sorted(ranges):
        if merged and start <= merged[-1][1]:
            merged[-1] = (merged[-1][0], max(merged[-1][1], stop))
        else:
            merged.append((start, stop))
    return sum(stop - start + 1 for start, stop in merged)


def _gap_free_run_end(alignment, column_indices, run_length=10):
    """Return the column index at which a run of gap-free columns completes.

    Columns are visited in the order given by ``column_indices``.  The run
    counter restarts whenever a column contains '-'.  If no run of
    ``run_length`` consecutive gap-free columns is found, the last index
    visited is returned (``None`` when there were no columns at all).

    Assumes ``alignment[:, idx]`` yields the characters of one column, as
    the original code relied on -- TODO confirm against alignment_reg.
    """
    span = 0
    col_idx = None
    for col_idx in column_indices:
        if '-' in alignment[:, col_idx]:
            span = 0
            continue
        span += 1
        if span == run_length:
            break
    return col_idx


def _fetch_labels(cursor, query):
    """Run ``query`` and return a set of 'tc_id_Gene|GI_hitlength' labels.

    The query must select exactly tc_id, Gene_name, GI and hit_length, in
    that order.
    """
    return set(
        str(row[0]) + "_" + str(row[1]) + "|" + str(row[2]) + "_" + str(row[3])
        for row in cursor.execute(query))


def resolve_seqs(blastdb):
    """Pick one sequence per taxon/gene from the ``blast`` table of *blastdb*.

    Rows are read from the sqlite database at path ``blastdb`` and split into
    taxon/gene pairs that have a single candidate GI and pairs that have
    several.  'COI_trnL_COII' rows get extra handling based on hit length:
    hits longer than 3000 are aligned against GU365907 and the matching
    region is downloaded through Entrez, hits between 2000 and 3000 are kept
    as whole sequences, and hits shorter than 2000 are tiled so that the
    fewest fragments covering more than 95% of the combined range are kept.
    Every remaining group of candidates is passed to
    ``blastlib.clean_seq_funcs.resolve_seqs``.

    Files written in the current working directory:
      * COI_hand_check.txt (appended) - taxon/gene keys needing manual review
      * mito_COI.fa - COI regions cut out of long (> 3000) hits
      * final_GIs.txt (appended) - GIs that were the only choice
      * <gene>_accession_nums_resolved.txt - GIs resolved to a single choice
      * multiple_gene_choices.txt - keys that still have several candidates

    Args:
        blastdb: path of the sqlite database holding the ``blast`` table.

    Returns:
        None.
    """
    import itertools
    import re
    import sqlite3
    from Bio import Entrez, SeqIO
    # The library helper has the same name as this function; alias it so the
    # call further down is unambiguous.
    from blastlib.clean_seq_funcs import resolve_seqs as pick_best_seqs
    from blastlib.clean_seq_funcs import alignment_reg, identity_calc, tiling

    # NOTE(review): this is a redacted placeholder address -- replace it with
    # a real contact e-mail before querying NCBI.
    Entrez.email = "*****@*****.**"
    # Labels look like 'tc_id_Gene|GI_hitlength'; split them on '_' or '|'.
    label_sep = re.compile(r'_|\|')

    # The database is only read here, so the connection is released as soon
    # as the four queries have been consumed (and also when one of them fails).
    conn = sqlite3.connect(blastdb)
    try:
        c = conn.cursor()
        # every taxon/gene/GI, whether or not the taxon/gene has several GIs
        GI_nums_all = _fetch_labels(
            c,
            "SELECT tc_id, Gene_name, GI, hit_length FROM blast WHERE tc_id NOT NULL AND Gene_name != 'COI_trnL_COII' GROUP BY tc_id, Gene_name, GI;"
        )
        # taxon/gene pairs that have exactly one GI
        GI_nums_single = _fetch_labels(
            c,
            "SELECT tc_id, Gene_name, GI, hit_length FROM blast WHERE tc_id NOT NULL AND Gene_name != 'COI_trnL_COII' GROUP BY tc_id, Gene_name HAVING COUNT(*) =1;"
        )
        # the same two queries restricted to COI
        GI_nums_all_COI = _fetch_labels(
            c,
            "SELECT tc_id, Gene_name, GI, hit_length FROM blast WHERE tc_id NOT NULL AND Gene_name == 'COI_trnL_COII' GROUP BY tc_id, Gene_name, GI;"
        )
        GI_nums_single_COI = _fetch_labels(
            c,
            "SELECT tc_id, Gene_name, GI, hit_length FROM blast WHERE tc_id NOT NULL AND Gene_name = 'COI_trnL_COII' GROUP BY tc_id HAVING COUNT(*) =1;"
        )
    finally:
        conn.close()

    # labels whose taxon/gene has more than one candidate GI
    GI_nums = GI_nums_all - GI_nums_single
    GI_nums_mult_COI = GI_nums_all_COI - GI_nums_single_COI
    # A lone COI hit longer than 3000 still has to go through the COI
    # handling below so the COI region can be cut out of it.
    # NOTE(review): lone COI hits of length <= 3000 are not added to any
    # collection used later in this function -- confirm that is intended.
    for label in GI_nums_single_COI:
        if int(label.split("_")[-1]) > 3000:
            GI_nums_mult_COI.add(label)

    # Group the 'GI_hitlength' parts by their 'tc_id_Gene' key.
    dic = {}
    dic_COI = {}
    genes = set()
    for labels, grouped in ((GI_nums, dic), (GI_nums_mult_COI, dic_COI)):
        for label in labels:
            parts = label.split("|")
            grouped.setdefault(parts[0], []).append(parts[1])
            # Second token of the label; for 'COI_trnL_COII' this is 'COI'.
            genes.add(label_sep.split(label)[1])

    records = []
    count = 0
    # deal with lengths of COI and add to dic to try and resolve
    print("Trying to resolve COI/COII sequences")
    for i in dic_COI:
        count += 1
        print(str(round(float(count) / float(len(dic_COI)) * 100, 2)) + '%')
        sized = [(entry, int(entry.split('_')[1])) for entry in dic_COI[i]]
        # NOTE(review): lengths of exactly 2000 or 3000 fall in no category.
        individual = [entry for entry, length in sized if length < 2000]
        whole = [entry for entry, length in sized if 2000 < length < 3000]
        mito = [entry for entry, length in sized if length > 3000]

        if mito:
            mito_gi = mito[0].split("_")[0]
            alignment = alignment_reg([mito_gi, 'GU365907'])
            iden = identity_calc(alignment)
            if iden > 80:
                # Look for 10 consecutive gap-free columns from each end so
                # that small stray aligned blocks are ignored.
                n_cols = len(alignment[0])
                start = _gap_free_run_end(alignment, range(n_cols)) - 8
                end = _gap_free_run_end(alignment,
                                        reversed(range(n_cols))) + 10
                # NOTE(review): the -8/+10 and -1/-2 offsets are kept exactly
                # as they were; their derivation is not visible here.
                handle = Entrez.efetch(db="nucleotide",
                                       rettype="fasta",
                                       retmode="text",
                                       id=mito_gi,
                                       seq_start=start - 1,
                                       seq_stop=end - 2)
                try:
                    records.append(SeqIO.read(handle, "fasta"))
                finally:
                    handle.close()
            else:
                print('Low iden when matching COI to whole mito')
                with open("COI_hand_check.txt", "a") as a:
                    a.write(i + '\n')
        elif whole:
            if len(whole) == 1:
                # pick whole
                GI_nums_single.add(i + "|" + whole[0])
            else:
                # to pipeline
                dic[i] = whole

        # Short fragments are handled regardless of the branch taken above.
        if not individual:
            continue
        if len(individual) == 1:
            # pick individual
            GI_nums_single.add(i + "|" + individual[0])
            continue

        # Tile every fragment; ranges maps (start, end) -> GIs with that span.
        ranges = {}
        for m in [x.split('_')[0] for x in individual]:
            iden, start_stop = tiling([m], 'COI_trnL_COII')
            start, end = start_stop[0]
            if iden > 70:
                ranges.setdefault((start, end), []).append(m)
            else:
                print('Alignment below 70')
                # Report the GI that was just tiled (the previous code
                # printed a variable that belongs to the mito branch).
                print(m)
        if not ranges:
            print('All alignments below 70, printing to file')
            with open("COI_hand_check.txt", "a") as a:
                a.write(i + '\n')
            # Nothing to tile for this taxon, so move on to the next one.
            continue

        whole_length = _merged_span_length(ranges)
        # Try combinations from the fewest fragments upwards and stop at the
        # first size whose best combination covers more than 95% of the
        # merged range.  All combinations of a size are scored before
        # stopping, so ties at the best percentage are all kept.
        for level in range(1, len(ranges) + 1):
            max_perc = 0
            max_subset = set()
            for subset in itertools.combinations(ranges.keys(), level):
                comb_length = _merged_span_length(subset)
                if whole_length < comb_length:
                    continue
                perc = float(comb_length) / float(whole_length)
                if perc > max_perc:
                    max_perc = perc
                    max_subset = {subset}
                elif perc == max_perc:
                    max_subset.add(subset)
            if max_perc > .95:
                break

        # For each position in the tiling keep the longest fragment seen in
        # any of the tied best combinations.  Slots start as None so that a
        # very short fragment can never leave a placeholder behind.
        final_tiling = [None] * level
        for combination in max_subset:
            for pos, frag in enumerate(sorted(combination)):
                best = final_tiling[pos]
                if best is None or frag[1] - frag[0] > best[1] - best[0]:
                    final_tiling[pos] = frag
        possible_GIs = [ranges[frag] for frag in final_tiling]

        # Separate counter: reusing `count` here would corrupt the progress
        # figure printed at the top of this loop.
        tile_index = 0
        for gis in possible_GIs:
            if len(gis) == 1:
                # '_0' stands in for the hit length to keep the label format.
                GI_nums_single.add(i + "|" + gis[0] + "_0")
            else:
                # First unresolved tile keeps the plain key, later tiles get
                # a numeric suffix.
                if tile_index == 0:
                    dic[i] = gis
                else:
                    dic[i + "_" + str(tile_index)] = gis
                tile_index += 1

    print(dic)
    SeqIO.write(records, "mito_COI.fa", "fasta")

    # Hand every group of candidates to the library resolver; the '_length'
    # suffix, when present, is stripped first.
    print("Trying to resolve all other sequences")
    count = 0
    for i in dic:
        dic[i] = pick_best_seqs([n.split("_")[0] for n in dic[i]])
        count += 1
        print(str(round((float(count) / float(len(dic))) * 100, 2)) + "%")

    # Split keys that are now resolved from keys that still have several
    # candidates.
    dic_single = {}
    dic_mult = {}
    for i in dic:
        if len(dic[i]) > 1:
            dic_mult[i] = dic[i]
        else:
            dic_single[i] = dic[i]

    for i in genes:
        finalGInums_only1 = set()
        finalGInums_longest = set()
        for n in dic_single:
            if i == label_sep.split(n)[1]:
                finalGInums_longest.add(''.join(dic_single[n]))
        for n in GI_nums_single:
            tokens = label_sep.split(n)
            if i == tokens[1]:
                # second-to-last token is the GI, the last is the hit length
                finalGInums_only1.add(tokens[-2])
        with open("final_GIs.txt", "a") as o:
            for m in finalGInums_only1:
                o.write(str(m) + "\n")
        # this needs to go to blast_sp_parse.py
        with open(i + "_accession_nums_resolved.txt", "w") as o:
            for m in finalGInums_longest:
                o.write(str(m) + "\n")
    # this needs to go to cluster.py
    with open("multiple_gene_choices.txt", "w") as w:
        for i in dic_mult:
            w.write(i + "\t" + str(dic_mult[i]) + "\n")