Example #1
0
def alignment_reg(align_GIs, blastdb, qseqbool, gene, c):
    """Fetch the sequences for ``align_GIs`` and return their MUSCLE alignment.

    Args:
        align_GIs: sequence of GI identifiers to align.
        blastdb: passed through unchanged to ``get_seqs_from_sqldb_GI``.
        qseqbool: when truthy, the first GI is fetched with the ``"qseq"``
            selector and the remaining ones with ``"hseq"``; when falsy every
            GI is fetched with ``"hseq"``.
        gene: passed through unchanged to ``get_seqs_from_sqldb_GI``.
        c: passed through unchanged to ``get_seqs_from_sqldb_GI``
            (presumably a database cursor -- confirm against the helper).

    Returns:
        The object produced by ``AlignIO.read(..., "clustal")`` on the
        standard output of ``muscle_cline``.
    """
    # Truthiness instead of ``== True`` / ``== False``: the original left the
    # FASTA buffer unbound (UnboundLocalError) for any value that compared
    # equal to neither, e.g. None.
    if qseqbool:
        # The first fetch is fully consumed before the second one is started,
        # matching the original call order.
        seqs = list(
            get_seqs_from_sqldb_GI(align_GIs[:1], "qseq", blastdb, gene, c))
        seqs.extend(
            get_seqs_from_sqldb_GI(align_GIs[1:], "hseq", blastdb, gene, c))
    else:
        seqs = list(
            get_seqs_from_sqldb_GI(align_GIs, "hseq", blastdb, gene, c))

    # Serialise to FASTA in memory and feed it to MUSCLE on stdin.
    handle_string = StringIO()
    SeqIO.write(seqs, handle_string, "fasta")
    stdout, _stderr = muscle_cline(stdin=handle_string.getvalue())

    # NOTE(review): parsing as "clustal" assumes muscle_cline is configured
    # for Clustal-format output -- confirm where muscle_cline is defined.
    return AlignIO.read(StringIO(stdout), "clustal")
Example #2
0
def resolve_seqs(list_of_GIs, blastdb, gene, c):
    """Pick the most informative sequence(s) among ``list_of_GIs``.

    Each sequence is scored by its number of unambiguous bases (upper-case
    A, T, C, G) and by the ``hit_length`` stored for its accession in the
    ``blast`` table.  Selection rules, in order:

    1. a unique maximum of the base count wins;
    2. otherwise a unique maximum of ``hit_length`` wins;
    3. otherwise every accession tied for the highest
       ``base count + hit_length`` is kept.

    Args:
        list_of_GIs: GI identifiers to compare.
        blastdb: passed through unchanged to ``get_seqs_from_sqldb_GI``.
        gene: passed through unchanged to ``get_seqs_from_sqldb_GI``.
        c: cursor used for the ``blast`` table lookups; it must accept
            ``?`` placeholders (sqlite3 style).

    Returns:
        List of GI strings read from ``blast`` for the selected accession(s).
        An empty list is returned when there is nothing to choose from
        (the original raised ``ValueError`` from ``max()`` in that case).
    """
    if not list_of_GIs:
        return []

    list_of_accs = []
    amount_DNA = []
    for seq_record in get_seqs_from_sqldb_GI(list_of_GIs, "hseq", blastdb,
                                             gene, c):
        list_of_accs.append(seq_record.id)
        amount_DNA.append(
            sum(seq_record.seq.count(base) for base in "ATCG"))
    if not list_of_accs:
        return []

    # Exactly one length per accession so that length_DNA stays index-aligned
    # with list_of_accs.  The original appended one entry per matching row,
    # which shifted the indices whenever an accession had zero or several
    # rows.  The first row is used; a missing row counts as length 0.
    length_DNA = []
    for acc in list_of_accs:
        hit_length = 0
        for row in c.execute(
                "SELECT hit_length FROM blast WHERE accession = ?;", (acc, )):
            hit_length = int(row[0])
            break
        length_DNA.append(hit_length)

    most_bases = max(amount_DNA)
    longest = max(length_DNA)
    if amount_DNA.count(most_bases) == 1:
        acc_to_pick = [list_of_accs[amount_DNA.index(most_bases)]]
    elif length_DNA.count(longest) == 1:
        acc_to_pick = [list_of_accs[length_DNA.index(longest)]]
    else:
        # Both criteria are tied: fall back to the combined score and keep
        # every accession that reaches the best total.
        sums = [bases + length
                for bases, length in zip(amount_DNA, length_DNA)]
        best_total = max(sums)
        acc_to_pick = [
            acc for acc, total in zip(list_of_accs, sums)
            if total == best_total
        ]

    GI_to_pick = []
    for acc in acc_to_pick:
        for row in c.execute("SELECT GI from blast where accession = ?;",
                             (acc, )):
            GI_to_pick.append(str(row[0]))
    return GI_to_pick
def pullseqs(blastdb, email, max_retries=None):
    """Write one ``<gene>_cleaned.fa`` FASTA file per gene with the chosen seqs.

    For every gene found among the "chosen" rows of the ``blast`` table:

    * GIs whose ``Decision`` marks them as chosen are downloaded from the
      NCBI ``nucleotide`` database through ``Entrez.efetch`` in batches;
    * for rows marked ``'Pick one randomly/Chosen'`` one GI is drawn at
      random per ``tc_id`` and downloaded as well;
    * GIs marked ``'Mito or chloro sequence/Chosen'`` are read through
      ``get_seqs_from_sqldb_GI`` instead of being downloaded.

    Args:
        blastdb: path of the sqlite database holding the ``blast`` table.
        email: address assigned to ``Entrez.email``.
        max_retries: how many times a failed download is retried before the
            error is re-raised.  ``None`` (the default) retries without
            limit, which is what this function has always done.

    Returns:
        None.  The result is the set of FASTA files written to the current
        working directory.
    """
    from Bio import Entrez, SeqIO
    import sqlite3
    import time
    from random import choice
    from cleanlib.databasing import get_seqs_from_sqldb_GI

    batch_size = 200  # number of ids sent per efetch request

    def _efetch_with_retry(id_string):
        """Return an efetch handle for ``id_string``, retrying after errors."""
        failures = 0
        while True:
            try:
                return Entrez.efetch(db="nucleotide",
                                     rettype="fasta",
                                     retmode="text",
                                     id=id_string)
            except Exception:
                # ``Exception`` rather than a bare except, so Ctrl-C and
                # SystemExit are no longer swallowed by the retry loop.
                failures += 1
                if max_retries is not None and failures > max_retries:
                    raise
                print("Error, trying again")
                time.sleep(5)

    conn = sqlite3.connect(blastdb)
    try:
        c = conn.cursor()
        GI_gene_dic = {}
        mitochloro_gene_dic = {}
        genes = set()
        Entrez.email = email

        # Decision values seen in the blast table:
        # Ambiguous species/not chosen
        # Better tiling/Not chosen
        # Closest to consensus in cluster analysis/Chosen
        # Further from consensus in cluster analysis/Not chosen
        # Longest or most info, good top hit/chosen
        # Mito or chloro sequence/Chosen
        # Only choice/chosen
        # Only or best choice in tiling analysis/chosen
        # Pick one randomly/Chosen
        # Sequence did not have same top blast species, but all aligned correctly, tile 1/Chosen
        # Short or less info, tile 1/Not chosen
        # Short or less info/Not chosen
        # Species not in taxonomy/not chosen

        # Regular chosen sequences: GI -> gene name.
        for gi, gene_name in c.execute(
                "SELECT GI, Gene_name FROM blast WHERE Decision IN ("
                "'Closest to consensus in cluster analysis/Chosen', "
                "'Longest or most info, good top hit/chosen', "
                "'Only choice/chosen', "
                "'Only or best choice in tiling analysis/chosen') "
                "OR Decision LIKE 'Sequence did not have same top blast "
                "species, but all aligned correctly%'"):
            GI_gene_dic[str(gi)] = gene_name
            genes.add(gene_name)

        # Mitochondrial / chloroplast sequences: GI -> gene name.
        for gi, gene_name in c.execute(
                "SELECT GI, Gene_name FROM blast WHERE Decision IN "
                "('Mito or chloro sequence/Chosen')"):
            mitochloro_gene_dic[str(gi)] = gene_name
            genes.add(gene_name)

        for gene in genes:
            records = []
            seqids = [gi for gi in GI_gene_dic if GI_gene_dic[gi] == gene]

            # Group the "pick one randomly" candidates by tc_id ...
            pick_random_dic = {}
            for tc_id, gi in c.execute(
                    "SELECT tc_id, GI FROM blast WHERE Decision IN "
                    "('Pick one randomly/Chosen') AND Gene_name = ? "
                    "ORDER BY tc_id", (gene, )):
                pick_random_dic.setdefault(tc_id, []).append(gi)
            # ... and draw one GI from each group.
            for candidates in pick_random_dic.values():
                seqids.append(str(choice(candidates)))

            # Download in batches of ``batch_size`` ids per request.
            for start in range(0, len(seqids), batch_size):
                seqids_sub = ",".join(seqids[start:start + batch_size])
                handle = _efetch_with_retry(seqids_sub)
                try:
                    records.extend(SeqIO.parse(handle, "fasta"))
                finally:
                    handle.close()

            # Mito/chloro sequences come from the local database helper.
            GI_mito_GI = [
                gi for gi in mitochloro_gene_dic
                if mitochloro_gene_dic[gi] == gene
            ]
            if GI_mito_GI:
                records.extend(
                    get_seqs_from_sqldb_GI(GI_mito_GI, "hseq", blastdb, gene,
                                           c))

            SeqIO.write(records, gene + "_cleaned.fa", "fasta")
    finally:
        # Close the connection even when a query or download fails.
        conn.close()
Example #4
0
def blast_all(blast_list, c, gene, taxdb, blastdb):
    """Blast ``blast_list`` locally and find, per query, the closest taxon
    concept it shares with the ancestors of its top hits.

    Steps:

    1. The sequences for ``blast_list`` are exported to ``error_seqs.fa`` and
       ``local_blast`` is run against ``<gene>_db.fa``.  The results are then
       read from ``error_seqs.fa.xml`` (presumably written by
       ``local_blast`` -- confirm in that helper).
    2. For every blast record the query accession is mapped to its GI and
       ``tc_id`` through the ``blast`` table, and its lineage is built by
       following ``parent_id`` in ``taxon_concepts`` until 0 is reached.
    3. Up to five alignments whose ``hit_def`` differs from the query
       accession are mapped to GIs; the ancestors of each hit's ``tc_id``
       are collected.
    4. The first entry of the query lineage that is also an ancestor of a hit
       is stored under the query GI.

    Differences from the previous implementation (all bug fixes):

    * the hit dictionary is reset for every query, so hits of earlier
      queries no longer leak into later ones;
    * a query or hit accession missing from ``blast`` is skipped instead of
      silently reusing the values of the previous iteration;
    * a ``tc_id`` without a row in ``taxon_concepts`` (or a parent cycle)
      ends the lineage walk instead of looping forever;
    * result sets are read completely before the cursor is reused;
    * SQL values are bound as ``?`` parameters (sqlite3 style); they are
      still bound as strings, like the quoted literals used before.

    Args:
        blast_list: GIs of the sequences to blast.
        c: cursor giving access to the ``blast``, ``taxon_concepts`` and
            ``ranks`` tables.
        gene: gene name; selects the ``<gene>_db.fa`` blast database.
        taxdb: unused by this function.
        blastdb: passed through unchanged to ``get_seqs_from_sqldb_GI``.

    Returns:
        dict mapping query GI (str) to the matched lineage entry.  Queries
        without a usable hit have no entry.
    """
    parent_sql = (
        "SELECT tc.parent_id FROM taxon_concepts tc, ranks r WHERE tc_id = ?"
        " AND tc.rank_id = r.rank_id")

    def _lineage(tc_id):
        """Return ``[int(tc_id), parent, grandparent, ...]`` up to 0."""
        lineage = [int(tc_id)]
        current = tc_id
        visited = {str(tc_id)}
        while 0 not in lineage:
            parents = [
                row[0] for row in c.execute(parent_sql, (str(current), ))
            ]
            if not parents:
                break  # dangling tc_id: stop rather than spin forever
            lineage.extend(parents)
            current = parents[-1]
            if str(current) in visited:
                break  # cycle in parent links
            visited.add(str(current))
        return lineage

    hit_levels_return = {}
    print(
        "Blasting error sequences (seqs do not align together and the one picked does not blast to other)"
    )
    iterator = get_seqs_from_sqldb_GI(blast_list, "hseq", blastdb, gene, c)
    export_fasta(iterator, "error_seqs.fa")
    local_blast(gene + "_db.fa", "error_seqs.fa")

    with open("error_seqs.fa.xml") as p:
        print("Parsing blast output")
        for count, rec in enumerate(NCBIXML.parse(p), start=1):
            print(
                str(round(float(count) / float(len(blast_list)) * 100, 2)) +
                "%")

            # Blast output no longer carries GI numbers, so map the query
            # accession back to its GI and tc_id through the blast table.
            queryacc = str(rec.query.split()[0])
            query_rows = [
                (row[0], row[1]) for row in c.execute(
                    "SELECT GI, tc_id FROM blast WHERE accession = ?",
                    (queryacc, ))
            ]
            if not query_rows:
                print("Skipping " + queryacc +
                      ": accession not found in blast table")
                continue
            # As before, the last matching row wins.
            rec_GI = str(query_rows[-1][0])
            sp_tc_id = str(query_rows[-1][1])
            query_taxonomy = _lineage(sp_tc_id)

            # hitdic is GI -> identity for the top hits of THIS query.
            hitdic = {}
            hits_seen = 0
            for alignment in rec.alignments:
                if alignment.hit_def == queryacc:
                    continue  # ignore the query hitting itself
                hits_seen += 1
                identity = None
                for hsp in alignment.hsps:
                    # the value of the last HSP is kept, as before
                    identity = float(hsp.identities) / float(hsp.align_length)
                hit_rows = [
                    row[0] for row in c.execute(
                        "SELECT GI FROM blast WHERE accession = ?",
                        (alignment.hit_def, ))
                ]
                if hit_rows:
                    hitdic[str(hit_rows[-1])] = identity
                if hits_seen >= 5:
                    break

            # Collect the ancestors of every top hit.
            # NOTE(review): as in the original, a hit's own tc_id is NOT
            # added, only its parents -- so a same-species hit matches the
            # query at the parent level at best.  Confirm this is intended.
            all_hits_taxonomy = set()
            for hitGI in hitdic:
                hit_tc_ids = [
                    row[0] for row in c.execute(
                        "SELECT tc_id FROM blast WHERE GI = ?", (hitGI, ))
                ]
                for hit_tc_id in hit_tc_ids:
                    all_hits_taxonomy.update(_lineage(hit_tc_id)[1:])

            # The query lineage runs from most specific to root, so the
            # first shared entry is the closest shared level.
            for rank in query_taxonomy:
                if rank in all_hits_taxonomy:
                    hit_levels_return[rec_GI] = rank
                    break
    return hit_levels_return