Exemple #1
0
def detect_mutations(input: str, db: str, out: str):
    """Run ``blastx`` for ``input`` against ``db`` and write mutations to ``out``.

    For every query record in the BLAST XML output, the first HSP of each
    alignment is scanned column by column. A column is reported when the
    query residue differs from the subject residue and the query residue is
    not one of ``X``, ``-`` or ``*``. Each query produces one line in
    ``out``::

        <query>;<label> <subject residue><position><query residue>,...

    where ``<label>`` is the part of the alignment title before the first
    ``|``.

    Args:
        input: Path of the query file handed to ``blastx -query``.
        db: Path of the subject file handed to ``blastx -subject``.
        out: Path of the report file to (over)write.

    Raises:
        subprocess.CalledProcessError: If ``blastx`` exits with a non-zero
            status.
        FileNotFoundError: If the ``blastx`` executable cannot be found.
    """
    # Imported locally so the block does not depend on file-level imports
    # that may not exist.
    import subprocess
    import tempfile

    ignored_query_residues = {'X', '-', '*'}

    # A private temporary directory avoids clashes between concurrent runs
    # and is removed even when parsing fails.
    with tempfile.TemporaryDirectory() as workdir:
        blast_xml = os.path.join(workdir, "blast.xml")
        with open(blast_xml, "w") as xml_out:
            # Argument list instead of a shell string: paths containing
            # spaces or shell metacharacters are passed through verbatim.
            subprocess.run(
                [
                    "blastx", "-query", input, "-subject", db,
                    "-subject_besthit", "-max_hsps", "1", "-outfmt", "5",
                ],
                stdout=xml_out,
                check=True,
            )

        with open(blast_xml, "r") as f, open(out, "w") as fh:
            for p in parse(f):
                mutations = []
                for alignment in p.alignments:
                    hsp = alignment.hsps[0]
                    query = hsp.query
                    subject = hsp.sbjct
                    if len(query) != len(subject):
                        continue
                    label = alignment.title.split('|')[0]
                    # Subject coordinate of the current column. It only
                    # advances on real subject residues, so gaps in the
                    # subject do not shift the reported positions.
                    position = hsp.sbjct_start
                    for q_res, s_res in zip(query, subject):
                        if (q_res not in ignored_query_residues
                                and q_res != s_res):
                            mutations.append(
                                f"{label} {s_res}{position}{q_res}")
                        if s_res != '-':
                            position += 1
                fh.write(f"{p.query};")
                fh.write(','.join(mutations))
                fh.write("\n")
def create_blast_dict():
    """Build the homolog list from the cached BLAST XML and pickle it.

    Reads the BLAST XML file at the module-level path ``blast_cache`` and,
    for every HSP of every alignment of every record, collects a dict with:

    * ``"accession"``: the alignment accession,
    * ``"identity"``: identities / alignment length, in percent (2 decimals),
    * ``"coverage"``: alignment length / query length, in percent (2 decimals).

    The resulting list is pickled to the module-level path ``dict_cache``.
    Homologs of all records are kept, and an empty list is written when the
    cache holds no records.
    """
    homologs = []
    with open(blast_cache, mode="r") as f:
        for record in parse(f):
            homologs.extend({
                "accession":
                alignment.accession,
                "identity":
                round((hsp.identities / hsp.align_length) * 100, 2),
                "coverage":
                round((hsp.align_length / record.query_length) * 100, 2)
            } for alignment in record.alignments for hsp in alignment.hsps)

    # The input handle is closed before the output is opened.
    with open(dict_cache, mode="wb") as cache_file:
        pickle.dump(homologs, cache_file)
    print("cached the homologs dictionary")
Exemple #3
0
def calc_immutability(consensus_filename, protein_seqs_filename, CUTOFF):
    """Score each consensus nucleotide and mask those scoring at or above CUTOFF.

    The proteins in ``protein_seqs_filename`` are aligned to the first record
    of ``consensus_filename`` with a local ``tblastn`` run whose XML output is
    written to ``results.xml``. For each query, the first HSP of the first
    alignment is walked one subject character at a time, three nucleotides
    per character. Every nucleotide of a codon that has not been scored yet
    gets the sum, over all substitutions listed in ``SUBSTITUTION_MODEL`` for
    that nucleotide, of ``aa_score_method(original aa, new aa)`` multiplied
    by the substitution weight. Substitutions whose codon maps to ``"_"`` in
    ``codon_table`` are ignored.

    A textual overview (ruler, translation, consensus, shading row, masked
    sequence) and a summary report are printed to stdout.

    Args:
        consensus_filename: FASTA file; only its first record is used.
        protein_seqs_filename: Protein FASTA file used as the tblastn query.
        CUTOFF: Nucleotides with a score below this value are kept in the
            returned sequence; all others become ``"N"``.

    Returns:
        A tuple ``(immutable_seq, mutability_score)``: the masked consensus
        string and the per-nucleotide score list (positions never covered by
        an alignment keep ``UNCONSERVED_SCORE``).
    """
    out = NcbitblastnCommandline(subject=consensus_filename,
                                 query=protein_seqs_filename,
                                 out="results.xml",
                                 outfmt=5)  # evalue=?
    # NCBI tblastn is run locally to align proteins to the consensus sequence
    out()

    # str() is the public way to get the sequence text. The private
    # ``_data`` attribute is bytes in newer Biopython releases, which breaks
    # the codon lookups and string concatenation below.
    consensus_seq = str(next(SeqIO.parse(consensus_filename, "fasta")).seq)
    seq_len = len(consensus_seq)

    # Close the XML handle once parsed. Queries without any hit are skipped
    # instead of raising IndexError.
    with open(out.out, "r") as blast_handle:
        protein_alignments = [
            record.alignments[0].hsps[0] for record in parse(blast_handle)
            if record.alignments and record.alignments[0].hsps
        ]

    mutability_score = [UNCONSERVED_SCORE] * seq_len
    for alignment in protein_alignments:
        position = alignment.sbjct_start - 1

        # NOTE(review): every character of alignment.sbjct, including any
        # gap character, advances ``position`` by three nucleotides, and
        # sbjct_start is used as a forward-strand offset. Confirm this is
        # intended for gapped or reverse-frame hits.
        for aa in alignment.sbjct:
            assert aa != " ", "empty aa in alignment subject"

            if aa != "X":
                codon0 = consensus_seq[position:position + 3]
                try:
                    aa0 = codon_table[codon0]
                except Exception:
                    # We do not calculate if there is an ambiguous nucleotide
                    # in the consensus codon. Scoring the other nucleotides
                    # of the codon would need a more detailed calculation.
                    position += 3
                    continue

                for pos_in_codon in range(3):
                    site = position + pos_in_codon
                    if mutability_score[site] != UNCONSERVED_SCORE:
                        # Already processed by a previous alignment covering
                        # this position; do not score it twice.
                        continue

                    mutability_score[site] = 0
                    nuc_model = SUBSTITUTION_MODEL[codon0[pos_in_codon]]

                    for putative_nuc in nuc_model.keys():
                        codonX = replace(codon0, putative_nuc, pos_in_codon)
                        aaX = codon_table[codonX]

                        # Self conversion (aa0 == aaX) is deliberately
                        # included, as both matrices include it in their
                        # calculations.
                        if aaX != "_":
                            try:
                                aa_score = aa_score_method(aa0, aaX)
                            except Exception:
                                # pam30 is a triangular matrix, so we check
                                # the reversed pair.
                                aa_score = aa_score_method(aaX, aa0)
                            subs_score = nuc_model[putative_nuc]
                            mutability_score[site] += aa_score * subs_score
            position += 3

    print("\nconsevation scores complete:")

    # Scores of positions that were actually scored (None marks the rest
    # when UNCONSERVED_SCORE is None).
    scored = [x for x in mutability_score if x is not None]
    standart_dev = std(scored)
    average = mean(scored)

    # Ruler: a number every 100 nucleotides, padded to 100 columns.
    for i in range(0, seq_len, 100):
        print(i, end=" " * (100 - len(str(i))))
    print()
    # Tick marks under the ruler numbers.
    print("".join("|" if i % 100 == 0 else " " for i in range(seq_len)))

    # TODO: automatically find where protein starts, add all one by one.
    # The code below assumes everything has ORF +1. It would not work for
    # all cases.
    prot_cells = []
    for i in range(2, seq_len, 3):
        d = "   "
        if mutability_score[i] is not None:
            try:
                d = " " + codon_table[consensus_seq[i:i + 3]] + " "
            except Exception:
                # Untranslatable codon: leave the cell blank.
                pass
        prot_cells.append(d)
    print("  " + "".join(prot_cells))

    print(consensus_seq)

    # Shading row: darker blocks mean lower scores relative to the mean.
    shading = []
    for score in mutability_score:
        d = "_"
        if score is not None:
            d = " "
            if score < average + standart_dev * 2:
                d = "░"
            if score < average + standart_dev:
                d = "▒"
            if score < average - standart_dev:
                d = "▓"
            if score < average - standart_dev * 2:
                d = "█"
        shading.append(d)
    print("".join(shading))

    immutable_seq = "".join(
        nuc if score is not None and score < CUTOFF else "N"
        for nuc, score in zip(consensus_seq, mutability_score))
    # No trailing newline here: the report header below starts with one.
    print(immutable_seq, end="")

    print("\n<<<REPORT>>>")

    print("settings - Score for unconserved nucleotides:", UNCONSERVED_SCORE,
          "Cutoff score:", CUTOFF)
    print("max score:", max(scored), "min score:", min(scored))
    print("mean score", average, "standart dev:", standart_dev)
    print("number of non-Ns in consensus seq:",
          sum(1 for x in consensus_seq if x != "N"))
    print("number of non-Ns in immutable seq:",
          sum(1 for x in immutable_seq if x != "N"))
    print("scores of coding regions summed:", sum(scored))
    return immutable_seq, mutability_score
Exemple #4
0
from Bio.Blast.NCBIWWW import qblast
from Bio.Blast.NCBIXML import parse
from Bio import SeqIO

# Sequences to search with, read lazily from the FASTA file.
records = SeqIO.parse("./apoe.fas", "fasta")

# Settings passed to qblast for every sequence.
PROGRAM = 'blastp'
DATABASE = 'nr'

for rec in records:
    # Submit the sequence to the NCBI BLAST API and parse the XML reply.
    blast_records = parse(qblast(PROGRAM, DATABASE, rec.seq))
    # Flatten the alignments of every returned BLAST record, in order.
    hits = (hit for blast_record in blast_records
            for hit in blast_record.alignments)
    for hit in hits:
        print(hit)
Exemple #5
0
    def _main(self):
        """Fetch homologous genome segments, then search them for primers.

        The method works in two passes, keyed on whether ``CO-clusters.gb``
        exists in the working directory:

        * If it does not exist, the configured gene is extracted from the
          configured genome and BLASTed (cached results in
          ``blast.results.xml`` are reused when present). HSPs with an
          alignment length of 700 or less are filtered out, the matching
          segments are fetched and written to ``CO-clusters.gb``, and the
          method returns ``0``.
        * If it exists, the segments plus three local genomes are loaded,
          forward (``transF``) and reverse (``cooS``) primers are searched,
          printed, added to the alignments and saved to ``transF.aln`` and
          ``cooS.aln``.

        Returns:
            ``0`` after the segment file has been created, ``1`` when no
            forward or no reverse primers are found, ``None`` after the
            alignments have been saved.

        Raises:
            RuntimeError: If the genome cannot be loaded, the gene is not
                found, or BLAST returns no results.
        """
        email = '*****@*****.**'
        genome_dir = '/home/allis/Dropbox/Science/Микра/Thermococcus/sequence/GenBank/Thermococcales/Thermococcus/'
        genome = 'Thermococcus_barophilus_Ch5.gb'
        gene = 'TBCH5v1_1369'  #cooS
        database = 'nr'
        segment = [3200, 12000]

        seq = SeqLoader.load_file(os.path.join(genome_dir, genome))
        if not seq: raise RuntimeError('No genome loaded')
        seq = seq[0]

        index = get_indexes_of_genes(seq, gene)
        if not index: raise RuntimeError('No gene found')

        feature = seq.features[index[0]]
        query = feature.extract(seq)

        segments_file = 'CO-clusters.gb'
        #get cluster variants if needed
        if not os.path.isfile(segments_file):
            blast_file = 'blast.results.xml'
            if os.path.isfile(blast_file):
                # Close the cached results file as soon as it is parsed.
                with open(blast_file) as blast_handle:
                    blast = list(parse(blast_handle))
            else:
                blast = BlastCLI.blast_seq(query,
                                           database,
                                           100,
                                           remote=True,
                                           task='blastn',
                                           parse_results=True,
                                           save_results_to=blast_file)
            if not blast: raise RuntimeError('Blast returned no results')
            flt = BlastFilter(lambda hsp, r: hsp.align_length > 700,
                              filter_hsps=True)
            flt(blast)
            queries = []
            for ali in BlastCLI.iter_alignments(blast):
                q = BlastCLI.Query(ali,
                                   'hsp',
                                   start_offset=segment[0],
                                   end_offset=segment[1])
                # Only report queries that were actually added. Printing
                # queries[-1] unconditionally raised IndexError on an empty
                # list and repeated the previous query otherwise.
                if q:
                    queries.append(q)
                    print(q)

            segments = BlastWWW.fetch_queries(email, queries)
            safe_write(segments, segments_file)
            for r in segments:
                print('[%s] %s: %dbp' % (r.id, pretty_rec_name(r), len(r)))
            return 0

        #find primers in alignments of the selected features
        local_files = [
            os.path.join(genome_dir, f)
            for f in ('Thermococcus_barophilus_DT4-complete-genome.gb',
                      'Thermococcus_ST-423.gb', 'Thermococcus_CH1-complete.gb')
        ]
        loader = SeqLoader(self.abort_event)
        segments = loader.load_files([segments_file] + local_files)
        fprimers, transF_ali = find_primers(
            segments, 'transF',
            dict(plen=(20, 30),
                 max_mismatches=5,
                 min_first_matches=3,
                 AT_first=True))
        rprimers, cooS_ali = find_primers(segments,
                                          'cooS',
                                          dict(plen=(20, 30),
                                               max_mismatches=4,
                                               min_first_matches=3,
                                               AT_first=True),
                                          reverse=True)
        if not fprimers:
            print('\nNo forward primers found')
            return 1
        if not rprimers:
            print('\nNo reverse primers found')
            return 1
        print('\nForward primers:')
        for p in fprimers:
            print('%s: %s' % (p.id, p))
        print('\nReverse primers:')
        for p in rprimers:
            print('%s: %s' % (p.id, p))
        print()
        #add primers to alignments and save them
        transF_ali = PrimerFinder.add_primers_to_alignment(
            fprimers, transF_ali)
        cooS_ali = PrimerFinder.add_primers_to_alignment(rprimers,
                                                         cooS_ali,
                                                         reverse=True)
        AlignmentUtils.save(transF_ali, 'transF.aln')
        AlignmentUtils.save(cooS_ali, 'cooS.aln')