예제 #1
0
파일: BLAST.py 프로젝트: bopopescu/Eind2
 def blast(self, filename, seq, seq_id):
     """
     Run a remote BLAST search for ``seq`` unless ``filename`` already exists.

     :param filename: Path of the result file. If it can be opened for
         reading, the search is skipped.
     :param seq: Query sequence handed to ``qblast``.
     :param seq_id: Identifier of the query (not used by this method).
     :return: ``False`` when ``filename`` already exists, ``True`` after a
         fresh result has been written to ``filename``.
     """
     try:
         # Probe for an existing result. The context manager closes the
         # handle right away instead of leaking it.
         with open(filename, 'r'):
             return False
     except FileNotFoundError:
         print("File not found starting blast")

     # The search runs outside the ``except`` block so a failure in qblast is
     # not reported as happening "during handling" of FileNotFoundError.
     result_handle = qblast(self.blastmethod,
                            self.database,
                            seq,
                            format_type=self.format,
                            expect=self.evalue,
                            matrix_name=self.matrix,
                            hitlist_size=10)
     try:
         result_xml = result_handle.read()
     finally:
         result_handle.close()

     # Only create the file once the result is in hand, so a failed search
     # never leaves an empty file that would suppress the next attempt.
     with open(filename, "w") as out_file:
         out_file.write(result_xml)
     print("blast complete")
     return True
예제 #2
0
def coronablast(coronaseqrec):
    """Run a blastn search against ``nt`` and save the reply to results.xml.

    :param coronaseqrec: Object whose ``seq`` attribute is the query sequence.
    :return: None. The BLAST reply is written to ``results.xml`` in the
        current working directory.
    """
    response = qblast("blastn", "nt", coronaseqrec.seq)

    with open("results.xml", "w") as xml_out:
        xml_out.write(response.read())
예제 #3
0
def blastIt(sequence):
    """Run a blastp search against ``pdb`` and print every HSP that is found.

    For each record with at least one alignment the first 60 characters of
    the query are printed, followed by the first 60 characters of each
    alignment title and the expect value of each of its HSPs.

    :param sequence: Query sequence handed to ``qblast``.
    :return: The handle returned by ``qblast``. It has already been iterated
        by ``NCBIXML.parse`` when this function returns.
    """
    # Every option is spelled out explicitly, exactly as passed to qblast.
    options = dict(
        auto_format=None,
        composition_based_statistics=None,
        db_genetic_code=None,
        endpoints=None,
        entrez_query='(none)',
        expect=10.0,
        filter=None,
        gapcosts=None,
        genetic_code=None,
        hitlist_size=50,
        i_thresh=None,
        layout=None,
        lcase_mask=None,
        matrix_name=None,
        nucl_penalty=None,
        nucl_reward=None,
        other_advanced=None,
        perc_ident=None,
        phi_pattern=None,
        query_file=None,
        query_believe_defline=None,
        query_from=None,
        query_to=None,
        searchsp_eff=None,
        service=None,
        threshold=None,
        ungapped_alignment=None,
        word_size=None,
        alignments=500,
        alignment_view=None,
        descriptions=500,
        entrez_links_new_window=None,
        expect_low=None,
        expect_high=None,
        format_entrez_query=None,
        format_object=None,
        format_type='XML',
        ncbi_gi=True,
        results_file=None,
        show_overview=None,
        megablast=None,
    )
    res = qblast("blastp", "pdb", sequence, **options)

    for record in NCBIXML.parse(res):
        # Queries without any match produce no output.
        if not record.alignments:
            continue
        print("QUERY: %s" % record.query[:60])
        for align in record.alignments:
            for hsp in align.hsps:
                print("MATCH: %s " % align.title[:60])
                print(hsp.expect)
    return res
예제 #4
0
def blast(sequence):
    """Run a blastp search against ``nr`` and cache the raw reply on disk.

    The reply is written to ``cache/blast_tmp2.xml``. The file is only
    opened after the query has returned, so a failed or interrupted query
    leaves a previously cached result untouched.

    :param sequence: Query sequence handed to ``qblast``.
    :return: None.
    """
    import os

    cache_path = 'cache/blast_tmp2.xml'
    # Make sure the target directory exists before spending time on the
    # remote query, so the result cannot be lost to a missing folder.
    os.makedirs(os.path.dirname(cache_path), exist_ok=True)

    print('entering blast function')

    blastStart = time.time()
    blast_results = qblast("blastp", "nr", sequence, hitlist_size=5000)
    blastEnd = time.time()

    print('finished blast. Took ' + str(blastEnd - blastStart) +
          " seconds.")
    try:
        with open(cache_path, mode="w") as f:
            f.write(blast_results.read())
    finally:
        blast_results.close()
    print('cached blast result')
def bio_blaster(
        input_file: str, file_format: str, output_file: str, index: Optional[str] = None,
        program: str = 'blastn', database: str = 'nt', gi_format: bool = True, size: int = 10
) -> int:
    """Blast the sequence(s) of a file through NCBI and store the XML reply.

    :param input_file: Path of the file containing the sequence(s) to blast.
    :param file_format: The Bio.SeqIO format name of input_file.
    :param output_file: Where to store the results; a trailing '.xml' is optional.
    :param index: If the file contains multiple sequences and only one should
        be analysed, its record key; otherwise all records are submitted.
    :param program: Which blast program to use.
    :param database: Which database to use.
    :param gi_format: Whether to request the gi_format.
    :param size: The amount of results to request.
    :return: Number of files made (always 1).
    :raises AssertionError: If file_format, program, size or index is invalid.
    """
    # noinspection SpellCheckingInspection
    assert file_format in (  # https://biopython.org/wiki/SeqIO
        'abi', 'abi-trim', 'ace', 'cif-atom', 'cif-seqres', 'clustal', 'embl', 'fasta', 'fasta-2line', 'fastq-sanger',
        'fastq', 'fastq-solexa', 'fastq-illumina', 'gck', 'genbank', 'gb', 'ig', 'imgt', 'nexus', 'pdb-seqres',
        'pdb-atom', 'phd', 'phylip', 'pir', 'seqxml', 'sff', 'sff-trim', 'snapgene', 'stockholm', 'swiss', 'tab',
        'qual', 'uniprot-xml', 'xdna'
    )
    # The program name is 'tblastn'; the former 'tblast' entry rejected the
    # valid name and let through one that qblast itself refuses.
    # noinspection SpellCheckingInspection
    assert program in (  # https://biopython.org/DIST/docs/tutorial/Tutorial.html#htoc92
        'blastn', 'blastp', 'blastx', 'tblastn', 'tblastx'
    )
    assert size > 0
    if output_file.endswith('.xml'):
        output_file = output_file[:-4]
    # Bio = Biopython but Pycharm doesn't know.
    # noinspection PyPackageRequirements
    from Bio import SeqIO
    # noinspection PyPackageRequirements
    from Bio.Blast.NCBIWWW import qblast
    count: int = 1
    record_dict = SeqIO.index(input_file, format=file_format)
    try:
        if index is not None:
            assert index in record_dict
            record: str = record_dict[index].format("fasta")
        else:
            record = ''.join(record_dict[key].format("fasta") + '\n' for key in record_dict)
    finally:
        # The index keeps the input file open; release it on every path.
        record_dict.close()
    result_handle = qblast(     # The actual blasting see print(help(qblast))
        program=program, database=database, sequence=record, ncbi_gi=gi_format, hitlist_size=size, megablast=False
    )
    try:
        # NOTE(review): append mode means a second run on the same output
        # path concatenates two XML documents -- confirm this is intended.
        with open(output_file + '.xml', "a") as out_handle:  # Saving the results
            out_handle.write(result_handle.read())
    finally:
        result_handle.close()   # The result handle is like an open file and must be closed.
    return count
def blaster(protSeq, orgnID="Mus musculus"):
    """Return an identifier of the best blastp hit for an amino acid sequence.

    The query is sent to ``refseq_protein`` restricted to ``orgnID`` and is
    retried up to three times when the reply has no descriptions.

    :param protSeq: Amino acid sequence to search with.
    :param orgnID: Organism name used in the ``[organism]`` Entrez filter.
    :return: A field of the best hit's title with any ``.suffix`` removed
        (field 3 of the ``|``-separated title, falling back to field 1), or
        ``"-"`` when the best hit's e-value is not below 0.01.
    :raises SystemExit: When all attempts return no descriptions.
    """

    from Bio.Blast.NCBIWWW import qblast
    from Bio.Blast import NCBIXML
    from sys import exit

    print("\nconnecting to BLAST server. this will take some time...")
    # BLAST sometimes returns empty results. Resending the query seems to fix
    # it, so try up to three times.
    for attempt in range(1, 4):
        print("attempt number " + str(attempt))
        resX = qblast("blastp",
                      "refseq_protein",
                      protSeq,
                      entrez_query=orgnID + "[organism]")
        try:
            resO = NCBIXML.read(resX)
        finally:
            # Close every reply handle, including those of failed attempts.
            resX.close()
        if resO.descriptions:
            break

    if not resO.descriptions:
        print(
            "connection unsuccessful. The BLAST server is acting up. Try again later."
        )
        # NOTE(review): exit status 0 signals success to the calling shell;
        # kept as-is for compatibility -- confirm whether non-zero is wanted.
        exit(0)

    print("connection successful")

    print(resO.descriptions[0])
    descO = resO.descriptions[0]
    if descO.e >= 0.01:
        return "-"

    fields = descO.title.split("|")
    try:
        descID = fields[3]
    except IndexError:
        # Titles with fewer fields: fall back to the second field.
        descID = fields[1]

    return descID.split(".")[0]
예제 #7
0
파일: preprocess.py 프로젝트: jwayne/mol455
def fetch_homologs(refseq_id):
    """
    Query BLAST for homologs, using a blastp search.

    The output file is checked up front, so an existing file aborts the call
    before the (slow) remote query is made.

    @param refseq_id:
        The accession number to get homologs for.  Either protein or DNA is OK.
    @return:
        Name of the file the homolog protein accession numbers were written
        to, one per line.
    @raise Exception:
        If the output file already exists.
    """
    sys.stderr.write("\nSTEP: fetch homologs(%s)\n" % refseq_id)

    # Fail fast: refuse to overwrite before spending minutes on BLAST.
    fname_prot_ids = "homologs.protids"
    if os.path.exists(fname_prot_ids):
        raise Exception("File %s aleady exists" % (fname_prot_ids))

    # Determine if protein or dna
    # http://www.ncbi.nlm.nih.gov/Sitemap/sequenceIDs.html
    if refseq_id[2].isalpha():
        prot_id = refseq_id
    else:
        _, prot_ids = _fetch_dna_records([refseq_id])
        prot_id = prot_ids[0]

    # If I don't do this, I seem to get--
    # ValueError: Error message from NCBI: Message ID#68 Error:
    # Error occurred while trying to set up a Blast Object from CGI context:
    # CFastaReader: Segmented set not properly terminated around line 1
    prot_id = prot_id.split(".")[0]

    # Query BLAST for homologs
    sys.stderr.write("\tQuerying BLAST, please be patient (may take minutes)...\n")
    blast_res = qblast("blastp", "nr", prot_id)
    try:
        blast_nr = NCBIXML.read(blast_res)
    finally:
        blast_res.close()
    prot_ids = [alignment.accession for alignment in blast_nr.alignments]
    sys.stderr.write("\tBLAST query successful, %s homologs found\n" % len(prot_ids))

    # Write to disk.  Re-check in case the file appeared during the query.
    if os.path.exists(fname_prot_ids):
        raise Exception("File %s aleady exists" % (fname_prot_ids))
    with open(fname_prot_ids, "w") as f:
        for homolog_id in prot_ids:
            f.write(homolog_id + "\n")
    return fname_prot_ids
예제 #8
0
def blaster(protSeq, orgnID="Homo sapiens"):
    """Return an identifier of the best blastp hit for an amino acid sequence.

    The query is sent to ``refseq_protein`` restricted to ``orgnID`` and is
    retried up to three times when the reply has no descriptions.

    :param protSeq: Amino acid sequence to search with.
    :param orgnID: Organism name used in the ``[organism]`` Entrez filter.
    :return: A field of the best hit's title with any ``.suffix`` removed
        (field 3 of the ``|``-separated title, falling back to field 1), or
        ``"-"`` when the best hit's e-value is not below the threshold.
    :raises SystemExit: When all attempts return no descriptions.
    """

    from Bio.Blast.NCBIWWW import qblast
    from Bio.Blast import NCBIXML
    from sys import exit

    print("\nconnecting to BLAST server. this will take some time...")
    # BLAST sometimes returns empty results. Resending the query seems to fix
    # it, so try up to three times.
    for attempt in range(1, 4):
        print("attempt number " + str(attempt))
        resX = qblast("blastp",
                      "refseq_protein",
                      protSeq,
                      entrez_query=orgnID + "[organism]",
                      descriptions=100,
                      alignments=100)
        try:
            resO = NCBIXML.read(resX)
        finally:
            # Close every reply handle, including those of failed attempts.
            resX.close()
        if resO.descriptions:
            break

    if not resO.descriptions:
        print(
            "connection unsuccessful. The BLAST server is acting up. Try again later."
        )
        # NOTE(review): exit status 0 signals success to the calling shell;
        # kept as-is for compatibility -- confirm whether non-zero is wanted.
        exit(0)

    print("connection successful")

    print(resO.descriptions[0])
    descO = resO.descriptions[0]
    # Identification threshold. 0.01 still returns hits in most cases; the
    # strict 1e-137 cut-off is based on ptpn22 similarity between mouse and
    # human.
    if descO.e >= 1e-137:
        return "-"

    fields = descO.title.split("|")
    try:
        descID = fields[3]
    except IndexError:
        # Titles with fewer fields: fall back to the second field.
        descID = fields[1]

    return descID.split(".")[0]
예제 #9
0
    def find_seq_homologues(self, return_raw=False):
        """
        Uses NCBI BLAST to look for structures deposited in the PDB database
        that share __sequence__ homology with the target protein/chain.
        Bridges to Bio.BLAST.NCBIWWW.

        :param return_raw: When true, return the handle produced by
            ``qblast`` without parsing it.
        :return: The raw result handle, or a list of tuples
            ``(pdb_id, e_value, identities, positives, gaps, alignment)``
            with one tuple per HSP. The three counts are formatted as
            ``"count/alignment.length"`` and the e-value as ``"%2.5e"``.
        """

        # Get sequence from structure/chain
        # We could use Bio.PDB.Polypeptide?

        from Bio.SCOP.Raf import to_one_letter_code

        # Residues whose name has no one-letter code are skipped.
        seq_str = ''.join(to_one_letter_code[aa.resname]
                          for aa in self.get_residues()
                          if aa.resname in to_one_letter_code)

        # Use BLAST to find homologous sequences with associated
        # structures in the PDB database.
        # Perhaps include local BLAST?

        from Bio.Blast.NCBIWWW import qblast

        # Adapt for short query sequences if needed
        # From http://www.ncbi.nlm.nih.gov/blast/producttable.shtml#shortp

        if len(seq_str) < 15:
            word_size = 2
            expect = 20000
            matrix_name = 'PAM30'
            seq_filter = None
        else:
            word_size = 3
            expect = 10.0
            matrix_name = 'BLOSUM62'
            seq_filter = 'SEG'

        # The options must be passed by keyword: positionally they would be
        # bound to whatever parameters follow ``sequence`` in qblast's
        # signature, not to the intended ones.
        query_result = qblast("blastp", "pdb", seq_str,
                              word_size=word_size,
                              expect=expect,
                              matrix_name=matrix_name,
                              filter=seq_filter)

        if return_raw:
            return query_result

        # Parse BLAST result to yield results
        # PDBID : (E-Value, Identity, Positives, Gaps, Alignment)

        from Bio.Blast import NCBIXML

        try:
            blast_records = NCBIXML.read(query_result)
        finally:
            query_result.close()

        results = []

        for alignment in blast_records.alignments:
            pdb_id = alignment.title.split('|')[3]
            for hsps in alignment.hsps:
                id_perc = "%s/%s" % (hsps.identities, alignment.length)
                pos_perc = "%s/%s" % (hsps.positives, alignment.length)
                gaps_perc = "%s/%s" % (hsps.gaps, alignment.length)

                results.append(
                    (pdb_id, "%2.5e" % hsps.expect, id_perc, pos_perc,
                     gaps_perc,
                     '\n'.join([hsps.query, hsps.match, hsps.sbjct])))

        return results
예제 #10
0
def cazy2class(prefix,F,remote=False):
	'''
	Take the CAZY database (pickled dictionary in CazyDB.bin) and try to fetch
	subfamilies and place them as classifiers.

	Each name from get_names(prefix+'.gm') is blasted, remotely through qblast
	or locally through the blastp executable.  The first alignment whose first
	HSP has at least 90% identity and whose accession is found in the database
	decides the classifier; when no alignment qualifies (including when BLAST
	returns no alignments at all) the name itself is written instead.

	prefix -- common prefix of the input (.gm) and output (.cls) files.
	F      -- object providing ``seqs`` and ``chains``; used for local runs only.
	remote -- run BLAST remotely when true, locally otherwise.
	'''
	print('You chose to use CAZY database to classify GH13 family into subfamilies'
	      ' this will take a while, since have to go over BLAST results, etc..')
	cls=open(prefix+'.cls','w')
	try:
		# import database
		# NOTE(review): unpickling can run arbitrary code; CazyDB.bin must
		# come from a trusted source.
		db_file=open('CazyDB.bin')
		try:
			db=pickle.load(db_file)
		finally:
			db_file.close()
		names = get_names(prefix+'.gm')
		for n in names:
			print('Processing %s...'%(n))
			if remote:
				Entrez.email = '*****@*****.**'
				print('\tBlasting (Running remotely)...')
				n=n[:-1]+'_'+n[-1]
				while 1:
					try:
						b=qblast('blastp','nr',n,perc_ident=90,expect=1,gapcosts='11 1')
						print('\tBlast Done... \n\t\tAnalysing entries...')
						break
					except Exception:
						# Retry on any error, but let Ctrl-C / SystemExit
						# break out of this otherwise endless loop.
						print('Some problem in NCBI, sleeping for 10...')
						time.sleep(10)
			else:
				print('\tBlasting (Running locally)...')
				fi=open('temp.fasta','w')
				fi.write('>%s\n%s'%(n,F.seqs[F.chains[n[:4]]]))
				fi.close()
				bl=Popen('blastp -db nr -outfmt "5" -query temp.fasta  -evalue 0.0001 -max_target_seqs 50 '\
				         '-seg yes -num_threads 4  -gapopen 10 -gapextend 1 -matrix BLOSUM90 -out temp.xml',
				         shell=True)
				bl.wait()
				print('\tBlast Done... \n\t\tAnalysing entries...')
				b=open('temp.xml')
			try:
				blast_record = NCBIXML.read(b)
			finally:
				b.close()
			rm = Popen('rm temp.*',shell=True)
			rm.wait()
			# for/else: the else branch runs when no alignment produced a
			# CAZY hit, which also covers an empty alignment list (the former
			# ``while nohit`` loop never terminated in that case).
			for a in blast_record.alignments:
				print('\t\t\t%s'%(a.accession))
				h=a.hsps[0]
				if float(h.identities)/float(h.align_length) >=0.9:
					ans,k = dict_lookup(a.accession,db)
					if ans:
						cls.write(str(db[k])+';')
						print('\t\t\t\tAccession number found in CAZY!, Subfamily %s'%(db[k]))
						break
			else:
				cls.write('%s;'%(n))
				print('\tNo relative found in CAZY')
		cls.write('\n')
	finally:
		cls.close()
예제 #11
0
from Bio.Blast.NCBIWWW import qblast
from Bio.Blast.NCBIXML import parse
from Bio import SeqIO

# Search settings shared by every query below.
PROGRAM = 'blastp'
DATABASE = 'nr'

# Sequences are read lazily from the FASTA file, one record at a time.
records = SeqIO.parse("./apoe.fas", "fasta")

for rec in records:
    # Query the NCBI BLAST API and parse the XML reply in one go.
    for record in parse(qblast(PROGRAM, DATABASE, rec.seq)):
        # Print every alignment of every parsed result.
        for alignment in record.alignments:
            print(alignment)
예제 #12
0
def ncbiblast():
    """Show the NCBI BLAST submission form and run the search when submitted.

    Users without a session username are redirected to the log-in page.  On
    an accepted form the sequence is cleaned, sent through ``qblast``, and
    the submitted settings plus the title, score and e-value of every
    description of the first parsed record are stored in the session before
    redirecting to ``ncbiblast_output``.

    Returns a dict holding the form for rendering.
    """
    if session.username is None:
        redirect(URL(r=request, f='../account/log_in'))
    form = FORM(TABLE(TR("Job Title: ",
                        INPUT(_type="text", _name="title")),
                      TR("Sequence:  ",
                        TEXTAREA(_type="text", _name="sequence",
                                 requires=IS_NOT_EMPTY())),
                      TR("Program: ",
                        SELECT("blastn", "blastp", "blastx", "tblastn",
                               "tblastx",
                               _name="program")),
                      TR("Database: ",
                        SELECT("Non-redundant GenBank (nr)",
                        "NCBI Reference Sequence (refseq)",
                        "SWISS-PROT protein sequence (last update) (swissprot)",
                        "Patent division of GenPept (pat)",
                        "Protein Data Bank (pdb)",
                        "Protein - environmental samples (env_nr)",
                        "RNA - NCBI Reference Sequence (refseq_rna)",
                        "Genomic - NCBI Reference Sequence (refseq_genomic)",
                        "ESTs from GenBank + EMBL + DDBJ (est)",
                        "Mouse subset of ESTs (est_mouse)",
                        "Human subset of ESTs (est_human)",
                        "Non-mouse non-human subset of ESTs (est_others)",
                        "Genome Survey Sequences (gss)",
                        "Complete chromosomes (chromosome)",
                        "Whole Genome Shotgun sequences (wgs)",
                        "Nucleotide - environmental samples (env_nr)",
                        _name="database")),
                      TR("Translation Table: ",
                        SELECT("Standard (1)",
                        "Vertebrate Mitochondria (2)",
                        "Yeast Mitochondria (3)",
                        "Mold, Protozoan, and Coelenterate Mitochondria (4)",
                        "Mycoplasma/Spiroplasma (4)",
                        "Invertebrate Mitochondria (5)",
                        "Ciliate, Dasycladacean and Hexamita Nuclear (6)",
                        "Echinoderm and Flatworm Mitochondria (9)",
                        "Euplotid Nuclear (10)",
                        "Bacterial, Archaeal and Plant Plastid (11)",
                        "Alternative Yeast Nuclear (12)",
                        "Ascidian Mitochondria (13)",
                        "Alternative Flatworm Mitochondria (14)",
                        "Blepharisma Nuclear (15)",
                        "Chlorophycean Mitochondria (16)",
                        "Trematode Mitochondria (21)",
                        "Scenedesmus obliquus Mitochondria (22)",
                        "Thraustochytrium Mitochondria (23)",
                            _name="gcode")),
                      TR("Matrix: ",
                        SELECT("BLOSUM62", "BLOSUM80", "BLOSUM45",
                        "PAM30", "PAM70",_name="matrix")),
                      TR("Maximum number of hits to return: ",
                        SELECT("50", "100", "200", "500", "1000", "2000",
                               "5000", "10000", "20000", "50000",
                               _name="hitlist_size")),
                      TR("Number of random hits expected: ",
                        INPUT(_type="text", _name="expect", value=10)),
                      TR("Word size: ",
                        INPUT(_type="text", _name="word_size", value=3)),
                      TR("",INPUT(_type="submit", _value="SUBMIT"))))
    if form.accepts(request.vars,session):
        from Bio.Blast.NCBIWWW import qblast
        from Bio.Blast import NCBIXML
        sequence = seqClean(fasta_to_raw(form.vars.sequence.upper()))
        result_handle = qblast(form.vars.program,
                               ncbi_db[form.vars.database],
                               sequence,
                               matrix_name=form.vars.matrix,
                               hitlist_size=int(form.vars.hitlist_size),
                               expect=float(form.vars.expect),
                               word_size=int(form.vars.word_size),
                               db_genetic_code=genetic_code[form.vars.gcode])
        try:
            # Built-in next() works on Python 2.6+ and 3; the iterator
            # method .next() exists on Python 2 only.
            rec = next(NCBIXML.parse(result_handle))
        finally:
            result_handle.close()
        session['title'] = form.vars.title
        session['sequence'] = sequence
        session['database'] = form.vars.database
        session['program'] = form.vars.program
        session['matrix'] = form.vars.matrix
        session['gcode'] = form.vars.gcode
        session['hitsize'] = form.vars.hitlist_size
        session['expect'] = form.vars.expect
        session['word_size'] = form.vars.word_size
        session['data'] = [{'Title':row.title, 'Score':str(row.score),
                            'E-value':str(row.e)}
                           for row in rec.descriptions]
        redirect(URL(r=request, f='ncbiblast_output'))
    return dict(form=form)
예제 #13
0
if __name__ == '__main__':
    # Command-line entry point: read a protein sequence from the file given
    # on the command line, run a blastp search against nr and print the raw
    # reply to stdout.
    from common import file_parser
    from Bio import SeqIO
    from Bio.Blast.NCBIWWW import qblast

    parser = file_parser(prog_desc='Perform a BLAST query',
                         file_desc='A file containing a protein sequence')
    args = parser.parse_args()

    # Mode 'rU' raises ValueError on Python 3.11+; plain text mode already
    # applies universal newlines on Python 3.  The context manager closes
    # the input file.
    with open(args.file, 'r') as sequence_file:
        sequence = sequence_file.read()
    result_handle = qblast('blastp', 'nr', sequence)
    try:
        print(result_handle.read())
    finally:
        result_handle.close()
예제 #14
0
from Bio.Blast.NCBIWWW import qblast

# FASTA-formatted protein query: a header line followed by the sequence lines.
query = "\n".join((
    ">s1",
    "MHEIKYITIDEADVLLTEEHEETTRFICQSANRDRQISLFSATTSERLDNFFDKVESSQQ",
    "IEVVAGEAKMPTTIDHIYIQVNPRDKVKTLYRLAQVENMRAIVFVNTIGRLNTVYEKLNH",
    "DGVKISALHGDLSKLQRQESVRDFKKGETSLLLATDVAARGIDLPNLPAIIQFDMAQSLT",
    "QYVHRSGRTGRMGEQGAAISLVTDREARELKQMVKENDVKMIEQIVKFGHLIDPQKTK",
))

# Submit the query and print the complete reply text.
r = qblast("blastp", "nr_v5", query).getvalue()
print(r)
예제 #15
0
def ncbiblast():
    """Show the NCBI BLAST submission form and run the search when submitted.

    Users without a session username are redirected to the log-in page.  On
    an accepted form the sequence is cleaned, sent through ``qblast``, and
    the submitted settings plus the title, score and e-value of every
    description of the first parsed record are stored in the session before
    redirecting to ``ncbiblast_output``.

    Returns a dict holding the form for rendering.
    """
    if session.username is None:
        redirect(URL(r=request, c='account', f='log_in'))
    form = FORM(TABLE(TR("Job Title: ",
                        INPUT(_type="text", _name="title")),
                      TR("Sequence:  ",
                        TEXTAREA(_type="text", _name="sequence",
                                 requires=IS_NOT_EMPTY())),
                      TR("Program: ",
                        SELECT("blastn", "blastp", "blastx", "tblastn",
                               "tblastx",
                               _name="program")),
                      TR("Database: ",
                        SELECT("Non-redundant GenBank (nr)",
                        "NCBI Reference Sequence (refseq)",
                        "SWISS-PROT protein sequence (last update) (swissprot)",
                        "Patent division of GenPept (pat)",
                        "Protein Data Bank (pdb)",
                        "Protein - environmental samples (env_nr)",
                        "RNA - NCBI Reference Sequence (refseq_rna)",
                        "Genomic - NCBI Reference Sequence (refseq_genomic)",
                        "ESTs from GenBank + EMBL + DDBJ (est)",
                        "Mouse subset of ESTs (est_mouse)",
                        "Human subset of ESTs (est_human)",
                        "Non-mouse non-human subset of ESTs (est_others)",
                        "Genome Survey Sequences (gss)",
                        "Complete chromosomes (chromosome)",
                        "Whole Genome Shotgun sequences (wgs)",
                        "Nucleotide - environmental samples (env_nr)",
                        _name="database")),
                      TR("Translation Table: ",
                        SELECT("Standard (1)",
                        "Vertebrate Mitochondria (2)",
                        "Yeast Mitochondria (3)",
                        "Mold, Protozoan, and Coelenterate Mitochondria (4)",
                        "Mycoplasma/Spiroplasma (4)",
                        "Invertebrate Mitochondria (5)",
                        "Ciliate, Dasycladacean and Hexamita Nuclear (6)",
                        "Echinoderm and Flatworm Mitochondria (9)",
                        "Euplotid Nuclear (10)",
                        "Bacterial, Archaeal and Plant Plastid (11)",
                        "Alternative Yeast Nuclear (12)",
                        "Ascidian Mitochondria (13)",
                        "Alternative Flatworm Mitochondria (14)",
                        "Blepharisma Nuclear (15)",
                        "Chlorophycean Mitochondria (16)",
                        "Trematode Mitochondria (21)",
                        "Scenedesmus obliquus Mitochondria (22)",
                        "Thraustochytrium Mitochondria (23)",
                            _name="gcode")),
                      TR("Matrix: ",
                        SELECT("BLOSUM62", "BLOSUM80", "BLOSUM45",
                        "PAM30", "PAM70",_name="matrix")),
                      TR("Maximum number of hits to return: ",
                        SELECT("50", "100", "200", "500", "1000", "2000",
                               "5000", "10000", "20000", "50000",
                               _name="hitlist_size")),
                      TR("Number of random hits expected: ",
                        INPUT(_type="text", _name="expect", value=10)),
                      TR("Word size: ",
                        INPUT(_type="text", _name="word_size", value=3)),
                      TR("",INPUT(_type="submit", _value="SUBMIT"))))
    if form.accepts(request.vars,session):
        from Bio.Blast.NCBIWWW import qblast
        from Bio.Blast import NCBIXML
        sequence = seqClean(fasta_to_raw(form.vars.sequence.upper()))
        result_handle = qblast(form.vars.program,
                               ncbi_db[form.vars.database],
                               sequence,
                               matrix_name=form.vars.matrix,
                               hitlist_size=int(form.vars.hitlist_size),
                               expect=float(form.vars.expect),
                               word_size=int(form.vars.word_size),
                               db_genetic_code=genetic_code[form.vars.gcode])
        try:
            # Built-in next() works on Python 2.6+ and 3; the iterator
            # method .next() exists on Python 2 only.
            rec = next(NCBIXML.parse(result_handle))
        finally:
            result_handle.close()
        session['title'] = form.vars.title
        session['sequence'] = sequence
        session['database'] = form.vars.database
        session['program'] = form.vars.program
        session['matrix'] = form.vars.matrix
        session['gcode'] = form.vars.gcode
        session['hitsize'] = form.vars.hitlist_size
        session['expect'] = form.vars.expect
        session['word_size'] = form.vars.word_size
        session['data'] = [{'Title':row.title, 'Score':str(row.score),
                            'E-value':str(row.e)}
                           for row in rec.descriptions]
        redirect(URL(r=request, f='ncbiblast_output'))
    return dict(form=form)
예제 #16
0
def cazy2class(prefix, F, remote=False):
    '''
    Take the CAZY database (pickled dictionary in CazyDB.bin) and try to fetch
    subfamilies and place them as classifiers.

    Each name from get_names(prefix + '.gm') is blasted, remotely through
    qblast or locally through the blastp executable.  The first alignment
    whose first HSP has at least 90% identity and whose accession is found in
    the database decides the classifier; when no alignment qualifies
    (including when BLAST returns no alignments at all) the name itself is
    written instead.

    prefix -- common prefix of the input (.gm) and output (.cls) files.
    F      -- object providing ``seqs`` and ``chains``; used for local runs only.
    remote -- run BLAST remotely when true, locally otherwise.
    '''
    print('You chose to use CAZY database to classify GH13 family into subfamilies'
          ' this will take a while, since have to go over BLAST results, etc..')
    cls = open(prefix + '.cls', 'w')
    try:
        # import database
        # NOTE(review): unpickling can run arbitrary code; CazyDB.bin must
        # come from a trusted source.
        db_file = open('CazyDB.bin')
        try:
            db = pickle.load(db_file)
        finally:
            db_file.close()
        names = get_names(prefix + '.gm')
        for n in names:
            print('Processing %s...' % (n))
            if remote:
                Entrez.email = '*****@*****.**'
                print('\tBlasting (Running remotely)...')
                n = n[:-1] + '_' + n[-1]
                while 1:
                    try:
                        b = qblast('blastp',
                                   'nr',
                                   n,
                                   perc_ident=90,
                                   expect=1,
                                   gapcosts='11 1')
                        print('\tBlast Done... \n\t\tAnalysing entries...')
                        break
                    except Exception:
                        # Retry on any error, but let Ctrl-C / SystemExit
                        # break out of this otherwise endless loop.
                        print('Some problem in NCBI, sleeping for 10...')
                        time.sleep(10)
            else:
                print('\tBlasting (Running locally)...')
                fi = open('temp.fasta', 'w')
                fi.write('>%s\n%s' % (n, F.seqs[F.chains[n[:4]]]))
                fi.close()
                bl=Popen('blastp -db nr -outfmt "5" -query temp.fasta  -evalue 0.0001 -max_target_seqs 50 '\
                         '-seg yes -num_threads 4  -gapopen 10 -gapextend 1 -matrix BLOSUM90 -out temp.xml',
                         shell=True)
                bl.wait()
                print('\tBlast Done... \n\t\tAnalysing entries...')
                b = open('temp.xml')
            try:
                blast_record = NCBIXML.read(b)
            finally:
                b.close()
            rm = Popen('rm temp.*', shell=True)
            rm.wait()
            # for/else: the else branch runs when no alignment produced a
            # CAZY hit, which also covers an empty alignment list (the former
            # ``while nohit`` loop never terminated in that case).
            for a in blast_record.alignments:
                print('\t\t\t%s' % (a.accession))
                h = a.hsps[0]
                if float(h.identities) / float(h.align_length) >= 0.9:
                    ans, k = dict_lookup(a.accession, db)
                    if ans:
                        cls.write(str(db[k]) + ';')
                        print('\t\t\t\tAccession number found in CAZY!, Subfamily %s' % (
                            db[k]))
                        break
            else:
                cls.write('%s;' % (n))
                print('\tNo relative found in CAZY')
        cls.write('\n')
    finally:
        cls.close()
예제 #17
0
def main(logginglevel, input, tmp, idthreshold, evalue):
    """Extend each input FASTA file with the sequences of its BLASTP hits.

    For every input file the records are submitted as one web BLASTP query
    against ``nr_v5``, the accession of every hit is collected (once per
    HSP query sequence), the accessions are handed to
    ``entrezRetrieveSequence`` in batches on a thread pool, and the returned
    records are written in FASTA format to ``<name>_ext.fasta``.  The paths
    of the written files are printed to stdout, one per line.

    :param logginglevel: Level passed to ``setLoggerLevel``.
    :param input: Iterable of inputs, each resolved through ``getFile``.
    :param tmp: Passed to ``getFile`` and ``entrezRetrieveSequence``.
    :param idthreshold: Converted to float and sent as ``perc_ident``.
    :param evalue: Converted to float and sent as ``expect``.
    :raises Exception: When the BLAST reply contains no hits.
    """
    logger = setLoggerLevel(createLogger(__file__), logginglevel)

    with ThreadPoolExecutor(max_workers=10) as pool:
        event_loop = asyncio.get_event_loop()

        sources = [getFile(item, logger=logger, tmp=tmp) for item in input]
        written = []
        for source in sources:
            target = source.replace(".fasta", "_ext.fasta")
            logInfo(logger, "Extending file {} to {}".format(source, target))

            # Build one multi-record FASTA query from the input file.
            with open(source, "r") as fasta_in:
                query = "\n".join(
                    "> {}\n{}".format(entry.id, str(entry.seq))
                    for entry in SeqIO.parse(fasta_in, "fasta"))

            logInfo(logger, "Requesting web BLASTP search")
            blast_xml = qblast("blastp",
                               "nr_v5",
                               query,
                               expect=float(evalue),
                               perc_ident=float(idthreshold)).getvalue()

            hits = ET.fromstring(blast_xml).findall(
                "./BlastOutput_iterations/Iteration/Iteration_hits/Hit")
            logInfo(logger, "Got {} hits for given query".format(len(hits)))
            if not hits:
                logInfo(logger, blast_xml)
                raise Exception('No hits found')

            # One entry per HSP query sequence, so a hit with several HSPs
            # contributes its accession several times.
            accessions = [
                hit.find("./Hit_accession").text
                for hit in hits
                for _ in hit.findall("./Hit_hsps/Hsp/Hsp_qseq")
            ]

            # Spread the accessions over at most three Entrez requests.
            request_limit = 3
            batch_size = len(accessions) // request_limit + 1
            batches = list(chunks(accessions, batch_size))
            logDebug(
                logger,
                "Chunked request, will request Entrez for {} batches of size {}."
                .format(len(batches), batch_size))

            pending = [
                event_loop.run_in_executor(pool, entrezRetrieveSequence,
                                           batch, tmp)
                for batch in batches
            ]
            gathered = asyncio.ensure_future(asyncio.gather(*pending))
            event_loop.run_until_complete(gathered)

            fasta_records = [
                fetched.format("fasta")
                for group in gathered.result()
                for fetched in group
            ]
            expected_count = len(fasta_records)

            logDebug(logger, "Writing to file {}".format(target))
            with open(target, "w") as fasta_out:
                fasta_out.write("\n".join(fasta_records))

            # Read the file back to verify the number of records written.
            with open(target, "r") as fasta_check:
                actual_count = sum(
                    1 for _ in SeqIO.parse(fasta_check, "fasta"))
                if actual_count != expected_count:
                    logError(
                        logger,
                        "Got mismatch records count after writing output file. {} records are present in {} and there should be {} reconds."
                        .format(actual_count, target, expected_count))
            written.append(target)

        logInfo(
            logger,
            "Written {} files in total. Returning them to stdout as plaintext paths list"
            .format(len(written)))
        print("\n".join(written))
def main(args):
    """Run program

    Reads gene IDs from ``args.id``, collects the contigs in ``args.gff3``
    that hold a feature whose ``gene_feature`` attribute matches one of those
    IDs, gathers every gene feature located on those contigs, submits the
    matching sequences from ``args.faa`` to ``qblast`` and writes one
    tab-separated line per HSP to ``args.output``. Progress messages are
    emitted through ``tqdm.write``.

    If the first ID read from ``args.id`` is ``*``, every sequence found in
    ``args.faa`` is BLASTed regardless of the GFF3 content.

    Args:
        args (NameSpace): ArgParse arguments dictating program use. The
            attributes read here are ``id``, ``gff3``, ``faa``, ``output``,
            ``program``, ``database``, ``top`` and ``e_value``.
    """
    import time  # local import: only needed for the retry back-off below

    tqdm.write('>>> Starting prokka_blast_pipeline')

    # Get Gene IDs from ID file
    ids = [gene_id.strip() for gene_id in args.id]

    tqdm.write('>>> Found {0} ID(s) in {1}'
               .format(str(len(ids)), args.id.name))

    # Get contigs that contain a feature ID matching a given Gene ID
    contigs = []
    gff_reader = GFF3Reader(args.gff3)
    for entry in gff_reader.iterate():
        try:  # Ignore features without a gene_feature field
            gene_feature = entry.attributes['gene_feature']
        except KeyError:
            continue
        if gene_feature in ids:
            contigs.append(entry.seqid)

    tqdm.write('>>> Found {0} contig(s) containing gene features matching '
               'given ID(s) in {1}'.format(str(len(contigs)), args.gff3.name))

    # The list above is kept for the reported count (it may hold duplicates);
    # membership tests in the second pass use a set to stay O(1) per entry.
    contig_set = set(contigs)

    # Obtain PROKKA IDs and annotations of genes on contigs obtained earlier
    prokka_to_contig = defaultdict(str)
    prokka_to_gene = defaultdict(str)
    args.gff3.seek(0)  # rewind so the GFF3 file can be iterated a second time
    for entry in gff_reader.iterate():
        if entry.seqid in contig_set and 'gene_feature' in entry.attributes:
            prokka_to_contig[entry.attributes['ID']] = entry.seqid
            prokka_to_gene[entry.attributes['ID']] = entry.attributes[
                'gene_feature']

    tqdm.write('>>> Found {0} gene feature(s) on contigs matching given ID(s)'
               .format(str(len(prokka_to_contig))))

    # Get sequences from FAA file if they match a PROKKA ID obtained above.
    # Guard the wildcard test so an empty ID file does not raise IndexError.
    match_all = bool(ids) and ids[0] == '*'
    blast_entries = [entry for entry in fasta_iter(args.faa)
                     if match_all or entry.id in prokka_to_contig]

    tqdm.write('>>> Obtained {0} amino acid sequence(s) from {1}'
               .format(str(len(blast_entries)), args.faa.name))

    # Output header line
    args.output.write('Contig\tPROKKA_ID\tAnnotation\tGene ID\tSubject\t'
                      'Query Coverage(%)\tE-Value\tIdentity(%){0}'.format(
                                                                   os.linesep))
    tqdm.write('>>> BLASTing {0} amino acid sequence(s) against the NCBI {1} '
               'database'.format(str(len(blast_entries)), args.database))

    # BLAST sequences and calculate various summary values
    count = 0
    for entry in tqdm(blast_entries):
        # Continue attempting same blast until it succeeds w/o NCBI errors.
        # Back off between attempts so a failing service or a dropped
        # connection is not hit in a tight loop.
        retry_delay = 1
        while True:
            try:
                result_handle = qblast(args.program, args.database,
                                       entry.sequence, alignments=args.top,
                                       descriptions=args.top,
                                       hitlist_size=args.top,
                                       expect=args.e_value)
                break
            except (ValueError, socket_error):  # Ignore NCBI
                time.sleep(retry_delay)
                retry_delay = min(retry_delay * 2, 60)

        # Values that are constant for every hit of this query
        query_length = len(entry.sequence)
        contig = prokka_to_contig[entry.id]
        gene = prokka_to_gene[entry.id]

        # Process BLAST results
        for result in NCBIXML.parse(result_handle):
            for alignment in result.alignments:
                count += 1

                # Take the text inside the first [...] of the hit definition;
                # fall back to the whole definition when no bracket is present
                # instead of failing with IndexError.
                hit_def = alignment.hit_def
                if '[' in hit_def:
                    taxonomy = hit_def.split('[')[1].split(']')[0]
                else:
                    taxonomy = hit_def

                for hsp in alignment.hsps:
                    # Convert before dividing so the ratio is a true division
                    # even where "/" on two ints truncates.
                    cov = float(hsp.align_length) / query_length * 100.0
                    perc = float(hsp.identities) / query_length * 100.0

                    # Format and write output to custom TSV
                    output = '{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}{8}'\
                             .format(contig, entry.id,
                                     entry.description,
                                     gene, taxonomy,
                                     str(cov), str(hsp.expect), str(perc),
                                     os.linesep)
                    args.output.write(output)

    tqdm.write('>>> Wrote {0} total hit(s) to {1}'.format(str(count),
                                                          args.output.name))

    tqdm.write('>>> Exiting prokka_blast_pipeline')
예제 #19
0
#!/usr/bin/python2.7

from glob import glob
from Bio import SeqIO
from Bio.Blast.NCBIWWW import qblast
from os.path import splitext, basename

# Build one Entrez query string from the GI list file: every line is
# stripped of surrounding whitespace and the results are joined with spaces.
with open('archaea_gis.txt') as gi_file:
    gis = [raw_line.strip() for raw_line in gi_file.readlines()]
query = " ".join(gis)

# Run a BLAST search for every FASTA file and store the response that
# qblast returns under blast/<file name without extension>.xml.
for fasta_path in glob('fasta/*.fasta'):
    try:
        # A parenthesised single argument prints the same text as the
        # Python 2 print statement.
        print("getting BLAST for %s" % (fasta_path))
        record = SeqIO.read(fasta_path, 'fasta')

        # Output name is derived from the input file name minus extension.
        file_name = basename(fasta_path)
        accid = splitext(file_name)[0]
        xml_path = 'blast/' + accid + '.xml'

        handle = qblast('blastn', 'chromosome', record.seq,
                        entrez_query=query)
        with open(xml_path, 'w') as xml_file:
            xml_file.write(handle.read())
    except Exception as err:
        # Report the failure and carry on with the next FASTA file.
        print("failed %s: %s" % (fasta_path, str(err)))