def blastparse(stdout, output, tname, ntname):
    global recorddict, minLength
    handle = open(output, 'w')  # open the target fasta file for writing
    blast_handle = cStringIO.StringIO(stdout)  # convert string to an IO object for use in SearchIO
    try:  # necessary to avoid bad genomes
        for qresult in SearchIO.parse(blast_handle, 'blast-tab'):  # parse the blast output string as if it were a file
            for hit in qresult:  # Hit object
                for hsp in hit:  # HSP object
                    begin = hsp.query_range[0]  # start of hsp
                    finish = hsp.query_range[1]  # end of hsp
                    if hsp.query_id in recorddict:  # mask the contig in the target fasta dictionary using the hsp coordinates
                        if finish > begin:
                            recorddict[hsp.query_id].seq = (
                                recorddict[hsp.query_id].seq[:begin]
                                + 'N' * (finish - begin + 1)
                                + recorddict[hsp.query_id].seq[finish:])
                        else:
                            recorddict[hsp.query_id].seq = (
                                recorddict[hsp.query_id].seq[:finish]
                                + 'N' * (begin - finish + 1)
                                + recorddict[hsp.query_id].seq[begin:])
        recorddict_bak = deepcopy(recorddict)  # copy the dictionary so we may iterate and modify the result
        for idline in recorddict_bak:
            # keep contigs with a run of at least minLength non-N bases, or two
            # runs of >=100 bases separated by a 200-900 base scaffold gap
            pattern = r'[ATCG]{100,}N{200,900}[ATCG]{100,}|[^N]{' + re.escape(str(minLength)) + r'}'
            if re.match(pattern, str(recorddict[idline].seq)) is not None:
                SeqIO.write(recorddict[idline], handle, "fasta")
            else:
                recorddict.pop(idline)
    except ValueError:
        print 'Value Error: There was an error removing %s genome from %s' % (ntname, tname)
    handle.close()
def check_index(self, filename, format, **kwargs):
    # check if Python3 installation has sqlite3
    try:
        import sqlite3
    except ImportError:
        sqlite3 = None
    parsed = list(SearchIO.parse(filename, format, **kwargs))
    # compare values by index
    indexed = SearchIO.index(filename, format, **kwargs)
    self.assertEqual(len(parsed), len(indexed.keys()))
    # compare values by index_db, only if sqlite3 is present
    if sqlite3 is not None:
        db_indexed = SearchIO.index_db(':memory:', [filename], format, **kwargs)
        self.assertEqual(len(parsed), len(db_indexed.keys()))
    for qres in parsed:
        idx_qres = indexed[qres.id]
        # parsed and indexed qresult are different objects!
        self.assertNotEqual(id(qres), id(idx_qres))
        # but they should have the same attribute values
        self.assertTrue(compare_search_obj(qres, idx_qres))
        # sqlite3 comparison, only if it's present
        if sqlite3 is not None:
            dbidx_qres = db_indexed[qres.id]
            self.assertNotEqual(id(qres), id(dbidx_qres))
            self.assertTrue(compare_search_obj(qres, dbidx_qres))
    indexed._proxy._handle.close()  # TODO - Better solution
    if sqlite3 is not None:
        db_indexed.close()
        db_indexed._con.close()
def main():
    extensions = {'blast-tab': ['tsv', 'csv', 'blast', 'm8', 'blastm8'],
                  'blast-text': ['txt', 'bls', 'blast'],
                  'blast-xml': ['xml'],
                  'blat-psl': ['psl'],
                  'hmmer3-tab': ['tsv', 'csv'],
                  'hmmer3-text': ['txt'],
                  'hmmer2-text': ['txt'],
                  'exonerate-text': ['txt']}
    kwargs = args.keywords
    infile = args.infile
    in_type = args.in_type
    in_ext = infile.split('.')[-1]
    proper_ext = extensions[in_type][0]
    if in_ext not in extensions[in_type]:
        print(textwrap.fill("error: invalid input file extension \"{}\". An "
                            "appropriate extension for this input type is {}"
                            .format(in_ext, proper_ext), 79))
        sys.exit(1)
    out_type = args.out_type
    if args.output:
        outfile = io_check(args.output, 'w')
    else:
        out_ext = extensions[out_type][0]
        outfile = io_check("{}.{}".format('.'.join(infile.split('.')[:-1]), out_ext), 'w')
    print("output will be in {} and formatted as {}".format(outfile, out_type))
    SearchIO.convert(infile, in_type, outfile, out_type, out_kwargs=kwargs)
def check_raw(self, filename, id, raw, **kwargs):
    """Index filename using keyword arguments, check get_raw(id)==raw."""
    idx = SearchIO.index(filename, self.fmt, **kwargs)
    raw = _as_bytes(raw)
    # Anticipate cases where the raw string and/or file uses different
    # newline characters ~ we set everything to \n.
    new = idx.get_raw(id)
    self.assertTrue(isinstance(new, bytes),
                    "Didn't get bytes from %s get_raw" % self.fmt)
    self.assertEqual(raw.replace(b'\r\n', b'\n'), new.replace(b'\r\n', b'\n'))
    idx.close()
    # Now again, but using SQLite backend
    if sqlite3:
        idx = SearchIO.index_db(":memory:", filename, self.fmt, **kwargs)
        new = idx.get_raw(id)
        self.assertTrue(isinstance(new, bytes),
                        "Didn't get bytes from %s get_raw" % self.fmt)
        self.assertEqual(raw.replace(b'\r\n', b'\n'), new.replace(b'\r\n', b'\n'))
        idx.close()
    if os.path.isfile(filename + ".bgz"):
        # Do the tests again with the BGZF compressed file
        print("[BONUS %s.bgz]" % filename)
        self.check_raw(filename + ".bgz", id, raw, **kwargs)
def parse_write_and_compare(self, source_file, source_format, out_file, out_format, **kwargs):
    """Compares parsed QueryResults after they have been written to a file."""
    source_qresults = list(SearchIO.parse(source_file, source_format, **kwargs))
    SearchIO.write(source_qresults, out_file, out_format, **kwargs)
    out_qresults = list(SearchIO.parse(out_file, out_format, **kwargs))
    for source, out in zip(source_qresults, out_qresults):
        self.assertTrue(compare_search_obj(source, out))
def read_write_and_compare(self, source_file, source_format, out_file, out_format, **kwargs):
    """Compares read QueryResults after they have been written to a file."""
    source_qresult = SearchIO.read(source_file, source_format, **kwargs)
    SearchIO.write(source_qresult, out_file, out_format, **kwargs)
    out_qresult = SearchIO.read(out_file, out_format, **kwargs)
    self.assertTrue(compare_search_obj(source_qresult, out_qresult))
def main(args):
    if len(args) == 2:
        filenameRoot = args[1].split(".")[0]
        filenameXML = filenameRoot + ".xml"
        SearchIO.convert(args[1], 'blast-tab', filenameXML, 'blast-xml')
    elif len(args) == 3:
        filenameRoot = args[1].split(".")[0]
        filenameXML = args[2]
        SearchIO.convert(args[1], 'blast-tab', filenameXML, 'blast-xml')
    else:
        print("Usage: path/to/blast/tabular/file [optional path/for/new/blast/xml/file]")
def parseBlastOutFile(filename):
    if filename[-3:] == "xml":
        qResultGen = SearchIO.parse(filename, 'blast-xml')
    elif filename[-3:] == "txt":
        qResultGen = SearchIO.parse(filename, 'blast-tab')
    else:
        print("Unrecognized filetype.")
        assert False
    parsed = {qRes.id: qRes for qRes in qResultGen}
    print("Parsed " + filename)
    return parsed
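# A minimal usage sketch (not part of the original source): parse a tabular
# BLAST file into a dict keyed by query ID and report the hit counts. The
# file name "example_blast.txt" is hypothetical.
if __name__ == "__main__":
    results = parseBlastOutFile("example_blast.txt")
    for query_id, qresult in results.items():
        print(query_id, len(qresult.hits))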
def start_queryResult_generator(inFile, fDic, work_sheet):
    """
    Invoke the parse function to return a generator that steps through the
    records one QueryResult object at a time via next(generator). This
    approach saves memory. For the current task, casting the generator with
    list() also works fine, but it is not needed for parsing and sorting the
    records.

    http://biopython.org/DIST/docs/api/Bio.SearchIO.BlastIO-module.html
    """
    qGenerator = SearchIO.parse(inFile, 'blast-xml')
    max_hits = 0
    query_count = 1
    # Step through all the records in the lump xml data file and write out
    # each separate hit to file. Also write the summary information to the
    # work sheet.
    for query_result in qGenerator:
        print('Processing Query BLAST return ' + str(query_count))
        number_hits = int(len(query_result.hits))
        # Extend header out right if new MAXHITS
        if number_hits > max_hits:
            max_hits = number_hits
        if number_hits == 0:
            # Construct path plus file name for no hit query
            filename = str(fDic['topDir'] + fDic['noHit'] + 'Query_'
                           + str(query_count) + '_H_none.xml')
            # Write out any queries that had no hits to a no-hit subfolder
            SearchIO.write(query_result, filename, 'blast-xml')
            write_qr_to_ws(query_count, query_result, work_sheet)
        else:
            # Set up a counter of hits in the QueryResult so hits can be
            # sliced away into their own record cleanly.
            hit_count = 0
            for hit in query_result.hits:
                total_hsps = len(hit.hsps)
                lowest_eval = hit.hsps[0].evalue
                best_hsp = hit.hsps[0]
                for hsp in hit.hsps:
                    if hsp.evalue < lowest_eval:
                        lowest_eval = hsp.evalue
                        best_hsp = hsp
                filename = str(fDic['topDir'] + outputFileName(query_count, hit, best_hsp))
                SearchIO.write(query_result[hit_count:(hit_count + 1)], filename, 'blast-xml')
                hit_count += 1
            # Write out query_result to worksheet
            write_qr_to_ws(query_count, query_result, work_sheet)
        query_count += 1
    build_ws_header(work_sheet, max_hits)
    return qGenerator
def __init__(self, Maxicircle, out_file):
    file_out = open(out_file, 'w')
    writer = csv.writer(file_out, delimiter="\t")
    writer.writerow(["##gff-version", "3"])
    rows = []
    for protein in glob.glob("/Users/Said/Github/Maxicircle/DB/AA/*.faa"):
        output_file = protein.split("/")[-1] + ".xml"
        blastx_cline = NcbiblastxCommandline(query=Maxicircle, db=protein,
                                             outfmt=5, out=output_file)
        blastx_cline()
        blast_qresult = SearchIO.read(output_file, 'blast-xml')
        if len(blast_qresult) > 0:
            best = blast_qresult[0][0]  # first hit, first HSP
            query_range = [x for x in best.query_range]
            if best.query_strand > 0:
                query_strand = "+"
            else:
                query_strand = "-"
            chromosome = best.query_id
            rows.append([chromosome, ".", "exon", query_range[0], query_range[1],
                         ".", query_strand, ".",
                         "ID=" + protein.split("/")[-1].split(".faa")[0]])
    print(str(len(rows)) + " exons found")
    writer.writerows(rows)
    file_out.close()
def main(argv):
    args = parse_arguments(argv)
    infile = args.infile
    out = args.out_path
    query_sequences = []
    count = 0
    with open(out, 'w+') as output:
        output.write("%s\t%s\t%s\t%s\n" %
                     ("Accession", "family", "query_name", "Resfams_description"))
        for qresult in SearchIO.parse(infile, "hmmer3-tab"):
            for hit in qresult:
                accession = hit.accession
                id = hit.id
                query = hit.query_id
                description = hit.description
                score = hit.bitscore
                array = [accession, id, query, description, str(score)]
                print("\t".join(array))
                output.write("\t".join(array) + "\n")
                if hit.query_id not in query_sequences:
                    query_sequences.append(hit.query_id)
                    count += 1
    print("Unique Seqs: " + str(count))
def quick_structurome(self, xml_blast_result, data_dir, entries,
                      tmp_dir="/tmp/chain_PDBs",
                      pdb_divided="/data/databases/pdb/divided/",
                      max_models=3):
    good_model = defaultdict(lambda: [])

    def identity(hsp):
        return 1.0 * hsp.ident_num / hsp.aln_span

    _log.info("searching good templates")
    for query in tqdm(bpsio.parse(xml_blast_result, "blast-xml")):
        for hit in query:
            if list(hit):
                hsp = list(hit)[0]
                if 0.6 <= identity(hsp) < 0.95:
                    good_model[hsp.query.id].append(hsp)
    tuplas = good_model.items()
    _log.info("creating models")
    with tqdm(tuplas) as pbar:
        for seq, hsps in pbar:
            try:
                from SNDG.Structure.Modelome import Modelome
                Modelome.model_hsps(seq, data_dir, hsps, entries=entries,
                                    tmp_dir=tmp_dir, pdb_divided=pdb_divided,
                                    max_models=max_models)
            except Exception as ex:
                _log.exception(ex)
def load_hsp_dict(self, xml_blast_result):
    for query in bpsio.parse(xml_blast_result, "blast-xml"):
        for hit in query:
            if list(hit):
                hsp = list(hit)[0]
                self.hsp_dict[query.id][hsp.hit.id] = hsp
def generate_protein_model(self, query: str, template: str, blast_xml_path: str,
                           out_dir: str, template_dir: str):
    hits = [
        _ for _ in SearchIO.read(blast_xml_path, 'blast-xml').hits
        if _.id == template
    ]
    assert len(hits) == 1
    best = hits[0].hsps[0].aln
    tseq = replace_missing_residues(str(best[1].seq), f'{template_dir}/{template}.ent')
    Path(out_dir).mkdir(parents=True, exist_ok=True)
    pir_file = f'{out_dir}/{template}.pir'
    SeqIO.write([
        SeqRecord(Seq(str(best[0].seq), generic_protein),
                  id=query, name='',
                  description=f'sequence:{query}::::::::'),
        SeqRecord(Seq(tseq, generic_protein),
                  id=template, name='',
                  description=f'structureX:{template}::{template[5].upper()}::{template[5].upper()}::::')
    ], pir_file, 'pir')
    arg = [
        self.modpysh, 'python3',
        Path(__file__).parent.resolve() / 'modeller_script.py',
        pir_file, template, query, template_dir
    ]
    subprocess.run(arg, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                   universal_newlines=True)
def runHMMsearch(input, basename, tmpdir, cpus, evalue, hmm):
    Results = {}
    # load proteins into dictionary
    protein_dict = SeqIO.to_dict(SeqIO.parse(input, 'fasta'))
    # do hmmer search of proteins
    HMM = os.path.join(tmpdir, basename + '.hmmsearch.txt')
    subprocess.call(
        ['hmmsearch', '-o', HMM, '--cpu', str(cpus), '-E', evalue, hmm, input],
        stdout=FNULL, stderr=FNULL)
    with open(HMM, 'r') as results:
        for qresult in SearchIO.parse(results, "hmmer3-text"):
            query_length = qresult.seq_len  # length of HMM model
            hits = qresult.hits
            num_hits = len(hits)
            if num_hits > 0:
                query = hits[0].id
                hit = hits[0].query_id
                score = hits[0].bitscore
                evalue = hits[0].evalue
                num_hsps = len(hits[0].hsps)
                aln_length = 0
                for x in range(0, num_hsps):
                    aln_length += hits[0].hsps[x].aln_span
                if hit not in Results:
                    Results[hit] = [query, score, evalue, aln_length, 'Hmmer3']
    for k, v in Results.items():
        description = basename + '|' + k + "|" + v[0] + "|evalue=" + str(v[2]) + "|HMMer3-Complete"
        Results[k].append(description)
        Seq = str(protein_dict[v[0]].seq)
        Results[k].append(Seq)
    return Results
def features_via_hmm(seq, hmmdb, eval_thresh=1.0):
    """
    Take a Seq, run hmmscan against a compressed hmmdb (prepare it with
    hmmpress) and return a list of Biopython SeqFeature objects.
    Needs strictly HMMER 3.0!
    """
    features = list()
    ufn = str(uuid.uuid4())
    SeqIO.write([SeqRecord(seq, id='QUERY', name='QUERY', description="QUERY")],
                ufn + '.fasta', 'fasta')
    subprocess.call([
        "hmmscan", "-o", ufn + ".out", "--tblout", ufn + ".tbl",
        "--domtblout", ufn + ".dtbl", hmmdb, ufn + '.fasta'
    ])
    # Now let's read it
    for v in SearchIO.parse(ufn + ".dtbl", "hmmscan3-domtab"):
        for hit in v:
            for h in hit.hsps:
                if h.evalue < eval_thresh:
                    features.append(
                        SeqFeature(FeatureLocation(h.query_start, h.query_end),
                                   type="domain",
                                   qualifiers={'name': h.hit_id,
                                               'evalue': h.evalue}))
    os.system("rm %s %s %s %s" % (ufn + '.fasta', ufn + '.out',
                                  ufn + '.tbl', ufn + '.dtbl'))
    return features
def generate_blast_graph(self):
    evalue_filter = lambda hsp: hsp.evalue < self.evalue
    file_name = "{}/blast_graph.txt".format(self.blast_output_path)
    for blast_file in glob.glob(self.blast_data_path):
        print("working on " + blast_file)
        # Parse the Blast file
        qresults = SearchIO.parse(blast_file, 'blast-tab', comments=True)
        for qresult in qresults:
            write_line = ""
            write_line += qresult.id + ":"
            # Go to the Hit section of query
            for hit in qresult[:]:
                if not self.blast_graph.has_node(qresult.id):
                    self.blast_graph.add_node(qresult.id)
                # Check if Hit has min value
                filtered_hit = hit.filter(evalue_filter)
                if filtered_hit is not None:
                    if not self.blast_graph.has_node(filtered_hit.id):
                        self.blast_graph.add_node(filtered_hit.id)
                    # Add Edge between graph nodes
                    self.blast_graph.add_edge(qresult.id, filtered_hit.id)
                    write_line += filtered_hit.id + ","
            if write_line != "":
                with open(file_name, "a") as f_handle:
                    f_handle.write(write_line + '\n')
    # Write GML files
    if self.generate_gml_files:
        file_name = "{}/blast_graph.gml".format(self.blast_output_path)
        with open(file_name, "a") as f_handle:
            nx.write_gml(self.blast_graph, f_handle)
def get_scores_for_curated_via_hmm(self):
    """
    For every curated variant we want to generate a set of scores against HMMs.
    This is needed to supply the same type of information for curated as well
    as for automatic seqs.
    """
    # Construct the one big file from all curated seqs.
    with open(self.curated_all_fasta, "w") as f:
        for hist_type, seed in self.get_seeds():
            seed_aln_file = os.path.join(self.seed_directory, hist_type, seed)
            for s in SeqIO.parse(seed_aln_file, "fasta"):
                s.seq = s.seq.ungap("-")
                SeqIO.write(s, f, "fasta")
    # Search it by our HMMs
    self.search(hmms_db=self.combined_hmm_file,
                out=self.curated_search_results_file,
                sequences=self.curated_all_fasta)
    # We need to parse this results file; we take here a snippet from
    # load_hmmsearch.py, and tune it to work for our curated seq header format
    for variant_query in SearchIO.parse(self.curated_search_results_file, "hmmer3-text"):
        print "Loading hmmsearch for variant:", variant_query.id
        variant_model = Variant.objects.get(id=variant_query.id)
        for hit in variant_query:
            gi = hit.id.split("|")[1]
            seq = Sequence.objects.get(id=gi)
            try:
                # sometimes we get this: [No individual domains that satisfy
                # reporting thresholds (although complete target did)]
                best_hsp = max(hit, key=lambda hsp: hsp.bitscore)
                add_score(seq, variant_model, best_hsp, seq.variant == variant_model)
            except:
                pass
def extract_faa_seqs(HMM_TO_USE):
    HMM_OUTPUT_FILE = TEMP_FOLDER + "/" + HMM_TO_USE.rsplit(".")[0] + ".out"
    HMM_OUTPUT_OBJECT = SearchIO.read(HMM_OUTPUT_FILE, 'hmmer3-tab')
    FAA_OUTPUT_FILE = TEMP_FOLDER + "/" + HMM_TO_USE.rsplit(".")[0] + ".faa"
    if HMM_OUTPUT_OBJECT:
        DICT_OF_HIT = dict()
        for seq_record in SeqIO.parse(ORF_FILE, "fasta"):
            for hit in HMM_OUTPUT_OBJECT:
                # Compare the seq record from the fasta file to the IDs in the hits
                if seq_record.id == hit.id:
                    BIN_ID = G2B_DICT[hit.id]['binID']
                    # If binID has been added to dictionary, keep the one with
                    # the higher bitscore
                    if BIN_ID in DICT_OF_HIT:
                        print(BIN_ID + " has multiple hits for " + HMM_TO_USE)
                        if int(hit.bitscore) > int(DICT_OF_HIT[BIN_ID]['bitscore']):
                            DICT_OF_HIT[BIN_ID] = {
                                "sequence": seq_record.seq.rstrip('*'),
                                "bitscore": hit.bitscore
                            }
                    # If binID hasn't been added to dictionary, add it.
                    if BIN_ID not in DICT_OF_HIT:
                        DICT_OF_HIT[BIN_ID] = {
                            "sequence": seq_record.seq.rstrip('*'),
                            "bitscore": hit.bitscore
                        }
        with open(FAA_OUTPUT_FILE, 'w') as OPENED_FAA_OUTPUT:
            for bin_id in DICT_OF_HIT:
                OPENED_FAA_OUTPUT.write('>' + bin_id + '\n' +
                                        str(DICT_OF_HIT[bin_id]['sequence']) + '\n')
def blast():
    fasta = open("assembly.fasta").read()
    # run blast against the assembled sequence
    handle = NCBIWWW.qblast("blastn", "nr", fasta,
                            entrez_query='"Herpesviridae"[organism]')
    with open("blast.xml", "w") as out_handle:
        out_handle.write(handle.read())
    blast_qresult = SearchIO.read("blast.xml", "blast-xml")
    output = open('MiniProject.log', 'a')
    output.write('seq_title ' + 'align_len ' + 'number_HSPs ' + 'topHSP_ident ' +
                 'topHSP_gaps ' + 'topHSP_bits ' + 'topHSP_expect \n')
    # prevents the program from crashing when there are fewer than 10 results
    max_blast_id = 10
    if len(blast_qresult) < 10:
        max_blast_id = len(blast_qresult)
    for i in range(0, max_blast_id):
        hit = blast_qresult[i]
        blast_hsp = blast_qresult[i][0]
        output.write(str(hit.description) + ' ' + str(hit.seq_len) + ' ' +
                     str(len(hit.hsps)) + ' ' + str(blast_hsp.ident_num) + ' ' +
                     str(blast_hsp.gap_num) + ' ' + str(blast_hsp.bitscore) + ' ' +
                     str(blast_hsp.evalue) + '\n')
    output.close()
def reciprocal_hmm_search(modelname, modelname_regex, filename, organism,
                          rev_inc_bitscore_percentage, out_filename=None):
    '''Performs reciprocal hmmer search against the proteome of given organism'''
    print "# Reciprocal search..."
    is_found = False
    if out_filename is None:
        out_filename = modelname + ".rechits_" + organism
    reciprocal_search_command = ("phmmer --noali --tblout " + out_filename + " " +
                                 filename + " " + proteomes_dir + organism +
                                 ".fasta > hmmer_res")
    os.system(reciprocal_search_command)
    try:
        hits = SearchIO.read(out_filename, "hmmer3-tab")
        max_bitscore = hits[0].bitscore
    except:
        hits = []
        max_bitscore = 0
    if len(hits) > 0:
        if re.search(modelname_regex, hits[0].description):
            is_found = True
    # for h in hits:
    #     if h.bitscore > rev_inc_bitscore_percentage * max_bitscore:
    #         if manual_mode:
    #             print modelname_regex, h.description, re.search(modelname_regex, h.description)
    #             raw_input("...")
    #         if re.search(modelname_regex, h.description):
    #             is_found = True
    if manual_mode:
        print filename, is_found
        raw_input("Check reciprocal search results...")
    return is_found
def get_hit_seq(fastafile, filename):
    yamlfile = yaml_load_file(fastafile)
    blout = SearchIO.parse(filename, 'blast-text')
    for query in blout:
        seqid = query.id.split("\n")[0]
        fh = open("multi_" + seqid + ".fasta", 'a')
        yamlfile[seqid]['hits'] = {}
        for hit in query.hits:
            gi = re.match(r"gi\|(.*)\|ref", hit.id).group(1)
            yamlfile[seqid]['hits'][gi] = {}
            for hsp in hit.hsps:
                # pad the hit coordinates with HIT_SEQUENCE_BPS of flanking
                # sequence, clamping the start at 1
                hitstart = hsp.hit_start + 1 - HIT_SEQUENCE_BPS
                hitstart = 1 if hitstart < 0 else hitstart
                hitend = hsp.hit_end + 1 + HIT_SEQUENCE_BPS
                hitstrand = "plus" if (hsp.hit_strand == 1) else "minus"
                out = os.popen(BLAST_BINARY + "/blastdbcmd -db " + BLAST_DATABASE +
                               " -dbtype nucl -entry " + str(gi) +
                               " -range " + str(hitstart) + "-" + str(hitend) +
                               " -strand " + str(hitstrand)).read()
                fh.write(out)
        fh.close()
    yaml_dump_file(fastafile, yamlfile)
def reverse_complement_hsp(hsp, query_length):
    rev = SearchIO.HSP(fragments=[
        reverse_complement_hsp_fragment(frag, query_length)
        for frag in hsp.fragments[::-1]
    ])
    rev.ident_pct = hsp.ident_pct
    return rev
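# Note (illustrative, not from the original source): the helper
# reverse_complement_hsp_fragment is not shown here; presumably it mirrors
# each fragment's query interval onto the reverse strand. For a 0-based,
# end-exclusive interval [start, end) on a query of length L, the mirrored
# interval is [L - end, L - start). A quick sanity check of that arithmetic:
def _mirror_interval(start, end, length):
    return length - end, length - start

assert _mirror_interval(10, 20, 100) == (80, 90)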
def blast_search(filename, blast_temp_path):
    '''
    Perform a blast search, using a file as input (either FASTA or accession number).
    Return results as a list of accession numbers and locations.
    '''
    filepath = "%s/%s.xml" % (blast_temp_path, filename)
    if not os.path.exists(filepath):
        with open('%s/%s' % (blast_temp_path, filename), 'w') as f:
            f.write(filename)
        search_cmd = NcbitblastnCommandline(
            query='%s/%s' % (blast_temp_path, filename),
            db="/research/sequences/GenBank/blast/db/refseq_genomic",
            outfmt=5, out=filepath)
        subprocess.call(str(search_cmd), shell=True)
    result = SearchIO.read(filepath, 'blast-xml')
    # Filter e-value
    hsps = filter(lambda hsp: hsp.evalue < 1e-10, result.hsps)
    # Filter length
    hsps = filter(lambda hsp: hsp.aln_span > 209, hsps)
    # Filter identity
    iden_cutoff = 0.2
    hsps = filter(lambda hsp: float(hsp.ident_num) / float(result.seq_len) > iden_cutoff, hsps)
    return [(hsp.hit_id.split('|')[-2], hsp.hit_start, hsp.hit_end) for hsp in hsps]
def construct_gene_scores_matrix(hmmtable):
    """
    Parse hmmscan tabular output to a dictionary.

    Arguments:
        hmmtable: pathlib.Path instance: Path to the hmmscan output, specified
            with hmmscan's --tblout option. Can also be str.

    Return:
        dic_genes_scores: dict: A dictionary with the gene ids as keys and a
            list of lists for all their hits, of the form
            { gene_id: [ [ hit id (str),
                           hit E-value (np.float32),
                           hit bit-score (np.float32),
                           hit bias (np.float32) ],
                         ... ],
              ... }
    """
    dic_genes_scores = {}
    for gene in SearchIO.parse(hmmtable, "hmmer3-tab"):
        dic_genes_scores[gene.id] = []
        for hit in gene.hits:
            hit_info = [
                hit.id,
                np.float32(hit.evalue),
                np.float32(hit.bitscore),
                np.float32(hit.bias),
            ]
            dic_genes_scores[gene.id].append(hit_info)
    return dic_genes_scores
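# A minimal usage sketch (not from the original source), assuming an hmmscan
# --tblout file at the hypothetical path "hmmscan.tbl": build the matrix and
# print the best-scoring hit per gene.
if __name__ == "__main__":
    scores = construct_gene_scores_matrix("hmmscan.tbl")
    for gene_id, hit_rows in scores.items():
        best = max(hit_rows, key=lambda row: row[2])  # row[2] is the bit-score
        print(gene_id, best[0], best[2])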
def results(self):
    """
    Parse the results and yield biopython SearchIO entries.

    Beware: some databases are not unique on the id, and this causes the
    parser to complain about duplicate entries and raise exceptions such as:

        ValueError: The ID or alternative IDs of Hit 'DQ448783' exists
        in this QueryResult.

    Summary of the columns:
    https://www.metagenomics.wiki/tools/blast/blastn-output-format-6
    qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore

    Warning: unlike BLAST results, if a sequence got no hits it is NOT
    reported at all in VSEARCH. The number of entries yielded will not match
    the number of sequences at input.
    """
    with open(self.out_path, 'rt') as handle:
        for entry in SearchIO.parse(handle, 'blast-tab'):
            yield entry
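# Usage sketch (an assumption, since the owning class is not shown here):
# `vsearch_run` stands in for whatever object exposes `out_path` and this
# `results` property in the original code.
#
#     for qresult in vsearch_run.results:
#         print(qresult.id, len(qresult.hits))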
def blaster(fasta_file):
    """
    Based on a target species list, BLAST the given input sequence and write
    the matching hits to a file.
    """
    fasta_string = open(fasta_file).read()
    print("BLAST initiated...")
    # qblast runs the BLAST search at NCBI.
    result_handle = NCBIWWW.qblast("blastn", "nt", fasta_string)
    print("BLAST search done.")
    # Matching records will be collected here.
    records = []
    # Results need to go into an XML file.
    with open("my_blast.xml", "w") as out_handle:
        out_handle.write(result_handle.read())
    blast_result = SearchIO.read("my_blast.xml", "blast-xml")
    print("Writing BLAST results to file..")
    for i in target_species:
        # Iterate through the blast result hits.
        for hit in blast_result:
            print(hit)
            if i in hit.description:
                # If the target species is found, append the first HSP's hit sequence.
                records.append(hit[0].hit)
    # Pretty easy way to write the given sequences in one file.
    SeqIO.write(records, "blast_results.fasta", "fasta")
    print("\nBLAST result file written to blast_results.fasta.")
    return "blast_results.fasta"
def get_unique_blastp_hits(infile, fasta):
    hits = set()
    for aln in SearchIO.parse(infile, 'blast-xml'):
        for hsp in aln.hsps:
            hits.add(hsp.hit_id)
    seqs = {rec.id: rec for rec in SeqIO.parse(fasta, 'fasta')}
    return [seqs[hit] for hit in hits]
def extract_top_hits(hmmer_hits, top_hits_file, top_hits_ids_file=None,
                     not_significant_ids_file=None, not_found_ids_file=None):
    top_hits_ids = IdList()
    not_significant_ids = IdList()
    not_found_ids = IdList()
    index_file = "hmmer_hits.tmp.idx"
    hmm_dict = SearchIO.index_db(index_file, hmmer_hits, "hmmer3-text")
    out_fd = open(top_hits_file, "w")
    out_fd.write("#query\thit\tevalue\tbitscore\n")
    for query in hmm_dict:
        if hmm_dict[query].hits:
            if hmm_dict[query][0].is_included:
                out_fd.write("%s\t%s\t%s\t%s\n" % (query, hmm_dict[query][0].id,
                                                   hmm_dict[query][0].evalue,
                                                   hmm_dict[query][0].bitscore))
                top_hits_ids.append(query)
            else:
                not_significant_ids.append(query)
        else:
            not_found_ids.append(query)
    out_fd.close()
    os.remove(index_file)
    if not_significant_ids_file:
        not_significant_ids.write(not_significant_ids_file)
    if not_found_ids_file:
        not_found_ids.write(not_found_ids_file)
    if top_hits_ids_file:
        top_hits_ids.write(top_hits_ids_file)
def multiPFAMsearch(inputList, cpus, tmpdir, output):
    # run hmmscan multithreaded by running at same time
    # input is a list of files, run multiprocessing on them
    pfam_results = os.path.join(os.path.dirname(tmpdir), 'pfam.txt')
    pfam_filtered = os.path.join(os.path.dirname(tmpdir), 'pfam.filtered.txt')
    lib.runMultiNoProgress(safe_run, inputList, cpus)
    # now grab results and combine, kind of tricky as there are headers and
    # footers for each result
    resultList = [os.path.join(tmpdir, f) for f in os.listdir(tmpdir)
                  if os.path.isfile(os.path.join(tmpdir, f)) and f.endswith('.pfam.txt')]
    combineHmmerOutputs(resultList, pfam_results)
    # now parse results
    with open(output, 'w') as out:
        with open(pfam_filtered, 'w') as filtered:
            with open(pfam_results, 'r') as results:
                for qresult in SearchIO.parse(results, "hmmsearch3-domtab"):
                    hits = qresult.hits
                    num_hits = len(hits)
                    if num_hits > 0:
                        for i in range(0, num_hits):
                            hit_evalue = hits[i].evalue
                            query = hits[i].id
                            pfam = qresult.accession.split('.')[0]
                            hmmLen = qresult.seq_len
                            hmm_aln = int(hits[i].hsps[0].hit_end) - int(hits[i].hsps[0].hit_start)
                            coverage = hmm_aln / float(hmmLen)
                            if coverage < 0.50:  # coverage needs to be at least 50%
                                continue
                            filtered.write("%s\t%s\t%s\t%f\n" % (query, pfam, hit_evalue, coverage))
                            out.write("%s\tdb_xref\tPFAM:%s\n" % (query, pfam))
def analyze_BLAST_result(input_fasta_name_wo_path, result_handle):
    show_header("Step 2. Analyzing the BLAST result.")
    output_file_name = "retrieved_from_" + str(input_fasta_name_wo_path)[:-6] + ".xml"
    if not os.path.exists('sample/output'):
        os.makedirs('sample/output')
    current_dir = os.getcwd()
    output_folder = os.path.join(current_dir, "sample/output")
    os.chdir(output_folder)
    # since mode is 'w', an existing file will be overwritten
    # (with "a", new info would be appended to an existing file)
    output_file = open(output_file_name, "w")
    output_file.write(result_handle.read())
    output_file.close()
    blast_qresult = SearchIO.read(output_file_name, "blast-xml")  # query_result
    # keep only hits whose description does not mention PREDICTED
    # (the original chained comparison `"PREDICTED" in hit.description == False`
    # always evaluated to False)
    filter_for_no_predicted_hypothetical = lambda hit: "PREDICTED" not in hit.description
    filtered_qresult = blast_qresult.hit_filter(filter_for_no_predicted_hypothetical)
    for hit in filtered_qresult:
        print("%s" % (hit.description))
def parse_blast(blast_pdb_file, max_E, min_pcid, max_pcid, hits):
    with open_file(blast_pdb_file) as f:
        for qresult in SearchIO.parse(f, 'blast-xml'):
            query = qresult.id
            for hit in qresult:
                s = hit.id + hit.description
                hsp = hit[0]  # Only the 1st one
                evalue = hsp.evalue
                pcid = float(hsp.ident_num) / hsp.aln_span * 100
                if evalue <= max_E and min_pcid <= pcid <= max_pcid:
                    for match in re.findall(r"pdb\|\w\w\w\w\|\w", s):
                        pdb, chain = match.split("|")[1:]
                        hits[query][pdb][chain] = {
                            "ide": "{:2.1f}".format(pcid),
                            "e-val": evalue,
                            "q-start": str(hsp.query_start + 1),
                            "q-end": str(hsp.query_end),
                            "s-start": str(hsp.hit_start + 1),
                            "s-end": str(hsp.hit_end)
                        }
                else:
                    break
    return hits
def run_blastp(target_blastp_database: str, query_sequence: str,
               opts: List[str] = None, results_file: str = None
               ) -> List[SearchIO._model.query.QueryResult]:
    """ Runs blastp over a single sequence against a database and returns the
        results as parsed by Bio.SearchIO.

        Arguments:
            target_blastp_database: the blastp database to compare to
            query_sequence: the sequence being compared
            opts: a list of extra arguments to pass to blastp, or None
            results_file: a path to keep a copy of blastp results in, if provided

        Returns:
            a list of QueryResults as parsed from blast output by SearchIO
    """
    if not query_sequence:
        raise ValueError("Cannot run blastp on empty sequence")

    config = get_config()
    command = ["blastp", "-num_threads", str(config.cpus), "-db", target_blastp_database]

    if opts is not None:
        command.extend(opts)

    result = execute(command, stdin=query_sequence)
    if not result.successful():
        raise RuntimeError('blastp returned %d: %r while scanning %r' % (
            result.return_code, result.stderr.replace("\n", ""),
            query_sequence[:100]))

    if results_file is not None:
        with open(results_file, 'w') as fh:
            fh.write(result.stdout)

    return list(SearchIO.parse(StringIO(result.stdout), 'blast-text'))
def process_blast_output(file, simple, argparser):
    qresults = SearchIO.parse(file, 'blast-xml')
    if simple:
        for qresult in qresults:
            for hit in qresult:
                for hsp in hit:
                    if ((hsp.aln_span == argparser.cont and hsp.gap_num == 0
                         and hsp.aln_span == hsp.ident_num)
                            or hsp.aln_span > argparser.cont):
                        yield ([str(hsp), "\n\n"], None, hsp.aln_span)
                for hsp in hit:
                    if (hsp.aln_span >= argparser.cont and hsp.gap_num == 0
                            and hsp.aln_span == hsp.ident_num):
                        yield (None, [str(hsp), "\n\n"], hsp.aln_span)
    else:
        for qresult in qresults:
            for hit in qresult:
                for hsp in hit:
                    for v, c, p in encode(simstr(hsp.aln)):
                        if v == "1" and c >= argparser.cont:
                            yield (format_alignment(hsp, p, c), None, c)
                for hsp in hit:
                    for t0, t1, t2 in thrids(encode(simstr(hsp.aln))):
                        if t0[0] == "1":
                            assert (t0[2] < t1[2] < t2[2])
                            assert t2[0] == "1"
                            assert t1[0] == "0"
                            if t0[1] >= argparser.leftmin and t2[1] >= argparser.rightmin and \
                                    (t0[1] + t2[1]) >= argparser.summin and \
                                    t1[1] <= argparser.gapmax:
                                if not (argparser.S and (t0[1] >= argparser.cont
                                                         or t2[1] >= argparser.cont)):
                                    yield (None,
                                           format_alignment(hsp, t0[2], t0[1], t2[2], t2[1]),
                                           t0[1] + t2[1] - t1[1])
def get_hits_to_VPFs(hmmout_file):
    '''Takes a HMMER3 hmmsearch tab output file as input and returns a
    dictionary mapping each scaffold to the number of unique genes that match
    a protein family.

    Input:
    - hmmout_file (str): path to HMMER3 hmmsearch out file in tab format

    Returns:
    - hits_to_VPFs (dict): dictionary where keys are scaffold IDs and values
      are the numbers of unique genes that matched a protein family
    '''
    hits_to_VPFs = {}
    with open(hmmout_file, 'r') as input:
        for qresult in SearchIO.parse(input, 'hmmer3-tab'):
            hits = qresult.hits
            num_hits = len(hits)
            if num_hits > 0:
                for i in range(0, num_hits):
                    query_seq_id = hits[i].id
                    scaffold, gene = query_seq_id.split('|')
                    hits_to_VPFs[scaffold] = hits_to_VPFs.get(
                        scaffold, set([])).union([gene])
    for key, value in iter(hits_to_VPFs.items()):
        hits_to_VPFs[key] = len(value)
    return hits_to_VPFs
def update_proteins(annotation_dir, proteome, seq_col_name, tax_id,
                    identity=0.9, cpus=multiprocessing.cpu_count(), db_init=None):
    print seq_col_name
    if db_init:
        from SNDG.Sequence.ProteinAnnotator import PABase
        PABase.sqldb.initialize(db_init)
    mkdir(annotation_dir)
    out = annotation_dir + "/species_blast.tbl"

    tax = Tax.select().where(Tax.ncbi_taxon_id == tax_id).get()
    species_tax = None
    for tax in Tax.parents(tax):
        if tax.node_rank == "genus":
            species_tax = tax
            break
    tax_data = "/data/xomeq/tax/"
    species_fasta = tax_data + str(int(species_tax.ncbi_taxon_id)) + ".fasta"

    if not os.path.exists(out):
        if not os.path.exists(species_fasta):
            Uniprot.download_proteome_from_tax(str(species_tax.ncbi_taxon_id), tax_data)
        cmd = "blastp -query %s -db %s -evalue 0.00001 -outfmt 6 -max_hsps 1 -qcov_hsp_perc 0.9 -num_threads %i -out %s"
        execute(cmd % (proteome, species_fasta, cpus, out))

    species_desc = {x.id.split("|")[1]: " ".join(x.description.split()[1:])
                    for x in bpio.parse(species_fasta, "fasta")}
    total = Protein.objects(organism=seq_col_name).count()
    with tqdm(bpsio.parse(out, "blast-tab"), total=total) as pbar:
        for query in pbar:
            pbar.set_description(query.id)
            if query[0][0].ident_pct > identity:
                unip = query[0].id.split("|")[1] if "|" in query[0].id else query[0].id
                dbxrefs = [x.db + "||" + x.value
                           for x in Mapping.select().where(Mapping.uniprot == unip)]
                p = Protein.objects(gene=query.id, organism=seq_col_name).no_cache().get()
                if not p.description and unip in species_desc:
                    p.description = species_desc[unip].split("OS=")[0] + " | homology with: " + unip
                    p.save()
                if dbxrefs:
                    p = SearchLoader.update_protein_with_dbxref(query.id, dbxrefs, seq_col_name)
                    p.save()
def handle_input(filename):
    sys.stdout.write("Handling %s\n" % filename)
    not_significant_ids = IdList()
    not_found_ids = IdList()
    prefix = FileRoutines.split_filename(filename)[1]
    index_file = "%s.tmp.idx" % prefix
    hmm_dict = SearchIO.index_db(index_file, filename, args.format)
    if args.output == "stdout":
        out_fd = sys.stdout
    else:
        out_fd = open("%s%s.top_hits" % (args.top_hits_dir, prefix), "w")
    out_fd.write("#query\thit\tevalue\tbitscore\n")
    for query in hmm_dict:
        if hmm_dict[query].hits:
            if hmm_dict[query][0].is_included:
                out_fd.write("%s\t%s\t%s\t%s\n" % (query, hmm_dict[query][0].id,
                                                   hmm_dict[query][0].evalue,
                                                   hmm_dict[query][0].bitscore))
            else:
                not_significant_ids.append(query)
        else:
            not_found_ids.append(query)
    if args.output != "stdout":
        out_fd.close()
    os.remove(index_file)
    return not_significant_ids, not_found_ids
def xml2fasta(infile=None, outfile=None):
    print('\nConverting ' + infile + ' to fasta format, removing duplicates...')
    # Load the blast output file
    blast_qresult = SearchIO.read(infile, "blast-xml")
    # Iterate through ids and sequences and add them to lists. Sequences are
    # only added to the list if they are not already in it, because sometimes
    # there are duplicates in the blast output, which produce an error when
    # aligning with clustal.
    ids = []
    sequences = []
    for hsp in blast_qresult.hsps:
        if hsp.hit.id not in ids:
            ids.append(str(hsp.hit.id))
            sequences.append(str(hsp.hit.seq))
    # Open the sequences output file, then for each high-scoring pair in the
    # blast results write the hit ID (preceded by a ">" for fasta format),
    # followed by the hit sequence on the next line.
    with open(outfile, "w") as f:
        for i, s in zip(ids, sequences):
            f.write('> ' + i + '\n')
            f.write(s + '\n')
    print('\tDone: writing to ' + outfile)
def run_hmmpfam2(query_hmmfile: str, target_sequence: str,
                 extra_args: List[str] = None
                 ) -> List[SearchIO._model.query.QueryResult]:  # pylint: disable=protected-access
    """ Run hmmpfam2 over the provided HMM file and fasta input

        Arguments:
            query_hmmfile: the HMM file to use
            target_sequence: a string in fasta format of the sequence to run

        Returns:
            a list of results as parsed by SearchIO
    """
    config = get_config()
    command = ["hmmpfam2"]

    # Allow disabling multithreading for HMMer2 calls in the command line
    # TODO: fix options for this
    if config.get('hmmer2') and 'multithreading' in config.hmmer2 and \
            config.hmmer2.multithreading:
        command.extend(["--cpu", str(config.cpus)])
    if extra_args:
        command.extend(extra_args)
    command.extend([query_hmmfile, '-'])

    result = execute(command, stdin=target_sequence)
    if not result.successful():
        logging.debug('hmmpfam2 returned %d: %r while searching %r',
                      result.return_code, result.stderr, query_hmmfile)
        raise RuntimeError("hmmpfam2 problem while running %s: %s" % (command, result.stderr))
    res_stream = StringIO(result.stdout)
    return list(SearchIO.parse(res_stream, 'hmmer2-text'))
def _call_hmmer(hmm, inputproteins):
    inputproteins = list(inputproteins)
    scores = {}
    for ip in inputproteins:
        scores[ip.id] = 0
    with ntf(prefix="/dev/shm/") as inputfasta:
        with ntf(prefix="/dev/shm/") as hmmoutput:
            SeqIO.write(inputproteins, inputfasta.name, 'fasta')
            hmmfile = os.path.join(hmm_location, hmm + '.hmm')
            sp.call(['hmmsearch', '-o', hmmoutput.name, hmmfile, inputfasta.name])
            hmmoutput.flush()
            hmmoutput.seek(0)
            QRS = SearchIO.parse(hmmoutput, format="hmmer3-text")
            for qr in QRS:
                # there's *always* a QR, even though it's usually empty.
                # qr.sort()  # I'm kind of hoping this sorts by hit strength.
                # worth checking. I guess it doesn't matter anyway.
                for hit in qr:
                    scores[hit.id] = max(scores[hit.id], hit.bitscore)
                    for hsp in hit.hsps:
                        def appropriate_hyphens(m):
                            return '-' * len(m.group(0))
                        if len(hsp.hit.seq) > 100:
                            hitseq = re.sub('PPPPP+', appropriate_hyphens, str(hsp.hit.seq))
                            hitseq = hitseq.translate(None, '-*').upper()
                            yield hit.id, hsp.bitscore, hitseq
def run_hmmsearch(self, name, hmm):
    """
    Run hmmsearch and return the highest scoring hit
    """
    out = tempfile.NamedTemporaryFile("w")
    cmd = [
        "hmmsearch",
        "--noali",
        "-o",
        out.name,
        os.path.join(self.cov_dir, hmm + "-nt.hmm"),
        os.path.join(self.cov_dir, name + ".fa"),
    ]
    if self.verbose:
        print("Command: {0}".format(cmd))
    try:
        subprocess.run(cmd, check=True)
    except subprocess.CalledProcessError as exception:
        print("Error: {}".format(exception))
        sys.exit("Error running hmmsearch using {}".format(hmm))
    bestscore = 0
    besthit = None
    # Get HSP with highest score
    for qresult in SearchIO.parse(out.name, "hmmer3-text"):
        for hit in qresult:
            for hsp in hit:
                if hsp.bitscore > bestscore:
                    besthit = hsp
                    bestscore = hsp.bitscore
    return besthit
def parse_n_fill_run_data_searchio(run_path, run_data, querydb):
    run_id = get_run_id(run_path)
    run_format = get_run_format(run_path)
    for query in SearchIO.parse(run_path, run_format):
        for hit in query.hits:
            for hsp in hit.hsps:
                exons = [x.hit_range for x in hsp.fragments]
                coverage = 'N/A'
                if querydb is not None:
                    total_matched = sum(x.query_span for x in hsp.fragments)
                    coverage = '{:.2f}%'.format(100 * total_matched / len(querydb[query.id]))
                if hasattr(hsp, 'score'):
                    score = hsp.score
                elif hasattr(hsp, 'bitscore'):
                    score = hsp.bitscore
                else:
                    score = 'N/A'
                if hasattr(hsp, 'ident_num') and hasattr(query, 'seq_len'):
                    matched = '{:.2f}%'.format(100 * hsp.ident_num / query.seq_len)
                else:
                    matched = 'N/A'
                alignment = AlignmentData(run_id, score, matched, coverage,
                                          hsp.hit_range, exons)
                run_data[query.id][hit.id].append(alignment)
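# Hedged sketch of the AlignmentData container used above; its original
# definition is not shown, so a namedtuple with the same field order is
# assumed purely for illustration.
from collections import namedtuple

AlignmentData = namedtuple(
    "AlignmentData",
    ["run_id", "score", "matched", "coverage", "hit_range", "exons"])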
def exonerate_parser(exonerate_file):
    """
    Parse the exonerate result and return the positions of the features in
    4-column bed format: [chro, start, end, name], e.g. ["seq1", 1, 55, "trnP"]

    :param exonerate_file: exonerate output as a string
    :return: list of bed4 entries
    """
    bed4 = []
    texts = SearchIO.parse(StringIO(exonerate_file), format="exonerate-text")
    for record in texts:
        for hsp in record:
            for s in hsp:
                # the Bio.SearchIO interval is 0-based [start, end), so add 1 to
                # the start (and 0 to the end) to get 1-based coordinates
                table_4 = [s.fragment.query_id, s.fragment.query_start + 1,
                           s.fragment.query_end, s.fragment.hit_id]
                bed4.append(table_4)
    bed4.sort()
    return bed4
def runHmmer(args, list_path, file_path, f):
    """run prodigal and hmmsearch on chr files"""
    if not os.path.exists(str(args.data) + '/tmp'):
        os.makedirs(str(args.data) + '/tmp')
    # get the sample group
    head, group = os.path.split(os.path.split(file_path)[0])
    basename = os.path.splitext(str(ntpath.basename(str(file_path))))[0]
    exportpath = str(args.data) + '/tmp/' + ntpath.basename(str(file_path))
    hmmpath = str(args.data) + '/tmp/' + ntpath.basename(str(file_path)) + '.out'
    print('Processing %s of group %s' % (basename, group))
    # run prodigal to predict ORFs
    s = ""
    cmd = ("prodigal -p meta -i ", str(file_path), " -a ", exportpath,
           ' -d /dev/null > /dev/null 2> /dev/null')
    os.system(s.join(cmd))
    # run hmmsearch on faa ORF files
    s = " "
    cmd = ("hmmsearch -E 0.001 --domtblout", hmmpath, 'resources/remove.hmm',
           exportpath, '> /dev/null 2> /dev/null')
    os.system(s.join(cmd))
    # write it to output file if there is a hit
    with open(hmmpath, 'r') as input:
        try:
            for qresult in SearchIO.parse(input, 'hmmscan3-domtab'):
                query_id = qresult.id
                hits = qresult.hits
                num_hits = len(hits)
                acc = qresult.accession
                if num_hits > 0:
                    f.write(''.join((basename, '\t', str(file_path), '\n')))
        except ValueError:
            print('parsing error on %s' % basename)
def get_scores_for_curated_via_hmm(self):
    """
    For every curated variant we want to generate a set of scores against HMMs.
    This is needed to supply the same type of information for curated as well
    as for automatic seqs.
    """
    # Construct the one big file from all curated seqs.
    with open(self.curated_all_fasta, "w") as f:
        for hist_type, seed in self.get_seeds():
            seed_aln_file = os.path.join(self.seed_directory, hist_type, seed)
            for s in SeqIO.parse(seed_aln_file, "fasta"):
                s.seq = s.seq.ungap("-")
                SeqIO.write(s, f, "fasta")
    # Search it by our HMMs
    self.search(hmms_db=self.combined_hmm_file,
                out=self.curated_search_results_file,
                sequences=self.curated_all_fasta)
    # We need to parse this results file; we take here a snippet from
    # load_hmmsearch.py, and tune it to work for our curated seq header format
    for variant_query in SearchIO.parse(self.curated_search_results_file, "hmmer3-text"):
        self.log.info("Loading hmmsearch for variant: {}".format(variant_query.id))
        variant_model = Variant.objects.get(id=variant_query.id)
        for hit in variant_query:
            accession = hit.id.split("|")[1]
            seq = Sequence.objects.get(id=accession)
            try:
                # sometimes we get this: [No individual domains that satisfy
                # reporting thresholds (although complete target did)]
                best_hsp = max(hit, key=lambda hsp: hsp.bitscore)
                add_score(seq, variant_model, best_hsp, seq.variant == variant_model)
            except:
                pass
def blastSearch(query, speciesList, filename, blastDict):
    '''Run BLAST, save results of a search to a file and return its contents

    :param query: String with accession numbers divided by paragraphs
    :param speciesList: String with all species, against which BLAST is performed
    :param filename: Name of original fasta file for saving results of BLAST
    '''
    xmlPath = rootFolder \
        + '/Blast_XML/' \
        + os.path.splitext(filename)[0] \
        + '.xml'
    query = createInputForBlast('.q', query, filename)
    taxidList = createInputForBlast('.t', speciesList, filename)
    blastNotVoid = bashBlast(query=query, out=xmlPath, taxidList=taxidList)
    if blastNotVoid:
        blast = SearchIO.parse(xmlPath, 'blast-xml')
        writeInBlastDict(blast, blastDict)
    os.remove(query)
    os.remove(taxidList)
    os.remove(xmlPath)
    return blastDict
def parse_hmmscan_tab(infile, print_header=True):
    '''Parse hmmscan output in --tblout format'''
    if print_header:
        yield "query", "top hit", "evalue", "certainty", "num sig hits"
    records = SearchIO.parse(infile, 'hmmer3-tab')
    for rec in records:
        query = rec.id
        if len(rec) > 1:
            hit1, hit2 = rec.hits[0], rec.hits[1]
            eval1, eval2 = hit1.evalue, hit2.evalue
            if eval1 != 0:
                # convert to -ln evalue
                eval1 = -np.log(eval1)
            if eval2 != 0:
                eval2 = -np.log(eval2)
            if eval1 == 0 and eval2 != 0:
                # this may be a hack, I don't care
                certainty = 1
            elif eval1 == 0 and eval2 == 0:
                certainty = 0
            else:
                # calculate certainty with an information-theoretic measure
                total = eval1 + eval2
                p1, p2 = eval1 / total, eval2 / total
                certainty = 1 + (p1 * np.log2(p1)) + (p2 * np.log2(p2))
        else:
            certainty = 1
        yield query, rec.hits[0].id, rec.hits[0].evalue, certainty, len(rec)
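# A worked example of the certainty measure above (not from the original
# source). With -ln E-values of 100 and 50 for the top two hits, p1 = 2/3 and
# p2 = 1/3, so certainty = 1 + p1*log2(p1) + p2*log2(p2) is roughly 0.082:
# two comparably strong hits leave little certainty in the top hit.
import numpy as np

def certainty_from_neglog_evalues(eval1, eval2):
    total = eval1 + eval2
    p1, p2 = eval1 / total, eval2 / total
    return 1 + (p1 * np.log2(p1)) + (p2 * np.log2(p2))

assert abs(certainty_from_neglog_evalues(100.0, 50.0) - 0.0817) < 1e-3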
def get_adenylation_domains(fasta, known=None, lagging_strand=False):
    adenylation_domains = []
    fasta_seqs = []
    for fs in SeqIO.parse(fasta, 'fasta'):
        revcom = False
        seq = str(fs.seq)
        pepseq, rf = get_pepseq(seq)
        # the original chained comparison `rf < 0 == lagging_strand` evaluates
        # as `rf < 0 and not lagging_strand`; written out explicitly here
        if rf < 0 and not lagging_strand:
            revcom = True
            seq = utils.reverse_complement(seq)
        # store pepseq and revcom per record so the second loop does not reuse
        # the last values from this loop
        fasta_seqs.append({'id': fs.id, 'seq': seq, 'pepseq': pepseq,
                           'rf': rf, 'revcom': revcom})
    for fs in fasta_seqs:
        utils.run_cmd([hmmsearch, '--domtblout', 'dump',
                       os.path.abspath('lib/AMP-binding.hmm'), '-'],
                      '>header\n' + fs['pepseq'])
        with open('dump') as f:
            out = f.read()
        res_stream = StringIO(out)
        os.remove('dump')
        results = list(SearchIO.parse(res_stream, 'hmmsearch3-domtab'))
        for result in results:
            for i, hsp in enumerate(result.hsps, 1):
                s = hsp.hit_start
                e = hsp.hit_end
                adenylation_domains.append(
                    (AdenylationDomain(fs['seq'][s * 3:e * 3], known,
                                       '{}_{}'.format(fs['id'], i),
                                       fs['revcom']),
                     s, e))
    return adenylation_domains
def first_exonerate_parse(dir, newdir, prefix):
    cwd = os.getcwd()
    if not os.path.exists(cwd + newdir):
        os.makedirs(cwd + newdir)
    if not os.path.exists(cwd + '/merged_exons/'):
        os.makedirs(cwd + '/merged_exons/')
    for file in slistdir(cwd + dir):
        if 'DS_Store' not in file:
            result = SearchIO.parse(cwd + dir + file, 'exonerate-text')
            for h in result:            # QueryResult
                for hh in h:            # Hit
                    for hhh in hh:      # HSP
                        hitcounter = 1
                        for hhhh in hhh:  # HSPFragment
                            hitseq = hhhh.query
                            rootname = file.split('.fasta')
                            orthoname = file.split("_")
                            orthosubdir = cwd + newdir + '/' + orthoname[0]
                            if not os.path.exists(orthosubdir):
                                os.makedirs(orthosubdir)
                            newseqstr = str(hitseq.seq.ungap("-"))
                            newid = prefix + str(hitcounter) + '_' + rootname[0]
                            record = SeqRecord(Seq(newseqstr, generic_dna),
                                               id=newid, description='')
                            fastaname = prefix + str(hitcounter) + '_' + rootname[0] + '.fasta'
                            SeqIO.write(record, orthosubdir + '/' + fastaname, "fasta")
                            hitcounter += 1
def parse_results(path, file_name, FA_FILES_PATH, top_k=3, add_to_db=False):
    """Parse a result of a blast query.

    Return the top k matches and optionally add them to the database.
    """
    print(f"Parsing {file_name} at {path}")
    i = 0
    results = list()
    for bresults in SearchIO.parse(path, 'blast-xml'):
        for r in bresults:
            i += 1
            # Select only top k
            if i <= top_k:
                results.append({
                    "rank": i,
                    "id": r.id,
                    "query_id": r.query_id,
                    "full_name": r.description_all,
                    "bitscore": r.hsps[0].bitscore,
                    "evalue": r.hsps[0].evalue,
                    "query_range": r.hsps[0].query_range,
                    "hit_range": r.hsps[0].hit_range,
                })
            elif i > top_k and add_to_db is True:
                print("Top 3 results saved to database")
                add_to_database(results=results, FA_FILES_PATH=FA_FILES_PATH)
                break
            else:
                return results
def retrieve_blast_data(self):
    for blast_file in glob.glob(self.blast_data_path):
        print(blast_file)
        print self.network_data
        qresults = SearchIO.parse(blast_file, 'blast-tab', comments=True)
        for qresult in qresults:
            if qresult.id in self.network_data:
                print qresult.id
def check_raw(self, filename, id, raw, **kwargs): """Index filename using **kwargs, check get_raw(id)==raw.""" idx = SearchIO.index(filename, self.fmt, **kwargs) raw = _as_bytes(raw) self.assertEqual(raw, idx.get_raw(id)) idx.close() #Now again, but using SQLite backend if sqlite3: idx = SearchIO.index_db(":memory:", filename, self.fmt, **kwargs) self.assertEqual(raw, idx.get_raw(id)) idx.close() if os.path.isfile(filename + ".bgz"): #Do the tests again with the BGZF compressed file print "[BONUS %s.bgz]" % filename self.check_raw(filename + ".bgz", id, raw, **kwargs)
def check_index(self, filename, format, **kwargs):
    # check if Python3 installation has sqlite3
    try:
        import sqlite3
    except ImportError:
        sqlite3 = None
    if filename.endswith(".bgz"):
        handle = gzip.open(filename)
        parsed = list(SearchIO.parse(handle, format, **kwargs))
        handle.close()
    else:
        parsed = list(SearchIO.parse(filename, format, **kwargs))
    # compare values by index
    indexed = SearchIO.index(filename, format, **kwargs)
    self.assertEqual(len(parsed), len(indexed),
                     "Should be %i records in %s, index says %i"
                     % (len(parsed), filename, len(indexed)))
    # compare values by index_db, only if sqlite3 is present
    if sqlite3 is not None:
        db_indexed = SearchIO.index_db(':memory:', [filename], format, **kwargs)
        self.assertEqual(len(parsed), len(db_indexed),
                         "Should be %i records in %s, index_db says %i"
                         % (len(parsed), filename, len(db_indexed)))
    for qres in parsed:
        idx_qres = indexed[qres.id]
        # parsed and indexed qresult are different objects!
        self.assertNotEqual(id(qres), id(idx_qres))
        # but they should have the same attribute values
        self.assertTrue(compare_search_obj(qres, idx_qres))
        # sqlite3 comparison, only if it's present
        if sqlite3 is not None:
            dbidx_qres = db_indexed[qres.id]
            self.assertNotEqual(id(qres), id(dbidx_qres))
            self.assertTrue(compare_search_obj(qres, dbidx_qres))
    indexed.close()
    if sqlite3 is not None:
        db_indexed.close()
        db_indexed._con.close()
    if os.path.isfile(filename + ".bgz"):
        # Do the tests again with the BGZF compressed file
        print("[BONUS %s.bgz]" % filename)
        self.check_index(filename + ".bgz", format, **kwargs)
def parse(target):
    blast_result = list(SearchIO.parse('BlastResult.xml', 'blast-xml'))
    for record in blast_result:
        if len(record) == 0:
            continue
        else:
            tophit = record[0]
            target.append([tophit[0][0].query, tophit[0][0].hit])
def read_hmmer_file(f_path):
    """
    Use Biopython's SearchIO to parse a HMMER output file.
    Return the single QueryResult the file contains.
    """
    f_path = _check_file(f_path)
    return SearchIO.read(f_path, "hmmer3-text")
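# Minimal usage sketch (not part of the original source), assuming
# "hmmer_output.txt" is a plain-text hmmsearch output with a single query:
if __name__ == "__main__":
    qresult = read_hmmer_file("hmmer_output.txt")
    for hit in qresult.hits:
        print(hit.id, hit.evalue, hit.bitscore)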
def find_frameshift(sbjct_dict, query_dict, pseudo_hits, temp_dir, out_frameshift):
    assert os.path.isdir(temp_dir)
    sys.stderr.write("finding frameshift mutations...\n")
    frameshifts = []
    non_frameshift = []
    exn_query = temp_dir + "/" + "exn_query.fa"
    exn_target = temp_dir + "/" + "exn_target.fa"
    align_file = temp_dir + "/" + "fshift_exonerate.exn"
    for hit in pseudo_hits:
        chrom = hit[0]
        record = sbjct_dict[chrom]
        qseqid = hit[8].split(";")[0].split("=")[1]
        SeqIO.write(query_dict[qseqid], exn_query, "fasta")
        flank = 1000
        flank_record = _get_hit_record(record, hit, flank)
        SeqIO.write(flank_record, exn_target, "fasta")
        # alignment using exonerate
        p = subprocess.Popen("exonerate -m protein2dna -n 1 -q " + exn_query +
                             " -t " + exn_target + ">" + align_file, shell=True)
        os.waitpid(p.pid, 0)
        fshift = False
        try:
            qresult = SearchIO.read(align_file, "exonerate-text")
            hsp = qresult[0][0]  # first hit, first hsp
            # query overlapping with the best-hit
            new_hit_start = flank + 1
            new_hit_end = len(flank_record.seq) - flank
            if hsp.hit_start + 1 <= new_hit_end and hsp.hit_end >= new_hit_start:
                # there are frameshifts
                if len(hsp.hit_frame_all) > 1:
                    fshift = True
        except:
            pass
        if fshift:
            fshift_seq = flank_record.seq[hsp.hit_start:hsp.hit_end]
            fshift_id = hit[0] + ":" + str(hsp.hit_start + 1) + "-" + str(hsp.hit_end)
            frameshift_record = SeqRecord(fshift_seq, id=fshift_id, name=fshift_id,
                                          description="qseqid=" + qseqid)
            frameshifts.append(frameshift_record)
        else:
            non_frameshift.append(hit)
    SeqIO.write(frameshifts, out_frameshift, "fasta")
    sys.stderr.write("done.\n")
    return non_frameshift
def generate_blast_data(self):
    self.initialize_variables()
    for blast_file in glob.glob(self.blast_data_path):
        # Parse each Blast file
        query_results = SearchIO.parse(blast_file, 'blast-tab', comments=True)
        filtered_query_results = self.apply_filtering(query_results)
        # Parse each blast record
        for query_result in filtered_query_results:
            print query_result.id
            self.generate_blast_graph(query_result)
def getIndices(resultHandle):
    '''If not provided directly by the user, this function retrieves the best
    BLAST hit's indices.'''
    blast_result = SearchIO.read(resultHandle, 'blast-tab')
    print(blast_result[0][0])
    start = blast_result[0][0].hit_start
    end = blast_result[0][0].hit_end
    return start, end
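# Usage sketch with a hypothetical file name: pull the best hit's subject
# coordinates from a tabular BLAST result.
if __name__ == "__main__":
    with open("blast_output.tsv") as result_handle:
        start, end = getIndices(result_handle)
        print("best hit spans %d-%d on the subject" % (start, end))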
def runBlastParserTAB(cline, blast_out_file, comments=False):
    # note: the original signature used the literal `False` as the third
    # parameter name; renamed here to `comments` and passed through
    startTime = datetime.now()
    os.system(str(cline))
    print 'Running BLAST:' + str(datetime.now() - startTime)
    startTime = datetime.now()
    blast_records = SearchIO.parse(blast_out_file, 'blast-tab', comments=comments)
    print 'Parsing Results:' + str(datetime.now() - startTime)
    return blast_records
def process_tot():
    print "beginning process_tot"
    # level name
    level = sys.argv[3] + " "
    # get the name of the proteome from the file name
    omeid = sys.argv[1].replace(".fasta", "")
    omeid = omeid.split('/')
    omeid = omeid[len(omeid) - 1]
    # read in results; materialize the generator so len() works for counting scans
    results = list(SearchIO.parse(sys.argv[2], "hmmer3-text"))
    count = 0
    scans = len(results)
    cutoff = 1.0 / scans
    print scans
    print cutoff
    for protein in results:
        processed = []  # entries to write for this protein
        if count % 100 == 0:
            print count
        count = count + 1
        pid = protein.id + " "
        if len(protein) == 0:
            # if a protein has no hits, groupid=proteinid and rank=0
            rank = "0 "
            OGid = protein.id + " "
            e = "n/a "
            qr = "n/a "
            processed.append((rank, level, pid, OGid, e, qr, omeid))
        elif protein[0].evalue > cutoff:
            # proteins with hits that do not meet the threshold are treated
            # as those without any hits
            rank = "0 "
            OGid = protein.id + " "
            e = "n/a "
            qr = "n/a "
            processed.append((rank, level, pid, OGid, e, qr, omeid))
        else:
            i = 0
            while i < len(protein) and protein[i].evalue <= cutoff:
                rank = str(i + 1) + " "
                OGid = protein[i].id.split('.')
                OGid = OGid[0] + "." + OGid[1] + " "
                e = str(protein[i].evalue) + " "
                qr = []  # domain query ranges for this hit
                for d in protein[i]:
                    qr.append(d.query_range)
                processed.append((rank, level, pid, OGid, e,
                                  str(qr).replace(" ", ""), " ", omeid))
                i += 1
        # write this protein's entries to file
        for entry in processed:
            output.write("".join(str(s) for s in entry) + "\n")
    output.close()