コード例 #1
1
def blastparse(stdout, output, tname, ntname):
    """Mask BLAST-matched regions of the contigs and write the survivors.

    Every HSP found in the tabular BLAST output is overwritten with ``N`` in
    the matching record of the module-level ``recorddict``.  Records whose
    masked sequence still matches the retention pattern are written to
    ``output`` as FASTA; all other records are removed from ``recorddict``.

    :param stdout: tabular BLAST output held in a string
    :param output: path of the FASTA file to (over)write
    :param tname: target name, used only in the error message
    :param ntname: non-target name, used only in the error message
    """
    global recorddict, minLength
    # The context manager closes the FASTA file on every exit path.
    with open(output, 'w') as handle:
        # SearchIO wants a file-like object, so wrap the string.
        blast_handle = cStringIO.StringIO(stdout)
        try:  # Necessary to avoid bad genomes
            for qresult in SearchIO.parse(blast_handle, 'blast-tab'):
                for hit in qresult:
                    for hsp in hit:
                        if hsp.query_id not in recorddict:
                            continue
                        record = recorddict[hsp.query_id]
                        # SearchIO ranges are 0-based and half-open, so the
                        # masked stretch is exactly (hi - lo) long.  Keeping
                        # the length unchanged also keeps the coordinates of
                        # later HSPs on the same contig valid.
                        lo, hi = sorted(hsp.query_range)
                        record.seq = record.seq[:lo] + 'N' * (hi - lo) + record.seq[hi:]
            # The pattern is the same for every record, so build it once.
            pattern = r'[ATCG]{100,}N{200,900}[ATCG]{100,}|[^N]{' + re.escape(str(minLength)) + r'}'
            # Iterate over a snapshot of the keys because entries are popped.
            for idline in list(recorddict):
                # NOTE(review): re.match anchors at the start of the sequence;
                # confirm that a match anywhere (re.search) is not intended.
                if re.match(pattern, str(recorddict[idline].seq)) is not None:
                    SeqIO.write(recorddict[idline], handle, "fasta")
                else:
                    recorddict.pop(idline)
        except ValueError:
            print('Value Error: There was an error removing %s genome from %s' % (ntname, tname))
コード例 #2
0
    def check_index(self, filename, format, **kwargs):
        """Check that indexed lookups agree with sequential parsing.

        The file is parsed once into a list, then every parsed QueryResult is
        compared with the object returned by ``SearchIO.index`` and, when
        sqlite3 can be imported, by ``SearchIO.index_db`` as well.
        """
        # sqlite3 is optional; without it the index_db checks are skipped
        try:
            import sqlite3
        except ImportError:
            sqlite3 = None
        use_db = sqlite3 is not None

        expected = list(SearchIO.parse(filename, format, **kwargs))

        by_index = SearchIO.index(filename, format, **kwargs)
        self.assertEqual(len(expected), len(by_index.keys()))
        if use_db:
            by_db = SearchIO.index_db(':memory:', [filename], format, **kwargs)
            self.assertEqual(len(expected), len(by_db.keys()))

        def assert_equivalent(reference, candidate):
            # distinct objects that nevertheless carry the same values
            self.assertNotEqual(id(reference), id(candidate))
            self.assertTrue(compare_search_obj(reference, candidate))

        for qresult in expected:
            assert_equivalent(qresult, by_index[qresult.id])
            if use_db:
                assert_equivalent(qresult, by_db[qresult.id])

        by_index._proxy._handle.close()  # TODO - Better solution
        if use_db:
            by_db.close()
            by_db._con.close()
コード例 #3
0
def main():
    """Convert the input search file to another SearchIO format.

    All settings come from the module-level ``args`` object (``infile``,
    ``in_type``, ``out_type``, ``output`` and ``keywords``).  The run is
    aborted with exit status 1 when the input extension is not one of the
    extensions listed for ``in_type``.
    """
    extensions = {
        'blast-tab': ['tsv', 'csv', 'blast', 'm8', 'blastm8'],
        'blast-text': ['txt', 'bls', 'blast'],
        'blast-xml': ['xml'],
        'blat-psl': ['psl'],
        'hmmer3-tab': ['tsv', 'csv'],
        'hmmer3-text': ['txt'],
        'hmmer2-text': ['txt'],
        'exonerate-text': ['txt'],
    }
    writer_options = args.keywords
    source_path = args.infile
    source_format = args.in_type

    name_parts = source_path.split('.')
    source_ext = name_parts[-1]
    accepted = extensions[source_format]
    suggested_ext = accepted[0]
    if source_ext not in accepted:
        message = ("error: invalid input file extension \"{}\". An "
                   "appropriate extension for this input type is {}"
                   .format(source_ext, suggested_ext))
        print(textwrap.fill(message, 79))
        sys.exit(1)

    target_format = args.out_type
    if args.output:
        target = io_check(args.output, 'w')
    else:
        # no explicit output: reuse the input name with the first extension
        # registered for the output format
        target_ext = extensions[target_format][0]
        default_name = "{}.{}".format('.'.join(name_parts[:-1]), target_ext)
        target = io_check(default_name, 'w')

    print("output will be in {} and formatted as {}".format(target, target_format))
    SearchIO.convert(source_path, source_format, target, target_format,
                     out_kwargs=writer_options)
コード例 #4
0
    def check_raw(self, filename, id, raw, **kwargs):
        """Check that get_raw(id) on an index of filename equals raw.

        The comparison is made for ``SearchIO.index`` and, when ``sqlite3`` is
        truthy, for ``SearchIO.index_db``.  If ``filename + ".bgz"`` exists the
        whole check is repeated on that file.
        """
        def assert_raw_matches(index):
            found = index.get_raw(id)
            self.assertTrue(isinstance(found, bytes),
                            "Didn't get bytes from %s get_raw" % self.fmt)
            # Line endings may differ between the expected text and the file,
            # so both sides are normalised to \n before comparing.
            self.assertEqual(expected.replace(b'\r\n', b'\n'),
                             found.replace(b'\r\n', b'\n'))
            index.close()

        file_index = SearchIO.index(filename, self.fmt, **kwargs)
        expected = _as_bytes(raw)
        assert_raw_matches(file_index)

        # Same check through the SQLite backend
        if sqlite3:
            assert_raw_matches(
                SearchIO.index_db(":memory:", filename, self.fmt, **kwargs))

        bgz_name = filename + ".bgz"
        if os.path.isfile(bgz_name):
            # Repeat everything on the BGZF compressed copy
            print("[BONUS %s.bgz]" % filename)
            self.check_raw(bgz_name, id, expected, **kwargs)
コード例 #5
0
 def parse_write_and_compare(self, source_file, source_format, out_file, out_format, **kwargs):
     """Write all parsed QueryResults to out_file, parse it back and compare pairwise."""
     originals = list(SearchIO.parse(source_file, source_format, **kwargs))
     SearchIO.write(originals, out_file, out_format, **kwargs)
     reparsed = list(SearchIO.parse(out_file, out_format, **kwargs))
     # zip stops at the shorter list, so only the common prefix is compared
     for before, after in zip(originals, reparsed):
         same = compare_search_obj(before, after)
         self.assertTrue(same)
コード例 #6
0
 def read_write_and_compare(self, source_file, source_format, out_file,
         out_format, **kwargs):
     """Write the single QueryResult read from source_file, read it back and compare."""
     original = SearchIO.read(source_file, source_format, **kwargs)
     SearchIO.write(original, out_file, out_format, **kwargs)
     round_tripped = SearchIO.read(out_file, out_format, **kwargs)
     same = compare_search_obj(original, round_tripped)
     self.assertTrue(same)
コード例 #7
0
def main(args):
    """Convert a BLAST tabular file to BLAST XML.

    :param args: argv-style list.  ``args[1]`` is the tabular input file and
        the optional ``args[2]`` is the XML output path.  Without ``args[2]``
        the output is the input path with its last extension replaced by
        ``.xml``.  Any other argument count prints a usage line and returns.
    """
    import os  # local import: only needed to derive the default output name

    if len(args) == 2:
        # splitext strips only the final extension, so directories or file
        # stems that contain dots ("./run.1/hits.tab") keep their full name.
        xml_path = os.path.splitext(args[1])[0] + ".xml"
    elif len(args) == 3:
        xml_path = args[2]
    else:
        print("Usage: path/to/blast/tabular/file [optional path/for/new/blast/xml/file]")
        return

    SearchIO.convert(args[1], 'blast-tab', xml_path, 'blast-xml')
コード例 #8
0
ファイル: compare-blast-results.py プロジェクト: Rinoahu/MICA
def parseBlastOutFile(filename):
    """Parse a BLAST output file into a dict of QueryResults keyed by query id.

    The format is chosen from the last three characters of ``filename``:
    ``xml`` is parsed as ``blast-xml`` and ``txt`` as ``blast-tab``.

    :param filename: path of the BLAST output file
    :returns: ``{query id: QueryResult}``
    :raises AssertionError: if the name ends in neither ``xml`` nor ``txt``
    """
    formats = {"xml": 'blast-xml', "txt": 'blast-tab'}
    suffix = filename[-3:]
    if suffix not in formats:
        print("Unrecognized filetype.")
        # Raised explicitly: an ``assert`` statement is removed under -O,
        # which would let execution continue without a parser.
        raise AssertionError("Unrecognized filetype.")

    parsed = {qRes.id: qRes for qRes in SearchIO.parse(filename, formats[suffix])}
    print("Parsed "+filename)

    return parsed
def start_queryResult_generator(inFile, fDic, work_sheet):
    """Split a BLAST XML file into one XML file per hit and log every query.

    The input is streamed one QueryResult at a time.  A query without hits is
    written whole into the ``noHit`` sub-folder; otherwise each hit is written
    to its own file, named from the hit's HSP with the lowest e-value.  Every
    query is also passed to ``write_qr_to_ws`` and, at the end, the worksheet
    header is built for the largest hit count seen.

    See http://biopython.org/DIST/docs/api/Bio.SearchIO.BlastIO-module.html

    :param inFile: BLAST XML file to read
    :param fDic: mapping providing the ``topDir`` and ``noHit`` path prefixes
    :param work_sheet: worksheet object handed to the worksheet helpers
    :returns: the generator created by ``SearchIO.parse`` (consumed by then)
    """
    records = SearchIO.parse(inFile, 'blast-xml')
    widest = 0
    for query_number, query_result in enumerate(records, 1):
        print('Processing Query BLAST return ' + str(query_number))
        hit_total = len(query_result.hits)
        widest = max(widest, hit_total)
        if hit_total == 0:
            # queries without hits go to the no-hit sub-folder
            target = str(fDic['topDir'] + fDic['noHit'] + 'Query_'
                         + str(query_number) + '_H_none.xml')
            SearchIO.write(query_result, target, 'blast-xml')
        else:
            for position, hit in enumerate(query_result.hits):
                hsps = hit.hsps
                # first HSP wins ties, later ones must be strictly better
                best = hsps[0]
                for candidate in hsps[1:]:
                    if candidate.evalue < best.evalue:
                        best = candidate
                target = str(fDic['topDir'] + outputFileName(query_number, hit, best))
                # a one-hit slice keeps the query header with just this hit
                SearchIO.write(query_result[position:position + 1], target, 'blast-xml')
        write_qr_to_ws(query_number, query_result, work_sheet)
    build_ws_header(work_sheet, widest)
    return records
コード例 #10
0
	def __init__(self,Maxicircle,out_file):
		"""Run blastx against every protein database and write the best hits as GFF3.

		For each ``*.faa`` file in the hard-coded database folder, blastx is
		run with ``Maxicircle`` as the query; the first HSP of the first hit,
		if any, becomes one tab-separated ``exon`` row in ``out_file``.

		:param Maxicircle: query passed to blastx
		:param out_file: path of the tab-separated output file
		"""
		# The context manager guarantees the rows are flushed and the file is
		# closed, including when a blastx run fails part-way.
		with open(out_file,'w') as file_out:
			writer = csv.writer(file_out,delimiter="\t")
			writer.writerow(["##gff-version","3"])
			rows=[]

			for protein in glob.glob("/Users/Said/Github/Maxicircle/DB/AA/*.faa"):
				db_name=protein.split("/")[-1]
				output_file=db_name+".xml"
				blastx_cline = NcbiblastxCommandline(query=Maxicircle, db=protein,
				                                     outfmt=5, out=output_file)
				blastx_cline()
				blast_qresult = SearchIO.read(output_file, 'blast-xml')
				if len(blast_qresult)>0:
					best=blast_qresult[0][0]
					start,end=best.query_range
					query_strand="+" if best.query_strand>0 else "-"
					rows.append([best.query_id,".","exon",start,end,".",query_strand,".","ID="+db_name.split(".faa")[0]])

			print(str(len(rows))+" exons found")
			writer.writerows(rows)
コード例 #11
0
def main(argv):
    """Flatten a HMMER3 tabular file into a TSV of hits and count unique queries.

    Each hit is printed and written to the output file as accession, id,
    query id, description and bit score.  The number of distinct query ids is
    printed at the end.

    :param argv: command-line arguments handed to ``parse_arguments``; the
        parsed result must provide ``infile`` and ``out_path``
    """
    args = parse_arguments(argv)

    infile = args.infile
    out = args.out_path
    # A set gives constant-time membership tests for the unique-query count.
    seen_queries = set()

    with open(out, 'w+') as output:
        # NOTE(review): the header names four columns while each row below
        # has five (the bit score is unlabelled) - confirm before relying on it.
        output.write(
            "%s\t%s\t%s\t%s\n" %
            ("Accession", "family", "query_name", "Resfams_description"))
        for qresult in SearchIO.parse(infile, "hmmer3-tab"):
            for hit in qresult:
                row = "\t".join([hit.accession, hit.id, hit.query_id,
                                 hit.description, str(hit.bitscore)])
                print(row)
                output.write(row + "\n")
                seen_queries.add(hit.query_id)
        print("Unique Seqs: " + str(len(seen_queries)))
コード例 #12
0
    def quick_structurome(self, xml_blast_result, data_dir, entries, tmp_dir="/tmp/chain_PDBs",
                          pdb_divided="/data/databases/pdb/divided/", max_models=3):
        """Pick template HSPs from a BLAST XML file and build models from them.

        For every hit the first HSP is kept when its identity
        (``ident_num / aln_span``) lies in ``[0.6, 0.95)``.  The kept HSPs are
        grouped by query id and each group is handed to
        ``Modelome.model_hsps``; any exception raised while modelling a group
        is logged and the loop continues.
        """
        templates = defaultdict(list)

        _log.info("searching good templates")
        for query in tqdm(bpsio.parse(xml_blast_result, "blast-xml")):
            for hit in query:
                hsps = list(hit)
                if not hsps:
                    continue
                top = hsps[0]
                identity = 1.0 * top.ident_num / top.aln_span
                if 0.6 <= identity < 0.95:
                    templates[top.query.id].append(top)

        grouped = templates.items()

        _log.info("creating models")
        with tqdm(grouped) as pbar:
            for seq, hsps in pbar:
                try:
                    # imported here so an import failure is logged like any
                    # other modelling error
                    from SNDG.Structure.Modelome import Modelome
                    Modelome.model_hsps(seq, data_dir, hsps, entries=entries, tmp_dir=tmp_dir,
                                        pdb_divided=pdb_divided, max_models=max_models)
                except Exception as ex:
                    _log.exception(ex)
コード例 #13
0
    def load_hsp_dict(self, xml_blast_result):
        """Store the first HSP of every hit in ``self.hsp_dict[query id][hit id]``."""
        for query in bpsio.parse(xml_blast_result, "blast-xml"):
            for hit in query:
                hsps = list(hit)
                if not hsps:
                    # hits without HSPs leave no entry
                    continue
                first = hsps[0]
                self.hsp_dict[query.id][first.hit.id] = first
コード例 #14
0
 def generate_protein_model(self, query: str, template: str,
                            blast_xml_path: str, out_dir: str,
                            template_dir: str):
     """Write a PIR alignment for query/template and run the modelling script.

     The alignment is taken from the first HSP of the single BLAST hit whose
     id equals ``template``.  The template row is passed through
     ``replace_missing_residues`` together with the template's ``.ent`` file.
     The PIR file is written to ``out_dir`` (created if needed) and
     ``modeller_script.py`` is launched through ``self.modpysh``.
     """
     qresult = SearchIO.read(blast_xml_path, 'blast-xml')
     matching = [hit for hit in qresult.hits if hit.id == template]
     assert len(matching) == 1
     alignment = matching[0].hsps[0].aln

     template_seq = replace_missing_residues(str(alignment[1].seq),
                                             f'{template_dir}/{template}.ent')
     Path(out_dir).mkdir(parents=True, exist_ok=True)
     pir_file = f'{out_dir}/{template}.pir'

     query_record = SeqRecord(Seq(str(alignment[0].seq), generic_protein),
                              id=query,
                              name='',
                              description=f'sequence:{query}::::::::')
     # the sixth character of the template id fills both chain fields
     chain = template[5].upper()
     template_record = SeqRecord(
         Seq(template_seq, generic_protein),
         id=template,
         name='',
         description=f'structureX:{template}::{chain}::{chain}::::')
     SeqIO.write([query_record, template_record], pir_file, 'pir')

     script = Path(__file__).parent.resolve() / 'modeller_script.py'
     command = [self.modpysh, 'python3', script, pir_file,
                template, query, template_dir]
     subprocess.run(command,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                    universal_newlines=True)
コード例 #15
0
def runHMMsearch(input, basename, tmpdir, cpus, evalue, hmm):
    """Run hmmsearch on a protein FASTA file and summarise the top hit per result.

    :param input: protein FASTA file searched by hmmsearch
    :param basename: prefix of the hmmsearch output file created in ``tmpdir``
    :param tmpdir: directory receiving ``<basename>.hmmsearch.txt``
    :param cpus: value passed to ``--cpu``
    :param evalue: value passed to ``-E`` (must already be a string)
    :param hmm: HMM file passed to hmmsearch
    :returns: dict keyed by the top hit's ``query_id``; each value is
        ``[hit id, bit score, e-value, summed aln_span, 'Hmmer3',
        description, protein sequence]``
    """
    Results = {}
    #load proteins into dictionary
    protein_dict = SeqIO.to_dict(SeqIO.parse(input, 'fasta'))
    #do hmmer search of proteins
    HMM = os.path.join(tmpdir, basename + '.hmmsearch.txt')
    subprocess.call(
        ['hmmsearch', '-o', HMM, '--cpu',
         str(cpus), '-E', evalue, hmm, input],
        stdout=FNULL,
        stderr=FNULL)
    # Plain 'r': the 'U' flag is rejected by open() since Python 3.11 and
    # text mode already translates newlines on Python 3.
    with open(HMM, 'r') as results:
        for qresult in SearchIO.parse(results, "hmmer3-text"):
            hits = qresult.hits
            if not hits:
                continue
            top = hits[0]
            key = top.query_id
            # only the first result seen for a given key is kept
            if key not in Results:
                aln_length = sum(hsp.aln_span for hsp in top.hsps)
                Results[key] = [top.id, top.bitscore, top.evalue, aln_length, 'Hmmer3']
    for k, v in Results.items():
        # NOTE(review): ``base`` is not defined in this function - presumably
        # a module-level name; confirm it is not meant to be ``basename``.
        description = base + '|' + k + "|" + v[0] + "|evalue=" + str(
            v[2]) + "|HMMer3-Complete"
        Results[k].append(description)
        Results[k].append(str(protein_dict[v[0]].seq))
    return Results
コード例 #16
0
def features_via_hmm(seq, hmmdb, eval_thresh=1.0):
    """Run hmmscan on one sequence and return its domain hits as SeqFeatures.

    The sequence is written to a uniquely named FASTA file in the current
    directory, scanned against ``hmmdb`` (a database prepared with hmmpress)
    and the domain table is parsed.  All scratch files are removed before
    returning, including when the scan or the parsing raises.

    The original author notes that this needs strictly HMMER 3.0.

    :param seq: sequence to scan
    :param hmmdb: path of the pressed HMM database
    :param eval_thresh: only HSPs with an e-value below this are reported
    :returns: list of ``SeqFeature`` objects of type ``"domain"``
    """
    features = list()
    ufn = str(uuid.uuid4())
    scratch_files = [ufn + '.fasta', ufn + '.out', ufn + '.tbl', ufn + '.dtbl']
    try:
        SeqIO.write(
            [SeqRecord(seq, id='QUERY', name='QUERY', description="QUERY")],
            ufn + '.fasta', 'fasta')
        subprocess.call([
            "hmmscan", "-o", ufn + ".out", "--tblout", ufn + ".tbl", "--domtblout",
            ufn + ".dtbl", hmmdb, ufn + '.fasta'
        ])

        for qresult in SearchIO.parse(ufn + ".dtbl", "hmmscan3-domtab"):
            for hit in qresult:
                for hsp in hit.hsps:
                    if hsp.evalue < eval_thresh:
                        features.append(
                            SeqFeature(FeatureLocation(hsp.query_start, hsp.query_end),
                                       type="domain",
                                       qualifiers={
                                           'name': hsp.hit_id,
                                           'evalue': hsp.evalue
                                       }))
    finally:
        # os.remove works on every platform, unlike shelling out to ``rm``.
        for path in scratch_files:
            try:
                os.remove(path)
            except OSError:
                # best effort: a file hmmscan never produced is not an error
                pass
    return features
コード例 #17
0
ファイル: FileManager.py プロジェクト: Falgunithakor/BioInfo
    def generate_blast_graph(self):
        """Build ``self.blast_graph`` from tabular BLAST files and log its edges.

        Every file matched by ``self.blast_data_path`` is parsed.  A query is
        linked to each hit that keeps at least one HSP with an e-value below
        ``self.evalue``.  One ``query:hit,hit,`` line per query is appended
        to ``blast_graph.txt`` in ``self.blast_output_path``; the graph is
        also appended to ``blast_graph.gml`` when
        ``self.generate_gml_files`` is set.
        """
        def passes_evalue(hsp):
            return hsp.evalue < self.evalue

        file_name = "{}/blast_graph.txt".format(self.blast_output_path)
        for blast_file in glob.glob(self.blast_data_path):
            print("working on " + blast_file)
            for qresult in SearchIO.parse(blast_file, 'blast-tab', comments=True):
                neighbours = []
                for hit in qresult[:]:
                    # the query only becomes a node once it has some hit
                    if not self.blast_graph.has_node(qresult.id):
                        self.blast_graph.add_node(qresult.id)
                    # filter() gives None when no HSP survives the cut-off
                    kept = hit.filter(passes_evalue)
                    if kept is None:
                        continue
                    if not self.blast_graph.has_node(kept.id):
                        self.blast_graph.add_node(kept.id)
                    self.blast_graph.add_edge(qresult.id, kept.id)
                    neighbours.append(kept.id + ",")
                with open(file_name, "a") as f_handle:
                    f_handle.write(qresult.id + ":" + "".join(neighbours) + '\n')

        # Write GML files
        if self.generate_gml_files:
            file_name = "{}/blast_graph.gml".format(self.blast_output_path)
            with open(file_name, "a") as f_handle:
                nx.write_gml(self.blast_graph, f_handle)
コード例 #18
0
ファイル: buildvariants.py プロジェクト: edraizen/HistoneDB
 def get_scores_for_curated_via_hmm(self):
     """
     Score every curated variant sequence against the combined HMMs.

     This is needed to supply the same type of information for curated as well as for automatic seqs.

     Steps:
       1. Write all seed alignment sequences, with "-" gap characters removed,
          into the single FASTA file ``self.curated_all_fasta``.
       2. Search that file with ``self.combined_hmm_file``; results go to
          ``self.curated_search_results_file``.
       3. Parse the results and call ``add_score`` with the highest-bitscore
          HSP of every hit.
     """
     # Build one big FASTA file from all curated (seed) sequences.
     with open(self.curated_all_fasta, "w") as f:
         for hist_type, seed in self.get_seeds():
             seed_aln_file = os.path.join(self.seed_directory, hist_type, seed)
             for s in SeqIO.parse(seed_aln_file, "fasta"):
                 # seeds are alignments; drop the gap characters before writing
                 s.seq = s.seq.ungap("-")
                 SeqIO.write(s, f, "fasta")
     # Search it with our HMMs
     self.search(hmms_db=self.combined_hmm_file, out=self.curated_search_results_file,sequences=self.curated_all_fasta)
     # Parse the results file; this is a snippet taken from load_hmmsearch.py
     # and tuned to work with the curated sequence header format.
     for variant_query in SearchIO.parse(self.curated_search_results_file, "hmmer3-text"):
         print "Loading hmmsearch for variant:", variant_query.id
         variant_model=Variant.objects.get(id=variant_query.id)
         for hit in variant_query:
             # the second "|"-separated field of the hit id is used as the Sequence id
             gi = hit.id.split("|")[1]
             seq = Sequence.objects.get(id=gi)
             # print hit
             # A hit can come without HSPs: "[No individual domains that satisfy
             # reporting thresholds (although complete target did)]"; max() then
             # fails and the hit is skipped.
             # NOTE(review): the blanket except also hides any error raised by
             # add_score - confirm that is intended.
             try:
                 best_hsp = max(hit, key=lambda hsp: hsp.bitscore)
                 add_score(seq, variant_model, best_hsp, seq.variant==variant_model)
             except:
                 pass
def extract_faa_seqs(HMM_TO_USE):
    """Write the best-scoring ORF per bin for one HMM to a ``.faa`` file.

    The HMMER tabular output for ``HMM_TO_USE`` is read from ``TEMP_FOLDER``.
    Every ORF in ``ORF_FILE`` whose id matches a hit is assigned to its bin
    through ``G2B_DICT``; when a bin has several hits, the one with the
    higher integer bit score is kept.  Nothing is written when the result
    has no hits.

    :param HMM_TO_USE: HMM file name; the part before its first "." names
        the ``.out`` input and the ``.faa`` output in ``TEMP_FOLDER``
    """
    stem = TEMP_FOLDER + "/" + HMM_TO_USE.rsplit(".")[0]
    HMM_OUTPUT_FILE = stem + ".out"
    HMM_OUTPUT_OBJECT = SearchIO.read(HMM_OUTPUT_FILE, 'hmmer3-tab')
    FAA_OUTPUT_FILE = stem + ".faa"
    if not HMM_OUTPUT_OBJECT:
        return

    # Index the hits once so each ORF costs a single lookup rather than a
    # scan over every hit.
    hits_by_id = {hit.id: hit for hit in HMM_OUTPUT_OBJECT}
    DICT_OF_HIT = dict()
    for seq_record in SeqIO.parse(ORF_FILE, "fasta"):
        hit = hits_by_id.get(seq_record.id)
        if hit is None:
            continue
        BIN_ID = G2B_DICT[hit.id]['binID']
        if BIN_ID in DICT_OF_HIT:
            print(BIN_ID + " has multiple hits for " + HMM_TO_USE)
            # keep the stored entry unless this hit scores strictly higher
            if int(hit.bitscore) <= int(DICT_OF_HIT[BIN_ID]['bitscore']):
                continue
        DICT_OF_HIT[BIN_ID] = {
            "sequence": seq_record.seq.rstrip('*'),
            "bitscore": hit.bitscore
        }

    with open(FAA_OUTPUT_FILE, 'w') as OPENED_FAA_OUTPUT:
        for bin_id in DICT_OF_HIT:
            OPENED_FAA_OUTPUT.write('>' + bin_id + '\n' +
                                    str(DICT_OF_HIT[bin_id]['sequence']) +
                                    '\n')
コード例 #20
0
def blast():
    """BLAST ``assembly.fasta`` against nr and log the top ten hits.

    The search is restricted to Herpesviridae.  The raw XML is saved to
    ``blast.xml`` and a header line plus one summary line per hit (at most
    ten) is appended to ``MiniProject.log``.
    """
    with open("assembly.fasta") as fasta_handle:
        fasta = fasta_handle.read()
    handle = NCBIWWW.qblast("blastn",
                            "nr",
                            fasta,
                            entrez_query='"Herpesviridae"[organism]'
                            )  #run blast against the assembled sequence
    with open("blast.xml", "w") as out_handle:
        out_handle.write(handle.read())
    blast_qresult = SearchIO.read("blast.xml", "blast-xml")
    # The context manager closes the log even if a hit lacks an attribute.
    with open('MiniProject.log', 'a') as output:
        output.write('seq_title ' + 'align_len ' + 'number_HSPs ' +
                     'topHSP_ident ' + 'topHSP_gaps ' + 'topHSP_bits ' +
                     'topHSP_expect \n')
        # slicing copes with results that hold fewer than ten hits
        for hit in blast_qresult.hits[:10]:
            top_hsp = hit[0]
            output.write(
                str(hit.description) + ' ' + str(hit.seq_len) + ' ' +
                str(len(hit.hsps)) + ' ' + str(top_hsp.ident_num) + ' ' +
                str(top_hsp.gap_num) + ' ' + str(top_hsp.bitscore) + ' ' +
                str(top_hsp.evalue) + '\n')
コード例 #21
0
def reciprocal_hmm_search(modelname, modelname_regex, filename, organism, rev_inc_bitscore_percentage, out_filename = None):
    ''' Performs reciprocal hmmer search against the proteome of given organism.

    phmmer is run with ``filename`` as the query against
    ``proteomes_dir + organism + ".fasta"``.  The search counts as found when
    the description of the top hit matches ``modelname_regex``.

    ``rev_inc_bitscore_percentage`` is accepted but not used at present.
    ``out_filename`` defaults to ``modelname + ".rechits_" + organism``.

    Returns True when the top hit matches, False otherwise (including when
    the table cannot be read or holds no hits).
    '''
    print("# Reciprocal search...")
    if out_filename is None:
        out_filename = modelname + ".rechits_" + organism
    reciprocal_search_command = "phmmer --noali --tblout " + out_filename + " " + filename + " " + proteomes_dir + organism + ".fasta > hmmer_res"
    os.system(reciprocal_search_command)
    try:
        hits = SearchIO.read(out_filename, "hmmer3-tab")
        # touch the top hit so an unusable result is treated as "no hits"
        hits[0].bitscore
    except Exception:
        # Exception rather than a bare except, so Ctrl-C and SystemExit
        # are no longer swallowed here.
        hits = []
    is_found = len(hits) > 0 and bool(re.search(modelname_regex, hits[0].description))
    if manual_mode:
        print("%s %s" % (filename, is_found))
        raw_input("Check reciprocal search results...")
    return is_found
コード例 #22
0
ファイル: msa_setup.py プロジェクト: kashmatic/BLCA
def get_hit_seq(fastafile, filename):
    """Fetch the flanked subject sequence of every BLAST HSP with blastdbcmd.

    For each query in the BLAST text report, the sequence of every HSP,
    extended by ``HIT_SEQUENCE_BPS`` on both sides, is appended to
    ``multi_<query id>.fasta``.  The gi of every hit is recorded under
    ``hits`` in the YAML data, which is written back at the end.

    :param fastafile: path handed to ``yaml_load_file``/``yaml_dump_file``
    :param filename: BLAST text report to parse
    """
    yamlfile = yaml_load_file(fastafile)
    for query in SearchIO.parse(filename, 'blast-text'):
        seqid = query.id.split("\n")[0]
        # The context manager closes the FASTA file even if blastdbcmd or
        # the id parsing fails part-way through a query.
        with open("multi_" + seqid + ".fasta", 'a') as fh:
            yamlfile[seqid]['hits'] = {}
            for hit in query.hits:
                gi = re.match(r"gi\|(.*)\|ref", hit.id).group(1)
                yamlfile[seqid]['hits'][gi] = {}
                for hsp in hit.hsps:
                    # blastdbcmd ranges are 1-based, so clamp at 1: a
                    # computed start of exactly 0 must not get through.
                    hitstart = max(1, hsp.hit_start + 1 - HIT_SEQUENCE_BPS)
                    hitend = hsp.hit_end + 1 + HIT_SEQUENCE_BPS
                    hitstrand = "plus" if (hsp.hit_strand == 1) else "minus"
                    out = os.popen(BLAST_BINARY + "/blastdbcmd -db " + BLAST_DATABASE + " -dbtype nucl -entry " + str(gi) + " -range " + str(hitstart) + "-" + str(hitend) + " -strand " + str(hitstrand)).read()
                    fh.write(out)
    yaml_dump_file(fastafile, yamlfile)
コード例 #23
0
def reverse_complement_hsp(hsp, query_length):
    """Return a new HSP built from the reverse-complemented fragments of hsp.

    The fragments are processed last to first and ``ident_pct`` is copied
    from the source HSP.
    """
    flipped = []
    for fragment in reversed(hsp.fragments):
        flipped.append(reverse_complement_hsp_fragment(fragment, query_length))
    result = SearchIO.HSP(fragments=flipped)
    result.ident_pct = hsp.ident_pct
    return result
コード例 #24
0
def blast_search(filename, blast_temp_path):
    '''
        Run (or reuse) a tblastn search and return the filtered HSPs.

        The XML result is cached as <blast_temp_path>/<filename>.xml; when it
        is missing, a query file containing the text of `filename` is written
        and tblastn is run on it.

        Returns a list of (accession, hit_start, hit_end) tuples for the HSPs
        with e-value < 1e-10, alignment span > 209 and
        ident_num / query length > 0.2.
    '''
    query_path = '%s/%s' % (blast_temp_path, filename)
    filepath = "%s/%s.xml" % (blast_temp_path, filename)
    if not os.path.exists(filepath):
        with open(query_path, 'w') as f:
            f.write(filename)
        search_cmd = NcbitblastnCommandline(query=query_path, db="/research/sequences/GenBank/blast/db/refseq_genomic", outfmt=5, out=filepath)
        subprocess.call(str(search_cmd), shell=True)

    result = SearchIO.read(filepath, 'blast-xml')

    iden_cutoff = 0.2
    # the three cut-offs are tested in this order for every HSP
    return [
        (hsp.hit_id.split('|')[-2], hsp.hit_start, hsp.hit_end)
        for hsp in result.hsps
        if hsp.evalue < 1e-10
        and hsp.aln_span > 209
        and float(hsp.ident_num)/float(result.seq_len) > iden_cutoff
    ]
コード例 #25
0
def construct_gene_scores_matrix(hmmtable):
    """
    Collect the hits of every gene in a hmmscan table.

    Arguments:
        hmmtable: pathlib.Path or str: Path to the hmmscan output written
            with hmmscan's --tblout option.
    Return:
        dic_genes_scores: dict: Maps each gene id to a list with one entry
            per hit, of the form
            [hit id (str), E-value, bit-score, bias], the last three as
            np.float32.
    """
    dic_genes_scores = {}
    for gene in SearchIO.parse(hmmtable, "hmmer3-tab"):
        rows = [
            [
                hit.id,
                np.float32(hit.evalue),
                np.float32(hit.bitscore),
                np.float32(hit.bias),
            ]
            for hit in gene.hits
        ]
        dic_genes_scores[gene.id] = rows
    return dic_genes_scores
コード例 #26
0
ファイル: vsearch.py プロジェクト: xapple/seqsearch
    def results(self):
        """
        Yield one biopython SearchIO entry per query found in the output file.

        The file at `self.out_path` is read as BLAST tabular output with the
        columns:

            qseqid sseqid pident length mismatch gapopen qstart qend sstart send
            evalue bitscore

        (see https://www.metagenomics.wiki/tools/blast/blastn-output-format-6)

        Beware:
        Some databases are not unique on the id, which makes the parser
        complain about duplicate entries and raise exceptions such as:

            ValueError: The ID or alternative IDs of Hit 'DQ448783' exists
            in this QueryResult.

        Warning: Unlike BLAST, VSEARCH does NOT report sequences that got no
                 hits at all, so the number of entries yielded will not
                 match the number of input sequences.
        """
        with open(self.out_path, 'rt') as handle:
            parser = SearchIO.parse(handle, 'blast-tab')
            for query_result in parser:
                yield query_result
コード例 #27
0
def blaster(fasta_file):
    """
    BLAST the input sequence and save the hits of the target species.

    The sequence is searched with blastn against nt, the raw XML is stored
    in my_blast.xml, and the first HSP of every hit whose description
    mentions a name from the module-level ``target_species`` is written to
    a FASTA file.

    :param fasta_file: path of the FASTA file holding the query
    :returns: name of the FASTA file that was written
    """
    # One name for writing, reporting and returning, so the caller always
    # receives the file that actually exists.
    output_name = "blast_results.fasta"

    with open(fasta_file) as fasta_handle:
        fasta_string = fasta_handle.read()
    print("BLAST initiated...")

    # qblast opens up the BLAST function in NCBI.
    result_handle = NCBIWWW.qblast("blastn", "nt", fasta_string)

    print("BLAST search done.")
    # Results need to go into an XML file.
    with open("my_blast.xml", "w") as out_handle:
        out_handle.write(result_handle.read())

    blast_result = SearchIO.read("my_blast.xml", "blast-xml")
    print("Writing BLAST results to file..")
    records = []
    for species in target_species:
        # Iterate through the blast result hits.
        for hit in blast_result:
            print(hit)
            if species in hit.description:
                # Target species found: keep the subject of the first HSP.
                records.append(hit[0].hit)

    SeqIO.write(records, output_name, "fasta")
    print("\nBLAST result file written to blast_results.fasta.")
    return output_name
コード例 #28
0
def get_unique_blastp_hits(infile, fasta):
    """Return the FASTA records of every distinct hit in a BLAST XML report.

    Args:
        infile: BLAST XML file (path or handle) parsed with SearchIO.
        fasta: FASTA file whose record ids are matched against the hit ids.

    Returns:
        A list with one record per distinct hit id; the order follows set
        iteration and is therefore not defined.

    Raises:
        KeyError: if a hit id has no record in ``fasta``.
    """
    hit_ids = {
        hsp.hit_id
        for query_result in SearchIO.parse(infile, 'blast-xml')
        for hsp in query_result.hsps
    }
    records_by_id = {}
    for record in SeqIO.parse(fasta, 'fasta'):
        records_by_id[record.id] = record
    return [records_by_id[hit_id] for hit_id in hit_ids]
コード例 #29
0
    def extract_top_hits(hmmer_hits, top_hits_file, top_hits_ids_file=None,
                         not_significant_ids_file=None, not_found_ids_file=None):
        """Write the first hit of every query of a HMMER3 text report.

        Queries are split into three groups: first hit flagged
        ``is_included`` (written to ``top_hits_file`` as a tab-separated
        line), first hit not included ("not significant") and no hits at
        all ("not found").

        Args:
            hmmer_hits: HMMER3 text output file to index.
            top_hits_file: path of the table to write.
            top_hits_ids_file: optional path for ids of queries with an
                included first hit.
            not_significant_ids_file: optional path for ids of queries whose
                first hit is not included.
            not_found_ids_file: optional path for ids of queries without hits.
        """
        top_hits_ids = IdList()
        not_significant_ids = IdList()
        not_found_ids = IdList()

        index_file = "hmmer_hits.tmp.idx"
        hmm_dict = SearchIO.index_db(index_file, hmmer_hits, "hmmer3-text")

        try:
            # The context manager guarantees the table is flushed and closed.
            with open(top_hits_file, "w") as out_fd:
                out_fd.write("#query\thit\tevalue\tbitscore\n")

                for query in hmm_dict:
                    # Fetch the record once instead of once per attribute.
                    query_result = hmm_dict[query]
                    if not query_result.hits:
                        not_found_ids.append(query)
                        continue
                    top_hit = query_result[0]
                    if top_hit.is_included:
                        out_fd.write("%s\t%s\t%s\t%s\n" % (query, top_hit.id, top_hit.evalue,
                                                           top_hit.bitscore))
                        top_hits_ids.append(query)
                    else:
                        not_significant_ids.append(query)
        finally:
            # Release the index before deleting its backing file.
            hmm_dict.close()
            os.remove(index_file)

        if not_significant_ids_file:
            not_significant_ids.write(not_significant_ids_file)

        if not_found_ids_file:
            not_found_ids.write(not_found_ids_file)

        if top_hits_ids_file:
            top_hits_ids.write(top_hits_ids_file)
コード例 #30
0
def multiPFAMsearch(inputList, cpus, tmpdir, output):
    """Run the PFAM searches in parallel and write the filtered annotations.

    ``safe_run`` is applied to ``inputList`` through
    ``lib.runMultiNoProgress``; every ``*.pfam.txt`` file found in ``tmpdir``
    is then merged into ``pfam.txt`` next to ``tmpdir`` and parsed as
    hmmsearch domain-table output. Hits whose first HSP covers less than half
    of the model length are dropped; the rest are written to
    ``pfam.filtered.txt`` (query, pfam, e-value, coverage) and to ``output``
    as ``db_xref`` lines.

    Args:
        inputList: items handed to ``safe_run``.
        cpus: value passed on to ``lib.runMultiNoProgress``.
        tmpdir: directory holding the per-input ``*.pfam.txt`` results.
        output: path of the annotation file to write.
    """
    parent_dir = os.path.dirname(tmpdir)
    pfam_results = os.path.join(parent_dir, 'pfam.txt')
    pfam_filtered = os.path.join(parent_dir, 'pfam.filtered.txt')
    lib.runMultiNoProgress(safe_run, inputList, cpus)

    # Collect the per-input result files; each carries its own header and
    # footer, which combineHmmerOutputs deals with.
    resultList = []
    for name in os.listdir(tmpdir):
        candidate = os.path.join(tmpdir, name)
        if os.path.isfile(candidate) and name.endswith('.pfam.txt'):
            resultList.append(candidate)
    combineHmmerOutputs(resultList, pfam_results)

    with open(output, 'w') as out, \
            open(pfam_filtered, 'w') as filtered, \
            open(pfam_results, 'r') as results:
        for qresult in SearchIO.parse(results, "hmmsearch3-domtab"):
            for hit in qresult.hits:
                hit_evalue = hit.evalue
                query = hit.id
                pfam = qresult.accession.split('.')[0]
                hmmLen = qresult.seq_len
                first_hsp_start = hit.hsps[0].hit_start
                hmm_aln = int(hit.hsps[0].hit_end) - int(first_hsp_start)
                coverage = hmm_aln / float(hmmLen)
                # Keep only hits covering at least 50% of the model.
                if coverage >= 0.50:
                    filtered.write("%s\t%s\t%s\t%f\n" %
                                   (query, pfam, hit_evalue, coverage))
                    out.write("%s\tdb_xref\tPFAM:%s\n" % (query, pfam))
コード例 #31
0
ファイル: run_me.py プロジェクト: minghao2016/AutoBlast
def analyze_BLAST_result(input_fasta_name_wo_path, result_handle):
    """Save a BLAST XML reply and print the non-"PREDICTED" hit descriptions.

    The reply is written to ``sample/output/retrieved_from_<name>.xml``
    (relative to the current directory), read back with SearchIO and every
    hit whose description does not contain ``"PREDICTED"`` is printed.

    Side effect: the working directory is changed to ``sample/output`` and
    is not restored.

    Args:
        input_fasta_name_wo_path: name of the query FASTA file; its last six
            characters (presumably the ``.fasta`` suffix) are dropped to
            build the output name.
        result_handle: readable handle holding the BLAST XML reply.
    """
    show_header("Step 2. Analyzing the BLAST result.")

    output_file_name = "retrieved_from_" + str(
        input_fasta_name_wo_path)[:-6] + ".xml"

    if not os.path.exists('sample/output'):
        os.makedirs('sample/output')

    current_dir = os.getcwd()
    output_folder = os.path.join(current_dir, "sample/output")

    os.chdir(output_folder)

    # Mode 'w': an existing file of the same name is overwritten.
    with open(output_file_name, "w") as output_file:
        output_file.write(result_handle.read())

    blast_qresult = SearchIO.read(output_file_name, "blast-xml")

    def filter_for_no_predicted_hypothetical(hit):
        # The previous `"PREDICTED" in hit.description == False` was a chained
        # comparison that always evaluated to False, discarding every hit.
        return "PREDICTED" not in hit.description

    filtered_qresult = blast_qresult.hit_filter(
        filter_for_no_predicted_hypothetical)
    for hit in filtered_qresult:
        print("%s" % (hit.description))
コード例 #32
0
ファイル: run_interprets.py プロジェクト: JCGonzS/mechnetor
def parse_blast(blast_pdb_file, max_E, min_pcid, max_pcid, hits):
    """Collect PDB chains hit by each query of a BLAST XML report.

    For every hit only the first HSP is inspected. When its e-value is at
    most ``max_E`` and its percent identity lies within
    ``[min_pcid, max_pcid]``, every ``pdb|XXXX|C`` tag found in the hit id
    plus description is stored as ``hits[query][pdb][chain]``.

    Args:
        blast_pdb_file: BLAST XML file, opened through ``open_file``.
        max_E: maximum accepted e-value.
        min_pcid: minimum accepted percent identity.
        max_pcid: maximum accepted percent identity.
        hits: nested mapping filled in place; presumably a nested
            defaultdict, since intermediate levels are never created here
            (TODO confirm against the caller).

    Returns:
        The same ``hits`` object.
    """
    # Raw string: "\|" and "\w" are invalid escapes in a normal literal.
    pdb_tag = re.compile(r"pdb\|\w\w\w\w\|\w")

    with open_file(blast_pdb_file) as f:
        for qresult in SearchIO.parse(f, 'blast-xml'):
            query = qresult.id
            for hit in qresult:
                s = hit.id + hit.description
                hsp = hit[0]  # Only the 1st one
                evalue = hsp.evalue
                pcid = float(hsp.ident_num) / hsp.aln_span * 100
                if not (evalue <= max_E and min_pcid <= pcid <= max_pcid):
                    # NOTE(review): the first rejected hit ends the scan for
                    # this query, including rejections on identity alone --
                    # confirm this is intended.
                    break
                for match in pdb_tag.findall(s):
                    pdb, chain = match.split("|")[1:]
                    # Coordinates are stored 1-based inclusive (SearchIO
                    # starts are 0-based, hence the +1).
                    hits[query][pdb][chain] = {
                        "ide": "{:2.1f}".format(pcid),
                        "e-val": evalue,
                        "q-start": str(hsp.query_start + 1),
                        "q-end": str(hsp.query_end),
                        "s-start": str(hsp.hit_start + 1),
                        "s-end": str(hsp.hit_end),
                    }
    return hits
コード例 #33
0
def run_blastp(target_blastp_database: str, query_sequence: str,
               opts: List[str] = None, results_file: str = None
               ) -> List[SearchIO._model.query.QueryResult]:
    """ Run blastp for one sequence against a database and parse the output
        with Bio.SearchIO.

        Arguments:
            target_blastp_database: the blastp database to compare to
            query_sequence: the sequence being compared, passed on stdin
            opts: a list of extra arguments to pass to blastp, or None
            results_file: a path to keep a copy of blastp results in, if provided

        Returns:
            a list of QueryResults as parsed from blast output by SearchIO

        Raises:
            ValueError: if the query sequence is empty
            RuntimeError: if blastp did not finish successfully
    """
    if not query_sequence:
        raise ValueError("Cannot run blastp on empty sequence")

    thread_count = str(get_config().cpus)
    command = [
        "blastp",
        "-num_threads", thread_count,
        "-db", target_blastp_database,
    ]
    if opts is not None:
        command += opts

    result = execute(command, stdin=query_sequence)
    if not result.successful():
        return_code = result.return_code
        flat_stderr = result.stderr.replace("\n", "")
        raise RuntimeError('blastp returned %d: %r while scanning %r' % (
                           return_code, flat_stderr, query_sequence[:100]))

    if results_file is not None:
        # Keep a verbatim copy of the raw blastp output.
        with open(results_file, 'w') as fh:
            fh.write(result.stdout)

    parsed = SearchIO.parse(StringIO(result.stdout), 'blast-text')
    return list(parsed)
コード例 #34
0
ファイル: peptblast.py プロジェクト: aurbn/peptblast
def process_blast_output(file, simple, argparser):
    """Yield formatted alignments selected from a BLAST XML report.

    This is a generator. Every item is a 3-tuple
    ``(first, second, length)`` in which exactly one of ``first`` /
    ``second`` is set and the other is ``None``.

    Args:
        file: BLAST XML input handed to ``SearchIO.parse``.
        simple: when true, select whole HSPs by span / identity; otherwise
            analyse the output of ``encode(simstr(hsp.aln))``.
        argparser: object providing the thresholds read here: ``cont`` and,
            in non-simple mode, ``leftmin``, ``rightmin``, ``summin``,
            ``gapmax`` and ``S``.
    """
    qresults = SearchIO.parse(file, 'blast-xml')
    if simple:
        for qresult in qresults:
            for hit in qresult:
                # First pass: HSPs longer than `cont`, or exactly `cont` long
                # with no gaps and all positions identical.
                for hsp in hit:
                    if ((hsp.aln_span == argparser.cont and (hsp.gap_num == 0) and
                             (hsp.aln_span == hsp.ident_num)) or (hsp.aln_span > argparser.cont)):
                        yield ([str(hsp), "\n\n"], None, hsp.aln_span)

                # Second pass: gapless, fully identical HSPs of at least
                # `cont` columns, reported in the second tuple slot.
                for hsp in hit:
                    if (hsp.aln_span >= argparser.cont and (hsp.gap_num == 0) and
                            (hsp.aln_span == hsp.ident_num)):
                        yield (None, [str(hsp), "\n\n"], hsp.aln_span)
    else:
        for qresult in qresults:
            for hit in qresult:
                # encode/simstr/thrids/format_alignment are defined elsewhere.
                # From their use here, encode() yields (value, count,
                # position) triples -- presumably a run-length encoding of a
                # match string; TODO confirm.
                for hsp in hit:
                    for v, c, p in encode(simstr(hsp.aln)):
                        if v == "1" and c >= argparser.cont:
                            yield (format_alignment(hsp, p, c), None, c)
                # Triples of consecutive runs: "1" run, "0" run, "1" run.
                for hsp in hit:
                    for t0, t1, t2 in thrids(encode(simstr(hsp.aln))):
                        if t0[0] == "1":
                            assert (t0[2] < t1[2] < t2[2])
                            assert t2[0] == "1"
                            assert t1[0] == "0"
                            # Both flanks and their sum must reach the
                            # configured minima and the middle run must not
                            # exceed gapmax.
                            if t0[1] >= argparser.leftmin and t2[1] >= argparser.rightmin and \
                               (t0[1] + t2[1]) >= argparser.summin and \
                                t1[1] <= argparser.gapmax:
                                # With argparser.S set, skip triples where a
                                # flank alone already reaches `cont`.
                                if not (argparser.S and
                                        (t0[1] >= argparser.cont or t2[1] >= argparser.cont)):
                                    yield (None, format_alignment(hsp, t0[2], t0[1], t2[2], t2[1]),
                                           t0[1]+t2[1]-t1[1])
コード例 #35
0
def get_hits_to_VPFs(hmmout_file):
    '''Count, per scaffold, the distinct genes reported in a HMMER3 table.

    Every hit id is expected to have the form ``scaffold|gene``.

    Input:
        - hmmout_file (str): path to HMMER3 hmmsearch out file in tab format

    Returns:
        - hits_to_VPFs (dict): scaffold ID -> number of distinct genes
          among the hits

    Raises:
        ValueError: if a hit id does not split into exactly two parts on '|'
    '''
    genes_by_scaffold = {}
    with open(hmmout_file, 'r') as handle:
        for qresult in SearchIO.parse(handle, 'hmmer3-tab'):
            for hit in qresult.hits:
                scaffold, gene = hit.id.split('|')
                genes_by_scaffold.setdefault(scaffold, set()).add(gene)

    return {scaffold: len(genes)
            for scaffold, genes in genes_by_scaffold.items()}
コード例 #36
0
def update_proteins(annotation_dir,
                    proteome,
                    seq_col_name,
                    tax_id,
                    identity=0.9,
                    cpus=multiprocessing.cpu_count(),
                    db_init=None):
    """Annotate stored proteins from blastp hits against a reference proteome.

    The first ``genus``-rank ancestor of ``tax_id`` selects the reference
    FASTA under ``/data/xomeq/tax/`` (downloaded through ``Uniprot`` when
    missing). ``proteome`` is searched against it with blastp, unless the
    result table ``<annotation_dir>/species_blast.tbl`` already exists. For
    every query whose first HSP of the first hit exceeds ``identity``, the
    matching ``Protein`` gets a description (when it has none) and the
    dbxrefs mapped to the hit's UniProt accession.

    Args:
        annotation_dir: working directory for the blast table.
        proteome: query protein FASTA.
        seq_col_name: organism / collection name used to select proteins.
        tax_id: NCBI taxon id of the organism.
        identity: threshold compared against ``ident_pct``.
        cpus: number of blastp threads.
        db_init: optional argument for ``PABase.sqldb.initialize``.

    Raises:
        ValueError: if no ancestor of rank ``genus`` is found.
    """
    # Parenthesised form works as a statement (py2) and as a call (py3).
    print(seq_col_name)
    if db_init:
        from SNDG.Sequence.ProteinAnnotator import PABase
        PABase.sqldb.initialize(db_init)
    mkdir(annotation_dir)
    out = annotation_dir + "/species_blast.tbl"

    taxon = Tax.select().where(Tax.ncbi_taxon_id == tax_id).get()
    species_tax = None
    # A separate loop variable keeps `taxon` intact while walking ancestors.
    for parent in Tax.parents(taxon):
        if parent.node_rank == "genus":
            species_tax = parent
            break
    if species_tax is None:
        # Fail with a clear message rather than an AttributeError on None.
        raise ValueError(
            "no genus-rank ancestor found for tax_id %s" % tax_id)
    tax_data = "/data/xomeq/tax/"
    species_fasta = tax_data + str(int(species_tax.ncbi_taxon_id)) + ".fasta"

    if not os.path.exists(out):

        if not os.path.exists(species_fasta):
            Uniprot.download_proteome_from_tax(str(species_tax.ncbi_taxon_id),
                                               tax_data)

        # NOTE(review): -qcov_hsp_perc looks like a percentage option; 0.9
        # may have been meant as 90 -- confirm before changing.
        cmd = "blastp -query %s  -db %s -evalue 0.00001 -outfmt 6  -max_hsps 1 -qcov_hsp_perc 0.9 -num_threads %i -out %s"
        execute(cmd % (proteome, species_fasta, cpus, out))
    species_desc = {
        x.id.split("|")[1]: " ".join(x.description.split()[1:])
        for x in bpio.parse(species_fasta, "fasta")
    }

    total = Protein.objects(organism=seq_col_name).count()
    with tqdm(bpsio.parse(out, "blast-tab"), total=total) as pbar:
        for query in pbar:
            pbar.set_description(query.id)
            top_hit = query[0]
            # NOTE(review): ident_pct is presumably on a 0-100 scale, which
            # would make the 0.9 default almost always true -- confirm.
            if top_hit[0].ident_pct > identity:

                unip = top_hit.id.split(
                    "|")[1] if "|" in top_hit.id else top_hit.id
                dbxrefs = [
                    x.db + "||" + x.value
                    for x in Mapping.select().where(Mapping.uniprot == unip)
                ]
                p = Protein.objects(gene=query.id,
                                    organism=seq_col_name).no_cache().get()

                if not p.description and unip in species_desc:
                    p.description = species_desc[unip].split(
                        "OS=")[0] + " | homology with: " + unip
                    p.save()

                if dbxrefs:
                    p = SearchLoader.update_protein_with_dbxref(
                        query.id, dbxrefs, seq_col_name)
                    p.save()
コード例 #37
0
def handle_input(filename):
    """Write the first hit of every query in ``filename``.

    The file is indexed with ``SearchIO.index_db`` using ``args.format``.
    Queries whose first hit is flagged ``is_included`` are written as
    tab-separated lines, either to stdout (``args.output == "stdout"``) or
    to ``<args.top_hits_dir><prefix>.top_hits`` (with a header line).

    Args:
        filename: search output file to process.

    Returns:
        Tuple ``(not_significant_ids, not_found_ids)``: ids of queries whose
        first hit is not included, and ids of queries without any hit.
    """
    sys.stdout.write("Handling %s\n" % filename)
    not_significant_ids = IdList()
    not_found_ids = IdList()

    prefix = FileRoutines.split_filename(filename)[1]
    index_file = "%s.tmp.idx" % prefix
    hmm_dict = SearchIO.index_db(index_file, filename, args.format)
    to_stdout = args.output == "stdout"
    out_fd = None
    try:
        if to_stdout:
            out_fd = sys.stdout
        else:
            out_fd = open("%s%s.top_hits" % (args.top_hits_dir, prefix), "w")
            out_fd.write("#query\thit\tevalue\tbitscore\n")

        for query in hmm_dict:
            # Fetch the record once instead of once per attribute.
            query_result = hmm_dict[query]
            if not query_result.hits:
                not_found_ids.append(query)
                continue
            top_hit = query_result[0]
            if top_hit.is_included:
                out_fd.write(
                    "%s\t%s\t%s\t%s\n" %
                    (query, top_hit.id, top_hit.evalue, top_hit.bitscore))
            else:
                not_significant_ids.append(query)
    finally:
        # Always release the output file and the temporary index, even when
        # parsing fails half-way; never close sys.stdout.
        if out_fd is not None and not to_stdout:
            out_fd.close()
        hmm_dict.close()
        os.remove(index_file)

    return not_significant_ids, not_found_ids
コード例 #38
0
def xml2fasta(infile=None, outfile=None):
    """Convert a single-query BLAST XML report to FASTA without duplicates.

    The hit sequence of every HSP is written once per hit id (first
    occurrence wins); duplicates in the BLAST output would otherwise break
    a later alignment with clustal.

    Args:
        infile: path of the BLAST XML file.
        outfile: path of the FASTA file to write.
    """
    print('\nConverting ' + infile +
          ' to fasta format, removing duplicates...')

    # Load the blast output file
    blast_qresult = SearchIO.read(infile, "blast-xml")

    # A set gives O(1) duplicate checks; the list keeps first-seen order.
    seen_ids = set()
    entries = []
    for hsp in blast_qresult.hsps:
        hit_id = str(hsp.hit.id)
        if hit_id not in seen_ids:
            seen_ids.add(hit_id)
            entries.append((hit_id, str(hsp.hit.seq)))

    # One ">" header line per hit id, followed by the sequence on its own line.
    with open(outfile, "w") as f:
        for hit_id, sequence in entries:
            f.write('> ' + hit_id + '\n')
            f.write(sequence + '\n')

    print('\tDone: writing to ' + outfile)
コード例 #39
0
def run_hmmpfam2(query_hmmfile: str, target_sequence: str, extra_args: List[str] = None
                 ) -> List[SearchIO._model.query.QueryResult]:  # pylint: disable=protected-access
    """ Run hmmpfam2 with the given HMM file, feeding the fasta on stdin

        Arguments:
            query_hmmfile: the HMM file to use
            target_sequence: a string in fasta format of the sequence to run
            extra_args: additional command line arguments for hmmpfam2, if any

        Returns:
            a list of results as parsed by SearchIO

        Raises:
            RuntimeError: if hmmpfam2 did not finish successfully
    """
    config = get_config()

    # Multithreading for HMMer2 is opt-in through the 'hmmer2' config section
    # TODO fix options for this
    use_threads = (config.get('hmmer2')
                   and 'multithreading' in config.hmmer2
                   and config.hmmer2.multithreading)

    command = ["hmmpfam2"]
    if use_threads:
        command += ["--cpu", str(config.cpus)]
    if extra_args:
        command += extra_args
    # '-' makes hmmpfam2 read the sequence from stdin
    command += [query_hmmfile, '-']

    result = execute(command, stdin=target_sequence)
    if result.successful():
        return list(SearchIO.parse(StringIO(result.stdout), 'hmmer2-text'))

    logging.debug('hmmpfam2 returned %d: %r while searching %r', result.return_code,
                  result.stderr, query_hmmfile)
    raise RuntimeError("hmmpfam2 problem while running %s: %s" % (command, result.stderr))
コード例 #40
0
ファイル: hmm.py プロジェクト: kemball/karkadann
def _call_hmmer(hmm, inputproteins):
	"""Run hmmsearch with one model and yield the matching hit sequences.

	This is a generator. The proteins are written to a temporary FASTA file,
	searched with ``<hmm_location>/<hmm>.hmm`` and the text report is parsed.
	For every HSP whose aligned hit sequence is longer than 100 characters,
	runs of five or more ``P`` as well as ``-`` and ``*`` characters are
	removed and the upper-cased remainder is yielded.

	Args:
		hmm: model name, without the ``.hmm`` extension.
		inputproteins: iterable of sequence records (consumed into a list).

	Yields:
		Tuples ``(hit.id, hsp.bitscore, cleaned_hit_sequence)``.
	"""
	def appropriate_hyphens(m):
		# Replace the matched run with an equally long run of hyphens.
		return '-' * len(m.group(0))

	inputproteins = list(inputproteins)
	scores = {}
	for ip in inputproteins:
		scores[ip.id] = 0

	with ntf(prefix="/dev/shm/") as inputfasta:
		with ntf(prefix="/dev/shm/") as hmmoutput:
			SeqIO.write(inputproteins, inputfasta.name, 'fasta')
			hmmfile = os.path.join(hmm_location, hmm + '.hmm')
			sp.call(['hmmsearch', '-o', hmmoutput.name, hmmfile, inputfasta.name])
			hmmoutput.flush()
			hmmoutput.seek(0)
			QRS = SearchIO.parse(hmmoutput, format="hmmer3-text")
			for qr in QRS:
				# there's *always* a QR, even though it's usually empty.
				for hit in qr:
					scores[hit.id] = max(scores[hit.id], hit.bitscore)
					for hsp in hit.hsps:
						if len(hsp.hit.seq) > 100:
							hitseq = re.sub('PPPPP+', appropriate_hyphens, str(hsp.hit.seq))
							# str.replace works on Python 2 and 3 alike;
							# str.translate(None, ...) is Python 2 only.
							hitseq = hitseq.replace('-', '').replace('*', '').upper()
							yield hit.id, hsp.bitscore, hitseq
コード例 #41
0
 def run_hmmsearch(self, name, hmm):
     """
     Run hmmsearch and return the highest scoring HSP.

     ``<cov_dir>/<hmm>-nt.hmm`` is searched against ``<cov_dir>/<name>.fa``;
     the report goes to a temporary file that is removed on return.

     Returns the HSP with the highest bitscore above 0, or None when there
     is none. Exits the program when hmmsearch fails.
     """
     # The context manager closes (and thereby deletes) the temporary file
     # on every exit path instead of leaving that to garbage collection.
     with tempfile.NamedTemporaryFile("w") as out:
         cmd = [
             "hmmsearch",
             "--noali",
             "-o",
             out.name,
             os.path.join(self.cov_dir, hmm + "-nt.hmm"),
             os.path.join(self.cov_dir, name + ".fa"),
         ]
         if self.verbose:
             print("Command: {0}".format(cmd))
         try:
             subprocess.run(cmd, check=True)
         except subprocess.CalledProcessError as exception:
             print("Error: {}".format(exception))
             sys.exit("Error running hmmsearch using {}".format(hmm))
         bestscore = 0
         besthit = None
         # Get HSP with highest score
         for qresult in SearchIO.parse(out.name, "hmmer3-text"):
             for hit in qresult:
                 for hsp in hit:
                     if hsp.bitscore > bestscore:
                         besthit = hsp
                         bestscore = hsp.bitscore
     return besthit
コード例 #42
0
def parse_n_fill_run_data_searchio(run_path, run_data, querydb):
    """Append one ``AlignmentData`` per HSP of a search output to ``run_data``.

    The run id and the SearchIO format are derived from ``run_path`` by
    ``get_run_id`` / ``get_run_format``.

    Args:
        run_path: path of the search output file.
        run_data: nested mapping ``run_data[query_id][hit_id]`` of lists,
            extended in place.
        querydb: optional mapping from query id to sequence, used for the
            coverage percentage; ``None`` reports coverage as ``'N/A'``.
    """
    run_id = get_run_id(run_path)
    run_format = get_run_format(run_path)

    for query in SearchIO.parse(run_path, run_format):
        for hit in query.hits:
            for hsp in hit.hsps:
                exons = [fragment.hit_range for fragment in hsp.fragments]

                if querydb is None:
                    coverage = 'N/A'
                else:
                    total_matched = sum(
                        fragment.query_span for fragment in hsp.fragments)
                    coverage = '{:.2f}%'.format(
                        100 * total_matched / len(querydb[query.id]))

                # Prefer the raw score, fall back to the bit score.
                score = 'N/A'
                for attribute in ('score', 'bitscore'):
                    if hasattr(hsp, attribute):
                        score = getattr(hsp, attribute)
                        break

                matched = 'N/A'
                if hasattr(hsp, 'ident_num') and hasattr(query, 'seq_len'):
                    matched = '{:.2f}%'.format(
                        100 * hsp.ident_num / query.seq_len)

                run_data[query.id][hit.id].append(
                    AlignmentData(run_id, score, matched, coverage,
                                  hsp.hit_range, exons))
コード例 #43
0
def exonerate_parser(exonerate_file):
    """
    Parse exonerate text output and return the aligned query positions as
    4-column bed records.

    bed4: [chrom, start, end, name], example ["seq1", 1, 55, "trnP"]
    :param exonerate_file: the exonerate text output itself (a string, it is
        wrapped in StringIO here), not a path
    :return: sorted list of bed4 records
        [query_id, query_start + 1, query_end, hit_id]
    """
    qresults = SearchIO.parse(StringIO(exonerate_file), format="exonerate-text")
    # SearchIO intervals are 0-based [start, end): start+1, end+0 gives
    # 1-based coordinates.
    bed4 = [
        [hsp.fragment.query_id,
         hsp.fragment.query_start + 1,
         hsp.fragment.query_end,
         hsp.fragment.hit_id]
        for qresult in qresults
        for hit in qresult
        for hsp in hit
    ]
    bed4.sort()
    return bed4
コード例 #44
0
ファイル: getdataset.py プロジェクト: wangdi2014/plasmidminer
def runHmmer(args, list_path, file_path, f):
    """Run prodigal and hmmsearch on one sequence file and record a hit.

    Predicted proteins and the hmmsearch domain table are written under
    ``<args.data>/tmp``. For every query result with at least one hit a
    line ``<basename>\\t<file_path>`` is written to ``f``.

    Args:
        args: object whose ``data`` attribute is the working directory.
        list_path: not used here.
        file_path: sequence file to process.
        f: writable handle receiving the result lines.
    """
    if not os.path.exists(str(args.data) + '/tmp'):
        os.makedirs(str(args.data) + '/tmp')
    # The sample group is the name of the directory holding the file.
    group = os.path.split(os.path.split(file_path)[0])[1]
    basename = os.path.splitext(str(ntpath.basename(str(file_path))))[0]
    exportpath = str(args.data) + '/tmp/' + ntpath.basename(str(file_path))
    hmmpath = str(args.data) + '/tmp/' + ntpath.basename(
        str(file_path)) + '.out'
    print('Processing %s of group %s' % (basename, group))
    # NOTE(review): both commands go through the shell with unquoted paths;
    # paths containing spaces or shell metacharacters will break them.
    s = ""
    cmd = ("prodigal -p meta -i ", str(file_path), " -a ", exportpath,
           ' -d /dev/null > /dev/null 2> /dev/null')
    os.system(s.join(cmd))
    # run hmmsearch on faa ORF files
    s = " "
    cmd = ("hmmsearch -E 0.001 --domtblout", hmmpath, 'resources/remove.hmm',
           exportpath, '> /dev/null 2> /dev/null')
    os.system(s.join(cmd))
    # Plain 'r': the 'U' flag was removed in Python 3.11 and text mode
    # already uses universal newlines on Python 3.
    with open(hmmpath, 'r') as domtab:
        try:
            # NOTE(review): the table comes from hmmsearch but is parsed as
            # 'hmmscan3-domtab'; only hit counts are used -- confirm intent.
            for qresult in SearchIO.parse(domtab, 'hmmscan3-domtab'):
                # write it to output file if there is a hit
                if len(qresult.hits) > 0:
                    f.write(''.join((basename, '\t', str(file_path), '\n')))
        except ValueError:
            print('parsing error on %s' % basename)
コード例 #45
0
 def get_scores_for_curated_via_hmm(self):
     """
     For every curated variant we want to generate a set of scores against HMMs.
     This is needed to supply the same type of information for curated as well as for automatic seqs.

     All seed sequences are written ungapped to ``self.curated_all_fasta``,
     searched against ``self.combined_hmm_file``, and the best-scoring HSP
     of every hit is stored through ``add_score``.
     """
     # Construct the one big file from all curated seqs.
     with open(self.curated_all_fasta, "w") as f:
         for hist_type, seed in self.get_seeds():
             seed_aln_file = os.path.join(self.seed_directory, hist_type,
                                          seed)
             for s in SeqIO.parse(seed_aln_file, "fasta"):
                 s.seq = s.seq.ungap("-")
                 SeqIO.write(s, f, "fasta")
     # Search it by our HMMs
     self.search(hmms_db=self.combined_hmm_file,
                 out=self.curated_search_results_file,
                 sequences=self.curated_all_fasta)
     # Parse the results file; adapted from load_hmmsearch.py for the
     # curated sequence header format.
     for variant_query in SearchIO.parse(self.curated_search_results_file,
                                         "hmmer3-text"):
         self.log.info("Loading hmmsearch for variant: {}".format(
             variant_query.id))
         variant_model = Variant.objects.get(id=variant_query.id)
         for hit in variant_query:
             accession = hit.id.split("|")[1]
             seq = Sequence.objects.get(id=accession)
             # Sometimes a hit has no usable domains:
             # [No individual domains that satisfy reporting thresholds (although complete target did)]
             # Scoring stays best-effort, but failures are now logged and
             # KeyboardInterrupt / SystemExit are no longer swallowed.
             try:
                 best_hsp = max(hit, key=lambda hsp: hsp.bitscore)
                 add_score(seq, variant_model, best_hsp,
                           seq.variant == variant_model)
             except Exception as error:
                 self.log.warning(
                     "Could not score hit {} for variant {}: {}".format(
                         hit.id, variant_query.id, error))
コード例 #46
0
ファイル: cogOffline4.py プロジェクト: bugds/COG
def blastSearch(query, speciesList, filename, blastDict):
    '''Run BLAST for one input file and merge the results into blastDict.

    The query and the species list are first turned into input files by
    createInputForBlast; those files and the XML result are removed again
    before returning.

    :param query: String with accession numbers divided by paragraphs
    :param speciesList: Species against which BLAST is performed
    :param filename: Name of original fasta file, used to name the XML file
    :param blastDict: Dictionary updated by writeInBlastDict
    :return: blastDict
    '''
    stem = os.path.splitext(filename)[0]
    xmlPath = ''.join([rootFolder, '/Blast_XML/', stem, '.xml'])

    query = createInputForBlast('.q', query, filename)
    taxidList = createInputForBlast('.t', speciesList, filename)

    # bashBlast reports whether the search produced any result.
    if bashBlast(query=query, out=xmlPath, taxidList=taxidList):
        writeInBlastDict(SearchIO.parse(xmlPath, 'blast-xml'), blastDict)

    for leftover in (query, taxidList, xmlPath):
        os.remove(leftover)

    return blastDict
コード例 #47
0
def parse_hmmscan_tab(infile, print_header=True):
    '''Parse hmmscan output in --tblout format.

    Generator yielding one tuple per query:
    (query id, id of the first hit, e-value of the first hit, certainty,
    number of hits). With ``print_header`` a tuple of column names is
    yielded first.

    The certainty compares the two first hits through their -ln(e-value);
    queries with a single hit get certainty 1.
    '''
    if print_header:
        yield "query","top hit","evalue","certainty","num sig hits"

    for rec in SearchIO.parse(infile, 'hmmer3-tab'):
        certainty = 1
        if len(rec) > 1:
            first, second = rec.hits[0].evalue, rec.hits[1].evalue
            # convert to -ln evalue; exact zeros are left untouched
            first = -np.log(first) if first != 0 else first
            second = -np.log(second) if second != 0 else second
            if first == 0:
                # this may be a hack (kept from the original author)
                certainty = 1 if second != 0 else 0
            else:
                # calculate certainty with info theoretic calc.
                total = first + second
                share1 = first / total
                share2 = second / total
                certainty = 1 + (share1 * np.log2(share1)) + (share2 * np.log2(share2))
        top_hit = rec.hits[0]
        yield rec.id, top_hit.id, rec.hits[0].evalue, certainty, len(rec)
コード例 #48
0
ファイル: nrpslib.py プロジェクト: dkmva/nrps-oligo-designer
def get_adenylation_domains(fasta, known=None, lagging_strand=False):
    """Locate AMP-binding (adenylation) domains in the sequences of a FASTA.

    Every nucleotide sequence is translated with ``get_pepseq``; its peptide
    is searched with hmmsearch against ``lib/AMP-binding.hmm`` and each
    reported HSP becomes an ``AdenylationDomain`` built from the matching
    nucleotide slice (peptide coordinates times three).

    Args:
        fasta: FASTA input handed to ``SeqIO.parse``.
        known: passed through to ``AdenylationDomain``.
        lagging_strand: takes part in deciding whether a sequence is
            reverse-complemented (see note below).

    Returns:
        List of ``(AdenylationDomain, hit_start, hit_end)`` tuples.
    """
    adenylation_domains = []

    fasta_seqs = []
    for fs in SeqIO.parse(fasta, 'fasta'):
        seq = str(fs.seq)
        pepseq, rf = get_pepseq(seq)
        # Explicit form of the original chained `rf < 0 == lagging_strand`,
        # which Python evaluates as `rf < 0 and 0 == lagging_strand`.
        # NOTE(review): possibly `(rf < 0) != lagging_strand` was meant.
        revcom = bool(rf < 0 and lagging_strand == 0)
        if revcom:
            seq = utils.reverse_complement(seq)
        fasta_seqs.append({'id': fs.id, 'seq': seq, 'pepseq': pepseq,
                           'rf': rf, 'revcom': revcom})

    for fs in fasta_seqs:
        # Use this record's own peptide and orientation; the loop used to
        # reuse the values left over from the last input sequence.
        utils.run_cmd([hmmsearch, '--domtblout', 'dump', os.path.abspath('lib/AMP-binding.hmm'), '-'],
                      '>header\n' + fs['pepseq'])
        with open('dump') as f:
            out = f.read()
        os.remove('dump')
        results = list(SearchIO.parse(StringIO(out), 'hmmsearch3-domtab'))

        for result in results:
            for i, hsp in enumerate(result.hsps, 1):
                s = hsp.hit_start
                e = hsp.hit_end

                domain = AdenylationDomain(fs['seq'][s*3:e*3], known,
                                           '{}_{}'.format(fs['id'], i),
                                           fs['revcom'])
                adenylation_domains.append((domain, s, e))

    return adenylation_domains
コード例 #49
0
ファイル: Exons_0.014.py プロジェクト: JMPflug/Exon_machine
def first_exonerate_parse(dir, newdir, prefix):
	"""Write the aligned query sequence of every exonerate fragment as FASTA.

	All files below ``<cwd><dir>`` (except names containing ``DS_Store``) are
	parsed as exonerate text output. Each fragment's query sequence is
	ungapped and written to ``<cwd><newdir>/<ortholog>/<prefix><n>_<root>.fasta``,
	where ``<ortholog>`` is the file name up to its first ``_``, ``<root>`` is
	the file name up to ``.fasta`` and ``<n>`` restarts at 1 for every HSP.

	Args:
		dir: input directory, appended to the current working directory.
		newdir: output directory, appended to the current working directory.
		prefix: prefix of the written record ids and file names.
	"""
	base = os.getcwd()
	for needed in (base + newdir, base + '/merged_exons/'):
		if not os.path.exists(needed):
			os.makedirs(needed)

	for file in slistdir(base + dir):
		if 'DS_Store' in file:
			continue
		root = file.split('.fasta')[0]
		orthosubdir = base + newdir + '/' + file.split("_")[0]
		for qresult in SearchIO.parse(base + dir + file, 'exonerate-text'):
			for hit in qresult:
				for hsp in hit:
					# Numbering restarts with every HSP.
					for hitcounter, fragment in enumerate(hsp, 1):
						hitseq = fragment.query
						if not os.path.exists(orthosubdir):
							os.makedirs(orthosubdir)
						ungapped = str(hitseq.seq.ungap("-"))
						label = prefix + str(hitcounter) + '_' + root
						record = SeqRecord(Seq(ungapped, generic_dna), id =  label, description = '')
						SeqIO.write(record, orthosubdir + '/' + label + '.fasta', "fasta")
コード例 #50
0
ファイル: blast.py プロジェクト: UtrechtUniversity/microbiome
def parse_results(path, file_name, FA_FILES_PATH, top_k=3, add_to_db=False):
    """Parse a BLAST XML result and return its first ``top_k`` hits.

    Hits are counted across all queries of the file, in file order. For
    each selected hit the first HSP supplies the score, e-value and ranges.

    Args:
        path: BLAST XML file to parse.
        file_name: name used in the progress message only.
        FA_FILES_PATH: passed through to ``add_to_database``.
        top_k: maximum number of hits to return.
        add_to_db: when exactly ``True``, the selected hits are also handed
            to ``add_to_database`` (once, and only if there are any).

    Returns:
        List of dicts with the keys ``rank``, ``id``, ``query_id``,
        ``full_name``, ``bitscore``, ``evalue``, ``query_range`` and
        ``hit_range``.
    """
    print(f"Parsing {file_name} at {path}")
    results = []
    for bresults in SearchIO.parse(path, 'blast-xml'):
        for hit in bresults:
            if len(results) >= top_k:
                break
            top_hsp = hit.hsps[0]
            results.append({
                "rank": len(results) + 1,
                "id": hit.id,
                "query_id": hit.query_id,
                "full_name": hit.description_all,
                "bitscore": top_hsp.bitscore,
                # Previously filled with the bitscore by mistake.
                "evalue": top_hsp.evalue,
                "query_range": top_hsp.query_range,
                "hit_range": top_hsp.hit_range,
            })
        if len(results) >= top_k:
            break

    # Store once, after collecting, so files with fewer than top_k hits are
    # handled too and the insert is not repeated for every later query.
    if add_to_db is True and results:
        add_to_database(results=results, FA_FILES_PATH=FA_FILES_PATH)
        print(f"Top {len(results)} results saved to database")

    # Always return the collected hits (None used to be returned when the
    # file held at most top_k hits or when add_to_db was set).
    return results
コード例 #51
0
 def retrieve_blast_data(self):
     """Print the ids of BLAST queries that occur in ``self.network_data``.

     Every file matching the glob pattern ``self.blast_data_path`` is parsed
     as commented BLAST tabular output.
     """
     for blast_file in glob.glob(self.blast_data_path):
         print(blast_file)
         # Parenthesised single-argument print is valid on Python 2 and 3.
         print(self.network_data)
         qresults = SearchIO.parse(blast_file, 'blast-tab', comments=True)
         for qresult in qresults:
             if qresult.id in self.network_data:
                 print(qresult.id)
コード例 #52
0
    def check_raw(self, filename, id, raw, **kwargs):
        """Index filename using **kwargs, check get_raw(id)==raw.

        The check runs against the in-memory index, against the SQLite
        backed index when ``sqlite3`` is available, and once more for
        ``filename + ".bgz"`` when that file exists.
        """
        idx = SearchIO.index(filename, self.fmt, **kwargs)
        raw = _as_bytes(raw)
        self.assertEqual(raw, idx.get_raw(id))
        idx.close()

        # Now again, but using SQLite backend
        if sqlite3:
            idx = SearchIO.index_db(":memory:", filename, self.fmt, **kwargs)
            self.assertEqual(raw, idx.get_raw(id))
            idx.close()

        if os.path.isfile(filename + ".bgz"):
            # Do the tests again with the BGZF compressed file
            # (function-call form of print, valid on Python 2 and 3).
            print("[BONUS %s.bgz]" % filename)
            self.check_raw(filename + ".bgz", id, raw, **kwargs)
コード例 #53
0
    def check_index(self, filename, format, **kwargs):
        """Check that indexed access returns the same records as parsing.

        The file is parsed sequentially, then indexed with ``SearchIO.index``
        and, when ``sqlite3`` can be imported, with ``SearchIO.index_db``.
        Record counts must agree and every indexed record must be a distinct
        object comparing equal to the parsed one. The check is repeated for
        ``filename + ".bgz"`` when that file exists.
        """
        # sqlite3 is optional in a Python installation
        try:
            import sqlite3
        except ImportError:
            sqlite3 = None
        have_sqlite = sqlite3 is not None

        if not filename.endswith(".bgz"):
            parsed = list(SearchIO.parse(filename, format, **kwargs))
        else:
            handle = gzip.open(filename)
            parsed = list(SearchIO.parse(handle, format, **kwargs))
            handle.close()
        expected = len(parsed)

        # record counts: in-memory index first, then the SQLite one
        indexed = SearchIO.index(filename, format, **kwargs)
        self.assertEqual(expected, len(indexed),
                         "Should be %i records in %s, index says %i"
                         % (expected, filename, len(indexed)))
        backends = [indexed]
        if have_sqlite:
            db_indexed = SearchIO.index_db(':memory:', [filename], format, **kwargs)
            self.assertEqual(expected, len(db_indexed),
                             "Should be %i records in %s, index_db says %i"
                             % (expected, filename, len(db_indexed)))
            backends.append(db_indexed)

        for qres in parsed:
            for backend in backends:
                fetched = backend[qres.id]
                # parsed and indexed qresult are different objects...
                self.assertNotEqual(id(qres), id(fetched))
                # ...but they should have the same attribute values
                self.assertTrue(compare_search_obj(qres, fetched))

        indexed.close()
        if have_sqlite:
            db_indexed.close()
            db_indexed._con.close()

        bgz_name = filename + ".bgz"
        if os.path.isfile(bgz_name):
            # Do the tests again with the BGZF compressed file
            print("[BONUS %s.bgz]" % filename)
            self.check_index(bgz_name, format, **kwargs)
コード例 #54
0
ファイル: getcontig.py プロジェクト: jangwen/python
def parse(target):
    """Collect the best alignment of every BLAST query into ``target``.

    Reads 'BlastResult.xml' (BLAST XML format) from the current working
    directory. The whole file is parsed up front; query results with no
    hits are skipped. For each remaining query result, a two-item list
    ``[query, hit]`` taken from ``record[0][0][0]`` (first hit, first HSP,
    first fragment in Biopython's SearchIO model) is appended to
    ``target``.

    Args:
        target: a mutable list that is extended in place.

    Returns:
        None; results are delivered through ``target``.
    """
    all_records = list(SearchIO.parse('BlastResult.xml', 'blast-xml'))
    for query_result in all_records:
        if len(query_result) > 0:
            best_hit = query_result[0]
            target.append([best_hit[0][0].query, best_hit[0][0].hit])
コード例 #55
0
def read_hmmer_file(f_path):
    """
    Parse a HMMER3 text output file with Biopython's SearchIO.

    The given path is first passed through ``_check_file`` and the value
    it returns is what gets handed to ``SearchIO.read`` using the
    "hmmer3-text" format. ``SearchIO.read`` expects the file to hold
    exactly one query and returns that single query result object
    (iterating over it yields its hits).
    """
    checked_path = _check_file(f_path)
    query_result = SearchIO.read(checked_path, "hmmer3-text")
    return query_result
コード例 #56
0
ファイル: pseudogene.py プロジェクト: jianzuoyi/orfam
def find_frameshift(sbjct_dict, query_dict, pseudo_hits,
                    temp_dir, out_frameshift):
    """Split pseudogene hits into frameshifted and non-frameshifted ones.

    For every hit, the query protein and the hit region (extended by a
    1000 bp flank via ``_get_hit_record``) are written to temporary FASTA
    files and aligned with ``exonerate -m protein2dna -n 1``. A hit is
    reported as frameshifted when the best exonerate HSP overlaps the
    original (un-flanked) hit region and spans more than one reading
    frame (``len(hsp.hit_frame_all) > 1``).

    Args:
        sbjct_dict: mapping from sequence name (``hit[0]``) to its record.
        query_dict: mapping from query id to its record.
        pseudo_hits: iterable of hit rows; ``hit[0]`` is the sequence
            name and ``hit[8]`` an attribute string whose first
            ``;``-separated field looks like ``key=<query id>``.
        temp_dir: existing directory used for the scratch files.
        out_frameshift: path/handle the frameshift records are written to
            in FASTA format.

    Returns:
        The list of hits for which no frameshift was detected.

    Raises:
        AssertionError: if ``temp_dir`` is not a directory.
        OSError: if the ``exonerate`` executable cannot be started.
    """
    assert(os.path.isdir(temp_dir))
    sys.stderr.write("finding frameshift mutations...\n")

    frameshifts = []
    non_frameshift = []
    exn_query = temp_dir + "/" + "exn_query.fa"
    exn_target = temp_dir + "/" + "exn_target.fa"
    align_file = temp_dir + "/" + "fshift_exonerate.exn"
    flank = 1000  # bases added on each side of the hit before aligning

    for hit in pseudo_hits:
        chrom = hit[0]
        record = sbjct_dict[chrom]
        qseqid = hit[8].split(";")[0].split("=")[1]
        SeqIO.write(query_dict[qseqid], exn_query, "fasta")
        flank_record = _get_hit_record(record, hit, flank)
        SeqIO.write(flank_record, exn_target, "fasta")

        # Run exonerate with an argument list (no shell), so paths that
        # contain spaces or shell metacharacters are passed through
        # verbatim; stdout is redirected into the alignment file.
        with open(align_file, "w") as align_handle:
            subprocess.call(
                ["exonerate", "-m", "protein2dna", "-n", "1",
                 "-q", exn_query, "-t", exn_target],
                stdout=align_handle)

        fshift = False
        hsp = None
        try:
            qresult = SearchIO.read(align_file, "exonerate-text")
            hsp = qresult[0][0]  # first hit, first hsp
            # Coordinates of the original hit inside the flanked record.
            new_hit_start = flank + 1
            new_hit_end = len(flank_record.seq) - flank
            overlaps_hit = (hsp.hit_start + 1 <= new_hit_end and
                            hsp.hit_end >= new_hit_start)
            # More than one reading frame in the HSP means a frameshift.
            if overlaps_hit and len(hsp.hit_frame_all) > 1:
                fshift = True
        except Exception:
            # Best effort: a missing or unparsable alignment simply means
            # "no frameshift found". Unlike a bare ``except``, this lets
            # KeyboardInterrupt and SystemExit propagate.
            fshift = False

        if fshift:
            fshift_seq = flank_record.seq[hsp.hit_start:hsp.hit_end]
            fshift_id = hit[0] + ":" + \
                str(hsp.hit_start + 1) + "-" + str(hsp.hit_end)
            frameshift_record = SeqRecord(
                fshift_seq, id=fshift_id, name=fshift_id,
                description="qseqid=" + qseqid)
            frameshifts.append(frameshift_record)
        else:
            non_frameshift.append(hit)
    # end for

    SeqIO.write(frameshifts, out_frameshift, "fasta")
    sys.stderr.write("done.\n")
    return non_frameshift
コード例 #57
0
 def generate_blast_data(self):
     """Build a BLAST graph for every filtered query result.

     Calls ``self.initialize_variables()``, then walks every file that
     matches the glob pattern ``self.blast_data_path``. Each file is
     parsed as commented BLAST tabular output, passed through
     ``self.apply_filtering``, and every query result that comes back has
     its id printed and is handed to ``self.generate_blast_graph``.
     """
     self.initialize_variables()
     matching_files = glob.glob(self.blast_data_path)
     for path in matching_files:
         # One parser per BLAST file; filtering decides what is kept.
         parsed = SearchIO.parse(path, 'blast-tab', comments=True)
         for kept in self.apply_filtering(parsed):
             print(kept.id)
             self.generate_blast_graph(kept)
コード例 #58
0
def getIndices(resultHandle):
    '''Return ``(hit_start, hit_end)`` of the first HSP of the first hit.

    ``resultHandle`` is read as BLAST tabular output holding a single
    query result. The selected HSP is printed before its hit coordinates
    are returned.
    '''
    query_result = SearchIO.read(resultHandle, 'blast-tab')
    best_hsp = query_result[0][0]

    print(best_hsp)

    return best_hsp.hit_start, best_hsp.hit_end
コード例 #59
0
def runBlastParserTAB(cline, blast_out_file, _unused=False):
    """Run a BLAST command line and open its tabular output for parsing.

    Args:
        cline: the BLAST command; ``str(cline)`` is executed through
            ``os.system``.
        blast_out_file: path of the tabular (non-commented) BLAST output
            that the command is expected to produce.
        _unused: placeholder for the original third positional parameter,
            which was literally named ``False`` (a syntax error on
            Python 3) and never read. Kept, with a default, so existing
            three-argument positional calls keep working.

    Returns:
        The iterator returned by ``SearchIO.parse``; records are parsed
        lazily as the caller consumes it.
    """
    startTime = datetime.now()
    os.system(str(cline))
    print('Running BLAST:' + str(datetime.now() - startTime))
    startTime = datetime.now()
    blast_records = SearchIO.parse(blast_out_file, 'blast-tab', comments=False)

    # NOTE: SearchIO.parse is lazy, so this only times creation of the
    # iterator, not the actual parsing work done later by the caller.
    print('Parsing Results:' + str(datetime.now() - startTime))

    return blast_records
コード例 #60
0
def process_tot():
    """Rank the HMMER hits of every protein and write them to ``output``.

    Command-line inputs:
        sys.argv[1]: proteome FASTA path; its basename without ".fasta"
            is used as the proteome id.
        sys.argv[2]: HMMER3 text output file to parse.
        sys.argv[3]: level name.

    The e-value cutoff is ``1.0 / <number of query results>``. For each
    protein one or more rows are written: a single rank-0 row (group id =
    protein id, "n/a" e-value and ranges) when it has no hit or its first
    hit is above the cutoff, otherwise one row per leading hit whose
    e-value is within the cutoff, ranked from 1. Each row is the plain
    concatenation of its fields followed by a newline.

    ``output`` is not defined in this function; it is expected to be an
    already-open writable file object at module level, and it is closed
    here once all rows are written.
    """
    print("beginning process_tot")
    # level name
    level = sys.argv[3] + " "
    # get the name of the proteome from the file name
    omeid = sys.argv[1].replace(".fasta", "").split('/')[-1]
    # SearchIO.parse returns a generator, which has no len(); materialize
    # it so the number of scans can be counted before iterating.
    results = list(SearchIO.parse(sys.argv[2], "hmmer3-text"))

    scans = len(results)
    # Avoid ZeroDivisionError on an empty result file; the cutoff is not
    # used in that case because there is nothing to iterate over.
    cutoff = 1.0 / scans if scans else 0.0
    print(scans)
    print(cutoff)

    for count, protein in enumerate(results):
        if count % 100 == 0:
            print(count)  # progress indicator
        pid = protein.id + " "
        processed = []
        if len(protein) == 0 or protein[0].evalue > cutoff:
            # No hits, or the first hit misses the threshold: the protein
            # forms its own group with rank 0.
            processed.append(
                ("0 ", level, pid, protein.id + " ", "n/a ", "n/a ", omeid))
        else:
            for rank, hit in enumerate(protein, 1):
                if hit.evalue > cutoff:
                    break
                # Group id is the first two dot-separated fields of the
                # hit id (or the whole id when it has fewer than two).
                OGid = ".".join(hit.id.split('.')[:2]) + " "
                e = str(hit.evalue) + " "
                # domain ranges on the query for this hit
                qr = [d.query_range for d in hit]
                processed.append(
                    (str(rank) + " ", level, pid, OGid, e,
                     str(qr).replace(" ", ""), " ", omeid))
        # Write to file
        for row in processed:
            output.write("".join(str(s) for s in row) + "\n")

    output.close()