def blastparse(stdout, output, tname, ntname):
    global recorddict, minLength
    handle = open(output, 'w')  # open the target fasta file for writing
    blast_handle = cStringIO.StringIO(stdout)  # Convert string to IO object for use in SearchIO using StringIO
    try:  # Necessary to avoid bad genomes
        for qresult in SearchIO.parse(blast_handle, 'blast-tab'):  # Parse the blast output string as if it were a file
            for hit in qresult:  # Hit object
                for hsp in hit:  # Hsp object
                    begin = hsp.query_range[0]  # Start of hsp
                    finish = hsp.query_range[1]  # End of hsp
                    if hsp.query_id in recorddict:
                        # For the contig name in the target fasta dictionary, mask using the hsp coordinates
                        # query_range is a 0-based half-open interval, so the mask spans finish - begin bases
                        if finish > begin:
                            recorddict[hsp.query_id].seq = \
                                recorddict[hsp.query_id].seq[:begin] + 'N' * (finish - begin) \
                                + recorddict[hsp.query_id].seq[finish:]
                        else:
                            recorddict[hsp.query_id].seq \
                                = recorddict[hsp.query_id].seq[:finish] + 'N' * (begin - finish) \
                                + recorddict[hsp.query_id].seq[begin:]
        recorddict_bak = deepcopy(recorddict)  # Copy the dictionary so we may iterate and modify the result
        for idline in recorddict_bak:
            # pattern = r'[^N]{'+ re.escape(str(minLength))+r'}' #  Find a sequence of at least the target length
            pattern = r'[ATCG]{100,}N{200,900}[ATCG]{100,}|[^N]{' + re.escape(str(minLength))+r'}'
            if re.search(pattern, str(recorddict[idline].seq)) is not None:  # search the whole sequence, not just its start
                SeqIO.write(recorddict[idline], handle, "fasta")
            else:
                # print 'Contig \'%s\' not written to file' % id
                recorddict.pop(idline)
    except ValueError:
        print 'Value Error: There was an error removing %s genome from %s' % (ntname, tname)
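
A minimal self-contained demonstration of the filtering pattern used above, assuming minLength = 500 for illustration: a contig is kept if it carries two >=100 bp non-N flanks around a 200-900 bp masked gap, or any run of at least minLength non-N bases.

import re

minLength = 500
pattern = r'[ATCG]{100,}N{200,900}[ATCG]{100,}|[^N]{' + re.escape(str(minLength)) + r'}'
kept = 'A' * 150 + 'N' * 300 + 'G' * 150       # two long flanks around a masked gap
dropped = 'A' * 90 + 'N' * 1000 + 'G' * 90     # flanks too short and no long non-N run
print(re.search(pattern, kept) is not None)    # True
print(re.search(pattern, dropped) is not None) # False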
    def parse_write_and_compare(self, source_file, source_format, out_file, out_format, **kwargs):
        """Compares parsed QueryResults after they have been written to a file."""
        source_qresults = list(SearchIO.parse(source_file, source_format, **kwargs))
        SearchIO.write(source_qresults, out_file, out_format, **kwargs)
        out_qresults = list(SearchIO.parse(out_file, out_format, **kwargs))
        for source, out in zip(source_qresults, out_qresults):
            self.assertTrue(compare_search_obj(source, out))
Example #3
def parseBlastOutFile(filename):
    if filename[-3:] == "xml":
        qResultGen = SearchIO.parse(filename, 'blast-xml')
    elif filename[-3:] == "txt":
        qResultGen = SearchIO.parse(filename, 'blast-tab')
    else:
        print("Unrecognized filetype.")
        assert False

    parsed = {qRes.id: qRes for qRes in qResultGen}
    print("Parsed " + filename)

    return parsed
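
A short usage sketch (the file name here is hypothetical): the returned dict maps each query ID to its QueryResult, so individual queries can be looked up directly.

qresults = parseBlastOutFile("results.xml")
for query_id, qresult in qresults.items():
    print(query_id, len(qresult.hits))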
Example #5
    def __init__(self, string):
        """
        Define the attributes of a ExonerateGene object.

        - contig_id:     ID of source contig.
        - locs:          Genomic location of called gene on contig.
        - gene_id:       ID of called gene, derived from contig_id and locs.
        - ref:           Reference homolog (i.e. seed gene for exonerate).
        - internal_stop: Internal stop codon present or not.
        - introns:       Number of introns in called gene.
        - called:        Called gene's translated protein sequence.

        All attributes above are derived ultimately from exonerate output.

        Note: locs are always given in "positive" sense, regardless of the
        gene's actual sense; this is consistent with Biopython's SearchIO.

        Note: In weird cases, exonerate-text returns a negative start
        co-ordinate for some (not all?) reverse complement genes. I put in
        a second parse using exonerate-vulgar to determine co-ordinates for
        genes.
        """
        contig_id = ""
        introns = 0
        called = []
        for result in SearchIO.parse(string, "exonerate-text"):
            ref = result.id
            stop = False
            for hit in result:
                contig_id = hit.id
                called = []
                introns = len(hit[0].hit_inter_ranges)
                for fragment in hit[0].fragments:
                    for record in fragment.aln._records:
                        if record.name == "aligned hit sequence":
                            called.append(str(record.seq))
                            if "*" in record.seq[:-1]:
                                stop = True
            self.ref = "Exonerate={0}".format(str(ref))
            self.contig_id = contig_id
            self.internal_stop = "IS={0}".format(str(stop))
            self.introns = "Introns={0}".format(str(introns))
            self.called = "".join(called)
        string.seek(0)
        for result in SearchIO.parse(string, "exonerate-vulgar"):
            for hit in result:
                locs = hit[0].hit_range
                gene_id = "{0}_{1}".format(hit.id, "_".join(str(loc) for
                                                            loc in locs))
                self.locs = locs
                self.id = gene_id
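
A minimal usage sketch (hypothetical file name): ExonerateGene expects an open handle to exonerate output containing both alignment text and vulgar lines, since it parses the same handle twice and rewinds it with seek(0).

with open("exonerate_output.txt") as handle:
    gene = ExonerateGene(handle)
    print(gene.id, gene.ref, gene.introns, gene.internal_stop)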
    def check_index(self, filename, format, **kwargs):
        # check if Python3 installation has sqlite3
        try:
            import sqlite3
        except ImportError:
            sqlite3 = None

        if filename.endswith(".bgz"):
            handle = gzip.open(filename)
            parsed = list(SearchIO.parse(handle, format, **kwargs))
            handle.close()
        else:
            parsed = list(SearchIO.parse(filename, format, **kwargs))
        # compare values by index
        indexed = SearchIO.index(filename, format, **kwargs)
        self.assertEqual(
            len(parsed), len(indexed),
            "Should be %i records in %s, index says %i" %
            (len(parsed), filename, len(indexed)))
        # compare values by index_db, only if sqlite3 is present
        if sqlite3 is not None:
            db_indexed = SearchIO.index_db(':memory:', [filename], format,
                                           **kwargs)
            self.assertEqual(
                len(parsed), len(db_indexed),
                "Should be %i records in %s, index_db says %i" %
                (len(parsed), filename, len(db_indexed)))

        for qres in parsed:
            idx_qres = indexed[qres.id]
            # parsed and indexed qresult are different objects!
            self.assertNotEqual(id(qres), id(idx_qres))
            # but they should have the same attribute values
            self.assertTrue(compare_search_obj(qres, idx_qres))
            # sqlite3 comparison, only if it's present
            if sqlite3 is not None:
                dbidx_qres = db_indexed[qres.id]
                self.assertNotEqual(id(qres), id(dbidx_qres))
                self.assertTrue(compare_search_obj(qres, dbidx_qres))

        indexed.close()
        if sqlite3 is not None:
            db_indexed.close()
            db_indexed._con.close()

        if os.path.isfile(filename + ".bgz"):
            # Do the tests again with the BGZF compressed file
            print("[BONUS %s.bgz]" % filename)
            self.check_index(filename + ".bgz", format, **kwargs)
Example #7
def hmmer_results_parser(hmm_filename, edges, len_th, evalue_th, K, sequences = []):
    sequences_hits = {}
    hmm_hits = {}
    hits_lst = []
    with open(hmm_filename) as handle:
        for record in SearchIO.parse(handle, 'hmmscan3-domtab'):
            hmm_name = record.id
            hmm_len = record.seq_len
            for h in record.hits:
                seq_name = h.id
                hit_evalue = h.evalue
                for f in h.fragments:
                    hit_len = f.hit_end - f.hit_start
                    seq_s, seq_e = f.query_start, f.query_end - 1
                    if hit_len > len_th*hmm_len and hit_evalue < evalue_th:
                        if len(sequences) > 0:
                            cur_seq = sequences[seq_name]
                        else:
                            cur_seq = merge(edges, K, seq_name)
                        hits_lst.append({"seq_name": seq_name,"hmm_name": hmm_name, "start": seq_s, "end": seq_e, \
                                         "seq": cur_seq.seq[seq_s:seq_e], "e-val": hit_evalue })
                        if seq_name not in sequences_hits:
                            sequences_hits[seq_name] = []
                        if hmm_name not in hmm_hits:
                            hmm_hits[hmm_name] = []
                        sequences_hits[seq_name].append({"hmm_name": hmm_name, "start": seq_s, "end": seq_e})
                        hmm_hits[hmm_name].append({"seq_name": seq_name, "start": seq_s, "end": seq_e, "len": len(cur_seq.seq), "hmm_len": hmm_len})
    return hits_lst, sequences_hits, hmm_hits
    def check_index(self, filename, format, **kwargs):
        # check if Python3 installation has sqlite3
        try:
            import sqlite3
        except ImportError:
            sqlite3 = None

        parsed = list(SearchIO.parse(filename, format, **kwargs))
        # compare values by index
        indexed = SearchIO.index(filename, format, **kwargs)
        self.assertEqual(len(parsed), len(indexed.keys()))
        # compare values by index_db, only if sqlite3 is present
        if sqlite3 is not None:
            db_indexed = SearchIO.index_db(':memory:', [filename], format, **kwargs)
            self.assertEqual(len(parsed), len(db_indexed.keys()))

        for qres in parsed:
            idx_qres = indexed[qres.id]
            # parsed and indexed qresult are different objects!
            self.assertNotEqual(id(qres), id(idx_qres))
            # but they should have the same attribute values
            self.assertTrue(compare_search_obj(qres, idx_qres))
            # sqlite3 comparison, only if it's present
            if sqlite3 is not None:
                dbidx_qres = db_indexed[qres.id]
                self.assertNotEqual(id(qres), id(dbidx_qres))
                self.assertTrue(compare_search_obj(qres, dbidx_qres))

        indexed._proxy._handle.close()  # TODO - Better solution
        if sqlite3 is not None:
            db_indexed.close()
            db_indexed._con.close()
Example #9
    def get_scores_for_curated_via_hmm(self):
        """
        For every curated variant we want to generate a set of scores against HMMs.
        This is needed to supply the same type of information for curated as well as for automatic seqs.
        """
        # Construct the one big file from all curated seqs.
        with open(self.curated_all_fasta, "w") as f:
            for hist_type, seed in self.get_seeds():
                seed_aln_file = os.path.join(self.seed_directory, hist_type, seed)
                for s in SeqIO.parse(seed_aln_file, "fasta"):
                    s.seq = s.seq.ungap("-")
                    SeqIO.write(s, f, "fasta")
        # Search it with our HMMs
        self.search(hmms_db=self.combined_hmm_file, out=self.curated_search_results_file, sequences=self.curated_all_fasta)
        # We need to parse this results file;
        # we take here a snippet from load_hmmsearch.py, and tune it to work for our curated seq header format
        for variant_query in SearchIO.parse(self.curated_search_results_file, "hmmer3-text"):
            print "Loading hmmsearch for variant:", variant_query.id
            variant_model = Variant.objects.get(id=variant_query.id)
            for hit in variant_query:
                gi = hit.id.split("|")[1]
                seq = Sequence.objects.get(id=gi)
                # print hit
                try:  # sometimes we get this: [No individual domains that satisfy reporting thresholds (although complete target did)]
                    best_hsp = max(hit, key=lambda hsp: hsp.bitscore)
                    add_score(seq, variant_model, best_hsp, seq.variant == variant_model)
                except Exception:
                    pass
Example #10
def get_blast_alignments(blast_path):
    records = SearchIO.parse(blast_path, 'blast-xml')
    hit_list = []
    results = []
    for idx, cur in enumerate(records):
        for hit in cur.hits:
            for i, hsp in enumerate(hit.hsps):
                if cur.id != hit.id:
                    qs = hsp.fragment.query_start
                    qe = hsp.fragment.query_end
                    he = hsp.fragment.hit_end
                    hs = hsp.fragment.hit_start
                    query_s = str(hsp.fragment.query.seq)
                    hit_s = str(hsp.fragment.hit.seq)
                    aln_s = hsp.aln_annotation['similarity']
                    score = hsp.bitscore
                    expect = hsp.evalue
                    toks = list(
                        map(str, [
                            cur.id, hit.id, i, qs, qe, hs, he, query_s, hit_s,
                            aln_s, score, expect
                        ]))
                    results.append(toks)
    columns = [
        'query_id', 'hit_id', 'fragment_num', 'query_start', 'query_end',
        'hit_start', 'hit_end', 'query_string', 'hit_string',
        'alignment_string', 'score', 'evalue'
    ]
    return pd.DataFrame(results, columns=columns)
Example #11
def parse_hmmer_output(results):
    """Parse hmmsearch output

    Args:
        results: file handle or name of the hmmsearch output file to parse
    Return:
        hit_info: list of Hit objects, with information
                 - query, subject, identity, coverage, e-value, bit score
    """
    hit_info = []
    for record in SearchIO.parse(results, 'hmmer3-text'):
        if not record.hits:
            continue
        for hit in record.hits:
            hit_class = Hit(
                query=record.accession,  # Pfam id
                subject=hit.id,  # Hit id
                identity=None,  # Not present
                coverage=None,  # Not present
                evalue=hit.evalue,  # E-value of hit
                bitscore=hit.bitscore,  # Bit score of hit
            )
            hit_info.append(hit_class)
    if not hit_info:
        LOG.error("No hits have been found")
    return hit_info
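
A short usage sketch (hypothetical file name); Hit here is the module's own result class populated above, not a Biopython type.

for h in parse_hmmer_output("hmmsearch_output.txt"):
    print(h.query, h.subject, h.evalue, h.bitscore)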
Example #12
def check_bl_out(in_fasta, in_xml):
    skip_blast = False
    if not os.path.isfile(in_xml):
        logging.info(in_xml + " does not exist")
        skip_blast = False
    elif os.stat(in_xml).st_size == 0:
        logging.info(in_xml + " is empty")
        os.remove(in_xml)
        skip_blast = False
    else:
        try:
            blast_ids = natsorted([
                qresult.id for qresult in SearchIO.parse(in_xml, 'blast-xml')
            ])
            fa_ids = natsorted(
                [seq.id for seq in SeqIO.parse(in_fasta, "fasta")])
            if blast_ids == fa_ids:
                skip_blast = True
            else:
                logging.info(
                    "Input and output sequence IDs do not match: " +
                    in_xml)
                os.remove(in_xml)
                skip_blast = False
        except:
            logging.info("Cannot read " + in_xml)
            os.remove(in_xml)
            skip_blast = False

    return skip_blast
Example #13
def runHMMsearch(input, basename, tmpdir, cpus, evalue, hmm):
    Results = {}
    #load proteins into dictionary
    protein_dict = SeqIO.to_dict(SeqIO.parse(input, 'fasta'))
    #do hmmer search of proteins
    HMM = os.path.join(tmpdir, basename + '.hmmsearch.txt')
    subprocess.call(
        ['hmmsearch', '-o', HMM, '--cpu',
         str(cpus), '-E', evalue, hmm, input],
        stdout=FNULL,
        stderr=FNULL)
    with open(HMM) as results:
        for qresult in SearchIO.parse(results, "hmmer3-text"):
            query_length = qresult.seq_len  #length of HMM model
            hits = qresult.hits
            num_hits = len(hits)
            if num_hits > 0:
                query = hits[0].id
                hit = hits[0].query_id
                score = hits[0].bitscore
                evalue = hits[0].evalue
                num_hsps = len(hits[0].hsps)
                aln_length = 0
                for x in range(0, num_hsps):
                    aln_length += hits[0].hsps[x].aln_span
                if hit not in Results:
                    Results[hit] = [query, score, evalue, aln_length, 'Hmmer3']
    for k, v in Results.items():
        description = basename + '|' + k + "|" + v[0] + "|evalue=" + str(
            v[2]) + "|HMMer3-Complete"
        Results[k].append(description)
        Seq = str(protein_dict[v[0]].seq)
        Results[k].append(Seq)
    return Results
Example #14
def get_hits_to_VPFs(hmmout_file):
    '''Takes a HMMER3 hmmsearch tab output file as an input and
    returns a dictionary mapping each scaffold with the number of unique genes that match a protein family

    Input:
        - hmmout_file (str): path to HMMER3 hmmsearch out file in tab format

    Returns:
        - hits_to_VPFs (dict): dictionary where keys are scaffold IDs and values are the number of unique genes that matched a protein family
    '''
    hits_to_VPFs = {}
    with open(hmmout_file, 'r') as input:
        for qresult in SearchIO.parse(input, 'hmmer3-tab'):
            hits = qresult.hits
            num_hits = len(hits)
            if num_hits > 0:
                for i in range(0, num_hits):
                    query_seq_id = hits[i].id
                    scaffold, gene = query_seq_id.split('|')
                    hits_to_VPFs[scaffold] = hits_to_VPFs.get(
                        scaffold, set([])).union([gene])

    for key, value in iter(hits_to_VPFs.items()):
        hits_to_VPFs[key] = len(value)
    return hits_to_VPFs
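
A short usage sketch (hypothetical file name): scaffolds ranked by the number of genes with a protein-family hit.

hits_to_VPFs = get_hits_to_VPFs("hmmsearch_tab.out")
for scaffold, n_genes in sorted(hits_to_VPFs.items(), key=lambda kv: -kv[1]):
    print(scaffold, n_genes)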
Example #15
def parse_blast(blast_pdb_file, max_E, min_pcid, max_pcid, hits):
    with open_file(blast_pdb_file) as f:
        for qresult in SearchIO.parse(f, 'blast-xml'):
            query = qresult.id  # .split("|")[1]
            for hit in qresult:
                s = hit.id + hit.description
                hsp = hit[0] # Only the 1st one
                evalue = hsp.evalue
                pcid = float(hsp.ident_num)/hsp.aln_span*100
                if (evalue <= max_E
                        and min_pcid <= pcid <= max_pcid):
                    # print "\t>HIT:",hit.id, set(re.findall("pdb\|\w\w\w\w\|\w", s))
                    # print "\t", hsp.evalue, "{:2.1f}".format(pcid)
                    # print hsp.query_start, hsp.query_end
                    # print hsp.hit_start+1, hsp.hit_end
                    for match in re.findall(r"pdb\|\w\w\w\w\|\w", s):
                        pdb, chain = match.split("|")[1:]
                        hits[query][pdb][chain]={
                                      "ide": "{:2.1f}".format(pcid),
                                      "e-val": evalue,
                                      "q-start": str(hsp.query_start+1),
                                      "q-end": str(hsp.query_end),
                                      "s-start": str(hsp.hit_start+1),
                                      "s-end": str(hsp.hit_end)
                                      }
                else:
                    break
    return hits
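
A usage sketch within this module's context (its open_file helper): hits must support nested assignment, e.g. a nested defaultdict; the file name and thresholds here are hypothetical.

from collections import defaultdict

hits = defaultdict(lambda: defaultdict(dict))
hits = parse_blast("blast_pdb.xml", max_E=1e-5, min_pcid=30.0, max_pcid=95.0, hits=hits)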
Example #16
    def quick_structurome(self, xml_blast_result, data_dir, entries, tmp_dir="/tmp/chain_PDBs",
                          pdb_divided="/data/databases/pdb/divided/", max_models=3):

        good_model = defaultdict(lambda: [])

        def identity(hsp):
            return 1.0 * hsp.ident_num / hsp.aln_span

        _log.info("searching good templates")
        for query in tqdm(bpsio.parse(xml_blast_result, "blast-xml")):
            for hit in query:
                if list(hit):
                    hsp = list(hit)[0]
                    if 0.6 <= identity(hsp) < 0.95:
                        good_model[hsp.query.id].append(hsp)

        tuplas = good_model.items()

        _log.info("creating models")
        with tqdm(tuplas) as pbar:
            for seq, hsps in pbar:
                try:
                    from SNDG.Structure.Modelome import Modelome
                    Modelome.model_hsps(seq, data_dir, hsps, entries=entries, tmp_dir=tmp_dir,
                                        pdb_divided=pdb_divided, max_models=max_models)
                except Exception as ex:
                    _log.exception(ex)
Example #17
    def load_hsp_dict(self, xml_blast_result):

        for query in bpsio.parse(xml_blast_result, "blast-xml"):
            for hit in query:
                if list(hit):
                    hsp = list(hit)[0]
                    self.hsp_dict[query.id][hsp.hit.id] = hsp
def construct_gene_scores_matrix(hmmtable):
    """
    Parse hmmscan tabular output to a dictionary.
    Arguments:
        hmmtable: pathlib.Path instance: Path to the hmmscan output, specified
            with hmmscan's --tblout option. Can also be str.
    Return:
        dic_genes_scores: dict: A dictionary with the gene ids as keys with
            a list of lists for all its hits. This is of the form
            { gene_id: [
                [hit id (str),
                 hit E-value (np.float32),
                 hit bit-score (np.float32),
                 hit bias (np.float32)],
                ...],
              ...}
    """
    dic_genes_scores = {}
    for gene in SearchIO.parse(hmmtable, "hmmer3-tab"):
        dic_genes_scores[gene.id] = []
        for hit in gene.hits:
            hit_info = [
                hit.id,
                np.float32(hit.evalue),
                np.float32(hit.bitscore),
                np.float32(hit.bias),
            ]
            dic_genes_scores[gene.id].append(hit_info)
    return dic_genes_scores
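
A short usage sketch (hypothetical file name): pick the best-scoring hit for each gene from the parsed table.

scores = construct_gene_scores_matrix("hmmscan_tblout.txt")
for gene_id, gene_hits in scores.items():
    best = max(gene_hits, key=lambda h: h[2])  # index 2 is the bit-score
    print(gene_id, best[0], best[2])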
Example #19
def features_via_hmm(seq, hmmdb, eval_thresh=1.0):
    """
    This function takes a Seq, runs hmmscan against a compressed hmmdb (prepared with hmmpress)
    and outputs a list of Biopython SeqFeature objects.
    # Strictly needs HMMER 3.0!!!!
    """
    features = list()
    ufn = str(uuid.uuid4())
    SeqIO.write(
        [SeqRecord(seq, id='QUERY', name='QUERY', description="QUERY")],
        ufn + '.fasta', 'fasta')
    subprocess.call([
        "hmmscan", "-o", ufn + ".out", "--tblout", ufn + ".tbl", "--domtblout",
        ufn + ".dtbl", hmmdb, ufn + '.fasta'
    ])
    #Now let's read it

    for v in SearchIO.parse(ufn + ".dtbl", "hmmscan3-domtab"):
        for hit in v:
            for h in hit.hsps:
                # print h
                if h.evalue < eval_thresh:
                    features.append(
                        SeqFeature(FeatureLocation(h.query_start, h.query_end),
                                   type="domain",
                                   qualifiers={
                                       'name': h.hit_id,
                                       'evalue': h.evalue
                                   }))

    os.system("rm %s %s %s %s" %
              (ufn + '.fasta', ufn + '.out', ufn + '.tbl', ufn + '.dtbl'))
    return features
def main(argv):
    args = parse_arguments(argv)

    infile = args.infile
    out = args.out_path
    outputs = []
    query_sequences = []
    count = 0

    with open(out, 'w+') as output:
        output.write(
            "%s\t%s\t%s\t%s\n" %
            ("Accession", "family", "query_name", "Resfams_description"))
        for qresult in SearchIO.parse(infile, "hmmer3-tab"):
            for hits in qresult:
                accession = hits.accession
                id = hits.id
                query = hits.query_id
                description = hits.description
                score = hits.bitscore

                array = [accession, id, query, description, str(score)]

                print("\t".join(array))
                output.write("\t".join(array) + "\n")

                if hits.query_id not in query_sequences:
                    query_sequences.append(hits.query_id)
                    count += 1
        print("Unique Seqs: " + str(count))
Example #21
    def get_scores_for_curated_via_hmm(self):
        """
        For every curated variant we want to generate a set of scores against HMMs.
        This is needed to supply the same type of information for curated as well as for automatic seqs.
        """
        # Construct the one big file from all curated seqs.
        with open(self.curated_all_fasta, "w") as f:
            for hist_type, seed in self.get_seeds():
                seed_aln_file = os.path.join(self.seed_directory, hist_type,
                                             seed)
                for s in SeqIO.parse(seed_aln_file, "fasta"):
                    s.seq = s.seq.ungap("-")
                    SeqIO.write(s, f, "fasta")
        # Search it with our HMMs
        self.search(hmms_db=self.combined_hmm_file,
                    out=self.curated_search_results_file,
                    sequences=self.curated_all_fasta)
        # We need to parse this results file;
        # we take here a snippet from load_hmmsearch.py, and tune it to work for our curated seq header format
        for variant_query in SearchIO.parse(self.curated_search_results_file,
                                            "hmmer3-text"):
            self.log.info("Loading hmmsearch for variant: {}".format(
                variant_query.id))
            variant_model = Variant.objects.get(id=variant_query.id)
            for hit in variant_query:
                accession = hit.id.split("|")[1]
                seq = Sequence.objects.get(id=accession)
                # print hit
                try:  # sometimes we get this: [No individual domains that satisfy reporting thresholds (although complete target did)]
                    best_hsp = max(hit, key=lambda hsp: hsp.bitscore)
                    add_score(seq, variant_model, best_hsp,
                              seq.variant == variant_model)
                except Exception:
                    pass
Example #22
def _call_hmmer(hmm, inputproteins):
	inputproteins = list(inputproteins)
	scores = {}
	for ip in inputproteins:
		scores[ip.id] = 0

	with ntf(prefix="/dev/shm/") as inputfasta:
		with ntf(prefix="/dev/shm/") as hmmoutput:
			SeqIO.write(inputproteins, inputfasta.name, 'fasta')
			hmmfile = os.path.join(hmm_location, hmm + '.hmm')
			sp.call(['hmmsearch', '-o', hmmoutput.name, hmmfile, inputfasta.name])
			hmmoutput.flush()
			hmmoutput.seek(0)
			QRS = SearchIO.parse(hmmoutput, format="hmmer3-text")
			for qr in QRS:
				# there's *always* a QR, even though it's usually empty.
				# qr.sort()
				# I'm kind of hoping this sorts by hit strength.
				# worth checking. I guess it doesn't matter anyway.

				for hit in qr:
					scores[hit.id] = max(scores[hit.id], hit.bitscore)
					for hsp in hit.hsps:
						def appropriate_hyphens(m):
							return '-' * len(m.group(0))

						if len(hsp.hit.seq) > 100:
							hitseq = re.sub('PPPPP+', appropriate_hyphens, str(hsp.hit.seq))
							# str.translate(None, '-*') is Python 2 only; delete '-' and '*' via maketrans
							hitseq = hitseq.translate(str.maketrans('', '', '-*')).upper()
							yield hit.id, hsp.bitscore, hitseq
Example #23
    def generate_blast_graph(self):
        evalue_filter = lambda hsp: hsp.evalue < self.evalue
        file_name = "{}/blast_graph.txt".format(self.blast_output_path)
        for blast_file in glob.glob(self.blast_data_path):
            print("working on " + blast_file)
            # Parse the Blast file
            qresults = SearchIO.parse(blast_file, 'blast-tab', comments=True)
            for qresult in qresults:
                write_line = ""
                write_line += qresult.id + ":"
                # Go to the Hit section of query
                for hit in qresult[:]:
                    if not self.blast_graph.has_node(qresult.id):
                        self.blast_graph.add_node(qresult.id)
                    # Check if Hit has min value
                    filtered_hit = hit.filter(evalue_filter)
                    if filtered_hit is not None:
                        if not self.blast_graph.has_node(filtered_hit.id):
                            self.blast_graph.add_node(filtered_hit.id)
                        # Add Edge between graph nodes
                        self.blast_graph.add_edge(qresult.id, filtered_hit.id)
                        write_line += filtered_hit.id + ","
                if write_line != "":
                    with open(file_name, "a") as f_handle:
                        f_handle.write(write_line + '\n')

        # Write GML files
        if self.generate_gml_files:
            file_name = "{}/blast_graph.gml".format(self.blast_output_path)
            with open(file_name, "a") as f_handle:
                nx.write_gml(self.blast_graph, f_handle)
def get_unique_blastp_hits(infile, fasta):
    hits = set()
    for aln in SearchIO.parse(infile, 'blast-xml'):
        for hsp in aln.hsps:
            hits.add(hsp.hit_id)
    seqs = {rec.id: rec for rec in SeqIO.parse(fasta, 'fasta')}
    return [seqs[hit] for hit in hits]
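
A short usage sketch (hypothetical paths): write the unique hit sequences from a blastp XML report to a new FASTA file.

from Bio import SeqIO

records = get_unique_blastp_hits("blastp.xml", "proteins.fasta")
SeqIO.write(records, "unique_hits.fasta", "fasta")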
Example #25
    def results(self):
        """
        Parse the results and yield biopython SearchIO entries.

        Beware:
        Some databases are not unique on the id, and this causes the parser to
        complain about duplicate entries and raise exceptions such as:

            ValueError: The ID or alternative IDs of Hit 'DQ448783' exists
            in this QueryResult.

        Summary of the columns:
        https://www.metagenomics.wiki/tools/blast/blastn-output-format-6

            qseqid sseqid pident length mismatch gapopen qstart qend sstart send
            evalue bitscore

        Warning: Unlike BLAST results, if a sequence got no hits it is NOT
                 reported at all in VSEARCH. The number of entries yielded
                 will not match the number of sequences at input.
        """
        with open(self.out_path, 'rt') as handle:
            for entry in SearchIO.parse(handle, 'blast-tab'):
                yield entry
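
A consumption sketch for the duplicate-ID warning above (the sample object is hypothetical): the ValueError surfaces mid-iteration, so catch it around the loop.

try:
    for qresult in sample.results():
        print(qresult.id, len(qresult.hits))
except ValueError as err:
    print("duplicate hit IDs in the search database:", err)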
Example #26
    def profile_search(database,
                       pssm_file,
                       search_result,
                       cpu=1,
                       evalue=0.00001):
        cmd = f"psiblast -db {database} -in_pssm {pssm_file} -num_threads {cpu} -evalue {evalue}  -outfmt 5 -out {search_result} 1>&2"
        execute(cmd)
        try:
            search_result = list(bpsio.parse(search_result, "blast-xml"))
        except ParseError:
            sys.stderr.write(
                f'PSIProfile: error parsing results from {search_result}')
            return None

        for query in search_result:
            for hit in list(query):
                for hsp in hit:
                    identity = 1.0 * hsp.ident_num / hsp.aln_span
                    data = [
                        hsp.query.id, hsp.query_start, hsp.query_end,
                        hsp.hit.id, hsp.hit_start, hsp.hit_end, hsp.evalue,
                        identity,
                        str(hsp.aln[0].seq),
                        str(hsp.aln[1].seq)
                    ]
                    yield {
                        f: data[i]
                        for i, f in enumerate(PsiProfile.search_result_fields)
                    }
Example #27
def blastSearch(query, speciesList, filename, blastDict):
    '''Run BLAST, save results of a search to a file and return its contents
    :param query: String with accession numbers divided by paragraphs
    :param speciesList: String with all species against which BLAST is performed
    :param filename: Name of original fasta file for saving results of BLAST
    :param blastDict: Dictionary of BLAST results, filled in and returned
    '''

    xmlPath = rootFolder \
        + '/Blast_XML/' \
        + os.path.splitext(filename)[0] \
        + '.xml'

    query = createInputForBlast('.q', query, filename)
    taxidList = createInputForBlast('.t', speciesList, filename)

    blastNotVoid = bashBlast(query=query, out=xmlPath, taxidList=taxidList)

    if blastNotVoid:
        blast = SearchIO.parse(xmlPath, 'blast-xml')
        writeInBlastDict(blast, blastDict)

    os.remove(query)
    os.remove(taxidList)
    os.remove(xmlPath)

    return blastDict
Example #28
def blast_partition(path, partition_dict):
    partitions = defaultdict(lambda: [], {"no_hit": [], "else": []})
    for query in bpsio.parse(path, "blast-xml"):
        hits = list(query)
        if not hits:
            partitions["no_hit"].append(query.id)
        for hit in hits:
            for hsp in hit:
                hsp.identity = identity(hsp)
                hsp.coverage = coverage(query, hsp)
                hsp.hit_coverage = hit_coverage(hit, hsp)

                added = False
                for k, fn_filter in partition_dict.items():
                    if fn_filter(query, hit, hsp):
                        partitions[k].append((
                            query,
                            hit,
                            hsp,
                        ))
                        added = True
                        break
                if not added:
                    partitions["else"].append((
                        query,
                        hit,
                        hsp,
                    ))

    return partitions
def parse_n_fill_run_data_searchio(run_path, run_data, querydb):
    run_id = get_run_id(run_path)
    run_format = get_run_format(run_path)
    for query in SearchIO.parse(run_path, run_format):
        for hit in query.hits:
            for hsp in hit.hsps:
                exons = [x.hit_range for x in hsp.fragments]
                coverage = 'N/A'

                if querydb is not None:
                    total_matched = sum(x.query_span for x in hsp.fragments)
                    coverage = '{:.2f}%'.format(100 * total_matched / len(querydb[query.id]))

                if hasattr(hsp, 'score'):
                    score = hsp.score
                elif hasattr(hsp, 'bitscore'):
                    score = hsp.bitscore
                else:
                    score = 'N/A'

                if hasattr(hsp, 'ident_num') and hasattr(query, 'seq_len'):
                    matched = '{:.2f}%'.format(100 * hsp.ident_num / query.seq_len)
                else:
                    matched = 'N/A'

                alignment = AlignmentData(run_id, score, matched, coverage, hsp.hit_range, exons)
                run_data[query.id][hit.id].append(alignment)
def run_hmmpfam2(query_hmmfile: str, target_sequence: str, extra_args: List[str] = None
                 ) -> List[SearchIO._model.query.QueryResult]:  # pylint: disable=protected-access
    """ Run hmmpfam2 over the provided HMM file and fasta input

        Arguments:
            query_hmmfile: the HMM file to use
            target_sequence: a string in fasta format of the sequence to run

        Returns:
            a list of results as parsed by SearchIO
    """
    config = get_config()
    command = ["hmmpfam2"]

    # Allow disabling multithreading for HMMer2 calls via the command line  # TODO: fix options for this
    if config.get('hmmer2') and 'multithreading' in config.hmmer2 and \
            config.hmmer2.multithreading:
        command.extend(["--cpu", str(config.cpus)])
    if extra_args:
        command.extend(extra_args)
    command.extend([query_hmmfile, '-'])

    result = execute(command, stdin=target_sequence)
    if not result.successful():
        logging.debug('hmmpfam2 returned %d: %r while searching %r', result.return_code,
                      result.stderr, query_hmmfile)
        raise RuntimeError("hmmpfam2 problem while running %s: %s" % (command, result.stderr))
    res_stream = StringIO(result.stdout)
    return list(SearchIO.parse(res_stream, 'hmmer2-text'))
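
A usage sketch under this module's assumptions (its execute helper and config); the HMM file name and query sequence are hypothetical.

fasta = ">query\nMAGWNSTAPTLLILLAHCTA"
for qresult in run_hmmpfam2("pfam_subset.hmm", fasta):
    for hit in qresult.hits:
        print(hit.id, hit.evalue)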
def run_blastp(target_blastp_database: str, query_sequence: str,
               opts: List[str] = None, results_file: str = None
               ) -> List[SearchIO._model.query.QueryResult]:
    """ Runs blastp over a single sequence against a database and returns the
        results as parsed by Bio.SearchIO.

        Arguments:
            target_blastp_database: the blastp database to compare to
            query_sequence: the sequence being compared
            opts: a list of extra arguments to pass to blastp, or None
            results_file: a path to keep a copy of blastp results in, if provided

        Returns:
            a list of QueryResults as parsed from blast output by SearchIO
    """
    if not query_sequence:
        raise ValueError("Cannot run blastp on empty sequence")

    config = get_config()
    command = ["blastp", "-num_threads", str(config.cpus), "-db", target_blastp_database]

    if opts is not None:
        command.extend(opts)

    result = execute(command, stdin=query_sequence)
    if not result.successful():
        raise RuntimeError('blastp returned %d: %r while scanning %r' % (
                           result.return_code, result.stderr.replace("\n", ""),
                           query_sequence[:100]))

    if results_file is not None:
        with open(results_file, 'w') as fh:
            fh.write(result.stdout)

    return list(SearchIO.parse(StringIO(result.stdout), 'blast-text'))
Example #32
def multiPFAMsearch(inputList, cpus, tmpdir, output):
    # run hmmerscan multithreaded by running at same time
    # input is a list of files, run multiprocessing on them
    pfam_results = os.path.join(os.path.dirname(tmpdir), 'pfam.txt')
    pfam_filtered = os.path.join(os.path.dirname(tmpdir), 'pfam.filtered.txt')
    lib.runMultiNoProgress(safe_run, inputList, cpus)

    # now grab results and combine, kind of tricky as there are header and footers for each
    resultList = [os.path.join(tmpdir, f) for f in os.listdir(
        tmpdir) if os.path.isfile(os.path.join(tmpdir, f)) and f.endswith('.pfam.txt')]
    combineHmmerOutputs(resultList, pfam_results)

    # now parse results
    with open(output, 'w') as out:
        with open(pfam_filtered, 'w') as filtered:
            with open(pfam_results, 'r') as results:
                for qresult in SearchIO.parse(results, "hmmsearch3-domtab"):
                    hits = qresult.hits
                    num_hits = len(hits)
                    if num_hits > 0:
                        for i in range(0, num_hits):
                            hit_evalue = hits[i].evalue
                            query = hits[i].id
                            pfam = qresult.accession.split('.')[0]
                            hmmLen = qresult.seq_len
                            hmm_aln = int(hits[i].hsps[0].hit_end) - \
                                int(hits[i].hsps[0].hit_start)
                            coverage = hmm_aln / float(hmmLen)
                            if coverage < 0.50:  # coverage needs to be at least 50%
                                continue
                            filtered.write("%s\t%s\t%s\t%f\n" %
                                           (query, pfam, hit_evalue, coverage))
                            out.write("%s\tdb_xref\tPFAM:%s\n" % (query, pfam))
Example #33
def get_adenylation_domains(fasta, known=None, lagging_strand=False):
    adenylation_domains = []

    fasta_seqs = []
    for fs in SeqIO.parse(fasta, 'fasta'):
        revcom=False
        seq = str(fs.seq)
        pepseq, rf = get_pepseq(seq)
        if rf < 0 == lagging_strand:
            revcom=True
            seq = utils.reverse_complement(seq)
        fasta_seqs.append({'id': fs.id, 'seq': seq, 'pepseq': pepseq, 'rf': rf, 'revcom': revcom})
    for fs in fasta_seqs:
        # use this record's own peptide sequence, not the leftover from the loop above
        utils.run_cmd([hmmsearch, '--domtblout', 'dump', os.path.abspath('lib/AMP-binding.hmm'), '-'],
                      '>header\n' + fs['pepseq'])
        with open('dump') as f:
            out = f.read()
        res_stream = StringIO(out)
        os.remove('dump')
        results = list(SearchIO.parse(res_stream, 'hmmsearch3-domtab'))

        for result in results:
            for i, hsp in enumerate(result.hsps, 1):
                s = hsp.hit_start
                e = hsp.hit_end

                adenylation_domains.append((AdenylationDomain(fs['seq'][s*3:e*3], known, '{}_{}'.format(fs['id'], i), fs['revcom']), s, e))

    return adenylation_domains
def important_pfam(seqs_from_pdb_hmm):
    for query in tqdm(bpsio.parse(seqs_from_pdb_hmm, 'hmmer3-text')):
        try:
            pdb, chain, start, end = query.id.split("_")  # @UnusedVariable
            if ExperimentalStructure.objects(name=pdb,residue_sets__name="important_pfam").count():
                continue

            strdoc = ExperimentalStructure.objects(name=pdb).get()

            if not strdoc.residue_set("important_pfam"):
                important_rs = ResidueSet(name="important_pfam")
                domain_rs = None
                for hit in query:
                    if len(hit):
                        hsp = hit[0]
                        domain_rs = ResidueSet(name=hit.id)
                        i = 0
                        for x in str(hsp.aln[1].seq):
                            residue = chain + "_" + str(i + int(start))
                            if x == x.upper():
                                important_rs.residues.append(residue)
                            i = i + 1
                            domain_rs.residues.append(residue)
                        if domain_rs:
                            strdoc.residue_sets.append(domain_rs)
                strdoc.residue_sets.append(important_rs)
                strdoc.save()
        except DoesNotExist:
            pass
Example #35
def parse_results(path, file_name, FA_FILES_PATH, top_k=3, add_to_db=False):
    """Parses the result of a BLAST query

    Returns the top k matches and adds them to the database.
    """

    print(f"Parsing {file_name} at {path}")
    i = 0
    results = list()
    for bresults in SearchIO.parse(path, 'blast-xml'):
        for r in bresults:
            i += 1
            # Select only top k
            if i <= top_k:
                results.append({
                    "rank": i,
                    "id": r.id,
                    "query_id": r.query_id,
                    "full_name": r.description_all,
                    "bitscore": r.hsps[0].bitscore,
                    "evalue": r.hsps[0].bitscore,
                    "query_range": r.hsps[0].query_range,
                    "hit_range": r.hsps[0].hit_range,
                })
            elif i > top_k and add_to_db is True:
                print("Top 3 results saved to database")
                add_to_database(results=results, FA_FILES_PATH=FA_FILES_PATH)
                break
            else:
                return (results)
Example #36
def update_proteins(annotation_dir,
                    proteome,
                    seq_col_name,
                    tax_id,
                    identity=0.9,
                    cpus=multiprocessing.cpu_count(),
                    db_init=None):

    # if db_init:
    #     from SNDG.Sequence.ProteinAnnotator import PABase
    #     PABase.sqldb.initialize(db_init)
    # mkdir(annotation_dir)
    # out = annotation_dir + "/species_blast.tbl"
    #
    # tax = Tax.select().where(Tax.ncbi_taxon_id == tax_id).get()
    # species_tax = None
    # for tax in Tax.parents(tax):
    #     if tax.node_rank == "genus":
    #         species_tax = tax
    #         break
    # tax_data = "/data/xomeq/tax/"
    # species_fasta = tax_data + str(int(species_tax.ncbi_taxon_id)) + ".fasta"

    if not os.path.exists(out):

        if not os.path.exists(species_fasta):
            Uniprot.download_proteome_from_tax(str(species_tax.ncbi_taxon_id),
                                               tax_data)

        cmd = "blastp -query %s  -db %s -evalue 0.00001 -outfmt 6  -max_hsps 1 -qcov_hsp_perc 0.9 -num_threads %i -out %s"
        execute(cmd % (proteome, species_fasta, cpus, out))
    species_desc = {
        x.id.split("|")[1]: " ".join(x.description.split()[1:])
        for x in bpio.parse(species_fasta, "fasta")
    }

    total = Protein.objects(organism=seq_col_name).count()
    with tqdm(bpsio.parse(out, "blast-tab"), total=total) as pbar:
        for query in pbar:
            pbar.set_description(query.id)
            if query[0][0].ident_pct > identity:

                unip = query[0].id.split(
                    "|")[1] if "|" in query[0].id else query[0].id
                dbxrefs = [
                    x.db + "||" + x.value
                    for x in Mapping.select().where(Mapping.uniprot == unip)
                ]
                p = Protein.objects(gene=query.id,
                                    organism=seq_col_name).no_cache().get()

                if not p.description and unip in species_desc:
                    p.description = species_desc[unip].split(
                        "OS=")[0] + " | homology with: " + unip
                    p.save()

                if dbxrefs:
                    p = SearchLoader.update_protein_with_dbxref(
                        query.id, dbxrefs, seq_col_name)
                    p.save()
Example #37
def exonerate_parser(exonerate_file):
    """
    parser the exonerate result, and return the position of the feather in 4-col bed format
    4 col bed4: [chro, start,end, name], example ["seq1", 1, 55, "trnP"]
    :param query:
    :param exonerate_file:
    :param prefix:
    :return: list of bed4
    """
    #fw=open(tbl_outname, "w") # change IO to list store
    bed4=[]

    texts=SearchIO.parse(StringIO(exonerate_file), format="exonerate-text")
    for record in texts:
        for hsp in record:
            for s in hsp:
                # the biopython.SearchIO interval is 0 based [start, end), so start+1, end+0 to get 1 based coords
                table_4=[s.fragment.query_id, s.fragment.query_start+1, s.fragment.query_end,s.fragment.hit_id]
                bed4.append(table_4)

                #fw.write("\t".join(table_4))
                #fw.write("\n")
    bed4.sort()
    #fw.close()
    return bed4
    def run_hmmsearch(self, name, hmm):
        """
        Run hmmsearch and return the highest scoring hit
        """
        out = tempfile.NamedTemporaryFile("w")
        cmd = [
            "hmmsearch",
            "--noali",
            "-o",
            out.name,
            os.path.join(self.cov_dir, hmm + "-nt.hmm"),
            os.path.join(self.cov_dir, name + ".fa"),
        ]
        if self.verbose:
            print("Command: {0}".format(cmd))
        try:
            subprocess.run(cmd, check=True)
        except subprocess.CalledProcessError as exception:
            print("Error: {}".format(exception))
            sys.exit("Error running hmmsearch using {}".format(hmm))
        bestscore = 0
        besthit = None
        # Get HSP with highest score
        for qresult in SearchIO.parse(out.name, "hmmer3-text"):
            for hit in qresult:
                for hsp in hit:
                    if hsp.bitscore > bestscore:
                        besthit = hsp
                        bestscore = hsp.bitscore
        return besthit
Example #39
def process_blast_output(file, simple, argparser):
    qresults = SearchIO.parse(file, 'blast-xml')
    if simple:
        for qresult in qresults:
            for hit in qresult:
                for hsp in hit:
                    if ((hsp.aln_span == argparser.cont and (hsp.gap_num == 0) and
                             (hsp.aln_span == hsp.ident_num)) or (hsp.aln_span > argparser.cont)):
                        yield ([str(hsp), "\n\n"], None, hsp.aln_span)

                for hsp in hit:
                    if (hsp.aln_span >= argparser.cont and (hsp.gap_num == 0) and
                            (hsp.aln_span == hsp.ident_num)):
                        yield (None, [str(hsp), "\n\n"], hsp.aln_span)
    else:
        for qresult in qresults:
            for hit in qresult:
                for hsp in hit:
                    for v, c, p in encode(simstr(hsp.aln)):
                        if v == "1" and c >= argparser.cont:
                            yield (format_alignment(hsp, p, c), None, c)
                for hsp in hit:
                    for t0, t1, t2 in thrids(encode(simstr(hsp.aln))):
                        if t0[0] == "1":
                            assert (t0[2] < t1[2] < t2[2])
                            assert t2[0] == "1"
                            assert t1[0] == "0"
                            if t0[1] >= argparser.leftmin and t2[1] >= argparser.rightmin and \
                               (t0[1] + t2[1]) >= argparser.summin and \
                                t1[1] <= argparser.gapmax:
                                if not (argparser.S and
                                        (t0[1] >= argparser.cont or t2[1] >= argparser.cont)):
                                    yield (None, format_alignment(hsp, t0[2], t0[1], t2[2], t2[1]),
                                           t0[1]+t2[1]-t1[1])
def parse_hmmscan_tab(infile, print_header=True):
    '''Parse hmmscan output in --tblout format'''
    if print_header:
        yield "query","top hit","evalue","certainty","num sig hits"
    records = SearchIO.parse(infile,'hmmer3-tab')
    for rec in records:
        query = rec.id
        if len(rec) > 1:
            hit1,hit2 = rec.hits[0],rec.hits[1]
            eval1,eval2 = hit1.evalue,hit2.evalue
            if eval1 != 0: # convert to -ln evalue
                eval1 = -np.log(eval1)
            if eval2 != 0:
                eval2 = -np.log(eval2)
            if eval1 == 0 and eval2 != 0: # this may be a hack, I don't care
                certainty = 1
            elif eval1 == 0 and eval2 == 0:
                certainty = 0
            else: # calculate certainty with info theoretic calc.
                total = eval1 + eval2
                p1,p2 = eval1/total, eval2/total
                certainty = 1 + (p1 * np.log2(p1)) + (p2 * np.log2(p2))
        else:
            certainty = 1
        yield query, rec.hits[0].id, rec.hits[0].evalue, certainty, len(rec)
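
A short consumption sketch (hypothetical --tblout path): since the generator yields a header row first, its output can be streamed straight into a TSV writer.

import csv
import sys

writer = csv.writer(sys.stdout, delimiter="\t")
for row in parse_hmmscan_tab("hmmscan_tblout.txt"):
    writer.writerow(row)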
Example #41
def runHmmer(args, list_path, file_path, f):
    """run prodigal and hmmsearch on chr files"""
    if not os.path.exists(str(args.data) + '/tmp'):
        os.makedirs(str(args.data) + '/tmp')
    # get the sample group
    head, group = os.path.split(os.path.split(file_path)[0])
    basename = os.path.splitext(str(ntpath.basename(str(file_path))))[0]
    exportpath = str(args.data) + '/tmp/' + ntpath.basename(str(file_path))
    hmmpath = str(args.data) + '/tmp/' + ntpath.basename(
        str(file_path)) + '.out'
    print('Processing %s of group %s' % (basename, group))
    s = ""
    cmd = ("prodigal -p meta -i ", str(file_path), " -a ", exportpath,
           ' -d /dev/null > /dev/null 2> /dev/null')
    os.system(s.join(cmd))
    # run hmmsearch on faa ORF files
    s = " "
    cmd = ("hmmsearch -E 0.001 --domtblout", hmmpath, 'resources/remove.hmm',
           exportpath, '> /dev/null 2> /dev/null')
    os.system(s.join(cmd))
    # write it to output file if there is a hit
    with open(hmmpath) as input:
        try:
            for qresult in SearchIO.parse(input, 'hmmscan3-domtab'):
                query_id = qresult.id
                hits = qresult.hits
                num_hits = len(hits)
                acc = qresult.accession
                if num_hits > 0:
                    f.write(''.join((basename, '\t', str(file_path), '\n')))
        except ValueError:
            print('parsing error on %s' % basename)
Example #42
def get_hit_seq(fastafile, filename):
	yamlfile = yaml_load_file(fastafile)
	blout = SearchIO.parse(filename, 'blast-text')
	for query in blout:
		seqid = query.id.split("\n")[0]
		#print(seqid)
		fh = open("multi_" + seqid + ".fasta", 'a')
		yamlfile[seqid]['hits'] = {}
		for hit in query.hits:
			gi = re.match(r"gi\|(.*)\|ref", hit.id).group(1)
			yamlfile[seqid]['hits'][gi] = {}
			#print(yamlfile[seqid]['hits'])
			for hsp in hit.hsps:
				#print(hsp.hit)
				#print(hsp.hit_strand)
				#print(hsp.hit_start)
				#print(hsp.hit_end)
				hitstart = hsp.hit_start + 1 - HIT_SEQUENCE_BPS
				hitstart = 1 if hitstart < 1 else hitstart  # blastdbcmd ranges are 1-based
				hitend = hsp.hit_end + 1 + HIT_SEQUENCE_BPS
				hitstrand = "plus" if (hsp.hit_strand == 1) else "minus"
				#print(hsp.hit_end)
				out = os.popen(BLAST_BINARY + "/blastdbcmd -db " + BLAST_DATABASE + " -dbtype nucl -entry " + str(gi) + " -range " + str(hitstart) + "-" + str(hitend) + " -strand " + str(hitstrand)).read()
				fh.write(out)
				#print(hsp.hit.seq)
				#print(hsp.query.seq)
				#print("-----")
		fh.close()
		#break
	yaml_dump_file(fastafile, yamlfile)
Example #43
def first_exonerate_parse(dir, newdir, prefix):
	cwd = os.getcwd()
	if not os.path.exists(cwd + newdir):
		os.makedirs(cwd + newdir)
	if not os.path.exists(cwd + '/merged_exons/'):
		os.makedirs(cwd + '/merged_exons/')
	for file in slistdir(cwd + dir):
		if 'DS_Store' not in file:
			result = SearchIO.parse(cwd + dir + file, 'exonerate-text')
			for h in result:
				for hh in h:
					for hhh in hh:
						hitcounter = 1
						for hhhh in hhh:
							hitseq =  hhhh.query
							rootname = file.split('.fasta')
							orthoname = file.split("_")
							orthosubdir = cwd + newdir + '/' + orthoname[0]
							if not os.path.exists(orthosubdir):
								os.makedirs(orthosubdir)
							newseqstr = str(hitseq.seq.ungap("-"))
							newid = prefix + str(hitcounter) + '_' + rootname[0]
							record = SeqRecord(Seq(newseqstr, generic_dna), id =  newid, description = '')
							fastaname = prefix + str(hitcounter) + '_' + rootname[0] + '.fasta'
							SeqIO.write(record, orthosubdir + '/' + fastaname, "fasta")
							hitcounter += 1
    def retrieve_blast_data(self):
        for blast_file in glob.glob(self.blast_data_path):
            print(blast_file)
            print(self.network_data)
            qresults = SearchIO.parse(blast_file, 'blast-tab', comments=True)
            for qresult in qresults:
                if qresult.id in self.network_data:
                    print(qresult.id)
Example #46
def parse(target):
    blast_result = list(SearchIO.parse('BlastResult.xml', 'blast-xml'))
    for record in blast_result:
        if len(record) == 0:
            continue
        else:
            tophit = record[0]
        target.append([tophit[0][0].query, tophit[0][0].hit])
    def generate_blast_data(self):
        self.initialize_variables()
        for blast_file in glob.glob(self.blast_data_path):
            # Parse each Blast file
            query_results = SearchIO.parse(blast_file, 'blast-tab', comments=True)
            filtered_query_results = self.apply_filtering(query_results)
            # Parse each blast record
            for query_result in filtered_query_results:
                print(query_result.id)
                self.generate_blast_graph(query_result)
def runBlastParserTAB(cline, blast_out_file, comments=False):
	startTime = datetime.now()
	os.system(str(cline))
	print 'Running BLAST:' + str(datetime.now() - startTime)
	startTime = datetime.now()
	blast_records = SearchIO.parse(blast_out_file, 'blast-tab', comments=comments)

	print 'Parsing Results:' + str(datetime.now() - startTime)

	return blast_records
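Hypothetical usage of the helper above, assuming Bio.Blast.Applications is available and a nucleotide database named 'genome_db' already exists; outfmt=6 produces the comment-free tabular output the default comments=False expects:

from Bio.Blast.Applications import NcbiblastnCommandline

cline = NcbiblastnCommandline(query='genes.fasta', db='genome_db',
                              outfmt=6, out='blast_out.tab')
blast_records = runBlastParserTAB(cline, 'blast_out.tab')
for qresult in blast_records:
    print('%s\t%d' % (qresult.id, len(qresult.hits)))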
def process_tot():
	print "beginning process_tot"
	#level name
	level = sys.argv[3] + " "
	#get the name of the proteome from the file name
	omeid = sys.argv[1].replace(".fasta", "")
	omeid = omeid.split('/')
	omeid = omeid[len(omeid)-1]
	#read in results; SearchIO.parse returns a generator, so drain it into a
	#list so that len() works and the results can be iterated below
	results = list(SearchIO.parse(sys.argv[2], "hmmer3-text"))

	#build up a list of entries per protein and write them out below
	count = 0

	scans = len(results)
	cutoff = 1.0 / scans
	print scans
	print cutoff

	for protein in results:
		processed = []
		if count % 100 == 0:
			print count
		count = count + 1
		pid = protein.id + " "
		if len(protein) == 0:  #if a protein has no hits, groupid=proteinid and rank=0
			rank = "0 "
			OGid = protein.id + " "
			e = "n/a "
			qr = "n/a "
			processed.append((rank, level, pid, OGid, e, qr, omeid))
		elif protein[0].evalue > cutoff:  #hits that do not meet the threshold are treated as no hits
			rank = "0 "
			OGid = protein.id + " "
			e = "n/a "
			qr = "n/a "
			processed.append((rank, level, pid, OGid, e, qr, omeid))
		else:
			i = 0
			while i < len(protein) and protein[i].evalue <= cutoff:
				rank = str(i+1) + " "
				OGid = protein[i].id.split('.')
				OGid = OGid[0] + "." + OGid[1] + " "
				e = str(protein[i].evalue) + " "
				qr = []  #empty list for domain ranges of this hit
				for d in protein[i]:
					qr.append(d.query_range)
				processed.append((rank, level, pid, OGid, e, str(qr).replace(" ", ""), " ", omeid))
				i += 1
		#write this protein's entries to file
		#('output' is assumed to be a module-level file handle opened elsewhere)
		for i in processed:
			output.write("".join(str(s) for s in i) + "\n")

	output.close()
Example #50
def validate(candidate_file, input_file, n_seqs, min_len, min_coverage,
             max_mismatch):
    # remove gap in old alignment file
    no_gap = 'validate.fasta'
    with open(no_gap, 'w') as new, open(input_file, 'r') as old:
        for line in old:
            if line.startswith('>'):
                new.write(line)
            else:
                new.write(line.replace('-', ''))

    # build blast db
    candidate_fasta = 'primer_candidate.fasta'
    SeqIO.convert(candidate_file, 'fastq', candidate_fasta, 'fasta')
    run('makeblastdb -in {} -dbtype nucl'.format(no_gap), shell=True)
    # blast
    blast_result_file = 'BlastResult.xml'
    cmd = nb(num_threads=cpu_count(),
             query=candidate_fasta,
             db=no_gap,
             task='blastn',
             evalue=1e-5,
             max_hsps=1,
             max_target_seqs=n_seqs,
             outfmt=5,
             out=blast_result_file)
    stdout, stderr = cmd()
    # parse
    min_bitscore_raw = min_len - max_mismatch
    blast_result = [['ID', 'Hits', 'Sum_Bitscore_raw', 'Mean_start'], ]
    blast_result.append(['All', n_seqs, min_len])
    for query in SearchIO.parse(blast_result_file, 'blast-xml'):
        if len(query) == 0:
            blast_result.append([query.id, 0, 0])
            continue
        sum_bitscore_raw = 0
        good_hits = 0
        start = 0
        for hit in query:
            hsp_bitscore_raw = hit[0].bitscore_raw
            if hsp_bitscore_raw >= min_bitscore_raw:
                sum_bitscore_raw += hsp_bitscore_raw
                good_hits += 1
                start += sum(hit[0].hit_range) / 2
        blast_result.append([query.id, good_hits/n_seqs, sum_bitscore_raw,
                             start/n_seqs])
    # validate
    # validate_result = [['ID', 'Hits', 'Sum_Bitscore_raw', 'Seq'], ]
    validate_result = list()
    for record in blast_result[2:]:
        if record[1] >= min_coverage:
            validate_result.append(record)
    validate_result.sort(key=lambda x: x[1], reverse=True)
    return validate_result
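A hypothetical call to validate, with made-up file names and thresholds, assuming 100 sequences in the source alignment:

good_primers = validate('primers.fastq', 'alignment.fasta', n_seqs=100,
                        min_len=20, min_coverage=0.6, max_mismatch=2)
for row in good_primers:
    print(row)  # [id, hit ratio, summed raw bitscore, mean hit midpoint]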
def start_queryResult_generator(inFile, fDic, work_sheet):
    """ invoking the parse function to return a 'generator' that can allow you 
        to step though the record one QueryResult Object at a time but invoking
        nextQuery = (next)generator on it.This approach can allow you to save 
        on memory. I have found with my current task casting this generator with
        (list) works fine but it is really not called for in this current 
        task of parsing and sorting the records.
    """
    """ http://biopython.org/DIST/docs/api/Bio.SearchIO.BlastIO-module.html"""
    qGenerator = SearchIO.parse(inFile, 'blast-xml')
    max_hits = 0
    query_count = 1
    # Step through all the records in the lump xml data file and write out
    # each separate hit to file. Also write the summary information to the
    # work sheet.
    for query_result in qGenerator:
        print('Processing Query BLAST return ' + str(query_count))
        number_hits = int(len(query_result.hits))
        # Extend header out right if new MAXHITS
        if number_hits > max_hits:
            max_hits = number_hits       
        if number_hits == 0:
            # Construct path plus file name for no hit query
            filename = str(fDic['topDir'] + fDic['noHit'] + 'Query_' 
                           + str(query_count) + '_H_none.xml')
            # Write out any queries that had no hits to a no-hit subfolder
            SearchIO.write(query_result, filename, 'blast-xml')
            write_qr_to_ws(query_count, query_result, work_sheet)
        else:
            # Now set up a counter of 'hits' in the QueryResult so hits
            # can be sliced away into their own records cleanly.
            hit_count = 0
            for hit in query_result.hits:
                total_hsps = len(hit.hsps)
                lowest_eval = hit.hsps[0].evalue
                best_hsp = hit.hsps[0]
                for hsp in hit.hsps:
                    if hsp.evalue < lowest_eval:
                        lowest_eval = hsp.evalue
                        best_hsp = hsp
                filename = str(fDic['topDir'] + outputFileName(query_count, hit, best_hsp))
                SearchIO.write(query_result[hit_count:(hit_count + 1)], filename, 'blast-xml')
                hit_count += 1
            # Write out query_result to worksheet           
            write_qr_to_ws(query_count, query_result, work_sheet)
        query_count += 1
        # break is debugging code
        # if query_count == 20:
        #   break
    build_ws_header(work_sheet, max_hits)
    return qGenerator  # note: the generator has been exhausted by the loop above
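A minimal sketch of the generator behaviour the docstring above describes ('results.xml' is a hypothetical BLAST XML file):

from Bio import SearchIO

qgen = SearchIO.parse('results.xml', 'blast-xml')   # lazy: nothing is parsed yet
first = next(qgen)                                  # step one QueryResult at a time
print('%s has %d hits' % (first.id, len(first.hits)))
rest = list(qgen)                                   # or drain the remainder into memory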
Example #52
def FilterFASTAByPSL(fasta_fnames, psl_fnames, output_dir,
					 output_filename_postfix=None):
	"""Produces a FASTA file with only those sequences in the PSL.

	Iterates over pairs of FASTA and PSL files and produces a new
	FASTA file containing only those sequences for which there was a 
	record in the PSL file.

	Args:
		fasta_fnames: the FASTA filenames.
		psl_fnames: the PSL (BLAT output) filenames in the same order.
	
	Returns:
		A list of paths to filtered FASTA files.
	"""
	filtered_fnames = []
	for fasta_fname, psl_fname in zip(fasta_fnames, psl_fnames):
		filtered_fname = filename_util.MakeFASTAFilename(
			fasta_fname, dest_dir=output_dir,
			postfix=output_filename_postfix)
		filtered_fnames.append(filtered_fname)
		if path.exists(filtered_fname):
			print 'Skipping filtering of %s as output exists' % psl_fname
			continue

		# Get the IDs of all the matching sequences.
		parsed = SearchIO.parse(psl_fname, 'blat-psl')
		ids_with_hits = set()
		for record in parsed:
			for hsp in record.hsps:
				ids_with_hits.add(hsp.query_id)

		parsed = SeqIO.parse(fasta_fname, 'fasta')
		retained = []
		n_seqs = 0
		for record in parsed:
			n_seqs += 1
			if record.id in ids_with_hits:
				retained.append(record)

		assert len(ids_with_hits) == len(retained), 'Some sequences missing!'
		pct_retained = 100 * float(len(retained)) / float(n_seqs)

		print '\tRetained %d of %d records (%.2f%%)' % (len(retained),
													   n_seqs, pct_retained)
		print '\tWriting output to', filtered_fname
		SeqIO.write(retained, filtered_fname, 'fasta')

		# Force delete these lists since they might be very big.
		del retained
		del ids_with_hits
	return filtered_fnames
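Hypothetical usage of FilterFASTAByPSL, assuming paired FASTA and PSL files from the same BLAT run (all file names made up):

filtered = FilterFASTAByPSL(['reads_a.fasta', 'reads_b.fasta'],
                            ['reads_a.psl', 'reads_b.psl'],
                            output_dir='filtered/',
                            output_filename_postfix='with_hits')
print(filtered)  # paths to the filtered FASTA files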
Example #53
def get_model_scores(model_output):
    """Get the bit score for each hit/domain in a hmmersearch result

    Parameters:
    -----------
    model_output : str or File-like object
        Path to hmmersearch output file

    Return:
    -------
    A list of all bitscores
    """
    return [hsp.bitscore for query in SearchIO.parse(model_output, "hmmer3-text") \
        for hit in query for hsp in hit] 
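Hypothetical usage, e.g. to eyeball the score distribution of one run ('model_vs_proteome.txt' is a made-up path to hmmsearch text output):

scores = get_model_scores('model_vs_proteome.txt')
print('%d domains, bitscores %.1f to %.1f' % (len(scores), min(scores), max(scores)))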
Example #54
def main():
    start = timer()
    arg = argparse.ArgumentParser()
    arg.add_argument('input', help='input BLAST result (xml format)')
    arg.add_argument('-s', '--simple', action='store_true',
                     help='only handle first hsp')
    arg.add_argument('-ss', '--very_simple', action='store_true',
                     help='only handle first hit')
    arg = arg.parse_args()

    def safe(old):
        return re.sub(r'\W', '_', old)

    xml = SearchIO.parse(arg.input, 'blast-xml')
    handle_tsv = open('{}.tsv'.format(arg.input), 'w')
    handle_tsv.write('Query\tbitscore\tSpecies name\thit\n')
    for query in xml:
        if query.description != '':
            query.id = ''.join([query.id, query.description])
        if len(query) == 0:
            with open(arg.input+'_not_found.log', 'a') as not_found:
                not_found.write('{} not found!\n'.format(query.id))
            continue
        handle = open('{}.fasta'.format(safe(query.id)), 'w')
        SeqIO.write(query[0][0].query, handle, 'fasta')
        for hit in query:
            for hsp in hit:
                species_name = hsp.hit.description.split(' ')
                if species_name[0].isupper():
                    species_name = '{}_{}_{}'.format(
                        *species_name[1:3], species_name[0].replace(':', ''))
                else:
                    species_name = '_'.join(species_name[:2])
                info = '{}\t{}\t{}\t{}{}\n'.format(
                    query.id,
                    hsp.bitscore,
                    species_name,
                    hsp.hit.id, hsp.hit.description)
                handle_tsv.write(info)
                hsp.hit.id = '{}|{}'.format(hsp.bitscore, hsp.hit.id)
                SeqIO.write(hsp.hit, handle, 'fasta')
                if arg.simple or arg.very_simple:
                    break
            if arg.very_simple:
                break
        handle.close()
    handle_tsv.close()
    end = timer()
    print('Cost {:.3f} seconds.'.format(end-start))
def process_small():
	#get the name of the proteome from the file name
	pid = sys.argv[1].replace(".fasta", "")
	pid = pid.split('/')
	pid = pid[len(pid)-1] 
	#read in results
	results = SearchIO.parse(sys.argv[2], "hmmer3-text")

	#build up list of entries	
	processed = [] #initialize list to add entries to
	for protein in results:
		if len(protein) == 0 or protein[0].evalue > 0.01: #if a query has no significant hits groupid=proteinid and rank=0
			OGid = protein.id
			es = "n/a"
			rank = "0"
		elif len(protein) == 1: #if a query has 1 hit it is recorded with rank=1
			OG = protein[0]
			OGid = OG.id.replace(".meta_raw", "")
			es = str(OG.evalue)
			rank = "1"
		else:	#if a query has more hits the top hit is recorded with rank=1
			OG = protein[0]
			OGid = OG.id.replace(".meta_raw", "")
			e = OG.evalue
			es = str(e)
			OG2 = protein[1]
			e2 = OG2.evalue
			rank = "1"
			#if the second hit's evalue is within 10 orders of magnitude of
			#the top hit's (log10(e/e2) >= -10), it is also recorded with rank=2
			if e2==0:
				OG2id = OG2.id.replace(".meta_raw", "")
				processed.append(("2 ", sys.argv[3]+" ", protein.id+" ", OG2id+" ", str(e2)+" ", pid))
			elif e!=0 and math.log10(e/e2) >= -10: 
				OG2id = OG2.id.replace(".meta_raw", "")
				processed.append(("2 ", sys.argv[3]+" ", protein.id+" ", OG2id+" ", str(e2)+" ", pid))
		processed.append((rank+" ", sys.argv[3]+" ", protein.id+" ", OGid+" ", es+" ", pid))	

	#write entries to file
	if os.path.isfile(sys.argv[4]): #if an existing file was provided append entries to that file
		output = open(sys.argv[4], "a")
	else: #else make a new file and add a header
		output = open(sys.argv[4], "w")
		output.write("Rank Level ProteinID GroupID evalue ProteomeID \n")
	for i in processed:
		output.write("".join(str(s) for s in i) + "\n")
	output.close()	
Example #56
def parse_blast():
    parse_result = list()
    blast_result = SearchIO.parse('out/BlastResult.xml', 'blast-xml')
    for record in blast_result:
        if len(record) == 0:
            continue
        else:
            tophit = record[0]
        query_info = ''.join([
            tophit[0][0].query_id,
            ' ',
            tophit[0][0].query_description
        ])
        hit_info = tophit[0][0].hit.id
        parse_result.append([query_info, hit_info])
    parse_result = dict(parse_result)
    return parse_result
Example #57
def blastparse(stdout, output, tname, ntname):
    global recorddict, minLength
    # evaluehit = True
    handle = open(output, 'w')  # open the target fasta file for writing
    blast_handle = cStringIO.StringIO(stdout)  # Convert string to IO object for use in SearchIO using StringIO
    try:  # Necessary to avoid bad genomes
        for qresult in SearchIO.parse(blast_handle, 'blast-tab'):  # Parse the blast output string as if it were a file
            for hit in qresult:  # Hit object
                for hsp in hit:  # Hsp object
                    begin = hsp.query_range[0]  # Start of hsp
                    finish = hsp.query_range[1]  # End of hsp
                    if hsp.query_id in recorddict:
                        # Change the hit to lower case for the first time
                        sequence = recorddict[hsp.query_id].seq[begin:finish]  # make mutable
                        if sequence != "N" * len(sequence):
                            if str(sequence).isupper():
                                # sequence = sequence[begin:finish].tostring().lower()
                                recorddict[hsp.query_id].seq[begin:finish] = str(sequence).lower()
                                # print repr(recorddict[hsp.query_id].seq[begin:finish])
                            elif re.search('[A-Z]+', str(sequence)) is not None:
                                recorddict[hsp.query_id].seq[begin:finish] = str(sequence).lower()
                            # For the Contig name in the target fasta dictionary mask using coordinates
                            else:
                                if finish > begin:
                                    recorddict[hsp.query_id].seq = \
                                        recorddict[hsp.query_id].seq[:begin] + 'N' * (finish - begin + 1) \
                                        + recorddict[hsp.query_id].seq[finish:]
                                else:
                                    recorddict[hsp.query_id].seq \
                                        = recorddict[hsp.query_id].seq[:finish] + 'N' * (begin - finish + 1) \
                                        + recorddict[hsp.query_id].seq[begin:]
        recorddict_bak = deepcopy(recorddict)  # Copy the dictionary so we may iterate and modify the result
        for idline in recorddict_bak:
            recorddict[idline].seq = recorddict[idline].seq.toseq()
            # pattern = r'[^N]{'+ re.escape(str(minLength))+r'}' #  Find a sequence of at least the target length
            pattern = r'[^N]{' + re.escape(str(minLength)) + r',}|[ATCG]{20,}N{200,900}[ATCG]{20,}'
            # overlapped matching requires the third-party 'regex' module, so plain re is used here
            if re.match(pattern, str(recorddict[idline].seq), re.IGNORECASE) is not None:
                SeqIO.write(recorddict[idline], handle, "fasta")
                recorddict[idline].seq = recorddict[idline].seq.tomutable()
            else:
                # print 'Contig \'%s\' not written to file' % id
                recorddict.pop(idline)
    except ValueError:
        print 'Value Error: There was an error removing %s genome from %s' % (ntname, tname)
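The keep/drop pattern above is easier to see on toy sequences. A minimal sketch using the standard re module; note that re.match anchors at the start of the string, so a contig that begins with N fails even if a long clean stretch follows:

import re

minLength = 10  # assumed value for illustration
pattern = r'[^N]{' + re.escape(str(minLength)) + r',}|[ATCG]{20,}N{200,900}[ATCG]{20,}'
assert re.match(pattern, 'ACGT' * 5) is not None     # long non-N run: contig kept
assert re.match(pattern, 'N' * 500) is None          # nothing but N: contig dropped
assert re.match(pattern, 'N' + 'ACGT' * 50) is None  # leading N defeats re.match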
def computeStats(aliFName, validCols):

    # Quick to write but VERY slow to run;
    # could convert to a matrix and operate on it directly
    ali = AlignIO.read(aliFName, 'clustal')
    scores = {}

    for seqNumRemove in range(len(ali)):
        newSeqs = AlignIO.MultipleSeqAlignment([])
        aliWithValidCols = AlignIO.MultipleSeqAlignment([])

        for seqNum in range(len(ali)):
            if seqNum != seqNumRemove:
                newSeqs.append(ali[seqNum])

        aliWithValidCols = newSeqs[:,validCols[0]:validCols[0]+1]
        querySeq = ali[seqNumRemove, validCols[0]:validCols[0]+1]

        for col in validCols[1:]:
            aliWithValidCols += newSeqs[:,col:col+1]
            querySeq += ali[seqNumRemove, col:col+1]
        AlignIO.write(aliWithValidCols, open("/tmp/tempPartialAli", 'w'), 'clustal')
        SeqIO.write(querySeq, open('/tmp/querySeq', 'w'), 'fasta')
        print "seq %s out of %s completed" % (seqNumRemove, len(ali))
        print "running hmmer and gathering stats"

        hmmBuildCmd = ["/Users/mahdi/programs/hmmer-3.1b2-macosx-intel/binaries/hmmbuild", "/tmp/tempPartialAli.hmm", "/tmp/tempPartialAli"]
        hmmScanCmd  = ["/Users/mahdi/programs/hmmer-3.1b2-macosx-intel/binaries/hmmscan",
                      "/tmp/tempPartialAli.hmm", "/tmp/querySeq"]
        hmmPressCmd = ["/Users/mahdi/programs/hmmer-3.1b2-macosx-intel/binaries/hmmpress",
                      "/tmp/tempPartialAli.hmm"]
        with open(os.devnull, 'w') as fnull, open("/tmp/alResult", 'w') as out:
            subprocess.call(hmmBuildCmd, stdout=fnull)
            subprocess.call(hmmPressCmd, stderr=fnull, stdout=fnull)

            subprocess.call(hmmScanCmd, stderr=fnull, stdout=out)


            print hmmBuildCmd
            print hmmScanCmd
        search = SearchIO.parse("/tmp/alResult", "hmmer3-text").next()
        bScore = search.hsps[0].bitscore
        scores[search.id] = bScore
    return scores
Example #59
def Parse():
    parse_blast_results = list(SearchIO.parse(tmpfile_name, 'blast-xml'))
    for result in parse_blast_results:
        query_id = result.description
        add = list()
        for hit in result:
            hit_id = hit.description
            hit_evalue = hit[0].evalue  # Only use the first HSP's evalue
            name = hit_id.split(sep=' ')[:3]
            # The first three words of the description are usually enough to identify the organism
            name = ' '.join(name)
            dictionary[name] = None
            add.append([query_id, hit_id, str(hit_evalue), name])
        out.extend(add)

    # Translate organism name with YouDao API
    api_key = '1630771459'  # 1000 times per hour
    for words in dictionary.keys():
        if dictionary[words] is not None:
            continue
        youdao_results = urllib.request.urlopen(''.join([
            'http://fanyi.youdao.com/openapi.do?keyfrom=Blastit&key=',
            api_key,
            '&type=data&doctype=json&version=1.1&q=',
            words
        ])).read().decode('utf-8')
        parse_translate_results = json.loads(youdao_results)
        translation_results = parse_translate_results['translation']
        dictionary[words] = translation_results[0]

    # Output
    n = 0
    for item in out:
        if n % 3 == 0:
            print('\nQuery sequence id:\t', item[0])
        print(
            '\t', 'Description:', item[1], '\n',
            '\t', 'Evalue:', item[2], '\n',
            '\t', 'Possible name:', item[3], '\n',
            '\t', 'Chinese:', dictionary[item[3]],
            '\n'
        )
        n = n + 1
    return
Example #60
def get_hit_seq_megan(filename):
	yamlfile = yaml_load_file()
	blout = SearchIO.parse(filename, 'blast-text')
	for query in blout:
		seqid = query.id.split("\n")[0]
		#print(seqid)
		fh = open("multi_" + seqid + ".fasta", 'a')
		yamlfile[seqid]['hits'] = {}
		bcp = 1 - (my_module.BLAST_CUTOFF_PERCENT / 100)
		topscore = 0
		for hit in query.hits:
			gi = re.match(r"gi\|(.*)\|ref", hit.id).group(1)
			yamlfile[seqid]['hits'][gi] = {}
			#print(yamlfile[seqid]['hits'])
			for hsp in hit.hsps:
				if topscore == 0:
					topscore = hsp.bitscore
				if hsp.bitscore < (topscore * bcp):
					#print(seqid, " not included: ", gi, " score: ", str(hsp.bitscore), " topscore: ", topscore)
					continue
				if hsp.bitscore < my_module.BLAST_CUTOFF_SCORE:
					continue
				if int(100 * hsp.ident_num/hsp.aln_span) < my_module.BLAST_CUTOFF_PERCENT:
					continue
				if int(100 * (float(hsp.query_end - hsp.query_start + 1)/query.seq_len)) < my_module.BLAST_COVERAGE:
					continue
				#print(hsp.hit, "\t", hsp.hit_strand, "\t", hsp.hit_start, "\t", hsp.hit_end, "\t", hsp.bitscore)
				#print(hsp.query_end, "\t", hsp.query_start, "\t", query.seq_len , "\t", int(100 * (float(hsp.query_end - hsp.query_start + 1)/query.seq_len)))
				hitstart = hsp.hit_start + 1 - 10
				hitstart = 1 if hitstart <= 0 else hitstart
				hitend = hsp.hit_end + 1 + 10
				hitstrand = "plus" if (hsp.hit_strand == 1) else "minus"
				#print(hitstart, hitend)
				out = os.popen(my_module.BLAST_BINARY + "/blastdbcmd -db " + my_module.BLAST_DATABASE + " -dbtype nucl -entry " + str(gi) + " -range " + str(hitstart) + "-" + str(hitend) + " -strand " + str(hitstrand)).read()
				#print("out", out)
				fh.write(out)
				#print(hsp.hit.seq)
				#print(hsp.query.seq)
				#print("-----")
		fh.close()
		#break
	yaml_dump_file(yamlfile)