def blastparse(stdout, output, tname, ntname):
    global recorddict, minLength
    handle = open(output, 'w')  # open the target fasta file for writing
    blast_handle = cStringIO.StringIO(stdout)  # Convert string to IO object for use in SearchIO using StringIO
    try:  # Necessary to avoid bad genomes
        for qresult in SearchIO.parse(blast_handle, 'blast-tab'):  # Parse the blast output string as if it were a file
            for hit in qresult:  # Hit object
                for hsp in hit:  # Hsp object
                    begin = hsp.query_range[0]  # Start of hsp
                    finish = hsp.query_range[1]  # End of hsp
                    if hsp.query_id in recorddict:
                        # For the contig name in the target fasta dictionary, mask using the hsp coordinates
                        # query_range is a 0-based half-open interval, so the mask spans finish - begin bases
                        if finish > begin:
                            recorddict[hsp.query_id].seq = \
                                recorddict[hsp.query_id].seq[:begin] + 'N' * (finish - begin) \
                                + recorddict[hsp.query_id].seq[finish:]
                        else:
                            recorddict[hsp.query_id].seq \
                                = recorddict[hsp.query_id].seq[:finish] + 'N' * (begin - finish) \
                                + recorddict[hsp.query_id].seq[begin:]
        recorddict_bak = deepcopy(recorddict)  # Copy the dictionary so we may iterate and modify the result
        for idline in recorddict_bak:
            # pattern = r'[^N]{'+ re.escape(str(minLength))+r'}' #  Find a sequence of at least the target length
            pattern = r'[ATCG]{100,}N{200,900}[ATCG]{100,}|[^N]{' + re.escape(str(minLength))+r'}'
            if re.search(pattern, str(recorddict[idline].seq)) is not None:  # search the whole sequence, not just its start
                SeqIO.write(recorddict[idline], handle, "fasta")
            else:
                # print 'Contig \'%s\' not written to file' % id
                recorddict.pop(idline)
    except ValueError:
        print 'Value Error: There was an error removing %s genome from %s' % (ntname, tname)
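
A minimal self-contained demonstration of the filtering pattern used above, assuming minLength = 500 for illustration: a contig is kept if it carries two >=100 bp non-N flanks around a 200-900 bp masked gap, or any run of at least minLength non-N bases.

import re

minLength = 500
pattern = r'[ATCG]{100,}N{200,900}[ATCG]{100,}|[^N]{' + re.escape(str(minLength)) + r'}'
kept = 'A' * 150 + 'N' * 300 + 'G' * 150       # two long flanks around a masked gap
dropped = 'A' * 90 + 'N' * 1000 + 'G' * 90     # flanks too short and no long non-N run
print(re.search(pattern, kept) is not None)    # True
print(re.search(pattern, dropped) is not None) # False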
    def parse_write_and_compare(self, source_file, source_format, out_file, out_format, **kwargs):
        """Compares parsed QueryResults after they have been written to a file."""
        source_qresults = list(SearchIO.parse(source_file, source_format, **kwargs))
        SearchIO.write(source_qresults, out_file, out_format, **kwargs)
        out_qresults = list(SearchIO.parse(out_file, out_format, **kwargs))
        for source, out in zip(source_qresults, out_qresults):
            self.assertTrue(compare_search_obj(source, out))
Example #3
def parseBlastOutFile(filename):
    if filename[-3:] == "xml":
        qResultGen = SearchIO.parse(filename, 'blast-xml')
    elif filename[-3:] == "txt":
        qResultGen = SearchIO.parse(filename, 'blast-tab')
    else:
        print("Unrecognized filetype.")
        assert False

    parsed = {qRes.id: qRes for qRes in qResultGen}
    print("Parsed " + filename)

    return parsed
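
A short usage sketch (the file name here is hypothetical): the returned dict maps each query ID to its QueryResult, so individual queries can be looked up directly.

qresults = parseBlastOutFile("results.xml")
for query_id, qresult in qresults.items():
    print(query_id, len(qresult.hits))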
Example #5
    def __init__(self, string):
        """
        Define the attributes of a ExonerateGene object.

        - contig_id:     ID of source contig.
        - locs:          Genomic location of called gene on contig.
        - gene_id:       ID of called gene, derived from contig_id and locs.
        - ref:           Reference homolog (i.e. seed gene for exonerate).
        - internal_stop: Internal stop codon present or not.
        - introns:       Number of introns in called gene.
        - called:        Called gene's translated protein sequence.

        All attributes above are derived ultimately from exonerate output.

        Note: locs are always given in "positive" sense, regardless of the
        gene's actual sense; this is consistent with Biopython's SearchIO.

        Note: In weird cases, exonerate-text returns a negative start
        co-ordinate for some (not all?) reverse complement genes. I put in
        a second parse using exonerate-vulgar to determine co-ordinates for
        genes.
        """
        contig_id = ""
        introns = 0
        called = []
        for result in SearchIO.parse(string, "exonerate-text"):
            ref = result.id
            stop = False
            for hit in result:
                contig_id = hit.id
                called = []
                introns = len(hit[0].hit_inter_ranges)
                for fragment in hit[0].fragments:
                    for record in fragment.aln._records:
                        if record.name == "aligned hit sequence":
                            called.append(str(record.seq))
                            if "*" in record.seq[:-1]:
                                stop = True
            self.ref = "Exonerate={0}".format(str(ref))
            self.contig_id = contig_id
            self.internal_stop = "IS={0}".format(str(stop))
            self.introns = "Introns={0}".format(str(introns))
            self.called = "".join(called)
        string.seek(0)
        for result in SearchIO.parse(string, "exonerate-vulgar"):
            for hit in result:
                locs = hit[0].hit_range
                gene_id = "{0}_{1}".format(hit.id, "_".join(str(loc) for
                                                            loc in locs))
                self.locs = locs
                self.id = gene_id
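
A minimal usage sketch (hypothetical file name): ExonerateGene expects an open handle to exonerate output containing both alignment text and vulgar lines, since it parses the same handle twice and rewinds it with seek(0).

with open("exonerate_output.txt") as handle:
    gene = ExonerateGene(handle)
    print(gene.id, gene.ref, gene.introns, gene.internal_stop)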
    def check_index(self, filename, format, **kwargs):
        # check if Python3 installation has sqlite3
        try:
            import sqlite3
        except ImportError:
            sqlite3 = None

        if filename.endswith(".bgz"):
            handle = gzip.open(filename)
            parsed = list(SearchIO.parse(handle, format, **kwargs))
            handle.close()
        else:
            parsed = list(SearchIO.parse(filename, format, **kwargs))
        # compare values by index
        indexed = SearchIO.index(filename, format, **kwargs)
        self.assertEqual(
            len(parsed), len(indexed),
            "Should be %i records in %s, index says %i" %
            (len(parsed), filename, len(indexed)))
        # compare values by index_db, only if sqlite3 is present
        if sqlite3 is not None:
            db_indexed = SearchIO.index_db(':memory:', [filename], format,
                                           **kwargs)
            self.assertEqual(
                len(parsed), len(db_indexed),
                "Should be %i records in %s, index_db says %i" %
                (len(parsed), filename, len(db_indexed)))

        for qres in parsed:
            idx_qres = indexed[qres.id]
            # parsed and indexed qresult are different objects!
            self.assertNotEqual(id(qres), id(idx_qres))
            # but they should have the same attribute values
            self.assertTrue(compare_search_obj(qres, idx_qres))
            # sqlite3 comparison, only if it's present
            if sqlite3 is not None:
                dbidx_qres = db_indexed[qres.id]
                self.assertNotEqual(id(qres), id(dbidx_qres))
                self.assertTrue(compare_search_obj(qres, dbidx_qres))

        indexed.close()
        if sqlite3 is not None:
            db_indexed.close()
            db_indexed._con.close()

        if os.path.isfile(filename + ".bgz"):
            # Do the tests again with the BGZF compressed file
            print("[BONUS %s.bgz]" % filename)
            self.check_index(filename + ".bgz", format, **kwargs)
Example #7
def hmmer_results_parser(hmm_filename, edges, len_th, evalue_th, K, sequences = []):
    sequences_hits = {}
    hmm_hits = {}
    hits_lst = []
    with open(hmm_filename) as handle:
        for record in SearchIO.parse(handle, 'hmmscan3-domtab'):
            hmm_name = record.id
            hmm_len = record.seq_len
            for h in record.hits:
                seq_name = h.id
                hit_evalue = h.evalue
                for f in h.fragments:
                    hit_len = f.hit_end - f.hit_start
                    seq_s, seq_e = f.query_start, f.query_end - 1
                    if hit_len > len_th*hmm_len and hit_evalue < evalue_th:
                        if len(sequences) > 0:
                            cur_seq = sequences[seq_name]
                        else:
                            cur_seq = merge(edges, K, seq_name)
                        hits_lst.append({"seq_name": seq_name,"hmm_name": hmm_name, "start": seq_s, "end": seq_e, \
                                         "seq": cur_seq.seq[seq_s:seq_e], "e-val": hit_evalue })
                        if seq_name not in sequences_hits:
                            sequences_hits[seq_name] = []
                        if hmm_name not in hmm_hits:
                            hmm_hits[hmm_name] = []
                        sequences_hits[seq_name].append({"hmm_name": hmm_name, "start": seq_s, "end": seq_e})
                        hmm_hits[hmm_name].append({"seq_name": seq_name, "start": seq_s, "end": seq_e, "len": len(cur_seq.seq), "hmm_len": hmm_len})
    return hits_lst, sequences_hits, hmm_hits
    def check_index(self, filename, format, **kwargs):
        # check if Python3 installation has sqlite3
        try:
            import sqlite3
        except ImportError:
            sqlite3 = None

        parsed = list(SearchIO.parse(filename, format, **kwargs))
        # compare values by index
        indexed = SearchIO.index(filename, format, **kwargs)
        self.assertEqual(len(parsed), len(indexed.keys()))
        # compare values by index_db, only if sqlite3 is present
        if sqlite3 is not None:
            db_indexed = SearchIO.index_db(':memory:', [filename], format, **kwargs)
            self.assertEqual(len(parsed), len(db_indexed.keys()))

        for qres in parsed:
            idx_qres = indexed[qres.id]
            # parsed and indexed qresult are different objects!
            self.assertNotEqual(id(qres), id(idx_qres))
            # but they should have the same attribute values
            self.assertTrue(compare_search_obj(qres, idx_qres))
            # sqlite3 comparison, only if it's present
            if sqlite3 is not None:
                dbidx_qres = db_indexed[qres.id]
                self.assertNotEqual(id(qres), id(dbidx_qres))
                self.assertTrue(compare_search_obj(qres, dbidx_qres))

        indexed._proxy._handle.close()  # TODO - Better solution
        if sqlite3 is not None:
            db_indexed.close()
            db_indexed._con.close()
Example #9
    def get_scores_for_curated_via_hmm(self):
        """
        For every curated variant we want to generate a set of scores against HMMs.
        This is needed to supply the same type of information for curated as well as for automatic seqs.
        """
        # Construct the one big file from all curated seqs.
        with open(self.curated_all_fasta, "w") as f:
            for hist_type, seed in self.get_seeds():
                seed_aln_file = os.path.join(self.seed_directory, hist_type, seed)
                for s in SeqIO.parse(seed_aln_file, "fasta"):
                    s.seq = s.seq.ungap("-")
                    SeqIO.write(s, f, "fasta")
        # Search it with our HMMs
        self.search(hmms_db=self.combined_hmm_file, out=self.curated_search_results_file, sequences=self.curated_all_fasta)
        # We need to parse this results file;
        # we take here a snippet from load_hmmsearch.py, and tune it to work for our curated seq header format
        for variant_query in SearchIO.parse(self.curated_search_results_file, "hmmer3-text"):
            print "Loading hmmsearch for variant:", variant_query.id
            variant_model = Variant.objects.get(id=variant_query.id)
            for hit in variant_query:
                gi = hit.id.split("|")[1]
                seq = Sequence.objects.get(id=gi)
                # print hit
                try:  # sometimes we get this: [No individual domains that satisfy reporting thresholds (although complete target did)]
                    best_hsp = max(hit, key=lambda hsp: hsp.bitscore)
                    add_score(seq, variant_model, best_hsp, seq.variant == variant_model)
                except Exception:
                    pass
Example #10
def get_blast_alignments(blast_path):
    records = SearchIO.parse(blast_path, 'blast-xml')
    hit_list = []
    results = []
    for idx, cur in enumerate(records):
        for hit in cur.hits:
            for i, hsp in enumerate(hit.hsps):
                if cur.id != hit.id:
                    qs = hsp.fragment.query_start
                    qe = hsp.fragment.query_end
                    he = hsp.fragment.hit_end
                    hs = hsp.fragment.hit_start
                    query_s = str(hsp.fragment.query.seq)
                    hit_s = str(hsp.fragment.hit.seq)
                    aln_s = hsp.aln_annotation['similarity']
                    score = hsp.bitscore
                    expect = hsp.evalue
                    toks = list(
                        map(str, [
                            cur.id, hit.id, i, qs, qe, hs, he, query_s, hit_s,
                            aln_s, score, expect
                        ]))
                    results.append(toks)
    columns = [
        'query_id', 'hit_id', 'fragment_num', 'query_start', 'query_end',
        'hit_start', 'hit_end', 'query_string', 'hit_string',
        'alignment_string', 'score', 'evalue'
    ]
    return pd.DataFrame(results, columns=columns)
Example #11
def parse_hmmer_output(results):
    """Parse hmmsearch output

    Args:
        results: file handle or name of the hmmsearch output file to parse
    Return:
        hit_info: list of Hit objects, with information
                 - query, subject, identity, coverage, e-value, bit score
    """
    hit_info = []
    for record in SearchIO.parse(results, 'hmmer3-text'):
        if not record.hits:
            continue
        for hit in record.hits:
            hit_class = Hit(
                query=record.accession,  # Pfam id
                subject=hit.id,  # Hit id
                identity=None,  # Not present
                coverage=None,  # Not present
                evalue=hit.evalue,  # E-value of hit
                bitscore=hit.bitscore,  # Bit score of hit
            )
            hit_info.append(hit_class)
    if not hit_info:
        LOG.error("No hits have been found")
    return hit_info
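
A short usage sketch (hypothetical file name); Hit here is the module's own result class populated above, not a Biopython type.

for h in parse_hmmer_output("hmmsearch_output.txt"):
    print(h.query, h.subject, h.evalue, h.bitscore)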
Example #12
def check_bl_out(in_fasta, in_xml):
    skip_blast = False
    if not os.path.isfile(in_xml):
        logging.info(in_xml + " does not exist")
        skip_blast = False
    elif os.stat(in_xml).st_size == 0:
        logging.info(in_xml + " is empty")
        os.remove(in_xml)
        skip_blast = False
    else:
        try:
            blast_ids = natsorted([
                qresult.id for qresult in SearchIO.parse(in_xml, 'blast-xml')
            ])
            fa_ids = natsorted(
                [seq.id for seq in SeqIO.parse(in_fasta, "fasta")])
            if blast_ids == fa_ids:
                skip_blast = True
            else:
                logging.info(
                    "Input and output sequence IDs do not match: " +
                    in_xml)
                os.remove(in_xml)
                skip_blast = False
        except:
            logging.info("Cannot read " + in_xml)
            os.remove(in_xml)
            skip_blast = False

    return skip_blast
Example #13
def runHMMsearch(input, basename, tmpdir, cpus, evalue, hmm):
    Results = {}
    #load proteins into dictionary
    protein_dict = SeqIO.to_dict(SeqIO.parse(input, 'fasta'))
    #do hmmer search of proteins
    HMM = os.path.join(tmpdir, basename + '.hmmsearch.txt')
    subprocess.call(
        ['hmmsearch', '-o', HMM, '--cpu',
         str(cpus), '-E', evalue, hmm, input],
        stdout=FNULL,
        stderr=FNULL)
    with open(HMM) as results:
        for qresult in SearchIO.parse(results, "hmmer3-text"):
            query_length = qresult.seq_len  #length of HMM model
            hits = qresult.hits
            num_hits = len(hits)
            if num_hits > 0:
                query = hits[0].id
                hit = hits[0].query_id
                score = hits[0].bitscore
                evalue = hits[0].evalue
                num_hsps = len(hits[0].hsps)
                aln_length = 0
                for x in range(0, num_hsps):
                    aln_length += hits[0].hsps[x].aln_span
                if hit not in Results:
                    Results[hit] = [query, score, evalue, aln_length, 'Hmmer3']
    for k, v in Results.items():
        description = basename + '|' + k + "|" + v[0] + "|evalue=" + str(
            v[2]) + "|HMMer3-Complete"
        Results[k].append(description)
        Seq = str(protein_dict[v[0]].seq)
        Results[k].append(Seq)
    return Results
Example #14
def get_hits_to_VPFs(hmmout_file):
    '''Takes a HMMER3 hmmsearch tab output file as an input and
    returns a dictionary mapping each scaffold with the number of unique genes that match a protein family

    Input:
        - hmmout_file (str): path to HMMER3 hmmsearch out file in tab format

    Returns:
        - hits_to_VPFs (dict): dictionary where keys are scaffold IDs and values are the number of unique genes that matched a protein family
    '''
    hits_to_VPFs = {}
    with open(hmmout_file, 'r') as input:
        for qresult in SearchIO.parse(input, 'hmmer3-tab'):
            hits = qresult.hits
            num_hits = len(hits)
            if num_hits > 0:
                for i in range(0, num_hits):
                    query_seq_id = hits[i].id
                    scaffold, gene = query_seq_id.split('|')
                    hits_to_VPFs[scaffold] = hits_to_VPFs.get(
                        scaffold, set([])).union([gene])

    for key, value in iter(hits_to_VPFs.items()):
        hits_to_VPFs[key] = len(value)
    return hits_to_VPFs
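
A short usage sketch (hypothetical file name): scaffolds ranked by the number of genes with a protein-family hit.

hits_to_VPFs = get_hits_to_VPFs("hmmsearch_tab.out")
for scaffold, n_genes in sorted(hits_to_VPFs.items(), key=lambda kv: -kv[1]):
    print(scaffold, n_genes)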
Example #15
def parse_blast(blast_pdb_file, max_E, min_pcid, max_pcid, hits):
    with open_file(blast_pdb_file) as f:
        for qresult in SearchIO.parse(f, 'blast-xml'):
            query = qresult.id  # .split("|")[1]
            for hit in qresult:
                s = hit.id + hit.description
                hsp = hit[0] # Only the 1st one
                evalue = hsp.evalue
                pcid = float(hsp.ident_num)/hsp.aln_span*100
                if (evalue <= max_E
                        and min_pcid <= pcid <= max_pcid):
                    # print "\t>HIT:",hit.id, set(re.findall("pdb\|\w\w\w\w\|\w", s))
                    # print "\t", hsp.evalue, "{:2.1f}".format(pcid)
                    # print hsp.query_start, hsp.query_end
                    # print hsp.hit_start+1, hsp.hit_end
                    for match in re.findall(r"pdb\|\w\w\w\w\|\w", s):
                        pdb, chain = match.split("|")[1:]
                        hits[query][pdb][chain]={
                                      "ide": "{:2.1f}".format(pcid),
                                      "e-val": evalue,
                                      "q-start": str(hsp.query_start+1),
                                      "q-end": str(hsp.query_end),
                                      "s-start": str(hsp.hit_start+1),
                                      "s-end": str(hsp.hit_end)
                                      }
                else:
                    break
    return hits
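
A usage sketch within this module's context (its open_file helper): hits must support nested assignment, e.g. a nested defaultdict; the file name and thresholds here are hypothetical.

from collections import defaultdict

hits = defaultdict(lambda: defaultdict(dict))
hits = parse_blast("blast_pdb.xml", max_E=1e-5, min_pcid=30.0, max_pcid=95.0, hits=hits)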
Example #16
    def quick_structurome(self, xml_blast_result, data_dir, entries, tmp_dir="/tmp/chain_PDBs",
                          pdb_divided="/data/databases/pdb/divided/", max_models=3):

        good_model = defaultdict(lambda: [])

        def identity(hsp):
            return 1.0 * hsp.ident_num / hsp.aln_span

        _log.info("searching good templates")
        for query in tqdm(bpsio.parse(xml_blast_result, "blast-xml")):
            for hit in query:
                if list(hit):
                    hsp = list(hit)[0]
                    if 0.6 <= identity(hsp) < 0.95:
                        good_model[hsp.query.id].append(hsp)

        tuplas = good_model.items()

        _log.info("creating models")
        with tqdm(tuplas) as pbar:
            for seq, hsps in pbar:
                try:
                    from SNDG.Structure.Modelome import Modelome
                    Modelome.model_hsps(seq, data_dir, hsps, entries=entries, tmp_dir=tmp_dir,
                                        pdb_divided=pdb_divided, max_models=max_models)
                except Exception as ex:
                    _log.exception(ex)
Example #17
    def load_hsp_dict(self, xml_blast_result):

        for query in bpsio.parse(xml_blast_result, "blast-xml"):
            for hit in query:
                if list(hit):
                    hsp = list(hit)[0]
                    self.hsp_dict[query.id][hsp.hit.id] = hsp
def construct_gene_scores_matrix(hmmtable):
    """
    Parse hmmscan tabular output to a dictionary.
    Arguments:
        hmmtable: pathlib.Path instance: Path to the hmmscan output, specified
            with hmmscan's --tblout option. Can also be str.
    Return:
        dic_genes_scores: dict: A dictionary with the gene ids as keys with
            a list of lists for all its hits. This is of the form
            { gene_id: [
                [hit id (str),
                 hit E-value (np.float32),
                 hit bit-score (np.float32),
                 hit bias (np.float32)],
                ...],
              ...}
    """
    dic_genes_scores = {}
    for gene in SearchIO.parse(hmmtable, "hmmer3-tab"):
        dic_genes_scores[gene.id] = []
        for hit in gene.hits:
            hit_info = [
                hit.id,
                np.float32(hit.evalue),
                np.float32(hit.bitscore),
                np.float32(hit.bias),
            ]
            dic_genes_scores[gene.id].append(hit_info)
    return dic_genes_scores
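
A short usage sketch (hypothetical file name): pick the best-scoring hit for each gene from the parsed table.

scores = construct_gene_scores_matrix("hmmscan_tblout.txt")
for gene_id, gene_hits in scores.items():
    best = max(gene_hits, key=lambda h: h[2])  # index 2 is the bit-score
    print(gene_id, best[0], best[2])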
Example #19
def features_via_hmm(seq, hmmdb, eval_thresh=1.0):
    """
    This function takes a Seq, runs hmmscan against a compressed hmmdb (prepared with hmmpress)
    and outputs a list of Biopython SeqFeature objects.
    # Strictly needs HMMER 3.0!!!!
    """
    features = list()
    ufn = str(uuid.uuid4())
    SeqIO.write(
        [SeqRecord(seq, id='QUERY', name='QUERY', description="QUERY")],
        ufn + '.fasta', 'fasta')
    subprocess.call([
        "hmmscan", "-o", ufn + ".out", "--tblout", ufn + ".tbl", "--domtblout",
        ufn + ".dtbl", hmmdb, ufn + '.fasta'
    ])
    #Now let's read it

    for v in SearchIO.parse(ufn + ".dtbl", "hmmscan3-domtab"):
        for hit in v:
            for h in hit.hsps:
                # print h
                if h.evalue < eval_thresh:
                    features.append(
                        SeqFeature(FeatureLocation(h.query_start, h.query_end),
                                   type="domain",
                                   qualifiers={
                                       'name': h.hit_id,
                                       'evalue': h.evalue
                                   }))

    os.system("rm %s %s %s %s" %
              (ufn + '.fasta', ufn + '.out', ufn + '.tbl', ufn + '.dtbl'))
    return features
def main(argv):
    args = parse_arguments(argv)

    infile = args.infile
    out = args.out_path
    outputs = []
    query_sequences = []
    count = 0

    with open(out, 'w+') as output:
        output.write(
            "%s\t%s\t%s\t%s\n" %
            ("Accession", "family", "query_name", "Resfams_description"))
        for qresult in SearchIO.parse(infile, "hmmer3-tab"):
            for hits in qresult:
                accession = hits.accession
                id = hits.id
                query = hits.query_id
                description = hits.description
                score = hits.bitscore

                array = [accession, id, query, description, str(score)]

                print("\t".join(array))
                output.write("\t".join(array) + "\n")

                if hits.query_id not in query_sequences:
                    query_sequences.append(hits.query_id)
                    count += 1
        print("Unique Seqs: " + str(count))
Example #21
    def get_scores_for_curated_via_hmm(self):
        """
        For every curated variant we want to generate a set of scores against HMMs.
        This is needed to supply the same type of information for curated as well as for automatic seqs.
        """
        # Construct the one big file from all curated seqs.
        with open(self.curated_all_fasta, "w") as f:
            for hist_type, seed in self.get_seeds():
                seed_aln_file = os.path.join(self.seed_directory, hist_type,
                                             seed)
                for s in SeqIO.parse(seed_aln_file, "fasta"):
                    s.seq = s.seq.ungap("-")
                    SeqIO.write(s, f, "fasta")
        # Search it with our HMMs
        self.search(hmms_db=self.combined_hmm_file,
                    out=self.curated_search_results_file,
                    sequences=self.curated_all_fasta)
        # We need to parse this results file;
        # we take here a snippet from load_hmmsearch.py, and tune it to work for our curated seq header format
        for variant_query in SearchIO.parse(self.curated_search_results_file,
                                            "hmmer3-text"):
            self.log.info("Loading hmmsearch for variant: {}".format(
                variant_query.id))
            variant_model = Variant.objects.get(id=variant_query.id)
            for hit in variant_query:
                accession = hit.id.split("|")[1]
                seq = Sequence.objects.get(id=accession)
                # print hit
                try:  # sometimes we get this: [No individual domains that satisfy reporting thresholds (although complete target did)]
                    best_hsp = max(hit, key=lambda hsp: hsp.bitscore)
                    add_score(seq, variant_model, best_hsp,
                              seq.variant == variant_model)
                except Exception:
                    pass
Example #22
def _call_hmmer(hmm, inputproteins):
	inputproteins = list(inputproteins)
	scores = {}
	for ip in inputproteins:
		scores[ip.id] = 0

	with ntf(prefix="/dev/shm/") as inputfasta:
		with ntf(prefix="/dev/shm/") as hmmoutput:
			SeqIO.write(inputproteins, inputfasta.name, 'fasta')
			hmmfile = os.path.join(hmm_location, hmm + '.hmm')
			sp.call(['hmmsearch', '-o', hmmoutput.name, hmmfile, inputfasta.name])
			hmmoutput.flush()
			hmmoutput.seek(0)
			QRS = SearchIO.parse(hmmoutput, format="hmmer3-text")
			for qr in QRS:
				# there's *always* a QR, even though it's usually empty.
				# qr.sort()
				# I'm kind of hoping this sorts by hit strength.
				# worth checking. I guess it doesn't matter anyway.

				for hit in qr:
					scores[hit.id] = max(scores[hit.id], hit.bitscore)
					for hsp in hit.hsps:
						def appropriate_hyphens(m):
							return '-' * len(m.group(0))

						if len(hsp.hit.seq) > 100:
							hitseq = re.sub('PPPPP+', appropriate_hyphens, str(hsp.hit.seq))
							# str.translate(None, '-*') is Python 2 only; delete '-' and '*' via maketrans
							hitseq = hitseq.translate(str.maketrans('', '', '-*')).upper()
							yield hit.id, hsp.bitscore, hitseq
Example #23
    def generate_blast_graph(self):
        evalue_filter = lambda hsp: hsp.evalue < self.evalue
        file_name = "{}/blast_graph.txt".format(self.blast_output_path)
        for blast_file in glob.glob(self.blast_data_path):
            print("working on " + blast_file)
            # Parse the Blast file
            qresults = SearchIO.parse(blast_file, 'blast-tab', comments=True)
            for qresult in qresults:
                write_line = ""
                write_line += qresult.id + ":"
                # Go to the Hit section of query
                for hit in qresult[:]:
                    if not self.blast_graph.has_node(qresult.id):
                        self.blast_graph.add_node(qresult.id)
                    # Check if Hit has min value
                    filtered_hit = hit.filter(evalue_filter)
                    if filtered_hit is not None:
                        if not self.blast_graph.has_node(filtered_hit.id):
                            self.blast_graph.add_node(filtered_hit.id)
                        # Add Edge between graph nodes
                        self.blast_graph.add_edge(qresult.id, filtered_hit.id)
                        write_line += filtered_hit.id + ","
                if write_line != "":
                    with open(file_name, "a") as f_handle:
                        f_handle.write(write_line + '\n')

        # Write GML files
        if self.generate_gml_files:
            file_name = "{}/blast_graph.gml".format(self.blast_output_path)
            with open(file_name, "a") as f_handle:
                nx.write_gml(self.blast_graph, f_handle)
def get_unique_blastp_hits(infile, fasta):
    hits = set()
    for aln in SearchIO.parse(infile, 'blast-xml'):
        for hsp in aln.hsps:
            hits.add(hsp.hit_id)
    seqs = {rec.id: rec for rec in SeqIO.parse(fasta, 'fasta')}
    return [seqs[hit] for hit in hits]
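
A short usage sketch (hypothetical paths): write the unique hit sequences from a blastp XML report to a new FASTA file.

from Bio import SeqIO

records = get_unique_blastp_hits("blastp.xml", "proteins.fasta")
SeqIO.write(records, "unique_hits.fasta", "fasta")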
Example #25
    def results(self):
        """
        Parse the results and yield biopython SearchIO entries.

        Beware:
        Some databases are not unique on the id, and this causes the parser to
        complain about duplicate entries and raise exceptions such as:

            ValueError: The ID or alternative IDs of Hit 'DQ448783' exists
            in this QueryResult.

        Summary of the columns:
        https://www.metagenomics.wiki/tools/blast/blastn-output-format-6

            qseqid sseqid pident length mismatch gapopen qstart qend sstart send
            evalue bitscore

        Warning: Unlike BLAST results, if a sequence got no hits it is NOT
                 reported at all in VSEARCH. The number of entries yielded
                 will not match the number of sequences at input.
        """
        with open(self.out_path, 'rt') as handle:
            for entry in SearchIO.parse(handle, 'blast-tab'):
                yield entry
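
A consumption sketch for the duplicate-ID warning above (the sample object is hypothetical): the ValueError surfaces mid-iteration, so catch it around the loop.

try:
    for qresult in sample.results():
        print(qresult.id, len(qresult.hits))
except ValueError as err:
    print("duplicate hit IDs in the search database:", err)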
Example #26
    def profile_search(database,
                       pssm_file,
                       search_result,
                       cpu=1,
                       evalue=0.00001):
        cmd = f"psiblast -db {database} -in_pssm {pssm_file} -num_threads {cpu} -evalue {evalue}  -outfmt 5 -out {search_result} 1>&2"
        execute(cmd)
        try:
            search_result = list(bpsio.parse(search_result, "blast-xml"))
        except ParseError:
            sys.stderr.write(
                f'PSIProfile: error parsing results from {search_result}')
            return None

        for query in search_result:
            for hit in list(query):
                for hsp in hit:
                    identity = 1.0 * hsp.ident_num / hsp.aln_span
                    data = [
                        hsp.query.id, hsp.query_start, hsp.query_end,
                        hsp.hit.id, hsp.hit_start, hsp.hit_end, hsp.evalue,
                        identity,
                        str(hsp.aln[0].seq),
                        str(hsp.aln[1].seq)
                    ]
                    yield {
                        f: data[i]
                        for i, f in enumerate(PsiProfile.search_result_fields)
                    }
Example #27
def blastSearch(query, speciesList, filename, blastDict):
    '''Run BLAST, save results of a search to a file and return its contents
    :param query: String with accession numbers divided by paragraphs
    :param speciesList: String with all species against which BLAST is performed
    :param filename: Name of original fasta file for saving results of BLAST
    :param blastDict: Dictionary of BLAST results, filled in and returned
    '''

    xmlPath = rootFolder \
        + '/Blast_XML/' \
        + os.path.splitext(filename)[0] \
        + '.xml'

    query = createInputForBlast('.q', query, filename)
    taxidList = createInputForBlast('.t', speciesList, filename)

    blastNotVoid = bashBlast(query=query, out=xmlPath, taxidList=taxidList)

    if blastNotVoid:
        blast = SearchIO.parse(xmlPath, 'blast-xml')
        writeInBlastDict(blast, blastDict)

    os.remove(query)
    os.remove(taxidList)
    os.remove(xmlPath)

    return blastDict
Example #28
def blast_partition(path, partition_dict):
    partitions = defaultdict(lambda: [], {"no_hit": [], "else": []})
    for query in bpsio.parse(path, "blast-xml"):
        hits = list(query)
        if not hits:
            partitions["no_hit"].append(query.id)
        for hit in hits:
            for hsp in hit:
                hsp.identity = identity(hsp)
                hsp.coverage = coverage(query, hsp)
                hsp.hit_coverage = hit_coverage(hit, hsp)

                added = False
                for k, fn_filter in partition_dict.items():
                    if fn_filter(query, hit, hsp):
                        partitions[k].append((
                            query,
                            hit,
                            hsp,
                        ))
                        added = True
                        break
                if not added:
                    partitions["else"].append((
                        query,
                        hit,
                        hsp,
                    ))

    return partitions
def parse_n_fill_run_data_searchio(run_path, run_data, querydb):
    run_id = get_run_id(run_path)
    run_format = get_run_format(run_path)
    for query in SearchIO.parse(run_path, run_format):
        for hit in query.hits:
            for hsp in hit.hsps:
                exons = [x.hit_range for x in hsp.fragments]
                coverage = 'N/A'

                if querydb is not None:
                    total_matched = sum(x.query_span for x in hsp.fragments)
                    coverage = '{:.2f}%'.format(100 * total_matched / len(querydb[query.id]))

                if hasattr(hsp, 'score'):
                    score = hsp.score
                elif hasattr(hsp, 'bitscore'):
                    score = hsp.bitscore
                else:
                    score = 'N/A'

                if hasattr(hsp, 'ident_num') and hasattr(query, 'seq_len'):
                    matched = '{:.2f}%'.format(100 * hsp.ident_num / query.seq_len)
                else:
                    matched = 'N/A'

                alignment = AlignmentData(run_id, score, matched, coverage, hsp.hit_range, exons)
                run_data[query.id][hit.id].append(alignment)
def run_hmmpfam2(query_hmmfile: str, target_sequence: str, extra_args: List[str] = None
                 ) -> List[SearchIO._model.query.QueryResult]:  # pylint: disable=protected-access
    """ Run hmmpfam2 over the provided HMM file and fasta input

        Arguments:
            query_hmmfile: the HMM file to use
            target_sequence: a string in fasta format of the sequence to run

        Returns:
            a list of results as parsed by SearchIO
    """
    config = get_config()
    command = ["hmmpfam2"]

    # Allow disabling multithreading for HMMer2 calls via the command line  # TODO: fix options for this
    if config.get('hmmer2') and 'multithreading' in config.hmmer2 and \
            config.hmmer2.multithreading:
        command.extend(["--cpu", str(config.cpus)])
    if extra_args:
        command.extend(extra_args)
    command.extend([query_hmmfile, '-'])

    result = execute(command, stdin=target_sequence)
    if not result.successful():
        logging.debug('hmmpfam2 returned %d: %r while searching %r', result.return_code,
                      result.stderr, query_hmmfile)
        raise RuntimeError("hmmpfam2 problem while running %s: %s" % (command, result.stderr))
    res_stream = StringIO(result.stdout)
    return list(SearchIO.parse(res_stream, 'hmmer2-text'))
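
A usage sketch under this module's assumptions (its execute helper and config); the HMM file name and query sequence are hypothetical.

fasta = ">query\nMAGWNSTAPTLLILLAHCTA"
for qresult in run_hmmpfam2("pfam_subset.hmm", fasta):
    for hit in qresult.hits:
        print(hit.id, hit.evalue)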
def run_blastp(target_blastp_database: str, query_sequence: str,
               opts: List[str] = None, results_file: str = None
               ) -> List[SearchIO._model.query.QueryResult]:
    """ Runs blastp over a single sequence against a database and returns the
        results as parsed by Bio.SearchIO.

        Arguments:
            target_blastp_database: the blastp database to compare to
            query_sequence: the sequence being compared
            opts: a list of extra arguments to pass to blastp, or None
            results_file: a path to keep a copy of blastp results in, if provided

        Returns:
            a list of QueryResults as parsed from blast output by SearchIO
    """
    if not query_sequence:
        raise ValueError("Cannot run blastp on empty sequence")

    config = get_config()
    command = ["blastp", "-num_threads", str(config.cpus), "-db", target_blastp_database]

    if opts is not None:
        command.extend(opts)

    result = execute(command, stdin=query_sequence)
    if not result.successful():
        raise RuntimeError('blastp returned %d: %r while scanning %r' % (
                           result.return_code, result.stderr.replace("\n", ""),
                           query_sequence[:100]))

    if results_file is not None:
        with open(results_file, 'w') as fh:
            fh.write(result.stdout)

    return list(SearchIO.parse(StringIO(result.stdout), 'blast-text'))
Example #32
def multiPFAMsearch(inputList, cpus, tmpdir, output):
    # run hmmerscan multithreaded by running at same time
    # input is a list of files, run multiprocessing on them
    pfam_results = os.path.join(os.path.dirname(tmpdir), 'pfam.txt')
    pfam_filtered = os.path.join(os.path.dirname(tmpdir), 'pfam.filtered.txt')
    lib.runMultiNoProgress(safe_run, inputList, cpus)

    # now grab results and combine, kind of tricky as there are header and footers for each
    resultList = [os.path.join(tmpdir, f) for f in os.listdir(
        tmpdir) if os.path.isfile(os.path.join(tmpdir, f)) and f.endswith('.pfam.txt')]
    combineHmmerOutputs(resultList, pfam_results)

    # now parse results
    with open(output, 'w') as out:
        with open(pfam_filtered, 'w') as filtered:
            with open(pfam_results, 'r') as results:
                for qresult in SearchIO.parse(results, "hmmsearch3-domtab"):
                    hits = qresult.hits
                    num_hits = len(hits)
                    if num_hits > 0:
                        for i in range(0, num_hits):
                            hit_evalue = hits[i].evalue
                            query = hits[i].id
                            pfam = qresult.accession.split('.')[0]
                            hmmLen = qresult.seq_len
                            hmm_aln = int(hits[i].hsps[0].hit_end) - \
                                int(hits[i].hsps[0].hit_start)
                            coverage = hmm_aln / float(hmmLen)
                            if coverage < 0.50:  # coverage needs to be at least 50%
                                continue
                            filtered.write("%s\t%s\t%s\t%f\n" %
                                           (query, pfam, hit_evalue, coverage))
                            out.write("%s\tdb_xref\tPFAM:%s\n" % (query, pfam))
Example #33
def get_adenylation_domains(fasta, known=None, lagging_strand=False):
    adenylation_domains = []

    fasta_seqs = []
    for fs in SeqIO.parse(fasta, 'fasta'):
        revcom=False
        seq = str(fs.seq)
        pepseq, rf = get_pepseq(seq)
        if rf < 0 == lagging_strand:
            revcom=True
            seq = utils.reverse_complement(seq)
        fasta_seqs.append({'id': fs.id, 'seq': seq, 'pepseq': pepseq, 'rf': rf, 'revcom': revcom})
    for fs in fasta_seqs:
        # use this record's own peptide sequence, not the leftover from the loop above
        utils.run_cmd([hmmsearch, '--domtblout', 'dump', os.path.abspath('lib/AMP-binding.hmm'), '-'],
                      '>header\n' + fs['pepseq'])
        with open('dump') as f:
            out = f.read()
        res_stream = StringIO(out)
        os.remove('dump')
        results = list(SearchIO.parse(res_stream, 'hmmsearch3-domtab'))

        for result in results:
            for i, hsp in enumerate(result.hsps, 1):
                s = hsp.hit_start
                e = hsp.hit_end

                adenylation_domains.append((AdenylationDomain(fs['seq'][s*3:e*3], known, '{}_{}'.format(fs['id'], i), fs['revcom']), s, e))

    return adenylation_domains
def important_pfam(seqs_from_pdb_hmm):
    for query in tqdm(bpsio.parse(seqs_from_pdb_hmm, 'hmmer3-text')):
        try:
            pdb, chain, start, end = query.id.split("_")  # @UnusedVariable
            if ExperimentalStructure.objects(name=pdb,residue_sets__name="important_pfam").count():
                continue

            strdoc = ExperimentalStructure.objects(name=pdb).get()

            if not strdoc.residue_set("important_pfam"):
                important_rs = ResidueSet(name="important_pfam")
                domain_rs = None
                for hit in query:
                    if len(hit):
                        hsp = hit[0]
                        domain_rs = ResidueSet(name=hit.id)
                        i = 0
                        for x in str(hsp.aln[1].seq):
                            residue = chain + "_" + str(i + int(start))
                            if x == x.upper():
                                important_rs.residues.append(residue)
                            i = i + 1
                            domain_rs.residues.append(residue)
                        if domain_rs:
                            strdoc.residue_sets.append(domain_rs)
                strdoc.residue_sets.append(important_rs)
                strdoc.save()
        except DoesNotExist:
            pass
Example #35
def parse_results(path, file_name, FA_FILES_PATH, top_k=3, add_to_db=False):
    """Parses the result of a BLAST query

    Returns the top k matches and adds them to the database.
    """

    print(f"Parsing {file_name} at {path}")
    i = 0
    results = list()
    for bresults in SearchIO.parse(path, 'blast-xml'):
        for r in bresults:
            i += 1
            # Select only top k
            if i <= top_k:
                results.append({
                    "rank": i,
                    "id": r.id,
                    "query_id": r.query_id,
                    "full_name": r.description_all,
                    "bitscore": r.hsps[0].bitscore,
                    "evalue": r.hsps[0].bitscore,
                    "query_range": r.hsps[0].query_range,
                    "hit_range": r.hsps[0].hit_range,
                })
            elif i > top_k and add_to_db is True:
                print("Top 3 results saved to database")
                add_to_database(results=results, FA_FILES_PATH=FA_FILES_PATH)
                break
            else:
                return (results)
Example #36
def update_proteins(annotation_dir,
                    proteome,
                    seq_col_name,
                    tax_id,
                    identity=0.9,
                    cpus=multiprocessing.cpu_count(),
                    db_init=None):

    # if db_init:
    #     from SNDG.Sequence.ProteinAnnotator import PABase
    #     PABase.sqldb.initialize(db_init)
    # mkdir(annotation_dir)
    # out = annotation_dir + "/species_blast.tbl"
    #
    # tax = Tax.select().where(Tax.ncbi_taxon_id == tax_id).get()
    # species_tax = None
    # for tax in Tax.parents(tax):
    #     if tax.node_rank == "genus":
    #         species_tax = tax
    #         break
    # tax_data = "/data/xomeq/tax/"
    # species_fasta = tax_data + str(int(species_tax.ncbi_taxon_id)) + ".fasta"

    if not os.path.exists(out):

        if not os.path.exists(species_fasta):
            Uniprot.download_proteome_from_tax(str(species_tax.ncbi_taxon_id),
                                               tax_data)

        cmd = "blastp -query %s  -db %s -evalue 0.00001 -outfmt 6  -max_hsps 1 -qcov_hsp_perc 0.9 -num_threads %i -out %s"
        execute(cmd % (proteome, species_fasta, cpus, out))
    species_desc = {
        x.id.split("|")[1]: " ".join(x.description.split()[1:])
        for x in bpio.parse(species_fasta, "fasta")
    }

    total = Protein.objects(organism=seq_col_name).count()
    with tqdm(bpsio.parse(out, "blast-tab"), total=total) as pbar:
        for query in pbar:
            pbar.set_description(query.id)
            if query[0][0].ident_pct > identity:

                unip = query[0].id.split(
                    "|")[1] if "|" in query[0].id else query[0].id
                dbxrefs = [
                    x.db + "||" + x.value
                    for x in Mapping.select().where(Mapping.uniprot == unip)
                ]
                p = Protein.objects(gene=query.id,
                                    organism=seq_col_name).no_cache().get()

                if not p.description and unip in species_desc:
                    p.description = species_desc[unip].split(
                        "OS=")[0] + " | homology with: " + unip
                    p.save()

                if dbxrefs:
                    p = SearchLoader.update_protein_with_dbxref(
                        query.id, dbxrefs, seq_col_name)
                    p.save()
Example #37
def exonerate_parser(exonerate_file):
    """
    parser the exonerate result, and return the position of the feather in 4-col bed format
    4 col bed4: [chro, start,end, name], example ["seq1", 1, 55, "trnP"]
    :param query:
    :param exonerate_file:
    :param prefix:
    :return: list of bed4
    """
    #fw=open(tbl_outname, "w") # change IO to list store
    bed4=[]

    texts=SearchIO.parse(StringIO(exonerate_file), format="exonerate-text")
    for record in texts:
        for hsp in record:
            for s in hsp:
                # the biopython.SearchIO interval is 0 based [start, end), so start+1, end+0 to get 1 based coords
                table_4=[s.fragment.query_id, s.fragment.query_start+1, s.fragment.query_end,s.fragment.hit_id]
                bed4.append(table_4)

                #fw.write("\t".join(table_4))
                #fw.write("\n")
    bed4.sort()
    #fw.close()
    return bed4
    def run_hmmsearch(self, name, hmm):
        """
        Run hmmsearch and return the highest scoring hit
        """
        out = tempfile.NamedTemporaryFile("w")
        cmd = [
            "hmmsearch",
            "--noali",
            "-o",
            out.name,
            os.path.join(self.cov_dir, hmm + "-nt.hmm"),
            os.path.join(self.cov_dir, name + ".fa"),
        ]
        if self.verbose:
            print("Command: {0}".format(cmd))
        try:
            subprocess.run(cmd, check=True)
        except subprocess.CalledProcessError as exception:
            print("Error: {}".format(exception))
            sys.exit("Error running hmmsearch using {}".format(hmm))
        bestscore = 0
        besthit = None
        # Get HSP with highest score
        for qresult in SearchIO.parse(out.name, "hmmer3-text"):
            for hit in qresult:
                for hsp in hit:
                    if hsp.bitscore > bestscore:
                        besthit = hsp
                        bestscore = hsp.bitscore
        return besthit
Example #39
def process_blast_output(file, simple, argparser):
    qresults = SearchIO.parse(file, 'blast-xml')
    if simple:
        for qresult in qresults:
            for hit in qresult:
                for hsp in hit:
                    if ((hsp.aln_span == argparser.cont and (hsp.gap_num == 0) and
                             (hsp.aln_span == hsp.ident_num)) or (hsp.aln_span > argparser.cont)):
                        yield ([str(hsp), "\n\n"], None, hsp.aln_span)

                for hsp in hit:
                    if (hsp.aln_span >= argparser.cont and (hsp.gap_num == 0) and
                            (hsp.aln_span == hsp.ident_num)):
                        yield (None, [str(hsp), "\n\n"], hsp.aln_span)
    else:
        for qresult in qresults:
            for hit in qresult:
                for hsp in hit:
                    for v, c, p in encode(simstr(hsp.aln)):
                        if v == "1" and c >= argparser.cont:
                            yield (format_alignment(hsp, p, c), None, c)
                for hsp in hit:
                    for t0, t1, t2 in thrids(encode(simstr(hsp.aln))):
                        if t0[0] == "1":
                            assert (t0[2] < t1[2] < t2[2])
                            assert t2[0] == "1"
                            assert t1[0] == "0"
                            if t0[1] >= argparser.leftmin and t2[1] >= argparser.rightmin and \
                               (t0[1] + t2[1]) >= argparser.summin and \
                                t1[1] <= argparser.gapmax:
                                if not (argparser.S and
                                        (t0[1] >= argparser.cont or t2[1] >= argparser.cont)):
                                    yield (None, format_alignment(hsp, t0[2], t0[1], t2[2], t2[1]),
                                           t0[1]+t2[1]-t1[1])
def parse_hmmscan_tab(infile, print_header=True):
    '''Parse hmmscan output in --tblout format'''
    if print_header:
        yield "query","top hit","evalue","certainty","num sig hits"
    records = SearchIO.parse(infile,'hmmer3-tab')
    for rec in records:
        query = rec.id
        if len(rec) > 1:
            hit1,hit2 = rec.hits[0],rec.hits[1]
            eval1,eval2 = hit1.evalue,hit2.evalue
            if eval1 != 0: # convert to -ln evalue
                eval1 = -np.log(eval1)
            if eval2 != 0:
                eval2 = -np.log(eval2)
            if eval1 == 0 and eval2 != 0: # this may be a hack, I don't care
                certainty = 1
            elif eval1 == 0 and eval2 == 0:
                certainty = 0
            else: # calculate certainty with info theoretic calc.
                total = eval1 + eval2
                p1,p2 = eval1/total, eval2/total
                certainty = 1 + (p1 * np.log2(p1)) + (p2 * np.log2(p2))
        else:
            certainty = 1
        yield query, rec.hits[0].id, rec.hits[0].evalue, certainty, len(rec)
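
A short consumption sketch (hypothetical --tblout path): since the generator yields a header row first, its output can be streamed straight into a TSV writer.

import csv
import sys

writer = csv.writer(sys.stdout, delimiter="\t")
for row in parse_hmmscan_tab("hmmscan_tblout.txt"):
    writer.writerow(row)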
Example #41
def runHmmer(args, list_path, file_path, f):
    """run prodigal and hmmsearch on chr files"""
    if not os.path.exists(str(args.data) + '/tmp'):
        os.makedirs(str(args.data) + '/tmp')
    # get the sample group
    head, group = os.path.split(os.path.split(file_path)[0])
    basename = os.path.splitext(str(ntpath.basename(str(file_path))))[0]
    exportpath = str(args.data) + '/tmp/' + ntpath.basename(str(file_path))
    hmmpath = str(args.data) + '/tmp/' + ntpath.basename(
        str(file_path)) + '.out'
    print('Processing %s of group %s' % (basename, group))
    s = ""
    cmd = ("prodigal -p meta -i ", str(file_path), " -a ", exportpath,
           ' -d /dev/null > /dev/null 2> /dev/null')
    os.system(s.join(cmd))
    # run hmmsearch on faa ORF files
    s = " "
    cmd = ("hmmsearch -E 0.001 --domtblout", hmmpath, 'resources/remove.hmm',
           exportpath, '> /dev/null 2> /dev/null')
    os.system(s.join(cmd))
    # write it to output file if there is a hit
    with open(hmmpath) as input:
        try:
            for qresult in SearchIO.parse(input, 'hmmscan3-domtab'):
                query_id = qresult.id
                hits = qresult.hits
                num_hits = len(hits)
                acc = qresult.accession
                if num_hits > 0:
                    f.write(''.join((basename, '\t', str(file_path), '\n')))
        except ValueError:
            print('parsing error on %s' % basename)
Example #42
def get_hit_seq(fastafile, filename):
	yamlfile = yaml_load_file(fastafile)
	blout = SearchIO.parse(filename, 'blast-text')
	for query in blout:
		seqid = query.id.split("\n")[0]
		#print(seqid)
		fh = open("multi_" + seqid + ".fasta", 'a')
		yamlfile[seqid]['hits'] = {}
		for hit in query.hits:
			gi = re.match(r"gi\|(.*)\|ref", hit.id).group(1)
			yamlfile[seqid]['hits'][gi] = {}
			#print(yamlfile[seqid]['hits'])
			for hsp in hit.hsps:
				#print(hsp.hit)
				#print(hsp.hit_strand)
				#print(hsp.hit_start)
				#print(hsp.hit_end)
				hitstart = hsp.hit_start + 1 - HIT_SEQUENCE_BPS
				hitstart = 1 if hitstart < 1 else hitstart  # blastdbcmd ranges are 1-based
				hitend = hsp.hit_end + 1 + HIT_SEQUENCE_BPS
				hitstrand = "plus" if (hsp.hit_strand == 1) else "minus"
				#print(hsp.hit_end)
				out = os.popen(BLAST_BINARY + "/blastdbcmd -db " + BLAST_DATABASE + " -dbtype nucl -entry " + str(gi) + " -range " + str(hitstart) + "-" + str(hitend) + " -strand " + str(hitstrand)).read()
				fh.write(out)
				#print(hsp.hit.seq)
				#print(hsp.query.seq)
				#print("-----")
		fh.close()
		#break
	yaml_dump_file(fastafile, yamlfile)
Example #43
def first_exonerate_parse(dir, newdir, prefix):
	cwd = os.getcwd()
	if not os.path.exists(cwd + newdir):
		os.makedirs(cwd + newdir)
	if not os.path.exists(cwd + '/merged_exons/'):
		os.makedirs(cwd + '/merged_exons/')
	for file in slistdir(cwd + dir):
		if 'DS_Store' not in file:
			result = SearchIO.parse(cwd + dir + file, 'exonerate-text')
			for h in result:
				for hh in h:
					for hhh in hh:
						hitcounter = 1
						for hhhh in hhh:
							hitseq =  hhhh.query
							rootname = file.split('.fasta')
							orthoname = file.split("_")
							orthosubdir = cwd + newdir + '/' + orthoname[0]
							if not os.path.exists(orthosubdir):
								os.makedirs(orthosubdir)
							newseqstr = str(hitseq.seq.ungap("-"))
							newid = prefix + str(hitcounter) + '_' + rootname[0]
							record = SeqRecord(Seq(newseqstr, generic_dna), id =  newid, description = '')
							fastaname = prefix + str(hitcounter) + '_' + rootname[0] + '.fasta'
							SeqIO.write(record, orthosubdir + '/' + fastaname, "fasta")
							hitcounter += 1
    def retrieve_blast_data(self):
        for blast_file in glob.glob(self.blast_data_path):
            print(blast_file)
            print(self.network_data)
            qresults = SearchIO.parse(blast_file, 'blast-tab', comments=True)
            for qresult in qresults:
                if qresult.id in self.network_data:
                    print(qresult.id)
Example #46
def parse(target):
    blast_result = list(SearchIO.parse('BlastResult.xml', 'blast-xml'))
    for record in blast_result:
        if len(record) == 0:
            continue
        else:
            tophit = record[0]
        target.append([tophit[0][0].query, tophit[0][0].hit])
    def generate_blast_data(self):
        self.initialize_variables()
        for blast_file in glob.glob(self.blast_data_path):
            # Parse each Blast file
            query_results = SearchIO.parse(blast_file, 'blast-tab', comments=True)
            filtered_query_results = self.apply_filtering(query_results)
            # Parse each blast record
            for query_result in filtered_query_results:
                print(query_result.id)
                self.generate_blast_graph(query_result)
def runBlastParserTAB(cline, blast_out_file, comments=False):
	startTime = datetime.now()
	os.system(str(cline))
	print 'Running BLAST:' + str(datetime.now() - startTime)
	startTime = datetime.now()
	blast_records = SearchIO.parse(blast_out_file, 'blast-tab', comments=comments)

	print 'Parsing Results:' + str(datetime.now() - startTime)

	return blast_records
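Hypothetical usage of the helper above, assuming Bio.Blast.Applications is available and a nucleotide database named 'genome_db' already exists; outfmt=6 produces the comment-free tabular output the default comments=False expects:

from Bio.Blast.Applications import NcbiblastnCommandline

cline = NcbiblastnCommandline(query='genes.fasta', db='genome_db',
                              outfmt=6, out='blast_out.tab')
blast_records = runBlastParserTAB(cline, 'blast_out.tab')
for qresult in blast_records:
    print('%s\t%d' % (qresult.id, len(qresult.hits)))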
def process_tot():
	print "beginning process_tot"
	#level name
	level = sys.argv[3] + " "
	#get the name of the proteome from the file name
	omeid = sys.argv[1].replace(".fasta", "")
	omeid = omeid.split('/')
	omeid = omeid[len(omeid)-1]
	#read in results; SearchIO.parse returns a generator, so drain it into a
	#list so that len() works and the results can be iterated below
	results = list(SearchIO.parse(sys.argv[2], "hmmer3-text"))

	#build up a list of entries per protein and write them out below
	count = 0

	scans = len(results)
	cutoff = 1.0 / scans
	print scans
	print cutoff

	for protein in results:
		processed = []
		if count % 100 == 0:
			print count
		count = count + 1
		pid = protein.id + " "
		if len(protein) == 0:  #if a protein has no hits, groupid=proteinid and rank=0
			rank = "0 "
			OGid = protein.id + " "
			e = "n/a "
			qr = "n/a "
			processed.append((rank, level, pid, OGid, e, qr, omeid))
		elif protein[0].evalue > cutoff:  #hits that do not meet the threshold are treated as no hits
			rank = "0 "
			OGid = protein.id + " "
			e = "n/a "
			qr = "n/a "
			processed.append((rank, level, pid, OGid, e, qr, omeid))
		else:
			i = 0
			while i < len(protein) and protein[i].evalue <= cutoff:
				rank = str(i+1) + " "
				OGid = protein[i].id.split('.')
				OGid = OGid[0] + "." + OGid[1] + " "
				e = str(protein[i].evalue) + " "
				qr = []  #empty list for domain ranges of this hit
				for d in protein[i]:
					qr.append(d.query_range)
				processed.append((rank, level, pid, OGid, e, str(qr).replace(" ", ""), " ", omeid))
				i += 1
		#write this protein's entries to file
		#('output' is assumed to be a module-level file handle opened elsewhere)
		for i in processed:
			output.write("".join(str(s) for s in i) + "\n")

	output.close()
Example #50
def validate(candidate_file, input_file, n_seqs, min_len, min_coverage,
             max_mismatch):
    # remove gap in old alignment file
    no_gap = 'validate.fasta'
    with open(no_gap, 'w') as new, open(input_file, 'r') as old:
        for line in old:
            if line.startswith('>'):
                new.write(line)
            else:
                new.write(line.replace('-', ''))

    # build blast db
    candidate_fasta = 'primer_candidate.fasta'
    SeqIO.convert(candidate_file, 'fastq', candidate_fasta, 'fasta')
    run('makeblastdb -in {} -dbtype nucl'.format(no_gap), shell=True)
    # blast
    blast_result_file = 'BlastResult.xml'
    cmd = nb(num_threads=cpu_count(),
             query=candidate_fasta,
             db=no_gap,
             task='blastn',
             evalue=1e-5,
             max_hsps=1,
             max_target_seqs=n_seqs,
             outfmt=5,
             out=blast_result_file)
    stdout, stderr = cmd()
    # parse
    min_bitscore_raw = min_len - max_mismatch
    blast_result = [['ID', 'Hits', 'Sum_Bitscore_raw', 'Mean_start'], ]
    blast_result.append(['All', n_seqs, min_len])
    for query in SearchIO.parse(blast_result_file, 'blast-xml'):
        if len(query) == 0:
            blast_result.append([query.id, 0, 0])
            continue
        sum_bitscore_raw = 0
        good_hits = 0
        start = 0
        for hit in query:
            hsp_bitscore_raw = hit[0].bitscore_raw
            if hsp_bitscore_raw >= min_bitscore_raw:
                sum_bitscore_raw += hsp_bitscore_raw
                good_hits += 1
                start += sum(hit[0].hit_range) / 2
        blast_result.append([query.id, good_hits/n_seqs, sum_bitscore_raw,
                             start/n_seqs])
    # validate
    # validate_result = [['ID', 'Hits', 'Sum_Bitscore_raw', 'Seq'], ]
    validate_result = list()
    for record in blast_result[2:]:
        if record[1] >= min_coverage:
            validate_result.append(record)
    validate_result.sort(key=lambda x: x[1], reverse=True)
    return validate_result
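A hypothetical call to validate, with made-up file names and thresholds, assuming 100 sequences in the source alignment:

good_primers = validate('primers.fastq', 'alignment.fasta', n_seqs=100,
                        min_len=20, min_coverage=0.6, max_mismatch=2)
for row in good_primers:
    print(row)  # [id, hit ratio, summed raw bitscore, mean hit midpoint]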
def start_queryResult_generator(inFile, fDic, work_sheet):
    """ invoking the parse function to return a 'generator' that can allow you 
        to step though the record one QueryResult Object at a time but invoking
        nextQuery = (next)generator on it.This approach can allow you to save 
        on memory. I have found with my current task casting this generator with
        (list) works fine but it is really not called for in this current 
        task of parsing and sorting the records.
    """
    """ http://biopython.org/DIST/docs/api/Bio.SearchIO.BlastIO-module.html"""
    qGenerator = SearchIO.parse(inFile, 'blast-xml')
    max_hits = 0
    query_count = 1
    # Step through all the records in the lump xml data file and write out
    # each separate hit to file. Also write the summary information to the
    # work sheet.
    for query_result in qGenerator:
        print('Processing Query BLAST return ' + str(query_count))
        number_hits = int(len(query_result.hits))
        # Extend header out right if new MAXHITS
        if number_hits > max_hits:
            max_hits = number_hits       
        if number_hits == 0:
            # Construct path plus file name for no hit query
            filename = str(fDic['topDir'] + fDic['noHit'] + 'Query_' 
                           + str(query_count) + '_H_none.xml')
            # Write out any queries that had no hits to a no-hit subfolder
            SearchIO.write(query_result, filename, 'blast-xml')
            write_qr_to_ws(query_count, query_result, work_sheet)
        else:
            # Now set up a counter of 'hits' in the QueryResult so hits
            # can be sliced away into their own records cleanly.
            hit_count = 0
            for hit in query_result.hits:
                total_hsps = len(hit.hsps)
                lowest_eval = hit.hsps[0].evalue
                best_hsp = hit.hsps[0]
                for hsp in hit.hsps:
                    if hsp.evalue < lowest_eval:
                        lowest_eval = hsp.evalue
                        best_hsp = hsp
                filename = str(fDic['topDir'] + outputFileName(query_count, hit, best_hsp))
                SearchIO.write(query_result[hit_count:(hit_count + 1)], filename, 'blast-xml')
                hit_count += 1
            # Write out query_result to worksheet           
            write_qr_to_ws(query_count, query_result, work_sheet)
        query_count += 1
        # break is debugging code
        # if query_count == 20:
        #   break
    build_ws_header(work_sheet, max_hits)
    return qGenerator  # note: the generator has been exhausted by the loop above
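A minimal sketch of the generator behaviour the docstring above describes ('results.xml' is a hypothetical BLAST XML file):

from Bio import SearchIO

qgen = SearchIO.parse('results.xml', 'blast-xml')   # lazy: nothing is parsed yet
first = next(qgen)                                  # step one QueryResult at a time
print('%s has %d hits' % (first.id, len(first.hits)))
rest = list(qgen)                                   # or drain the remainder into memory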
Example #52
def FilterFASTAByPSL(fasta_fnames, psl_fnames, output_dir,
					 output_filename_postfix=None):
	"""Produces a FASTA file with only those sequences in the PSL.

	Iterates over pairs of FASTA and PSL files and produces a new
	FASTA file containing only those sequences for which there was a 
	record in the PSL file.

	Args:
		fasta_fnames: the FASTA filenames.
		psl_fnames: the PSL (BLAT output) filenames in the same order.
	
	Returns:
		A list of paths to filtered FASTA files.
	"""
	filtered_fnames = []
	for fasta_fname, psl_fname in zip(fasta_fnames, psl_fnames):
		filtered_fname = filename_util.MakeFASTAFilename(
			fasta_fname, dest_dir=output_dir,
			postfix=output_filename_postfix)
		filtered_fnames.append(filtered_fname)
		if path.exists(filtered_fname):
			print 'Skipping filtering of %s as output exists' % psl_fname
			continue

		# Get the IDs of all the matching sequences.
		parsed = SearchIO.parse(psl_fname, 'blat-psl')
		ids_with_hits = set()
		for record in parsed:
			for hsp in record.hsps:
				ids_with_hits.add(hsp.query_id)

		parsed = SeqIO.parse(fasta_fname, 'fasta')
		retained = []
		n_seqs = 0
		for record in parsed:
			n_seqs += 1
			if record.id in ids_with_hits:
				retained.append(record)

		assert len(ids_with_hits) == len(retained), 'Some sequences missing!'
		pct_retained = 100 * float(len(retained)) / float(n_seqs)

		print '\tRetained %d of %d records (%.2f%%)' % (len(retained),
													   n_seqs, pct_retained)
		print '\tWriting output to', filtered_fname
		SeqIO.write(retained, filtered_fname, 'fasta')

		# Force delete these lists since they might be very big.
		del retained
		del ids_with_hits
	return filtered_fnames
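Hypothetical usage of FilterFASTAByPSL, assuming paired FASTA and PSL files from the same BLAT run (all file names made up):

filtered = FilterFASTAByPSL(['reads_a.fasta', 'reads_b.fasta'],
                            ['reads_a.psl', 'reads_b.psl'],
                            output_dir='filtered/',
                            output_filename_postfix='with_hits')
print(filtered)  # paths to the filtered FASTA files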
Example #53
def get_model_scores(model_output):
    """Get the bit score for each hit/domain in a hmmersearch result

    Parameters:
    -----------
    model_output : str or File-like object
        Path to hmmersearch output file

    Return:
    -------
    A list of all bitscores
    """
    return [hsp.bitscore for query in SearchIO.parse(model_output, "hmmer3-text") \
        for hit in query for hsp in hit] 
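Hypothetical usage, e.g. to eyeball the score distribution of one run ('model_vs_proteome.txt' is a made-up path to hmmsearch text output):

scores = get_model_scores('model_vs_proteome.txt')
print('%d domains, bitscores %.1f to %.1f' % (len(scores), min(scores), max(scores)))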
Example #54
def main():
    start = timer()
    arg = argparse.ArgumentParser()
    arg.add_argument('input', help='input BLAST result (xml format)')
    arg.add_argument('-s', '--simple', action='store_true',
                     help='only handle first hsp')
    arg.add_argument('-ss', '--very_simple', action='store_true',
                     help='only handle first hit')
    arg = arg.parse_args()

    def safe(old):
        return re.sub(r'\W', '_', old)

    xml = SearchIO.parse(arg.input, 'blast-xml')
    handle_tsv = open('{}.tsv'.format(arg.input), 'w')
    handle_tsv.write('Query\tbitscore\tSpecies name\thit\n')
    for query in xml:
        if query.description != '':
            query.id = ''.join([query.id, query.description])
        if len(query) == 0:
            with open(arg.input+'_not_found.log', 'a') as not_found:
                not_found.write('{} not found!\n'.format(query.id))
            continue
        handle = open('{}.fasta'.format(safe(query.id)), 'w')
        SeqIO.write(query[0][0].query, handle, 'fasta')
        for hit in query:
            for hsp in hit:
                species_name = hsp.hit.description.split(' ')
                if species_name[0].isupper():
                    species_name = '{}_{}_{}'.format(
                        *species_name[1:3], species_name[0].replace(':', ''))
                else:
                    species_name = '_'.join(species_name[:2])
                info = '{}\t{}\t{}\t{}{}\n'.format(
                    query.id,
                    hsp.bitscore,
                    species_name,
                    hsp.hit.id, hsp.hit.description)
                handle_tsv.write(info)
                hsp.hit.id = '{}|{}'.format(hsp.bitscore, hsp.hit.id)
                SeqIO.write(hsp.hit, handle, 'fasta')
                if arg.simple or arg.very_simple:
                    break
            if arg.very_simple:
                break
        handle.close()
    handle_tsv.close()
    end = timer()
    print('Cost {:.3f} seconds.'.format(end-start))
def process_small():
	#get the name of the proteome from the file name
	pid = sys.argv[1].replace(".fasta", "")
	pid = pid.split('/')
	pid = pid[len(pid)-1] 
	#read in results
	results = SearchIO.parse(sys.argv[2], "hmmer3-text")

	#build up list of entries	
	processed = [] #initialize list to add entries to
	for protein in results:
		if len(protein) == 0 or protein[0].evalue > 0.01: #if a query has no significant hits groupid=proteinid and rank=0
			OGid = protein.id
			es = "n/a"
			rank = "0"
		elif len(protein) == 1: #if a query has 1 hit it is recorded with rank=1
			OG = protein[0]
			OGid = OG.id.replace(".meta_raw", "")
			es = str(OG.evalue)
			rank = "1"
		else:	#if a query has more hits the top hit is recorded with rank=1
			OG = protein[0]
			OGid = OG.id.replace(".meta_raw", "")
			e = OG.evalue
			es = str(e)
			OG2 = protein[1]
			e2 = OG2.evalue
			rank = "1"
			#if the second hit's evalue is within 10 orders of magnitude of
			#the top hit's (log10(e/e2) >= -10), it is also recorded with rank=2
			if e2==0:
				OG2id = OG2.id.replace(".meta_raw", "")
				processed.append(("2 ", sys.argv[3]+" ", protein.id+" ", OG2id+" ", str(e2)+" ", pid))
			elif e!=0 and math.log10(e/e2) >= -10: 
				OG2id = OG2.id.replace(".meta_raw", "")
				processed.append(("2 ", sys.argv[3]+" ", protein.id+" ", OG2id+" ", str(e2)+" ", pid))
		processed.append((rank+" ", sys.argv[3]+" ", protein.id+" ", OGid+" ", es+" ", pid))	

	#write entries to file
	if os.path.isfile(sys.argv[4]): #if an existing file was provided append entries to that file
		output = open(sys.argv[4], "a")
	else: #else make a new file and add a header
		output = open(sys.argv[4], "w")
		output.write("Rank Level ProteinID GroupID evalue ProteomeID \n")
	for i in processed:
		output.write("".join(str(s) for s in i) + "\n")
	output.close()	
Example #56
def parse_blast():
    parse_result = list()
    blast_result = SearchIO.parse('out/BlastResult.xml', 'blast-xml')
    for record in blast_result:
        if len(record) == 0:
            continue
        else:
            tophit = record[0]
        query_info = ''.join([
            tophit[0][0].query_id,
            ' ',
            tophit[0][0].query_description
        ])
        hit_info = tophit[0][0].hit.id
        parse_result.append([query_info, hit_info])
    parse_result = dict(parse_result)
    return parse_result
Example #57
def blastparse(stdout, output, tname, ntname):
    global recorddict, minLength
    # evaluehit = True
    handle = open(output, 'w')  # open the target fasta file for writing
    blast_handle = cStringIO.StringIO(stdout)  # Convert string to IO object for use in SearchIO using StringIO
    try:  # Necessary to avoid bad genomes
        for qresult in SearchIO.parse(blast_handle, 'blast-tab'):  # Parse the blast output string as if it were a file
            for hit in qresult:  # Hit object
                for hsp in hit:  # Hsp object
                    begin = hsp.query_range[0]  # Start of hsp
                    finish = hsp.query_range[1]  # End of hsp
                    if hsp.query_id in recorddict:
                        # Change the hit to lower case for the first time
                        sequence = recorddict[hsp.query_id].seq[begin:finish]  # make mutable
                        if sequence != "N" * len(sequence):
                            if str(sequence).isupper():
                                # sequence = sequence[begin:finish].tostring().lower()
                                recorddict[hsp.query_id].seq[begin:finish] = str(sequence).lower()
                                # print repr(recorddict[hsp.query_id].seq[begin:finish])
                            elif re.search('[A-Z]+', str(sequence)) is not None:
                                recorddict[hsp.query_id].seq[begin:finish] = str(sequence).lower()
                            # For the Contig name in the target fasta dictionary mask using coordinates
                            else:
                                if finish > begin:
                                    recorddict[hsp.query_id].seq = \
                                        recorddict[hsp.query_id].seq[:begin] + 'N' * (finish - begin + 1) \
                                        + recorddict[hsp.query_id].seq[finish:]
                                else:
                                    recorddict[hsp.query_id].seq \
                                        = recorddict[hsp.query_id].seq[:finish] + 'N' * (begin - finish + 1) \
                                        + recorddict[hsp.query_id].seq[begin:]
        recorddict_bak = deepcopy(recorddict)  # Copy the dictionary so we may iterate and modify the result
        for idline in recorddict_bak:
            recorddict[idline].seq = recorddict[idline].seq.toseq()
            # pattern = r'[^N]{'+ re.escape(str(minLength))+r'}' #  Find a sequence of at least the target length
            pattern = r'[^N]{' + re.escape(str(minLength)) + r',}|[ATCG]{20,}N{200,900}[ATCG]{20,}'
            # overlapped matching requires the third-party 'regex' module, so plain re is used here
            if re.match(pattern, str(recorddict[idline].seq), re.IGNORECASE) is not None:
                SeqIO.write(recorddict[idline], handle, "fasta")
                recorddict[idline].seq = recorddict[idline].seq.tomutable()
            else:
                # print 'Contig \'%s\' not written to file' % id
                recorddict.pop(idline)
    except ValueError:
        print 'Value Error: There was an error removing %s genome from %s' % (ntname, tname)
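The keep/drop pattern above is easier to see on toy sequences. A minimal sketch using the standard re module; note that re.match anchors at the start of the string, so a contig that begins with N fails even if a long clean stretch follows:

import re

minLength = 10  # assumed value for illustration
pattern = r'[^N]{' + re.escape(str(minLength)) + r',}|[ATCG]{20,}N{200,900}[ATCG]{20,}'
assert re.match(pattern, 'ACGT' * 5) is not None     # long non-N run: contig kept
assert re.match(pattern, 'N' * 500) is None          # nothing but N: contig dropped
assert re.match(pattern, 'N' + 'ACGT' * 50) is None  # leading N defeats re.match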
def computeStats(aliFName, validCols):

    # Quick to write but VERY slow to run;
    # could convert to a matrix and operate on it directly
    ali = AlignIO.read(aliFName, 'clustal')
    scores = {}

    for seqNumRemove in range(len(ali)):
        newSeqs = AlignIO.MultipleSeqAlignment([])
        aliWithValidCols = AlignIO.MultipleSeqAlignment([])

        for seqNum in range(len(ali)):
            if seqNum != seqNumRemove:
                newSeqs.append(ali[seqNum])

        aliWithValidCols = newSeqs[:,validCols[0]:validCols[0]+1]
        querySeq = ali[seqNumRemove, validCols[0]:validCols[0]+1]

        for col in validCols[1:]:
            aliWithValidCols += newSeqs[:,col:col+1]
            querySeq += ali[seqNumRemove, col:col+1]
        AlignIO.write(aliWithValidCols, open("/tmp/tempPartialAli", 'w'), 'clustal')
        SeqIO.write(querySeq, open('/tmp/querySeq', 'w'), 'fasta')
        print "seq %s out of %s completed" % (seqNumRemove, len(ali))
        print "running hmmer and gathering stats"

        hmmBuildCmd = ["/Users/mahdi/programs/hmmer-3.1b2-macosx-intel/binaries/hmmbuild", "/tmp/tempPartialAli.hmm", "/tmp/tempPartialAli"]
        hmmScanCmd  = ["/Users/mahdi/programs/hmmer-3.1b2-macosx-intel/binaries/hmmscan",
                      "/tmp/tempPartialAli.hmm", "/tmp/querySeq"]
        hmmPressCmd = ["/Users/mahdi/programs/hmmer-3.1b2-macosx-intel/binaries/hmmpress",
                      "/tmp/tempPartialAli.hmm"]
        with open(os.devnull, 'w') as fnull, open("/tmp/alResult", 'w') as out:
            subprocess.call(hmmBuildCmd, stdout=fnull)
            subprocess.call(hmmPressCmd, stderr=fnull, stdout=fnull)

            subprocess.call(hmmScanCmd, stderr=fnull, stdout=out)


            print hmmBuildCmd
            print hmmScanCmd
        search = SearchIO.parse("/tmp/alResult", "hmmer3-text").next()
        bScore = search.hsps[0].bitscore
        scores[search.id] = bScore
    return scores
Example #59
def Parse():
    parse_blast_results = list(SearchIO.parse(tmpfile_name, 'blast-xml'))
    for result in parse_blast_results:
        query_id = result.description
        add = list()
        for hit in result:
            hit_id = hit.description
            hit_evalue = hit[0].evalue  # Only use the first HSP's evalue
            name = hit_id.split(sep=' ')[:3]
            # The first three words of the description are usually enough to identify the organism
            name = ' '.join(name)
            dictionary[name] = None
            add.append([query_id, hit_id, str(hit_evalue), name])
        out.extend(add)

    # Translate organism name with YouDao API
    api_key = '1630771459'  # 1000 times per hour
    for words in dictionary.keys():
        if dictionary[words] is not None:
            continue
        youdao_results = urllib.request.urlopen(''.join([
            'http://fanyi.youdao.com/openapi.do?keyfrom=Blastit&key=',
            api_key,
            '&type=data&doctype=json&version=1.1&q=',
            words
        ])).read().decode('utf-8')
        parse_translate_results = json.loads(youdao_results)
        translation_results = parse_translate_results['translation']
        dictionary[words] = translation_results[0]

    # Output
    n = 0
    for item in out:
        if n % 3 == 0:
            print('\nQuery sequence id:\t', item[0])
        print(
            '\t', 'Description:', item[1], '\n',
            '\t', 'Evalue:', item[2], '\n',
            '\t', 'Possible name:', item[3], '\n',
            '\t', 'Chinese:', dictionary[item[3]],
            '\n'
        )
        n = n + 1
    return
Example #60
def get_hit_seq_megan(filename):
	yamlfile = yaml_load_file()
	blout = SearchIO.parse(filename, 'blast-text')
	for query in blout:
		seqid = query.id.split("\n")[0]
		#print(seqid)
		fh = open("multi_" + seqid + ".fasta", 'a')
		yamlfile[seqid]['hits'] = {}
		bcp = 1 - (my_module.BLAST_CUTOFF_PERCENT / 100)
		topscore = 0
		for hit in query.hits:
			gi = re.match(r"gi\|(.*)\|ref", hit.id).group(1)
			yamlfile[seqid]['hits'][gi] = {}
			#print(yamlfile[seqid]['hits'])
			for hsp in hit.hsps:
				if topscore == 0:
					topscore = hsp.bitscore
				if hsp.bitscore < (topscore * bcp):
					#print(seqid, " not included: ", gi, " score: ", str(hsp.bitscore), " topscore: ", topscore)
					continue
				if hsp.bitscore < my_module.BLAST_CUTOFF_SCORE:
					continue
				if int(100 * hsp.ident_num/hsp.aln_span) < my_module.BLAST_CUTOFF_PERCENT:
					continue
				if int(100 * (float(hsp.query_end - hsp.query_start + 1)/query.seq_len)) < my_module.BLAST_COVERAGE:
					continue
				#print(hsp.hit, "\t", hsp.hit_strand, "\t", hsp.hit_start, "\t", hsp.hit_end, "\t", hsp.bitscore)
				#print(hsp.query_end, "\t", hsp.query_start, "\t", query.seq_len , "\t", int(100 * (float(hsp.query_end - hsp.query_start + 1)/query.seq_len)))
				hitstart = hsp.hit_start + 1 - 10
				hitstart = 1 if hitstart <= 0 else hitstart
				hitend = hsp.hit_end + 1 + 10
				hitstrand = "plus" if (hsp.hit_strand == 1) else "minus"
				#print(hitstart, hitend)
				out = os.popen(my_module.BLAST_BINARY + "/blastdbcmd -db " + my_module.BLAST_DATABASE + " -dbtype nucl -entry " + str(gi) + " -range " + str(hitstart) + "-" + str(hitend) + " -strand " + str(hitstrand)).read()
				#print("out", out)
				fh.write(out)
				#print(hsp.hit.seq)
				#print(hsp.query.seq)
				#print("-----")
		fh.close()
		#break
	yaml_dump_file(yamlfile)