def parse(
    handle,
    sequences=None,
    query_file=None,
    query_ids=None,
    max_evalue=0.01,
    min_identity=30,
    min_coverage=50,
):
    """Parse tabular results from a remote BLAST search performed via the API.

    Since the API provides no option for returning query coverage, which is a
    metric we want to use for filtering hits, query sequences must be available
    to this function so that their lengths can be compared to the alignment
    length.

    Each data line is split on tabs and must contain at least ten fields: the
    first three are read as query ID, subject ID and percent identity; query
    start/end, e-value and bitscore are read relative to the end of the line.
    Blank lines and lines starting with ``#`` are skipped; no other validation
    is performed.

    Arguments:
        handle (list):
            File handle (or file handle-like) object corresponding to BLAST
            results; any iterable of tab-delimited lines works.
        sequences (dict): Query sequences, keyed on query identifier
        query_file (str): Path to FASTA format query file
        query_ids (list): NCBI sequence identifiers
        max_evalue (float): Maximum e-value
        min_identity (float): Minimum percent identity
        min_coverage (float): Minimum percent query coverage
    Raises:
        ValueError: No hit passed the thresholds (or there were no hits at all)
    Returns:
        list: Hit objects corresponding to criteria passing BLAST hits
    """
    if not sequences:
        sequences = helpers.get_sequences(query_file, query_ids)

    hits = []
    for line in handle:
        # Tolerate trailing blank lines and comment headers instead of dying
        # on the field unpacking below.
        if not line.strip() or line.startswith("#"):
            continue

        qid, sid, pident, *_, qstart, qend, _, _, evalue, score, _ = line.split("\t")

        # Manually calculate query coverage from the aligned query range
        coverage = (int(qend) - int(qstart) + 1) / len(sequences[qid]) * 100

        hit = Hit(
            query=qid,
            subject=sid,
            identity=pident,
            coverage=coverage,
            evalue=evalue,
            bitscore=score,
        )

        # All three values are converted explicitly so that the comparison
        # does not depend on Hit having cast its string inputs.
        if (
            float(hit.identity) > min_identity
            and float(hit.coverage) > min_coverage
            and float(hit.evalue) < max_evalue
        ):
            hits.append(hit)

    if not hits:
        raise ValueError("No results found")

    return hits
def search(
    database,
    sequences=None,
    query_file=None,
    query_ids=None,
    blast_file=None,
    **kwargs,
):
    """Launch a new local search using DIAMOND.

    If ``query_file`` is given it is passed straight to ``diamond()``.
    Otherwise the query sequences (fetched via ``helpers.get_sequences()`` when
    not supplied) are written to a temporary FASTA file, which is removed
    again once DIAMOND has run.

    Arguments:
        database (str): Path to DIAMOND database
        sequences (dict): Query sequences
        query_file (str): Path to FASTA file containing query sequences
        query_ids (list): NCBI sequence accessions
        blast_file (str): Path to the file blast results are written to
        **kwargs: Forwarded unchanged to ``diamond()``
    Raises:
        ValueError: No value given for query_file or query_ids
    Returns:
        list: Parsed rows with hits from DIAMOND results table
    """
    if query_file:
        table = diamond(query_file, database, **kwargs)
    else:
        if not sequences:
            sequences = helpers.get_sequences(query_ids=query_ids)

        # Build the FASTA text *before* creating the temp file: the file is
        # created with delete=False, so a failure between creation and the
        # try block would leave it behind on disk.
        text = helpers.sequences_to_fasta(sequences)

        # delete=False since you cannot open tempfiles twice in Windows
        # see: https://stackoverflow.com/questions/46497842/passing-namedtemporaryfile-to-a-subprocess-on-windows
        fasta = NTF("w", delete=False)
        try:
            # Close the handle before DIAMOND reads the file by name
            with fasta:
                fasta.write(text)
            table = diamond(fasta.name, database, **kwargs)
        finally:
            os.unlink(fasta.name)

    results = parse(table)

    if blast_file:
        LOG.info("Writing DIAMOND hit table to %s", blast_file)
        blast_table = "".join(table)
        with open(blast_file, "w") as f:
            f.write(blast_table)

    return results
def search(
    database,
    sequences=None,
    query_file=None,
    query_ids=None,
    blast_file=None,
    **kwargs,
):
    """Launch a new local search using DIAMOND.

    If ``query_file`` is given it is passed straight to ``diamond()``.
    Otherwise the query sequences (fetched via ``helpers.get_sequences()`` when
    not supplied) are written to a temporary FASTA file, which is removed
    again once DIAMOND has run.

    Arguments:
        database (str): Path to DIAMOND database
        sequences (dict): Query sequences
        query_file (str): Path to FASTA file containing query sequences
        query_ids (list): NCBI sequence accessions
        blast_file (TextIOWrapper): Open, writable file the raw DIAMOND hit
            table is written to; it is not closed by this function
        **kwargs: Forwarded unchanged to ``diamond()``
    Raises:
        ValueError: No value given for query_file or query_ids
    Returns:
        list: Parsed rows with hits from DIAMOND results table
    """
    if query_file:
        table = diamond(query_file, database, **kwargs)
    else:
        if not sequences:
            sequences = helpers.get_sequences(query_ids=query_ids)

        # Build the FASTA text *before* creating the temp file: the file is
        # created with delete=False, so a failure between creation and the
        # try block would leave it behind on disk.
        text = helpers.sequences_to_fasta(sequences)

        # delete=False since you cannot open tempfiles twice in Windows
        fasta = NTF("w", delete=False)
        try:
            # Close the handle before DIAMOND reads the file by name
            with fasta:
                fasta.write(text)
            table = diamond(fasta.name, database, **kwargs)
        finally:
            os.unlink(fasta.name)

    results = parse(table)

    if blast_file:
        LOG.info("Writing DIAMOND hit table to %s", blast_file.name)
        # Write the raw table lines, not the parsed hits: str.join() needs
        # strings, and the log message promises the DIAMOND hit table.
        blast = "\n".join(table)
        blast_file.write(blast)

    return results
def start(
    sequences=None,
    query_file=None,
    query_ids=None,
    database="nr",
    program="blastp",
    megablast=False,
    filtering="F",
    evalue=0.1,
    nucl_reward=None,
    nucl_penalty=None,
    gap_costs="11 1",
    matrix="BLOSUM62",
    hitlist_size=500,
    threshold=11,
    word_size=6,
    comp_based_stats=2,
    entrez_query=None,
):
    """Launch a remote BLAST search using NCBI BLAST API.

    Note that the HITLIST_SIZE, ALIGNMENTS and DESCRIPTIONS parameters must all
    be set together in order to mimic max_target_seqs behaviour.

    Usage guidelines:

    1. Don't contact server more than once every 10 seconds
    2. Don't poll for a single RID more than once a minute
    3. Use URL parameter email/tool
    4. Run scripts weekends or 9pm-5am Eastern time on weekdays if >50 searches

    For a full description of the parameters, see:

    1. `BLAST API documentation <https://ncbi.github.io/blast-cloud/dev/api.html>`_
    2. `BLAST documentation
       <https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=BlastHelp>`_

    Parameters:
        sequences (dict): Query sequence dict generated by helpers.get_sequences()
        query_file (str): Path to a query FASTA file
        query_ids (list): Collection of NCBI sequence identifiers
        database (str): Target NCBI BLAST database
        program (str): BLAST variant to run
        megablast (bool): Enable megaBLAST option (only with BLASTn)
        filtering (str): Low complexity filtering
        evalue (float): E-value cutoff
        nucl_reward (int): Reward for matching bases (only with BLASTN/megaBLAST)
        nucl_penalty (int): Penalty for mismatched bases (only with BLASTN/megaBLAST)
        gap_costs (str): Gap existence and extension costs
        matrix (str): Scoring matrix name
        hitlist_size (int): Number of database sequences to keep
        threshold (int): Neighbouring score for initial words
        word_size (int): Size of word for initial matches
        comp_based_stats (int): Composition based statistics algorithm
        entrez_query (str): NCBI Entrez search term for pre-filtering the BLAST database
    Raises:
        ValueError: The response does not contain exactly one RID and one RTOE
    Returns:
        tuple: ``(rid, rtoe)`` where rid (str) is the Request Identifier (RID)
        assigned to the search and rtoe (int) is the Request Time Of Execution
        (RTOE), the estimated run time of the search
    """
    if not sequences:
        sequences = helpers.get_sequences(query_file=query_file, query_ids=query_ids)

    query = helpers.sequences_to_fasta(sequences)

    parameters = {
        "CMD": "PUT",
        "DATABASE": database,
        "PROGRAM": program,
        "FILTER": filtering,
        "EXPECT": evalue,
        "GAPCOSTS": gap_costs,
        "MATRIX": matrix,
        # These three are set together to mimic max_target_seqs
        "HITLIST_SIZE": hitlist_size,
        "ALIGNMENTS": hitlist_size,
        "DESCRIPTIONS": hitlist_size,
        "WORD_SIZE": word_size,
        "COMPOSITION_BASED_STATISTICS": comp_based_stats,
    }

    if entrez_query:
        parameters["ENTREZ_QUERY"] = entrez_query

    if program == "blastn":
        if megablast:
            parameters["MEGABLAST"] = "on"
        if nucl_reward:
            parameters["NUCL_REWARD"] = nucl_reward
        if nucl_penalty:
            parameters["NUCL_PENALTY"] = nucl_penalty
    else:
        # Does not apply to blastn
        parameters["THRESHOLD"] = threshold

    # Logged before the request so the parameters are visible even if it fails
    LOG.debug("Search parameters: %s", parameters)

    response = requests.post(BLAST_API_URL, files={"QUERY": query}, params=parameters)

    LOG.debug("Search URL: %s", response.url)

    # Both values are captured by the same pattern, in document order
    found = re.findall(r"(?:RID|RTOE) = (.+?)[\n\s]", response.text)
    if len(found) != 2:
        # Error pages carry no RID/RTOE pair; fail with a readable message
        # rather than a bare tuple-unpacking error.
        raise ValueError(
            "Could not find RID and RTOE in the BLAST API response "
            "(HTTP status {}, {} match(es) found)".format(
                response.status_code, len(found)
            )
        )

    rid, rtoe = found
    return rid, int(rtoe)
def test_get_sequences_bad_input():
    """Calling get_sequences() with no arguments must raise ValueError."""
    expected_error = ValueError
    with pytest.raises(expected_error):
        helpers.get_sequences()
def test_get_sequences_query_ids(mocker):
    """get_sequences(query_ids=...) must call efetch_sequences once with the IDs."""
    accessions = ("seq1", "seq2")

    # Replace the fetch helper with a mock for the duration of the test
    mocker.patch("cblaster.helpers.efetch_sequences")

    # Separate list objects are used for the call and for the expectation
    helpers.get_sequences(query_ids=list(accessions))

    helpers.efetch_sequences.assert_called_once_with(list(accessions))
def test_get_sequences_query_file(mocker):
    """get_sequences(query_file=...) must call parse_fasta exactly once."""
    fasta_path = TEST_DIR / "test.faa"

    # Replace the FASTA parser with a mock so only the dispatch is checked
    mocker.patch("cblaster.helpers.parse_fasta")

    helpers.get_sequences(query_file=fasta_path)

    helpers.parse_fasta.assert_called_once()
def cblaster(
    query_file=None,
    query_ids=None,
    mode=None,
    json_db=None,
    database=None,
    gap=20000,
    unique=3,
    min_hits=3,
    min_identity=30,
    min_coverage=50,
    max_evalue=0.01,
    entrez_query=None,
    output=None,
    output_hide_headers=False,
    output_delimiter=None,
    output_decimals=4,
    binary=None,
    binary_hide_headers=True,
    binary_delimiter=None,
    binary_key=len,
    binary_attr="identity",
    binary_decimals=4,
    rid=None,
    require=None,
    session_file=None,
    indent=None,
    plot=False,
    recompute=False,
    blast_file=None,
    ipg_file=None,
    hitlist_size=None,
):
    """Run cblaster.

    This function is the central workflow for the entire cblaster package.

    If every path in ``session_file`` exists, the session is loaded from those
    files (and optionally re-filtered). Otherwise a new local or remote search
    is run, genomic context is fetched, and the session is saved to the first
    path in ``session_file`` if one was given. In both cases the binary table,
    summary and plot are then produced as requested.

    Arguments:
        query_file (str): Path to FASTA format query file
        query_ids (list): NCBI protein sequence identifiers
        mode (str): Search mode ('local' or 'remote')
        json_db (str): JSON database created with cblaster makedb
        database (str): Search database (NCBI if remote, DIAMOND if local)
        gap (int): Maximum gap between cluster hits
            (NOTE(review): default of 20000 suggests base pairs - confirm unit)
        unique (int): Minimum number of query sequences with hits in clusters
        min_hits (int): Minimum number of hits in clusters
        min_identity (float): Minimum identity (%) cutoff
        min_coverage (float): Minimum coverage (%) cutoff
        max_evalue (float): Maximum e-value threshold
        entrez_query (str): NCBI Entrez query to filter search database
        output (str): Path to cblaster summary output file (stdout if not given)
        output_hide_headers (bool): Hide headers in summary table
        output_delimiter (str): Delimiter used in summary table
        output_decimals (int): Total decimal places in hit scores in summary table
        binary (str): Path to cblaster binary output file
        binary_hide_headers (bool): Hide headers in binary table
        binary_delimiter (str): Delimiter used in binary table
        binary_key (callable): Key function used in binary table (len, max or sum)
        binary_attr (str): Hit attribute used for calculating cell values in binary table
        binary_decimals (int): Total decimal places in cell values in binary table
        rid (str): NCBI BLAST search request identifier (RID)
        require (list): Query sequences that must be in hit clusters
        session_file (list): Paths to cblaster session JSON files; a new
            session is written to the first one only
        indent (int): Total spaces to indent JSON files
        plot (bool or str): True, or a path to a cblaster plot HTML file
        recompute (bool or str): True to re-filter a loaded session in memory,
            or a path to also write the recomputed session JSON to
        blast_file: Passed through to local.search()/remote.search()
        ipg_file: Passed through to context.search()
        hitlist_size (int): Passed through to remote.search()
    Raises:
        ValueError: A new search was requested with a mode other than
            'local' or 'remote'
    Returns:
        Session: cblaster search Session object
    """
    if session_file and all(Path(sf).exists() for sf in session_file):
        LOG.info("Loading session(s) %s", session_file)
        session = Session.from_files(session_file)

        if recompute:
            LOG.info("Filtering session with new thresholds")
            context.filter_session(
                session,
                min_identity,
                min_coverage,
                max_evalue,
                gap,
                unique,
                min_hits,
                require,
            )

            # recompute is either the flag True or an output path
            if recompute is not True:
                LOG.info("Writing recomputed session to %s", recompute)
                with open(recompute, "w") as fp:
                    session.to_json(fp, indent=indent)
    else:
        session = Session(
            queries=query_ids if query_ids else [],
            sequences=helpers.get_sequences(
                query_file=query_file,
                query_ids=query_ids,
            ),
            params={
                "mode": mode,
                "database": database,
                "min_identity": min_identity,
                "min_coverage": min_coverage,
                "max_evalue": max_evalue,
            },
        )

        if query_file:
            # get_sequences() returns OrderedDict, so save keys to
            # preserve query order
            session.queries = list(session.sequences)
            session.params["query_file"] = query_file

        if json_db:
            session.params["json_db"] = json_db

        if mode == "local":
            LOG.info("Starting cblaster in local mode")
            results = local.search(
                database,
                sequences=session.sequences,
                min_identity=min_identity,
                min_coverage=min_coverage,
                max_evalue=max_evalue,
                blast_file=blast_file,
            )
        elif mode == "remote":
            LOG.info("Starting cblaster in remote mode")
            if entrez_query:
                session.params["entrez_query"] = entrez_query
            rid, results = remote.search(
                sequences=session.sequences,
                rid=rid,
                database=database,
                min_identity=min_identity,
                min_coverage=min_coverage,
                max_evalue=max_evalue,
                entrez_query=entrez_query,
                blast_file=blast_file,
                hitlist_size=hitlist_size,
            )
            session.params["rid"] = rid
        else:
            # Without this, an unknown mode surfaces later as an
            # UnboundLocalError on `results`.
            raise ValueError(
                "Invalid search mode {!r}; expected 'local' or 'remote'".format(mode)
            )

        LOG.info("Found %i hits meeting score thresholds", len(results))
        LOG.info("Fetching genomic context of hits")

        # Only GenBank/EMBL style query files supply an explicit query order.
        # query_file is None when the search was started from query_ids, so
        # it must be checked before inspecting its extension.
        genbank_extensions = (".gbk", ".gb", ".genbank", ".gbff", ".embl", ".emb")
        if query_file and str(query_file).endswith(genbank_extensions):
            query_sequence_order = list(session.sequences.keys())
        else:
            query_sequence_order = None

        session.organisms = context.search(
            results,
            unique=unique,
            min_hits=min_hits,
            gap=gap,
            require=require,
            json_db=json_db,
            ipg_file=ipg_file,
            query_sequence_order=query_sequence_order,
        )

        if session_file:
            LOG.info("Writing current search session to %s", session_file[0])
            if len(session_file) > 1:
                LOG.warning("Multiple session files specified, using first")
            with open(session_file[0], "w") as fp:
                session.to_json(fp, indent=indent)

    if binary:
        LOG.info("Writing binary summary table to %s", binary)
        # Context manager so the table is flushed and the handle is closed
        with open(binary, "w") as binary_fp:
            session.format(
                "binary",
                binary_fp,
                hide_headers=binary_hide_headers,
                delimiter=binary_delimiter,
                key=binary_key,
                attr=binary_attr,
                decimals=binary_decimals,
            )

    LOG.info("Writing summary to %s", output if output else "stdout")
    summary_options = {
        "hide_headers": output_hide_headers,
        "delimiter": output_delimiter,
        "decimals": output_decimals,
    }
    if output:
        with open(output, "w") as summary_fp:
            session.format("summary", fp=summary_fp, **summary_options)
    else:
        # stdout is not ours to close
        session.format("summary", fp=sys.stdout, **summary_options)

    if plot:
        # plot=True means no output path is passed to plot_session()
        plot = None if plot is True else plot
        plot_session(session, output=plot)

    LOG.info("Done.")
    return session
def test_get_sequences_query_file(mocker):
    """get_sequences() on test.faa must yield the three expected sequence IDs."""
    expected_ids = {'QBE85648.1', 'QBE85647.1', 'QBE85646.1'}
    fasta_path = TEST_DIR / "test.faa"

    sequences = helpers.get_sequences(query_file=fasta_path)

    # Extra entries in the result are allowed; only these IDs are required
    assert expected_ids.issubset(sequences)