Beispiel #1
0
def search(
    database,
    sequences=None,
    query_file=None,
    query_ids=None,
    blast_file=None,
    **kwargs,
):
    """Launch a new search against a DIAMOND database.

    The query is taken from ``query_file`` when given. Otherwise ``sequences``
    is used, falling back to ``helpers.get_sequences(query_ids=...)`` when no
    sequences were supplied; those sequences are written to a temporary FASTA
    file which is removed again once DIAMOND has run.

    Arguments:
        database (str): Path to DIAMOND database
        sequences (dict): Query sequences
        query_file (str): Path to FASTA file containing query sequences
        query_ids (list): NCBI sequence accessions
        blast_file (str): Path to the file blast results are written to
        **kwargs: Forwarded unchanged to diamond()
    Raises:
        ValueError: No value given for query_file, sequences or query_ids
    Returns:
        list: Parsed rows with hits from DIAMOND results table
    """
    # Fail early, before any file or subprocess work, when there is no query.
    if not (query_file or sequences or query_ids):
        raise ValueError(
            "Expected a value for query_file, sequences or query_ids"
        )

    if query_file:
        table = diamond(query_file, database, **kwargs)
    else:
        if not sequences:
            sequences = helpers.get_sequences(query_ids=query_ids)

        # Build the FASTA text *before* creating the temporary file, so that a
        # failure here cannot leave an orphaned file behind on disk.
        text = helpers.sequences_to_fasta(sequences)

        # delete=False since you cannot open tempfiles twice in Windows
        # see: https://stackoverflow.com/questions/46497842/passing-namedtemporaryfile-to-a-subprocess-on-windows
        fasta = NTF("w", delete=False)
        try:
            # Close the handle before DIAMOND reads the file by name.
            with fasta:
                fasta.write(text)
            table = diamond(fasta.name, database, **kwargs)
        finally:
            # delete=False means cleanup is our responsibility.
            os.unlink(fasta.name)

    results = parse(table)

    if blast_file:
        LOG.info("Writing DIAMOND hit table to %s", blast_file)
        # NOTE(review): joining with "" assumes every element of `table`
        # already carries its own line terminator - confirm against diamond().
        blast_table = "".join(table)
        with open(blast_file, "w") as f:
            f.write(blast_table)

    return results
Beispiel #2
0
def search(
    database,
    sequences=None,
    query_file=None,
    query_ids=None,
    blast_file=None,
    **kwargs,
):
    """Launch a new search against a DIAMOND database.

    The query is taken from ``query_file`` when given. Otherwise ``sequences``
    is used, falling back to ``helpers.get_sequences(query_ids=...)`` when no
    sequences were supplied; those sequences are written to a temporary FASTA
    file which is removed again once DIAMOND has run.

    Arguments:
        database (str): Path to DIAMOND database
        sequences (dict): Query sequences
        query_file (str): Path to FASTA file containing query sequences
        query_ids (list): NCBI sequence accessions
        blast_file (TextIOWrapper): file blast results are written to
        **kwargs: Forwarded unchanged to diamond()
    Raises:
        ValueError: No value given for query_file, sequences or query_ids
    Returns:
        list: Parsed rows with hits from DIAMOND results table
    """
    # Fail early, before any file or subprocess work, when there is no query.
    if not (query_file or sequences or query_ids):
        raise ValueError(
            "Expected a value for query_file, sequences or query_ids"
        )

    if query_file:
        table = diamond(query_file, database, **kwargs)
    else:
        if not sequences:
            sequences = helpers.get_sequences(query_ids=query_ids)

        # Build the FASTA text *before* creating the temporary file, so that a
        # failure here cannot leave an orphaned file behind on disk.
        text = helpers.sequences_to_fasta(sequences)

        # delete=False since you cannot open tempfiles twice in Windows
        fasta = NTF("w", delete=False)
        try:
            # Close the handle before DIAMOND reads the file by name.
            with fasta:
                fasta.write(text)
            table = diamond(fasta.name, database, **kwargs)
        finally:
            # delete=False means cleanup is our responsibility.
            os.unlink(fasta.name)

    results = parse(table)

    if blast_file:
        LOG.info("Writing DIAMOND hit table to %s", blast_file.name)
        # NOTE(review): this writes the *parsed* results, while the log message
        # refers to the DIAMOND hit table (`table`). str.join() only accepts
        # strings, so this presumably requires parse() to return str rows -
        # confirm whether `table` was intended here.
        blast = "\n".join(results)
        blast_file.write(blast)

    return results
Beispiel #3
0
def start(
    sequences=None,
    query_file=None,
    query_ids=None,
    database="nr",
    program="blastp",
    megablast=False,
    filtering="F",
    evalue=0.1,
    nucl_reward=None,
    nucl_penalty=None,
    gap_costs="11 1",
    matrix="BLOSUM62",
    hitlist_size=500,
    threshold=11,
    word_size=6,
    comp_based_stats=2,
    entrez_query=None,
):
    """Launch a remote BLAST search using NCBI BLAST API.

    Note that the HITLIST_SIZE, ALIGNMENTS and DESCRIPTIONS parameters must all be set
    together in order to mimic max_target_seqs behaviour.

    Usage guidelines:

    1. Don't contact server more than once every 10 seconds
    2. Don't poll for a single RID more than once a minute
    3. Use URL parameter email/tool
    4. Run scripts weekends or 9pm-5am Eastern time on weekdays if >50 searches

    For a full description of the parameters, see:

        1. `BLAST API documentation <https://ncbi.github.io/blast-cloud/dev/api.html>`_
        2. `BLAST documentation <https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=BlastHelp>`_

    Parameters:
        sequences (dict): Query sequence dict generated by helpers.get_sequences()
        query_file (str): Path to a query FASTA file
        query_ids (list): Collection of NCBI sequence identifiers
        database (str): Target NCBI BLAST database
        program (str): BLAST variant to run
        megablast (bool): Enable megaBLAST option (only sent when program is "blastn")
        filtering (str): Low complexity filtering
        evalue (float): E-value cutoff
        nucl_reward (int): Reward for matching bases (only sent when program is "blastn")
        nucl_penalty (int): Penalty for mismatched bases (only sent when program is "blastn")
        gap_costs (str): Gap existence and extension costs
        matrix (str): Scoring matrix name
        hitlist_size (int): Number of database sequences to keep
        threshold (int): Neighbouring score for initial words (not sent when program is "blastn")
        word_size (int): Size of word for initial matches
        comp_based_stats (int): Composition based statistics algorithm
        entrez_query (str): NCBI Entrez search term for pre-filtering the BLAST database

    Returns:
        rid (str): Request Identifier (RID) assigned to the search
        rtoe (int): Request Time Of Execution (RTOE), estimated run time of the search

    Raises:
        ValueError: The response did not contain exactly one RID and one RTOE
            field, or the RTOE value is not an integer
    """
    if not sequences:
        sequences = helpers.get_sequences(query_file=query_file,
                                          query_ids=query_ids)

    query = helpers.sequences_to_fasta(sequences)

    parameters = {
        "CMD": "PUT",
        "DATABASE": database,
        "PROGRAM": program,
        "FILTER": filtering,
        "EXPECT": evalue,
        "GAPCOSTS": gap_costs,
        "MATRIX": matrix,
        # All three are set to the same value; see the note in the docstring.
        "HITLIST_SIZE": hitlist_size,
        "ALIGNMENTS": hitlist_size,
        "DESCRIPTIONS": hitlist_size,
        "WORD_SIZE": word_size,
        "COMPOSITION_BASED_STATISTICS": comp_based_stats,
    }

    if entrez_query:
        parameters["ENTREZ_QUERY"] = entrez_query

    if program == "blastn":
        if megablast:
            parameters["MEGABLAST"] = "on"
        if nucl_reward:
            parameters["NUCL_REWARD"] = nucl_reward
        if nucl_penalty:
            parameters["NUCL_PENALTY"] = nucl_penalty
    else:
        # Does not apply to blastn
        parameters["THRESHOLD"] = threshold

    response = requests.post(BLAST_API_URL,
                             files={"QUERY": query},
                             params=parameters)

    LOG.debug("Search parameters: %s", parameters)
    LOG.debug("Search URL: %s", response.url)

    # The identifiers are scraped from "RID = ..." / "RTOE = ..." lines in the
    # response body. Validate the match count explicitly so that an error page
    # or otherwise unexpected response produces a meaningful message instead of
    # an opaque tuple-unpacking failure.
    fields = re.findall(r"(?:RID|RTOE) = (.+?)[\n\s]", response.text)
    if len(fields) != 2:
        raise ValueError(
            "Expected exactly one RID and one RTOE field in the BLAST API "
            f"response, found {len(fields)} matching field(s)"
        )

    rid, rtoe = fields
    return rid, int(rtoe)