Example #1
0
def run_starter_unit_blastp(
        cluster: Cluster,
        cds_hmm_hits: Dict[str,
                           List[HMMResult]]) -> Dict[str, List[HMMResult]]:
    """ Runs blastp on starter unit coding sequences in given cluster

        Arguments:
            cluster: Cluster whose parent record is used to look up each CDS
                     by name
            cds_hmm_hits: HMMResults by cds name from type II PKS hmmscan

        Returns:
            an empty dictionary if no cds has a starter unit hit, otherwise
            the blastp results of the cluster as refined by
            refine_hmmscan_results
    """
    # hmmscan hit ids that mark a CDS as a starter unit candidate; each id
    # is also used below as the base name of a blast database under 'data'
    starter_unit_ids = ('KSIII', 'AT', 'AMID', 'LIG')

    starter_unit_cds = {}
    for cds_name, hmm_hits in cds_hmm_hits.items():
        starter_unit_hit_ids = [
            hit.hit_id for hit in hmm_hits if hit.hit_id in starter_unit_ids
        ]
        if starter_unit_hit_ids:
            # the record is only consulted for CDSs that have relevant hits
            cds = cluster.parent_record.get_cds_by_name(cds_name)
            starter_unit_cds[cds] = starter_unit_hit_ids

    if not starter_unit_cds:
        return {}

    blastp_results = []
    blastp_fasta_files = set()
    for cds, starter_unit_hit_ids in starter_unit_cds.items():
        query_sequence = fasta.get_fasta_from_features([cds])
        # NOTE(review): a CDS with several hits for the same id triggers one
        # blastp run per hit - confirm whether that repetition is intended
        for hit_id in starter_unit_hit_ids:
            blast_database = path.get_full_path(__file__, 'data', hit_id)
            blastp_results.extend(
                subprocessing.run_blastp(blast_database, query_sequence))
            # a set, so each reference fasta is only measured once below
            blastp_fasta_files.add(
                path.get_full_path(__file__, 'data', hit_id + '.fasta'))

    fasta_lengths = {}
    for fasta_file in blastp_fasta_files:
        fasta_lengths.update(get_fasta_lengths(fasta_file))

    return refine_hmmscan_results(blastp_results, fasta_lengths)
Example #2
0
def run_starter_unit_blastp(
    cds_hmm_hits: Dict[CDSFeature,
                       List[HMMResult]]) -> Dict[str, List[HMMResult]]:
    """ Runs blastp for the starter unit hits of the given coding sequences

        Arguments:
            cds_hmm_hits: HMMResults by cds from type II PKS hmmscan

        Returns:
            an empty dictionary if blastp produced no results, otherwise the
            results refined by refine_hmmscan_results, with "-CoA" appended
            to every hit id that does not already end with it
    """
    starter_profiles = ('KSIII', 'AT', 'AMID', 'LIG')

    raw_hits = []
    reference_fastas = set()
    for feature, domain_hits in cds_hmm_hits.items():
        query = fasta.get_fasta_from_features([feature])
        # only starter unit profiles have a blast database to search
        profile_ids = [
            domain_hit.hit_id for domain_hit in domain_hits
            if domain_hit.hit_id in starter_profiles
        ]
        for profile_id in profile_ids:
            database = path.get_full_path(__file__, 'data', profile_id)
            raw_hits.extend(subprocessing.run_blastp(database, query))
            reference_fastas.add(
                path.get_full_path(__file__, 'data', profile_id + '.fasta'))

    if not raw_hits:
        return {}

    reference_lengths = {}
    for reference_fasta in reference_fastas:
        reference_lengths.update(get_fasta_lengths(reference_fasta))

    refined = refine_hmmscan_results(raw_hits, reference_lengths)
    # rewrite the hit lists in place so each reported id carries "-CoA"
    for cds_hits in refined.values():
        for index, entry in enumerate(cds_hits):
            if entry.hit_id.endswith("-CoA"):
                continue
            cds_hits[index] = HMMResult(entry.hit_id + "-CoA",
                                        entry.query_start, entry.query_end,
                                        entry.evalue, entry.bitscore)
    return refined