def run_starter_unit_blastp(cluster: Cluster,
                            cds_hmm_hits: Dict[str, List[HMMResult]]
                            ) -> Dict[str, List[HMMResult]]:
    """ Runs blastp for each CDS in the cluster that has at least one
        starter unit hit from hmmscan.

        Arguments:
            cluster: the Cluster whose parent record holds the CDS features
                     named by the keys of cds_hmm_hits
            cds_hmm_hits: a dictionary mapping CDS name to the HMMResults found
                          for that CDS by the type II PKS hmmscan

        Returns:
            the combined blastp results after refine_hmmscan_results has been
            applied to them, or an empty dictionary if no CDS has a starter
            unit hit
    """
    starter_unit_ids = ('KSIII', 'AT', 'AMID', 'LIG')

    # pair up each relevant CDS feature with the ids of its starter unit hits
    hit_ids_by_feature = {}
    for cds_name, hits in cds_hmm_hits.items():
        matching_ids = [hit.hit_id for hit in hits
                        if hit.hit_id in starter_unit_ids]
        if not matching_ids:
            continue
        feature = cluster.parent_record.get_cds_by_name(cds_name)
        hit_ids_by_feature[feature] = matching_ids

    if not hit_ids_by_feature:
        return {}

    raw_results = []
    reference_files = set()
    for feature, hit_ids in hit_ids_by_feature.items():
        query = fasta.get_fasta_from_features([feature])
        for hit_id in hit_ids:
            # each hit id names both a blast database and a matching
            # '.fasta' file in the 'data' directory
            database = path.get_full_path(__file__, 'data', hit_id)
            raw_results.extend(subprocessing.run_blastp(database, query))
            reference_files.add(
                path.get_full_path(__file__, 'data', hit_id + '.fasta'))

    # lengths of the sequences in every fasta file that was searched against
    lengths = {}
    for reference_file in reference_files:
        lengths.update(get_fasta_lengths(reference_file))

    return refine_hmmscan_results(raw_results, lengths)
def run_starter_unit_blastp(cds_hmm_hits: Dict[CDSFeature, List[HMMResult]]
                            ) -> Dict[str, List[HMMResult]]:
    """ Runs blastp on the coding sequences that have starter unit hits

        Only hits with the ids KSIII, AT, AMID or LIG trigger a blastp run;
        CDS features without any such hit are skipped entirely.

        Arguments:
            cds_hmm_hits: HMMResults by CDS feature from the type II PKS hmmscan

        Returns:
            the blastp results as refined by refine_hmmscan_results (annotated
            as a dictionary mapping CDS name to a list of HMMResults), with
            "-CoA" appended to every hit id that does not already end with it,
            or an empty dictionary if blastp produced no results
    """
    starter_unit_ids = ('KSIII', 'AT', 'AMID', 'LIG')

    blastp_results = []
    blastp_fasta_files = set()
    for cds, hmm_hits in cds_hmm_hits.items():
        starter_hits = [hit for hit in hmm_hits
                        if hit.hit_id in starter_unit_ids]
        # don't build a query for a CDS that will never be searched with
        if not starter_hits:
            continue
        query_sequence = fasta.get_fasta_from_features([cds])
        for hit in starter_hits:
            # each hit id names both a blast database and a matching
            # '.fasta' file in the 'data' directory
            blast_database = path.get_full_path(__file__, 'data', hit.hit_id)
            blastp_results.extend(
                subprocessing.run_blastp(blast_database, query_sequence))
            blastp_fasta_files.add(
                path.get_full_path(__file__, 'data', hit.hit_id + '.fasta'))

    if not blastp_results:
        return {}

    # lengths of the sequences in every fasta file that was searched against
    fasta_lengths = {}
    for fasta_file in blastp_fasta_files:
        fasta_lengths.update(get_fasta_lengths(fasta_file))

    results = refine_hmmscan_results(blastp_results, fasta_lengths)

    # make sure every reported hit id carries the "-CoA" suffix, replacing
    # the hit in place with a renamed copy where it is missing
    for hits in results.values():
        for i, hit in enumerate(hits):
            if not hit.hit_id.endswith("-CoA"):
                hits[i] = HMMResult(hit.hit_id + "-CoA", hit.query_start,
                                    hit.query_end, hit.evalue, hit.bitscore)
    return results