def run(self, record):
    """Prepare a record in place by filling in missing CDS and Pfam features.

    The record is first passed through the ``util.fix_*`` helpers. CDS
    detection (``ProdigalProteinRecordAnnotator``) runs only when the record
    has no protein features yet, and Pfam detection
    (``HmmscanPfamRecordAnnotator``) only when it has no Pfam features yet.
    Features are sorted at the end.

    :param record: sequence record to prepare; modified in place
    """
    logging.info('Preparing record %s', record.id)

    util.fix_record_locus(record)
    util.fix_duplicate_cds(record)
    util.fix_dna_alphabet(record)

    # Per-record prefix for temporary files, derived from the sanitized record ID
    tmp_prefix = os.path.join(self.tmp_dir_path, util.sanitize_filename(record.id))
    logging.debug('Using record TMP prefix: %s', tmp_prefix)

    existing_cds_count = len(util.get_protein_features(record))
    if not existing_cds_count:
        ProdigalProteinRecordAnnotator(
            record=record,
            tmp_path_prefix=tmp_prefix,
            meta_mode=self.prodigal_meta_mode,
        ).annotate()
    else:
        logging.info(
            'Sequence already contains %s CDS features, skipping CDS detection',
            existing_cds_count)

    # Counted only after the CDS step above has had a chance to run
    existing_pfam_count = len(util.get_pfam_features(record))
    if not existing_pfam_count:
        HmmscanPfamRecordAnnotator(
            record=record,
            tmp_path_prefix=tmp_prefix,
        ).annotate()
    else:
        logging.info(
            'Sequence already contains %s Pfam features, skipping Pfam detection',
            existing_pfam_count)

    util.sort_record_features(record)
def test_integration_prepare_default(tmpdir):
    """Run ``prepare`` on BGC0000015.fa and check the GenBank and TSV outputs."""
    out_dir = str(tmpdir)
    gbk_path = os.path.join(out_dir, 'outfile.gbk')
    tsv_path = os.path.join(out_dir, 'outfile.tsv')
    run([
        'prepare',
        '--output-gbk', gbk_path,
        '--output-tsv', tsv_path,
        get_test_file('BGC0000015.fa'),
    ])

    # GenBank output: two records with the expected feature counts
    gbk_records = list(SeqIO.parse(gbk_path, 'genbank'))
    assert len(gbk_records) == 2

    first_record = gbk_records[0]
    assert_sorted_features(first_record)
    first_proteins = util.get_protein_features(first_record)
    first_pfams = util.get_pfam_features(first_record)
    assert len(first_proteins) == 18
    print([util.get_protein_id(f) for f in first_proteins])
    assert len(first_pfams) == 111

    second_record = gbk_records[1]
    assert_sorted_features(second_record)
    second_proteins = util.get_protein_features(second_record)
    second_pfams = util.get_pfam_features(second_record)
    assert len(second_proteins) == 27
    assert len(second_pfams) == 36

    # TSV output: one row per Pfam domain, grouped by sequence
    domain_table = pd.read_csv(tsv_path, sep='\t')
    by_sequence = domain_table.groupby('sequence_id')
    assert len(by_sequence) == 2

    first_rows = by_sequence.get_group('BGC0000015.1')
    print(first_rows['protein_id'].unique())
    # Proteins without any Pfam domain have no rows, so fewer IDs than proteins
    assert len(first_rows['protein_id'].unique()) == 17
    assert len(first_rows) == 111

    second_rows = by_sequence.get_group('BGC0000015.2')
    # Proteins without any Pfam domain have no rows, so fewer IDs than proteins
    assert len(second_rows['protein_id'].unique()) == 11
    assert len(second_rows) == 36
def test_unit_writer_single_feature(tmpdir, writer_cls, processed_record):
    """Check that a writer handles a record reduced to one feature of each kind."""
    target_path = os.path.join(str(tmpdir), 'file.png')

    proteins = util.get_protein_features(processed_record)
    domains = util.get_pfam_features(processed_record)
    clusters = util.get_cluster_features(processed_record)

    # Keep at most the first protein, Pfam and cluster feature
    processed_record.features = proteins[:1] + domains[:1] + clusters[:1]

    out_writer = writer_cls(out_path=target_path)
    out_writer.write(processed_record)
    out_writer.close()
def test_integration_protein_annotator(tmpdir):
    """Annotate the first BGC0000015.fa record and check the detected proteins."""
    tmp_prefix = os.path.join(str(tmpdir), 'test')

    fasta_records = SeqIO.parse(get_test_file('BGC0000015.fa'), format='fasta')
    first_record = next(fasta_records)

    ProdigalProteinRecordAnnotator(record=first_record, tmp_path_prefix=tmp_prefix).annotate()

    detected = util.get_protein_features(first_record)
    assert len(detected) == 18

    # Spot-check coordinates and identifiers of the first detected protein
    first_protein = detected[0]
    assert first_protein.location.start == 3
    assert first_protein.location.end == 1824
    assert first_protein.id == 'BGC0000015.1_1'
    assert first_protein.qualifiers.get('locus_tag') == ['BGC0000015.1_BGC0000015.1_1']

    assert_sorted_features(first_record)
def run(self, record):
    """Detect BGC candidates in a record and add them as "cluster" features.

    Steps, all applied to ``record`` in place:

    1. Remove "cluster" features previously added with the same detector label.
    2. Score every Pfam feature with ``self.model`` and store the score in the
       ``self.score_column`` qualifier; each protein gets the mean score of
       its Pfam domains.
    3. Merge consecutive proteins scoring at or above ``self.score_threshold``
       into candidates, joining neighbouring candidates that are separated by
       at most ``self.merge_max_protein_gap`` scored proteins or at most
       ``self.merge_max_nucl_gap`` nucleotides.
    4. Drop candidates failing the ``min_nucl`` / ``min_proteins`` /
       ``min_domains`` / ``min_bio_domains`` filters and append the rest as
       "cluster" features.
    5. Sort the features and store the detector settings in the record's
       ``structured_comment`` annotation.

    Records without Pfam features are skipped with a warning and left untouched.

    :param record: sequence record with protein and Pfam features
    :return: None
    """
    logging.info('Detecting BGCs using %s model in %s', self.detector_label, record.id)

    protein_features = util.get_protein_features(record)
    proteins_by_id = util.get_proteins_by_id(protein_features)
    pfam_features = util.get_pfam_features(record)

    if not pfam_features:
        logging.warning('Warning: No Pfam domains in record %s, skipping BGC detection', record.id)
        return

    # Filter out previous clusters detected with the same detector label
    num_prev_features = len(record.features)
    record.features = [
        f for f in record.features
        if not (f.type == 'cluster' and f.qualifiers.get('detector_label') == [self.detector_label])
    ]
    num_removed_features = num_prev_features - len(record.features)
    if num_removed_features:
        logging.warning('Warning: Removed %s previously detected clusters with same label "%s". '
                        'Use --label DeepBGCMyLabel to preserve original clusters and add second set of clusters detected '
                        'with same model but different parameters.', num_removed_features, self.detector_label)

    # Create DataFrame with Pfam sequence
    pfam_sequence = util.create_pfam_dataframe_from_features(pfam_features, proteins_by_id)

    # Predict BGC score of each Pfam
    pfam_sequence[self.score_column] = self.model.predict(pfam_sequence)

    # Get average BGC score for each protein
    protein_scores = pfam_sequence.groupby('protein_id', sort=False)[self.score_column].mean()

    # Add score to all Pfam features (row i of the DataFrame is matched to feature i)
    pfam_scores = pfam_sequence[self.score_column]
    for i, feature in enumerate(pfam_features):
        feature.qualifiers[self.score_column] = ['{:.5f}'.format(pfam_scores.iloc[i])]

    # Add score to all protein features
    for protein_id, score in protein_scores.items():
        proteins_by_id[protein_id].qualifiers[self.score_column] = ['{:.5f}'.format(score)]

    clusters = []
    active_proteins = []
    gap_proteins = []

    # Create a list of cluster features by merging consecutive proteins with score satisfying given threshold
    # Neighboring clusters within given number of nucleotides/proteins are merged
    for protein in protein_features:
        if self.score_column not in protein.qualifiers:
            # TODO: Should proteins with no Pfam domains also be considered?
            # Current protein did not have any Pfam domains, therefore it has no BGC score, ignore it
            continue
        score = float(protein.qualifiers[self.score_column][0])
        if score < self.score_threshold:
            # Inactive protein, add to gap
            gap_proteins.append(protein)
            # We just changed from active to inactive, add current list of active proteins as a cluster
            if active_proteins:
                clusters.append(active_proteins)
                active_proteins = []
        else:
            # Active protein
            # If no cluster is open, check if we should merge with the previous cluster
            if not active_proteins and clusters:
                prev_cluster_proteins = clusters[-1]
                prev_end = prev_cluster_proteins[-1].location.end
                if len(gap_proteins) <= self.merge_max_protein_gap or \
                        (protein.location.start - prev_end) <= self.merge_max_nucl_gap:
                    # Remove previous candidate and continue where it left off
                    clusters.pop()
                    active_proteins = prev_cluster_proteins + gap_proteins
            # Add current protein to cluster
            active_proteins.append(protein)
            gap_proteins = []

    # Last protein was active, add list of active proteins as a cluster
    if active_proteins:
        clusters.append(active_proteins)

    # Add detected clusters as features
    record_num_detected = 0
    for cluster_proteins in clusters:
        start = cluster_proteins[0].location.start
        end = cluster_proteins[-1].location.end
        candidate_id = '{}_{}-{}.1'.format(record.id, int(start), int(end))

        if self.min_nucl > 1:
            nucl_length = end - start
            if nucl_length < self.min_nucl:
                logging.debug('Skipping cluster %s with %s < %s nucleotides',
                              candidate_id, nucl_length, self.min_nucl)
                continue

        if self.min_proteins > 1:
            num_proteins = len(cluster_proteins)
            if num_proteins < self.min_proteins:
                logging.debug('Skipping cluster %s with %s < %s proteins',
                              candidate_id, num_proteins, self.min_proteins)
                continue

        if self.min_domains > 1 or self.min_bio_domains > 0:
            # NOTE(review): both counts below are taken over the whole record, not over
            # the proteins of this candidate, so every candidate in the record gets the
            # same verdict. Confirm whether per-cluster counts were intended.
            pfam_ids = util.get_pfam_feature_ids(record)
            num_domains = len(pfam_features)
            if num_domains < self.min_domains:
                logging.debug('Skipping cluster %s with %s < %s protein domains',
                              candidate_id, num_domains, self.min_domains)
                continue
            num_bio_domains = len(util.filter_biosynthetic_pfam_ids(pfam_ids))
            if num_bio_domains < self.min_bio_domains:
                logging.debug('Skipping cluster %s with %s < %s known biosynthetic protein domains',
                              candidate_id, num_bio_domains, self.min_bio_domains)
                continue

        # Cluster score is the mean of its proteins' (already rounded) scores
        scores = [float(feature.qualifiers[self.score_column][0]) for feature in cluster_proteins]
        location = FeatureLocation(start, end)
        qualifiers = {
            self.score_column: ['{:.5f}'.format(np.mean(scores))],
            'detector': [self.detector_name],
            'detector_label': [self.detector_label],
            'detector_version': [self.model.version],
            'detector_version_timestamp': [self.model.timestamp],
            'product': ['{}_putative'.format(self.detector_name)],
            'bgc_candidate_id': [candidate_id]
        }
        record.features.append(SeqFeature(
            location=location,
            type="cluster",
            qualifiers=qualifiers
        ))
        record_num_detected += 1
        self.num_detected += 1

    # Sort all features by location
    util.sort_record_features(record)

    # Add detector metadata to the record as a structured comment
    if 'structured_comment' not in record.annotations:
        record.annotations['structured_comment'] = {}

    comment_key = util.format_detector_meta_key(self.detector_label)
    record.annotations['structured_comment'][comment_key] = collections.OrderedDict(
        name=self.detector_name,
        label=self.detector_label,
        version=self.model.version,
        version_timestamp=self.model.timestamp,
        detection_timestamp_utc=datetime.utcnow().isoformat(),
        score_threshold=self.score_threshold,
        merge_max_nucl_gap=self.merge_max_nucl_gap,
        merge_max_protein_gap=self.merge_max_protein_gap,
        min_proteins=self.min_proteins,
        min_domains=self.min_domains,
        min_bio_domains=self.min_bio_domains
    )
    logging.info('Detected %s BGCs using %s model in %s', record_num_detected, self.detector_label, record.id)
def annotate(self):
    """Add Pfam domain features found by HMMER hmmscan to ``self.record``.

    An existing valid domtbl file at ``<tmp_path_prefix>.pfam.domtbl.txt`` is
    reused; otherwise the record's proteins are written to a FASTA file and
    hmmscan is run on them. For every hit, the HSP with the lowest e-value is
    taken and added as a ``PFAM_domain`` feature unless its e-value exceeds
    ``self.max_evalue``. Features are sorted afterwards.

    Returns early with a warning when the record has no protein features.

    :raises ValueError: if a reused hmmscan result contains a protein ID that
        is not present in the record
    """
    protein_features = util.get_protein_features(self.record)
    protein_lookup = util.get_proteins_by_id(protein_features)
    domtbl_path = self.tmp_path_prefix + '.pfam.domtbl.txt'

    if not protein_features:
        logging.warning('No proteins in sequence %s, skipping protein domain detection', self.record.id)
        return

    cached = bool(util.is_valid_hmmscan_output(domtbl_path))
    if cached:
        logging.info('Reusing already existing HMMER hmmscan result: %s', domtbl_path)
    else:
        fasta_path = self.tmp_path_prefix + '.pfam.proteins.fa'
        # hmmscan reads its queries from a FASTA file
        self._write_proteins(protein_features, fasta_path)
        logging.info('Detecting Pfam domains in "%s" using HMMER hmmscan, this might take a while...',
                     self.record.id)
        started_at = datetime.now()
        self._run_hmmscan(fasta_path, domtbl_path)
        logging.info('HMMER hmmscan Pfam detection done in %s', util.print_elapsed_time(started_at))

    # One query per protein that has domain matches
    queries = SearchIO.parse(domtbl_path, 'hmmscan3-domtab')

    # Descriptions come from the Pfam clan TSV
    descriptions = self._get_pfam_descriptions()

    added_count = 0
    unique_pfam_ids = set()
    for query in queries:
        # A stale cache may refer to proteins this record does not contain
        if cached and query.id not in protein_lookup:
            raise ValueError('Found invalid protein ID "{}" in cached HMMER hmmscan result for record "{}", '
                             'disable caching or delete the file: {}'.format(query.id, self.record.id, domtbl_path))
        protein = protein_lookup.get(query.id)

        for hit in query.hits:
            # Represent the hit by its HSP with the lowest e-value
            top_hsp = hit.hsps[np.argmin([hsp.evalue for hsp in hit.hsps])]
            pfam_id = hit.accession
            evalue = float(top_hsp.evalue)
            if evalue > self.max_evalue:
                continue

            qualifiers = {
                'db_xref': [pfam_id],
                'evalue': evalue,
                'locus_tag': [query.id],
                'database': [PFAM_DB_VERSION],
            }
            # Descriptions are looked up by the ID with its last ".suffix" removed
            description = descriptions.get(pfam_id.rsplit('.', 1)[0])
            if description:
                qualifiers['description'] = [description]

            self.record.features.append(SeqFeature(
                location=self._get_pfam_loc(top_hsp.query_start, top_hsp.query_end, protein),
                id=pfam_id,
                type="PFAM_domain",
                qualifiers=qualifiers,
            ))
            added_count += 1
            unique_pfam_ids.add(pfam_id)

    util.sort_record_features(self.record)
    logging.info('Added %s Pfam domains (%s unique PFAM_IDs)', added_count, len(unique_pfam_ids))