def run(self, record):
    cluster_features = util.get_cluster_features(record)
    if not len(cluster_features):
        return
    logging.info('Classifying %s BGCs using %s model in %s',
                 len(cluster_features), self.classifier_name, record.id)

    # Create list of DataFrames with Pfam sequences (one for each cluster)
    cluster_pfam_sequences = []
    for feature in cluster_features:
        cluster_record = util.extract_cluster_record(feature, record)
        cluster_pfam_sequences.append(
            util.create_pfam_dataframe(cluster_record, add_scores=False))

    # Predict class scores for each cluster (one row per cluster)
    class_scores = self.model.predict(cluster_pfam_sequences)

    predicted_classes = []
    # Annotate classes to all cluster features
    for i, feature in enumerate(cluster_features):
        scores = class_scores.iloc[i]

        # Add predicted score for each class
        score_column = util.format_classification_score_column(self.classifier_name)
        feature.qualifiers[score_column] = [util.encode_class_score_string(scores)]

        # Add classes with score over given threshold
        new_classes = list(class_scores.columns[scores >= self.score_threshold])
        class_column = util.format_classification_column(self.classifier_name)
        all_classes = new_classes
        if feature.qualifiers.get(class_column):
            # Merge with classes from a previous annotation pass
            prev_classes = feature.qualifiers.get(class_column)[0].split('-')
            all_classes = sorted(set(all_classes + prev_classes))
        if all_classes:
            feature.qualifiers[class_column] = ['-'.join(all_classes)]
        predicted_classes += new_classes or ['no confident class']

    # Add classifier metadata to the record as a structured comment
    if 'structured_comment' not in record.annotations:
        record.annotations['structured_comment'] = {}
    comment_key = util.format_classifier_meta_key(self.classifier_name)
    record.annotations['structured_comment'][comment_key] = collections.OrderedDict(
        name=self.classifier_name,
        version=self.model.version,
        version_timestamp=self.model.timestamp,
        classification_timestamp_utc=datetime.utcnow().isoformat(),
        score_threshold=self.score_threshold)

    class_counts = pd.Series(predicted_classes).value_counts()
    self.total_class_counts = self.total_class_counts.add(class_counts, fill_value=0)
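# The thresholding and class-merging logic above can be exercised in isolation.
# A minimal sketch: the score values, class names ('Polyketide', 'NRP', 'Terpene')
# and the 0.5 threshold below are illustrative assumptions, not model output.
#
#     import pandas as pd
#
#     # Fake class-score table, one row per cluster, mirroring the shape
#     # returned by self.model.predict() above.
#     class_scores = pd.DataFrame(
#         [[0.92, 0.10], [0.30, 0.75]],
#         columns=['Polyketide', 'NRP'])
#     score_threshold = 0.5
#
#     for i in range(len(class_scores)):
#         scores = class_scores.iloc[i]
#         new_classes = list(class_scores.columns[scores >= score_threshold])
#         # Merge with previously annotated classes, as run() does on re-annotation
#         prev_classes = ['Terpene']  # pretend the feature was annotated before
#         all_classes = sorted(set(new_classes + prev_classes))
#         print('-'.join(all_classes))
#     # cluster 0 -> 'Polyketide-Terpene', cluster 1 -> 'NRP-Terpene'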
def test_unit_writer_single_feature(tmpdir, writer_cls, processed_record):
    out_path = os.path.join(str(tmpdir), 'file.png')
    cds_features = util.get_protein_features(processed_record)
    pfam_features = util.get_pfam_features(processed_record)
    cluster_features = util.get_cluster_features(processed_record)
    processed_record.features = cds_features[:1] + pfam_features[:1] + cluster_features[:1]
    writer = writer_cls(out_path=out_path)
    writer.write(processed_record)
    writer.close()
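# The writer_cls fixture is presumably parametrized over the available writer
# classes, so the test above runs once per writer. A minimal sketch of such a
# fixture; the class names and import path are assumptions about the project
# layout, not confirmed here:
#
#     import pytest
#     from deepbgc.output import BGCGenbankWriter, AntismashJSONWriter  # hypothetical
#
#     @pytest.fixture(params=[BGCGenbankWriter, AntismashJSONWriter])
#     def writer_cls(request):
#         # Each test requesting writer_cls is executed once per writer class
#         return request.param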
def test_integration_pipeline_default(tmpdir, input_file):
    tmpdir = str(tmpdir)
    report_dir = os.path.join(tmpdir, 'report')
    run(['pipeline', '--output', report_dir, get_test_file(input_file)])

    files = os.listdir(report_dir)
    for file in files:
        print(file)
    assert 'README.txt' in files
    assert 'report.bgc.gbk' in files
    assert 'report.bgc.tsv' in files
    assert 'report.full.gbk' in files
    assert 'report.pfam.tsv' in files

    evaluation_dir = os.path.join(report_dir, 'evaluation')
    files = os.listdir(evaluation_dir)
    for file in files:
        print(file)
    assert 'report.bgc.png' in files
    assert 'report.score.png' in files

    records = list(SeqIO.parse(os.path.join(report_dir, 'report.full.gbk'), 'genbank'))
    assert len(records) == 2

    record = records[0]
    cluster_features = util.get_cluster_features(record)
    assert len(cluster_features) >= 1

    record = records[1]
    cluster_features = util.get_cluster_features(record)
    assert len(cluster_features) >= 1

    cluster_records = list(SeqIO.parse(os.path.join(report_dir, 'report.bgc.gbk'), 'genbank'))
    assert len(cluster_records) >= 2
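# The test above only checks that report.bgc.tsv exists; a follow-up check on
# its contents could look like this. A sketch only: the tab separator and the
# one-row-per-cluster assumption about the report schema are not confirmed here.
#
#     import pandas as pd
#
#     clusters = pd.read_csv(os.path.join(report_dir, 'report.bgc.tsv'), sep='\t')
#     # Both parsed records contain at least one cluster, so expect >= 2 rows,
#     # matching the GenBank-based assertions above.
#     assert len(clusters) >= 2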
def write(self, record):
    cluster_features = util.get_cluster_features(record)
    classifier_names = util.get_record_classifier_names(record)
    subregions = []
    protoclusters = []
    for cluster in cluster_features:
        subregion = self._create_cluster_json(cluster, classifier_names=classifier_names)
        subregions.append(subregion)
        # TODO add protocluster?
    self.record_ids.append(record.id)
    self.record_subregions.append(subregions)
    self.record_protoclusters.append(protoclusters)
    for detector_label, meta in util.get_record_detector_meta(record).items():
        for k, v in meta.items():
            self.tool_meta['{}_{}'.format(detector_label, k)] = str(v)
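# _create_cluster_json is not shown in this section. A plausible sketch,
# assuming the antiSMASH side-loaded subregion convention of start/end/label
# fields; the exact schema is an assumption, only the classification qualifier
# keys mirror what run() writes above.
#
#     def _create_cluster_json(self, cluster, classifier_names=()):
#         classes = []
#         for name in classifier_names:
#             column = util.format_classification_column(name)
#             classes += cluster.qualifiers.get(column, [])
#         return {
#             'start': int(cluster.location.start),
#             'end': int(cluster.location.end),
#             'label': '-'.join(classes) or 'putative BGC',
#         }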
def write(self, record):
    clusters = util.get_cluster_features(record)
    for cluster in clusters:
        cluster_record = util.extract_cluster_record(cluster, record)
        SeqIO.write(cluster_record, self.fd, 'genbank')
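# Driving this writer stand-alone would look roughly like the following.
# The class name and out_path argument are assumed from the
# writer_cls(out_path=...) pattern in the unit test above; only the
# write/close protocol is taken from this section.
#
#     writer = BGCGenbankWriter(out_path='clusters.bgc.gbk')  # hypothetical name
#     for record in SeqIO.parse('annotated.full.gbk', 'genbank'):
#         writer.write(record)  # appends one GenBank record per detected cluster
#     writer.close()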