Example #1
    def run(self, record):
        cluster_features = util.get_cluster_features(record)
        if not cluster_features:
            return

        logging.info('Classifying %s BGCs using %s model in %s',
                     len(cluster_features), self.classifier_name, record.id)

        # Create list of DataFrames with Pfam sequences (one for each cluster)
        cluster_pfam_sequences = []
        for feature in cluster_features:
            cluster_record = util.extract_cluster_record(feature, record)
            cluster_pfam_sequences.append(
                util.create_pfam_dataframe(cluster_record, add_scores=False))

        # Predict BGC score of each Pfam
        class_scores = self.model.predict(cluster_pfam_sequences)

        predicted_classes = []
        # Annotate classes to all cluster features
        for i, feature in enumerate(cluster_features):
            scores = class_scores.iloc[i]
            # Add predicted score for each class
            score_column = util.format_classification_score_column(
                self.classifier_name)
            feature.qualifiers[score_column] = [
                util.encode_class_score_string(scores)
            ]
            # Add classes with score over given threshold
            new_classes = list(
                class_scores.columns[scores >= self.score_threshold])
            class_column = util.format_classification_column(
                self.classifier_name)
            all_classes = new_classes
            if feature.qualifiers.get(class_column):
                prev_classes = feature.qualifiers.get(class_column)[0].split('-')
                all_classes = sorted(list(set(all_classes + prev_classes)))
            if all_classes:
                feature.qualifiers[class_column] = ['-'.join(all_classes)]
            predicted_classes += new_classes or ['no confident class']

        # Add detector metadata to the record as a structured comment
        if 'structured_comment' not in record.annotations:
            record.annotations['structured_comment'] = {}

        comment_key = util.format_classifier_meta_key(self.classifier_name)
        meta = collections.OrderedDict(
            name=self.classifier_name,
            version=self.model.version,
            version_timestamp=self.model.timestamp,
            classification_timestamp_utc=datetime.utcnow().isoformat(),
            score_threshold=self.score_threshold)
        record.annotations['structured_comment'][comment_key] = meta

        class_counts = pd.Series(predicted_classes).value_counts()
        self.total_class_counts = self.total_class_counts.add(class_counts,
                                                              fill_value=0)
Example #2
def test_unit_writer_single_feature(tmpdir, writer_cls, processed_record):
    out_path = os.path.join(str(tmpdir), 'file.png')
    cds_features = util.get_protein_features(processed_record)
    pfam_features = util.get_pfam_features(processed_record)
    cluster_features = util.get_cluster_features(processed_record)
    processed_record.features = (cds_features[:1] + pfam_features[:1] +
                                 cluster_features[:1])
    writer = writer_cls(out_path=out_path)
    writer.write(processed_record)
    writer.close()
Example #3
def test_integration_pipeline_default(tmpdir, input_file):
    tmpdir = str(tmpdir)
    report_dir = os.path.join(tmpdir, 'report')
    run(['pipeline', '--output', report_dir, get_test_file(input_file)])

    files = os.listdir(report_dir)
    for file in files:
        print(file)

    assert 'README.txt' in files
    assert 'report.bgc.gbk' in files
    assert 'report.bgc.tsv' in files
    assert 'report.full.gbk' in files
    assert 'report.pfam.tsv' in files

    evaluation_dir = os.path.join(report_dir, 'evaluation')
    files = os.listdir(evaluation_dir)
    for file in files:
        print(file)

    assert 'report.bgc.png' in files
    assert 'report.score.png' in files

    records = list(
        SeqIO.parse(os.path.join(report_dir, 'report.full.gbk'), 'genbank'))
    assert len(records) == 2

    record = records[0]
    cluster_features = util.get_cluster_features(record)
    assert len(cluster_features) >= 1

    record = records[1]
    cluster_features = util.get_cluster_features(record)
    assert len(cluster_features) >= 1

    cluster_records = list(
        SeqIO.parse(os.path.join(report_dir, 'report.bgc.gbk'), 'genbank'))
    assert len(cluster_records) >= 2
Example #4
    def write(self, record):
        cluster_features = util.get_cluster_features(record)
        classifier_names = util.get_record_classifier_names(record)
        subregions = []
        protoclusters = []
        for cluster in cluster_features:
            subregion = self._create_cluster_json(
                cluster, classifier_names=classifier_names)
            subregions.append(subregion)
            # TODO add protocluster?

        self.record_ids.append(record.id)
        self.record_subregions.append(subregions)
        self.record_protoclusters.append(protoclusters)
        detector_meta = util.get_record_detector_meta(record)
        for detector_label, meta in detector_meta.items():
            for k, v in meta.items():
                self.tool_meta['{}_{}'.format(detector_label, k)] = str(v)
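The nested loop above flattens each detector's metadata into prefixed string keys. A minimal illustration, assuming a hypothetical 'deepbgc' detector label and fields:

tool_meta = {}
detector_meta = {'deepbgc': {'version': '0.1.0', 'score_threshold': '0.5'}}
for detector_label, meta in detector_meta.items():
    for k, v in meta.items():
        tool_meta['{}_{}'.format(detector_label, k)] = str(v)
print(tool_meta)  # {'deepbgc_version': '0.1.0', 'deepbgc_score_threshold': '0.5'}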
Example #5
    def write(self, record):
        clusters = util.get_cluster_features(record)
        for cluster in clusters:
            cluster_record = util.extract_cluster_record(cluster, record)
            SeqIO.write(cluster_record, self.fd, 'genbank')
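The writer above passes an already-open handle (self.fd) to Bio.SeqIO.write, so successive calls append multiple cluster records to one GenBank file. A minimal self-contained sketch, assuming Biopython is installed; the record contents here are made up:

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# 'molecule_type' is required by the GenBank writer in recent Biopython versions
record = SeqRecord(Seq('ATGC'), id='demo', name='demo',
                   annotations={'molecule_type': 'DNA'})
with open('demo.gbk', 'w') as fd:
    SeqIO.write(record, fd, 'genbank')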