Example #1
    def run(self, record):
        logging.info('Preparing record %s', record.id)

        util.fix_record_locus(record)
        util.fix_duplicate_cds(record)
        util.fix_dna_alphabet(record)

        record_tmp_path = os.path.join(self.tmp_dir_path,
                                       util.sanitize_filename(record.id))
        logging.debug('Using record TMP prefix: %s', record_tmp_path)

        num_proteins = len(util.get_protein_features(record))
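        # Run Prodigal gene detection only if the record does not already contain CDS features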
        if num_proteins:
            logging.info(
                'Sequence already contains %s CDS features, skipping CDS detection',
                num_proteins)
        else:
            protein_annotator = ProdigalProteinRecordAnnotator(
                record=record,
                tmp_path_prefix=record_tmp_path,
                meta_mode=self.prodigal_meta_mode)
            protein_annotator.annotate()

        num_pfams = len(util.get_pfam_features(record))
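        # Run HMMER hmmscan Pfam detection only if the record does not already contain Pfam features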
        if num_pfams:
            logging.info(
                'Sequence already contains %s Pfam features, skipping Pfam detection',
                num_pfams)
        else:
            pfam_annotator = HmmscanPfamRecordAnnotator(
                record=record, tmp_path_prefix=record_tmp_path)
            pfam_annotator.annotate()

        util.sort_record_features(record)
Example #2
def test_integration_prepare_default(tmpdir):
    tmpdir = str(tmpdir)
    outgbk = os.path.join(tmpdir, 'outfile.gbk')
    outtsv = os.path.join(tmpdir, 'outfile.tsv')
    run([
        'prepare', '--output-gbk', outgbk, '--output-tsv', outtsv,
        get_test_file('BGC0000015.fa')
    ])

    records = list(SeqIO.parse(outgbk, 'genbank'))

    assert len(records) == 2

    record = records[0]
    assert_sorted_features(record)
    proteins = util.get_protein_features(record)
    pfams = util.get_pfam_features(record)

    assert len(proteins) == 18
    print([util.get_protein_id(f) for f in proteins])
    assert len(pfams) == 111

    record = records[1]
    assert_sorted_features(record)
    proteins = util.get_protein_features(record)
    pfams = util.get_pfam_features(record)

    assert len(proteins) == 27
    assert len(pfams) == 36

    domains = pd.read_csv(outtsv, sep='\t')
    records = domains.groupby('sequence_id')

    assert len(records) == 2

    record = records.get_group('BGC0000015.1')
    print(record['protein_id'].unique())
    # some of the proteins do not have any Pfam domains so they are not present
    assert len(record['protein_id'].unique()) == 17
    assert len(record) == 111

    record = records.get_group('BGC0000015.2')
    # some of the proteins do not have any Pfam domains so they are not present
    assert len(record['protein_id'].unique()) == 11
    assert len(record) == 36
Example #3
def test_unit_writer_single_feature(tmpdir, writer_cls, processed_record):
    out_path = os.path.join(str(tmpdir), 'file.png')
    cds_features = util.get_protein_features(processed_record)
    pfam_features = util.get_pfam_features(processed_record)
    cluster_features = util.get_cluster_features(processed_record)
    processed_record.features = cds_features[:1] + pfam_features[:1] + cluster_features[:1]
    writer = writer_cls(out_path=out_path)
    writer.write(processed_record)
    writer.close()
Example #4
def test_integration_protein_annotator(tmpdir):
    tmpdir = str(tmpdir)
    tmppath = os.path.join(tmpdir, 'test')
    records = SeqIO.parse(get_test_file('BGC0000015.fa'), format='fasta')
    record = next(records)

    annotator = ProdigalProteinRecordAnnotator(record=record, tmp_path_prefix=tmppath)
    annotator.annotate()
    proteins = util.get_protein_features(record)

    assert len(proteins) == 18

    protein = proteins[0]
    assert protein.location.start == 3
    assert protein.location.end == 1824
    assert protein.id == 'BGC0000015.1_1'
    assert protein.qualifiers.get('locus_tag') == ['BGC0000015.1_BGC0000015.1_1']

    assert_sorted_features(record)
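
Examples #5 and #6 below look CDS features up by protein ID via util.get_proteins_by_id. A minimal usage sketch of that pattern, assumed to continue from the record annotated in Example #4; the key format of the returned dict is not shown above, so the sketch iterates over it rather than assuming specific IDs.

proteins = util.get_protein_features(record)
proteins_by_id = util.get_proteins_by_id(proteins)

# Iterate over the ID -> CDS feature mapping built by util.get_proteins_by_id
for protein_id, cds in proteins_by_id.items():
    print(protein_id, int(cds.location.start), int(cds.location.end))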
Example #5
    def run(self, record):
        logging.info('Detecting BGCs using %s model in %s', self.detector_label, record.id)

        protein_features = util.get_protein_features(record)
        proteins_by_id = util.get_proteins_by_id(protein_features)
        pfam_features = util.get_pfam_features(record)

        if not len(pfam_features):
            logging.warning('Warning: No Pfam domains in record %s, skipping BGC detection', record.id)
            return

        # Filter out previous clusters detected with the same detector label
        num_prev_features = len(record.features)
        record.features = [f for f in record.features if
                           not(f.type == 'cluster' and f.qualifiers.get('detector_label') == [self.detector_label])]
        num_removed_features = num_prev_features - len(record.features)
        if num_removed_features:
            logging.warning('Warning: Removed %s previously detected clusters with the same label "%s". '
                  'Use --label DeepBGCMyLabel to preserve the original clusters and add a second set of clusters '
                  'detected with the same model but different parameters.', num_removed_features, self.detector_label)

        # Create DataFrame with Pfam sequence
        pfam_sequence = util.create_pfam_dataframe_from_features(pfam_features, proteins_by_id)

        # Predict BGC score of each Pfam
        pfam_sequence[self.score_column] = self.model.predict(pfam_sequence)

        # Get average BGC score for each protein
        protein_scores = pfam_sequence.groupby('protein_id', sort=False)[self.score_column].mean()

        # Add score to all Pfam features
        for i, feature in enumerate(pfam_features):
            feature.qualifiers[self.score_column] = ['{:.5f}'.format(pfam_sequence[self.score_column].iloc[i])]

        # Add score to all protein features
        for protein_id, score in protein_scores.items():
            proteins_by_id[protein_id].qualifiers[self.score_column] = ['{:.5f}'.format(score)]

        clusters = []
        active_proteins = []
        gap_proteins = []

        # Create a list of cluster features by merging consecutive proteins with score satisfying given threshold
        # Neighboring clusters within given number of nucleotides/proteins are merged
        for protein in protein_features:
            if self.score_column not in protein.qualifiers:
                # TODO: Should proteins with no Pfam domains also be considered?
                # Current protein did not have any Pfam domains, therefore it has no BGC score, ignore it
                continue
            score = float(protein.qualifiers[self.score_column][0])
            # Inactive protein, add to gap
            if score < self.score_threshold:
                gap_proteins.append(protein)
                # We just changed from active to inactive, add current list of active proteins as a cluster
                if active_proteins:
                    clusters.append(active_proteins)
                    active_proteins = []
            # Active protein
            else:
                # If no cluster is open, check if we should merge with the previous cluster
                if not active_proteins and clusters:
                    prev_cluster_proteins = clusters[-1]
                    prev_end = prev_cluster_proteins[-1].location.end
                    if len(gap_proteins) <= self.merge_max_protein_gap or \
                            (protein.location.start - prev_end) <= self.merge_max_nucl_gap:
                        # Remove previous candidate and continue where it left off
                        clusters = clusters[:-1]
                        active_proteins = prev_cluster_proteins + gap_proteins

                # Add current protein to cluster
                active_proteins.append(protein)
                gap_proteins = []

        # Last protein was active, add list of active proteins as a cluster
        if active_proteins:
            clusters.append(active_proteins)

        # Add detected clusters as features
        record_num_detected = 0
        for cluster_proteins in clusters:
            start = cluster_proteins[0].location.start
            end = cluster_proteins[-1].location.end
            candidate_id = '{}_{}-{}.1'.format(record.id, int(start), int(end))

            if self.min_nucl > 1:
                nucl_length = end - start
                if nucl_length < self.min_nucl:
                    logging.debug('Skipping cluster %s with %s < %s nucleotides', candidate_id, nucl_length, self.min_nucl)
                    continue

            if self.min_proteins > 1:
                num_proteins = len(cluster_proteins)
                if num_proteins < self.min_proteins:
                    logging.debug('Skipping cluster %s with %s < %s proteins', candidate_id, num_proteins, self.min_proteins)
                    continue

            if self.min_domains > 1 or self.min_bio_domains > 0:
                pfam_ids = util.get_pfam_feature_ids(record)
                num_domains = len(pfam_features)
                if num_domains < self.min_domains:
                    logging.debug('Skipping cluster %s with %s < %s protein domains', candidate_id, num_domains, self.min_domains)
                    continue
                num_bio_domains = len(util.filter_biosynthetic_pfam_ids(pfam_ids))
                if num_bio_domains < self.min_bio_domains:
                    logging.debug('Skipping cluster %s with %s < %s known biosynthetic protein domains', candidate_id, num_bio_domains, self.min_bio_domains)
                    continue

            scores = [float(feature.qualifiers[self.score_column][0]) for feature in cluster_proteins]
            location = FeatureLocation(start, end)
            qualifiers = {
                self.score_column: ['{:.5f}'.format(np.mean(scores))],
                'detector': [self.detector_name],
                'detector_label': [self.detector_label],
                'detector_version': [self.model.version],
                'detector_version_timestamp': [self.model.timestamp],
                'product': ['{}_putative'.format(self.detector_name)],
                'bgc_candidate_id': [candidate_id]
            }
            record.features.append(SeqFeature(
                location=location,
                type="cluster",
                qualifiers=qualifiers
            ))
            record_num_detected += 1
            self.num_detected += 1

        # Sort all features by location
        util.sort_record_features(record)

        # Add detector metadata to the record as a structured comment
        if 'structured_comment' not in record.annotations:
            record.annotations['structured_comment'] = {}
        comment_key = util.format_detector_meta_key(self.detector_label)
        record.annotations['structured_comment'][comment_key] = collections.OrderedDict(
            name=self.detector_name,
            label=self.detector_label,
            version=self.model.version,
            version_timestamp=self.model.timestamp,
            detection_timestamp_utc=datetime.utcnow().isoformat(),
            score_threshold=self.score_threshold,
            merge_max_nucl_gap=self.merge_max_nucl_gap,
            merge_max_protein_gap=self.merge_max_protein_gap,
            min_proteins=self.min_proteins,
            min_domains=self.min_domains,
            min_bio_domains=self.min_bio_domains
        )
        logging.info('Detected %s BGCs using %s model in %s', record_num_detected, self.detector_label, record.id)
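
The cluster-merging loop above (consecutive proteins whose score meets the threshold, with a tolerated gap measured in proteins or nucleotides) can be hard to follow inline. Below is a minimal, self-contained sketch of the same idea; merge_clusters is a hypothetical helper used for illustration only, operating on plain (start, end, score) tuples instead of SeqFeature objects.

def merge_clusters(proteins, score_threshold, max_protein_gap=0, max_nucl_gap=0):
    clusters = []
    active = []  # proteins in the currently open cluster
    gap = []     # low-scoring proteins seen since the last active protein
    for start, end, score in proteins:
        if score < score_threshold:
            # Inactive protein: extend the gap and close any open cluster
            gap.append((start, end, score))
            if active:
                clusters.append(active)
                active = []
        else:
            # Re-open the previous cluster if the gap is small enough
            if not active and clusters:
                prev = clusters[-1]
                prev_end = prev[-1][1]
                if len(gap) <= max_protein_gap or (start - prev_end) <= max_nucl_gap:
                    clusters = clusters[:-1]
                    active = prev + gap
            active.append((start, end, score))
            gap = []
    if active:
        clusters.append(active)
    return clusters

# Two high-scoring proteins separated by one low-scoring protein are merged
# into a single cluster when max_protein_gap=1.
print(merge_clusters([(0, 100, 0.9), (120, 200, 0.1), (220, 300, 0.8)],
                     score_threshold=0.5, max_protein_gap=1))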
Example #6
    def annotate(self):

        proteins = util.get_protein_features(self.record)
        proteins_by_id = util.get_proteins_by_id(proteins)
        domtbl_path = self.tmp_path_prefix + '.pfam.domtbl.txt'

        if not proteins:
            logging.warning('No proteins in sequence %s, skipping protein domain detection', self.record.id)
            return

        if util.is_valid_hmmscan_output(domtbl_path):
            cached = True
            logging.info('Reusing already existing HMMER hmmscan result: %s', domtbl_path)
        else:
            cached = False
            protein_path = self.tmp_path_prefix + '.pfam.proteins.fa'

            # Write proteins to fasta file
            self._write_proteins(proteins, protein_path)

            logging.info('Detecting Pfam domains in "%s" using HMMER hmmscan, this might take a while...', self.record.id)
            start_time = datetime.now()
            self._run_hmmscan(protein_path, domtbl_path)

            logging.info('HMMER hmmscan Pfam detection done in %s', util.print_elapsed_time(start_time))

        # Read domain matches in all proteins
        queries = SearchIO.parse(domtbl_path, 'hmmscan3-domtab')

        # Read descriptions from Pfam clan TSV
        pfam_descriptions = self._get_pfam_descriptions()

        # Extract all matched domain hits
        num = 0
        pfam_ids = set()
        for query in queries:
            if cached and query.id not in proteins_by_id:
                raise ValueError('Found invalid protein ID "{}" in cached HMMER hmmscan result for record "{}", '
                                     'disable caching or delete the file: {}'.format(query.id, self.record.id, domtbl_path))
            protein = proteins_by_id.get(query.id)
            for hit in query.hits:
                best_index = np.argmin([hsp.evalue for hsp in hit.hsps])
                best_hsp = hit.hsps[best_index]
                pfam_id = hit.accession
                evalue = float(best_hsp.evalue)
                if evalue > self.max_evalue:
                    continue
                location = self._get_pfam_loc(best_hsp.query_start, best_hsp.query_end, protein)
                qualifiers = {
                    'db_xref': [pfam_id],
                    'evalue': evalue,
                    'locus_tag': [query.id],
                    'database': [PFAM_DB_VERSION],
                }
                short_pfam_id = pfam_id.rsplit('.', 1)[0]
                description = pfam_descriptions.get(short_pfam_id)
                if description:
                    qualifiers['description'] = [description]
                pfam = SeqFeature(
                    location=location,
                    id=pfam_id,
                    type="PFAM_domain",
                    qualifiers=qualifiers
                )
                self.record.features.append(pfam)
                num += 1
                pfam_ids.add(pfam_id)

        util.sort_record_features(self.record)
        logging.info('Added %s Pfam domains (%s unique PFAM_IDs)', num, len(pfam_ids))
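
For reference, the Pfam annotator is driven the same way as the Prodigal annotator in Example #4. A minimal usage sketch based only on the constructor calls shown above; the FASTA path and tmp prefix are placeholders, and the annotator classes' import statements are not shown in the snippets, so they are omitted here as well.

from Bio import SeqIO
from deepbgc import util

record = next(SeqIO.parse('BGC0000015.fa', 'fasta'))  # placeholder input path

# Pfam detection needs CDS features first (see Example #1), so run Prodigal before hmmscan
ProdigalProteinRecordAnnotator(record=record, tmp_path_prefix='/tmp/example').annotate()
HmmscanPfamRecordAnnotator(record=record, tmp_path_prefix='/tmp/example').annotate()

pfams = util.get_pfam_features(record)
print('Detected %d Pfam domains' % len(pfams))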