def run_glimmerhmm(record: Record) -> None:
    """ Run glimmerhmm on the record, parse the results and add all detected
        genes to the record

        Arguments:
            record: the Record to find genes in, any features found are
                    added to it

        Returns:
            None
    """
    with TemporaryDirectory(change=True):
        # glimmerHMM/gff_parser handles some record names poorly (e.g. leading - or only '.'),
        # so the record is written out under a fixed placeholder id
        orig_id = record.id
        record.id = "input"
        try:
            # Write FASTA file
            fasta_file = write_search_fasta(record)
        finally:
            # always restore the real id, even if writing the FASTA file failed
            record.id = orig_id
        # Run GlimmerHMM
        results_text = run_external(fasta_file)

    # nothing to add if the output mentions no CDS at all
    if "CDS" not in results_text:
        return

    handle = StringIO(results_text)
    # features are keyed by the placeholder id used for the FASTA file
    features = get_features_from_file(handle)["input"]
    for feature in features:
        record.add_biopython_feature(feature)
Beispiel #2
0
 def test_parse_all_multi_cluster(self):
     # hits must be partitioned correctly by cluster number
     raw = self.read_sample_data("data/diamond_output_sample_multicluster.txt")
     clusters_by_number, queries_by_number = core.parse_all_clusters(raw, Record(), 0, 0)
     expected_numbers = [1, 2, 4]
     for by_number in (clusters_by_number, queries_by_number):
         self.assertEqual(len(by_number), 3)
         self.assertEqual(sorted(by_number), expected_numbers)
     # in the sample data, each cluster number has that many entries
     for number in expected_numbers:
         self.assertEqual(len(clusters_by_number[number]), number)
         self.assertEqual(len(queries_by_number[number]), number)
 def test_labyrinthopeptin(self):
     """Test lanthipeptide prediction for labyrinthopeptin"""
     record_path = path.get_full_path(__file__, 'data', 'labyrinthopeptin.gbk')
     record = Record.from_biopython(seqio.read(record_path), taxon="bacteria")
     # the record starts out without any motifs
     assert not record.get_cds_motifs()
     results = run_specific_analysis(record)
     found = self.gather_all_motifs(results)
     assert len(found) == 2
     # running the analysis alone must not have changed the record
     assert not record.get_cds_motifs()
     # only adding the results puts the motifs into the record
     results.add_to_record(record)
     assert len(record.get_cds_motifs()) == 2
def run_on_record(record: Record, results: Optional[SMCOGTreeResults],
                  options: ConfigType) -> SMCOGTreeResults:
    """ Generates phylogeny trees of the classifications made by SMCOGs

        Arguments:
            record: the Record to generate trees for
            results: previous results for the record, if any, which are
                     returned as-is instead of being regenerated
            options: the run configuration, its output_dir is where the
                     "smcogs" directory is placed

        Returns:
            a SMCOGTreeResults instance for the record
    """
    if results and isinstance(results, SMCOGTreeResults):
        return results
    # create the smcogs output directory if required
    relative_output_dir = os.path.relpath(
        os.path.join(options.output_dir, "smcogs"), os.getcwd())
    smcogs_dir = os.path.abspath(relative_output_dir)
    # exist_ok avoids the check-then-create race of os.path.exists() + os.mkdir()
    os.makedirs(smcogs_dir, exist_ok=True)

    nrpspks_genes = record.get_nrps_pks_cds_features()
    # trees are generated from within the smcogs directory
    with path.changed_directory(smcogs_dir):
        trees = generate_trees(smcogs_dir,
                               record.get_cds_features_within_regions(),
                               nrpspks_genes)

    return SMCOGTreeResults(record.id, relative_output_dir, trees)
Beispiel #5
0
def generate_results(record: Record,
                     options: ConfigType) -> ClusterFinderResults:
    """ Find and construct cluster borders

        Rule-based borders are used as found, probabilistic clusters are
        converted to putative "clusterfinder" borders. The borders are only
        added to the record if options.cf_create_clusters is set.

        Arguments:
            record: the Record to search for borders
            options: the run configuration

        Returns:
            a ClusterFinderResults instance holding all borders found
    """
    rule_based = find_rule_based_clusters(record, options)
    probabilistic = find_probabilistic_clusters(record, options)

    borders = list(rule_based)
    borders += [
        ClusterBorder(prob.location, tool="clusterfinder",
                      probability=prob.probability,
                      product=PUTATIVE_PRODUCT,
                      high_priority_product=False)
        for prob in probabilistic
    ]

    should_create = options.cf_create_clusters
    if should_create:
        for border in borders:
            record.add_cluster_border(border)
    return ClusterFinderResults(record.id, borders, create=should_create)
Beispiel #6
0
def generate_pfam2go_tooltip(record: Record, feature: CDSFeature) -> List[html_renderer.Markup]:
    """ Create tooltip text for Pfam to Gene Ontologies results.

        Arguments:
            record: the Record containing the Pfam domains
            feature: the CDS feature to build the tooltip for

        Returns:
            a list of Markup instances, ordered by Pfam identifier
    """
    # one entry per Pfam identifier, limited to those with gene ontologies
    gos_by_pfam = {
        pfam.full_identifier: pfam.gene_ontologies
        for pfam in record.get_pfam_domains_in_cds(feature)
        if pfam.gene_ontologies
    }
    notes = []
    for pfam_id in sorted(gos_by_pfam):
        notes.extend(build_pfam2go_links(gos_by_pfam[pfam_id], prefix=f"{pfam_id}: "))
    return [html_renderer.Markup(note) for note in notes]
Beispiel #7
0
def store_promoters(promoters: Iterable[Promoter], record: Record) -> None:
    """ Converts each promoter into a "promoter" feature and adds it to the record

        Arguments:
            promoters: the promoters to store
            record: the record to add the new features to

        Returns:
            None
    """
    for promoter in promoters:
        # the start is shifted to account for 0-indexed locations, but never below zero
        start = max(0, promoter.start - 1)
        location = FeatureLocation(start, promoter.end)
        biopython_feature = SeqFeature(location, type="promoter")

        qualifiers = {
            "locus_tag": promoter.get_gene_names(),  # already a list with one or two elements
            "seq": [str(promoter.seq)],
        }
        if isinstance(promoter, CombinedPromoter):
            qualifiers["note"] = ["bidirectional promoter"]
        biopython_feature.qualifiers = qualifiers

        feature = Feature.from_biopython(biopython_feature)
        feature.created_by_antismash = True
        record.add_feature(feature)
def write_search_fasta(record: Record) -> str:
    """ Writes the record, in FASTA format, to a file in the current
        directory. The file is named after the record's id, with a
        ".fasta" extension.

        Arguments:
            record: the Record to write

        Returns:
            the name of the file created
    """
    fasta_name = "{}.fasta".format(record.id)
    with open(fasta_name, 'w') as fasta_handle:
        biopython_records = [record.to_biopython()]
        seqio.write(biopython_records, fasta_handle, 'fasta')
    return fasta_name
Beispiel #9
0
 def test_result_conversion(self):
     # results must survive a round trip through JSON unchanged
     def as_raw_json(module_results):
         return json.loads(json.dumps(module_results.to_json()))

     nisin = Record.from_genbank(helpers.get_path_to_nisin_with_detection())[0]
     output_path = path.get_full_path(__file__, "data", "nisin.out")
     with open(output_path) as handle:
         trimmed_output = handle.read()
     # the diamond search is replaced by the stored output
     with patch.object(subprocessing, "run_diamond_search", return_value=trimmed_output):
         results = cluster_compare.run_on_record(nisin, None, self.options)
     assert results.by_database["MIBiG"].by_region[1]
     # ensure JSON conversion of results gives the same result
     raw = as_raw_json(results)
     regenerated = cluster_compare.regenerate_previous_results(raw, nisin, self.options)
     assert as_raw_json(regenerated) == raw
 def test_sco_cluster3(self):
     """Test lanthipeptide prediction for SCO cluster #3"""
     record_path = path.get_full_path(__file__, 'data', 'sco_cluster3.gbk')
     record = Record.from_biopython(seqio.read(record_path), taxon="bacteria")
     # the record starts out without any motifs
     assert not record.get_cds_motifs()
     results = run_specific_analysis(record)
     found = self.gather_all_motifs(results)
     assert len(found) == 1
     # running the analysis alone must not have changed the record
     assert not record.get_cds_motifs()
     # only adding the results puts the motif into the record
     results.add_to_record(record)
     assert len(record.get_cds_motifs()) == 1
     self.assertEqual('Class I', found[0].peptide_subclass)
def check_content(sequence: Record) -> Record:
    """ Checks that a record holds a nucleotide sequence. Records that don't
        have their skip flag marked, all others have their alphabet set to
        generic DNA.

        Every CDS feature of the record is required to have a translation.

        Arguments:
            sequence: the Record instance to check

        Returns:
            the Record instance provided
    """
    # all CDS features are expected to have a translation by this point
    assert all(cds.translation for cds in sequence.get_cds_features())

    has_nucleotide_alphabet = isinstance(sequence.seq.alphabet,
                                         Bio.Alphabet.NucleotideAlphabet)
    # the sequence content is only inspected if the alphabet doesn't already settle it
    if has_nucleotide_alphabet or is_nucl_seq(sequence.seq):
        sequence.seq.alphabet = Bio.Alphabet.generic_dna
        return sequence

    logging.error("Record %s is a protein record, skipping.", sequence.id)
    sequence.skip = "protein record"
    return sequence
Beispiel #12
0
def convert_tta_codons(tta_codons: List[Feature], record: Record) -> List[Dict[str, Any]]:
    """ Convert found TTA codon features to JSON

        Arguments:
            tta_codons: the TTA codon features to convert
            record: the Record used to find the CDS features containing or
                    overlapping each codon

        Returns:
            a list with one dictionary per codon, in the order given
    """
    def as_json(codon):
        location = codon.location
        containing = record.get_cds_features_within_location(location, with_overlapping=True)
        return {
            # start is shifted by one relative to the feature location
            'start': location.start + 1,
            'end': location.end,
            # codons without a strand are reported as forward strand
            'strand': 1 if codon.strand is None else codon.strand,
            'containedBy': [cds.get_name() for cds in containing],
        }

    return [as_json(codon) for codon in tta_codons]
Beispiel #13
0
def run_and_regenerate_results_for_module(input_file,
                                          module,
                                          options,
                                          expected_record_count=1,
                                          callback=None):
    """ Runs antismash end to end over the given file with the given options
        and returns the given modules regenerated results

        if callback is supplied, it will be called with the output directory path
        as an argument before the output directory is cleared

        Arguments:
            input_file: the path of the file to run antismash over
            module: the module to regenerate results for
            options: the antismash config to run with
            expected_record_count: the number of records expected in the output
            callback: an optional function taking the output directory path

        Returns:
            the regenerated results of the module if a single record is
            expected, otherwise a list of regenerated results, one per record
    """
    with TemporaryDirectory(change=True) as tempdir:
        orig_output = options.output_dir
        update_config({"output_dir": tempdir})
        # whatever happens during the run, the original output directory
        # has to be restored in the config
        try:
            base_filename = os.path.join(
                options.output_dir,
                os.path.basename(input_file).rsplit('.', 1)[0])
            json_filename = base_filename + ".json"
            assert not os.path.exists(json_filename)
            antismash.main.run_antismash(input_file, options)
        finally:
            update_config({"output_dir": orig_output})
        results = serialiser.AntismashResults.from_file(json_filename)
        # remove things that were added by results, because otherwise the add isn't tested by detection
        # result regeneration
        for record in results.records:
            record.strip_antismash_annotations()
        if callback:
            callback(tempdir)
        # and while the genbank output still exists, grab that and check it's readable
        assert len(Record.from_genbank(base_filename +
                                       ".gbk")) == expected_record_count
    # not the responsibility of modules, but if it's wrong then everything is
    assert len(results.results) == expected_record_count
    assert len(results.records) == expected_record_count
    # ensure all detection stages add their relevant parts
    modules_to_regenerate = antismash.main.get_detection_modules()
    final = []
    for record, rec_results in zip(results.records, results.results):
        regenerate_results_for_record(record, options, modules_to_regenerate,
                                      module, rec_results)
        # post (other) detection has run, regenerate (since they may need regions etc)
        final.append(
            module.regenerate_previous_results(
                rec_results.get(module.__name__), record, options))
    for res in final:
        assert isinstance(res, ModuleResults)
    if expected_record_count == 1:
        return final[0]
    return final
Beispiel #14
0
def find_all_orfs(record: Record,
                  cluster: Optional[Cluster] = None) -> List[CDSFeature]:
    """ Finds all ORFs reported by scan_orfs(), on either strand, that don't
        overlap with existing CDS features.

        Can (and should) be limited to just within a cluster, in which case
        only ORFs wholly contained by the cluster are reported.

        Arguments:
            record: the record to search
            cluster: the specific Cluster to search within, or None

        Returns:
            a list of CDSFeatures, one for each ORF
    """
    # default to searching the whole record
    search_seq = record.seq
    shift = 0
    known_cdses = record.get_cds_features()
    if cluster:
        # restrict both the sequence and the overlap checks to the cluster
        shift = cluster.location.start
        search_seq = record.seq[cluster.location.start:cluster.location.end]
        known_cdses = tuple(cluster.cds_children)

    # scan the forward strand, then the reverse strand
    locations = (scan_orfs(search_seq, 1, shift)
                 + scan_orfs(search_seq.reverse_complement(), -1, shift))

    open_ended = (BeforePosition, AfterPosition)
    orfs = []
    for location in locations:
        # within a cluster, ORFs with an open-ended start or end are ignored
        if cluster and (isinstance(location.start, open_ended)
                        or isinstance(location.end, open_ended)):
            continue
        # a throwaway feature is only needed for the overlap checks
        probe = Feature(location, feature_type="dummy")
        if any(probe.overlaps_with(cds) for cds in known_cdses):
            continue

        # ORF numbers follow on from the number of ORFs kept so far
        orf = create_feature_from_location(record, location, len(orfs) + 1)

        # skip if not wholly contained in the cluster
        if cluster and not orf.is_contained_by(cluster):
            continue
        orfs.append(orf)

    return orfs
Beispiel #15
0
def analyse_biosynthetic_order(nrps_pks_features: List[CDSFeature],
                               consensus_predictions: Dict[str, str],
                               record: Record) -> Dict[int, Tuple[str, bool]]:
    """ Predicts the substrate order of every NRPS or PKS cluster in the record,
        choosing between docking domain analysis and colinear ordering based on
        the kinds of modular enzymes in the cluster.

        Arguments:
            nrps_pks_features: all NRPS/PKS features within the record
            consensus_predictions: a dictionary mapping each NRPS/PKS domain name to its prediction
            record: the Record being analysed

        Returns:
            a dictionary mapping cluster number to
                a tuple of
                    prediction string
                    and whether docking domain analysis was used for the prediction
    """
    def is_nrps_or_pks(products) -> bool:
        # "pks" is matched as a substring of any product name
        return "nrps" in products or "pks" in "-".join(products)

    relevant_clusters = [cluster for cluster in record.get_clusters()
                         if is_nrps_or_pks(cluster.products)]

    compound_predictions = {}  # type: Dict[int, Tuple[str, bool]]
    for cluster in relevant_clusters:
        cluster_number = cluster.get_cluster_number()
        cds_in_cluster = [cds for cds in nrps_pks_features
                          if cds.overlaps_with(cluster)]
        # clusters without any NRPS/PKS features get no prediction
        if not cds_in_cluster:
            continue

        pks_count, nrps_count, hybrid_count = find_cluster_modular_enzymes(
            cds_in_cluster)
        pks_only = not (nrps_count or hybrid_count)
        # docking domain analysis is only used for PKS-only clusters with
        # more than three and fewer than eleven PKS features
        docking = bool(3 < pks_count < 11 and pks_only)
        if docking:
            logging.debug(
                "Cluster %d monomer ordering method: domain docking analysis",
                cluster_number)
            geneorder = perform_docking_domain_analysis(cds_in_cluster)
        else:
            logging.debug("Cluster %d monomer ordering method: colinear",
                          cluster_number)
            geneorder = find_colinear_order(cds_in_cluster)

        prediction = generate_substrates_order(geneorder,
                                               consensus_predictions)
        compound_predictions[cluster_number] = (prediction, docking)
    return compound_predictions
    def from_json(json: Dict[str, Any], record: Record) -> "CDSResults":
        """ Rebuilds a CDSResults instance from its JSON representation

            Arguments:
                json: the JSON representation, with the keys "domains",
                      "cds_name" and "definition_domains"
                record: the Record containing the CDS named in the JSON

            Returns:
                the reconstructed CDSResults instance
        """
        domains = [SecMetQualifier.Domain.from_json(raw_domain)
                   for raw_domain in json["domains"]]
        cds = record.get_cds_by_name(json["cds_name"])

        # the values are converted back into sets
        definition_domains = {}
        for key, names in json["definition_domains"].items():
            definition_domains[key] = set(names)

        return CDSResults(cds, domains, definition_domains)
Beispiel #17
0
def annotate_domains(record: Record) -> None:
    """ Finds NRPS/PKS domains in all CDS features within clusters and
        annotates the features with them. Both the type and the domains of
        each feature's `nrps_pks` member are updated. Where motifs are found
        for a CDS with domains, CDSMotif features are created and added to
        both the record and the CDS.

        Arguments:
            record: the secmet.Record of which to annotate CDS features

        Returns:
            None
    """
    candidates = record.get_cds_features_within_clusters()
    assert candidates  # because every cluster should have genes

    fasta = get_fasta_from_features(candidates)
    domains_by_cds = find_domains(fasta, record)
    motifs_by_cds = find_ab_motifs(fasta)

    for cds in candidates:
        name = cds.get_name()
        found_domains = domains_by_cds.get(name)
        # without domains, nothing is annotated, not even motifs
        if not found_domains:
            continue

        # the classification is based on the hit ids of all domains in the CDS
        cds.nrps_pks.type = classify_feature([domain.hit_id for domain in found_domains])
        for domain in found_domains:
            cds.nrps_pks.add_domain(domain)

        # construct motif features
        found_motifs = motifs_by_cds.get(name)
        if found_motifs:
            new_features = generate_motif_features(record, cds, found_motifs)
            for motif_feature in new_features:
                record.add_cds_motif(motif_feature)
            cds.motifs.extend(new_features)
Beispiel #18
0
def get_cds_lengths(record: secmet.Record) -> Dict[str, int]:
    """ Finds the translation length of every CDS feature in a Record.

        Arguments:
            record: the Record to gather CDS features from

        Returns:
            a dictionary mapping CDS accession to the length of that CDS's
            translation
    """
    return {
        cds.get_accession(): len(cds.translation)
        for cds in record.get_cds_features()
    }
Beispiel #19
0
        def parse_all_wrapper(coverage_threshold, ident_threshold):
            # parse the shared sample data with the given thresholds
            parsed = core.parse_all_clusters(self.sample_data, Record(),
                                             coverage_threshold, ident_threshold)
            clusters_by_number, queries_by_number = parsed
            # make sure we only found one cluster number
            for by_number in (clusters_by_number, queries_by_number):
                self.assertEqual(len(by_number), 1)
                self.assertEqual(list(by_number), [24])

            # the values for that cluster number are what gets tested by callers
            return queries_by_number[24], clusters_by_number[24]
Beispiel #20
0
    def from_json(json: Dict[str, Any], record: secmet.Record) -> "ASFResults":
        """ Rebuilds an ASFResults instance from its JSON representation

            Arguments:
                json: the JSON representation of the results
                record: the Record the results belong to

            Returns:
                the reconstructed ASFResults instance, or None if the schema
                version of the JSON doesn't match the current schema version

            Raises:
                ValueError: if the record id in the JSON doesn't match the
                            id of the given record
        """
        # values are read with get() rather than pop(), so the caller's
        # dictionary is left unmodified
        if ASFResults.schema_version != json.get("schema version"):
            logging.warning("Dropping ASF results, schema version has changed")
            return None
        if record.id != json.get("record id"):
            raise ValueError("ASF results contained mismatching record ids")

        # domains are stored by name and need to be looked up in the record
        pairings = [(record.get_domain_by_name(domain_name), labels)
                    for domain_name, labels in json["pairings"]]

        return ASFResults(record.id, pairings)
Beispiel #21
0
 def add_to_record(self, record: Record) -> None:
     """ Converts each hit into a PFAMDomain and adds it to the given record

         Arguments:
             record: the Record to add the new domains to

         Returns:
             None
     """
     # attributes copied directly from each hit to the new domain
     copied_attributes = ("label", "locus_tag", "domain", "evalue", "score",
                          "translation")
     db_version = pfamdb.get_db_version_from_path(self.database)
     # domain ids are numbered from one, in hit order
     for index, hit in enumerate(self.hits, 1):
         protein_location = FeatureLocation(hit.protein_start,
                                            hit.protein_end)
         location = location_from_string(hit.location)
         domain = PFAMDomain(location,
                             description=hit.description,
                             protein_location=protein_location,
                             identifier=hit.identifier,
                             tool=self.tool,
                             locus_tag=hit.locus_tag)
         for attribute in copied_attributes:
             setattr(domain, attribute, getattr(hit, attribute))
         domain.database = db_version
         domain.detection = "hmmscan"
         domain.domain_id = "{}_{}_{:04d}".format(
             self.tool, domain.locus_tag, index)
         record.add_pfam_domain(domain)
Beispiel #22
0
def run_specific_analysis(record: Record,
                          options: ConfigType) -> LanthiResults:
    """ Runs the full lanthipeptide analysis over the given record

        Arguments:
            record: the Record instance to analyse
            options: the run configuration, passed on to RRE_main

        Returns:
            A populated LanthiResults object
    """
    # domains that mark core biosynthetic enzymes, identical for all clusters
    core_domain_names = {
        'Lant_dehydr_N', 'Lant_dehydr_C', 'DUF4135', 'Pkinase'
    }
    results = LanthiResults(record.id)
    counter = 0
    for cluster in record.get_protoclusters():
        if cluster.product != 'lanthipeptide':
            continue

        # find core biosynthetic enzyme locations
        core_genes = []
        for gene in cluster.cds_children:
            if not gene.sec_met:
                continue
            # We seem to hit Lant_dehydr_C on some O-Methyltranferases that also hit PCMT
            if 'PCMT' in gene.sec_met.domain_ids:
                continue
            if core_domain_names.intersection(gene.sec_met.domain_ids):
                core_genes.append(gene)

        precursor_candidates = find_lan_a_features(cluster)
        # Find candidate ORFs that are not yet annotated,
        # only those with translations shorter than 80 are candidates
        extra_orfs = all_orfs.find_all_orfs(record, cluster)
        precursor_candidates.extend(orf for orf in extra_orfs
                                    if len(orf.translation) < 80)

        for gene in core_genes:
            neighbours = find_neighbours_in_range(gene, precursor_candidates)
            if not neighbours:
                continue
            run_lanthi_on_genes(record, gene, cluster, neighbours, results)

        # Analyze the cluster with RREfinder
        counter += 1
        name = '%s_%s_%s' % (record.id, cluster.product, counter)
        RRE_main(cluster, results, name, options)

    # count the motifs themselves: if motifs are held in a mapping, iterating
    # over it directly would sum the lengths of its keys instead
    motifs_by_locus = results.motifs_by_locus
    if hasattr(motifs_by_locus, "values"):
        motif_groups = motifs_by_locus.values()
    else:
        motif_groups = motifs_by_locus
    logging.debug("Lanthipeptide module marked %d motifs",
                  sum(map(len, motif_groups)))
    return results
Beispiel #23
0
    def add_to_record(self, record: Record) -> None:
        """ Saves the predictions in the record: SMILES structures are set on
            candidate clusters, and substrate specificity predictions are
            added to the domain features of NRPS/PKS CDS features

            Arguments:
                record: the Record to annotate

            Returns:
                None
        """
        # candidate clusters get their predicted structure
        for predictions in self.region_predictions.values():
            for prediction in predictions:
                assert isinstance(prediction, CandidateClusterPrediction), type(prediction)
                candidate = record.get_candidate_cluster(prediction.candidate_cluster_number)
                candidate.smiles_structure = prediction.smiles

        for cds in record.get_nrps_pks_cds_features():
            assert cds.region, "CDS parent region removed since analysis"
            for domain in cds.nrps_pks.domains:
                feature = record.get_domain_by_name(domain.feature_name)
                assert isinstance(feature, AntismashDomain)

                # predictions are rebuilt from scratch each time
                domain.predictions.clear()
                kind = domain.name
                if kind in ("AMP-binding", "A-OX"):
                    self._annotate_a_domain(domain)
                elif kind == "PKS_AT":
                    in_transat_region = "transatpks" in cds.region.products
                    self._annotate_at_domain(domain, in_transat_region)
                elif kind == "CAL_domain":
                    self._annotate_cal_domain(domain)
                elif kind == "PKS_KR":
                    self._annotate_kr_domain(domain)
                # otherwise one of many without prediction methods/relevance (PCP, Cglyc, etc)

                for method, pred in domain.predictions.items():
                    feature.specificity.append("%s: %s" % (method, pred))

                # domains with a known type mapping also get their type and subtype set
                mapping = DOMAIN_TYPE_MAPPING.get(domain.name)
                if not mapping:
                    continue
                feature.domain_subtype = domain.name
                feature.domain = mapping
Beispiel #24
0
def run_on_record(record: Record, results: Optional[SMCOGResults],
                  options: ConfigType) -> SMCOGResults:
    """ Classifies gene functions and, if requested, generates phylogeny trees
        of the classifications

        Arguments:
            record: the Record to analyse
            results: previous results for the record, if any, in which case
                     classification results are not regenerated
            options: the run configuration

        Returns:
            a SMCOGResults instance, the same instance as given if results
            were provided
    """
    relative_output_dir = os.path.relpath(
        os.path.join(options.output_dir, "smcogs"), os.getcwd())
    smcogs_dir = os.path.abspath(relative_output_dir)
    # exist_ok avoids the check-then-create race of os.path.exists() + os.mkdir()
    os.makedirs(smcogs_dir, exist_ok=True)

    genes = None
    hmm_results = None

    if not results:
        results = SMCOGResults(record.id)

        genes = record.get_cds_features_within_clusters()
        hmm_results = classify_genes(genes)
        for gene in genes:
            gene_name = gene.get_name()
            hits = hmm_results.get(gene_name)
            if not hits:
                continue
            # the first hit is used as the best hit
            results.best_hits[gene_name] = hits[0]
        write_smcogs_file(hmm_results, genes,
                          record.get_nrps_pks_cds_features(), options)

    if not results.tree_images and options.smcogs_trees:
        if hmm_results is None:
            # previous results were supplied, so nothing was classified above,
            # but tree generation still needs the genes and their hits
            genes = record.get_cds_features_within_clusters()
            hmm_results = classify_genes(genes)
        results.relative_tree_path = relative_output_dir
        # NOTE(review): the original fetched record.get_nrps_pks_cds_features()
        # here and immediately replaced it with an empty list; the effective
        # behaviour (empty list) is kept, confirm which was intended
        nrpspks_genes = []  # type: list
        original_dir = os.getcwd()
        os.chdir(smcogs_dir)
        try:
            results.tree_images = generate_trees(smcogs_dir, hmm_results, genes,
                                                 nrpspks_genes)
        finally:
            # never leave the process in the smcogs directory, even on failure
            os.chdir(original_dir)

    return results
Beispiel #25
0
def generate_pfam2go_tooltip(record: Record, feature: CDSFeature) -> List[html_renderer.Markup]:
    """ Create tooltip text for Pfam to Gene Ontologies results.

        Arguments:
            record: the Record containing the Pfam domains
            feature: the CDS feature to build the tooltip for

        Returns:
            a list of Markup instances, one for each gene ontology entry of
            each Pfam identifier, ordered by Pfam identifier
    """
    go_url = 'http://amigo.geneontology.org/amigo/term/'
    go_info_line = "{pf_id}: <a class='external-link' href='{url}{go_id}' target='_blank'>{go_id}</a>: {go_desc}"

    # one entry per Pfam identifier, limited to those with gene ontologies
    gos_by_pfam = {
        pfam.full_identifier: pfam.gene_ontologies
        for pfam in record.get_pfam_domains_in_cds(feature)
        if pfam.gene_ontologies
    }

    notes = []
    for pfam_id, qualifier in sorted(gos_by_pfam.items()):
        notes.extend(
            go_info_line.format(pf_id=pfam_id, url=go_url, go_id=go_id, go_desc=description)
            for go_id, description in sorted(qualifier.go_entries.items())
        )
    return [html_renderer.Markup(note) for note in notes]
def convert_record(record: secmet.Record,
                   fasta: IO,
                   skip_contig_edge: bool = True) -> Dict[str, Any]:
    """ Converts the regions of a record with convert_region()

        Arguments:
            record: the Record to convert regions of
            fasta: a handle passed on to convert_region() for each region
            skip_contig_edge: whether to skip regions on a contig edge

        Returns:
            a dictionary with the converted regions under "regions" and
            the CDS mapping shared by all regions under "cds_mapping"
    """
    regions = []  # type: list
    cds_mapping = {}  # type: Dict[str, Any]
    # the mapping and the counter are shared between all regions of the record
    cds_index = Counter()
    for region in record.get_regions():
        if region.contig_edge and skip_contig_edge:
            continue
        regions.append(convert_region(region, cds_mapping, cds_index, fasta))
    return {
        "regions": regions,
        "cds_mapping": cds_mapping,
    }
Beispiel #27
0
def blastparse(blasttext: str, record: secmet.Record, min_seq_coverage: float = -1.,
               min_perc_identity: float = -1.) -> Tuple[Dict[str, Query], Dict[str, List[Query]]]:
    """ Parses blast output into a usable form, limiting to a single best hit
        for every query. Results can be further trimmed by minimum thresholds of
        both coverage and percent identity.

        Arguments:
            blasttext: the output from diamond in blast format
            record: used to get all gene ids in the cluster, and used as a
                    backup to fetch sequence length if missing from seqlengths
            min_seq_coverage: the exclusive lower bound of sequence coverage for a match
            min_perc_identity: the exclusive lower bound of identity similarity for a match

        Returns:
            a tuple of
                a dictionary mapping query id to Query instance
                a dictionary mapping the gene cluster of each hit to
                    a list of Query instances with hits in that cluster
    """
    seqlengths = get_cds_lengths(record)
    names = set(cds.get_name() for cds in record.get_cds_features_within_clusters())
    queries = OrderedDict()  # type: Dict[str, Query]
    clusters = OrderedDict()  # type: Dict[str, List[Query]]
    blastlines = remove_duplicate_hits([line.split("\t") for line in blasttext.rstrip().split("\n")])

    for tabs in blastlines:
        query = tabs[0]
        subject = parse_subject(tabs, seqlengths, names, record)

        # only process the pairing if limits met
        if subject.perc_ident <= min_perc_identity \
                or subject.perc_coverage <= min_seq_coverage:
            continue

        # always look the query up by its id, so that the hit is linked to the
        # right query even if lines of different queries are interleaved
        current_query = queries.get(query)
        if current_query is None:
            current_query = Query(query, len(queries))
            queries[query] = current_query

        clusters.setdefault(subject.genecluster, []).append(current_query)

        # link the subject to the query
        current_query.add_subject(subject)

    return queries, clusters
Beispiel #28
0
def specific_analysis(record: secmet.Record) -> SactiResults:
    """ Searches every sactipeptide protocluster for precursors, checking both
        the existing CDS features of the cluster and any unannotated ORFs
        within it. ORFs found to contain a precursor are tracked as new
        CDS features in the results.

        Arguments:
            record: the Record to analyse

        Returns:
            a SactiResults instance holding all found precursors and new ORFs
    """
    results = SactiResults(record.id)
    motif_count = 0
    new_feature_hits = 0
    sacti_clusters = (cluster for cluster in record.get_protoclusters()
                      if cluster.product == 'sactipeptide')
    for cluster in sacti_clusters:
        # Find candidate ORFs that are not yet annotated
        new_orfs = all_orfs.find_all_orfs(record, cluster)
        orf_fasta = fasta.get_fasta_from_features(new_orfs)
        annotate_orfs(new_orfs, run_non_biosynthetic_phmms(orf_fasta))

        # both existing CDS features and the new ORFs are evaluated
        candidates = list(cluster.cds_children) + new_orfs
        domains = get_detected_domains(cluster)

        for candidate in candidates:
            motif = run_sactipred(cluster, candidate, domains)
            if motif is None:
                continue

            name = candidate.get_name()
            results.motifs_by_locus[name].append(motif)
            motif_count += 1
            results.clusters[cluster.get_protocluster_number()].add(name)
            # candidates without a region are those that came from all_orfs
            if candidate.region is None:
                results.new_cds_features.add(candidate)
                new_feature_hits += 1

    if motif_count:
        verb = "is" if new_feature_hits == 1 else "are"
        logging.debug(
            "Found %d sactipeptide motif(s) in %d feature(s), %d of which %s new",
            motif_count, len(results.motifs_by_locus), new_feature_hits, verb)
    else:
        logging.debug("Found no sactipeptide motifs")
    return results
Beispiel #29
0
def ensure_cds_info(single_entry: bool, genefinding: Callable[[Record, Any], None], sequence: Record) -> Record:
    """ Makes sure the given record has CDS features. If it has none, they are
        taken from a GFF3 file if one was provided in the antismash options,
        otherwise they are found by genefinding, unless the genefinding tool
        option is "none".

        Records already marked to be skipped are left untouched.
        Records still without CDS features afterwards will have their skip
        flag marked.

        Arguments:
            single_entry: whether gff_parser can ignore mismatching record ids
                          provided there's only one record provided here and in
                          the GFF file
            genefinding: the relevant run_on_record(record, options) function to
                         use for finding genes if no GFF file being used
            sequence: the Record instance to ensure CDS features for

        Returns:
            the Record instance provided
    """
    options = get_config()
    # nothing to do for skipped records or those that already have genes
    if sequence.skip or sequence.get_cds_features():
        return sequence

    if options.genefinding_gff3:
        logging.info("No CDS features found in record %r but GFF3 file provided, running GFF parser.", sequence.id)
        gff_parser.run(sequence, single_entry, options)
        if not sequence.get_cds_features():
            logging.warning("Record %s has no genes even after running GFF parser, skipping.", sequence.id)
            sequence.skip = "No genes found"
            return sequence
    elif options.genefinding_tool != "none":
        logging.info("No CDS features found in record %r, running gene finding.", sequence.id)
        genefinding(sequence, options)

    # whichever way genes were looked for, a record without any is skipped
    if not sequence.get_cds_features():
        logging.info("No genes found, skipping record")
        sequence.skip = "No genes found"
    return sequence
def specific_analysis(record: Record, results: NRPS_PKS_Results,
                      options: ConfigType) -> NRPS_PKS_Results:
    """ Runs the various NRPS/PKS analyses on a record and adds what they find
        to the given results

        Arguments:
            record: the Record to analyse
            results: the NRPS_PKS_Results instance to populate
            options: the run configuration, passed on to NRPSPredictor2

        Returns:
            the NRPS_PKS_Results instance provided
    """
    genes = record.get_nrps_pks_cds_features()
    if not genes:
        logging.debug("No NRPS or PKS genes found, skipping analysis")
        return results

    a_domains = get_a_domains_from_cds_features(record, genes)
    if a_domains:
        logging.info("Predicting A domain substrate specificities with NRPSPredictor2")
        predictor_results = run_nrpspredictor(a_domains, options)
        results.add_method_results("NRPSPredictor2", predictor_results)

    #TODO: add call to run_siderophore_predictions

    for method, method_results in run_pks_substr_spec_predictions(genes).items():
        results.add_method_results(method, method_results)

    # the consensus is built from the predictions of all methods added so far
    consensus, consensus_transat = calculate_consensus_prediction(
        genes, results.domain_predictions)
    results.consensus = consensus
    results.consensus_transat = consensus_transat

    #TODO: add call to run_lipopptide_predictions

    # predictions are stored by the region containing their candidate cluster
    for prediction in analyse_biosynthetic_order(genes, results.consensus, record):
        candidate = record.get_candidate_cluster(prediction.candidate_cluster_number)
        region = candidate.parent
        assert isinstance(region, Region), type(region)
        results.region_predictions[region.get_region_number()].append(prediction)
    return results