Ejemplo n.º 1
0
    def test_check_cluster_predictions(self):
        """ Checked predictions should carry promoter names for both markers
            along with the gene and promoter counts
        """
        def build_prediction():
            # both markers use identical motifs, only the gene names differ
            return ClusterPrediction(
                ClusterMarker("gene1", Motif(3, 3, score=1)),
                ClusterMarker("gene4", Motif(3, 3, score=1)))

        record = create_fake_record()
        known_promoters = [
            Promoter("gene1", 1, 5),
            Promoter("gene2", 6, 10),
            CombinedPromoter("gene3", "gene4", 11, 15),
        ]
        # see captured logging
        skipped_genes = [Gene(FeatureLocation(1, 5), locus_tag="gene5")]

        # the input and the expectation must be separate objects
        predictions = [build_prediction()]
        wanted = build_prediction()
        wanted.start.promoter = "gene1"
        wanted.end.promoter = "gene3+gene4"
        wanted.genes = 4
        wanted.promoters = 3

        checked = check_cluster_predictions(predictions, record,
                                            known_promoters, skipped_genes)
        assert checked == [wanted]
Ejemplo n.º 2
0
def create_cluster_borders(anchor: str, clusters: List[ClusterPrediction],
                           record: Record) -> List[ClusterBorder]:
    """ Create the predicted ClusterBorders

        Arguments:
            anchor: the name of the anchor gene, stored in each border's
                    "anchor" qualifier and mentioned in its note
            clusters: the predictions to convert; the first is noted as the
                      best prediction, every later one as an alternative
            record: the record whose genes supply the border coordinates

        Returns:
            a list with one ClusterBorder per prediction, in the given order
            (empty if no predictions were given)

        Raises:
            ValueError: if the first or last gene of a prediction cannot be
                        found by name among the record's genes
    """
    if not clusters:
        return []
    borders = []
    for i, cluster in enumerate(clusters):
        # cluster borders returned by hmmdetect are based on CDS features
        # in contrast, cluster borders returned by cassis are based on gene features
        # --> hmmdetect derived clusters have exact locations, like the CDSs have
        # --> cassis derived clusters may have fuzzy locations, like the genes have
        left_name = cluster.start.gene
        right_name = cluster.end.gene
        left = None
        right = None
        for gene in record.get_genes():
            if gene.get_name() == left_name:
                left = gene
            if gene.get_name() == right_name:
                right = gene
            if left and right:
                break

        # fail with a clear message instead of an AttributeError on None below
        missing = [name for name, found in ((left_name, left),
                                            (right_name, right))
                   if found is None]
        if missing:
            raise ValueError(
                "cluster border gene(s) not found in record: {}".format(
                    missing))

        new_feature = SeqFeature(FeatureLocation(left.location.start,
                                                 right.location.end),
                                 type="cluster_border")
        new_feature.qualifiers = {
            "aStool": ["cassis"],
            "anchor": [anchor],
            "abundance": [cluster.start.abundance + cluster.end.abundance],
            "motif_score":
            ["{:.1e}".format(cluster.start.score + cluster.end.score)],
            "gene_left": [cluster.start.gene],
            "promoter_left": [cluster.start.promoter],
            "abundance_left": [cluster.start.abundance],
            "motif_left": [cluster.start.pairing_string],
            "motif_score_left": ["{:.1e}".format(cluster.start.score)],
            "gene_right": [cluster.end.gene],
            "promoter_right": [cluster.end.promoter],
            "abundance_right": [cluster.end.abundance],
            "motif_right": [cluster.end.pairing_string],
            "motif_score_right": ["{:.1e}".format(cluster.end.score)],
            "genes": [cluster.genes],
            "promoters": [cluster.promoters],
        }

        # only the first prediction is labelled as the best one
        if i == 0:
            new_feature.qualifiers["note"] = [
                "best prediction (most abundant) for anchor gene {}".format(
                    anchor)
            ]
        else:
            new_feature.qualifiers["note"] = [
                "alternative prediction ({}) for anchor gene {}".format(
                    i, anchor)
            ]

        new_feature = ClusterBorder.from_biopython(new_feature)
        borders.append(new_feature)
    return borders
Ejemplo n.º 3
0
    def setUp(self):
        """ Builds a clusterfinder config and a 2000 base dummy record in
            which every CDS feature carries a single PFAM domain covering the
            same location
        """
        options = [
            "--cf-create-clusters",
            "--cf-mean-threshold", "0.6",
            "--cf-min-cds", "5",
            "--cf-min-pfams", "5",
        ]
        self.config = build_config(options,
                                   modules=[clusterfinder],
                                   isolated=True)
        update_config({"enabled_cluster_types": []})

        # one entry per CDS: (start, end, PFAM probability, PFAM identifier)
        domain_specs = [
            (10, 20, 0.1, 'FAKE007'),
            (30, 40, 0.3, 'PF00106'),
            (50, 60, 0.4, 'PF00107'),
            (60, 70, 0.7, 'PF00109'),
            (70, 80, 0.98, 'PF08484'),
            (90, 100, 0.8, 'PF02401'),
            (100, 110, 0.32, 'PF04369'),
            (110, 120, 1.0, 'PF00128'),
            (130, 140, 0.2, 'FAKE234'),
            (500, 505, None, 'FAKE505'),
            (1010, 1020, 0.1, 'FAKE007'),
            (1030, 1040, 0.3, 'PF00106'),
            (1050, 1060, 0.4, 'PF00107'),
            (1060, 1070, 0.7, 'PF00109'),
            (1070, 1080, 0.98, 'PF08484'),
            (1090, 1100, 0.8, 'PF02401'),
            (1100, 1110, 0.32, 'PF04369'),
            (1110, 1120, 1.0, 'PF00128'),
        ]

        self.record = DummyRecord(seq=Seq("A" * 2000))
        for start, end, probability, pfam_id in domain_specs:
            location = FeatureLocation(start, end)
            # the CDS is named after its start coordinate
            cds = CDSFeature(location, locus_tag=str(start))
            self.record.add_cds_feature(cds)

            domain = PFAMDomain(location, "dummy_description")
            domain.db_xref.append(pfam_id)
            domain.probability = probability
            self.record.add_pfam_domain(domain)
Ejemplo n.º 4
0
 def create_border(self, rule_name, start, end):
     """ Builds a "testing" ClusterBorder from start to end whose cutoff and
         extent come from the named rule and whose product is the rule name
     """
     matching_rule = self.rules_by_name[rule_name]
     span = FeatureLocation(start, end)
     return ClusterBorder(span, tool="testing",
                          cutoff=matching_rule.cutoff,
                          extent=matching_rule.extent,
                          product=rule_name)
Ejemplo n.º 5
0
def build_hits(record, hmmscan_results, min_score: float, max_evalue: float,
               database: str) -> List[Dict[str, Any]]:
    """ Builds a description of each acceptable PFAM hit in the given hmmscan
        results

        Arguments:
            record: the record the results were generated from; supplies the
                    CDS name mapping and the sequence used for translations
            hmmscan_results: the results to process, each having an id and
                             a collection of HSPs
            min_score: HSPs with a bitscore at or below this are discarded
            max_evalue: HSPs with an evalue at or above this are discarded
            database: passed to pfamdb.get_pfam_id_from_name() to find the
                      PFAM identifier of each hit

        Returns:
            a list of dictionaries, one per accepted HSP, with the keys:
            start, end, strand, label, locus_tag, domain, evalue, score,
            translation, db_xref, and description
    """
    logging.debug("Generating feature objects for PFAM hits")

    hits = []
    feature_by_id = record.get_cds_name_mapping()

    for result in hmmscan_results:
        for hsp in result.hsps:
            if hsp.bitscore <= min_score or hsp.evalue >= max_evalue:
                continue

            # skip hits on anything that is not a CDS of this record,
            # the lookup below would otherwise raise a KeyError
            if hsp.query_id not in feature_by_id:
                continue

            feature = feature_by_id[hsp.query_id]

            start, end = calculate_start_and_end(feature, hsp)

            # only used to extract the matching part of the record sequence
            dummy_feature = PFAMDomain(
                FeatureLocation(start, end, feature.location.strand),
                description="")
            translation = dummy_feature.extract(record.seq).translate(
                table=feature.transl_table)

            hit = {
                "start": start,
                "end": end,
                "strand": feature.location.strand,
                "label": result.id,
                "locus_tag": feature.locus_tag,
                "domain": hsp.hit_id,
                "evalue": hsp.evalue,
                "score": hsp.bitscore,
                "translation": str(translation),
                "db_xref":
                [pfamdb.get_pfam_id_from_name(hsp.hit_id, database)],
                "description": hsp.hit_description,
            }
            hits.append(hit)
    return hits
Ejemplo n.º 6
0
def find_clusters(
    record: Record, cds_by_cluster_type: Dict[str, Set[str]],
    rules_by_name: Dict[str,
                        rule_parser.DetectionRule]) -> List[ClusterBorder]:
    """ Builds cluster borders around the core genes found for each rule

        Arguments:
            record: the record containing the CDS features
            cds_by_cluster_type: a dictionary mapping rule name to the names
                                 of the CDS features which satisfied that rule
            rules_by_name: a dictionary mapping rule name to the rule itself

        Returns:
            the borders remaining after remove_redundant_borders() is applied
    """
    borders = []  # type: List[ClusterBorder]

    name_to_cds = record.get_cds_name_mapping()

    for product, names in cds_by_cluster_type.items():
        ordered = sorted(name_to_cds[name] for name in names)
        rule = rules_by_name[product]
        cutoff = rule.cutoff
        extent = rule.extent
        # everything but the location is shared by all borders of this rule
        shared = {
            "tool": "rule-based-clusters",
            "cutoff": cutoff,
            "extent": extent,
            "product": product,
        }

        # the first CDS always opens a border of its own
        seed = ordered[0]
        start, end = sorted([seed.location.start, seed.location.end])
        current = ClusterBorder(FeatureLocation(start, end), **shared)
        assert seed.is_contained_by(current)
        assert seed in record.get_cds_features_within_location(
            current.location)
        borders.append(current)

        for cds in ordered[1:]:
            cds_start, cds_end = sorted(
                [cds.location.start, cds.location.end])
            # the current border, widened by the cutoff on both sides
            reach = FeatureLocation(current.location.start - cutoff,
                                    current.location.end + cutoff)
            if not cds.overlaps_with(reach):
                # out of reach, so this CDS opens a new border
                start, end = cds_start, cds_end
                current = ClusterBorder(FeatureLocation(start, end), **shared)
                borders.append(current)
                continue
            # within reach, so the current border grows to include the CDS
            start = min(cds_start, start)
            end = max(cds_end, end)
            current.location = FeatureLocation(start, end)

    for border in borders:
        border.rule = str(rules_by_name[border.product].conditions)
        # borders are clipped to the record and marked when that happens
        if border.location.start < 0:
            border.location = FeatureLocation(0, border.location.end)
            border.contig_edge = True
        if border.location.end > len(record):
            border.location = FeatureLocation(border.location.start,
                                              len(record))
            border.contig_edge = True

    borders = remove_redundant_borders(borders, rules_by_name)

    logging.debug("%d rule-based cluster(s) found in record", len(borders))
    return borders
Ejemplo n.º 7
0
    def new_feature_from_basics(self, start: int, strand: int) -> Feature:
        """ Builds the feature marking a TTA codon and registers both the
            feature and its (start, strand) pair on this instance

            Arguments:
                start: the position at which the three base codon begins
                strand: the strand on which the codon was found

            Returns:
                the newly created feature
        """
        codon_location = FeatureLocation(start, start + 3, strand)
        marker = Feature(codon_location,
                         feature_type="misc_feature",
                         created_by_antismash=True)
        marker.notes.append(
            "tta leucine codon, possible target for bldA regulation")

        self.codon_starts.append((start, strand))
        self.features.append(marker)
        return marker
Ejemplo n.º 8
0
def find_nr_cds(cluster_position: Tuple[int, int], record: Record) -> Tuple[Tuple[int, int], int]:
    """ Counts the CDS features within or overlapping a candidate cluster and
        moves the cluster's ends out to the outermost CDS ends

        Arguments:
            cluster_position: the start and end of the candidate cluster
            record: the record containing the candidate cluster

        Returns:
            a tuple of the (possibly adjusted) cluster position
            and the number of CDS features found
    """
    search_area = FeatureLocation(cluster_position[0], cluster_position[1])
    found = record.get_cds_features_within_location(search_area, with_overlapping=True)

    # nothing to count or to align to
    if not found:
        return cluster_position, 0

    starts = [int(cds.location.start) for cds in found]
    ends = [int(cds.location.end) for cds in found]

    # a CDS starting at the end of the genome and finishing at its start
    # would turn the complete genome into the cluster, so when the found
    # starts include 0 and the found ends include the sequence length,
    # the position is left as it was given
    if 0 in starts and len(record.seq) in ends:
        return cluster_position, len(found)
    return (min(starts), max(ends)), len(found)
Ejemplo n.º 9
0
 def test_classification_with_colon(self):
     """ A ':' inside an SMCOG description must survive into the gene function """
     # the SMCOG id and description share one string with ':' as separator,
     # so a description with its own ':' must not be cut short
     hit_name = "SMCOG1212:sodium:dicarboxylate_symporter"
     feature = CDSFeature(FeatureLocation(0, 100),
                          locus_tag="test",
                          translation="AAA")
     record = helpers.DummyRecord(features=[feature], seq="A" * 100)
     record.add_cluster(helpers.DummyCluster(0, 100))

     smcog_results = SMCOGResults(record.id)
     smcog_results.best_hits[feature.get_name()] = HMMResult(
         hit_name, 0, 100, 2.3e-126, 416)
     smcog_results.add_to_record(record)

     functions = feature.gene_functions.get_by_tool("smcogs")
     assert len(functions) == 1
     expected_prefix = (
         "transport (smcogs) SMCOG1212:sodium:dicarboxylate_symporter"
         " (Score: 416; E-value: 2.3e-126)")
     assert str(functions[0]).startswith(expected_prefix)
Ejemplo n.º 10
0
 def add_to_record(self, record: Record) -> None:
     """ Adds one PFAMDomain feature to the record for every stored hit

         Arguments:
             record: the record to add the features to
     """
     database_version = pfamdb.get_db_version_from_path(self.database)
     # hit values which are copied onto the feature without any changes
     copied_keys = ("label", "locus_tag", "domain", "evalue", "score",
                    "translation", "db_xref")
     for number, hit in enumerate(self.hits, start=1):
         location = FeatureLocation(hit["start"], hit["end"], hit["strand"])
         domain = PFAMDomain(location, description=hit["description"])
         for key in copied_keys:
             setattr(domain, key, hit[key])
         domain.tool = self.tool
         domain.database = database_version
         domain.detection = "hmmscan"
         # identifiers are numbered from 1 and zero-padded to four digits
         domain.domain_id = "{}_{}_{:04d}".format(self.tool,
                                                  domain.locus_tag, number)
         record.add_pfam_domain(domain)
Ejemplo n.º 11
0
def store_promoters(promoters: Iterable[Promoter], record: Record) -> None:
    """ Adds a "promoter" feature to the record for each given promoter

        Arguments:
            promoters: the promoters to store
            record: the record to add the resulting features to
    """
    # NOTE(review): logged at CRITICAL level on every call, which looks like
    # a deliberate reminder rather than a real failure - confirm before changing
    logging.critical("adding promoters based on biopython features")
    for promoter in promoters:
        # the start is moved back by one to be 0-indexed, but never below 0
        begin = max(0, promoter.start - 1)
        biopython_feature = SeqFeature(FeatureLocation(begin, promoter.end),
                                       type="promoter")

        qualifiers = {
            # already a list with one or two elements
            "locus_tag": promoter.get_gene_names(),
            # TODO save string or Seq object?
            "seq": [str(promoter.seq)],
        }
        if isinstance(promoter, CombinedPromoter):
            qualifiers["note"] = ["bidirectional promoter"]
        biopython_feature.qualifiers = qualifiers

        converted = Feature.from_biopython(biopython_feature)
        converted.created_by_antismash = True
        record.add_feature(converted)
Ejemplo n.º 12
0
    def test_merges(self):
        """ Borders added by another tool should be combined with the two
            clusterfinder borders when clusters are created
        """
        clusterfinder.generate_results(self.record, self.config)
        assert len(self.record.get_cluster_borders()) == 2

        # each extra border uses its start coordinate as its product
        extra_spans = [(10, 40), (1040, 1050), (110, 400)]
        for start, end in extra_spans:
            border = ClusterBorder(FeatureLocation(start, end), "testtool", product=str(start))
            self.record.add_cluster_border(border)

        # clusters only exist once explicitly created from the borders
        assert not self.record.get_clusters()
        self.record.create_clusters_from_borders()

        merged = self.record.get_clusters()
        assert len(merged) == 2
        first, second = merged[0], merged[1]

        assert first.location.start == 10
        assert first.location.end == 400
        assert first.products == ("10", "110")

        assert second.location.start == 1030
        assert second.location.end == 1120
        assert second.products == ("1040",)
Ejemplo n.º 13
0
 def __init__(self, positions, probability):
     """ Stores the probability and a location built from the first two
         elements of positions
     """
     begin, finish = positions[0], positions[1]
     self.location = FeatureLocation(begin, finish)
     self.probability = probability
Ejemplo n.º 14
0
def detect_borders_and_signatures(record: Record, signature_file: str,
                                  seeds_file: str, rules_file: str,
                                  filter_file: str,
                                  tool: str) -> RuleDetectionResults:
    """ Searches the record's CDS features with the HMM signatures, applies
        the cluster detection rules to the hits and adds the resulting
        cluster borders to the record.

        Arguments:
            record: the record to analyse
            signature_file: a tab separated file; each row being a single HMM reference
                        with columns: label, description, minimum score cutoff, hmm path
            seeds_file: the file containing all HMM profiles
            rules_file: the file containing all the rules to use for cluster definition
            filter_file: a file containing equivalence sets of HMMs
            tool: the name of the tool providing the HMMs (e.g. clusterfinder, rule_based_clusters)

        Returns:
            a RuleDetectionResults instance holding the per-cluster CDS results,
            or None if no FASTA could be generated from the record
    """
    full_fasta = fasta.get_fasta_from_record(record)
    # without any CDS features there is nothing to search
    if not full_fasta:
        return None

    sig_by_name = {}
    for profile in get_signature_profiles(signature_file):
        sig_by_name[profile.name] = profile
    rules = create_rules(rules_file, set(sig_by_name))

    results = []
    results_by_id = {}  # type: Dict[str, List[HSP]]

    for runresult in run_hmmsearch(seeds_file, full_fasta, use_tempfile=True):
        acc = runresult.accession.split('.')[0]
        for hsp in runresult.hsps:
            # the query id takes priority over the unversioned accession
            sig_name = hsp.query_id if hsp.query_id in sig_by_name else acc
            if sig_name not in sig_by_name:
                raise ValueError(
                    'Failed to find signature for ID %s / ACC %s' %
                    (hsp.query_id, acc))
            sig = sig_by_name[sig_name]
            # only hits scoring above the signature's cutoff are kept
            if hsp.bitscore > sig.cutoff:
                results.append(hsp)
                results_by_id.setdefault(hsp.hit_id, []).append(hsp)

    # Filter results by comparing scores of different models (for PKS systems)
    results, results_by_id = filter_results(results, results_by_id,
                                            filter_file, set(sig_by_name))

    # Filter multiple results of the same model in one gene
    results, results_by_id = filter_result_multiple(results, results_by_id)

    # Use rules to determine gene clusters
    cds_domains_by_cluster, cluster_type_hits = apply_cluster_rules(
        record, results_by_id, rules)

    # Find number of sequences on which each pHMM is based
    num_seeds_per_hmm = get_sequence_counts(signature_file)

    # Save final results to record
    rules_by_name = {rule.name: rule for rule in rules}
    clusters = find_clusters(record, cluster_type_hits, rules_by_name)
    strip_inferior_domains(cds_domains_by_cluster, rules_by_name)

    cds_results_by_cluster = {}
    for cluster in clusters:
        record.add_cluster_border(cluster)
        # CDS features are gathered from the border widened by its extent
        search_area = FeatureLocation(
            cluster.location.start - cluster.extent,
            cluster.location.end + cluster.extent)
        per_cds = []
        for cds in record.get_cds_features_within_location(search_area):
            domains = [
                SecMetQualifier.Domain(hit.query_id, hit.evalue, hit.bitscore,
                                       num_seeds_per_hmm[hit.query_id], tool)
                for hit in results_by_id.get(cds.get_name(), [])
            ]
            # CDS features without any remaining hits are left out
            if not domains:
                continue
            per_cds.append(
                CDSResults(cds, domains,
                           cds_domains_by_cluster.get(cds.get_name(), {})))
        cds_results_by_cluster[cluster] = per_cds

    return RuleDetectionResults(cds_results_by_cluster, tool)
Ejemplo n.º 15
0
def apply_cluster_rules(
    record: Record, results_by_id: Dict[str, List[HSP]],
    rules: List[rule_parser.DetectionRule]
) -> Tuple[Dict[str, Dict[str, Set[str]]], Dict[str, Set[str]]]:
    """
        Run detection rules over each CDS with hits and record every rule
        that CDS satisfies. A CDS can satisfy multiple rules; if so, each
        satisfied rule is recorded separately under its own name.

        NOTE(review): earlier documentation described joining rule names
        with '-' and a lower precedence for the 'other' rule; neither is
        done by this function, confirm whether a caller handles it.

        Args:
            record: the record being checked
            results_by_id: A dict of CDS ID to a list of HSP results
            rules: A list of DetectionRule instances

        Returns:
            A tuple of
                a dictionary mapping CDS ID to
                    a dictionary mapping rule name to
                        the matches reported by that rule for the CDS
                and a dictionary mapping rule name to
                    a set of CDS feature names that matched the rule
            Both dictionaries are empty if results_by_id is empty.
    """
    if not results_by_id:
        return {}, {}

    # CDS features are processed in order of their location's start
    cds_with_hits = sorted(
        results_by_id,
        key=lambda gene_id: record.get_cds_by_name(gene_id).location.start)

    cds_domains_by_cluster_type = {}
    cluster_type_hits = defaultdict(set)  # type: Dict[str, Set[str]]
    for cds_name in cds_with_hits:
        feature = record.get_cds_by_name(cds_name)
        feature_start, feature_end = sorted(
            [feature.location.start, feature.location.end])
        # neighbourhood information is shared by rules with the same cutoff
        info_by_range = {
        }  # type: Dict[int, Tuple[Dict[str, CDSFeature], Dict[str, List[HSP]]]]
        domains_by_cluster = {}  # type: Dict[str, Set[str]]
        for rule in rules:
            if rule.cutoff not in info_by_range:
                # TODO: improve performance
                location = FeatureLocation(feature_start - rule.cutoff,
                                           feature_end + rule.cutoff)
                nearby = record.get_cds_features_within_location(
                    location, with_overlapping=True)
                nearby_features = {
                    neighbour.get_name(): neighbour
                    for neighbour in nearby
                }
                nearby_results = {
                    neighbour: results_by_id[neighbour]
                    for neighbour in nearby_features
                    if neighbour in results_by_id
                }
                info_by_range[rule.cutoff] = (nearby_features, nearby_results)
            nearby_features, nearby_results = info_by_range[rule.cutoff]
            matching = rule.detect(cds_name, nearby_features, nearby_results)
            # a rule only counts if it is met and reports at least one match
            if matching.met and matching.matches:
                domains_by_cluster[rule.name] = matching.matches
                cluster_type_hits[rule.name].add(cds_name)
        if domains_by_cluster:
            cds_domains_by_cluster_type[cds_name] = domains_by_cluster
    return cds_domains_by_cluster_type, cluster_type_hits
Ejemplo n.º 16
0
 def __init__(self, positions: Tuple[int, int], probability: float) -> None:
     """ Keeps the given probability alongside a location spanning the
         pair of positions

         Arguments:
             positions: the two coordinates used to build the location
             probability: the probability to keep
     """
     first, second = positions[0], positions[1]
     self.location = FeatureLocation(first, second)
     self.probability = probability