def test_check_cluster_predictions(self):
    # the checked prediction is expected to gain promoter names
    # and gene/promoter counts
    def build_prediction():
        # every call creates a fresh, independent prediction
        return ClusterPrediction(ClusterMarker("gene1", Motif(3, 3, score=1)),
                                 ClusterMarker("gene4", Motif(3, 3, score=1)))

    seq_record = create_fake_record()
    promoters = [
        Promoter("gene1", 1, 5),
        Promoter("gene2", 6, 10),
        CombinedPromoter("gene3", "gene4", 11, 15),
    ]
    # see captured logging
    ignored_genes = [Gene(FeatureLocation(1, 5), locus_tag="gene5")]
    clusters = [build_prediction()]

    expected = [build_prediction()]
    prediction = expected[0]
    prediction.start.promoter = "gene1"
    prediction.end.promoter = "gene3+gene4"
    prediction.genes = 4
    prediction.promoters = 3

    checked = check_cluster_predictions(clusters, seq_record, promoters, ignored_genes)
    assert checked == expected
def create_cluster_borders(anchor: str, clusters: List[ClusterPrediction],
                           record: Record) -> List[ClusterBorder]:
    """ Create the predicted ClusterBorders

        Arguments:
            anchor: the name of the anchor gene, stored in each border's qualifiers
            clusters: the cluster predictions to convert, the first is noted
                      as the best prediction and all others as alternatives
            record: the record containing the genes named by the predictions

        Returns:
            a list of ClusterBorders, one per prediction, in the same order

        Raises:
            ValueError: if the start or end gene of a prediction
                        cannot be found in the record
    """
    borders = []
    for i, cluster in enumerate(clusters):
        # cluster borders returned by hmmdetect are based on CDS features
        # in contrast, cluster borders returned by cassis are based on gene features
        # --> hmmdetect derived clusters have exact locations, like the CDSs have
        # --> cassis derived clusters may have fuzzy locations, like the genes have
        left_name = cluster.start.gene
        right_name = cluster.end.gene
        left = None
        right = None
        for gene in record.get_genes():
            name = gene.get_name()
            if name == left_name:
                left = gene
            if name == right_name:
                right = gene
            # compare with None explicitly, rather than relying on the
            # truthiness of feature objects
            if left is not None and right is not None:
                break

        # fail with a clear message instead of an AttributeError on None
        missing = [name for name, gene in ((left_name, left), (right_name, right))
                   if gene is None]
        if missing:
            raise ValueError("cluster prediction refers to gene(s) missing from the record: {}".format(
                ", ".join(str(name) for name in missing)))

        new_feature = SeqFeature(FeatureLocation(left.location.start, right.location.end),
                                 type="cluster_border")
        new_feature.qualifiers = {
            "aStool": ["cassis"],
            "anchor": [anchor],
            "abundance": [cluster.start.abundance + cluster.end.abundance],
            "motif_score": ["{:.1e}".format(cluster.start.score + cluster.end.score)],
            "gene_left": [cluster.start.gene],
            "promoter_left": [cluster.start.promoter],
            "abundance_left": [cluster.start.abundance],
            "motif_left": [cluster.start.pairing_string],
            "motif_score_left": ["{:.1e}".format(cluster.start.score)],
            "gene_right": [cluster.end.gene],
            "promoter_right": [cluster.end.promoter],
            "abundance_right": [cluster.end.abundance],
            "motif_right": [cluster.end.pairing_string],
            "motif_score_right": ["{:.1e}".format(cluster.end.score)],
            "genes": [cluster.genes],
            "promoters": [cluster.promoters],
        }

        if i == 0:
            note = "best prediction (most abundant) for anchor gene {}".format(anchor)
        else:
            note = "alternative prediction ({}) for anchor gene {}".format(i, anchor)
        new_feature.qualifiers["note"] = [note]

        borders.append(ClusterBorder.from_biopython(new_feature))
    return borders
def setUp(self):
    # clusterfinder is configured to create clusters, with explicit thresholds
    options = [
        "--cf-create-clusters",
        "--cf-mean-threshold", "0.6",
        "--cf-min-cds", "5",
        "--cf-min-pfams", "5",
    ]
    self.config = build_config(options, modules=[clusterfinder], isolated=True)
    update_config({"enabled_cluster_types": []})

    self.record = DummyRecord(seq=Seq("A" * 2000))
    # (start, end, probability, PFAM id) of each CDS and the domain sharing its location
    domain_details = [
        (10, 20, 0.1, 'FAKE007'),
        (30, 40, 0.3, 'PF00106'),
        (50, 60, 0.4, 'PF00107'),
        (60, 70, 0.7, 'PF00109'),
        (70, 80, 0.98, 'PF08484'),
        (90, 100, 0.8, 'PF02401'),
        (100, 110, 0.32, 'PF04369'),
        (110, 120, 1.0, 'PF00128'),
        (130, 140, 0.2, 'FAKE234'),
        (500, 505, None, 'FAKE505'),
        (1010, 1020, 0.1, 'FAKE007'),
        (1030, 1040, 0.3, 'PF00106'),
        (1050, 1060, 0.4, 'PF00107'),
        (1060, 1070, 0.7, 'PF00109'),
        (1070, 1080, 0.98, 'PF08484'),
        (1090, 1100, 0.8, 'PF02401'),
        (1100, 1110, 0.32, 'PF04369'),
        (1110, 1120, 1.0, 'PF00128'),
    ]
    for start, end, probability, pfam_id in domain_details:
        location = FeatureLocation(start, end)
        self.record.add_cds_feature(CDSFeature(location, locus_tag=str(start)))
        domain = PFAMDomain(location, "dummy_description")
        domain.db_xref.append(pfam_id)
        domain.probability = probability
        self.record.add_pfam_domain(domain)
def create_border(self, rule_name, start, end):
    # builds a border for testing, with the cutoff and extent of the named rule
    matching_rule = self.rules_by_name[rule_name]
    location = FeatureLocation(start, end)
    return ClusterBorder(location, tool="testing", cutoff=matching_rule.cutoff,
                         extent=matching_rule.extent, product=rule_name)
def build_hits(record, hmmscan_results, min_score: float, max_evalue: float,
               database: str) -> List[Dict[str, Any]]:
    """ Builds a description of each acceptable PFAM hit in the given hmmscan results

        Arguments:
            record: the record containing the CDS features that were searched
            hmmscan_results: the hmmscan results, each with an id and HSPs
            min_score: hits with a bitscore at or below this value are discarded
            max_evalue: hits with an e-value at or above this value are discarded
            database: the database given to pfamdb to find the PFAM id of a hit

        Returns:
            a list of dictionaries, one per retained hit, each holding the
            details required to build a PFAMDomain
    """
    logging.debug("Generating feature objects for PFAM hits")

    hits = []
    feature_by_id = record.get_cds_name_mapping()

    for result in hmmscan_results:
        for hsp in result.hsps:
            if hsp.bitscore <= min_score or hsp.evalue >= max_evalue:
                continue

            # skip hits for queries that are not a CDS of this record,
            # indexing the mapping with them would raise a KeyError
            if hsp.query_id not in feature_by_id:
                continue

            feature = feature_by_id[hsp.query_id]

            start, end = calculate_start_and_end(feature, hsp)

            # only used to extract the sequence covered by the hit
            dummy_feature = PFAMDomain(FeatureLocation(start, end, feature.location.strand),
                                       description="")
            translation = dummy_feature.extract(record.seq).translate(table=feature.transl_table)

            hits.append({
                "start": start,
                "end": end,
                "strand": feature.location.strand,
                "label": result.id,
                "locus_tag": feature.locus_tag,
                "domain": hsp.hit_id,
                "evalue": hsp.evalue,
                "score": hsp.bitscore,
                "translation": str(translation),
                "db_xref": [pfamdb.get_pfam_id_from_name(hsp.hit_id, database)],
                "description": hsp.hit_description,
            })
    return hits
def find_clusters(record: Record, cds_by_cluster_type: Dict[str, Set[str]],
                  rules_by_name: Dict[str, rule_parser.DetectionRule]) -> List[ClusterBorder]:
    """ Detects gene clusters based on the identified core genes

        Arguments:
            record: the record containing the CDS features
            cds_by_cluster_type: a dictionary mapping rule name to the names
                                 of the CDS features which satisfied that rule
            rules_by_name: a dictionary mapping rule name to DetectionRule

        Returns:
            the ClusterBorders remaining after redundant borders are removed
    """
    clusters = []  # type: List[ClusterBorder]

    cds_feature_by_name = record.get_cds_name_mapping()

    for cluster_type, cds_names in cds_by_cluster_type.items():
        cds_features = sorted(cds_feature_by_name[name] for name in cds_names)
        rule = rules_by_name[cluster_type]
        cutoff = rule.cutoff
        extent = rule.extent
        # everything but the location is shared by all borders of this type
        border_args = {
            "tool": "rule-based-clusters",
            "cutoff": cutoff,
            "extent": extent,
            "product": cluster_type,
        }

        # the first core CDS always starts a new border
        seed = cds_features[0]
        start, end = sorted((seed.location.start, seed.location.end))
        cluster = ClusterBorder(FeatureLocation(start, end), **border_args)
        assert seed.is_contained_by(cluster)
        assert seed in record.get_cds_features_within_location(cluster.location)
        clusters.append(cluster)

        for cds in cds_features[1:]:
            feature_start, feature_end = sorted((cds.location.start, cds.location.end))
            # the current border, padded on both sides by the rule's cutoff
            reach = FeatureLocation(cluster.location.start - cutoff,
                                    cluster.location.end + cutoff)
            if not cds.overlaps_with(reach):
                # out of reach, so this CDS starts a new border
                start = feature_start
                end = feature_end
                cluster = ClusterBorder(FeatureLocation(start, end), **border_args)
                clusters.append(cluster)
                continue
            # within reach, so the current border grows to include this CDS
            start = min(feature_start, start)
            end = max(feature_end, end)
            cluster.location = FeatureLocation(start, end)

    for border in clusters:
        border.rule = str(rules_by_name[border.product].conditions)
        # trim borders to the limits of the record, marking those trimmed
        if border.location.start < 0:
            border.location = FeatureLocation(0, border.location.end)
            border.contig_edge = True
        if border.location.end > len(record):
            border.location = FeatureLocation(border.location.start, len(record))
            border.contig_edge = True

    clusters = remove_redundant_borders(clusters, rules_by_name)

    logging.debug("%d rule-based cluster(s) found in record", len(clusters))
    return clusters
def new_feature_from_basics(self, start: int, strand: int) -> Feature:
    """ Builds a TTA marking feature covering the three bases from the
        given start on the given strand, then registers both the codon
        start and the new feature with this instance before returning
        the feature
    """
    location = FeatureLocation(start, start + 3, strand)
    marker = Feature(location, feature_type="misc_feature", created_by_antismash=True)
    marker.notes.append("tta leucine codon, possible target for bldA regulation")
    self.codon_starts.append((start, strand))
    self.features.append(marker)
    return marker
def find_nr_cds(cluster_position: Tuple[int, int], record: Record) -> Tuple[Tuple[int, int], int]:
    """ Counts the CDS features within or overlapping a candidate cluster
        and snaps the cluster position to the outermost CDS start and end

        Arguments:
            cluster_position: the start and end of the candidate cluster
            record: the record containing the CDS features

        Returns:
            a tuple of the (possibly adjusted) cluster position
            and the number of CDS features found
    """
    area = FeatureLocation(cluster_position[0], cluster_position[1])
    contained = record.get_cds_features_within_location(area, with_overlapping=True)
    if not contained:
        return cluster_position, 0

    starts = [int(cds.location.start) for cds in contained]
    ends = [int(cds.location.end) for cds in contained]
    # avoid getting the complete genome as cluster if one CDS
    # starts at end and finishes at start of genome
    if 0 not in starts or len(record.seq) not in ends:
        cluster_position = (min(starts), max(ends))
    return cluster_position, len(contained)
def test_classification_with_colon(self):
    # since SMCOG id and description are stored in a string separated by :,
    # ensure that descriptions containing : are properly handled
    location = FeatureLocation(0, 100)
    cds = CDSFeature(location, locus_tag="test", translation="AAA")
    record = helpers.DummyRecord(features=[cds], seq="A" * 100)
    record.add_cluster(helpers.DummyCluster(0, 100))

    results = SMCOGResults(record.id)
    hit = HMMResult("SMCOG1212:sodium:dicarboxylate_symporter", 0, 100, 2.3e-126, 416)
    results.best_hits[cds.get_name()] = hit
    results.add_to_record(record)

    gene_functions = cds.gene_functions.get_by_tool("smcogs")
    assert len(gene_functions) == 1
    expected_prefix = ("transport (smcogs) SMCOG1212:sodium:dicarboxylate_symporter"
                       " (Score: 416; E-value: 2.3e-126)")
    assert str(gene_functions[0]).startswith(expected_prefix)
def add_to_record(self, record: Record) -> None:
    """ Adds a PFAMDomain to the given record for every stored hit,
        numbering the domain ids from 1 in the order the hits are stored
    """
    db_version = pfamdb.get_db_version_from_path(self.database)
    # the hit details which are copied as-is onto the new domain
    copied_keys = ("label", "locus_tag", "domain", "evalue", "score",
                   "translation", "db_xref")

    for number, hit in enumerate(self.hits, start=1):
        location = FeatureLocation(hit["start"], hit["end"], hit["strand"])
        domain = PFAMDomain(location, description=hit["description"])
        for key in copied_keys:
            setattr(domain, key, hit[key])
        domain.tool = self.tool
        domain.database = db_version
        domain.detection = "hmmscan"
        domain.domain_id = "{}_{}_{:04d}".format(self.tool, domain.locus_tag, number)
        record.add_pfam_domain(domain)
def store_promoters(promoters: Iterable[Promoter], record: Record) -> None:
    """ Adds a "promoter" feature to the record for each given promoter,
        each built as a biopython feature and then converted
    """
    logging.critical("adding promoters based on biopython features")
    for promoter in promoters:
        # remember to account for 0-indexed start location
        first_base = max(0, promoter.start - 1)
        location = FeatureLocation(first_base, promoter.end)
        bio_feature = SeqFeature(location, type="promoter")
        bio_feature.qualifiers = {
            # already a list with one or two elements
            "locus_tag": promoter.get_gene_names(),
            # TODO save string or Seq object?
            "seq": [str(promoter.seq)],
        }

        if isinstance(promoter, CombinedPromoter):
            bio_feature.qualifiers["note"] = ["bidirectional promoter"]

        converted = Feature.from_biopython(bio_feature)
        converted.created_by_antismash = True

        record.add_feature(converted)
def test_merges(self):
    # two borders are expected from clusterfinder itself
    clusterfinder.generate_results(self.record, self.config)
    assert len(self.record.get_cluster_borders()) == 2

    # extra borders from another tool, each with its start as the product
    extra_positions = [(10, 40), (1040, 1050), (110, 400)]
    for start, end in extra_positions:
        location = FeatureLocation(start, end)
        border = ClusterBorder(location, "testtool", product=str(start))
        self.record.add_cluster_border(border)

    assert not self.record.get_clusters()
    self.record.create_clusters_from_borders()

    clusters = self.record.get_clusters()
    assert len(clusters) == 2

    first = clusters[0]
    assert first.location.start == 10
    assert first.location.end == 400
    assert first.products == ("10", "110")

    second = clusters[1]
    assert second.location.start == 1030
    assert second.location.end == 1120
    assert second.products == ("1040",)
def __init__(self, positions, probability):
    # the first two items of positions are the start and end of the location
    begin = positions[0]
    finish = positions[1]
    self.location = FeatureLocation(begin, finish)
    self.probability = probability
def detect_borders_and_signatures(record: Record, signature_file: str, seeds_file: str,
                                  rules_file: str, filter_file: str,
                                  tool: str) -> RuleDetectionResults:
    """ Compares all CDS features in a record with HMM signatures and generates
        Cluster features based on those hits and the current cluster detection
        rules.

        Arguments:
            record: the record to analyse
            signature_file: a tab separated file; each row being a single HMM reference
                        with columns: label, description, minimum score cutoff, hmm path
            seeds_file: the file containing all HMM profiles
            rules_file: the file containing all the rules to use for cluster definition
            filter_file: a file containing equivalence sets of HMMs
            tool: the name of the tool providing the HMMs
                  (e.g. clusterfinder, rule_based_clusters)

        Returns:
            None if no FASTA could be generated from the record, otherwise
            a RuleDetectionResults covering each border added to the record
    """
    full_fasta = fasta.get_fasta_from_record(record)
    # if there's no CDS features, don't try to do anything
    if not full_fasta:
        return None

    sig_by_name = {sig.name: sig for sig in get_signature_profiles(signature_file)}
    rules = create_rules(rules_file, set(sig_by_name))
    results = []
    results_by_id = {}  # type: Dict[str, List[HSP]]

    for runresult in run_hmmsearch(seeds_file, full_fasta, use_tempfile=True):
        acc = runresult.accession.split('.')[0]
        # Store result if it is above cut-off
        for hsp in runresult.hsps:
            # the query id takes priority over the accession for finding the signature
            sig_key = hsp.query_id if hsp.query_id in sig_by_name else acc
            if sig_key not in sig_by_name:
                raise ValueError('Failed to find signature for ID %s / ACC %s' % (hsp.query_id, acc))
            sig = sig_by_name[sig_key]
            if hsp.bitscore > sig.cutoff:
                results.append(hsp)
                results_by_id.setdefault(hsp.hit_id, []).append(hsp)

    # Filter results by comparing scores of different models (for PKS systems)
    results, results_by_id = filter_results(results, results_by_id, filter_file, set(sig_by_name))

    # Filter multiple results of the same model in one gene
    results, results_by_id = filter_result_multiple(results, results_by_id)

    # Use rules to determine gene clusters
    cds_domains_by_cluster, cluster_type_hits = apply_cluster_rules(record, results_by_id, rules)

    # Find number of sequences on which each pHMM is based
    num_seeds_per_hmm = get_sequence_counts(signature_file)

    # Save final results to record
    rules_by_name = {rule.name: rule for rule in rules}
    clusters = find_clusters(record, cluster_type_hits, rules_by_name)
    strip_inferior_domains(cds_domains_by_cluster, rules_by_name)

    cds_results_by_cluster = {}
    for cluster in clusters:
        record.add_cluster_border(cluster)
        # the border padded on both sides by its extent
        cluster_extent = FeatureLocation(cluster.location.start - cluster.extent,
                                         cluster.location.end + cluster.extent)
        cds_results = []
        for cds in record.get_cds_features_within_location(cluster_extent):
            domains = [
                SecMetQualifier.Domain(hsp.query_id, hsp.evalue, hsp.bitscore,
                                       num_seeds_per_hmm[hsp.query_id], tool)
                for hsp in results_by_id.get(cds.get_name(), [])
            ]
            # only CDS features with at least one hit are reported
            if domains:
                definition_domains = cds_domains_by_cluster.get(cds.get_name(), {})
                cds_results.append(CDSResults(cds, domains, definition_domains))
        cds_results_by_cluster[cluster] = cds_results

    return RuleDetectionResults(cds_results_by_cluster, tool)
def apply_cluster_rules(record: Record, results_by_id: Dict[str, List[HSP]],
                        rules: List[rule_parser.DetectionRule]
                        ) -> Tuple[Dict[str, Dict[str, Set[str]]], Dict[str, Set[str]]]:
    """ Run detection rules over each CDS with hits and classify them if relevant.
        A CDS can satisfy multiple rules, in which case it is reported
        under every rule that it satisfied.

        Args:
            record: the record being checked
            results_by_id: A dict of CDS ID to a list of HSP results
            rules: A list of DetectionRule instances

        Returns:
            A tuple of
                a dictionary mapping CDS ID to
                    a dictionary mapping rule name to the matches
                    reported by that rule for the CDS
                and a dictionary mapping rule name to
                    a set of CDS feature names that matched the rule
    """
    if not results_by_id:
        return {}, {}

    cds_with_hits = sorted(results_by_id,
                           key=lambda gene_id: record.get_cds_by_name(gene_id).location.start)

    cds_domains_by_cluster_type = {}  # type: Dict[str, Dict[str, Set[str]]]
    cluster_type_hits = defaultdict(set)  # type: Dict[str, Set[str]]
    for cds_name in cds_with_hits:
        feature = record.get_cds_by_name(cds_name)
        feature_start, feature_end = sorted([feature.location.start, feature.location.end])
        # neighbouring features and their hits, cached per cutoff since
        # rules sharing a cutoff share the same neighbourhood of this CDS
        info_by_range = {}  # type: Dict[int, Tuple[Dict[str, CDSFeature], Dict[str, List[HSP]]]]
        domains_by_cluster = {}  # type: Dict[str, Set[str]]
        for rule in rules:
            if rule.cutoff not in info_by_range:
                # TODO: improve performance
                location = FeatureLocation(feature_start - rule.cutoff,
                                           feature_end + rule.cutoff)
                nearby = record.get_cds_features_within_location(location,
                                                                 with_overlapping=True)
                nearby_features = {neighbour.get_name(): neighbour for neighbour in nearby}
                nearby_results = {neighbour: results_by_id[neighbour]
                                  for neighbour in nearby_features
                                  if neighbour in results_by_id}
                info_by_range[rule.cutoff] = (nearby_features, nearby_results)
            nearby_features, nearby_results = info_by_range[rule.cutoff]
            matching = rule.detect(cds_name, nearby_features, nearby_results)
            # a rule only counts when it is both met and reports matches
            if matching.met and matching.matches:
                domains_by_cluster[rule.name] = matching.matches
                cluster_type_hits[rule.name].add(cds_name)
        if domains_by_cluster:
            cds_domains_by_cluster_type[cds_name] = domains_by_cluster
    return cds_domains_by_cluster_type, cluster_type_hits
def __init__(self, positions: Tuple[int, int], probability: float) -> None:
    # positions holds the start and end of the location, in that order
    begin = positions[0]
    finish = positions[1]
    self.location = FeatureLocation(begin, finish)
    self.probability = probability