def setUp(self):
    """ Replaces the real blast input builder with a deterministic stub and
        builds two references to a single dummy region for input generation.
    """
    self.index = 0
    # remember the real implementation so tearDown can restore it
    self.old_blast_inputs = core.create_blast_inputs
    core.create_blast_inputs = self.dummy_blast_inputs
    cluster = DummyCluster(1, 100)
    self.dummy_cluster = cluster
    self.supercluster = DummySuperCluster([cluster])
    self.region = Region([self.supercluster], [])
    # the same region twice, so inputs are generated once per reference
    self.regions = [self.region] * 2
def create_blast_inputs(region: secmet.Region) -> Tuple[List[str], List[str]]:
    """ Creates fasta file contents for the cluster's CDS features

        Arguments:
            region: the secmet.Region to pull data from

        Returns:
            a tuple of:
                a list of CDS names
                a matching list of CDS sequences
    """
    names = []
    seqs = []
    # the region number is constant for all CDS features, build it once
    region_label = "c%d" % region.get_region_number()
    for cds in region.cds_children:
        strand = "+" if cds.strand == 1 else "-"
        location = "%d-%d" % (cds.location.start, cds.location.end)
        names.append("|".join(["input", region_label, location, strand,
                               cds.get_accession(), cds.product]))
        seqs.append(cds.translation)
    return names, seqs
def setUp(self):
    """ Builds two dummy regions (one trans-AT PKS product, one not), each
        containing seven CDS features with pre-generated domain names.
    """
    self.genes = []
    self.regions = []
    domain_names = self.gen_domain_names()
    # locus tags 'a' through 'f', plus the special "all" tag for the last CDS
    tags = [chr(ord('a') + i) for i in range(6)] + ["all"]
    for product in ['not_atpks', 'transatpks']:
        cluster = helpers.DummyProtocluster(1, 2, product=product)
        candidate_cluster = helpers.DummyCandidateCluster([cluster])
        region = Region(candidate_clusters=[candidate_cluster])
        self.regions.append(region)
        for tag in tags:
            cds = helpers.DummyCDS(1, 2, locus_tag=tag)
            cds.product = product
            cds.nrps_pks = DummyNRPSQualfier()
            cds.nrps_pks.domain_names = domain_names["nrpspksdomains_" + tag]
            cds.cluster = cluster
            cluster.add_cds(cds)
            self.genes.append(cds)
            region.add_cds(cds)
            assert cds.region == region
    self.predictions = ['redmxmal', 'ccmal', 'ohemal', 'ohmxmal', 'ohmmal',
                        'ccmmal', 'emal', 'redmmal', 'mmal', 'ccmxmal',
                        'mxmal', 'redemal', 'ohmal', 'mal', 'ccemal']
def get_motifs_for_region(self, region: Region) -> Dict[str, List[Prepeptide]]:
    """ Given a region, return a subset of motifs_by_locus for hits within
        that region
    """
    return {
        locus: self.motifs_by_locus[locus]
        for protocluster in region.get_unique_protoclusters()
        for locus in self.clusters.get(protocluster.get_protocluster_number(), [])
    }
def __init__(self, region_feature: secmet.Region) -> None:
    """ Initialises the query cluster from the given secmet Region,
        ranking it first with a "query" cluster type.
    """
    number = region_feature.get_region_number()
    # e.g. "RECORD_3" for region 3 of record "RECORD"
    full_name = "%s_%d" % (region_feature.parent_record.id, number)
    super().__init__(number, str(number), full_name, "Query sequence",
                     list(region_feature.cds_children), rank=0,
                     cluster_type="query")
def get_RREs_for_region(self, region: Region) -> Dict[str, List[RREResult]]:
    """ Given a region, return the subset of RRE_by_locus for hits within
        that region
    """
    found = {}
    for protocluster in region.get_unique_protoclusters():
        for cds in protocluster.cds_children:
            cds_name = cds.get_name()
            if cds_name in self.RRE_by_locus:
                found[cds_name] = self.RRE_by_locus[cds_name]
    return found
def __init__(self, results: NRPS_PKS_Results, region_feature: Region,
             record: RecordLayer) -> None:
    """ Builds gene name to URL mappings and one layer per candidate
        cluster prediction for the given region.
    """
    self.url_strict = {}  # type: Dict[str, str]  # gene name -> url
    self.url_relaxed = {}  # type: Dict[str, str]  # gene name -> url
    self._build_urls(region_feature.cds_children)
    super().__init__(record, region_feature)
    assert isinstance(results, NRPS_PKS_Results), type(results)
    self.results = results

    region_number = region_feature.get_region_number()
    self.candidate_clusters = []  # type: List[CandidateClusterLayer]
    # regions without predictions simply end up with no candidate cluster layers
    predictions = results.region_predictions.get(region_number, [])
    for prediction in predictions:
        candidate = record.get_candidate_cluster(prediction.candidate_cluster_number)
        self.candidate_clusters.append(CandidateClusterLayer(candidate, prediction))
def convert_region(region: secmet.Region, cds_mapping: Dict[int, str],
                   cds_index: Counter, fasta: IO) -> Dict[str, Any]:
    """ Converts a region to a JSON-friendly dictionary, writing each CDS
        translation to the given fasta handle and recording the index used
        for it in cds_mapping.
    """
    cdses = list(region.cds_children)
    result = {
        "products": region.products,
        "protoclusters": [convert_protocluster(pc)
                          for pc in region.get_unique_protoclusters()],
        "cdses": {cds.get_name(): convert_cds(cds) for cds in cdses},
        # trim any intergenic areas
        "start": min(cds.location.start for cds in cdses),
        "end": max(cds.location.end for cds in cdses),
    }
    record_id = region.parent_record.id
    for cds in cdses:
        index = cds_index.next()
        fasta.write(">%s|%d\n%s\n" % (record_id, index, cds.translation))
        cds_mapping[index] = cds.get_name()
    return result
def __init__(self, region: secmet.Region, ranking: List[Tuple[ReferenceCluster, Score]],
             reference_proteins: Dict[str, Protein], prefix: str) -> None:
    """ Builds the SVG view of a query region and its reference cluster hits.

        Arguments:
            region: the query secmet.Region
            ranking: a list of (ReferenceCluster, Score) pairs, best first
            reference_proteins: a mapping of protein name to Protein,
                                required whenever ranking is not empty
            prefix: a prefix for use in generated identifiers
    """
    if ranking:
        assert reference_proteins
    self.prefix = prefix
    self.query_cluster = QueryRegion(region)
    region_number = region.get_region_number()
    cluster_limit = get_config().cb_nclusters
    self.colour_lookup = build_colour_groups(list(region.cds_children),
                                             ranking[:cluster_limit])
    self.hits = []  # type: List[Cluster]
    record_prefix = region.parent_record.id.split(".", 1)[0]
    num_added = 0
    for cluster, score in ranking:
        # skip hits against the query's own record
        if record_prefix == cluster.accession.split("_", 1)[0]:
            continue
        # determine overall strand direction of hits
        strand = determine_strand_of_cluster(region, score.scored_pairings)
        # only the number of distinct hit genes is needed
        # (a previously built set of query ids was never used and has been removed)
        hit_genes = {subject.name for _, subject in score.scored_pairings}
        svg_cluster = Cluster.from_reference_cluster(cluster, region_number, score,
                                                     reference_proteins,
                                                     num_added + 1, len(hit_genes),
                                                     strand, self.prefix)
        self.hits.append(svg_cluster)
        num_added += 1
        # obey the cluster display limit from options
        if num_added >= cluster_limit:
            break

    self.max_length = self._size_of_largest_cluster()
    self._organise_strands()
class TestInputGeneration(unittest.TestCase):
    """ Tests fasta generation for diamond inputs, with blast input creation
        stubbed out to produce deterministic names/sequences.
    """
    def setUp(self):
        # counter used by dummy_blast_inputs to produce unique names/seqs
        self.index = 0
        # monkeypatch the real input builder; restored in tearDown
        self.old_blast_inputs = core.create_blast_inputs
        core.create_blast_inputs = self.dummy_blast_inputs
        self.dummy_cluster = DummyCluster(1, 100)
        self.supercluster = DummySuperCluster([self.dummy_cluster])
        self.region = Region([self.supercluster], [])
        # the same region twice, so every CDS contributes two entries
        self.regions = [self.region, self.region]

    def tearDown(self):
        # restore the real implementation patched out in setUp
        core.create_blast_inputs = self.old_blast_inputs

    def dummy_blast_inputs(self, cluster):
        """ Stub for core.create_blast_inputs: returns ("L<i>", "S<i>") pairs
            with a globally incrementing index per CDS
        """
        names = []
        seqs = []
        for _ in cluster.cds_children:
            index = self.index
            self.index += 1
            names.append("L%d" % index)
            seqs.append("S%d" % index)
        return names, seqs

    def add_cdses_to_region(self, cdses):
        # helper to populate the shared region with CDS features
        for cds in cdses:
            self.region.add_cds(cds)

    def test_empty(self):
        # regions with no CDS features cannot produce any sequences
        with TemporaryDirectory(change=True):
            with self.assertRaisesRegex(ValueError, "Diamond search space contains no sequences"):
                core.write_fastas_with_all_genes(self.regions, "test")

    def test_bad_partitions(self):
        # partition counts must be ints greater than zero
        with TemporaryDirectory(change=True):
            for i in [-10, -1, 0]:
                with self.assertRaisesRegex(ValueError, "Partitions must be greater than 0"):
                    core.write_fastas_with_all_genes(self.regions, "test", partitions=i)
            for i in ["str", None, 1.5]:
                with self.assertRaisesRegex(TypeError, "Partitions must be an int greater than 0"):
                    core.write_fastas_with_all_genes(self.regions, "test", partitions=i)

    def test_single_file(self):
        # default partitioning writes everything to the one named file
        self.add_cdses_to_region([DummyCDS(1, i) for i in range(3, 6)])
        with TemporaryDirectory(change=True):
            files = core.write_fastas_with_all_genes(self.regions, "test.fasta")
            assert files == ["test.fasta"]
            assert os.path.exists("test.fasta")
            expected = "".join(">L{0}\nS{0}\n".format(i) for i in range(len(self.regions) * 3))
            assert open("test.fasta").read() == expected

    def test_single_partition(self):
        # an explicit partition count of 1 must behave like the default
        self.add_cdses_to_region([DummyCDS(1, i) for i in range(3, 6)])
        with TemporaryDirectory(change=True):
            files = core.write_fastas_with_all_genes(self.regions, "test.fasta", partitions=1)
            assert files == ["test.fasta"]
            assert os.path.exists("test.fasta")
            expected = "".join(">L{0}\nS{0}\n".format(i) for i in range(len(self.regions) * 3))
            assert open("test.fasta").read() == expected

    def test_multiple_files(self):
        # multiple partitions should split the inputs evenly into numbered files
        self.add_cdses_to_region([DummyCDS(1, i) for i in range(3, 6)])
        for partitions in [2, 3]:
            with TemporaryDirectory(change=True):
                self.index = 0
                chunk_size = (len(self.regions) * 3) // partitions
                files = core.write_fastas_with_all_genes(self.regions, "test.fasta", partitions=partitions)
                assert files == ["test%d.fasta" % i for i in range(partitions)]
                for index in range(partitions):
                    assert os.path.exists("test%d.fasta" % index)
                    print(index, chunk_size)
                    contents = open("test%d.fasta" % index).read()
                    assert contents.count(">") == chunk_size
                    expected = "".join(">L{0}\nS{0}\n".format(i + index * chunk_size) for i in range(chunk_size))
                    assert contents == expected
def build_anchor_id(region: Region) -> str:
    """ Builds a consistent HTML anchor identifier for a Region """
    return f"r{region.parent_record.record_index}c{region.get_region_number()}"
def get_clusters_from_region(region: Region) -> List[Dict[str, Any]]:
    """ Converts all Protoclusters in a collection of CandidateCluster features to JSON """
    js_clusters = []
    # sort by start position, breaking ties with the longest first
    candidate_clusters = sorted(region.candidate_clusters, key=lambda x: (x.location.start, -len(x.location)))
    # height (row) assignments so overlapping candidate clusters don't collide
    candidate_cluster_groupings = _find_non_overlapping_cluster_groups(candidate_clusters)
    start_index = 0
    for candidate_cluster in candidate_clusters:
        # if it's the only candidate_cluster in the region and it's single, don't draw it to minimise noise
        parent = candidate_cluster.parent
        assert isinstance(parent, Region), type(parent)
        if len(parent.candidate_clusters) == 1 and not parent.subregions and len(candidate_cluster.protoclusters) == 1:
            continue
        # NOTE(review): start/end are nudged inwards by one while the
        # neighbouring coordinates keep the full range — presumably so the
        # core bar renders inside the neighbourhood; confirm with the JS renderer
        js_cluster = {"start": candidate_cluster.location.start + 1,
                      "end": candidate_cluster.location.end - 1,
                      "tool": "",
                      "neighbouring_start": candidate_cluster.location.start,
                      "neighbouring_end": candidate_cluster.location.end,
                      "product": "CC %d: %s" % (candidate_cluster.get_candidate_cluster_number(), candidate_cluster.kind),
                      "kind": "candidatecluster",
                      "prefix": ""}
        js_cluster['height'] = candidate_cluster_groupings[candidate_cluster]
        js_clusters.append(js_cluster)
    # subregions are drawn below all candidate cluster rows
    if candidate_cluster_groupings:
        start_index += max(candidate_cluster_groupings.values())
    for subregion in sorted(region.subregions, key=lambda x: (x.location.start, -len(x.location), x.tool)):
        start_index += 1
        prefix = ""
        tool = ""
        # sideloaded subregions display their originating tool as a label prefix
        if isinstance(subregion, SideloadedSubRegion):
            prefix = subregion.tool + (":" if subregion.label else "")
        else:
            tool = subregion.tool
        js_cluster = {"start": subregion.location.start,
                      "end": subregion.location.end,
                      "tool": tool,
                      "neighbouring_start": subregion.location.start,
                      "neighbouring_end": subregion.location.end,
                      "product": subregion.label,
                      "height": start_index,
                      "prefix": prefix,
                      "kind": "subregion"}
        js_clusters.append(js_cluster)
    start_index += 2  # allow for label above
    clusters = region.get_unique_protoclusters()
    # separate height groupings for protoclusters, offset below the rows above
    cluster_groupings = _find_non_overlapping_cluster_groups(clusters)
    for cluster in clusters:
        prefix = ""
        if isinstance(cluster, SideloadedProtocluster):
            prefix = f"{cluster.tool}:"
        js_cluster = {"start": cluster.core_location.start,
                      "end": cluster.core_location.end,
                      "tool": cluster.tool,
                      "neighbouring_start": cluster.location.start,
                      "neighbouring_end": cluster.location.end,
                      "product": cluster.product,
                      "height": cluster_groupings[cluster] * 2 + start_index,
                      "kind": "protocluster",
                      "prefix": prefix}
        js_clusters.append(js_cluster)
    return js_clusters
def generate_javascript_data(_record: Record, region: Region,
                             results: ClusterCompareResults) -> Dict[str, Any]:
    """ Generates JSON data for the javascript to draw relevant results in HTML output

        Arguments:
            record: the relevant Record for the results
            region: the specific Region to generate data for
            results: the ClusterCompareResults that need data extracted

        Returns:
            a JSON-friendly dictionary with the relevant data
    """
    data: Dict[str, Any] = {}
    for label, db_results in results.by_database.items():
        data[label] = {}
        variant_results = db_results.by_region.get(region.get_region_number(), {})
        for variant, result in sorted(variant_results.items()):
            # keep only the best-scoring references, capped at DISPLAY_LIMIT
            scores = sorted(result.scores_by_region, key=lambda x: x[1], reverse=True)[:DISPLAY_LIMIT]
            if not scores:
                continue
            variant_data: Dict[str, Dict[str, Any]] = {
                "reference_clusters": {}
            }
            data[label][variant] = variant_data
            for reference, _ in scores:
                ref_entry: Dict[str, Any] = {
                    "start": reference.start,
                    "end": reference.end,
                    "links": [],  # added to afterwards
                    "reverse": False,  # potentially changed later
                }
                # minimal gene JSON keyed by CDS name, each with a "linked" map
                genes = {}
                for cds in reference.cdses.values():
                    gene_json = cds.get_minimal_json()
                    gene_json["linked"] = {}
                    genes[cds.name] = gene_json
                variant_data["reference_clusters"][reference.get_identifier()] = ref_entry
                mismatching_strands = 0
                for ref_cds_id, hit in result.hits_by_region.get(reference, {}).items():
                    assert locations.locations_overlap(hit.cds.location, region.location)
                    query_cds = hit.cds
                    # midpoint of the query CDS, used as the link anchor
                    query_point = query_cds.location.start + (query_cds.location.end - query_cds.location.start) // 2
                    ref_cds = reference.cdses[ref_cds_id]
                    # midpoint of the reference CDS
                    subject_point = ref_cds.location.start + (ref_cds.location.end - ref_cds.location.start) // 2
                    if query_cds.location.strand != ref_cds.location.strand:
                        mismatching_strands += 1
                    genes[ref_cds.name]["linked"][region.get_region_number()] = query_cds.get_name()
                    ref_entry["links"].append({
                        "query": query_cds.get_name(),
                        "subject": ref_cds.name,
                        "query_loc": query_point,
                        "subject_loc": subject_point,
                    })
                # draw the reference reversed when most links disagree on strand
                ref_entry["reverse"] = mismatching_strands > len(ref_entry["links"]) / 2
                ref_entry["genes"] = list(genes.values())
    return data