Example #1
 def setUp(self):
     self.index = 0
     self.old_blast_inputs = core.create_blast_inputs
     core.create_blast_inputs = self.dummy_blast_inputs
     self.dummy_cluster = DummyCluster(1, 100)
     self.supercluster = DummySuperCluster([self.dummy_cluster])
     self.region = Region([self.supercluster], [])
     self.regions = [self.region, self.region]
Example #2
def create_blast_inputs(region: secmet.Region) -> Tuple[List[str], List[str]]:
    """ Creates fasta file contents for the cluster's CDS features

        Arguments:
            region: the secmet.Region to pull data from

        Returns:
            a tuple of:
                a list of CDS names
                a matching list of CDS sequences
    """
    names = []
    seqs = []
    for cds in region.cds_children:
        if cds.strand == 1:
            strand = "+"
        else:
            strand = "-"
        fullname = "|".join([
            "input",
            "c%d" % region.get_region_number(),
            "%d-%d" % (cds.location.start, cds.location.end), strand,
            cds.get_accession(), cds.product
        ])
        names.append(fullname)
        seqs.append(cds.translation)

    return names, seqs
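
The two lists returned above stay index-aligned, so they can be zipped directly into FASTA text. A minimal sketch of that pairing follows; write_fasta is a hypothetical helper written for illustration, not an antiSMASH function.

from typing import List


def write_fasta(names: List[str], seqs: List[str]) -> str:
    """Interleaves matching name/sequence lists into FASTA-formatted text."""
    assert len(names) == len(seqs)
    return "".join(">%s\n%s\n" % (name, seq) for name, seq in zip(names, seqs))


# e.g. one dummy entry built with the same "|"-joined naming scheme as above
print(write_fasta(["input|c1|0-300|+|locus_a|hypothetical protein"], ["MTESTSEQ"]))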
Example #3
 def setUp(self):
     self.genes = []
     self.regions = []
     domain_names = self.gen_domain_names()
     for product in ['not_atpks', 'transatpks']:
         cluster = helpers.DummyProtocluster(1, 2, product=product)
         candidate_cluster = helpers.DummyCandidateCluster([cluster])
         self.regions.append(Region(candidate_clusters=[candidate_cluster]))
         for i in range(7):
             locus_tag = chr(ord('a') + i)
             if i == 6:
                 locus_tag = "all"
             cds = helpers.DummyCDS(1, 2, locus_tag=locus_tag)
             cds.product = product
             cds.nrps_pks = DummyNRPSQualfier()
             cds.nrps_pks.domain_names = domain_names["nrpspksdomains_" +
                                                      locus_tag]
             cds.cluster = cluster
             cluster.add_cds(cds)
             self.genes.append(cds)
             self.regions[-1].add_cds(cds)
             assert cds.region == self.regions[-1]
     self.predictions = [
         'redmxmal', 'ccmal', 'ohemal', 'ohmxmal', 'ohmmal', 'ccmmal',
         'emal', 'redmmal', 'mmal', 'ccmxmal', 'mxmal', 'redemal', 'ohmal',
         'mal', 'ccemal'
     ]
Example #4
 def get_motifs_for_region(self, region: Region) -> Dict[str, List[Prepeptide]]:
     """ Given a region, return a subset of motifs_by_locus for hits within
         that region
     """
     results = {}
     for cluster in region.get_unique_protoclusters():
         for locus in self.clusters.get(cluster.get_protocluster_number(), []):
             results[locus] = self.motifs_by_locus[locus]
     return results
Example #5
 def __init__(self, region_feature: secmet.Region) -> None:
     region_number = region_feature.get_region_number()
     super().__init__(region_number,
                      str(region_number),
                      "%s_%d" %
                      (region_feature.parent_record.id, region_number),
                      "Query sequence",
                      list(region_feature.cds_children),
                      rank=0,
                      cluster_type="query")
Example #6
 def get_RREs_for_region(self,
                         region: Region) -> Dict[str, List[RREResult]]:
     """ Given a region, return a subset of motifs_by_locus for hits within
         that region
     """
     results = {}
     for cluster in region.get_unique_protoclusters():
         for locus in cluster.cds_children:
             name = locus.get_name()
             if name in self.RRE_by_locus:
                 results[name] = self.RRE_by_locus[name]
     return results
Example #7
    def __init__(self, results: NRPS_PKS_Results, region_feature: Region, record: RecordLayer) -> None:
        self.url_strict = {}  # type: Dict[str, str]  # gene name -> url
        self.url_relaxed = {}  # type: Dict[str, str]  # gene name -> url
        self._build_urls(region_feature.cds_children)
        super().__init__(record, region_feature)
        assert isinstance(results, NRPS_PKS_Results), type(results)
        self.results = results

        region_number = region_feature.get_region_number()
        self.candidate_clusters = []  # type: List[CandidateClusterLayer]
        for candidate_cluster_pred in results.region_predictions.get(region_number, []):
            candidate_cluster = record.get_candidate_cluster(candidate_cluster_pred.candidate_cluster_number)
            self.candidate_clusters.append(CandidateClusterLayer(candidate_cluster, candidate_cluster_pred))
Example #8
def convert_region(region: secmet.Region, cds_mapping: Dict[int, str],
                   cds_index: Counter, fasta: IO) -> Dict[str, Any]:
    result = {
        "products": region.products,
        "protoclusters": [convert_protocluster(pc) for pc in region.get_unique_protoclusters()],
        "cdses": {cds.get_name(): convert_cds(cds) for cds in region.cds_children},
        # trim any intergenic areas by using the outermost CDS coordinates
        "start": min(cds.location.start for cds in region.cds_children),
        "end": max(cds.location.end for cds in region.cds_children),
    }
    for cds in region.cds_children:
        index = cds_index.next()
        fasta.write(">%s|%d\n%s\n" %
                    (region.parent_record.id, index, cds.translation))
        cds_mapping[index] = cds.get_name()
    return result
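
convert_region expects cds_index to be a counter-like object whose next() method hands out a unique integer for every CDS written to the shared fasta file, with cds_mapping recording how to translate those integers back into CDS names. A minimal stand-in that satisfies that interface is sketched below; this is an assumption about the interface, not necessarily the module's actual Counter class.

class SequentialCounter:
    """Stand-in counter: next() returns 0, 1, 2, ... on successive calls."""

    def __init__(self) -> None:
        self._value = -1

    def next(self) -> int:
        self._value += 1
        return self._value


counter = SequentialCounter()
assert [counter.next() for _ in range(3)] == [0, 1, 2]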
Example #9
    def __init__(self, region: secmet.Region,
                 ranking: List[Tuple[ReferenceCluster, Score]],
                 reference_proteins: Dict[str, Protein], prefix: str) -> None:
        if ranking:
            assert reference_proteins
        self.prefix = prefix
        self.query_cluster = QueryRegion(region)
        region_number = region.get_region_number()
        cluster_limit = get_config().cb_nclusters
        self.colour_lookup = build_colour_groups(list(region.cds_children),
                                                 ranking[:cluster_limit])
        self.hits = []  # type: List[Cluster]
        record_prefix = region.parent_record.id.split(".", 1)[0]
        num_added = 0
        queries = set()

        for cluster, score in ranking:
            if record_prefix == cluster.accession.split("_", 1)[0]:
                continue
            # determine overall strand direction of hits
            hit_genes = set()
            strand = determine_strand_of_cluster(region, score.scored_pairings)
            for query, subject in score.scored_pairings:
                queries.add(query.id)
                hit_genes.add(subject.name)
            svg_cluster = Cluster.from_reference_cluster(
                cluster, region_number, score, reference_proteins,
                num_added + 1, len(hit_genes), strand, self.prefix)
            self.hits.append(svg_cluster)
            num_added += 1
            # obey the cluster display limit from options
            if num_added >= cluster_limit:
                break

        self.max_length = self._size_of_largest_cluster()
        self._organise_strands()
Example #10
class TestInputGeneration(unittest.TestCase):
    def setUp(self):
        self.index = 0
        self.old_blast_inputs = core.create_blast_inputs
        core.create_blast_inputs = self.dummy_blast_inputs
        self.dummy_cluster = DummyCluster(1, 100)
        self.supercluster = DummySuperCluster([self.dummy_cluster])
        self.region = Region([self.supercluster], [])
        self.regions = [self.region, self.region]

    def tearDown(self):
        core.create_blast_inputs = self.old_blast_inputs

    def dummy_blast_inputs(self, cluster):
        names = []
        seqs = []
        for _ in cluster.cds_children:
            index = self.index
            self.index += 1
            names.append("L%d" % index)
            seqs.append("S%d" % index)
        return names, seqs

    def add_cdses_to_region(self, cdses):
        for cds in cdses:
            self.region.add_cds(cds)

    def test_empty(self):
        with TemporaryDirectory(change=True):
            with self.assertRaisesRegex(
                    ValueError, "Diamond search space contains no sequences"):
                core.write_fastas_with_all_genes(self.regions, "test")

    def test_bad_partitions(self):
        with TemporaryDirectory(change=True):
            for i in [-10, -1, 0]:
                with self.assertRaisesRegex(
                        ValueError, "Partitions must be greater than 0"):
                    core.write_fastas_with_all_genes(self.regions,
                                                     "test",
                                                     partitions=i)
            for i in ["str", None, 1.5]:
                with self.assertRaisesRegex(
                        TypeError, "Partitions must be an int greater than 0"):
                    core.write_fastas_with_all_genes(self.regions,
                                                     "test",
                                                     partitions=i)

    def test_single_file(self):
        self.add_cdses_to_region([DummyCDS(1, i) for i in range(3, 6)])
        with TemporaryDirectory(change=True):
            files = core.write_fastas_with_all_genes(self.regions,
                                                     "test.fasta")
            assert files == ["test.fasta"]
            assert os.path.exists("test.fasta")
            expected = "".join(">L{0}\nS{0}\n".format(i)
                               for i in range(len(self.regions) * 3))
            assert open("test.fasta").read() == expected

    def test_single_partition(self):
        self.add_cdses_to_region([DummyCDS(1, i) for i in range(3, 6)])
        with TemporaryDirectory(change=True):
            files = core.write_fastas_with_all_genes(self.regions,
                                                     "test.fasta",
                                                     partitions=1)
            assert files == ["test.fasta"]
            assert os.path.exists("test.fasta")
            expected = "".join(">L{0}\nS{0}\n".format(i)
                               for i in range(len(self.regions) * 3))
            assert open("test.fasta").read() == expected

    def test_multiple_files(self):
        self.add_cdses_to_region([DummyCDS(1, i) for i in range(3, 6)])
        for partitions in [2, 3]:
            with TemporaryDirectory(change=True):
                self.index = 0
                chunk_size = (len(self.regions) * 3) // partitions
                files = core.write_fastas_with_all_genes(self.regions,
                                                         "test.fasta",
                                                         partitions=partitions)
                assert files == ["test%d.fasta" % i for i in range(partitions)]
                for index in range(partitions):
                    assert os.path.exists("test%d.fasta" % index)
                    print(index, chunk_size)
                    contents = open("test%d.fasta" % index).read()
                    assert contents.count(">") == chunk_size
                    expected = "".join(
                        ">L{0}\nS{0}\n".format(i + index * chunk_size)
                        for i in range(chunk_size))
                    assert contents == expected
Example #11
 def build_anchor_id(region: Region) -> str:
     """ Builds a consistent HTML anchor identifier for a Region """
     return "r{}c{}".format(region.parent_record.record_index,
                            region.get_region_number())
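
As a rough illustration with stand-in objects (DummyRecord and DummyRegion below are not secmet classes), the second region of a record with index 1 yields the anchor "r1c2":

class DummyRecord:
    record_index = 1


class DummyRegion:
    parent_record = DummyRecord()

    def get_region_number(self) -> int:
        return 2


region = DummyRegion()
anchor = "r{}c{}".format(region.parent_record.record_index, region.get_region_number())
assert anchor == "r1c2"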
Example #12
def get_clusters_from_region(region: Region) -> List[Dict[str, Any]]:
    """ Converts all Protoclusters in a collection of CandidateCluster features to JSON """
    js_clusters = []
    candidate_clusters = sorted(region.candidate_clusters, key=lambda x: (x.location.start, -len(x.location)))
    candidate_cluster_groupings = _find_non_overlapping_cluster_groups(candidate_clusters)
    start_index = 0
    for candidate_cluster in candidate_clusters:
        # if it's the only candidate_cluster in the region and it's single, don't draw it to minimise noise
        parent = candidate_cluster.parent
        assert isinstance(parent, Region), type(parent)
        if len(parent.candidate_clusters) == 1 and not parent.subregions and len(candidate_cluster.protoclusters) == 1:
            continue
        js_cluster = {"start": candidate_cluster.location.start + 1,
                      "end": candidate_cluster.location.end - 1,
                      "tool": "",
                      "neighbouring_start": candidate_cluster.location.start,
                      "neighbouring_end": candidate_cluster.location.end,
                      "product": "CC %d: %s" % (candidate_cluster.get_candidate_cluster_number(),
                                                candidate_cluster.kind),
                      "kind": "candidatecluster",
                      "prefix": ""}
        js_cluster['height'] = candidate_cluster_groupings[candidate_cluster]
        js_clusters.append(js_cluster)

    if candidate_cluster_groupings:
        start_index += max(candidate_cluster_groupings.values())

    for subregion in sorted(region.subregions, key=lambda x: (x.location.start, -len(x.location), x.tool)):
        start_index += 1
        prefix = ""
        tool = ""
        if isinstance(subregion, SideloadedSubRegion):
            prefix = subregion.tool + (":" if subregion.label else "")
        else:
            tool = subregion.tool
        js_cluster = {"start": subregion.location.start,
                      "end": subregion.location.end,
                      "tool": tool,
                      "neighbouring_start": subregion.location.start,
                      "neighbouring_end": subregion.location.end,
                      "product": subregion.label,
                      "height": start_index,
                      "prefix": prefix,
                      "kind": "subregion"}
        js_clusters.append(js_cluster)

    start_index += 2  # allow for label above
    clusters = region.get_unique_protoclusters()
    cluster_groupings = _find_non_overlapping_cluster_groups(clusters)
    for cluster in clusters:
        prefix = ""
        if isinstance(cluster, SideloadedProtocluster):
            prefix = f"{cluster.tool}:"
        js_cluster = {"start": cluster.core_location.start,
                      "end": cluster.core_location.end,
                      "tool": cluster.tool,
                      "neighbouring_start": cluster.location.start,
                      "neighbouring_end": cluster.location.end,
                      "product": cluster.product,
                      "height": cluster_groupings[cluster] * 2 + start_index,
                      "kind": "protocluster",
                      "prefix": prefix}
        js_clusters.append(js_cluster)

    return js_clusters
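
Every entry appended to js_clusters carries the same keys, so downstream drawing code can treat candidate clusters, subregions, and protoclusters uniformly. The hand-written entry below is purely illustrative (the coordinates, tool name, and product are made up) and only shows the expected shape of a protocluster entry:

example_protocluster_entry = {
    "start": 15000,               # core location start
    "end": 38000,                 # core location end
    "tool": "example-tool",       # detection tool name (illustrative)
    "neighbouring_start": 10000,  # full protocluster location start
    "neighbouring_end": 43000,    # full protocluster location end
    "product": "nrps",
    "height": 4,                  # row offset from the non-overlapping groupings
    "kind": "protocluster",
    "prefix": "",
}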
Example #13
def generate_javascript_data(_record: Record, region: Region,
                             results: ClusterCompareResults) -> Dict[str, Any]:
    """ Generates JSON data for the javascript to draw relevant results in HTML output

        Arguments:
            record: the relevant Record for the results
            region: the specific Region to generate data for
            results: the ClusterCompareResults that need data extracted

        Returns:
            a JSON-friendly dictionary with the relevant data
    """
    data: Dict[str, Any] = {}
    for label, db_results in results.by_database.items():
        data[label] = {}
        variant_results = db_results.by_region.get(region.get_region_number(),
                                                   {})
        for variant, result in sorted(variant_results.items()):
            scores = sorted(result.scores_by_region,
                            key=lambda x: x[1],
                            reverse=True)[:DISPLAY_LIMIT]
            if not scores:
                continue

            variant_data: Dict[str, Dict[str, Any]] = {
                "reference_clusters": {}
            }
            data[label][variant] = variant_data

            for reference, _ in scores:
                ref_entry: Dict[str, Any] = {
                    "start": reference.start,
                    "end": reference.end,
                    "links": [],  # added to afterwards
                    "reverse": False,  # potentially changed later
                }
                genes = {}
                for cds in reference.cdses.values():
                    gene_json = cds.get_minimal_json()
                    gene_json["linked"] = {}
                    genes[cds.name] = gene_json
                variant_data["reference_clusters"][
                    reference.get_identifier()] = ref_entry

                mismatching_strands = 0
                for ref_cds_id, hit in result.hits_by_region.get(
                        reference, {}).items():
                    assert locations.locations_overlap(hit.cds.location,
                                                       region.location)
                    query_cds = hit.cds
                    query_point = query_cds.location.start + (
                        query_cds.location.end - query_cds.location.start) // 2
                    ref_cds = reference.cdses[ref_cds_id]
                    subject_point = ref_cds.location.start + (
                        ref_cds.location.end - ref_cds.location.start) // 2
                    if query_cds.location.strand != ref_cds.location.strand:
                        mismatching_strands += 1
                    genes[ref_cds.name]["linked"][
                        region.get_region_number()] = query_cds.get_name()
                    ref_entry["links"].append({
                        "query": query_cds.get_name(),
                        "subject": ref_cds.name,
                        "query_loc": query_point,
                        "subject_loc": subject_point,
                    })
                ref_entry["reverse"] = mismatching_strands > len(
                    ref_entry["links"]) / 2
                ref_entry["genes"] = list(genes.values())
    return data
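
The resulting nesting is database label -> variant name -> "reference_clusters" -> reference identifier, with each reference carrying its coordinates, per-gene JSON, and query/subject links. A hand-written slice of that structure (all labels and values are illustrative, not real results) might look like:

example_data = {
    "some_database": {                # database label (illustrative)
        "some_variant": {             # variant name (illustrative)
            "reference_clusters": {
                "REF_CLUSTER_1": {    # reference.get_identifier() (illustrative)
                    "start": 0,
                    "end": 42000,
                    "reverse": False,  # True when most links flip strand
                    "links": [
                        {"query": "ctg1_12", "subject": "refA",
                         "query_loc": 15200, "subject_loc": 13800},
                    ],
                    "genes": [],      # minimal JSON for each reference CDS
                },
            },
        },
    },
}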