예제 #1
0
    def test_retrieval(self):
        """Verify which stored junctions a Superlocus loads as verified introns.

        Four junctions are written to an in-memory SQLite database: one on a
        different chromosome, one shifted a megabase away, one on the opposite
        strand and one matching the second intron of the test transcript.
        For every combination of transcript strand and ``stranded`` flag, the
        test checks ``locus_verified_introns`` after ``_load_introns`` has run.
        """
        db_engine = create_engine("sqlite:///:memory:")
        db.metadata.create_all(db_engine)
        session = sessionmaker(bind=db_engine)()

        exon_coords = [
            (47631264, 47631416),
            (47704590, 47704669),
            (47762671, 47762742),
            (47893062, 47893093),
            (47895572, 47895655),
            (48051942, 48051999),
        ]

        model = Transcript(accept_undefined_multi=True)
        model.chrom = "15"
        model.source = "protein_coding"
        model.start = exon_coords[0][0]
        model.end = exon_coords[-1][1]
        model.strand = "+"
        model.add_exons(exon_coords)
        model.id = "ENST00000560636"
        model.parent = "ENSG00000137872"
        # The copy is created and renamed but not used by the assertions below.
        sibling = model.copy()
        sibling.id = "ENST00000560637"

        chr1 = Chrom("1", 10**8)
        chr15 = Chrom("15", 5 * 10**8)
        session.add_all([chr1, chr15])
        session.commit()

        # Boundaries of the intron between the second and the third exon.
        intron_start = exon_coords[1][1] + 1
        intron_end = exon_coords[2][0] - 1
        shift = 10**6

        # Positional arguments, per the original note:
        # junction_start, junction_end, name, strand, score, chrom_id
        # Same coordinates, but on a different chromosome.
        other_chrom = Junction(intron_start, intron_end, "chrom_one", "+", 10,
                               chr1.chrom_id)
        # Right chromosome, but too far away from the transcript intron.
        distant = Junction(intron_start - shift, intron_end - shift,
                           "chrom_15_outside", "+", 10, chr15.chrom_id)
        # Right place, wrong strand.
        antisense = Junction(intron_start, intron_end,
                             "chrom_15_wrong_strand", "-", 10, chr15.chrom_id)
        # The only junction matching chromosome, position and strand.
        matching = Junction(intron_start, intron_end, "chrom_15", "+", 10,
                            chr15.chrom_id)
        session.add_all([other_chrom, distant, antisense, matching])
        session.commit()

        self.assertEqual(other_chrom.chrom, "1")
        for junc in (distant, antisense, matching):
            self.assertEqual(junc.chrom, "15")

        combinations = itertools.product(("+", "-", None), (True, False))
        for strand, stranded in combinations:
            model.unfinalize()
            model.strand = strand
            model.finalize()
            sup = Superlocus(model, stranded=stranded)
            intron_key = (matching.junction_start, matching.end)
            self.assertTrue(intron_key in sup.introns,
                            (matching, sup.introns))
            sup.session = session
            asyncio.run(sup._load_introns())

            verified = sup.locus_verified_introns
            if not stranded:
                # Unstranded: both junctions at the intron position are
                # expected, each reported with its own strand.
                both_strands = {
                    (matching.junction_start, matching.junction_end,
                     matching.strand),
                    (antisense.junction_start, antisense.junction_end,
                     antisense.strand),
                }
                self.assertEqual(verified, both_strands, (stranded, strand))
            elif strand is None:
                # Stranded locus built from a transcript without a strand.
                self.assertEqual(verified, set())
            else:
                # Stranded locus: one entry, carrying the transcript strand.
                self.assertEqual(
                    verified,
                    {(matching.junction_start, matching.junction_end,
                      strand)},
                    (stranded, strand))
예제 #2
0
    def test_get_external(self):
        """Check which external metrics ``Superlocus.get_external`` returns.

        Six external sources and one score per source are stored in an
        in-memory SQLite database for the first query. The test then verifies
        three situations: every metric is returned while
        ``report_all_external_metrics`` is on; nothing is returned once it is
        switched off and no ``external.*`` key is configured; and only the
        metrics named in the scoring / requirement sections are returned after
        those keys are added.
        """
        checked_conf = load_and_validate_config(None).copy()
        checked_conf.pick.output_format.report_all_external_metrics = True

        exon_coords = [
            (47631264, 47631416),
            (47704590, 47704669),
            (47762671, 47762742),
            (47893062, 47893093),
            (47895572, 47895655),
            (48051942, 48051999),
        ]

        primary = Transcript()
        primary.chrom = "15"
        primary.source = "protein_coding"
        primary.start = exon_coords[0][0]
        primary.end = exon_coords[-1][1]
        primary.strand = "+"
        primary.add_exons(exon_coords)
        primary.id = "ENST00000560636"
        primary.parent = "ENSG00000137872"
        secondary = primary.copy()
        secondary.id = "ENST00000560637"

        tpm_settings = {
            "rescaling": "max",
            "default": 0,
            "rtype": "float",
            'multiplier': 4,
            'use_raw': True,
            'percentage': True,
        }
        checked_conf.scoring.scoring["attributes.tpm"] = (
            MinMaxScore.Schema().load(tpm_settings))
        primary.attributes["tpm"] = 10

        # Same positional arguments as the individual ExternalSource(...) calls
        # this replaces; the last three sources get 1 as their third argument.
        source_specs = (
            ('int', 'int', 0),
            ('float', 'float', 0),
            ('bool', 'bool', 0),
            ('raw_int', 'int', 1),
            ('raw_float', 'float', 1),
            ('raw_bool', 'bool', 1),
        )
        sources = [ExternalSource(*spec) for spec in source_specs]

        # One score per source, all with 1 as first argument; the second
        # argument follows the order in which the sources are added (1..6).
        # Booleans are cast as int here following external.py serialize
        # function.
        score_values = (10, 10.0, int(False), 8, 8.0, int(True))
        scores = [
            External(1, source_id, value)
            for source_id, value in enumerate(score_values, start=1)
        ]

        queries = [
            Query(primary.id, primary.cdna_length),
            Query(secondary.id, secondary.cdna_length),
        ]

        db_engine = create_engine("sqlite:///:memory:")
        db.metadata.create_all(db_engine)
        session = sessionmaker(bind=db_engine)()
        session.add_all(sources)
        session.add_all(queries)
        session.add_all(scores)
        session.commit()

        sup = Superlocus(primary, configuration=checked_conf)
        sup.session = session
        self.assertIn(primary.id, sup.transcripts)

        # Stand-in for a query row: the namedtuple *class* itself is used and
        # its ``query_name`` attribute is overwritten with the transcript id.
        from collections import namedtuple
        query_stub = namedtuple('t', field_names=('query_name'))
        query_stub.query_name = 'ENST00000560636'
        queries_by_id = {1: query_stub}

        all_metrics = {
            'int': (10, False),
            'float': (10.0, False),
            'bool': (False, False),
            'raw_int': (8, True),
            'raw_float': (8.0, True),
            'raw_bool': (True, True),
        }
        external = asyncio.run(sup.get_external(queries_by_id, [1]))
        self.assertEqual(external, {'ENST00000560636': all_metrics})

        sup.configuration.pick.output_format.report_all_external_metrics = False
        external = asyncio.run(sup.get_external(queries_by_id, [1]))
        self.assertEqual(len(external), 0)

        # These are meaningless, it's just to verify we are loading *only*
        # these metrics. We should *NOT* have 'float' as it is not present in
        # any section.
        scoring = sup.configuration.scoring
        scoring.scoring["external.int"] = MinMaxScore(rescaling="max",
                                                      filter=None)
        requirement_filters = (
            ("requirements", "external.raw_float", "gt", 100),
            ("cds_requirements", "external.raw_int", "lt", 1),
            ("as_requirements", "external.raw_bool", "lt", 1),
            ("not_fragmentary", "external.bool", "ne", False),
        )
        for section, key, comparison, threshold in requirement_filters:
            getattr(scoring, section).parameters[key] = SizeFilter(
                operator=comparison, value=threshold)

        external = asyncio.run(sup.get_external(queries_by_id, [1]))
        configured_only = {
            name: value
            for name, value in all_metrics.items() if name != 'float'
        }
        self.assertEqual(external, {'ENST00000560636': configured_only})
def create_transcript(tid: str, parent: str, lines: List["GtfLine"],
                      args: argparse.Namespace):
    """Yield the transcript(s) described by a group of annotation lines.

    Lines located on more than one chromosome are partitioned by chromosome
    and each partition is processed recursively, with ``"." + chrom`` appended
    to both ``tid`` and ``parent``.

    For a single chromosome, the exon lines are sorted by position and scanned
    from left to right. With ``distance = exon.start - current.end + 1``:

    * the exon is merged into the current one when the two overlap, or when
      ``distance <= args.min_intron`` and ``args.split`` is ``False``;
    * the transcript is cut before the exon when
      ``distance <= args.min_intron`` and ``args.split`` is ``True``, or when
      ``distance > args.max_intron``; every piece receives a ``".A"``,
      ``".B"``, ... suffix on both its id and its parent;
    * otherwise the exon starts a new exon of the same transcript.

    Merging updates the ``end`` attribute of the input line objects in place.

    :param tid: identifier assigned to the transcript (possibly suffixed).
    :param parent: parent identifier assigned to the transcript (possibly
        suffixed).
    :param lines: objects exposing ``chrom``, ``start``, ``end`` and
        ``is_exon``.
    :param args: namespace providing ``min_intron``, ``max_intron`` and
        ``split``; it is only read when there are at least two exon lines.
    :return: generator of finalized ``Transcript`` objects; nothing is yielded
        when ``lines`` contains no exon line.
    """

    chroms = defaultdict(list)
    for line in lines:
        chroms[line.chrom].append(line)

    if len(chroms) > 1:
        # Recursively process each chromosome on its own.
        for chrom, chrom_lines in chroms.items():
            newtid = tid + "." + chrom
            newparent = parent + "." + chrom
            for transcript in create_transcript(newtid, newparent,
                                                chrom_lines, args):
                # The per-chromosome group may itself be split further, in
                # which case a ".<letter>" suffix follows the new names, so
                # only the prefix can be checked.
                assert transcript.id.startswith(newtid), (newtid,
                                                          transcript.id)
                assert transcript.parent[0].startswith(newparent), (
                    newparent, transcript.parent)
                yield transcript
        return

    # Now we are sure that we only have (at most) one chromosome
    exons = sorted((line for line in lines if line.is_exon),
                   key=operator.attrgetter("chrom", "start", "end"))

    if not exons:
        # No exon lines at all: there is nothing to build a transcript from.
        return

    if len(exons) == 1:
        transcript = Transcript(exons[0])
        transcript.id = tid
        transcript.parent = parent
        transcript.finalize()
        yield transcript
        return

    new_exons = deque()
    # One below "A", so that the first split piece is labelled "A" and an
    # untouched value means "no split happened".
    identifier = ord("A") - 1
    current = exons[0]

    for exon in exons[1:]:
        # NOTE(review): if coordinates are inclusive, this is two more than
        # the number of bases strictly between the exons - confirm that the
        # thresholds are meant to be compared against this value.
        distance = exon.start - current.end + 1
        too_close = distance <= args.min_intron

        if ((overlap((exon.start, exon.end),
                     (current.start, current.end)) > 0)
                or (too_close and args.split is False)):
            # Merge the two exons. Exons are sorted by start, so the next one
            # may be fully contained in the current one: never shrink it.
            current.end = max(current.end, exon.end)
        elif ((too_close and args.split is True)
              or distance > args.max_intron):
            # TODO: split
            # Close the transcript accumulated so far and start a new one
            # from this exon.
            new_exons.append(current)
            transcript = Transcript(new_exons.popleft())
            transcript.add_exons(new_exons)
            transcript.finalize()
            identifier += 1
            transcript.parent = parent + "." + chr(identifier)
            transcript.id = tid + "." + chr(identifier)
            yield transcript
            current = exon
            new_exons = deque()
        else:
            # Regular intron: keep the current exon and move on.
            new_exons.append(current)
            current = exon

    # Flush the exons still pending after the scan.
    new_exons.append(current)
    transcript = Transcript(new_exons.popleft())
    transcript.add_exons(new_exons)

    if identifier == ord("A") - 1:
        # Never split: keep the original names.
        transcript.id = tid
        transcript.parent = parent
    else:
        identifier += 1
        transcript.id = tid + "." + chr(identifier)
        transcript.parent = parent + "." + chr(identifier)

    transcript.finalize()
    yield transcript