Ejemplo n.º 1
0
def UniGeneParser(lines):
    """Yield one parsed UniGene record for each record GbFinder finds in lines.

    Every group of lines produced by GbFinder is handed to LinesToUniGene,
    and the "//" entry of the parsed result is removed before the record
    is yielded.
    """
    for raw_record in GbFinder(lines):
        parsed = LinesToUniGene(raw_record)
        # The delimiter line ends up as its own entry; drop it from the record.
        del parsed["//"]
        yield parsed
Ejemplo n.º 2
0
    def test_LinesToUniGene(self):
        """LinesToUniGene should give expected results on sample data"""
        fake_file = """ID          Mm.1
TITLE       S100 calcium binder
GENE        S100a10
CYTOBAND    3 41.7 cM
LOCUSLINK   20194
EXPRESS     embryo ; whole body ; mammary gland ; brain
CHROMOSOME  3
STS         ACC=RH128467 UNISTS=211775
STS         ACC=M16465 UNISTS= 178878
PROTSIM     ORG=H**o sapiens; PROTGI=107251; PROTID=pir:JC1139; PCT=91; ALN=97
PROTSIM     ORG=Mus musculus; PROTGI=116487; PROTID=sp:P08207; PCT=100; ALN=97
PROTSIM     ORG=Rattus norvegicus; PROTGI=116489; PROTID=sp:P05943; PCT=94; ALN=94
SCOUNT      5
SEQUENCE    ACC=BC025044.1; NID=g19263549; PID=g19263550; SEQTYPE=mRNA
SEQUENCE    ACC=AA471893.1; NID=g2199884; CLONE=IMAGE:872193; END=5'; LID=539; SEQTYPE=EST
SEQUENCE    ACC=AI842963.1; NID=g5477176; CLONE=UI-M-AO1-aem-f-10-0-UI; END=3'; LID=1944; SEQTYPE=EST; TRACE=158501677
SEQUENCE    ACC=CB595147.1; NID=g29513003; CLONE=IMAGE:30300703; END=5'; LID=12885; MGC=6677832; SEQTYPE=EST
SEQUENCE    ACC=BY144053.1; NID=g26280109; CLONE=L930184D22; END=5'; LID=12267; SEQTYPE=EST
//
ID          Mm.5
TITLE       homeo box A10
GENE        Hoxa10
CYTOBAND    6 26.33 cM
LOCUSLINK   15395
EXPRESS     kidney ; colon ; mammary gland
CHROMOSOME  6
PROTSIM     ORG=Caenorhabditis elegans; PROTGI=7510074; PROTID=pir:T31611; PCT=30; ALN=326
SCOUNT      1
SEQUENCE    ACC=AW990320.1; NID=g8185938; CLONE=IMAGE:1513482; END=5'; LID=1043; SEQTYPE=EST; TRACE=94472873
//
"""
        records = list(GbFinder(fake_file.split("\n")))
        self.assertEqual(len(records), 2)
        first, second = list(map(LinesToUniGene, records))
        self.assertEqual(first.ID, "Mm.1")
        self.assertEqual(first.TITLE, "S100 calcium binder")
        self.assertEqual(first.GENE, "S100a10")
        self.assertEqual(first.CYTOBAND, "3 41.7 cM")
        self.assertEqual(first.CHROMOSOME, "3")
        self.assertEqual(first.LOCUSLINK, 20194)
        self.assertEqual(first.EXPRESS,
                         ["embryo", "whole body", "mammary gland", "brain"])
        self.assertEqual(
            first.STS,
            [
                {
                    "ACC": "RH128467",
                    "UNISTS": "211775"
                },
                {
                    "ACC": "M16465",
                    "UNISTS": "178878"
                },
            ],
        )
        exp_prot_sim = list(
            map(
                UniGeneProtSimRecord,
                [
                    {
                        "ORG": "H**o sapiens",
                        "PROTGI": "107251",
                        "PROTID": "pir:JC1139",
                        "PCT": "91",
                        "ALN": "97",
                    },
                    {
                        "ORG": "Mus musculus",
                        "PROTGI": "116487",
                        "PROTID": "sp:P08207",
                        "PCT": "100",
                        "ALN": "97",
                    },
                    {
                        "ORG": "Rattus norvegicus",
                        "PROTGI": "116489",
                        "PROTID": "sp:P05943",
                        "PCT": "94",
                        "ALN": "94",
                    },
                ],
            ))
        for obs, exp in zip(first.PROTSIM, exp_prot_sim):
            self.assertEqual(obs, exp)
        self.assertEqual(first.SCOUNT, 5)
        exp_seqs = list(
            map(
                UniGeneSeqRecord,
                [
                    {
                        "ACC": "BC025044.1",
                        "NID": "g19263549",
                        "PID": "g19263550",
                        "SEQTYPE": "mRNA",
                    },
                    {
                        "ACC": "AA471893.1",
                        "NID": "g2199884",
                        "END": "5'",
                        "CLONE": "IMAGE:872193",
                        "LID": "539",
                        "SEQTYPE": "EST",
                    },
                    {
                        "ACC": "AI842963.1",
                        "NID": "g5477176",
                        "CLONE": "UI-M-AO1-aem-f-10-0-UI",
                        "END": "3'",
                        "LID": "1944",
                        "SEQTYPE": "EST",
                        "TRACE": "158501677",
                    },
                    {
                        "ACC": "CB595147.1",
                        "NID": "g29513003",
                        "CLONE": "IMAGE:30300703",
                        "END": "5'",
                        "LID": "12885",
                        "MGC": "6677832",
                        "SEQTYPE": "EST",
                    },
                    {
                        "ACC": "BY144053.1",
                        "NID": "g26280109",
                        "CLONE": "L930184D22",
                        "END": "5'",
                        "LID": "12267",
                        "SEQTYPE": "EST",
                    },
                ],
            ))
        for obs, exp in zip(first.SEQUENCE, exp_seqs):
            self.assertEqual(obs, exp)
        self.assertEqual(second.ID, "Mm.5")
        self.assertEqual(second.TITLE, "homeo box A10")
        self.assertEqual(second.GENE, "Hoxa10")
        self.assertEqual(second.CYTOBAND, "6 26.33 cM")
        self.assertEqual(second.LOCUSLINK, 15395)
        self.assertEqual(second.EXPRESS, ["kidney", "colon", "mammary gland"])
        self.assertEqual(second.CHROMOSOME, "6")
        self.assertEqual(
            second.PROTSIM,
            list(
                map(
                    UniGeneProtSimRecord,
                    [{
                        "ORG": "Caenorhabditis elegans",
                        "PROTGI": "7510074",
                        "PROTID": "pir:T31611",
                        "PCT": "30",
                        "ALN": "326",
                    }],
                )),
        )
        self.assertEqual(second.SCOUNT, 1)
        self.assertEqual(second.STS, [])
        self.assertEqual(
            second.SEQUENCE,
            list(
                map(
                    UniGeneSeqRecord,
                    [{
                        "ACC": "AW990320.1",
                        "NID": "g8185938",
                        "CLONE": "IMAGE:1513482",
                        "END": "5'",
                        "LID": "1043",
                        "SEQTYPE": "EST",
                        "TRACE": "94472873",
                    }],
                )),
        )

        # test that the synonym mapping works OK
        self.assertEqual(second.SequenceIds[0].NucleotideId, "g8185938")