def UniGeneParser(lines):
    """Yield one parsed UniGene record for each record GbFinder finds in lines.

    Each group of lines produced by ``GbFinder(lines)`` is converted with
    ``LinesToUniGene``; the ``"//"`` entry is removed from the result before
    it is yielded.
    """
    for record_lines in GbFinder(lines):
        unigene = LinesToUniGene(record_lines)
        # The "//" delimiter ends up as an entry of the parsed record; drop it.
        del unigene["//"]
        yield unigene
def test_LinesToUniGene(self):
    """LinesToUniGene should give expected results on sample data

    Splits a two-record sample with GbFinder, parses each record with
    LinesToUniGene, and checks every field of both records, including the
    alias-style attribute access at the end.
    """
    # Two "//"-terminated records. The second one has no STS lines and only
    # a single PROTSIM and a single SEQUENCE line.
    # NOTE(review): the literal is laid out one field per line with a single
    # space after each label -- confirm this matches the intended fixture.
    fake_file = """ID Mm.1
TITLE S100 calcium binder
GENE S100a10
CYTOBAND 3 41.7 cM
LOCUSLINK 20194
EXPRESS embryo ; whole body ; mammary gland ; brain
CHROMOSOME 3
STS ACC=RH128467 UNISTS=211775
STS ACC=M16465 UNISTS= 178878
PROTSIM ORG=H**o sapiens; PROTGI=107251; PROTID=pir:JC1139; PCT=91; ALN=97
PROTSIM ORG=Mus musculus; PROTGI=116487; PROTID=sp:P08207; PCT=100; ALN=97
PROTSIM ORG=Rattus norvegicus; PROTGI=116489; PROTID=sp:P05943; PCT=94; ALN=94
SCOUNT 5
SEQUENCE ACC=BC025044.1; NID=g19263549; PID=g19263550; SEQTYPE=mRNA
SEQUENCE ACC=AA471893.1; NID=g2199884; CLONE=IMAGE:872193; END=5'; LID=539; SEQTYPE=EST
SEQUENCE ACC=AI842963.1; NID=g5477176; CLONE=UI-M-AO1-aem-f-10-0-UI; END=3'; LID=1944; SEQTYPE=EST; TRACE=158501677
SEQUENCE ACC=CB595147.1; NID=g29513003; CLONE=IMAGE:30300703; END=5'; LID=12885; MGC=6677832; SEQTYPE=EST
SEQUENCE ACC=BY144053.1; NID=g26280109; CLONE=L930184D22; END=5'; LID=12267; SEQTYPE=EST
//
ID Mm.5
TITLE homeo box A10
GENE Hoxa10
CYTOBAND 6 26.33 cM
LOCUSLINK 15395
EXPRESS kidney ; colon ; mammary gland
CHROMOSOME 6
PROTSIM ORG=Caenorhabditis elegans; PROTGI=7510074; PROTID=pir:T31611; PCT=30; ALN=326
SCOUNT 1
SEQUENCE ACC=AW990320.1; NID=g8185938; CLONE=IMAGE:1513482; END=5'; LID=1043; SEQTYPE=EST; TRACE=94472873
//
"""
    records = list(GbFinder(fake_file.split("\n")))
    self.assertEqual(len(records), 2)
    first, second = list(map(LinesToUniGene, records))

    # --- first record -----------------------------------------------------
    self.assertEqual(first.ID, "Mm.1")
    self.assertEqual(first.TITLE, "S100 calcium binder")
    self.assertEqual(first.GENE, "S100a10")
    self.assertEqual(first.CYTOBAND, "3 41.7 cM")
    self.assertEqual(first.CHROMOSOME, "3")
    # LOCUSLINK and SCOUNT are expected as ints; the other scalars as str.
    self.assertEqual(first.LOCUSLINK, 20194)
    self.assertEqual(
        first.EXPRESS, ["embryo", "whole body", "mammary gland", "brain"]
    )
    # The second STS line has a stray space after "UNISTS="; the expected
    # value carries no such space.
    self.assertEqual(
        first.STS,
        [
            {"ACC": "RH128467", "UNISTS": "211775"},
            {"ACC": "M16465", "UNISTS": "178878"},
        ],
    )

    exp_prot_sim = list(
        map(
            UniGeneProtSimRecord,
            [
                {
                    "ORG": "H**o sapiens",
                    "PROTGI": "107251",
                    "PROTID": "pir:JC1139",
                    "PCT": "91",
                    "ALN": "97",
                },
                {
                    "ORG": "Mus musculus",
                    "PROTGI": "116487",
                    "PROTID": "sp:P08207",
                    "PCT": "100",
                    "ALN": "97",
                },
                {
                    "ORG": "Rattus norvegicus",
                    "PROTGI": "116489",
                    "PROTID": "sp:P05943",
                    "PCT": "94",
                    "ALN": "94",
                },
            ],
        )
    )
    # zip() stops at the shorter input, so pin the count first; otherwise a
    # parser that returned too few entries would pass unnoticed.
    self.assertEqual(len(first.PROTSIM), len(exp_prot_sim))
    for obs, exp in zip(first.PROTSIM, exp_prot_sim):
        self.assertEqual(obs, exp)

    self.assertEqual(first.SCOUNT, 5)

    exp_seqs = list(
        map(
            UniGeneSeqRecord,
            [
                {
                    "ACC": "BC025044.1",
                    "NID": "g19263549",
                    "PID": "g19263550",
                    "SEQTYPE": "mRNA",
                },
                {
                    "ACC": "AA471893.1",
                    "NID": "g2199884",
                    "END": "5'",
                    "CLONE": "IMAGE:872193",
                    "LID": "539",
                    "SEQTYPE": "EST",
                },
                {
                    "ACC": "AI842963.1",
                    "NID": "g5477176",
                    "CLONE": "UI-M-AO1-aem-f-10-0-UI",
                    "END": "3'",
                    "LID": "1944",
                    "SEQTYPE": "EST",
                    "TRACE": "158501677",
                },
                {
                    "ACC": "CB595147.1",
                    "NID": "g29513003",
                    "CLONE": "IMAGE:30300703",
                    "END": "5'",
                    "LID": "12885",
                    "MGC": "6677832",
                    "SEQTYPE": "EST",
                },
                {
                    "ACC": "BY144053.1",
                    "NID": "g26280109",
                    "CLONE": "L930184D22",
                    "END": "5'",
                    "LID": "12267",
                    "SEQTYPE": "EST",
                },
            ],
        )
    )
    # Same guard as above: compare lengths before the pairwise comparison.
    self.assertEqual(len(first.SEQUENCE), len(exp_seqs))
    for obs, exp in zip(first.SEQUENCE, exp_seqs):
        self.assertEqual(obs, exp)

    # --- second record ----------------------------------------------------
    self.assertEqual(second.ID, "Mm.5")
    self.assertEqual(second.TITLE, "homeo box A10")
    self.assertEqual(second.GENE, "Hoxa10")
    self.assertEqual(second.CYTOBAND, "6 26.33 cM")
    self.assertEqual(second.LOCUSLINK, 15395)
    self.assertEqual(second.EXPRESS, ["kidney", "colon", "mammary gland"])
    self.assertEqual(second.CHROMOSOME, "6")
    self.assertEqual(
        second.PROTSIM,
        list(
            map(
                UniGeneProtSimRecord,
                [
                    {
                        "ORG": "Caenorhabditis elegans",
                        "PROTGI": "7510074",
                        "PROTID": "pir:T31611",
                        "PCT": "30",
                        "ALN": "326",
                    }
                ],
            )
        ),
    )
    self.assertEqual(second.SCOUNT, 1)
    # No STS lines in the second record: an empty list is expected.
    self.assertEqual(second.STS, [])
    self.assertEqual(
        second.SEQUENCE,
        list(
            map(
                UniGeneSeqRecord,
                [
                    {
                        "ACC": "AW990320.1",
                        "NID": "g8185938",
                        "CLONE": "IMAGE:1513482",
                        "END": "5'",
                        "LID": "1043",
                        "SEQTYPE": "EST",
                        "TRACE": "94472873",
                    }
                ],
            )
        ),
    )
    # test that the synonym mapping works OK: the value checked here is the
    # NID of the record's only SEQUENCE line.
    self.assertEqual(second.SequenceIds[0].NucleotideId, "g8185938")