def test_vanilla(self):
    """Compare reference and personalized cDNA when no SNPFilter is given.

    The personalized genome is built with the 'dummySRY_AGN' SNP set and
    the first 20 cDNA bases of protein ENSP00000438917 are checked on
    both genomes.
    """
    protein_id = 'ENSP00000438917'
    expected_reference = 'ATGCAATCATATGCTTCTGC'
    expected_personalized = 'HTGCAATCATATGCTTCTGC'

    dummy = Genome(name='GRCh37.75_Y-Only', SNPs='dummySRY_AGN')
    personalized = dummy.get(Protein, id=protein_id)[0]
    reference = self.ref.get(Protein, id=protein_id)[0]

    self.assertEqual(expected_reference, reference.transcript.cDNA[:20])
    self.assertEqual(expected_personalized, personalized.transcript.cDNA[:20])
def setUp(self):
    """Import the test genome and SNP packages, then load the reference.

    A KeyError raised by either import is reported on stdout and
    otherwise ignored, so the fixture can run repeatedly.
    """
    def import_or_report(importer, package):
        # Run one importer; a KeyError only produces a message.
        try:
            importer(package)
        except KeyError:
            # Parenthesised form prints the same text as the Python 2
            # print statement.
            print("--> Seems to already exist in db")

    import_or_report(B.importGenome, "Human.GRCh37.75_Y-Only.tar.gz")
    import_or_report(B.importSNPs, "Human_agnostic.dummySRY.tar.gz")

    self.ref = Genome(name='GRCh37.75_Y-Only')
def test_noModif(self):
    """A filter that always returns None must leave the cDNA unchanged.

    The first 20 cDNA bases of protein ENSP00000438917 are compared
    between the filtered genome and the reference genome.
    """
    from pyGeno.SNPFiltering import SNPFilter

    class MyFilter(SNPFilter):
        # Filter whose filter() returns None for every call.

        def __init__(self):
            SNPFilter.__init__(self)

        # NOTE(review): the second parameter mirrors the SNP set name;
        # presumably it is passed by keyword, so do not rename it
        # without confirming against pyGeno.
        def filter(self, chromosome, dummySRY_AGN):
            return None

    protein_id = 'ENSP00000438917'
    dummy = Genome(
        name='GRCh37.75_Y-Only',
        SNPs='dummySRY_AGN',
        SNPFilter=MyFilter(),
    )
    personalized = dummy.get(Protein, id=protein_id)[0]
    reference = self.ref.get(Protein, id=protein_id)[0]

    self.assertEqual(
        personalized.transcript.cDNA[:20],
        reference.transcript.cDNA[:20],
    )
def test_SNP(self):
    """A filter returning SequenceSNP(alt) must change the first residue.

    The reference protein is expected to start with 'M' and the
    personalized one with 'L'.
    """
    from pyGeno.SNPFiltering import SNPFilter, SequenceSNP

    class MyFilter(SNPFilter):
        # Filter that substitutes the SNP's alternative allele.

        def __init__(self):
            SNPFilter.__init__(self)

        # NOTE(review): the second parameter mirrors the SNP set name;
        # presumably it is passed by keyword, so keep the name as is.
        def filter(self, chromosome, dummySRY_AGN):
            return SequenceSNP(dummySRY_AGN.alt)

    protein_id = 'ENSP00000438917'
    dummy = Genome(
        name='GRCh37.75_Y-Only',
        SNPs='dummySRY_AGN',
        SNPFilter=MyFilter(),
    )
    personalized = dummy.get(Protein, id=protein_id)[0]
    reference = self.ref.get(Protein, id=protein_id)[0]

    self.assertEqual('M', reference.sequence[0])
    self.assertEqual('L', personalized.sequence[0])
def test_deletion(self):
    """A filter returning SequenceDel(1) must remove the first cDNA base.

    The reference cDNA must be untouched while the personalized cDNA is
    expected to be shifted by one base.
    """
    from pyGeno.SNPFiltering import SNPFilter, SequenceDel

    class MyFilter(SNPFilter):
        # Filter that deletes one base at the SNP position.

        def __init__(self):
            SNPFilter.__init__(self)

        # NOTE(review): the second parameter mirrors the SNP set name;
        # presumably it is passed by keyword, so keep the name as is.
        def filter(self, chromosome, dummySRY_AGN):
            # NOTE(review): the value is never used; the lookup is kept
            # so the filter still reads the reference sequence exactly
            # as before.
            refAllele = chromosome.refSequence[dummySRY_AGN.start]
            return SequenceDel(1)

    protein_id = 'ENSP00000438917'
    expected_reference = 'ATGCAATCATATGCTTCTGC'
    expected_personalized = 'TGCAATCATATGCTTCTGCT'

    dummy = Genome(
        name='GRCh37.75_Y-Only',
        SNPs='dummySRY_AGN',
        SNPFilter=MyFilter(),
    )
    personalized = dummy.get(Protein, id=protein_id)[0]
    reference = self.ref.get(Protein, id=protein_id)[0]

    self.assertEqual(expected_reference, reference.transcript.cDNA[:20])
    self.assertEqual(expected_personalized, personalized.transcript.cDNA[:20])
def test_indels(self):
    """A filter inserting one 'X' per item of the SNP argument.

    With the 'dummySRY_AGN_indels' SNP set the personalized cDNA is
    expected to start with 'XXX' followed by the reference bases.
    """
    from pyGeno.SNPFiltering import SNPFilter, SequenceInsert

    class MyFilter(SNPFilter):
        # Filter that inserts as many 'X' as items received.

        def __init__(self):
            SNPFilter.__init__(self)

        # NOTE(review): the second parameter mirrors the SNP set name;
        # presumably it is passed by keyword, so keep the name as is.
        def filter(self, chromosome, dummySRY_AGN_indels):
            inserted = "".join("X" for _ in dummySRY_AGN_indels)
            return SequenceInsert(inserted)

    protein_id = 'ENSP00000438917'
    dummy = Genome(
        name='GRCh37.75_Y-Only',
        SNPs='dummySRY_AGN_indels',
        SNPFilter=MyFilter(),
    )
    personalized = dummy.get(Protein, id=protein_id)[0]
    # The reference protein is fetched but not asserted on; the lookup
    # is kept so the test performs the same calls as before.
    reference = self.ref.get(Protein, id=protein_id)[0]

    self.assertEqual('XXXATGCAATCATATGCTTC', personalized.transcript.cDNA[:20])
def setUp(self):
    """Load the reference genome used by the tests.

    The package bootstrap below is disabled (commented out); only the
    Genome lookup is executed.
    NOTE(review): presumably the packages must already be installed
    for the lookup to succeed -- confirm against the test setup docs.
    """
    # try :
    #     B.importGenome("Human.GRCh37.75_Y-Only.tar.gz")
    # except KeyError :
    #     deleteGenome("human", "GRCh37.75_Y-Only")
    #     B.importGenome("Human.GRCh37.75_Y-Only.tar.gz")
    #     print "--> Seems to already exist in db"

    # try :
    #     B.importSNPs("Human_agnostic.dummySRY.tar.gz")
    # except KeyError :
    #     deleteSNPs("dummySRY_AGN")
    #     B.importSNPs("Human_agnostic.dummySRY.tar.gz")
    #     print "--> Seems to already exist in db"

    # try :
    #     B.importSNPs("Human_agnostic.dummySRY_indels")
    # except KeyError :
    #     deleteSNPs("dummySRY_AGN_indels")
    #     B.importSNPs("Human_agnostic.dummySRY_indels")
    #     print "--> Seems to already exist in db"

    reference_name = 'GRCh37.75_Y-Only'
    self.ref = Genome(name=reference_name)
class pyGenoSNPTests(unittest.TestCase):
    """SNP filtering and lookup tests run against 'GRCh37.75_Y-Only'.

    Most tests build a personalized genome from the reference plus a
    SNP set, then compare protein ENSP00000438917 on both genomes.
    """

    # Literals shared by the tests below.
    GENOME_NAME = 'GRCh37.75_Y-Only'
    PROTEIN_ID = 'ENSP00000438917'
    REFERENCE_CDNA_START = 'ATGCAATCATATGCTTCTGC'

    def setUp(self):
        """Load the reference genome; the package bootstrap is disabled."""
        # try :
        #     B.importGenome("Human.GRCh37.75_Y-Only.tar.gz")
        # except KeyError :
        #     deleteGenome("human", "GRCh37.75_Y-Only")
        #     B.importGenome("Human.GRCh37.75_Y-Only.tar.gz")
        #     print "--> Seems to already exist in db"

        # try :
        #     B.importSNPs("Human_agnostic.dummySRY.tar.gz")
        # except KeyError :
        #     deleteSNPs("dummySRY_AGN")
        #     B.importSNPs("Human_agnostic.dummySRY.tar.gz")
        #     print "--> Seems to already exist in db"

        # try :
        #     B.importSNPs("Human_agnostic.dummySRY_indels")
        # except KeyError :
        #     deleteSNPs("dummySRY_AGN_indels")
        #     B.importSNPs("Human_agnostic.dummySRY_indels")
        #     print "--> Seems to already exist in db"
        self.ref = Genome(name=self.GENOME_NAME)

    def tearDown(self):
        """Nothing to clean up."""
        pass

    def _protein_pair(self, personalized_genome):
        """Return the (personalized, reference) proteins for PROTEIN_ID."""
        personalized = personalized_genome.get(Protein, id=self.PROTEIN_ID)[0]
        reference = self.ref.get(Protein, id=self.PROTEIN_ID)[0]
        return personalized, reference

    # @unittest.skip("skipping")
    def test_vanilla(self):
        """Personalized cDNA obtained when no SNPFilter is supplied."""
        dummy = Genome(name=self.GENOME_NAME, SNPs='dummySRY_AGN')
        personalized, reference = self._protein_pair(dummy)

        self.assertEqual(self.REFERENCE_CDNA_START,
                         reference.transcript.cDNA[:20])
        self.assertEqual('HTGCAATCATATGCTTCTGC',
                         personalized.transcript.cDNA[:20])

    # @unittest.skip("skipping")
    def test_noModif(self):
        """A filter returning None must leave the cDNA unchanged."""
        from pyGeno.SNPFiltering import SNPFilter

        class MyFilter(SNPFilter):

            def __init__(self):
                SNPFilter.__init__(self)

            # NOTE(review): the second parameter mirrors the SNP set
            # name; presumably passed by keyword, so keep the name.
            def filter(self, chromosome, dummySRY_AGN):
                return None

        dummy = Genome(name=self.GENOME_NAME, SNPs='dummySRY_AGN',
                       SNPFilter=MyFilter())
        personalized, reference = self._protein_pair(dummy)

        self.assertEqual(personalized.transcript.cDNA[:20],
                         reference.transcript.cDNA[:20])

    # @unittest.skip("skipping")
    def test_insert(self):
        """A filter returning SequenceInsert('XXX') must prefix the cDNA."""
        from pyGeno.SNPFiltering import SNPFilter, SequenceInsert

        class MyFilter(SNPFilter):

            def __init__(self):
                SNPFilter.__init__(self)

            def filter(self, chromosome, dummySRY_AGN):
                # NOTE(review): value unused; lookup kept so the filter
                # still reads the reference sequence as before.
                refAllele = chromosome.refSequence[dummySRY_AGN.start]
                return SequenceInsert('XXX')

        dummy = Genome(name=self.GENOME_NAME, SNPs='dummySRY_AGN',
                       SNPFilter=MyFilter())
        personalized, reference = self._protein_pair(dummy)

        self.assertEqual(self.REFERENCE_CDNA_START,
                         reference.transcript.cDNA[:20])
        self.assertEqual('XXXATGCAATCATATGCTTC',
                         personalized.transcript.cDNA[:20])

    # @unittest.skip("skipping")
    def test_SNP(self):
        """A filter returning SequenceSNP(alt) must change residue 0."""
        from pyGeno.SNPFiltering import SNPFilter, SequenceSNP

        class MyFilter(SNPFilter):

            def __init__(self):
                SNPFilter.__init__(self)

            def filter(self, chromosome, dummySRY_AGN):
                return SequenceSNP(dummySRY_AGN.alt)

        dummy = Genome(name=self.GENOME_NAME, SNPs='dummySRY_AGN',
                       SNPFilter=MyFilter())
        personalized, reference = self._protein_pair(dummy)

        self.assertEqual('M', reference.sequence[0])
        self.assertEqual('L', personalized.sequence[0])

    # @unittest.skip("skipping")
    def test_deletion(self):
        """A filter returning SequenceDel(1) must drop the first base."""
        from pyGeno.SNPFiltering import SNPFilter, SequenceDel

        class MyFilter(SNPFilter):

            def __init__(self):
                SNPFilter.__init__(self)

            def filter(self, chromosome, dummySRY_AGN):
                # NOTE(review): value unused; lookup kept so the filter
                # still reads the reference sequence as before.
                refAllele = chromosome.refSequence[dummySRY_AGN.start]
                return SequenceDel(1)

        dummy = Genome(name=self.GENOME_NAME, SNPs='dummySRY_AGN',
                       SNPFilter=MyFilter())
        personalized, reference = self._protein_pair(dummy)

        self.assertEqual(self.REFERENCE_CDNA_START,
                         reference.transcript.cDNA[:20])
        self.assertEqual('TGCAATCATATGCTTCTGCT',
                         personalized.transcript.cDNA[:20])

    # @unittest.skip("skipping")
    def test_indels(self):
        """A filter inserting one 'X' per item of the SNP argument."""
        from pyGeno.SNPFiltering import SNPFilter, SequenceInsert

        class MyFilter(SNPFilter):

            def __init__(self):
                SNPFilter.__init__(self)

            def filter(self, chromosome, dummySRY_AGN_indels):
                inserted = "".join("X" for _ in dummySRY_AGN_indels)
                return SequenceInsert(inserted)

        dummy = Genome(name=self.GENOME_NAME, SNPs='dummySRY_AGN_indels',
                       SNPFilter=MyFilter())
        # The reference protein is fetched as well, but only the
        # personalized cDNA is asserted on.
        personalized, reference = self._protein_pair(dummy)

        self.assertEqual('XXXATGCAATCATATGCTTC',
                         personalized.transcript.cDNA[:20])

    # @unittest.skip("skipping")
    def test_bags(self):
        """Two genomes loaded by the same name compare equal once unwrapped."""
        dummy = Genome(name=self.GENOME_NAME)
        self.assertEqual(dummy.wrapped_object, self.ref.wrapped_object)

    # @unittest.skip("skipping")
    def test_prot_find(self):
        """Protein.find must locate the first and the last ten residues."""
        prot = self.ref.get(Protein, id=self.PROTEIN_ID)[0]

        head = prot.sequence[:10]
        self.assertEqual(0, prot.find(head))

        tail = prot.sequence[-10:]
        self.assertEqual(len(prot) - 10, prot.find(tail))

    # @unittest.skip("skipping")
    def test_trans_find(self):
        """Transcript.find must locate the transcript's own first 5 items."""
        trans = self.ref.get(Transcript, name="SRY-001")[0]
        prefix = trans[:5]
        self.assertEqual(0, trans.find(prefix))
def test_bags(self):
    """Two genomes loaded by the same name compare equal once unwrapped."""
    genome_name = 'GRCh37.75_Y-Only'
    duplicate = Genome(name=genome_name)
    self.assertEqual(duplicate.wrapped_object, self.ref.wrapped_object)
def setUp(self):
    """Load the 'GRCh37.75_Y-Only' reference genome into self.ref."""
    reference_name = 'GRCh37.75_Y-Only'
    self.ref = Genome(name=reference_name)
class pyGenoSNPTests(unittest.TestCase):
    """SNP filtering, lookup and remote-import tests on 'GRCh37.75_Y-Only'.

    Most tests build a personalized genome from the reference plus the
    'dummySRY_AGN' SNP set and compare protein ENSP00000438917 on both.
    """

    # Literals shared by the tests below.
    GENOME_NAME = 'GRCh37.75_Y-Only'
    SNP_SET = 'dummySRY_AGN'
    PROTEIN_ID = 'ENSP00000438917'
    REFERENCE_CDNA_START = 'ATGCAATCATATGCTTCTGC'

    def setUp(self):
        """Load the reference genome into self.ref."""
        self.ref = Genome(name=self.GENOME_NAME)

    def tearDown(self):
        """Nothing to clean up."""
        pass

    def _protein_pair(self, personalized_genome):
        """Return the (personalized, reference) proteins for PROTEIN_ID."""
        personalized = personalized_genome.get(Protein, id=self.PROTEIN_ID)[0]
        reference = self.ref.get(Protein, id=self.PROTEIN_ID)[0]
        return personalized, reference

    def test_vanilla(self):
        """Personalized cDNA obtained when no SNPFilter is supplied."""
        dummy = Genome(name=self.GENOME_NAME, SNPs=self.SNP_SET)
        personalized, reference = self._protein_pair(dummy)

        self.assertEqual(self.REFERENCE_CDNA_START,
                         reference.transcript.cDNA[:20])
        self.assertEqual('HTGCAATCATATGCTTCTGC',
                         personalized.transcript.cDNA[:20])

    def test_noModif(self):
        """A filter returning None must leave the cDNA unchanged."""
        from pyGeno.SNPFiltering import SNPFilter

        class MyFilter(SNPFilter):

            def __init__(self):
                SNPFilter.__init__(self)

            # NOTE(review): the second parameter mirrors the SNP set
            # name; presumably passed by keyword, so keep the name.
            def filter(self, chromosome, dummySRY_AGN):
                return None

        dummy = Genome(name=self.GENOME_NAME, SNPs=self.SNP_SET,
                       SNPFilter=MyFilter())
        personalized, reference = self._protein_pair(dummy)

        self.assertEqual(personalized.transcript.cDNA[:20],
                         reference.transcript.cDNA[:20])

    def test_insert(self):
        """A filter returning SequenceInsert('TCA') must alter the cDNA."""
        from pyGeno.SNPFiltering import SNPFilter, SequenceInsert

        class MyFilter(SNPFilter):

            def __init__(self):
                SNPFilter.__init__(self)

            def filter(self, chromosome, dummySRY_AGN):
                # NOTE(review): value unused; lookup kept so the filter
                # still reads the reference sequence as before.
                refAllele = chromosome.refSequence[dummySRY_AGN.start]
                return SequenceInsert('TCA')

        dummy = Genome(name=self.GENOME_NAME, SNPs=self.SNP_SET,
                       SNPFilter=MyFilter())
        personalized, reference = self._protein_pair(dummy)

        self.assertEqual(self.REFERENCE_CDNA_START,
                         reference.transcript.cDNA[:20])
        self.assertEqual('ATGATGCAATCATATGCTTC',
                         personalized.transcript.cDNA[:20])

    def test_SNP(self):
        """A filter returning SequenceSNP(alt) must change residue 0."""
        from pyGeno.SNPFiltering import SNPFilter, SequenceSNP

        class MyFilter(SNPFilter):

            def __init__(self):
                SNPFilter.__init__(self)

            def filter(self, chromosome, dummySRY_AGN):
                return SequenceSNP(dummySRY_AGN.alt)

        dummy = Genome(name=self.GENOME_NAME, SNPs=self.SNP_SET,
                       SNPFilter=MyFilter())
        personalized, reference = self._protein_pair(dummy)

        self.assertEqual('M', reference.sequence[0])
        self.assertEqual('L', personalized.sequence[0])

    def test_deletion(self):
        """A filter returning SequenceDel(1) must drop the first base."""
        from pyGeno.SNPFiltering import SNPFilter, SequenceDel

        class MyFilter(SNPFilter):

            def __init__(self):
                SNPFilter.__init__(self)

            def filter(self, chromosome, dummySRY_AGN):
                # NOTE(review): value unused; lookup kept so the filter
                # still reads the reference sequence as before.
                refAllele = chromosome.refSequence[dummySRY_AGN.start]
                return SequenceDel(1)

        dummy = Genome(name=self.GENOME_NAME, SNPs=self.SNP_SET,
                       SNPFilter=MyFilter())
        personalized, reference = self._protein_pair(dummy)

        self.assertEqual(self.REFERENCE_CDNA_START,
                         reference.transcript.cDNA[:20])
        self.assertEqual('TGCAATCATATGCTTCTGCT',
                         personalized.transcript.cDNA[:20])

    def test_bags(self):
        """Two genomes loaded by the same name compare equal once unwrapped."""
        dummy = Genome(name=self.GENOME_NAME)
        self.assertEqual(dummy.wrapped_object, self.ref.wrapped_object)

    def test_prot_find(self):
        """Protein.find must locate the first and the last ten residues."""
        prot = self.ref.get(Protein, id=self.PROTEIN_ID)[0]

        head = prot.sequence[:10]
        self.assertEqual(0, prot.find(head))

        tail = prot.sequence[-10:]
        self.assertEqual(len(prot) - 10, prot.find(tail))

    def test_trans_find(self):
        """Transcript.find must locate the transcript's own first 5 items."""
        trans = self.ref.get(Transcript, name="SRY-001")[0]
        prefix = trans[:5]
        self.assertEqual(0, trans.find(prefix))

    def test_import_remote_genome(self):
        """importRemoteGenome is expected to raise KeyError for this package."""
        with self.assertRaises(KeyError):
            B.importRemoteGenome("Human.GRCh37.75_Y-Only.tar.gz")

    def test_import_remote_snps(self):
        """importRemoteSNPs is expected to raise KeyError for this package."""
        with self.assertRaises(KeyError):
            B.importRemoteSNPs("Human_agnostic.dummySRY.tar.gz")
def importGenome(packageFile, batchSize=50, verbose=0):
    """Import a pyGeno genome package.

    A genome package is a tar.gz ball that contains at its root:

    * gzipped fasta files for all chromosomes, or URLs from where they must be downloaded

    * a gzipped GTF gene_set file from Ensembl, or a URL from where it must be downloaded

    * a manifest.ini file such as::

        [package_infos]
        description = Test package. This package installs only chromosome Y of mus musculus
        maintainer = Tariq Daouda
        maintainer_contact = tariq.daouda [at] umontreal
        version = GRCm38.73

        [genome]
        species = Mus_musculus
        name = GRCm38_test
        source = http://useast.ensembl.org/info/data/ftp/index.html

        [chromosome_files]
        Y = Mus_musculus.GRCm38.73.dna.chromosome.Y.fa.gz / or an url such as ftp://... or http://

        [gene_set]
        gtf = Mus_musculus.GRCm38.73_Y-only.gtf.gz / or an url such as ftp://... or http://

    All files except the manifest can be downloaded from: http://useast.ensembl.org/info/data/ftp/index.html

    batchSize sets the number of genes to parse before performing a database save. PCs with little ram like small values,
    while those endowed with more memory may perform faster with higher ones.

    verbose must be an int [0, 4] for various levels of verbosity.

    Returns True once the import has completed.

    Raises KeyError if the target sequence directory already exists or if
    a genome with the same name and species is already present; call
    deleteGenome() first to reinstall.

    The decompressed package directory is removed whether the import
    succeeds or fails.

    NOTE(review): the earlier documentation stated that a rollback is
    performed if an exception is caught during importation; no rollback
    logic is visible in this function (a sequence directory created
    before a failure is left in place) -- confirm where it happens.
    """

    def reformatItems(items):
        # Flatten the repr of a list of (key, value) pairs into
        # "key: value" lines by applying the replacements in order.
        s = str(items)
        replacements = (
            ('[', ''), (']', ''), ("',", ': '), ('), ', '\n'),
            ("'", ''), ('(', ''), (')', ''),
        )
        for old, new in replacements:
            s = s.replace(old, new)
        return s

    printf('Importing genome package: %s... (This may take a while)' % packageFile)

    packageDir = _decompressPackage(packageFile)
    try:
        parser = SafeConfigParser()
        parser.read(os.path.normpath(packageDir + '/manifest.ini'))
        packageInfos = parser.items('package_infos')

        genomeName = parser.get('genome', 'name')
        species = parser.get('genome', 'species')
        genomeSource = parser.get('genome', 'source')

        seqTargetDir = conf.getGenomeSequencePath(species.lower(), genomeName)
        if os.path.isdir(seqTargetDir):
            raise KeyError(
                "The directory %s already exists, Please call deleteGenome() first if you want to reinstall" % seqTargetDir)

        gtfFile = _getFile(parser.get('gene_set', 'gtf'), packageDir)

        chromosomesFiles = {}
        chromosomeSet = set()
        for key, fil in parser.items('chromosome_files'):
            chromosomesFiles[key] = _getFile(fil, packageDir)
            chromosomeSet.add(key)

        # Loading the genome is expected to fail with KeyError when it
        # is not installed yet; success means it is already installed.
        try:
            Genome(name=genomeName, species=species)
        except KeyError:
            pass
        else:
            raise KeyError(
                "There seems to be already a genome (%s, %s), please call deleteGenome() first if you want to reinstall it" % (genomeName, species))

        genome = Genome_Raba()
        genome.set(name=genomeName, species=species, source=genomeSource,
                   packageInfos=packageInfos)

        printf("Importing:\n\t%s\nGenome:\n\t%s\n..." % (
            reformatItems(packageInfos).replace('\n', '\n\t'),
            reformatItems(parser.items('genome')).replace('\n', '\n\t')))

        chros = _importGenomeObjects(gtfFile, chromosomeSet, genome,
                                     batchSize, verbose)

        os.makedirs(seqTargetDir)
        # Chromosomes are laid out back to back: each one starts where
        # the previous one ended.
        startChro = 0
        pBar = ProgressBar(nbEpochs=len(chros))
        for chro in chros:
            pBar.update(label="Importing DNA, chro %s" % chro.number)
            length = _importSequence(
                chro, chromosomesFiles[chro.number.lower()], seqTargetDir)
            chro.start = startChro
            chro.end = startChro + length
            startChro = chro.end
        pBar.close()
    finally:
        # Previously only reached on success, which left the
        # decompressed package behind whenever the import raised.
        shutil.rmtree(packageDir)

    return True
def encode(seq):
    """Return seq as a flat list holding four 0/1 values per character.

    Only 'A', 'T', 'C' and 'G' are known; any other character raises
    KeyError.
    """
    code = {
        'A': [0, 0, 0, 1],
        'T': [0, 0, 1, 0],
        'C': [0, 1, 0, 0],
        'G': [1, 0, 0, 0],
    }
    return [bit for nucleotide in seq for bit in code[nucleotide]]


ref = Genome(name='GRCh37.75_Y-Only')

# Build the examples: one 100-base cDNA window (target 1) followed by
# one random 100-base sequence (target 0).
# NOTE(review): the random example is assumed to sit inside the length
# check, giving one negative per positive -- confirm against the
# original layout.
seqs = []
targs = []
for trans in ref.iterGet(Transcript):
    if len(trans.cDNA) >= 150:
        # randint includes both bounds, so the window always fits.
        r = random.randint(0, len(trans.cDNA) - 100)
        dnaseq = trans.cDNA[r: r + 100]
        seqs.append(encode(dnaseq))
        targs.append(1)

        randomseq = getRandomSeq(100)
        seqs.append(encode(randomseq))
        targs.append(0)

# The first 20% of the examples form the validation set, the rest the
# training set.
t = int(len(seqs) * 0.2)
validation = [seqs[0:t], targs[0:t]]
train = [seqs[t:], targs[t:]]
from pyGeno.Genome import *

g = Genome(name="GRCh37.75")
prot = g.get(Protein, id='ENSP00000438917')[0]

# print the protein sequence
print(prot.sequence)
# print the protein's gene biotype
print(prot.gene.biotype)
# print protein's transcript sequence
print(prot.transcript.sequence)

# fancy queries
# NOTE(review): x1 and x2 are not defined in this snippet; they look
# like placeholders for the CDS bounds and must be set by the reader.
for exon in g.get(Exon, {
        "CDS_start >": x1,
        "CDS_end <=": x2,
        "chromosome.number": "22"}):
    # print the exon's coding sequence
    print(exon.CDS)
    # print the exon's transcript sequence
    print(exon.transcript.sequence)

# You can do the same for your subject specific genomes
# by combining a reference genome with polymorphisms
# NOTE(review): MyFilter is not defined in this snippet; presumably a
# user-written SNPFilter subclass.
g = Genome(name="GRCh37.75", SNPs=["STY21_RNA"], SNPFilter=MyFilter())