Exemple #1
0
def printRemoteDatawraps(location=conf.pyGeno_REMOTE_LOCATION):
    """Print all datawraps available from a remote location.

    The listing is fetched with ``listRemoteDatawraps(location)`` and only its
    ``"Ordered"`` entry is printed, as indented JSON with sorted keys.

    The location must have a datawraps.json in the following format::

        {
            "Ordered": {
                "Reference genomes": {
                    "Human": ["GRCh37.75", "GRCh38.78"],
                    "Mouse": ["GRCm38.78"]
                },
                "SNPs": {
                }
            },
            "Flat": {
                "Reference genomes": {
                    "GRCh37.75": "Human.GRCh37.75.tar.gz",
                    "GRCh38.78": "Human.GRCh37.75.tar.gz",
                    "GRCm38.78": "Mouse.GRCm38.78.tar.gz"
                },
                "SNPs": {
                }
            }
        }

    :param location: remote location handed to ``listRemoteDatawraps``;
        defaults to ``conf.pyGeno_REMOTE_LOCATION``.
    """
    datawraps = listRemoteDatawraps(location)
    printf("Available datawraps for bootstraping\n")
    # Parenthesised single-argument form: prints the same text under Python 2
    # and is also valid syntax under Python 3, unlike the print statement.
    print(json.dumps(datawraps["Ordered"],
                     sort_keys=True,
                     indent=4,
                     separators=(',', ': ')))
Exemple #2
0
def _DW(name, url):
    """Download the datawrap found at *url* into a new temporary directory.

    The directory is created with the ``pyGeno_remote_`` prefix and the file
    is stored in it under *name*. Returns the normalised path of that file.
    """
    downloadDir = tempfile.mkdtemp(prefix="pyGeno_remote_")

    startMessage = "~~~:>\n\tDownloading datawrap: %s..." % name
    printf(startMessage)

    target = os.path.normpath('%s/%s' % (downloadDir, name))
    urllib.urlretrieve(url, target)

    printf('\tdone.\n~~~:>')
    return target
Exemple #3
0
def importSNPs(packageFile):
    """The big wrapper: detect the SNP type from the package manifest and launch the corresponding import function.

    Here's an example of a SNP manifest file for Casava SNPs::

        [package_infos]
        description = Casava SNPs for testing purposes
        maintainer = Tariq Daouda
        maintainer_contact = tariq.daouda [at] umontreal
        version = 1

        [set_infos]
        species = human
        name = dummySRY
        type = Agnostic
        source = my place at IRIC

        [snps]
        filename = snps.txt # as with genomes you can either include the file at the root of the package or specify an URL from where it must be downloaded

    The manifest ``type`` is suffixed with ``SNP`` and matched case
    insensitively against: CasavaSNP, dbSNPSNP, TopHatSNP and AgnosticSNP.

    :param packageFile: path of the package, handed to ``_decompressPackage``.
    :returns: whatever the selected ``_importSNPs_*`` function returns.
    :raises KeyError: if a SNP set named like the one in the manifest already
        exists.
    :raises FutureWarning: if the manifest declares an unknown SNP type.

    The decompressed package directory is removed on every exit path.
    """
    printf("Importing polymorphism set: %s... (This may take a while)" %
           packageFile)

    packageDir = _decompressPackage(packageFile)
    try:
        parser = SafeConfigParser()
        parser.read(os.path.normpath(packageDir + '/manifest.ini'))
        # The value is not used, but the lookup is kept so that a manifest
        # without a [package_infos] section is still rejected.
        parser.items('package_infos')

        setName = parser.get('set_infos', 'name')
        typ = parser.get('set_infos', 'type') + 'SNP'
        species = parser.get('set_infos', 'species').lower()
        genomeSource = parser.get('set_infos', 'source')
        snpsFile = _getFile(parser.get('snps', 'filename'), packageDir)

        # A KeyError here is the expected case: no set with that name yet.
        try:
            SNPMaster(setName=setName)
        except KeyError:
            pass
        else:
            raise KeyError(
                "There's already a SNP set by the name %s. Use deleteSNPs() to remove it first"
                % setName)

        typKey = typ.lower()
        if typKey == 'casavasnp':
            return _importSNPs_CasavaSNP(setName, species, genomeSource,
                                         snpsFile)
        elif typKey == 'dbsnpsnp':
            return _importSNPs_dbSNPSNP(setName, species, genomeSource,
                                        snpsFile)
        elif typKey == 'tophatsnp':
            # The original compared against 'topHatsnp', which a lower-cased
            # string can never equal.
            return _importSNPs_TopHatSNP(setName, species, genomeSource,
                                         snpsFile)
        elif typKey == 'agnosticsnp':
            return _importSNPs_AgnosticSNP(setName, species, genomeSource,
                                           snpsFile)
        raise FutureWarning('Unknown SNP type in manifest %s' % typ)
    finally:
        # Every branch above returns or raises, so the clean-up must live in
        # a finally clause to ever run.
        shutil.rmtree(packageDir)
Exemple #4
0
def printDatawraps():
    """Print the result of listDatawraps(): each key, then its entries as a small tree."""
    available = listDatawraps()
    printf("Available datawraps for boostraping\n")
    for kind, names in available.iteritems():
        width = len(kind)
        # Heading, then a rule of the same width closed by the tree trunk.
        printf(kind)
        printf("~" * width + "|")
        branch = " " * width + "|" + "~~~:> "
        for entry in names:
            printf(branch + entry)
        printf('\n')
Exemple #5
0
def printRemoteDatawraps(location):
    """print all available datawraps from a remote location default is 'http://pygeno.iric.ca/_downloads/datawraps.json'

    NOTE(review): `location` is not read by this function; the listing comes
    from listDatawraps_url() called without arguments, so presumably that
    helper applies the default mentioned above -- confirm.
    """
    remote = listDatawraps_url()
    printf("Available datawraps for bootstraping\n")
    for kind, wraps in remote.iteritems():
        underline = "~" * len(kind) + "|"
        bullet = " " * len(kind) + "|" + "~~~:> "
        printf(kind)
        printf(underline)
        for wrapName in wraps:
            printf(bullet + wrapName)
        printf('\n')
Exemple #6
0
def _getFile(fil, directory):
    """Return the path of *fil* inside *directory*, downloading it first if it is a URL.

    :param fil: either a file name relative to *directory*, or a URL starting
        with ``http://``, ``https://`` or ``ftp://``.
    :param directory: directory holding the file, or receiving the download.
    :returns: the normalised local path. For a URL, the file is saved under
        the last ``/``-separated component of the URL.
    """
    # https:// was previously not recognised and such URLs were treated as
    # local relative paths.
    if fil.startswith(("http://", "https://", "ftp://")):
        printf("Downloading file: %s..." % fil)
        finalFile = os.path.normpath('%s/%s' % (directory, fil.split('/')[-1]))
        urllib.urlretrieve(fil, finalFile)
        printf('done.')
    else:
        finalFile = os.path.normpath('%s/%s' % (directory, fil))

    return finalFile
Exemple #7
0
def _getFile(fil, directory):
    """Return the path of *fil* inside *directory*, downloading it first if it is a URL.

    :param fil: either a file name relative to *directory*, or a URL starting
        with ``http://``, ``https://`` or ``ftp://``.
    :param directory: directory holding the file, or receiving the download.
    :returns: the normalised local path. For a URL, the file is saved under
        the last ``/``-separated component of the URL.
    """
    # https:// was previously not recognised and such URLs were treated as
    # local relative paths.
    if fil.startswith(("http://", "https://", "ftp://")):
        printf("Downloading file: %s..." % fil)
        finalFile = os.path.normpath('%s/%s' % (directory, fil.split('/')[-1]))
        urllib.request.urlretrieve(fil, finalFile)
        printf('done.')
    else:
        finalFile = os.path.normpath('%s/%s' % (directory, fil))

    return finalFile
Exemple #8
0
def _importSNPs_AgnosticSNP(setName, species, genomeSource, snpsFile):
    """Import a tab-separated SNP file as AgnosticSNP records and register the set.

    This function will also create an index on start->chromosomeNumber->setName.
    Warning : pyGeno will interpret all positions as 0 based.

    Every field listed by ``AgnosticSNP.getFields()`` is copied from the
    column of the same name; ``quality`` is then converted to float and
    ``start``/``end`` to int. A SNPMaster entry of type 'AgnosticSNP' is saved
    for the set.

    :param setName: name given to the imported set.
    :param species: species stored on every SNP and on the SNPMaster entry.
    :param genomeSource: not used by this function.
    :param snpsFile: path of the tab-separated file to parse.
    :returns: True.
    """
    printf('importing SNP set %s for species %s...' % (setName, species))

    snpData = CSVFile()
    snpData.parse(snpsFile, separator="\t")

    # The index is dropped before the inserts and recreated at the end.
    AgnosticSNP.dropIndex(('start', 'chromosomeNumber', 'setName'))
    conf.db.beginTransaction()

    pBar = ProgressBar(len(snpData))
    pLabel = ''
    currChrNumber = None
    for snpEntry in snpData:
        tmpChr = snpEntry['chromosomeNumber']
        if tmpChr != currChrNumber:
            currChrNumber = tmpChr
            pLabel = 'Chr %s...' % currChrNumber

        snp = AgnosticSNP()
        snp.species = species
        snp.setName = setName
        for f in snp.getFields():
            try:
                setattr(snp, f, snpEntry[f])
            except KeyError:
                # species and setName come from the arguments, not the file.
                if f not in ('species', 'setName'):
                    # Format the field name into the message; it used to be
                    # passed to printf as a separate argument.
                    printf("Warning filetype as no key %s" % f)
        snp.quality = float(snp.quality)
        snp.start = int(snp.start)
        snp.end = int(snp.end)
        snp.save()
        pBar.update(label=pLabel)

    pBar.close()

    snpMaster = SNPMaster()
    snpMaster.set(setName=setName, SNPType='AgnosticSNP', species=species)
    snpMaster.save()

    printf('saving...')
    conf.db.endTransaction()
    printf('creating indexes...')
    AgnosticSNP.ensureGlobalIndex(('start', 'chromosomeNumber', 'setName'))
    printf('importation of SNP set %s for species %s done.' %
           (setName, species))

    return True
Exemple #9
0
def deleteGenome(species, name):
    """Removes a genome from the database and deletes its sequence folder.

    The genome object and every Chromosome, Gene, Transcript, Exon and Protein
    attached to it are collected first, then deleted one by one.

    :param species: species of the genome; lower-cased for the database lookup.
    :param name: name of the genome.
    :returns: True if everything went well, False if the sequence folder
        could not be deleted (a warning is printed in that case).
    :raises KeyError: if a KeyError occurs while looking up or deleting the
        database objects, e.g. presumably when the genome is not in the
        database.
    """
    printf('deleting genome (%s, %s)...' % (species, name))

    conf.db.beginTransaction()
    objs = []
    allGood = True
    try:
        genome = Genome_Raba(name=name, species=species.lower())
        objs.append(genome)
        pBar = ProgressBar(label='preparing')
        for typ in (Chromosome_Raba, Gene_Raba, Transcript_Raba, Exon_Raba,
                    Protein_Raba):
            pBar.update()
            f = RabaQuery(typ, namespace=genome._raba_namespace)
            f.addFilter({'genome': genome})
            objs.extend(f.iterRun())
        pBar.close()

        pBar = ProgressBar(nbEpochs=len(objs), label='deleting objects')
        for e in objs:
            pBar.update()
            e.delete()
        pBar.close()

    except KeyError as e:
        # NOTE(review): the transaction opened above is not ended on this
        # path; left unchanged because the db's semantics are not visible here.
        raise KeyError(
            "\tWARNING, couldn't remove genome form db, maybe it's not there: ",
            e)

    printf('\tdeleting folder')
    try:
        shutil.rmtree(conf.getGenomeSequencePath(species, name))
    except OSError as e:
        # The original built an OSError here without raising or printing it,
        # so the failure was silent. Report it and keep returning False.
        printf('\tWARNING, Unable to delete folder: %s' % e)
        allGood = False

    conf.db.endTransaction()
    return allGood
Exemple #10
0
def _importSNPs_dbSNPSNP(setName, species, genomeSource, snpsFile):
    """Import a gzipped VCF file as dbSNPSNP records and register the set.

    This function will also create an index on start->chromosomeNumber->setName.
    Warning : pyGeno positions are 0 based (POS - 1 is stored as start).

    genomeSource is not used by this function. Returns True.
    """
    indexedFields = ('start', 'chromosomeNumber', 'setName')

    vcf = VCFFile(snpsFile, gziped=True, stream=True)
    dbSNPSNP.dropIndex(indexedFields)
    conf.db.beginTransaction()

    progress = ProgressBar()
    for record in vcf:
        chromosome = record['#CHROM']
        progress.update(label='Chr %s, %s...' % (chromosome, record['ID']))

        snp = dbSNPSNP()
        # Copy every field that has a column of the same name in the record.
        for field in snp.getFields():
            try:
                setattr(snp, field, record[field])
            except KeyError:
                pass

        # Explicit values are set after the generic copy above.
        snp.chromosomeNumber = chromosome
        snp.species = species
        snp.setName = setName
        snp.start = record['POS'] - 1
        snp.alt = record['ALT']
        snp.ref = record['REF']
        snp.end = snp.start + len(snp.alt)
        snp.save()
    progress.close()

    master = SNPMaster()
    master.set(setName=setName, SNPType='dbSNPSNP', species=species)
    master.save()

    printf('saving...')
    conf.db.endTransaction()

    printf('creating indexes...')
    dbSNPSNP.ensureGlobalIndex(indexedFields)

    doneMessage = 'importation of SNP set %s for species %s done.' % (
        setName, species)
    printf(doneMessage)
    return True
Exemple #11
0
def _importGenomeObjects(gtfFilePath, chroSet, genome, batchSize, verbose=0):
    """Build and save the chromosomes, genes, transcripts, proteins and exons described by a GTF file.

    All indexes are flushed before the importation; afterwards the index on
    Transcript 'exons' is recreated and the previously existing indexes are
    re-created from their saved definitions.

    :param gtfFilePath: path of the gziped GTF file.
    :param chroSet: chromosome names to import; a line is kept when the upper
        or lower case version of its ``seqname`` is in this set.
    :param genome: genome object that every created object is attached to; it
        is saved by this function.
    :param batchSize: a batch save is triggered when a new gene is met and
        more than this many genes are buffered.
    :param verbose: must be an int [0, 4] for various levels of verbosity.
    :returns: the chromosome objects created (``dict.values()``).
    """
    class Store(object):
        """Buffers of the objects created so far, keyed by their identifiers."""

        def __init__(self, conf):
            self.conf = conf
            self.chromosomes = {}

            self.genes = {}
            self.transcripts = {}
            self.proteins = {}
            self.exons = {}

        def batch_save(self):
            """Save the buffered genes, transcripts and proteins, then empty the buffers."""
            self.conf.db.beginTransaction()

            for c in self.genes.itervalues():
                c.save()
                conf.removeFromDBRegistery(c)

            for c in self.transcripts.itervalues():
                c.save()
                conf.removeFromDBRegistery(c.exons)
                conf.removeFromDBRegistery(c)

            for c in self.proteins.itervalues():
                c.save()
                conf.removeFromDBRegistery(c)

            self.conf.db.endTransaction()

            # Drop the references to the saved objects so they can be
            # collected. Chromosomes are kept for the whole importation.
            self.genes = {}
            self.transcripts = {}
            self.proteins = {}
            self.exons = {}

            gc.collect()

        def save_chros(self):
            """Save every chromosome, with a progress bar."""
            pBar = ProgressBar(nbEpochs=len(self.chromosomes))
            for c in self.chromosomes.itervalues():
                pBar.update(label='Chr %s' % c.number)
                c.save()
            pBar.close()

    def _stretch(obj, start, end):
        """Widen obj.start / obj.end so that they cover start and end."""
        # None is tested first so the comparison never involves None.
        if obj.start is None or start < obj.start:
            obj.start = start
        if obj.end is None or end > obj.end:
            obj.end = end

    printf('Importing gene set infos from %s...' % gtfFilePath)

    printf('Backuping indexes...')
    indexes = conf.db.getIndexes()
    printf(
        "Droping all your indexes, (don't worry i'll restore them later)...")
    Genome_Raba.flushIndexes()
    Chromosome_Raba.flushIndexes()
    Gene_Raba.flushIndexes()
    Transcript_Raba.flushIndexes()
    Protein_Raba.flushIndexes()
    Exon_Raba.flushIndexes()

    printf("Parsing gene set...")
    gtf = GTFFile(gtfFilePath, gziped=True)
    printf('Done. Importation begins!')

    store = Store(conf)
    chroNumber = None
    pBar = ProgressBar(nbEpochs=len(gtf))
    for line in gtf:
        chroN = line['seqname']
        pBar.update(label="Chr %s" % chroN)

        if chroN.upper() not in chroSet and chroN.lower() not in chroSet:
            continue

        strand = line['strand']
        gene_biotype = line['gene_biotype']
        regionType = line['feature']
        frame = line['frame']

        # The GTF start is shifted by one (0 based positions).
        start = int(line['start']) - 1
        end = int(line['end'])
        if start > end:
            start, end = end, start

        chroNumber = chroN.upper()
        if chroNumber not in store.chromosomes:
            store.chromosomes[chroNumber] = Chromosome_Raba()
            store.chromosomes[chroNumber].set(genome=genome,
                                              number=chroNumber)
        chromosome = store.chromosomes[chroNumber]

        # --- gene ---
        try:
            geneId = line['gene_id']
            geneName = line['gene_name']
        except KeyError:
            geneId = None
            geneName = None
            if verbose:
                # The original printed gtf[i] with an undefined i.
                printf('Warning: no gene_id/name found in line %s' %
                       (line, ))

        if geneId is not None:
            if geneId not in store.genes:
                if len(store.genes) > batchSize:
                    store.batch_save()

                if verbose > 0:
                    printf('\tGene %s, %s...' % (geneId, geneName))
                store.genes[geneId] = Gene_Raba()
                store.genes[geneId].set(genome=genome,
                                        id=geneId,
                                        chromosome=chromosome,
                                        name=geneName,
                                        strand=strand,
                                        biotype=gene_biotype)
            _stretch(store.genes[geneId], start, end)

        # --- transcript ---
        try:
            transId = line['transcript_id']
            transName = line['transcript_name']
        except KeyError:
            transId = None
            transName = None
            if verbose > 2:
                printf('\t\tWarning: no transcript_id, name found in line %s'
                       % (line, ))

        # Proteins and exons are only handled for lines with a transcript.
        if transId is None:
            continue

        if transId not in store.transcripts:
            if verbose > 1:
                printf('\t\tTranscript %s, %s...' % (transId, transName))
            store.transcripts[transId] = Transcript_Raba()
            store.transcripts[transId].set(genome=genome,
                                           id=transId,
                                           chromosome=chromosome,
                                           gene=store.genes.get(geneId, None),
                                           name=transName)
        transcript = store.transcripts[transId]
        _stretch(transcript, start, end)

        # --- protein ---
        try:
            protId = line['protein_id']
        except KeyError:
            protId = None
            if verbose > 2:
                printf('Warning: no protein_id found in line %s' % (line, ))

        if protId is not None and protId not in store.proteins:
            if verbose > 1:
                printf('\t\tProtein %s...' % (protId))
            store.proteins[protId] = Protein_Raba()
            store.proteins[protId].set(genome=genome,
                                       id=protId,
                                       chromosome=chromosome,
                                       gene=store.genes.get(geneId, None),
                                       transcript=transcript,
                                       name=transName)
            transcript.protein = store.proteins[protId]

        # --- exon ---
        try:
            exonNumber = int(line['exon_number']) - 1
            exonKey = (transId, exonNumber)
        except KeyError:
            exonNumber = None
            exonKey = None
            if verbose > 2:
                printf('Warning: no exon number or id found in line %s' %
                       (line, ))

        if exonKey is None:
            continue

        if verbose > 3:
            # The original formatted an undefined exonId here.
            printf('\t\t\texon %s...' % (exonNumber))

        if exonKey not in store.exons:
            store.exons[exonKey] = Exon_Raba()
            store.exons[exonKey].set(genome=genome,
                                     chromosome=chromosome,
                                     gene=store.genes.get(geneId, None),
                                     transcript=transcript,
                                     protein=store.proteins.get(protId, None),
                                     strand=strand,
                                     number=exonNumber,
                                     start=start,
                                     end=end)
            transcript.exons.append(store.exons[exonKey])
        exon = store.exons[exonKey]

        try:
            exon.id = line['exon_id']
        except KeyError:
            pass

        if regionType == 'exon':
            # The end used to be compared with the transcript's end instead
            # of the exon's own end.
            _stretch(exon, start, end)
        elif regionType == 'CDS':
            exon.CDS_start = start
            exon.CDS_end = end
            exon.frame = frame
        elif regionType == 'stop_codon':
            if strand == '+':
                exon.end += 3
                if exon.CDS_end is not None:
                    exon.CDS_end += 3
            if strand == '-':
                if exon.CDS_start is not None:
                    exon.CDS_start -= 3
    pBar.close()

    # Flush whatever is still buffered after the last line.
    store.batch_save()

    conf.db.beginTransaction()
    printf('almost done saving chromosomes...')
    store.save_chros()

    printf('saving genome object...')
    genome.save()
    conf.db.endTransaction()

    conf.db.beginTransaction()
    printf('restoring core indexes...')
    # Genome.ensureGlobalIndex(('name', 'species'))
    # Chromosome.ensureGlobalIndex('genome')
    # Gene.ensureGlobalIndex('genome')
    # Transcript.ensureGlobalIndex('genome')
    # Protein.ensureGlobalIndex('genome')
    # Exon.ensureGlobalIndex('genome')
    Transcript.ensureGlobalIndex('exons')

    printf('commiting changes...')
    conf.db.endTransaction()

    conf.db.beginTransaction()
    printf('restoring user indexes')
    pBar = ProgressBar(label="restoring", nbEpochs=len(indexes))
    for idx in indexes:
        pBar.update()
        # The last element of each backed-up index is executed as its
        # CREATE INDEX statement, made idempotent.
        conf.db.execute(idx[-1].replace('CREATE INDEX',
                                        'CREATE INDEX IF NOT EXISTS'))
    pBar.close()

    printf('commiting changes...')
    conf.db.endTransaction()

    return store.chromosomes.values()
Exemple #12
0
def importGenome(packageFile, batchSize=50, verbose=0):
    """Import a pyGeno genome package. A genome package is a tar.gz ball that contains at its root:

    * gziped fasta files for all chromosomes, or URLs from where they must be downloaded

    * gziped GTF gene_set file from Ensembl, or an URL from where it must be downloaded

    * a manifest.ini file such as::

        [package_infos]
        description = Test package. This package installs only chromosome Y of mus musculus
        maintainer = Tariq Daouda
        maintainer_contact = tariq.daouda [at] umontreal
        version = GRCm38.73

        [genome]
        species = Mus_musculus
        name = GRCm38_test
        source = http://useast.ensembl.org/info/data/ftp/index.html

        [chromosome_files]
        Y = Mus_musculus.GRCm38.73.dna.chromosome.Y.fa.gz / or an url such as ftp://... or http://

        [gene_set]
        gtf = Mus_musculus.GRCm38.73_Y-only.gtf.gz / or an url such as ftp://... or http://

    All files except the manifest can be downloaded from: http://useast.ensembl.org/info/data/ftp/index.html

    The decompressed package directory is removed whether or not the
    importation succeeds.
    NOTE(review): no database rollback is performed by this function itself;
    confirm whether one happens elsewhere.

    batchSize sets the number of genes to parse before performing a database save. PCs with little ram like
    small values, while those endowed with more memory may perform faster with higher ones.

    Verbose must be an int [0, 4] for various levels of verbosity

    :returns: True.
    :raises KeyError: if the sequence directory of the genome already exists,
        or if a genome with the same name and species already exists.
    """
    def reformatItems(items):
        """Render a list of (key, value) pairs as 'key: value' lines."""
        s = str(items)
        s = s.replace('[', '').replace(']', '').replace("',", ': ').replace(
            '), ', '\n').replace("'", '').replace('(', '').replace(')', '')
        return s

    printf('Importing genome package: %s... (This may take a while)' %
           packageFile)

    packageDir = _decompressPackage(packageFile)
    try:
        parser = SafeConfigParser()
        parser.read(os.path.normpath(packageDir + '/manifest.ini'))
        packageInfos = parser.items('package_infos')

        genomeName = parser.get('genome', 'name')
        species = parser.get('genome', 'species')
        genomeSource = parser.get('genome', 'source')

        seqTargetDir = conf.getGenomeSequencePath(species.lower(), genomeName)
        if os.path.isdir(seqTargetDir):
            raise KeyError(
                "The directory %s already exists, Please call deleteGenome() first if you want to reinstall"
                % seqTargetDir)

        gtfFile = _getFile(parser.get('gene_set', 'gtf'), packageDir)

        chromosomesFiles = {}
        chromosomeSet = set()
        for key, fil in parser.items('chromosome_files'):
            chromosomesFiles[key] = _getFile(fil, packageDir)
            chromosomeSet.add(key)

        # A KeyError here is the expected case: the genome is not installed.
        try:
            Genome(name=genomeName, species=species)
        except KeyError:
            pass
        else:
            raise KeyError(
                "There seems to be already a genome (%s, %s), please call deleteGenome() first if you want to reinstall it"
                % (genomeName, species))

        genome = Genome_Raba()
        genome.set(name=genomeName,
                   species=species,
                   source=genomeSource,
                   packageInfos=packageInfos)

        printf("Importing:\n\t%s\nGenome:\n\t%s\n..." %
               (reformatItems(packageInfos).replace('\n', '\n\t'),
                reformatItems(parser.items('genome')).replace('\n', '\n\t')))

        chros = _importGenomeObjects(gtfFile, chromosomeSet, genome,
                                     batchSize, verbose)
        os.makedirs(seqTargetDir)

        # Chromosomes are laid out one after the other: each one starts where
        # the previous one ended.
        startChro = 0
        pBar = ProgressBar(nbEpochs=len(chros))
        for chro in chros:
            pBar.update(label="Importing DNA, chro %s" % chro.number)
            length = _importSequence(chro,
                                     chromosomesFiles[chro.number.lower()],
                                     seqTargetDir)
            chro.start = startChro
            chro.end = startChro + length
            startChro = chro.end
        pBar.close()
    finally:
        # Previously only reached on success, leaving the decompressed
        # package behind on every failure.
        shutil.rmtree(packageDir)

    return True