def convert_files():
    """Convert the four extracted MNIST IDX files into two CSV files.

    Pairs up the image/label files for the train and test splits, writes
    mnist-train.csv (60000 rows) and mnist-test.csv (10000 rows), then
    deletes the intermediate IDX files.
    """
    print('Converting files to CSV...')
    # List comprehension instead of map(): yields an indexable list on both
    # Python 2 and Python 3 (py3's map() returns a non-indexable iterator).
    filenames = [utils.datafile(fn) for fn in get_filenames()]
    # assumes get_filenames() yields (train-images, train-labels,
    # test-images, test-labels) in that order -- TODO confirm
    convert(filenames[0], filenames[1], utils.datafile('mnist-train.csv'), 60000)
    convert(filenames[2], filenames[3], utils.datafile('mnist-test.csv'), 10000)
    # The raw IDX files are no longer needed once the CSVs exist.
    for fn in filenames:
        os.remove(fn)
def download_files():
    """Fetch each gzipped MNIST archive from the remote site into the data dir."""
    base_url = 'http://yann.lecun.com/exdb/mnist/'
    for name in get_filenames():
        remote = base_url + name + '.gz'
        target = utils.datafile(name + '.gz')
        print('Saving {0} to {1}'.format(remote, target))
        urllib.urlretrieve(remote, target)
def ensure_datadir_exists():
    """Create the data directory if it is not already present."""
    datadir = utils.datafile('')
    if not os.path.exists(datadir):
        print('Creating directory {0}'.format(datadir))
        os.makedirs(datadir)
    else:
        print('Directory {0} already exists; skipping'.format(datadir))
def extract_files():
    """Decompress each downloaded .gz archive, then delete the archive."""
    for fn in map(utils.datafile, get_filenames()):
        archive = fn + '.gz'
        # Single with-statement handles both the source archive and the
        # destination file; both are closed before the archive is removed.
        with gzip.open(archive, 'rb') as src, open(utils.datafile(fn), 'wb') as dst:
            print('Extracting {0}'.format(src.name))
            dst.write(src.read())
        print('Deleting {0}.gz'.format(fn))
        os.remove(archive)
def get_Y_depth(self, N=5):
    """Estimate chromosome-Y depth as the median over up to N curated regions.

    Reads the per-region unique-CCN/GC table for this reference build and
    queries the depth of each usable region via self.region_depth.

    Args:
        N: maximum number of regions to collect depths from (default 5).

    Returns:
        Median of the collected region depths (numpy scalar).
    """
    UNIQY = datafile("chrY.{}.unique_ccn.gc".format(
        self.ref.split('_')[0]))
    depths = []
    # 'with' guarantees the table file is closed; the original opened it
    # without ever closing it (resource leak).
    with open(UNIQY) as fp:
        for i, row in enumerate(fp):
            # Some regions still have mapped reads, exclude a few.
            # NOTE: i indexes ALL rows (skipped ones included), so the
            # exclusion set refers to absolute row positions in the table.
            if i in (1, 4, 6, 7, 10, 11, 13, 16, 18, 19):
                continue
            if len(depths) >= N:
                break
            c, start, end, gc = row.split()
            start, end = int(start), int(end)
            d = self.region_depth(c, start, end)
            depths.append(d)
    self.logger.debug("Y depths (first {} regions): {}"\
        .format(N, np.array(depths)))
    return np.median(depths)
import numpy as np from math import exp from collections import defaultdict from bam_parser import PEextractor, FLANKMATCH, SPAN from utils import datafile, listify from scipy.stats import gaussian_kde, poisson # Global settings MAX_PERIOD = 6 SMALL_VALUE = exp(-10) REALLY_SMALL_VALUE = exp(-100) MODEL_PREFIX = "illumina_v3.pcrfree" STEPMODEL = datafile(MODEL_PREFIX + ".stepmodel") NOISEMODEL = datafile(MODEL_PREFIX + ".stuttermodel") MIN_SPANNING_PAIRS = 5 class StepModel: """ Contains information about step size distributions """ def __init__(self, filename=STEPMODEL): self.non_unit_step_by_period = {} self.step_size_by_period = {} fp = open(filename, "r") for i in range(MAX_PERIOD): line = fp.readline() self.non_unit_step_by_period[i + 1] = float(line.strip())