def __init__(self, _kw=0, chromosome=21, windowSize=100, batchSize=100, testBatchSize=500,
             seed=1, test_frac=0.05, pos_frac=0.5, load_coverage=True, load_entropy=False,
             load_recombination=False, include_filtered=True, triclass=False, nearby=0,
             offset=0, load_entire=True, delref=True):
    """Load reference and indel data for one chromosome, then build the train/test sets.

    Args:
      _kw: Not read by this constructor.
      chromosome: Chromosome whose reference sequence and indel files are loaded.
      windowSize: If window size is k, we read k base pairs before the center and
        k after, for a total of 2k+1 base pairs in the input.
      batchSize: Number of training examples per batch.
      testBatchSize: Number of testing examples per test batch (we can't test
        everything at once due to memory).
      seed: Seed passed to np.random.seed; None leaves the global RNG untouched.
      test_frac: Fraction of data used for testing.
      pos_frac: Forwarded unchanged to __initializeTrainData.
      load_coverage: Whether to use coverage data as input to the model.
      load_entropy: Whether to use calculated sequence entropy as input to the model.
      load_recombination: Whether to use recombination data as input to the model.
      include_filtered: If false, filtered examples are excluded from the set of
        positive indel examples (the "_filtered" location files are read instead).
      triclass: Whether to do tri-class classification (Insertion, Deletion, or
        neither) as opposed to binary (Indel or non-indel).
      nearby: If nonzero, negative examples are only sampled from within 'nearby'
        of some positive example. Otherwise they are sampled at random from the genome.
      offset: Either 0 or 1, to handle 1-indexing of the gnomad_indels.tsv file.
        Technically should be 1, but in practice 0 seems to work just as well??
      load_entire: If false, the reference sequence and the nonzero-location index
        are deleted once the train and test sets have been initialized.
      delref: Not read by this constructor.
    """
    self.window = windowSize
    self.batchSize = batchSize
    self.testBatchSize = testBatchSize
    self.test_frac = test_frac
    self.triclass = triclass
    self.nearby = nearby
    self.offset = offset
    self.load_entropy = load_entropy
    self.load_coverage = load_coverage
    self.load_recombination = load_recombination

    # Load the reference genome and keep only the chromosome of interest.
    reference, ambiguous_bases = cs273b.load_bitpacked_reference(data_dir + "Homo_sapiens_assembly19.fasta.bp")
    self.referenceChr = reference[str(chromosome)]
    self.refChrLen = len(self.referenceChr)
    del reference, ambiguous_bases  # Preserve memory

    ext = ".txt"
    if not include_filtered:
        ext = "_filtered" + ext

    # ndmin=1 keeps a one-entry file as a length-1 array. Without it np.loadtxt
    # returns a 0-d array, which np.concatenate and set() below cannot handle.
    if self.triclass:
        self.insertionLocations = np.loadtxt(data_dir + "indelLocations{}_ins".format(chromosome) + ext, ndmin=1).astype(int)
        self.deletionLocations = np.loadtxt(data_dir + "indelLocations{}_del".format(chromosome) + ext, ndmin=1).astype(int)
        self.indelLocations = np.concatenate((self.insertionLocations, self.deletionLocations))
    else:
        self.indelLocations = np.loadtxt(data_dir + "indelLocations{}".format(chromosome) + ext, ndmin=1).astype(int)

    # Locations where the reference is nonzero (if zero, that base is missing/uncertain).
    self.nonzeroLocationsRef = np.where(np.any(self.referenceChr != 0, axis=1))[0]
    if nearby:
        # Locations where the reference sequence is zero.
        self.zeroLocationsRef = np.where(np.all(self.referenceChr == 0, axis=1))[0]
        self.setOfZeroLocations = set(self.zeroLocationsRef)

    # NOTE(review): the offset is applied to indelLocations only; in triclass mode
    # insertionLocations/deletionLocations keep their unshifted values. Confirm
    # that this is intended before relying on them with offset != 0.
    self.indelLocations = self.indelLocations - offset

    self.coverage = None
    if load_coverage:
        self.coverage = lc.load_coverage(data_dir + "coverage/{}.npy".format(chromosome))
    self.recombination = None
    if load_recombination:
        self.recombination = lr.load_recombination(data_dir + "recombination_map/genetic_map_chr{}_combined_b37.txt".format(chromosome))

    self.setOfIndelLocations = set(self.indelLocations)
    self.prevChosenRefLocations = set()
    self.cur_index = 0    # Index of next training example (for batching purposes)
    self.test_index = 0   # Index of next testing example (for batching purposes)
    self.chrom_index = 0  # Index of next location in the chromosome (for load_chromosome_window_batch function)

    if seed is not None:
        np.random.seed(seed)
    self.__initializeTrainData(pos_frac)
    self.__initializeTestData()

    if not load_entire:
        # If we don't need the sequence data once our train and test set are
        # initialized, we can delete it.
        del self.referenceChr
        del self.nonzeroLocationsRef
def __init__(self, _kw=0, chromosome=21, windowSize=100, batchSize=100, testBatchSize=500,
             seed=1, test_frac=0.05, pos_frac=0.5, load_coverage=True, load_entropy=False,
             load_recombination=False, include_filtered=True, triclass=False, nearby=0,
             offset=0, load_entire = True, delref=True):
    """Set up a single-chromosome indel data loader and initialize its train/test data.

    The constructor stores the configuration, loads the bit-packed reference for
    `chromosome`, reads the indel location file(s), optionally loads coverage and
    recombination data, seeds numpy's global RNG and finally calls the private
    train/test initializers.

    Args:
      _kw: Unused here.
      chromosome: Chromosome number used to index the reference and to build file names.
      windowSize: Stored as self.window.
      batchSize: Stored as self.batchSize.
      testBatchSize: Stored as self.testBatchSize.
      seed: Passed to np.random.seed unless it is None.
      test_frac: Stored as self.test_frac.
      pos_frac: Passed through to __initializeTrainData.
      load_coverage: When true, coverage is read from "coverage/<chromosome>.npy".
      load_entropy: Stored as self.load_entropy.
      load_recombination: When true, the recombination map for the chromosome is read.
      include_filtered: When false, the "_filtered.txt" location files are read
        instead of the ".txt" ones.
      triclass: When true, separate "_ins" and "_del" location files are read and
        concatenated; otherwise a single combined file is read.
      nearby: When nonzero, the zero locations of the reference are also indexed.
      offset: Subtracted from the loaded indel locations.
      load_entire: When false, self.referenceChr and self.nonzeroLocationsRef are
        deleted after the train/test data have been initialized.
      delref: Unused here.
    """
    # Plain configuration.
    self.window, self.batchSize, self.testBatchSize = windowSize, batchSize, testBatchSize
    self.test_frac = test_frac
    self.triclass = triclass
    self.nearby = nearby
    self.offset = offset
    self.load_entropy = load_entropy
    self.load_coverage = load_coverage
    self.load_recombination = load_recombination

    # Reference sequence for the requested chromosome only; the rest is released.
    reference, ambiguous_bases = cs273b.load_bitpacked_reference(data_dir + "Homo_sapiens_assembly19.fasta.bp")
    self.referenceChr = reference[str(chromosome)]
    self.refChrLen = len(self.referenceChr)
    del reference, ambiguous_bases

    ext = ".txt" if include_filtered else "_filtered" + ".txt"

    def read_locations(suffix=""):
        # ndmin=1 guarantees a 1-d result even when the file holds a single
        # location; a 0-d array would break np.concatenate and set() below.
        path = data_dir + "indelLocations{}{}".format(chromosome, suffix) + ext
        return np.loadtxt(path, ndmin=1).astype(int)

    if self.triclass:
        self.insertionLocations = read_locations("_ins")
        self.deletionLocations = read_locations("_del")
        self.indelLocations = np.concatenate((self.insertionLocations, self.deletionLocations))
    else:
        self.indelLocations = read_locations()

    # Rows of the reference with at least one nonzero entry.
    self.nonzeroLocationsRef = np.where(np.any(self.referenceChr != 0, axis=1))[0]
    if nearby:
        # Rows of the reference that are entirely zero.
        self.zeroLocationsRef = np.where(np.all(self.referenceChr == 0, axis=1))[0]
        self.setOfZeroLocations = set(self.zeroLocationsRef)

    # NOTE(review): only the combined array is shifted; insertionLocations and
    # deletionLocations stay unshifted in triclass mode. Confirm this is intended.
    self.indelLocations = self.indelLocations - offset

    self.coverage = lc.load_coverage(data_dir + "coverage/{}.npy".format(chromosome)) if load_coverage else None
    self.recombination = None
    if load_recombination:
        self.recombination = lr.load_recombination(data_dir + "recombination_map/genetic_map_chr{}_combined_b37.txt".format(chromosome))

    self.setOfIndelLocations = set(self.indelLocations)
    self.prevChosenRefLocations = set()

    # Batching cursors all start at the beginning.
    self.cur_index = 0
    self.test_index = 0
    self.chrom_index = 0

    if seed is not None:
        np.random.seed(seed)
    self.__initializeTrainData(pos_frac)
    self.__initializeTestData()

    if not load_entire:
        del self.referenceChr
        del self.nonzeroLocationsRef
def __init__(self, _kw=0, windowSize=100, batchSize=100, testBatchSize=500, seed=1,
             pos_frac=0.5, load_coverage=True, load_entropy=False, triclass=False,
             nearby=0, offset=0, complexity_threshold=0):
    """Configure the loader, load the full reference and build the train/test data.

    Unlike a per-chromosome loader, this variant keeps the complete reference
    returned by cs273b.load_bitpacked_reference in self.referenceChrFull. It
    takes no recombination option.

    Args:
      _kw: Not used by this constructor.
      windowSize: If window size is k, we read k base pairs before the center and
        k after, for a total of 2k+1 base pairs in the input.
      batchSize: Number of training examples per batch.
      testBatchSize: Number of testing examples per test batch (we can't test
        everything at once due to memory).
      seed: Seed for numpy's global RNG; None skips seeding.
      pos_frac: Handed to __initializeTrainData as-is.
      load_coverage: Whether to use coverage data as input to the model.
      load_entropy: Whether to use calculated sequence entropy as input to the model.
      triclass: Whether to do tri-class classification (Insertion, Deletion, or
        neither) as opposed to binary (Indel or non-indel).
      nearby: If nonzero, negative examples are only sampled from within 'nearby'
        of some positive example. Otherwise they are sampled at random from the genome.
      offset: Either 0 or 1, to handle 1-indexing of the gnomad_indels.tsv file.
        Technically should be 1, but in practice 0 seems to work just as well??
      complexity_threshold: The minimum complexity of the sequence needed to be a
        part of our train/test/val sets.
    """
    # Window and batch geometry.
    self.window = windowSize
    self.batchSize = batchSize
    self.testBatchSize = testBatchSize

    # Labelling and sampling behaviour.
    self.triclass = triclass
    self.nearby = nearby
    self.offset = offset

    # Optional model inputs.
    self.load_entropy = load_entropy
    self.load_coverage = load_coverage

    # Sequence filter.
    self.complexity_threshold = complexity_threshold

    # Load the reference genome; the ambiguous-base data is dropped right away
    # to preserve memory.
    reference_path = data_dir + "Homo_sapiens_assembly19.fasta.bp"
    self.referenceChrFull, unused_ambiguous = cs273b.load_bitpacked_reference(reference_path)
    del unused_ambiguous

    # Cursors: next training example, next testing example, and next location in
    # the chromosome (for the load_chromosome_window_batch function).
    self.cur_index = self.test_index = self.chrom_index = 0

    if seed is not None:
        np.random.seed(seed)

    self.__initializeTrainData(pos_frac)
    self.__initializeTestData()
def __init__(self, _kw=0, windowSize=100, batchSize=100, testBatchSize=500, seed=1,
             test_frac=0.05, pos_frac=0.5, load_coverage=True, load_entropy=False,
             include_filtered=True, triclass=False, nearby=0, offset=0, load_entire=True):
    """Store the configuration, load the full reference and build the train/test data.

    Args:
      _kw: Not read by this constructor.
      windowSize: Stored as self.window.
      batchSize: Stored as self.batchSize.
      testBatchSize: Stored as self.testBatchSize.
      seed: Passed to np.random.seed unless it is None.
      test_frac: Stored as self.test_frac.
      pos_frac: Passed through to __initializeTrainData.
      load_coverage: Stored as self.load_coverage.
      load_entropy: Stored as self.load_entropy.
      include_filtered: Stored as self.include_filtered.
      triclass: Stored as self.triclass.
      nearby: Stored as self.nearby.
      offset: Stored as self.offset.
      load_entire: When false, the reference and the nonzero-location index are
        dropped from the instance after the train/test data have been initialized.
    """
    self.window = windowSize
    self.batchSize = batchSize
    self.testBatchSize = testBatchSize
    self.test_frac = test_frac
    self.triclass = triclass
    self.nearby = nearby
    self.offset = offset
    self.include_filtered = include_filtered
    self.load_entropy = load_entropy
    self.load_coverage = load_coverage

    self.referenceChrFull, ambiguous_bases = cs273b.load_bitpacked_reference(
        data_dir + "Homo_sapiens_assembly19.fasta.bp")
    del ambiguous_bases

    self.cur_index = 0
    self.test_index = 0
    self.chrom_index = 0

    if seed is not None:
        np.random.seed(seed)
    self.__initializeTrainData(pos_frac)
    self.__initializeTestData()

    # Nimit ToDo -- in this new context, do the next lines need to exist?
    if not load_entire:
        # This constructor never assigns nonzeroLocationsRef itself, so only
        # remove what is actually present instead of raising AttributeError.
        for attr_name in ("referenceChrFull", "nonzeroLocationsRef"):
            if hasattr(self, attr_name):
                delattr(self, attr_name)
import matplotlib
matplotlib.use('agg')  # backend is selected before pyplot is imported
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import random
import load_coverage as lc
import utils
import cs273b

# Fixed seed for numpy's global RNG.
np.random.seed(1)

data_dir = '/datadrive/project_data/'

# Load the bit-packed reference once at import time; the second return value is
# discarded immediately.
reference, ambiguous_bases = cs273b.load_bitpacked_reference(
    data_dir + "Homo_sapiens_assembly19.fasta.bp")
del ambiguous_bases

# TODO: Can augment training set with overlapping windows (i.e. starting at random positions)

forbidden_chroms = [1, 2]
# Hard-coded; the earlier random choice was np.random.choice(19) + 3.
validation_chrom = 8
# Single-argument parenthesised form prints identically on Python 2 and Python 3.
print("Validation chromosome is %d" % validation_chrom)

k = 200
window_size = 2 * k + 1
windows_per_bin = 50
margin = 15
# The window widened by `margin` twice.
expanded_window_size = window_size + 2 * margin
batch_size = 50
num_train_ex = 500000
epochs = 12
complexity_thresh = 1.1