Exemple #1
0
 def __init__(self, _kw=0, chromosome=21, windowSize=100, batchSize=100, testBatchSize=500, seed=1, test_frac=0.05, pos_frac=0.5, load_coverage=True, load_entropy=False, load_recombination=False, include_filtered=True, triclass=False, nearby=0, offset=0, load_entire=True, delref=True):
   """Load reference, indel and auxiliary data for one chromosome, then build the train/test sets.

   Args:
     _kw: Not read by this constructor.
     chromosome: Chromosome whose reference sequence and indel location files are loaded.
     windowSize: With window size k, k base pairs are read before the center and k after,
       for a total of 2k+1 base pairs in the input.
     batchSize: Number of training examples per batch.
     testBatchSize: Number of testing examples per test batch (everything cannot be tested
       at once due to memory).
     seed: Passed to np.random.seed; None leaves the global NumPy RNG untouched.
     test_frac: Fraction of data used for testing.
     pos_frac: Forwarded unchanged to __initializeTrainData.
     load_coverage: Whether to use coverage data as input to the model.
     load_entropy: Whether to use calculated sequence entropy as input to the model.
     load_recombination: Whether to use recombination data as input to the model.
     include_filtered: If false, the "_filtered" location files are read, so filtered
       examples are excluded from the set of positive indel examples.
     triclass: Whether to do tri-class classification (insertion, deletion, or neither)
       as opposed to binary (indel or non-indel).
     nearby: If nonzero, negative examples are only sampled from within 'nearby' of some
       positive example; otherwise they are sampled at random from the genome.
     offset: Either 0 or 1, to handle 1-indexing of the gnomad_indels.tsv file; it is
       subtracted from the combined indel locations.
     load_entire: If false, the sequence data is deleted once the train and test sets
       have been initialized.
     delref: Not read by this constructor.
   """
   # Plain configuration kept on the instance.
   self.window = windowSize
   self.batchSize, self.testBatchSize = batchSize, testBatchSize
   self.test_frac = test_frac
   self.triclass = triclass
   self.nearby = nearby
   self.offset = offset
   self.load_entropy = load_entropy
   self.load_coverage = load_coverage
   self.load_recombination = load_recombination

   # Keep only the requested chromosome; the rest of the genome is released to preserve memory.
   genome, ambiguous = cs273b.load_bitpacked_reference(data_dir + "Homo_sapiens_assembly19.fasta.bp")
   self.referenceChr = genome[str(chromosome)]
   del genome, ambiguous
   self.refChrLen = len(self.referenceChr)

   suffix = ("" if include_filtered else "_filtered") + ".txt"

   def read_locations(name_template):
     # Read one location file for this chromosome as an integer array.
     return np.loadtxt(data_dir + name_template.format(chromosome) + suffix).astype(int)

   if triclass:
     self.insertionLocations = read_locations("indelLocations{}_ins")
     self.deletionLocations = read_locations("indelLocations{}_del")
     positives = np.concatenate((self.insertionLocations, self.deletionLocations))
   else:
     positives = read_locations("indelLocations{}")

   # A position whose reference entries are all zero is a missing/uncertain base.
   known_rows = np.any(self.referenceChr != 0, axis=1)
   self.nonzeroLocationsRef = np.where(known_rows)[0]
   if nearby:
     unknown_rows = np.all(self.referenceChr == 0, axis=1)
     self.zeroLocationsRef = np.where(unknown_rows)[0]
     self.setOfZeroLocations = set(self.zeroLocationsRef)

   # NOTE(review): only the combined array is shifted by offset; insertionLocations and
   # deletionLocations keep their file values -- confirm that is intended.
   self.indelLocations = positives - offset
   self.setOfIndelLocations = set(self.indelLocations)
   self.prevChosenRefLocations = set()

   # Optional auxiliary tracks; None when not requested.
   self.coverage = lc.load_coverage(data_dir + "coverage/{}.npy".format(chromosome)) if load_coverage else None
   self.recombination = (
     lr.load_recombination(data_dir + "recombination_map/genetic_map_chr{}_combined_b37.txt".format(chromosome))
     if load_recombination else None
   )

   # Cursors: next training example, next testing example, and next chromosome
   # location (for the load_chromosome_window_batch function).
   self.cur_index = self.test_index = self.chrom_index = 0

   if seed is not None:
     np.random.seed(seed)
   self.__initializeTrainData(pos_frac)
   self.__initializeTestData()

   if not load_entire:
     # The sequence data is no longer needed once the train and test sets exist.
     del self.referenceChr, self.nonzeroLocationsRef
Exemple #2
0
	def __init__(self, _kw=0, chromosome=21, windowSize=100, batchSize=100, testBatchSize=500, seed=1, test_frac=0.05, pos_frac=0.5, load_coverage=True, load_entropy=False, load_recombination=False, include_filtered=True, triclass=False, nearby=0, offset=0, load_entire=True, delref=True):
		"""Load the data for one chromosome and initialize the train and test sets.

		The body is indented with tabs only; the earlier mix of tabs and spaces
		parsed only under Python 2's tab-equals-8-columns rule.

		Args:
			_kw: Not read by this constructor.
			chromosome: Chromosome whose reference sequence, indel locations and
				optional coverage/recombination files are loaded.
			windowSize: Stored as self.window.
			batchSize: Stored as self.batchSize.
			testBatchSize: Stored as self.testBatchSize.
			seed: Passed to np.random.seed unless it is None.
			test_frac: Stored as self.test_frac.
			pos_frac: Forwarded to __initializeTrainData.
			load_coverage: Stored on the instance; when true, the coverage file for
				the chromosome is loaded into self.coverage (otherwise None).
			load_entropy: Stored as self.load_entropy.
			load_recombination: Stored on the instance; when true, the recombination
				map for the chromosome is loaded into self.recombination (otherwise None).
			include_filtered: When false, the "_filtered" variants of the indel
				location files are read.
			triclass: When true, insertion and deletion locations are read from
				separate files and concatenated; otherwise a single file is read.
			nearby: Stored as self.nearby; when nonzero, the all-zero reference
				locations are also collected.
			offset: Stored as self.offset and subtracted from the combined indel
				locations.
			load_entire: When false, the reference sequence and the nonzero-location
				index are deleted after the train and test sets are initialized.
			delref: Not read by this constructor.
		"""
		self.window = windowSize
		self.batchSize = batchSize
		self.testBatchSize = testBatchSize
		self.test_frac = test_frac
		self.triclass = triclass
		self.nearby = nearby
		self.offset = offset
		self.load_entropy = load_entropy
		self.load_coverage = load_coverage
		self.load_recombination = load_recombination
		# Load the whole reference, keep only this chromosome, then free the rest.
		reference, ambiguous_bases = cs273b.load_bitpacked_reference(data_dir + "Homo_sapiens_assembly19.fasta.bp")
		self.referenceChr = reference[str(chromosome)]
		self.refChrLen = len(self.referenceChr)
		del reference, ambiguous_bases
		ext = ".txt"
		if not include_filtered: ext = "_filtered" + ext
		if self.triclass:
			self.insertionLocations = np.loadtxt(data_dir + "indelLocations{}_ins".format(chromosome) + ext).astype(int)
			self.deletionLocations = np.loadtxt(data_dir + "indelLocations{}_del".format(chromosome) + ext).astype(int)
			self.indelLocations = np.concatenate((self.insertionLocations, self.deletionLocations))
		else:
			self.indelLocations = np.loadtxt(data_dir + "indelLocations{}".format(chromosome) + ext).astype(int)
		# Indices along the first axis where at least one reference entry is nonzero.
		self.nonzeroLocationsRef = np.where(np.any(self.referenceChr != 0, axis = 1))[0]
		if nearby:
			# Indices where every reference entry is zero, also kept as a set.
			self.zeroLocationsRef = np.where(np.all(self.referenceChr == 0, axis = 1))[0]
			self.setOfZeroLocations = set(self.zeroLocationsRef)
		# NOTE(review): only the combined array is shifted; insertionLocations and
		# deletionLocations keep their file values -- confirm that is intended.
		self.indelLocations = self.indelLocations - offset
		self.coverage = None
		if load_coverage:
			self.coverage = lc.load_coverage(data_dir + "coverage/{}.npy".format(chromosome))
		self.recombination = None
		if load_recombination:
			self.recombination = lr.load_recombination(data_dir + "recombination_map/genetic_map_chr{}_combined_b37.txt".format(chromosome))
		self.setOfIndelLocations = set(self.indelLocations)
		self.prevChosenRefLocations = set()
		# Batching cursors, all starting at zero.
		self.cur_index = 0
		self.test_index = 0
		self.chrom_index = 0
		# Seed the global NumPy RNG before the train/test sets are built.
		if seed is not None:
			np.random.seed(seed)
		self.__initializeTrainData(pos_frac)
		self.__initializeTestData()
		if not load_entire:
			del self.referenceChr
			del self.nonzeroLocationsRef
Exemple #3
0
 def __init__(self, _kw=0, windowSize=100, batchSize=100, testBatchSize=500, seed=1, pos_frac=0.5, load_coverage=True, load_entropy=False, triclass=False, nearby=0, offset=0, complexity_threshold=0):
   """Store the loader settings, load the full reference genome and build the train/test sets.

   Args:
     _kw: Not read by this constructor.
     windowSize: With window size k, k base pairs are read before the center and k after,
       for a total of 2k+1 base pairs in the input.
     batchSize: Number of training examples per batch.
     testBatchSize: Number of testing examples per test batch (everything cannot be tested
       at once due to memory).
     seed: Passed to np.random.seed; None leaves the global NumPy RNG untouched.
     pos_frac: Forwarded unchanged to __initializeTrainData.
     load_coverage: Whether to use coverage data as input to the model.
     load_entropy: Whether to use calculated sequence entropy as input to the model.
     triclass: Whether to do tri-class classification (insertion, deletion, or neither)
       as opposed to binary (indel or non-indel).
     nearby: If nonzero, negative examples are only sampled from within 'nearby' of some
       positive example; otherwise they are sampled at random from the genome.
     offset: Either 0 or 1, to handle 1-indexing of the gnomad_indels.tsv file.
     complexity_threshold: Minimum complexity a sequence needs to be part of the
       train/test/val sets.
   """
   # Window and batch geometry.
   self.window = windowSize
   self.batchSize, self.testBatchSize = batchSize, testBatchSize

   # Classification and sampling options.
   self.triclass, self.nearby, self.offset = triclass, nearby, offset

   # Optional model inputs. This variant takes no recombination parameter, so no
   # load_recombination attribute is set here.
   self.load_entropy, self.load_coverage = load_entropy, load_coverage

   self.complexity_threshold = complexity_threshold

   # The reference for every chromosome is kept; the ambiguous-base data is
   # released immediately to preserve memory.
   bitpacked_path = data_dir + "Homo_sapiens_assembly19.fasta.bp"
   self.referenceChrFull, ambiguous = cs273b.load_bitpacked_reference(bitpacked_path)
   del ambiguous

   # Cursors: next training example, next testing example, and next chromosome
   # location (for the load_chromosome_window_batch function).
   self.cur_index = self.test_index = self.chrom_index = 0

   if seed is not None:
     np.random.seed(seed)
   self.__initializeTrainData(pos_frac)
   self.__initializeTestData()
Exemple #4
0
 def __init__(self, _kw=0, windowSize=100, batchSize=100, testBatchSize=500,
              seed=1, test_frac=0.05, pos_frac=0.5, load_coverage=True,
              load_entropy=False, include_filtered=True, triclass=False,
              nearby=0, offset=0, load_entire=True):
     """Record the loader settings, load the full reference and build the train/test sets.

     Args:
         _kw: Not read by this constructor.
         windowSize: Stored as self.window.
         batchSize: Stored as self.batchSize.
         testBatchSize: Stored as self.testBatchSize.
         seed: Passed to np.random.seed; None leaves the global NumPy RNG untouched.
         test_frac: Stored as self.test_frac.
         pos_frac: Forwarded unchanged to __initializeTrainData.
         load_coverage: Stored as self.load_coverage.
         load_entropy: Stored as self.load_entropy.
         include_filtered: Stored as self.include_filtered.
         triclass: Stored as self.triclass.
         nearby: Stored as self.nearby.
         offset: Stored as self.offset.
         load_entire: When false, the reference and the nonzero-location index are
             deleted from the instance after the train and test sets are built.
     """
     # Batch geometry and split size.
     self.window = windowSize
     self.batchSize, self.testBatchSize = batchSize, testBatchSize
     self.test_frac = test_frac

     # Labelling and sampling options.
     self.triclass, self.nearby, self.offset = triclass, nearby, offset
     self.include_filtered = include_filtered

     # Optional model inputs.
     self.load_entropy, self.load_coverage = load_entropy, load_coverage

     # Keep the reference for all chromosomes; drop the ambiguous-base data at once.
     reference_path = data_dir + "Homo_sapiens_assembly19.fasta.bp"
     self.referenceChrFull, ambiguous = cs273b.load_bitpacked_reference(reference_path)
     del ambiguous

     # Batching cursors all start at zero.
     self.cur_index = self.test_index = self.chrom_index = 0

     if seed is not None:
         np.random.seed(seed)
     self.__initializeTrainData(pos_frac)
     self.__initializeTestData()

     if load_entire:
         return
     # TODO (Nimit): in this new context, does this cleanup still need to exist?
     # NOTE(review): nonzeroLocationsRef is not assigned in this constructor;
     # presumably one of the initialize helpers creates it -- confirm, otherwise
     # this del raises AttributeError.
     del self.referenceChrFull, self.nonzeroLocationsRef
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt

import tensorflow as tf
import numpy as np
import random

import load_coverage as lc
import utils
import cs273b

# Seed NumPy's global RNG with a fixed value.
np.random.seed(1)

data_dir = '/datadrive/project_data/'
# Load the bit-packed reference genome; the ambiguous-base data is discarded right away.
reference, ambiguous_bases = cs273b.load_bitpacked_reference(
    data_dir + "Homo_sapiens_assembly19.fasta.bp")
del ambiguous_bases

# TODO: Can augment training set with overlapping windows (i.e. starting at random positions)
# NOTE(review): presumably chromosomes excluded from sampling -- confirm at the use sites.
forbidden_chroms = [1, 2]
validation_chrom = 8  # fixed; disabled alternative: np.random.choice(19) + 3
# Parenthesized single-argument form: prints the same text under Python 2 and
# is also valid Python 3 syntax.
print("Validation chromosome is %d" % validation_chrom)
k = 200
window_size = 2 * k + 1  # k positions on each side plus the center
windows_per_bin = 50
margin = 15
expanded_window_size = window_size + 2 * margin  # window widened by `margin` on both sides
batch_size = 50
num_train_ex = 500000
epochs = 12
complexity_thresh = 1.1