def __initializeTrainData(self, frac_positives):
    """Build the shuffled example set from chromosomes 1-22.

    For every chromosome the same number of indel (positive) positions is
    sampled from that chromosome's location file(s), and negative positions
    are drawn either at a random nonzero distance of at most ``self.nearby``
    from a sampled indel (when ``self.nearby`` is truthy) or from the rows of
    the reference that are not all zero.  For each example a window of
    ``2 * k + 1`` reference rows (``k = self.window``) is stored, together
    with the coverage window when ``self.load_coverage`` is set.  Finally all
    per-example arrays are shuffled in unison and published on ``self``.

    Args:
        frac_positives: desired fraction of examples that are indels.  The
            number of negatives is ``(1 / frac_positives - 1)`` times the
            number of indels, rounded down to a multiple of 22.

    Attributes written: ``dataset``, ``coverageDataset``, ``entropyDataset``,
    ``labels``, ``genome_positions``, ``indices``, ``nearby_indels``,
    ``num_train_examples`` and ``ordering``.  Per-chromosome working
    attributes (``referenceChr``, ``indelLocations``, ``coverage``,
    ``prevChosenRefLocations``, ...) are left holding the values of the last
    chromosome processed.
    """
    num_chromosomes = 22
    max_resample_attempts = 1001
    k = self.window  # for brevity

    # Only the *length* of chromosome 21's location file is used here: it
    # fixes how many indels are taken overall (rounded down so that it
    # divides evenly across the chromosomes).
    self.indelLocations = np.loadtxt(
        data_dir + "indelLocations21.txt").astype(int)
    lengthIndels = (len(self.indelLocations) // num_chromosomes) * num_chromosomes
    num_negatives = (int((1. / frac_positives - 1) * lengthIndels)
                     // num_chromosomes) * num_chromosomes
    total_length = lengthIndels + num_negatives
    num_negatives_per_chrom = num_negatives // num_chromosomes
    lengthIndels_per_chrom = lengthIndels // num_chromosomes
    total_length_per_chrom = lengthIndels_per_chrom + num_negatives_per_chrom

    dataset = np.zeros((total_length, 2 * k + 1, 4))
    coverageDataset = np.zeros((total_length, 2 * k + 1))
    entropyDataset = np.zeros((total_length, 2 * k + 1))
    indices = np.zeros(total_length, dtype=np.uint32)
    nearby_indels = np.zeros(total_length, dtype=np.uint32)
    # np.bool_ instead of the np.bool alias, which newer NumPy no longer has.
    labeltype = np.uint8 if self.triclass else np.bool_
    labels = np.zeros(total_length, dtype=labeltype)
    genome_positions = np.zeros(total_length, dtype=np.uint32)

    for chromosome in range(1, num_chromosomes + 1):
        # First row of this chromosome's slice in the per-example arrays.
        base = total_length_per_chrom * (chromosome - 1)

        self.referenceChr = self.referenceChrFull[str(chromosome)]
        self.refChrLen = len(self.referenceChr)
        ext = ".txt"
        if not self.include_filtered:
            ext = "_filtered" + ext

        if self.triclass:
            self.insertionLocations = np.loadtxt(
                data_dir + "indelLocations{}_ins".format(chromosome) + ext).astype(int)
            self.deletionLocations = np.loadtxt(
                data_dir + "indelLocations{}_del".format(chromosome) + ext).astype(int)
            self.indelLocationsFull = np.concatenate(
                (self.insertionLocations, self.deletionLocations))
            # Half insertions, the remainder deletions.  The sample must be
            # written back to insertionLocations (the original used the
            # misspelled attribute `insertLocations`), because the labelling
            # loop below relies on len(self.insertionLocations).
            self.insertionLocations = np.random.choice(
                self.insertionLocations,
                size=lengthIndels_per_chrom // 2,
                replace=False)
            self.deletionLocations = np.random.choice(
                self.deletionLocations,
                size=lengthIndels_per_chrom - lengthIndels_per_chrom // 2,
                replace=False)
            self.indelLocations = np.concatenate(
                (self.insertionLocations, self.deletionLocations))
            self.indelLocations = self.indelLocations - self.offset
        else:
            self.indelLocationsFull = np.loadtxt(
                data_dir + "indelLocations{}".format(chromosome) + ext).astype(int)
            self.indelLocations = np.random.choice(
                self.indelLocationsFull, size=lengthIndels_per_chrom, replace=False)
            self.indelLocations = self.indelLocations - self.offset

        # Rows of the reference with at least one nonzero entry.
        self.nonzeroLocationsRef = np.where(
            np.any(self.referenceChr != 0, axis=1))[0]
        if self.nearby:
            # Rows that are entirely zero must not be picked as negatives.
            self.zeroLocationsRef = np.where(
                np.all(self.referenceChr == 0, axis=1))[0]
            self.setOfZeroLocations = set(self.zeroLocationsRef)

        self.coverage = None
        if self.load_coverage:
            self.coverage = lc.load_coverage(
                data_dir + "coverage/{}.npy".format(chromosome))

        self.setOfIndelLocations = set(self.indelLocations)
        self.prevChosenRefLocations = set()

        # For a positive example the "nearby" indel is the indel itself.
        nearby_indels[base:base + lengthIndels_per_chrom] = self.indelLocations

        # dataset should have all the indels as well as random negative
        # training samples
        if self.nearby:
            neg_positions = np.random.choice(
                self.indelLocations, size=num_negatives_per_chrom)
            nearby_indels[base + lengthIndels_per_chrom:
                          base + total_length_per_chrom] = neg_positions
            offset = np.multiply(
                np.random.randint(1, self.nearby + 1, size=num_negatives_per_chrom),
                np.random.choice([-1, 1], size=num_negatives_per_chrom))
            # locations that are offset from indels by some amount
            neg_positions = neg_positions + offset
        else:
            neg_positions = np.random.choice(
                self.nonzeroLocationsRef, size=num_negatives_per_chrom)
            self.nearby_indels = neg_positions  # to prevent error if this is undefined

        for i in range(total_length_per_chrom):
            row = base + i
            if i < lengthIndels_per_chrom:
                if not self.triclass:
                    label = 1  # standard binary classification labels
                elif i < len(self.insertionLocations):
                    label = 1  # insertions will be labeled as 1
                else:
                    label = 2  # deletions will be labeled as 2
                pos = self.indelLocations[i]
            else:
                label = 0
                pos = neg_positions[i - lengthIndels_per_chrom]
                if self.nearby:
                    niter = 0
                    # The attempt cap has to guard the whole disjunction.
                    # Without the parentheses `and` binds tighter than `or`,
                    # so the cap only applied to the indel test and the loop
                    # could spin forever.  Once the cap is hit the current
                    # candidate is accepted as is.
                    while ((pos in self.prevChosenRefLocations
                            or pos in self.setOfZeroLocations
                            or pos in self.setOfIndelLocations)
                           and niter < max_resample_attempts):
                        nearby_indels[row] = np.random.choice(self.indelLocations)
                        pos = nearby_indels[row] + np.random.randint(
                            1, self.nearby + 1) * np.random.choice([-1, 1])
                        niter += 1
                else:
                    while (pos in self.prevChosenRefLocations
                           or pos in self.setOfIndelLocations):
                        pos = np.random.choice(self.nonzeroLocationsRef)
                self.prevChosenRefLocations.add(pos)

            indices[row] = pos
            # get k base pairs before and after the position
            window = self.referenceChr[pos - k:pos + k + 1]
            # NOTE(review): when coverage is not loaded, None is assigned into
            # the float array below; NumPy is expected to store that as NaN
            # (not 0) -- confirm this is what downstream code wants.
            coverageWindow = None
            if self.coverage is not None:
                coverageWindow = utils.flatten(self.coverage[pos - k:pos + k + 1])
            dataset[row] = window
            coverageDataset[row] = coverageWindow
            labels[row] = label
            genome_positions[row] = pos

    if self.load_entropy:
        entropyDataset[:, k + 1:2 * k + 1] = entropy.entropyVector(dataset)

    # Shuffle every per-example array with the same permutation.  zip() must
    # be materialised: on Python 3 it is an iterator, which cannot be
    # shuffled in place.
    rawZipped = list(zip(list(dataset), list(coverageDataset), list(labels),
                         list(genome_positions), list(indices),
                         list(nearby_indels), list(entropyDataset)))
    np.random.shuffle(rawZipped)
    a, b, c, d, e, f, g = zip(*rawZipped)

    self.dataset = np.array(a)
    self.coverageDataset = np.array(b)
    self.entropyDataset = np.array(g)
    labels = np.array(c, dtype=labeltype)
    self.genome_positions = np.array(d, dtype=np.uint32)
    self.indices = np.array(e, dtype=np.uint32)
    self.nearby_indels = np.array(f, dtype=np.uint32)
    if self.triclass:
        self.labels = utils.to_onehot(labels, 3)
    else:
        self.labels = np.expand_dims(labels, axis=1)

    # The first num_train_examples shuffled rows are the training portion.
    self.num_train_examples = int(round(total_length * (1 - self.test_frac)))
    self.ordering = list(range(0, self.num_train_examples))
def __initializeTrainData(self, frac_positives):
    """Build the indel / non-indel example set from chromosomes 2-22.

    For each chromosome, 25000 indel positions are sampled from
    ``indelLocationsFiltered<chrom>.npy`` after filtering on the file's
    filter column, allele count == 1 and a sequence-complexity threshold
    (``self.complexity_threshold``, computed over a +/-20 window).  Negative
    positions come from ``nonindelLocationsSampled<chrom>.npy`` with the same
    complexity filter; when ``self.nearby`` is truthy they are preferentially
    taken within ``self.nearby`` positions of a sampled indel.  For every
    example the ``2 * k + 1`` reference rows around it (``k = self.window``)
    are stored.  Two distinct chromosomes are then drawn at random as the
    validation and test chromosomes; all others form the training set.

    Args:
        frac_positives: desired fraction of examples that are indels.  The
            number of negatives is ``(1 / frac_positives - 1)`` times the
            number of indels, rounded down to a multiple of 21.

    Attributes written: ``dataset``, ``coverageDataset``, ``entropyDataset``,
    ``indices``, ``allele_count``, ``nearby_indels`` (left all zero, see the
    TODO below), ``genome_positions``, ``labels``, ``val_chrom``,
    ``test_chrom``, ``num_train_examples``, ``train_indices``,
    ``test_indices`` and ``val_indices``.  Summary statistics are printed at
    the end.
    """
    k = self.window  # for brevity
    # The window size used to compute sequence complexity
    k_seq_complexity = 20
    # We use chromosomes 2-22, we won't use chromosome 1 until the very end
    num_chrom_used = 21

    # Number of indels in the entire dataset used to train/test/val
    lengthIndels = 25000 * num_chrom_used
    # Number of non-indels in the entire dataset
    num_negatives = (int((1. / frac_positives - 1) * lengthIndels)
                     // num_chrom_used) * num_chrom_used
    # Number of locations in the entire dataset
    total_length = lengthIndels + num_negatives

    # Number of non-indels per chromosome
    num_negatives_per_chrom = num_negatives // num_chrom_used
    # Number of indels per chromosome
    lengthIndels_per_chrom = lengthIndels // num_chrom_used
    # Number of locations per chromosome
    total_length_per_chrom = lengthIndels_per_chrom + num_negatives_per_chrom

    # one-hot encoded sequences of size 2*k + 1 around each location
    dataset = np.zeros((total_length, 2 * k + 1, 4))
    # coverage corresponding to each location in the dataset
    coverageDataset = np.zeros((total_length, 2 * k + 1))
    # entropy of expanding windows in the dataset
    entropyDataset = np.zeros((total_length, 2 * k + 1))
    # indices on the genome of the locations in the dataset
    indices = np.zeros(total_length, dtype=np.uint32)
    # allele count values for indels, 0 for non-indels
    allele_count = np.zeros(total_length, dtype=np.uint32)
    nearby_indels = np.zeros(total_length, dtype=np.uint32)
    # label is either a bool or an int depending on the number of classes
    # (np.bool_ instead of the np.bool alias, which newer NumPy no longer has)
    labeltype = np.uint8 if self.triclass else np.bool_
    # 0 for non-indels, 1 (and 2) in case of indels
    labels = np.zeros(total_length, dtype=labeltype)
    # holds the same values as `indices`; TODO: does it need to be there?
    genome_positions = np.zeros(total_length, dtype=np.uint32)
    # the chromosome number corresponding to each location
    chrom_num = np.zeros(total_length, dtype=np.uint32)

    # Count indels falling inside negative windows, and inside positive
    # windows (every positive window contains at least its own indel).
    num_indel_neg_set = 0
    num_indel_pos_set = 0

    # Offsets of the window used for the sequence-complexity filter.
    complexity_offsets = np.arange(2 * k_seq_complexity + 1) - k_seq_complexity

    # Load data from chromosomes 2-22 and populate the arrays per chromosome
    for chromosome in range(2, 23):
        # First row of this chromosome's slice in the per-example arrays.
        base = total_length_per_chrom * (chromosome - 2)

        # Load the chromosome from the full genome
        referenceChr = self.referenceChrFull[str(chromosome)]

        ## Load and process the positive (indels) dataset
        # This is a 4 column data: indel locations, allele count, filter
        # value, insertion (1) or deletion (0)
        indel_data_load = np.load(
            data_dir + "indelLocationsFiltered" + str(chromosome) + ".npy")
        indel_indices_set = set(np.array(indel_data_load[:, 0], dtype=int))
        # Drop indels whose +k window would run past the end of the reference.
        indel_data_load = indel_data_load[
            indel_data_load[:, 0] + k < referenceChr.shape[0]]

        # Sequence complexity of the +/-k_seq_complexity window of each indel;
        # the index array has one row per indel.
        indel_positions = np.array(indel_data_load[:, 0], dtype=int)
        indel_sequence_indices = (indel_positions[:, np.newaxis]
                                  + complexity_offsets[np.newaxis, :])
        indel_sequence_complexity = entropy.entropySequence(
            referenceChr[indel_sequence_indices, :])
        del indel_sequence_indices

        # Keep rows with filter value 1, complexity at or above the
        # threshold, and allele count 1.
        total_indices = np.arange(indel_data_load.shape[0])
        filtered_indices = np.logical_and(
            indel_data_load[:, 2] == 1,
            indel_sequence_complexity >= self.complexity_threshold)
        filtered_indices = np.logical_and(
            indel_data_load[:, 1] == 1, filtered_indices)

        # Sample the indels, taking into consideration the classification
        # problem in hand
        if self.triclass:
            # indel_data_load is a NumPy array (from np.load), so plain
            # indexing is used; the pandas-style `.iloc` the original called
            # does not exist on it.
            filtered_indices_insert = total_indices[np.logical_and(
                indel_data_load[:, 3] == 1, filtered_indices)]
            filtered_indices_delete = total_indices[np.logical_and(
                indel_data_load[:, 3] == 0, filtered_indices)]
            insertionLocations = np.random.choice(
                filtered_indices_insert,
                size=lengthIndels_per_chrom // 2,
                replace=False)
            deletionLocations = np.random.choice(
                filtered_indices_delete,
                size=lengthIndels_per_chrom - lengthIndels_per_chrom // 2,
                replace=False)
            # Insertions first: the labelling loop below relies on this order.
            indel_indices = np.concatenate((insertionLocations, deletionLocations))
            del (filtered_indices_insert, filtered_indices_delete,
                 insertionLocations, deletionLocations)
        else:
            filtered_indices = total_indices[filtered_indices]
            indel_indices = np.random.choice(
                filtered_indices, size=lengthIndels_per_chrom, replace=False)

        indelLocations = np.array(indel_data_load[indel_indices, 0], dtype=int)
        allele_count_val = indel_data_load[indel_indices, 1]
        del indel_data_load, indel_indices, filtered_indices, total_indices
        indelLocations = indelLocations - self.offset

        ## Load the coverage data if needed
        coverage = None
        if self.load_coverage:
            coverage = lc.load_coverage(
                data_dir + "coverage/{}.npy".format(chromosome))

        ## Create the negative dataset
        # Oversample candidates so enough survive the complexity filter.
        rel_size_neg_large = 2
        neg_positions_large = np.load(
            data_dir + "nonindelLocationsSampled" + str(chromosome) + '.npy')
        neg_positions_large = np.random.choice(
            neg_positions_large,
            size=rel_size_neg_large * num_negatives_per_chrom,
            replace=False)
        # Remove those that have complexity below the threshold
        neg_sequence_indices = (neg_positions_large[:, np.newaxis]
                                + complexity_offsets[np.newaxis, :])
        neg_sequence_complexity = entropy.entropySequence(
            referenceChr[neg_sequence_indices, :])
        neg_positions_large = neg_positions_large[
            neg_sequence_complexity >= self.complexity_threshold]
        del neg_sequence_indices, neg_sequence_complexity

        if self.nearby:
            # All positions within self.nearby of a sampled indel
            nearby_offsets = np.arange(-self.nearby, self.nearby + 1)
            nearby_candidates = (indelLocations[np.newaxis, :]
                                 + nearby_offsets[:, np.newaxis]).reshape(-1)
            # Remove all indel locations and keep only candidates that passed
            # the complexity filter.  The original referenced an undefined
            # `indelLocationsFull` here; the set of every indel position
            # loaded for this chromosome is used instead.
            # NOTE(review): indel_indices_set holds positions *before*
            # self.offset is subtracted -- confirm this is the intended
            # coordinate system.
            allowed_nearby = ((set(nearby_candidates) - indel_indices_set)
                              & set(neg_positions_large))
            # A set must be turned into a sequence first: np.array(<set>)
            # would produce a 0-d object array with no len().
            nearby_locations = np.array(sorted(allowed_nearby), dtype=int)
            if len(nearby_locations) >= num_negatives_per_chrom:
                neg_positions = np.random.choice(
                    nearby_locations, size=num_negatives_per_chrom, replace=False)
            else:
                # Else sample the remaining from the negative positions -
                # this is the best that can be done, try increasing the
                # nearby size
                print("Try increasing nearby or rel_size_neg_large. Not enough nearby-non-indels could be sampled in chromosome {}".format(chromosome))
                num_neg_needed = num_negatives_per_chrom - len(nearby_locations)
                remaining = ((set(neg_positions_large) - indel_indices_set)
                             - allowed_nearby)
                not_nearby = np.random.choice(
                    sorted(remaining), size=num_neg_needed, replace=False)
                neg_positions = np.concatenate((nearby_locations, not_nearby))
        else:
            neg_positions = np.random.choice(
                neg_positions_large, size=num_negatives_per_chrom, replace=False)

        for i in range(total_length_per_chrom):
            row = base + i
            if i < lengthIndels_per_chrom:
                if not self.triclass:
                    label = 1  # standard binary classification labels
                elif i < lengthIndels_per_chrom // 2:
                    label = 1  # insertions will be labeled as 1
                else:
                    label = 2  # deletions will be labeled as 2
                pos = indelLocations[i]
                allele_count[row] = allele_count_val[i]
                num_indel_pos_set += len(
                    indel_indices_set & set(range(pos - k, pos + k + 1)))
            else:
                label = 0
                pos = neg_positions[i - lengthIndels_per_chrom]
                # TODO: compute the true value of nearby_indels (it is
                # currently left at zero for every example).
                num_indel_neg_set += len(
                    indel_indices_set & set(range(pos - k, pos + k + 1)))

            indices[row] = pos
            # get k base pairs before and after the position
            window = referenceChr[pos - k:pos + k + 1]
            # The coverage feature is the window's mean coverage repeated
            # across the window (all zeros when coverage is not loaded).
            coverageWindow = np.zeros(2 * k + 1)
            if coverage is not None:
                coverageWindow += np.mean(
                    utils.flatten(coverage[pos - k:pos + k + 1]))
            dataset[row] = window
            coverageDataset[row] = coverageWindow
            labels[row] = label
            genome_positions[row] = pos
            chrom_num[row] = chromosome

    if self.load_entropy:
        entropyDataset[:, k + 1:2 * k + 1] = entropy.entropyVector(dataset)

    # Randomly choose the validation and test chromosome
    self.val_chrom, self.test_chrom = np.random.choice(
        range(2, 23), 2, replace=False)

    # Set the number of training examples, and the respective set indices
    total_indices = np.arange(total_length)
    self.num_train_examples = total_length_per_chrom * (num_chrom_used - 2)
    self.train_indices = total_indices[np.logical_and(
        chrom_num != self.val_chrom, chrom_num != self.test_chrom)]
    self.test_indices = total_indices[chrom_num == self.test_chrom]
    self.val_indices = total_indices[chrom_num == self.val_chrom]

    # Set the respective variables
    self.dataset = dataset
    self.coverageDataset = coverageDataset
    self.entropyDataset = entropyDataset
    self.indices = indices
    self.allele_count = allele_count
    self.nearby_indels = nearby_indels
    self.genome_positions = genome_positions
    if self.triclass:
        self.labels = utils.to_onehot(labels, 3)
    else:
        # Make labels n by 1 (for convenience)
        self.labels = np.expand_dims(labels, axis=1)

    # Summary statistics.  Single-argument print(...) behaves the same under
    # Python 2 and Python 3.
    print(num_indel_pos_set)
    print(float(num_indel_pos_set) / lengthIndels)
    print(num_indel_neg_set)
    print(float(num_indel_neg_set) / num_negatives)
    print(np.mean(np.mean(self.coverageDataset, axis=1)))
    print(np.mean(np.var(self.coverageDataset, axis=1)))
def __initializeTrainData(self, frac_positives):
    """Build and shuffle the example set for the currently loaded chromosome.

    Every position in ``self.indelLocations`` becomes a positive example.
    Negative positions are drawn either at a random nonzero distance of at
    most ``self.nearby`` from a known indel (when ``self.nearby`` is truthy)
    or from ``self.nonzeroLocationsRef``.  For each example the
    ``2 * k + 1`` reference rows around it (``k = self.window``) are stored,
    plus the coverage window and the mean recombination value over the
    window when ``self.coverage`` / ``self.recombination`` are available.
    All per-example arrays are then shuffled in unison.

    Args:
        frac_positives: desired fraction of examples that are indels; the
            number of negatives is ``int((1 / frac_positives - 1) * n)``
            for ``n`` indels.

    Reads (must be set beforehand): ``window``, ``indelLocations``,
    ``triclass``, ``nearby``, ``nonzeroLocationsRef``,
    ``prevChosenRefLocations``, ``setOfIndelLocations``, ``referenceChr``,
    ``coverage``, ``recombination``, ``load_entropy``, ``test_frac``, and
    additionally ``setOfZeroLocations`` (nearby mode) and
    ``insertionLocations`` (triclass mode).

    Attributes written: ``dataset``, ``coverageDataset``, ``entropyDataset``,
    ``recombinationDataset``, ``labels``, ``genome_positions``, ``indices``,
    ``nearby_indels``, ``num_train_examples`` and ``ordering``;
    ``prevChosenRefLocations`` gains every negative position used.
    """
    max_resample_attempts = 1001
    k = self.window  # for brevity
    lengthIndels = len(self.indelLocations)  # Total number of indels
    # Total number of negative examples needed, based on the desired fraction
    # of positive examples
    num_negatives = int((1. / frac_positives - 1) * lengthIndels)
    # Total number of examples [both training and testing!]
    total_length = lengthIndels + num_negatives

    dataset = np.zeros((total_length, 2 * k + 1, 4))
    coverageDataset = np.zeros((total_length, 2 * k + 1))
    entropyDataset = np.zeros((total_length, 2 * k + 1))
    recombinationDataset = np.zeros((total_length, 1))
    # Three distinct labels in the triclass case, otherwise a boolean.
    # np.bool_ instead of the np.bool alias, which newer NumPy no longer has.
    labeltype = np.uint8 if self.triclass else np.bool_
    labels = np.zeros(total_length, dtype=labeltype)
    genome_positions = np.zeros(total_length, dtype=np.uint32)

    # dataset should have all the indels as well as random negative training
    # samples
    if self.nearby:
        # First choose random known indels (with replacement) ...
        neg_positions = np.random.choice(self.indelLocations, size=num_negatives)
        self.nearby_indels = neg_positions  # Store the selected indels
        # ... then offset each by a random nonzero amount <= self.nearby.
        offset = np.multiply(
            np.random.randint(1, self.nearby + 1, size=num_negatives),
            np.random.choice([-1, 1], size=num_negatives))
        # These are only tentative negatives; they are validated in the loop.
        neg_positions = neg_positions + offset
    else:
        # Select random nonzero locations from the reference
        neg_positions = np.random.choice(self.nonzeroLocationsRef, size=num_negatives)
        # to prevent error if this is undefined (the value is meaningless in
        # this case)
        self.nearby_indels = neg_positions
    self.indices = neg_positions  # Locations of the negative examples

    for i in range(total_length):  # Loop over all examples
        if i < lengthIndels:  # Positive example
            if not self.triclass:
                label = 1  # standard binary classification labels
            elif i < len(self.insertionLocations):
                label = 1  # insertions will be labeled as 1
            else:
                label = 2  # deletions will be labeled as 2
            pos = self.indelLocations[i]
        else:  # Negative example (not an indel)
            label = 0
            neg_idx = i - lengthIndels
            # Tentative position, predefined in bulk for efficiency's sake
            # and replaced below only when it turns out to be unusable.
            pos = neg_positions[neg_idx]
            if self.nearby:  # Position must be near a known indel
                niter = 0
                # Avoid an already selected position, a zero location
                # (unknown reference base) or an actual indel.  The attempt
                # cap has to guard the whole disjunction: without the
                # parentheses `and` binds tighter than `or`, so the cap only
                # applied to the indel test and the loop could spin forever.
                # Once the cap is hit the current candidate is accepted as is.
                while ((pos in self.prevChosenRefLocations
                        or pos in self.setOfZeroLocations
                        or pos in self.setOfIndelLocations)
                       and niter < max_resample_attempts):
                    # Select again using the same procedure
                    self.nearby_indels[neg_idx] = np.random.choice(self.indelLocations)
                    pos = self.nearby_indels[neg_idx] + np.random.randint(
                        1, self.nearby + 1) * np.random.choice([-1, 1])
                    niter += 1
            else:
                # Position just has to be unused so far and not an indel
                while (pos in self.prevChosenRefLocations
                       or pos in self.setOfIndelLocations):
                    pos = np.random.choice(self.nonzeroLocationsRef)
            self.indices[neg_idx] = pos  # True position of the negative example
            self.prevChosenRefLocations.add(pos)  # so we don't reuse it

        # get the k base pairs before and after the position, and the
        # position itself
        window = self.referenceChr[pos - k:pos + k + 1]

        # NOTE(review): when coverage / recombination are unavailable, None
        # is assigned into the float arrays below; NumPy is expected to store
        # that as NaN (not 0) -- confirm this is what downstream code wants.
        coverageWindow = None
        if self.coverage is not None:
            coverageWindow = utils.flatten(self.coverage[pos - k:pos + k + 1])

        recombWindowAverage = None
        if self.recombination is not None:
            # Window positions past the end of the recombination data take
            # its last value.
            recombWindow = np.zeros((2 * k + 1, 1))
            recombWindowIndices = np.arange(pos - k, pos + k + 1).reshape((2 * k + 1, 1))
            recombInBounds = recombWindowIndices[
                np.where(recombWindowIndices < len(self.recombination))]
            recombWindow[recombInBounds - (pos - k)] = self.recombination[recombInBounds]
            recombOutOfBounds = recombWindowIndices[
                np.where(recombWindowIndices >= len(self.recombination))]
            recombWindow[recombOutOfBounds - (pos - k)] = self.recombination[-1]
            recombWindowAverage = np.mean(recombWindow)

        # Store the data for this example in the overall data structures
        dataset[i] = window
        coverageDataset[i] = coverageWindow
        recombinationDataset[i] = recombWindowAverage
        labels[i] = label
        genome_positions[i] = pos

    # Indices for positive examples are simply self.indelLocations, and the
    # "nearby" indel of a positive example is the indel itself.
    self.indices = np.concatenate((self.indelLocations, self.indices))
    self.nearby_indels = np.concatenate((self.indelLocations, self.nearby_indels))

    if self.load_entropy:
        entropyDataset[:, k + 1:2 * k + 1] = entropy.entropyVector(dataset)

    # Shuffle every per-example array with the same permutation.  zip() must
    # be materialised: on Python 3 it is an iterator, which cannot be
    # shuffled in place.
    rawZipped = list(zip(list(dataset), list(coverageDataset), list(labels),
                         list(genome_positions), list(self.indices),
                         list(self.nearby_indels), list(entropyDataset),
                         list(recombinationDataset)))
    np.random.shuffle(rawZipped)
    a, b, c, d, e, f, g, h = zip(*rawZipped)

    self.dataset = np.array(a)
    self.coverageDataset = np.array(b)
    self.entropyDataset = np.array(g)
    self.recombinationDataset = np.array(h)
    labels = np.array(c, dtype=labeltype)
    self.genome_positions = np.array(d, dtype=np.uint32)
    self.indices = np.array(e, dtype=np.uint32)
    self.nearby_indels = np.array(f, dtype=np.uint32)
    if self.triclass:
        self.labels = utils.to_onehot(labels, 3)
    else:
        # Make labels n by 1 (for convenience)
        self.labels = np.expand_dims(labels, axis=1)

    # Number of examples to use for training (as opposed to testing)
    self.num_train_examples = int(round(total_length * (1 - self.test_frac)))
    # Order in which we go through the training examples (will be changed)
    self.ordering = list(range(0, self.num_train_examples))
print_every = 100 # print accuracy every 100 steps config = Config() loader = load_full_dataset_sample_per_chrom.DatasetLoader( windowSize=config.window, batchSize=config.batch_size, testBatchSize=config.test_batch_size, seed=1, load_coverage=False, complexity_threshold=1.2, pos_frac=0.5) datset = loader.dataset labls = utils.flatten(loader.labels) entropyMatrix = entropy.entropyVector(datset) freq_count, freq_matrix = sequence_analysis.sequence_2_mer_generate(datset) print("Validation Chromosome: {}".format(loader.val_chrom)) print("Test Chromosome: {}".format(loader.test_chrom)) print "Entropy Model" log_reg_model_entrpy = entropy.logisticRegression( entropyMatrix, labls, loader.train_indices, loader.test_indices, testAC=loader.allele_count[loader.test_indices]) print "Frequency Model" log_reg_model_freq = sequence_analysis.logistic_regression_2_mer( freq_matrix, labls, loader.train_indices, loader.test_indices) ''' loader.load_chromosome_window_data(loader.val_chrom)
seq_val = seq_ch else: seq.extend(seq_ch) del seq_ch del reference, referenceChr order = [x for x in range(len(seq))] random.shuffle(order) seq = np.array( [seq[i] for i in order] ) # Shuffle the training data, so we can easily choose a random subset for testing num_indels = np.array([num_indels[i] for i in order]) x_train = np.array(seq[:num_train_ex]) x_train = entropy.entropyVector(x_train) y_train = np.array(num_indels[:num_train_ex]) x_test = np.array(seq_val) x_test = entropy.entropyVector(x_test) y_test = np.array(num_indels_val) #np.save(data_dir + 'RegrKerasTestSeq' + str(validation_chrom) + str(complexity_thresh) + '.npy', x_test) np.save(data_dir + 'RegrKerasEntropyTestLab' + str(validation_chrom) + '.npy', y_test) print('Mean # indels per window: {}'.format( float(sum(y_train)) / len(y_train))) import keras from keras.regularizers import l2 from keras.layers import Conv1D, Dense, Flatten, Dropout
def __initializeTrainData(self, frac_positives):
    """Assemble the shuffled positive/negative examples for one chromosome.

    Each entry of ``self.indelLocations`` yields a positive example.  The
    negatives are either random nonzero offsets (at most ``self.nearby``
    away) from known indels when ``self.nearby`` is truthy, or random picks
    from ``self.nonzeroLocationsRef`` otherwise.  Each example stores the
    ``2 * k + 1`` reference rows centred on it (``k = self.window``), the
    matching coverage window if ``self.coverage`` is set, and the window's
    mean recombination value if ``self.recombination`` is set.  The arrays
    are shuffled together before being stored on ``self``.

    Args:
        frac_positives: desired fraction of examples that are indels; with
            ``n`` indels, ``int((1 / frac_positives - 1) * n)`` negatives
            are generated.

    Reads (must be set beforehand): ``window``, ``indelLocations``,
    ``triclass``, ``nearby``, ``nonzeroLocationsRef``,
    ``prevChosenRefLocations``, ``setOfIndelLocations``, ``referenceChr``,
    ``coverage``, ``recombination``, ``load_entropy``, ``test_frac``, and
    additionally ``setOfZeroLocations`` (nearby mode) and
    ``insertionLocations`` (triclass mode).

    Attributes written: ``dataset``, ``coverageDataset``, ``entropyDataset``,
    ``recombinationDataset``, ``labels``, ``genome_positions``, ``indices``,
    ``nearby_indels``, ``num_train_examples`` and ``ordering``;
    ``prevChosenRefLocations`` gains every negative position used.
    """
    retry_limit = 1001
    k = self.window  # for brevity
    lengthIndels = len(self.indelLocations)
    num_negatives = int((1. / frac_positives - 1) * lengthIndels)
    total_length = lengthIndels + num_negatives

    dataset = np.zeros((total_length, 2 * k + 1, 4))
    coverageDataset = np.zeros((total_length, 2 * k + 1))
    entropyDataset = np.zeros((total_length, 2 * k + 1))
    recombinationDataset = np.zeros((total_length, 1))
    # uint8 for the three-class labels, boolean otherwise.  np.bool_ is used
    # because the np.bool alias no longer exists in newer NumPy.
    if self.triclass:
        labeltype = np.uint8
    else:
        labeltype = np.bool_
    labels = np.zeros(total_length, dtype=labeltype)
    genome_positions = np.zeros(total_length, dtype=np.uint32)

    # dataset should have all the indels as well as random negative training
    # samples
    if self.nearby:
        neg_positions = np.random.choice(self.indelLocations, size=num_negatives)
        self.nearby_indels = neg_positions
        offset = np.multiply(
            np.random.randint(1, self.nearby + 1, size=num_negatives),
            np.random.choice([-1, 1], size=num_negatives))
        # locations that are offset from indels by some amount
        neg_positions = neg_positions + offset
        self.indices = neg_positions
    else:
        neg_positions = np.random.choice(self.nonzeroLocationsRef, size=num_negatives)
        self.indices = neg_positions
        self.nearby_indels = neg_positions  # to prevent error if this is undefined

    for i in range(total_length):
        if i < lengthIndels:
            if not self.triclass:
                label = 1  # standard binary classification labels
            elif i < len(self.insertionLocations):
                label = 1  # insertions will be labeled as 1
            else:
                label = 2  # deletions will be labeled as 2
            pos = self.indelLocations[i]
        else:
            label = 0
            slot = i - lengthIndels  # index into the negative-only arrays
            pos = neg_positions[slot]
            if self.nearby:
                niter = 0
                # Redraw while the candidate was used before, is a zero
                # location, or is an indel.  The retry limit must apply to
                # the whole disjunction; written without parentheses, `and`
                # binds tighter than `or`, the limit covered only the indel
                # test, and the loop could never end.  When the limit is
                # reached the current candidate is kept.
                while ((pos in self.prevChosenRefLocations
                        or pos in self.setOfZeroLocations
                        or pos in self.setOfIndelLocations)
                       and niter < retry_limit):
                    self.nearby_indels[slot] = np.random.choice(self.indelLocations)
                    pos = self.nearby_indels[slot] + np.random.randint(
                        1, self.nearby + 1) * np.random.choice([-1, 1])
                    niter += 1
            else:
                while (pos in self.prevChosenRefLocations
                       or pos in self.setOfIndelLocations):
                    pos = np.random.choice(self.nonzeroLocationsRef)
            self.indices[slot] = pos
            self.prevChosenRefLocations.add(pos)

        # get k base pairs before and after the position
        window = self.referenceChr[pos - k:pos + k + 1]

        # NOTE(review): with no coverage / recombination data, None is
        # assigned into the float arrays below; NumPy is expected to store
        # that as NaN (not 0) -- confirm this is what downstream code wants.
        coverageWindow = None
        if self.coverage is not None:
            coverageWindow = utils.flatten(self.coverage[pos - k:pos + k + 1])

        recombWindowAverage = None
        if self.recombination is not None:
            # Positions beyond the end of the recombination data reuse its
            # last value.
            recombWindow = np.zeros((2 * k + 1, 1))
            recombWindowIndices = np.arange(pos - k, pos + k + 1).reshape((2 * k + 1, 1))
            recombInBounds = recombWindowIndices[
                np.where(recombWindowIndices < len(self.recombination))]
            recombWindow[recombInBounds - (pos - k)] = self.recombination[recombInBounds]
            recombOutOfBounds = recombWindowIndices[
                np.where(recombWindowIndices >= len(self.recombination))]
            recombWindow[recombOutOfBounds - (pos - k)] = self.recombination[-1]
            recombWindowAverage = np.mean(recombWindow)

        dataset[i] = window
        coverageDataset[i] = coverageWindow
        recombinationDataset[i] = recombWindowAverage
        labels[i] = label
        genome_positions[i] = pos

    # Positives come first in every array, so prepend their positions; the
    # "nearby" indel of a positive example is the indel itself.
    self.indices = np.concatenate((self.indelLocations, self.indices))
    self.nearby_indels = np.concatenate((self.indelLocations, self.nearby_indels))

    if self.load_entropy:
        entropyDataset[:, k + 1:2 * k + 1] = entropy.entropyVector(dataset)

    # Shuffle the list.  zip() is wrapped in list() because on Python 3 it
    # returns an iterator, which np.random.shuffle cannot permute in place.
    rawZipped = list(zip(list(dataset), list(coverageDataset), list(labels),
                         list(genome_positions), list(self.indices),
                         list(self.nearby_indels), list(entropyDataset),
                         list(recombinationDataset)))
    np.random.shuffle(rawZipped)
    a, b, c, d, e, f, g, h = zip(*rawZipped)

    dataset = np.array(a)
    coverageDataset = np.array(b)
    entropyDataset = np.array(g)
    recombinationDataset = np.array(h)
    labels = np.array(c, dtype=labeltype)
    genome_positions = np.array(d, dtype=np.uint32)
    self.indices = np.array(e, dtype=np.uint32)
    self.nearby_indels = np.array(f, dtype=np.uint32)

    self.dataset = dataset
    self.coverageDataset = coverageDataset
    self.entropyDataset = entropyDataset
    self.recombinationDataset = recombinationDataset
    if self.triclass:
        self.labels = utils.to_onehot(labels, 3)
    else:
        self.labels = np.expand_dims(labels, axis=1)
    self.genome_positions = genome_positions

    # The first num_train_examples shuffled rows are the training portion.
    self.num_train_examples = int(round(total_length * (1 - self.test_frac)))
    self.ordering = list(range(0, self.num_train_examples))