def get_scores_and_labels(result, test_samples_sequences, test_correct_labels):
    all_scores = []
    all_true_labels = []
    for sample_index in range(len(result)):
        sample_object = SampleObject(test_samples_sequences[sample_index],
                                     test_correct_labels[sample_index], True)
        true_label_int = sample_object.convert_matrix_to_label()
        y_score = result[sample_index][1]  # takes only the score of the negative label
        all_scores.append(y_score)
        all_true_labels.append(true_label_int)
    return all_scores, all_true_labels
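# Usage sketch (not part of the original pipeline): the score/label lists returned by
# get_scores_and_labels are already in the shape scikit-learn's metrics expect.
# `result`, `test_samples_sequences` and `test_correct_labels` are assumed to come
# from the evaluation step of the trained model; `report_auc` is an illustrative name.
from sklearn.metrics import roc_curve, auc


def report_auc(result, test_samples_sequences, test_correct_labels, pos_label):
    scores, true_labels = get_scores_and_labels(result, test_samples_sequences,
                                                test_correct_labels)
    # pos_label must be the integer label of the class whose probability was kept
    # in get_scores_and_labels (column 1 of `result`), otherwise the curve is inverted.
    fpr, tpr, _ = roc_curve(true_labels, scores, pos_label=pos_label)
    print "AUC: ", auc(fpr, tpr)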
def create_one_negative_sample_simulated_data():
    bases = []
    for i in range(SAMPLE_LENGTH):
        # sample each base from a uniform distribution over the 4 nucleotides
        array = np.random.multinomial(1, [1 / 4.] * 4, size=1)
        bases.append(get_base(array))
    sample = concatenate_random_bases_and_motif(bases)
    sample_object = SampleObject(sample, 0)
    return sample_object
def create_one_positive_sample_simulated_data(motif, motif_center_index):
    bases = []
    motif_length = len(motif)
    for i in range(SAMPLE_LENGTH - motif_length):
        # sample bases from a uniform distribution
        array = np.random.multinomial(1, [1 / float(BASES_NUMBER)] * BASES_NUMBER, size=1)
        bases.append(get_base(array))
    sample = concatenate_random_bases_and_motif(bases, motif, motif_center_index)
    sample_object = SampleObject(sample, 1)
    return sample_object
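# Illustrative driver for the two generators above (not from the original code):
# builds an evenly balanced simulated set. The default sample count, the example
# motif and the choice of placing the motif around the sample centre are assumptions.
def create_simulated_samples(num_samples=1000, motif="TATAAT"):
    samples = []
    for _ in range(num_samples // 2):
        samples.append(
            create_one_positive_sample_simulated_data(motif, SAMPLE_LENGTH // 2))
        samples.append(create_one_negative_sample_simulated_data())
    return samples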
def __init__(self, samples_path, labels_path):
    sample_matrices = np.load(samples_path)
    label_matrices = np.load(labels_path)
    self._num_samples = len(sample_matrices)
    self._samples = []
    for index in range(self._num_samples):
        self._samples.append(
            SampleObject(sample_matrices[index], label_matrices[index], is_matrix=True))
    self._current_epoch = 1
    self._current_position_in_epoch = 0
    self._batch_size = 0
def create_data_from_all_species_together(self):
    """
    Creates data for all species together - 12000 samples:
    takes a random 1/5 of the samples from the training data of each species,
    a random 1/17 of the samples from the validation data of each species,
    and a random 1/5 of the samples from the test data of each species
    (same random indices from all species).
    :return:
    """
    print "start creating data for : ", self.project.species[-1], \
        " and : ", self.project.species[-2]
    train_samples_60000 = []
    validation_samples_60000 = []
    test_samples_60000 = []
    train_samples_12000 = []
    validation_samples_12000 = []
    test_samples_12000 = []
    # collect all training, validation and test data from all species:
    section_index = 0
    for section_name in self.sections:
        for j in range(len(self.project.species) - 2):
            species_name = self.project.species[j]
            section_samples_per_species = []
            section_file_x_path = os.path.join(
                self.project.text_samples_base_dir, species_name,
                self.project.k_let_dirs[self.project.k - 1], section_name + "_X")
            section_file_y_path = os.path.join(
                self.project.text_samples_base_dir, species_name,
                self.project.k_let_dirs[self.project.k - 1], section_name + "_Y")
            sample_sequences = []
            with open(section_file_x_path) as one_species_section_file_x:
                for line in one_species_section_file_x:
                    if "\n" in line:
                        sequence = line[:-1]  # without \n at the end of the line
                    else:
                        sequence = line
                    sample_sequences.append(sequence)
            sample_labels = []
            with open(section_file_y_path) as one_species_section_file_y:
                for line in one_species_section_file_y:
                    if "\n" in line:
                        label = int(line[:-1])  # without \n at the end of the line
                    else:
                        label = int(line)
                    sample_labels.append(label)
            current_number_of_samples_in_section = 0
            for sample_index in range(len(sample_sequences)):
                sample_object = SampleObject(
                    sample_sequences[sample_index], sample_labels[sample_index])
                section_samples_per_species.append(sample_object)
                current_number_of_samples_in_section += 1
                number_of_samples_in_section = \
                    2 * self.project.get_number_of_samples() * self.section_ratios[section_index]
                if current_number_of_samples_in_section >= (
                        number_of_samples_in_section * self.ratio_of_samples_from_all_species):
                    break
            number_section_samples = current_number_of_samples_in_section
            # takes 1/17 of each species' samples
            random_indices_of_section_samples = random.sample(
                xrange(number_section_samples),
                number_section_samples / (len(self.project.species) - 2))
            random_section_samples = [
                section_samples_per_species[j]
                for j in range(len(section_samples_per_species))
                if j in random_indices_of_section_samples
            ]
            if section_name == 'train':
                train_samples_12000.extend(random_section_samples)
                train_samples_60000.extend(section_samples_per_species)
            elif section_name == 'validation':
                validation_samples_12000.extend(random_section_samples)
                validation_samples_60000.extend(section_samples_per_species)
            elif section_name == 'test':
                test_samples_12000.extend(random_section_samples)
                test_samples_60000.extend(section_samples_per_species)
        section_index += 1
    for i in range(len(self.project.species) - 2, len(self.project.species)):
        species_name = self.project.species[i]
        species_dir_text = os.path.join(self.project.text_samples_base_dir, species_name)
        pos_out_path = os.path.join(species_dir_text, "positive_samples")
        neg_out_path = os.path.join(
            species_dir_text, self.project.k_let_dirs[self.project.k - 1], "negative_samples")
        species_dir_npy = os.path.join(self.project.samples_base_dir, species_name)
        if i == len(self.project.species) - 2:
            # iteration #17 - creating data for all species - 60000 samples
            train_samples = train_samples_60000
            validation_samples = validation_samples_60000
            test_samples = test_samples_60000
        elif i == len(self.project.species) - 1:
            # iteration #18 - creating data for all species - 12000 samples
            train_samples = train_samples_12000
            validation_samples = validation_samples_12000
            test_samples = test_samples_12000
        with open(pos_out_path, 'w') as out_pos:
            with open(neg_out_path, 'w') as out_neg:
                for section_name in self.sections:
                    print "start section: ", section_name
                    if section_name == 'train':
                        section_samples = train_samples
                    elif section_name == 'validation':
                        section_samples = validation_samples
                    elif section_name == 'test':
                        section_samples = test_samples
                    # write the section samples and labels to text file and numpy file
                    path_out_text_X, path_out_text_Y = data_handle.get_path(
                        species_dir_text, section_name, self.project.k)
                    path_out_npy_X, path_out_npy_Y = data_handle.get_path(
                        species_dir_npy, section_name, self.project.k)
                    shuffle(section_samples)
                    section_sample_matrices = []
                    section_label_matrices = []
                    section_samples_str = []
                    section_labels_str = []
                    for sample in section_samples:
                        section_sample_matrices.append(sample.get_sample_matrix())
                        section_label_matrices.append(sample.get_label_matrix())
                        section_samples_str.append(sample.get_sample_str())
                        section_labels_str.append(str(sample.get_label()))
                    np.save(path_out_npy_X, section_sample_matrices)
                    np.save(path_out_npy_Y, section_label_matrices)
                    with open(path_out_text_X, 'w') as out_text_samples:
                        string = '\n'.join(section_samples_str)
                        out_text_samples.write(string)
                    with open(path_out_text_Y, 'w') as out_text_labels:
                        string = '\n'.join(section_labels_str)
                        out_text_labels.write(string)
                    self.write_positive_and_negative_text_samples_files(
                        section_labels_str, section_samples_str, out_pos, out_neg)
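# The per-species subsampling inside the loop above can be read in isolation as a
# small helper; a sketch mirroring the random.sample(...) call (the helper name and
# signature are illustrative, not part of the original code).
def subsample_section(section_samples, denominator):
    # keep roughly len(section_samples) / denominator samples, chosen uniformly at random
    k = len(section_samples) / denominator
    chosen = set(random.sample(xrange(len(section_samples)), k))
    return [sample for index, sample in enumerate(section_samples) if index in chosen]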
def main():
    create_directories()
    # the k-let counts are preserved during the negative-sample generation
    for k in range(1, MAXIMAL_K + 1):
        print "start creating data, k = ", k
        for species_name in H3K27ac_species_names_ordered:
            if "All" in species_name:
                continue
            print "species name: ", species_name
            positive_samples, negative_samples = create_data(k, species_name)
            all_Xs = np.array(positive_samples + negative_samples)
            all_ys = np.array([1] * len(positive_samples) + [0] * len(negative_samples))
            perm = np.random.permutation(len(all_Xs))
            all_Xs_shuffled = all_Xs[perm]
            all_ys_shuffled = all_ys[perm]
            samples = np.array(all_Xs_shuffled)
            labels = np.array(convert_labels_to_one_hot(all_ys_shuffled))
            all_samples = []
            for i in range(len(all_Xs_shuffled)):
                sample_matrix = all_Xs_shuffled[i]
                label = all_ys_shuffled[i]
                label_matrix = labels_map[label]
                sample_object = SampleObject(sample_matrix, label_matrix, is_matrix=True)
                all_samples.append(sample_object)
            indices = dict()
            train_start_idx = 0
            train_end_idx = int(math.ceil(len(all_Xs) * train_ratio))
            indices["train"] = (train_start_idx, train_end_idx)
            validation_start_idx = train_end_idx
            validation_end_idx = train_end_idx + int(math.ceil(len(all_Xs) * validation_ratio))
            indices["validation"] = (validation_start_idx, validation_end_idx)
            test_start_idx = validation_end_idx
            test_end_idx = validation_end_idx + int(math.ceil(len(all_Xs) * test_ratio))
            indices["test"] = (test_start_idx, test_end_idx)
            # save npy files for each species
            path_out_train_X = os.path.join(samples_out_base_dir_for_npy_files, species_name,
                                            output_k_lets_dirs[k - 1], 'train_X')
            path_out_train_y = os.path.join(samples_out_base_dir_for_npy_files, species_name,
                                            output_k_lets_dirs[k - 1], 'train_Y')
            path_out_validation_X = os.path.join(samples_out_base_dir_for_npy_files, species_name,
                                                 output_k_lets_dirs[k - 1], 'validation_X')
            path_out_validation_y = os.path.join(samples_out_base_dir_for_npy_files, species_name,
                                                 output_k_lets_dirs[k - 1], 'validation_Y')
            path_out_test_X = os.path.join(samples_out_base_dir_for_npy_files, species_name,
                                           output_k_lets_dirs[k - 1], 'test_X')
            path_out_test_y = os.path.join(samples_out_base_dir_for_npy_files, species_name,
                                           output_k_lets_dirs[k - 1], 'test_Y')
            np.save(path_out_train_X, samples[train_start_idx:train_end_idx])
            np.save(path_out_train_y, labels[train_start_idx:train_end_idx])
            np.save(path_out_validation_X, samples[validation_start_idx:validation_end_idx])
            np.save(path_out_validation_y, labels[validation_start_idx:validation_end_idx])
            np.save(path_out_test_X, samples[test_start_idx:test_end_idx])
            np.save(path_out_test_y, labels[test_start_idx:test_end_idx])
            # write positive and negative text files:
            dir_path = os.path.join(samples_out_base_dir_for_text_files, species_name)
            all_Xs_text_shuffled = []
            all_Ys_text_shuffled = []
            for sample in all_samples:
                all_Xs_text_shuffled.append(sample.get_sample_str())
                all_Ys_text_shuffled.append(str(sample.get_label()))
            text_samples = np.array(all_Xs_text_shuffled)
            text_labels = np.array(all_Ys_text_shuffled)
            for section in sections:
                print "section: ", section
                path_out_text_X, path_out_text_Y = data_handle.get_path(dir_path, section, k)
                start, end = indices[section]
                with open(path_out_text_X, 'w') as out_text_samples:
                    string = '\n'.join(text_samples[start:end])
                    out_text_samples.write(string)
                with open(path_out_text_Y, 'w') as out_text_labels:
                    string = '\n'.join(text_labels[start:end])
                    out_text_labels.write(string)
    print "End! :)"
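# Worked example of the split-index arithmetic in main() (illustrative numbers; the
# real values come from train_ratio / validation_ratio / test_ratio):
# with 1000 samples and an 0.8/0.1/0.1 split,
#   train      -> (0, ceil(1000 * 0.8))            = (0, 800)
#   validation -> (800, 800 + ceil(1000 * 0.1))    = (800, 900)
#   test       -> (900, 900 + ceil(1000 * 0.1))    = (900, 1000)
# so the three slices partition the shuffled arrays without overlap.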
def create_data_from_all_species_together(self):
    """
    Creates data for all species together - 14000 samples:
    takes a random 1/17 of the samples from the training data of each species,
    a random 1/17 of the samples from the validation data of each species,
    and a random 1/17 of the samples from the test data of each species
    (same random indices from all species).
    :return:
    """
    print "start creating data for : ", self.project.species[-1], \
        " and : ", self.project.species[-2]
    train_samples_238000 = []
    validation_samples_238000 = []
    test_samples_238000 = []
    train_samples_14000 = []
    validation_samples_14000 = []
    test_samples_14000 = []
    # collect all training, validation and test data from all species:
    section_index = 0
    for section_name in self.sections:
        for j in range(len(self.project.species) - 2):
            species_name = self.project.species[j]
            section_samples_per_species = []
            section_file_path = os.path.join(
                self.project.text_samples_base_dir, species_name, section_name + "_data")
            with open(section_file_path) as one_species_section_file:
                line_counter = 0
                for line in one_species_section_file:
                    split_line = line.split("\t")
                    label = split_line[0]
                    label_int = 1 if label == "True" else 0
                    sequence = split_line[1][:-1]  # without \n at the end of the line
                    sample_object = SampleObject(sequence, label_int)
                    section_samples_per_species.append(sample_object)
                    line_counter += 1
                    number_of_samples_in_section = \
                        2 * self.project.get_number_of_samples() * self.section_ratios[section_index]
                    # TODO for now - takes only <self.ratio_of_samples_from_all_species> samples from each section
                    if line_counter >= (
                            number_of_samples_in_section * self.ratio_of_samples_from_all_species):
                        break
            number_section_samples = line_counter
            # takes 1/17 of each species' samples
            random_indices_of_section_samples = random.sample(
                xrange(number_section_samples),
                number_section_samples / (len(self.project.species) - 2))
            random_section_samples = [
                section_samples_per_species[j]
                for j in range(len(section_samples_per_species))
                if j in random_indices_of_section_samples
            ]
            if section_name == 'train':
                train_samples_14000.extend(random_section_samples)
                train_samples_238000.extend(section_samples_per_species)
            elif section_name == 'validation':
                validation_samples_14000.extend(random_section_samples)
                validation_samples_238000.extend(section_samples_per_species)
            elif section_name == 'test':
                test_samples_14000.extend(random_section_samples)
                test_samples_238000.extend(section_samples_per_species)
        section_index += 1
    # print "len(train_samples_14000) = ", len(train_samples_14000)
    # print "len(validation_samples_14000) = ", len(validation_samples_14000)
    # print "len(test_samples_14000) = ", len(test_samples_14000)
    # print "len(train_samples_238000) = ", len(train_samples_238000)
    # print "len(validation_samples_238000) = ", len(validation_samples_238000)
    # print "len(test_samples_238000) = ", len(test_samples_238000)
    # print "total 238000 train+validation+test: ", len(train_samples_238000) \
    #     + len(validation_samples_238000) + len(test_samples_238000)
    # print "total 14000 train+validation+test: ", len(train_samples_14000) \
    #     + len(validation_samples_14000) + len(test_samples_14000)
    for i in range(len(self.project.species) - 2, len(self.project.species)):
        species_name = self.project.species[i]
        species_dir_text = os.path.join(self.project.text_samples_base_dir, species_name)
        pos_out_path = os.path.join(species_dir_text, "positive_samples")
        neg_out_path = os.path.join(species_dir_text, "negative_samples")
        species_dir_npy = os.path.join(self.project.samples_base_dir, species_name)
        if i == len(self.project.species) - 2:
            # iteration #17 - creating data for all species - 238000 samples
            train_samples = train_samples_238000
            validation_samples = validation_samples_238000
            test_samples = test_samples_238000
        elif i == len(self.project.species) - 1:
            # iteration #18 - creating data for all species - 14000 samples
            train_samples = train_samples_14000
            validation_samples = validation_samples_14000
            test_samples = test_samples_14000
        with open(pos_out_path, 'w') as out_pos:
            with open(neg_out_path, 'w') as out_neg:
                for section_name in self.sections:
                    print "start section: ", section_name
                    if section_name == 'train':
                        section_samples = train_samples
                    elif section_name == 'validation':
                        section_samples = validation_samples
                    elif section_name == 'test':
                        section_samples = test_samples
                    # write the section samples and labels to text file and numpy file
                    path_out_text_X, path_out_text_Y = data_handle.get_path(
                        species_dir_text, section_name)
                    path_out_npy_X, path_out_npy_Y = data_handle.get_path(
                        species_dir_npy, section_name)
                    shuffle(section_samples)
                    section_sample_matrices = []
                    section_label_matrices = []
                    section_samples_str = []
                    section_labels_str = []
                    for sample in section_samples:
                        section_sample_matrices.append(sample.get_sample_matrix())
                        section_label_matrices.append(sample.get_label_matrix())
                        section_samples_str.append(sample.get_sample_str())
                        section_labels_str.append(str(sample.get_label()))
                    np.save(path_out_npy_X, section_sample_matrices)
                    np.save(path_out_npy_Y, section_label_matrices)
                    with open(path_out_text_X, 'w') as out_text_samples:
                        string = '\n'.join(section_samples_str)
                        out_text_samples.write(string)
                    with open(path_out_text_Y, 'w') as out_text_labels:
                        string = '\n'.join(section_labels_str)
                        out_text_labels.write(string)
                    self.write_positive_and_negative_text_samples_files(
                        section_labels_str, section_samples_str, out_pos, out_neg)
def create_one_sample(is_positive, sequence):
    label = 1 if is_positive else 0
    sample_object = SampleObject(sequence, label)
    return sample_object
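# Minimal usage sketch (the sequence string is illustrative, and it is assumed that
# SampleObject.get_label() returns the integer label passed to the constructor,
# as its use elsewhere in this codebase suggests).
example_sample = create_one_sample(True, "ACGTACGTACGT")
print example_sample.get_label()  # expected: 1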