def create_data_from_all_species_together(self): """ creating data for all species together - 12000 samples: takes random 1/5 of the samples from the training data of each species, random 1/17 of the samples from the validation data of each species, and random 1/5 of the samples from the test data of each species. (same random indices from all species). :return: """ print "start creating data for : ", self.project.species[-1], \ " and : ", self.project.species[-2] train_samples_60000 = [] validation_samples_60000 = [] test_samples_60000 = [] train_samples_12000 = [] validation_samples_12000 = [] test_samples_12000 = [] # collect all training, validation and test data from all species: section_index = 0 for section_name in self.sections: for j in range(len(self.project.species) - 2): species_name = self.project.species[j] section_samples_per_species = [] section_file_x_path = os.path.join( self.project.text_samples_base_dir, species_name, self.project.k_let_dirs[self.project.k - 1], section_name + "_X") section_file_y_path = os.path.join( self.project.text_samples_base_dir, species_name, self.project.k_let_dirs[self.project.k - 1], section_name + "_Y") sample_sequences = [] with open(section_file_x_path) as one_species_section_file_x: for line in one_species_section_file_x: if "\n" in line: sequence = line[: -1] # without \n at the end of the line else: sequence = line sample_sequences.append(sequence) sample_labels = [] with open(section_file_y_path) as one_species_section_file_y: for line in one_species_section_file_y: if "\n" in line: label = int( line[:-1]) # without \n at the end of the line else: label = int(line) sample_labels.append(label) current_number_of_samples_in_section = 0 for sample_index in range(len(sample_sequences)): sample_object = SampleObject( sample_sequences[sample_index], sample_labels[sample_index]) section_samples_per_species.append(sample_object) current_number_of_samples_in_section += 1 number_of_samples_in_section = \ 2*(self.project.get_number_of_samples())*(self.section_ratios[section_index]) if current_number_of_samples_in_section >= ( number_of_samples_in_section * self.ratio_of_samples_from_all_species): break number_section_samples = current_number_of_samples_in_section # takes 1/17 from each species samples random_indices_of_section_samples = random.sample( xrange(number_section_samples), number_section_samples / (len(self.project.species) - 2)) random_section_samples = [ section_samples_per_species[j] for j in range(len(section_samples_per_species)) if j in random_indices_of_section_samples ] if section_name == 'train': train_samples_12000.extend(random_section_samples) train_samples_60000.extend(section_samples_per_species) elif section_name == 'validation': validation_samples_12000.extend(random_section_samples) validation_samples_60000.extend( section_samples_per_species) elif section_name == 'test': test_samples_12000.extend(random_section_samples) test_samples_60000.extend(section_samples_per_species) section_index += 1 for i in range( len(self.project.species) - 2, len(self.project.species)): species_name = self.project.species[i] species_dir_text = os.path.join(self.project.text_samples_base_dir, species_name) pos_out_path = os.path.join(species_dir_text, "positive_samples") neg_out_path = os.path.join( species_dir_text, self.project.k_let_dirs[self.project.k - 1], "negative_samples") species_dir_npy = os.path.join(self.project.samples_base_dir, species_name) if i == len( self.project.species ) - 2: # iteration #17 - creating data for all species - 60000 samples train_samples = train_samples_60000 validation_samples = validation_samples_60000 test_samples = test_samples_60000 elif i == len( self.project.species ) - 1: # iteration #18 - creating data for all species - 12000 samples train_samples = train_samples_12000 validation_samples = validation_samples_12000 test_samples = test_samples_12000 with open(pos_out_path, 'w') as out_pos: with open(neg_out_path, 'w') as out_neg: for section_name in self.sections: print "start section: ", section_name if section_name == 'train': section_samples = train_samples elif section_name == 'validation': section_samples = validation_samples elif section_name == 'test': section_samples = test_samples # write the section samples and labels to text file and numpy file path_out_text_X, path_out_text_Y = data_handle.get_path( species_dir_text, section_name, self.project.k) path_out_npy_X, path_out_npy_Y = data_handle.get_path( species_dir_npy, section_name, self.project.k) shuffle(section_samples) section_sample_matrices = [] section_label_matrices = [] section_samples_str = [] section_labels_str = [] for sample in section_samples: section_sample_matrices.append( sample.get_sample_matrix()) section_label_matrices.append( sample.get_label_matrix()) section_samples_str.append(sample.get_sample_str()) section_labels_str.append(str(sample.get_label())) np.save(path_out_npy_X, section_sample_matrices) np.save(path_out_npy_Y, section_label_matrices) with open(path_out_text_X, 'w') as out_text_samples: string = '\n'.join(section_samples_str) out_text_samples.write(string) with open(path_out_text_Y, 'w') as out_text_labels: string = '\n'.join(section_labels_str) out_text_labels.write(string) self.write_positive_and_negative_text_samples_files( section_labels_str, section_samples_str, out_pos, out_neg)
def create_data_from_all_species_together(self): """ creating data for all species together - 14000 samples: takes random 1/17 of the samples from the training data of each species, random 1/17 of the samples from the validation data of each species, and random 1/17 of the samples from the test data of each species. (same random indices from all species). :return: """ print "start creating data for : ", self.project.species[-1], \ " and : ", self.project.species[-2] train_samples_238000 = [] validation_samples_238000 = [] test_samples_238000 = [] train_samples_14000 = [] validation_samples_14000 = [] test_samples_14000 = [] # collect all training, validation and test data from all species: section_index = 0 for section_name in self.sections: for j in range(len(self.project.species) - 2): species_name = self.project.species[j] section_samples_per_species = [] section_file_path = os.path.join( self.project.text_samples_base_dir, species_name, section_name + "_data") with open(section_file_path) as one_species_section_file: line_counter = 0 for line in one_species_section_file: split_line = line.split("\t") label = split_line[0] label_int = 1 if label == "True" else 0 sequence = (split_line[1] )[:-1] # without \n at the end of the line sample_object = SampleObject(sequence, label_int) section_samples_per_species.append(sample_object) line_counter += 1 number_of_samples_in_section = \ 2*(self.project.get_number_of_samples())*(self.section_ratios[section_index]) # TODO for now - takes only <self.ratio_of_samples_from_all_species> samples from each section if line_counter >= ( number_of_samples_in_section * self.ratio_of_samples_from_all_species): break number_section_samples = line_counter # takes 1/17 from each species samples random_indices_of_section_samples = random.sample( xrange(number_section_samples), number_section_samples / (len(self.project.species) - 2)) random_section_samples = [ section_samples_per_species[j] for j in range(len(section_samples_per_species)) if j in random_indices_of_section_samples ] if section_name == 'train': train_samples_14000.extend(random_section_samples) train_samples_238000.extend(section_samples_per_species) elif section_name == 'validation': validation_samples_14000.extend(random_section_samples) validation_samples_238000.extend( section_samples_per_species) elif section_name == 'test': test_samples_14000.extend(random_section_samples) test_samples_238000.extend(section_samples_per_species) section_index += 1 # print "len(train_samples_14000) = ", len(train_samples_14000) # print "len(validation_samples_14000) = ", len(validation_samples_14000) # print "len(test_samples_14000) = ", len(test_samples_14000) # print "len(train_samples_238000) = ", len(train_samples_238000) # print "len(validation_samples_238000) = ", len(validation_samples_238000) # print "len(test_samples_238000) = ", len(test_samples_238000) # print "total 238000 train+validation+test: ", len(train_samples_238000) \ # + len(validation_samples_238000) + len(test_samples_238000) # print "total 14000 train+validation+test: ", len(train_samples_14000) \ # + len(validation_samples_14000) + len(test_samples_14000) for i in range( len(self.project.species) - 2, len(self.project.species)): species_name = self.project.species[i] species_dir_text = os.path.join(self.project.text_samples_base_dir, species_name) pos_out_path = os.path.join(species_dir_text, "positive_samples") neg_out_path = os.path.join(species_dir_text, "negative_samples") species_dir_npy = os.path.join(self.project.samples_base_dir, species_name) if i == len( self.project.species ) - 2: # iteration #17 - creating data for all species - 238000 samples train_samples = train_samples_238000 validation_samples = validation_samples_238000 test_samples = test_samples_238000 elif i == len( self.project.species ) - 1: # iteration #18 - creating data for all species - 14000 samples train_samples = train_samples_14000 validation_samples = validation_samples_14000 test_samples = test_samples_14000 with open(pos_out_path, 'w') as out_pos: with open(neg_out_path, 'w') as out_neg: for section_name in self.sections: print "start section: ", section_name if section_name == 'train': section_samples = train_samples elif section_name == 'validation': section_samples = validation_samples elif section_name == 'test': section_samples = test_samples # write the section samples and labels to text file and numpy file path_out_text_X, path_out_text_Y = data_handle.get_path( species_dir_text, section_name) path_out_npy_X, path_out_npy_Y = data_handle.get_path( species_dir_npy, section_name) shuffle(section_samples) section_sample_matrices = [] section_label_matrices = [] section_samples_str = [] section_labels_str = [] for sample in section_samples: section_sample_matrices.append( sample.get_sample_matrix()) section_label_matrices.append( sample.get_label_matrix()) section_samples_str.append(sample.get_sample_str()) section_labels_str.append(str(sample.get_label())) np.save(path_out_npy_X, section_sample_matrices) np.save(path_out_npy_Y, section_label_matrices) with open(path_out_text_X, 'w') as out_text_samples: string = '\n'.join(section_samples_str) out_text_samples.write(string) with open(path_out_text_Y, 'w') as out_text_labels: string = '\n'.join(section_labels_str) out_text_labels.write(string) self.write_positive_and_negative_text_samples_files( section_labels_str, section_samples_str, out_pos, out_neg)
def main(): create_directories() # the k-lets counts are preserved during the negative samples generation for k in range(1, MAXIMAL_K + 1): print "start creating data, k = ", k for species_name in H3K27ac_species_names_ordered: if "All" in species_name: continue print "species name: ", species_name positive_samples, negative_samples = create_data(k, species_name) all_Xs = np.array(positive_samples + negative_samples) all_ys = np.array([1] * len(positive_samples) + [0] * len(negative_samples)) perm = np.random.permutation(len(all_Xs)) all_Xs_shuffled = all_Xs[perm] all_ys_shuffled = all_ys[perm] samples = np.array(all_Xs_shuffled) labels = np.array(convert_labels_to_one_hot(all_ys_shuffled)) all_samples = [] for i in range(len(all_Xs_shuffled)): sample_matrix = all_Xs_shuffled[i] label = all_ys_shuffled[i] label_matrix = labels_map[label] sample_object = SampleObject(sample_matrix, label_matrix, is_matrix=True) all_samples.append(sample_object) indices = dict() train_start_idx = 0 train_end_idx = int(math.ceil(len(all_Xs) * train_ratio)) indices["train"] = (train_start_idx, train_end_idx) validation_start_idx = train_end_idx validation_end_idx = train_end_idx + int( math.ceil(len(all_Xs) * validation_ratio)) indices["validation"] = (validation_start_idx, validation_end_idx) test_start_idx = validation_end_idx test_end_idx = validation_end_idx + int( math.ceil(len(all_Xs) * test_ratio)) indices["test"] = (test_start_idx, test_end_idx) # save npy files for each species path_out_train_X = os.path.join(samples_out_base_dir_for_npy_files, species_name, output_k_lets_dirs[k - 1], 'train_X') path_out_train_y = os.path.join(samples_out_base_dir_for_npy_files, species_name, output_k_lets_dirs[k - 1], 'train_Y') path_out_validation_X = os.path.join( samples_out_base_dir_for_npy_files, species_name, output_k_lets_dirs[k - 1], 'validation_X') path_out_validation_y = os.path.join( samples_out_base_dir_for_npy_files, species_name, output_k_lets_dirs[k - 1], 'validation_Y') path_out_test_X = os.path.join(samples_out_base_dir_for_npy_files, species_name, output_k_lets_dirs[k - 1], 'test_X') path_out_test_y = os.path.join(samples_out_base_dir_for_npy_files, species_name, output_k_lets_dirs[k - 1], 'test_Y') np.save(path_out_train_X, samples[train_start_idx:train_end_idx]) np.save(path_out_train_y, labels[train_start_idx:train_end_idx]) np.save(path_out_validation_X, samples[validation_start_idx:validation_end_idx]) np.save(path_out_validation_y, labels[validation_start_idx:validation_end_idx]) np.save(path_out_test_X, samples[test_start_idx:test_end_idx]) np.save(path_out_test_y, labels[test_start_idx:test_end_idx]) # write positive and negative text files: dir_path = os.path.join(samples_out_base_dir_for_text_files, species_name) all_Xs_text_shuffled = [] all_Ys_text_shuffled = [] for sample in all_samples: all_Xs_text_shuffled.append(sample.get_sample_str()) all_Ys_text_shuffled.append(str(sample.get_label())) text_samples = np.array(all_Xs_text_shuffled) text_labels = np.array(all_Ys_text_shuffled) for section in sections: print "section: ", section path_out_text_X, path_out_text_Y = data_handle.get_path( dir_path, section, k) start, end = indices[section] with open(path_out_text_X, 'w') as out_text_samples: string = '\n'.join(text_samples[start:end]) out_text_samples.write(string) with open(path_out_text_Y, 'w') as out_text_labels: string = '\n'.join(text_labels[start:end]) out_text_labels.write(string) print "End! :)"
def shuffle_and_write_samples(self, all_samples, species_name=None): if species_name: path_out_npy_files_dir = os.path.join( self.project.samples_base_dir, species_name) path_out_text_samples_dir = os.path.join( self.project.text_samples_base_dir, species_name) else: path_out_npy_files_dir = self.project.samples_base_dir path_out_text_samples_dir = self.project.text_samples_base_dir if self.project.sigma: if not os.path.exists( path_out_npy_files_dir) and not os.path.isdir( path_out_npy_files_dir): print "make directory: ", path_out_npy_files_dir os.makedirs(path_out_npy_files_dir) if not os.path.exists( path_out_text_samples_dir) and not os.path.isdir( path_out_text_samples_dir): print "make directory: ", path_out_text_samples_dir os.makedirs(path_out_text_samples_dir) positive_samples_file_path = os.path.join(path_out_text_samples_dir, "positive_samples") negative_samples_file_path = os.path.join(path_out_text_samples_dir, "negative_samples") shuffle(all_samples) # shuffle all sample objects in place all_Xs_matrices_shuffled = [] all_Ys_matrices_shuffled = [] all_Xs_text_shuffled = [] all_Ys_text_shuffled = [] positive_samples = [] negative_samples = [] for sample in all_samples: all_Xs_matrices_shuffled.append(sample.get_sample_matrix()) all_Ys_matrices_shuffled.append(sample.get_label_matrix()) all_Xs_text_shuffled.append(sample.get_sample_str()) all_Ys_text_shuffled.append(str(sample.get_label())) if sample.get_label() == 1: positive_samples.append(sample) elif sample.get_label() == 0: negative_samples.append(sample) # write positive and negative text files: print "write positive and negative text files ... " with open(positive_samples_file_path, 'w') as pos_out: for sample in positive_samples: pos_out.write(sample.get_sample_str() + "\n") with open(negative_samples_file_path, 'w') as neg_out: for sample in negative_samples: neg_out.write(sample.get_sample_str() + "\n") samples = np.array(all_Xs_matrices_shuffled) labels = np.array(all_Ys_matrices_shuffled) text_samples = np.array(all_Xs_text_shuffled) text_labels = np.array(all_Ys_text_shuffled) indices = dict() train_start_idx = 0 train_end_idx = math.ceil(len(samples) * self.section_ratios[0]) indices["train"] = (train_start_idx, train_end_idx) validation_start_idx = train_end_idx validation_end_idx = train_end_idx + math.ceil( len(samples) * self.section_ratios[1]) indices["validation"] = (validation_start_idx, validation_end_idx) test_start_idx = validation_end_idx test_end_idx = validation_end_idx + math.ceil( len(samples) * self.section_ratios[2]) indices["test"] = (test_start_idx, test_end_idx) for section in self.sections: print "section: ", section path_out_text_X, path_out_text_Y = data_handle.get_path( path_out_text_samples_dir, section) path_out_npy_X, path_out_npy_Y = data_handle.get_path( path_out_npy_files_dir, section) start = int(indices[section][0]) end = int(indices[section][1]) np.save(path_out_npy_X, samples[start:end]) np.save(path_out_npy_Y, labels[start:end]) with open(path_out_text_X, 'w') as out_text_samples: string = '\n'.join(text_samples[start:end]) out_text_samples.write(string) with open(path_out_text_Y, 'w') as out_text_labels: string = '\n'.join(text_labels[start:end]) out_text_labels.write(string)