Example #1
0
def get_scores_and_labels(result, test_samples_sequences, test_correct_labels):
    """Collect one score and one integer true label per entry of ``result``.

    For every index of ``result`` a ``SampleObject`` is built from the
    matching sequence and label (third positional argument ``True``), and
    its ``convert_matrix_to_label()`` supplies the true label.  The score is
    element 1 of the corresponding ``result`` row.

    :param result: indexable collection of per-sample score rows.
    :param test_samples_sequences: samples, indexed in step with ``result``.
    :param test_correct_labels: labels, indexed in step with ``result``.
    :return: tuple of two parallel lists: (scores, true labels).
    """
    scores = []
    true_labels = []
    for position in range(len(result)):
        sample = SampleObject(test_samples_sequences[position],
                              test_correct_labels[position], True)
        true_labels.append(sample.convert_matrix_to_label())
        # NOTE(review): the original comment states that column 1 holds the
        # score of the negative label -- confirm against the model output.
        scores.append(result[position][1])
    return scores, true_labels
Example #2
0
def create_one_negative_sample_simulated_data():
    """Build one simulated sample with label 0.

    Draws ``SAMPLE_LENGTH`` one-hot vectors from a uniform multinomial over
    four outcomes, converts each with ``get_base`` and passes the resulting
    list to ``concatenate_random_bases_and_motif``.

    :return: a ``SampleObject`` wrapping the sample, labelled 0.
    """
    uniform_probabilities = [1 / 4.] * 4  # uniform distribution
    random_bases = [
        get_base(np.random.multinomial(1, uniform_probabilities, size=1))
        for _ in range(SAMPLE_LENGTH)
    ]
    return SampleObject(concatenate_random_bases_and_motif(random_bases), 0)
Example #3
0
def create_one_positive_sample_simulated_data(motif, motif_center_index):
    """Build one simulated sample with label 1 that includes ``motif``.

    Draws ``SAMPLE_LENGTH - len(motif)`` one-hot vectors from a uniform
    multinomial over ``BASES_NUMBER`` outcomes, converts each with
    ``get_base`` and hands the bases, the motif and its center index to
    ``concatenate_random_bases_and_motif``.

    :param motif: the motif to include; only its length is used here.
    :param motif_center_index: forwarded unchanged to
        ``concatenate_random_bases_and_motif``.
    :return: a ``SampleObject`` wrapping the sample, labelled 1.
    """
    number_of_random_bases = SAMPLE_LENGTH - len(motif)
    # sample bases from uniform distribution
    random_bases = [
        get_base(
            np.random.multinomial(1,
                                  [1 / float(BASES_NUMBER)] * BASES_NUMBER,
                                  size=1))
        for _ in range(number_of_random_bases)
    ]
    combined = concatenate_random_bases_and_motif(random_bases, motif,
                                                  motif_center_index)
    return SampleObject(combined, 1)
Example #4
0
    def __init__(self, samples_path, labels_path):
        """Load sample and label matrices and wrap each pair in a SampleObject.

        :param samples_path: path given to ``np.load`` for the sample matrices.
        :param labels_path: path given to ``np.load`` for the label matrices.
        """
        loaded_samples = np.load(samples_path)
        loaded_labels = np.load(labels_path)

        # The number of samples is taken from the samples file only.
        self._num_samples = len(loaded_samples)
        self._samples = [
            SampleObject(loaded_samples[position],
                         loaded_labels[position],
                         is_matrix=True)
            for position in range(self._num_samples)
        ]

        # Initial values of the iteration bookkeeping attributes.
        self._current_epoch = 1
        self._current_position_in_epoch = 0
        self._batch_size = 0
    def create_data_from_all_species_together(self):
        """
        creating data for all species together -  12000 samples:
        takes random 1/5 of the samples from the training data of each species,
        random 1/17 of the samples from the validation data of each species,
        and random 1/5 of the samples from the test data of each species.
        (same random indices from all species).
        :return:
        """
        print "start creating data for : ", self.project.species[-1], \
              " and : ", self.project.species[-2]
        train_samples_60000 = []
        validation_samples_60000 = []
        test_samples_60000 = []
        train_samples_12000 = []
        validation_samples_12000 = []
        test_samples_12000 = []
        # collect all training, validation and test data from all species:
        section_index = 0
        for section_name in self.sections:
            for j in range(len(self.project.species) - 2):
                species_name = self.project.species[j]
                section_samples_per_species = []
                section_file_x_path = os.path.join(
                    self.project.text_samples_base_dir, species_name,
                    self.project.k_let_dirs[self.project.k - 1],
                    section_name + "_X")
                section_file_y_path = os.path.join(
                    self.project.text_samples_base_dir, species_name,
                    self.project.k_let_dirs[self.project.k - 1],
                    section_name + "_Y")

                sample_sequences = []
                with open(section_file_x_path) as one_species_section_file_x:
                    for line in one_species_section_file_x:
                        if "\n" in line:
                            sequence = line[:
                                            -1]  # without \n at the end of the line
                        else:
                            sequence = line
                        sample_sequences.append(sequence)
                sample_labels = []
                with open(section_file_y_path) as one_species_section_file_y:
                    for line in one_species_section_file_y:
                        if "\n" in line:
                            label = int(
                                line[:-1])  # without \n at the end of the line
                        else:
                            label = int(line)
                        sample_labels.append(label)
                current_number_of_samples_in_section = 0
                for sample_index in range(len(sample_sequences)):
                    sample_object = SampleObject(
                        sample_sequences[sample_index],
                        sample_labels[sample_index])
                    section_samples_per_species.append(sample_object)
                    current_number_of_samples_in_section += 1
                    number_of_samples_in_section = \
                        2*(self.project.get_number_of_samples())*(self.section_ratios[section_index])
                    if current_number_of_samples_in_section >= (
                            number_of_samples_in_section *
                            self.ratio_of_samples_from_all_species):
                        break

                number_section_samples = current_number_of_samples_in_section
                # takes 1/17 from each species samples
                random_indices_of_section_samples = random.sample(
                    xrange(number_section_samples),
                    number_section_samples / (len(self.project.species) - 2))
                random_section_samples = [
                    section_samples_per_species[j]
                    for j in range(len(section_samples_per_species))
                    if j in random_indices_of_section_samples
                ]
                if section_name == 'train':
                    train_samples_12000.extend(random_section_samples)
                    train_samples_60000.extend(section_samples_per_species)
                elif section_name == 'validation':
                    validation_samples_12000.extend(random_section_samples)
                    validation_samples_60000.extend(
                        section_samples_per_species)
                elif section_name == 'test':
                    test_samples_12000.extend(random_section_samples)
                    test_samples_60000.extend(section_samples_per_species)
            section_index += 1
        for i in range(
                len(self.project.species) - 2, len(self.project.species)):
            species_name = self.project.species[i]
            species_dir_text = os.path.join(self.project.text_samples_base_dir,
                                            species_name)
            pos_out_path = os.path.join(species_dir_text, "positive_samples")
            neg_out_path = os.path.join(
                species_dir_text, self.project.k_let_dirs[self.project.k - 1],
                "negative_samples")
            species_dir_npy = os.path.join(self.project.samples_base_dir,
                                           species_name)
            if i == len(
                    self.project.species
            ) - 2:  # iteration #17 - creating data for all species - 60000 samples
                train_samples = train_samples_60000
                validation_samples = validation_samples_60000
                test_samples = test_samples_60000
            elif i == len(
                    self.project.species
            ) - 1:  # iteration #18 - creating data for all species - 12000 samples
                train_samples = train_samples_12000
                validation_samples = validation_samples_12000
                test_samples = test_samples_12000
            with open(pos_out_path, 'w') as out_pos:
                with open(neg_out_path, 'w') as out_neg:
                    for section_name in self.sections:
                        print "start section: ", section_name
                        if section_name == 'train':
                            section_samples = train_samples
                        elif section_name == 'validation':
                            section_samples = validation_samples
                        elif section_name == 'test':
                            section_samples = test_samples
                        # write the section samples and labels to text file and numpy file
                        path_out_text_X, path_out_text_Y = data_handle.get_path(
                            species_dir_text, section_name, self.project.k)
                        path_out_npy_X, path_out_npy_Y = data_handle.get_path(
                            species_dir_npy, section_name, self.project.k)
                        shuffle(section_samples)
                        section_sample_matrices = []
                        section_label_matrices = []
                        section_samples_str = []
                        section_labels_str = []
                        for sample in section_samples:
                            section_sample_matrices.append(
                                sample.get_sample_matrix())
                            section_label_matrices.append(
                                sample.get_label_matrix())
                            section_samples_str.append(sample.get_sample_str())
                            section_labels_str.append(str(sample.get_label()))
                        np.save(path_out_npy_X, section_sample_matrices)
                        np.save(path_out_npy_Y, section_label_matrices)
                        with open(path_out_text_X, 'w') as out_text_samples:
                            string = '\n'.join(section_samples_str)
                            out_text_samples.write(string)
                        with open(path_out_text_Y, 'w') as out_text_labels:
                            string = '\n'.join(section_labels_str)
                            out_text_labels.write(string)
                        self.write_positive_and_negative_text_samples_files(
                            section_labels_str, section_samples_str, out_pos,
                            out_neg)
Example #6
0
def main():
    create_directories()

    # the k-lets counts are preserved during the negative samples generation
    for k in range(1, MAXIMAL_K + 1):
        print "start creating data, k = ", k
        for species_name in H3K27ac_species_names_ordered:
            if "All" in species_name:
                continue
            print "species name: ", species_name
            positive_samples, negative_samples = create_data(k, species_name)
            all_Xs = np.array(positive_samples + negative_samples)
            all_ys = np.array([1] * len(positive_samples) +
                              [0] * len(negative_samples))
            perm = np.random.permutation(len(all_Xs))

            all_Xs_shuffled = all_Xs[perm]
            all_ys_shuffled = all_ys[perm]

            samples = np.array(all_Xs_shuffled)
            labels = np.array(convert_labels_to_one_hot(all_ys_shuffled))

            all_samples = []
            for i in range(len(all_Xs_shuffled)):
                sample_matrix = all_Xs_shuffled[i]
                label = all_ys_shuffled[i]
                label_matrix = labels_map[label]
                sample_object = SampleObject(sample_matrix,
                                             label_matrix,
                                             is_matrix=True)
                all_samples.append(sample_object)

            indices = dict()
            train_start_idx = 0
            train_end_idx = int(math.ceil(len(all_Xs) * train_ratio))
            indices["train"] = (train_start_idx, train_end_idx)
            validation_start_idx = train_end_idx
            validation_end_idx = train_end_idx + int(
                math.ceil(len(all_Xs) * validation_ratio))
            indices["validation"] = (validation_start_idx, validation_end_idx)
            test_start_idx = validation_end_idx
            test_end_idx = validation_end_idx + int(
                math.ceil(len(all_Xs) * test_ratio))
            indices["test"] = (test_start_idx, test_end_idx)

            # save npy files for each species

            path_out_train_X = os.path.join(samples_out_base_dir_for_npy_files,
                                            species_name,
                                            output_k_lets_dirs[k - 1],
                                            'train_X')
            path_out_train_y = os.path.join(samples_out_base_dir_for_npy_files,
                                            species_name,
                                            output_k_lets_dirs[k - 1],
                                            'train_Y')
            path_out_validation_X = os.path.join(
                samples_out_base_dir_for_npy_files, species_name,
                output_k_lets_dirs[k - 1], 'validation_X')
            path_out_validation_y = os.path.join(
                samples_out_base_dir_for_npy_files, species_name,
                output_k_lets_dirs[k - 1], 'validation_Y')
            path_out_test_X = os.path.join(samples_out_base_dir_for_npy_files,
                                           species_name,
                                           output_k_lets_dirs[k - 1], 'test_X')
            path_out_test_y = os.path.join(samples_out_base_dir_for_npy_files,
                                           species_name,
                                           output_k_lets_dirs[k - 1], 'test_Y')

            np.save(path_out_train_X, samples[train_start_idx:train_end_idx])
            np.save(path_out_train_y, labels[train_start_idx:train_end_idx])

            np.save(path_out_validation_X,
                    samples[validation_start_idx:validation_end_idx])
            np.save(path_out_validation_y,
                    labels[validation_start_idx:validation_end_idx])

            np.save(path_out_test_X, samples[test_start_idx:test_end_idx])
            np.save(path_out_test_y, labels[test_start_idx:test_end_idx])

            # write positive and negative text files:
            dir_path = os.path.join(samples_out_base_dir_for_text_files,
                                    species_name)

            all_Xs_text_shuffled = []
            all_Ys_text_shuffled = []
            for sample in all_samples:
                all_Xs_text_shuffled.append(sample.get_sample_str())
                all_Ys_text_shuffled.append(str(sample.get_label()))

            text_samples = np.array(all_Xs_text_shuffled)
            text_labels = np.array(all_Ys_text_shuffled)

            for section in sections:
                print "section: ", section
                path_out_text_X, path_out_text_Y = data_handle.get_path(
                    dir_path, section, k)
                start, end = indices[section]
                with open(path_out_text_X, 'w') as out_text_samples:
                    string = '\n'.join(text_samples[start:end])
                    out_text_samples.write(string)
                with open(path_out_text_Y, 'w') as out_text_labels:
                    string = '\n'.join(text_labels[start:end])
                    out_text_labels.write(string)

    print "End! :)"
    def create_data_from_all_species_together(self):
        """
        creating data for all species together -  14000 samples:
        takes random 1/17 of the samples from the training data of each species,
        random 1/17 of the samples from the validation data of each species,
        and random 1/17 of the samples from the test data of each species.
        (same random indices from all species).
        :return:
        """
        print "start creating data for : ", self.project.species[-1], \
              " and : ", self.project.species[-2]
        train_samples_238000 = []
        validation_samples_238000 = []
        test_samples_238000 = []
        train_samples_14000 = []
        validation_samples_14000 = []
        test_samples_14000 = []
        # collect all training, validation and test data from all species:
        section_index = 0
        for section_name in self.sections:
            for j in range(len(self.project.species) - 2):
                species_name = self.project.species[j]
                section_samples_per_species = []
                section_file_path = os.path.join(
                    self.project.text_samples_base_dir, species_name,
                    section_name + "_data")
                with open(section_file_path) as one_species_section_file:
                    line_counter = 0
                    for line in one_species_section_file:
                        split_line = line.split("\t")
                        label = split_line[0]
                        label_int = 1 if label == "True" else 0
                        sequence = (split_line[1]
                                    )[:-1]  # without \n at the end of the line
                        sample_object = SampleObject(sequence, label_int)
                        section_samples_per_species.append(sample_object)
                        line_counter += 1
                        number_of_samples_in_section = \
                            2*(self.project.get_number_of_samples())*(self.section_ratios[section_index])
                        # TODO for now - takes only <self.ratio_of_samples_from_all_species> samples from each section
                        if line_counter >= (
                                number_of_samples_in_section *
                                self.ratio_of_samples_from_all_species):
                            break

                number_section_samples = line_counter
                # takes 1/17 from each species samples
                random_indices_of_section_samples = random.sample(
                    xrange(number_section_samples),
                    number_section_samples / (len(self.project.species) - 2))
                random_section_samples = [
                    section_samples_per_species[j]
                    for j in range(len(section_samples_per_species))
                    if j in random_indices_of_section_samples
                ]
                if section_name == 'train':
                    train_samples_14000.extend(random_section_samples)
                    train_samples_238000.extend(section_samples_per_species)
                elif section_name == 'validation':
                    validation_samples_14000.extend(random_section_samples)
                    validation_samples_238000.extend(
                        section_samples_per_species)
                elif section_name == 'test':
                    test_samples_14000.extend(random_section_samples)
                    test_samples_238000.extend(section_samples_per_species)
            section_index += 1
        # print "len(train_samples_14000) = ", len(train_samples_14000)
        # print "len(validation_samples_14000) = ", len(validation_samples_14000)
        # print "len(test_samples_14000) = ", len(test_samples_14000)
        # print "len(train_samples_238000) = ", len(train_samples_238000)
        # print "len(validation_samples_238000) = ", len(validation_samples_238000)
        # print "len(test_samples_238000) = ", len(test_samples_238000)
        # print "total 238000 train+validation+test: ", len(train_samples_238000) \
        #         + len(validation_samples_238000) + len(test_samples_238000)
        # print "total 14000 train+validation+test: ", len(train_samples_14000) \
        #         + len(validation_samples_14000) + len(test_samples_14000)
        for i in range(
                len(self.project.species) - 2, len(self.project.species)):
            species_name = self.project.species[i]
            species_dir_text = os.path.join(self.project.text_samples_base_dir,
                                            species_name)
            pos_out_path = os.path.join(species_dir_text, "positive_samples")
            neg_out_path = os.path.join(species_dir_text, "negative_samples")
            species_dir_npy = os.path.join(self.project.samples_base_dir,
                                           species_name)
            if i == len(
                    self.project.species
            ) - 2:  # iteration #17 - creating data for all species - 238000 samples
                train_samples = train_samples_238000
                validation_samples = validation_samples_238000
                test_samples = test_samples_238000
            elif i == len(
                    self.project.species
            ) - 1:  # iteration #18 - creating data for all species - 14000 samples
                train_samples = train_samples_14000
                validation_samples = validation_samples_14000
                test_samples = test_samples_14000
            with open(pos_out_path, 'w') as out_pos:
                with open(neg_out_path, 'w') as out_neg:
                    for section_name in self.sections:
                        print "start section: ", section_name
                        if section_name == 'train':
                            section_samples = train_samples
                        elif section_name == 'validation':
                            section_samples = validation_samples
                        elif section_name == 'test':
                            section_samples = test_samples
                        # write the section samples and labels to text file and numpy file
                        path_out_text_X, path_out_text_Y = data_handle.get_path(
                            species_dir_text, section_name)
                        path_out_npy_X, path_out_npy_Y = data_handle.get_path(
                            species_dir_npy, section_name)
                        shuffle(section_samples)
                        section_sample_matrices = []
                        section_label_matrices = []
                        section_samples_str = []
                        section_labels_str = []
                        for sample in section_samples:
                            section_sample_matrices.append(
                                sample.get_sample_matrix())
                            section_label_matrices.append(
                                sample.get_label_matrix())
                            section_samples_str.append(sample.get_sample_str())
                            section_labels_str.append(str(sample.get_label()))
                        np.save(path_out_npy_X, section_sample_matrices)
                        np.save(path_out_npy_Y, section_label_matrices)
                        with open(path_out_text_X, 'w') as out_text_samples:
                            string = '\n'.join(section_samples_str)
                            out_text_samples.write(string)
                        with open(path_out_text_Y, 'w') as out_text_labels:
                            string = '\n'.join(section_labels_str)
                            out_text_labels.write(string)
                        self.write_positive_and_negative_text_samples_files(
                            section_labels_str, section_samples_str, out_pos,
                            out_neg)
Example #8
0
def create_one_sample(is_positive, sequence):
    """Wrap ``sequence`` in a ``SampleObject`` with a 0/1 label.

    :param is_positive: truthy for label 1, falsy for label 0.
    :param sequence: passed unchanged as the first ``SampleObject`` argument.
    :return: the new ``SampleObject``.
    """
    if is_positive:
        return SampleObject(sequence, 1)
    return SampleObject(sequence, 0)