Example no. 1
0
def run_batch_training_from_tuples():
    """Train the joint base/runlength classifier from pickled tuple files.

    Gathers every ``.pkl`` file under the configured parent directories,
    accumulates observation counts, fits the distribution, and pickles
    the result under a timestamped filename.
    """
    chr_paths = ["output/joint_runlength_base_model/2018_11_12_14_23_56_638745/"]

    trainer = JointClassifierTrainer()

    # Collect every pickle path across all parent directories.
    all_file_paths = [
        file_path
        for directory in chr_paths
        for file_path in FileManager.get_all_file_paths_by_type(
            parent_directory_path=directory, file_extension=".pkl")
    ]

    counts = trainer.get_counts_from_tuples(paths=all_file_paths)
    distribution = trainer.train_model(counts)

    distribution_output_dir = "/home/ryan/code/nanopore_assembly/output/joint_runlength_base_model/distribution/"
    distribution_filename = "distribution_" + FileManager.get_datetime_string()

    print("\nSAVING: ", os.path.join(distribution_output_dir, distribution_filename))

    FileManager.save_object_pickle(object=distribution, filename=distribution_filename, output_dir=distribution_output_dir)
Example no. 2
0
def train_joint_model_from_tuples(tuples_path):
    """Fit the joint distribution from previously saved training tuples.

    Loads tuples from ``tuples_path`` (capped by ``cutoff``), trains the
    model, and pickles the resulting distribution under a timestamped
    filename in the fixed output directory.
    """
    tuples = load_training_tuples(tuples_path, cutoff=16)

    print("training tuples loaded: ", len(tuples))

    distribution = train_model(data=tuples)

    output_dir = "/home/ryan/code/nanopore_assembly/output/joint_runlength_base_model/distribution/"
    filename = "distribution_" + FileManager.get_datetime_string()

    FileManager.save_object_pickle(object=distribution, filename=filename, output_dir=output_dir)
Example no. 3
0
def generate_training_data(data_loader, batch_size, consensus_caller, output_dir, filename_suffix, gap_filterer=None):
    """Extract (truth, observation) training tuples from pileup batches and
    periodically pickle them to output_dir.

    Iterates the data loader, optionally filters gap columns from each batch,
    then for each window in the batch collects joint base/runlength
    truth-vs-observation tuples. Accumulated tuples are pickled every 10000
    windows (and the accumulator reset). Returns output_dir.

    NOTE(review): the `batch_size` and `consensus_caller` parameters are
    shadowed/unused respectively — batch_size is reassigned from the tensor
    shape below; consensus_caller is never referenced in this view.
    """
    # datetime_string = FileManager.get_datetime_string()
    #
    # output_dir = os.path.join(output_dir, datetime_string)
    # filename = "joint_distribution_" + datetime_string

    # NOTE(review): n_files is len(data_loader), i.e. the number of BATCHES,
    # but `i` below increments once per WINDOW — the progress percentage and
    # the `i == n_files - 1` final-save check only line up when each batch
    # holds a single window. Confirm intended batch size.
    n_files = len(data_loader)
    all_training_tuples = list()
    i = 0

    print("testing n windows: ", n_files)

    for b, batch in enumerate(data_loader):
        # sys.stdout.write("\r %.2f%% COMPLETED  " % (100*b/n_batches))

        paths, x_pileup, y_pileup, x_repeat, y_repeat, reversal = batch

        # print()
        # print("X PILEUP", x_pileup.shape)
        # print("Y PILEUP", y_pileup.shape)
        # print("X REPEAT", x_repeat.shape)
        # print("Y REPEAT", y_repeat.shape)
        # print("REVERSAL", reversal.shape)

        if gap_filterer is not None:
            try:
                # filter_batch returns the batch without the paths element.
                batch = gap_filterer.filter_batch(batch, plot=False)
                x_pileup, y_pileup, x_repeat, y_repeat, reversal = batch

            except ValueError as e:
                # Best-effort: log the offending shapes and skip this batch.
                print("ERROR:", e)
                print("X PILEUP", x_pileup.shape)
                print("Y PILEUP", y_pileup.shape)
                print("X REPEAT", x_repeat.shape)
                print("Y REPEAT", y_repeat.shape)
                print("REVERSAL", reversal.shape)

                continue

        # (n,h,w) shape
        # NOTE: rebinds the `batch_size` parameter with the actual batch dim.
        batch_size, n_channels, height, width = x_pileup.shape

        for n in range(batch_size):
            # input shape = (batch_size, n_channels, height, width)
            # example x_pileup_n shape: (5, 44, 24)
            # example y_pileup_n shape: (5, 1, 24)
            # example x_repeat_n shape: (1, 44, 24)
            # example y_repeat_n shape: (1, 1, 24)

            x_pileup_n = x_pileup[n, :, :].reshape([n_channels, height, width])
            y_pileup_n = y_pileup[n, :, :].reshape([5, 1, width])
            x_repeat_n = x_repeat[n, :, :].reshape([1, height, width])
            y_repeat_n = y_repeat[n, :, :].reshape([1, width])
            reversal_n = reversal[n, :, :].reshape([1, height, width])

            # NOTE(review): paths[0] is used for every window in the batch —
            # looks like it should be paths[n] when batch_size > 1; confirm.
            truths_vs_observations = get_joint_base_runlength_observations_vs_truth(x_pileup=x_pileup_n,
                                                                                    y_pileup=y_pileup_n,
                                                                                    x_repeat=x_repeat_n,
                                                                                    y_repeat=y_repeat_n,
                                                                                    reversal=reversal_n,
                                                                                    path=paths[0])

            all_training_tuples.extend(truths_vs_observations)

            # NOTE(review): i % 1 == 0 is always true — progress is printed
            # every window; presumably a leftover from a larger interval.
            if i % 1 == 0:
                sys.stdout.write("\r " + str(round(i/n_files*100,3)) + "% completed")

            # Flush accumulated tuples to a numbered pickle and reset.
            if i % 10000 == 0 or i == n_files -1:
                filename = "training_data_" + filename_suffix + "_" + str(i)
                print("\nSAVING: ", os.path.join(output_dir, filename))
                FileManager.save_object_pickle(output_dir=output_dir, filename=filename, object=all_training_tuples)
                all_training_tuples = list()

            i += 1

    return output_dir