Example #1
def run():
    directory = "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003279.8"  # one-hot with anchors and reversal matrix Chr1 filtered 2820
    model_state_path = "output/training_2018-10-17-15-1-39-2-290/model_checkpoint_9"

    file_paths = FileManager.get_all_file_paths_by_type(
        parent_directory_path=directory, file_extension=".npz", sort=False)

    # Training parameters
    batch_size_train = 1
    n_batches = 1000

    threshold = 0.005

    data_loader = DataLoader(file_paths=file_paths,
                             batch_size=batch_size_train,
                             parse_batches=False,
                             convert_to_distributions=False,
                             use_gpu=False)

    gap_filterer = GapFilterer(model_state_path=model_state_path,
                               threshold=threshold)

    test_filter(gap_filterer=gap_filterer,
                data_loader=data_loader,
                n_batches=n_batches)
Example #2
def run():
    directory = "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003279.8"  # one-hot with anchors and reversal matrix chr1 celegans

    # file_paths = FileManager.get_all_file_paths_by_type(parent_directory_path=directory, file_extension=".npz", sort=False)

    file_paths = [
        "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003279.8/NC_003279.8_9699291_matrix.npz",
        "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003279.8/NC_003279.8_4172039_matrix.npz",
        "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003279.8/NC_003279.8_4552073_matrix.npz",
        "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003279.8/NC_003279.8_7332035_matrix.npz",
        "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003279.8/NC_003279.8_12807084_matrix.npz",
        "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003279.8/NC_003279.8_7773028_matrix.npz"
    ]

    # Training parameters
    batch_size_train = 1
    n_batches = 500

    data_loader = DataLoader(file_paths=file_paths,
                             batch_size=batch_size_train,
                             parse_batches=False)

    consensus_caller = ConsensusCaller(sequence_to_index, sequence_to_float)

    print(len(data_loader))
    test_consensus(consensus_caller=consensus_caller,
                   data_loader=data_loader,
                   n_batches=n_batches,
                   plot_mismatches=False)
Example #3
def run():
    # directory = "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-2-10-43-22-1-275/NC_003282.8"  # one-hot with anchors and reversal matrix Chr4
    directory = "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003279.8"  # one hot with anchors and reversal matrix chr1

    file_paths = FileManager.get_all_file_paths_by_type(
        parent_directory_path=directory, file_extension=".npz", sort=False)

    # Training parameters
    batch_size_train = 1
    checkpoint_interval = 300

    n_batches = 1000

    data_loader = DataLoader(file_paths=file_paths,
                             batch_size=batch_size_train,
                             parse_batches=False)

    consensus_caller = ConsensusCaller(sequence_to_index, sequence_to_float)

    gap_filterer = GapFilterer()

    print(len(data_loader))
    test_consensus(consensus_caller=consensus_caller,
                   data_loader=data_loader,
                   plot_mismatches=False,
                   gap_filterer=gap_filterer,
                   n_batches=n_batches)
Example #4
def run():
    directory = "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-2-10-43-22-1-275/NC_003282.8"  # one-hot with anchors and reversal matrix Chr4

    file_paths = FileManager.get_all_file_paths_by_type(
        parent_directory_path=directory, file_extension=".npz", sort=False)

    # Architecture parameters
    hidden_size = 256
    input_channels = 7  # channels per pileup column
    output_size = 1  # single scalar output per column
    n_layers = 1

    # Hyperparameters
    learning_rate = 1e-3
    weight_decay = 1e-5
    dropout_rate = 0.1

    # Training parameters
    batch_size_train = 1
    n_batches = None

    checkpoint_interval = 250

    data_loader = DataLoader(file_paths=file_paths,
                             batch_size=batch_size_train,
                             parse_batches=True)
    model = EncoderDecoder(hidden_size=hidden_size,
                           input_size=input_channels,
                           output_size=output_size,
                           n_layers=n_layers,
                           dropout_rate=dropout_rate)
    results_handler = ResultsHandler()

    # Initialize the optimizer with above parameters
    optimizer = optim.Adam(model.parameters(),
                           lr=learning_rate,
                           weight_decay=weight_decay)

    # Define the loss function
    loss_fn_repeat = nn.MSELoss()
    loss_fn_base = nn.CrossEntropyLoss()

    losses = train(model=model,
                   input_channels=input_channels,
                   data_loader=data_loader,
                   optimizer=optimizer,
                   loss_fn_repeat=loss_fn_repeat,
                   loss_fn_base=loss_fn_base,
                   n_batches=n_batches,
                   results_handler=results_handler,
                   checkpoint_interval=checkpoint_interval)

    results_handler.save_model(model)
    results_handler.save_plot(losses)
Example #5
def run_generate_tuples_from_pileups():
    max_threads = 6

    # NC_003279.8   Caenorhabditis elegans chromosome I
    # NC_003280.10  Caenorhabditis elegans chromosome II
    # NC_003281.10  Caenorhabditis elegans chromosome III
    # NC_003282.8   Caenorhabditis elegans chromosome IV
    # NC_003283.11  Caenorhabditis elegans chromosome V
    # NC_003284.9   Caenorhabditis elegans chromosome X
    # NC_001328.1   Caenorhabditis elegans mitochondrion, complete genome

    # data_path = ["/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003279.8",
    #              "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003280.10",
    #              "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003281.10",
    #              "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003282.8",
    #              "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003283.11",
    #              "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003284.9"]

    data_path = ["/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-11-12-14-8-24-0-316/gi"]

    args = list()
    for path in data_path:
        gap_filterer = GapFilterer()

        batch_size = 1

        file_paths = FileManager.get_all_file_paths_by_type(parent_directory_path=path, file_extension=".npz")

        data_loader = DataLoader(file_paths, batch_size=batch_size, parse_batches=False)

        consensus_caller = ConsensusCaller(sequence_to_index=sequence_to_index, sequence_to_float=sequence_to_float)

        output_dir = "output/joint_runlength_base_model/" + FileManager.get_datetime_string()

        filename_suffix = path.split("/")[-1]
        print(filename_suffix)

        args.append([data_loader, batch_size, consensus_caller, output_dir, filename_suffix, gap_filterer])

        # release the local reference before the next iteration (args still
        # holds one) and prompt a garbage-collection pass
        gap_filterer = None

        gc.collect()

    n_threads = min(len(args), max_threads)

    for arg in args:
        print(arg)
    print(n_threads)

    with Pool(processes=n_threads) as pool:
        pool.starmap(generate_training_data, args)
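Note that pool.starmap unpacks each entry of args positionally, so generate_training_data (not shown in this example) must accept its parameters in exactly that order. A hypothetical stub of the expected signature:

def generate_training_data(data_loader, batch_size, consensus_caller,
                           output_dir, filename_suffix, gap_filterer):
    # one worker's share of the tuple-generation job (body omitted)
    ...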
Example #6
def run():
    directory = "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003279.8"     # one-hot with anchors and reversal matrix Chr1 filtered 2820
    model_state_path = "output/training_2018-10-17-15-1-39-2-290/model_checkpoint_9"

    file_paths = FileManager.get_all_file_paths_by_type(parent_directory_path=directory, file_extension=".npz", sort=False)

    # Architecture parameters
    hidden_size = 256
    input_channels = 61  # concatenated pileup + repeat distribution channels
    output_size = 1  # single sigmoid output per column
    n_layers = 1

    # Hyperparameters
    learning_rate = 5e-4
    weight_decay = 1e-5
    dropout_rate = 0.1

    # Training parameters
    batch_size_train = 1
    n_batches = 10000

    scale_by_length = True
    checkpoint_interval = 200

    use_gpu = False

    data_loader = DataLoader(file_paths=file_paths,
                             batch_size=batch_size_train,
                             parse_batches=True,
                             convert_to_distributions=True,
                             convert_to_binary=True,
                             use_gpu=use_gpu)
    print(data_loader.y_dtype)

    model = Decoder(hidden_size=hidden_size,
                    input_size=input_channels,
                    output_size=output_size,
                    n_layers=n_layers,
                    dropout_rate=dropout_rate,
                    use_sigmoid=True)
    model.load_state_dict(torch.load(model_state_path))

    results_handler = ResultsHandler()

    # for alignment/pileup operations + conversions
    consensus_caller = ConsensusCaller(sequence_to_index=sequence_to_index, sequence_to_float=sequence_to_float)

    if use_gpu:
        model = model.cuda()

    test(model=model,
         data_loader=data_loader,
         n_batches=n_batches,
         results_handler=results_handler,
         checkpoint_interval=checkpoint_interval,
         consensus_caller=consensus_caller,
         use_gpu=use_gpu)
Example #7
def run():
    # data_path = "/home/ryan/code/nanopore_assembly/output/celegans_chr1_1m_windows_spoa_pileup_generation_2018-9-18"    # 1 million bases in celegans chr1 scrappie
    # data_path = "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_human_chr1_1mbp_2018-9-18"             # 1 million bases in human guppy
    # data_path = "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_celegans_chr1_2-12Mbp_2018-9-21"       # 10 million bases in human guppy
    # data_path = "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_chr5_FULL_20Mbp_2018-9-24"             # chr5 full float encoded
    # data_path = "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-2-10-43-22-1-275/NC_003279.8"  # chr1 one-hot
    # data_path = "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_fixed_size2018-10-11-13-1-59-3-284"    # chr1 fixed size windows one-hot
    # data_path = "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_anchored2018-10-11-11-20-29-3-284"    # chr1 windowed windows one-hot
    # data_path = "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_anchored_2018-10-12-13-38-4-4-285/NC_003279.8" # chr1 fixed size windows one-hot (repeat transitions only!)
    data_path = "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003279.8"  # chr1 filtered BAM celegans repeat transition windows

    file_paths = FileManager.get_all_file_paths_by_type(
        parent_directory_path=data_path, file_extension=".npz")

    data_loader = DataLoader(file_paths, batch_size=1, parse_batches=False)

    lengths = measure_runlengths(data_loader)
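measure_runlengths itself is not shown in this example; for reference, a minimal run-length measurement over a plain sequence string could look like the sketch below (measure_runlengths_naive and its string input are hypothetical, for illustration only):

from itertools import groupby

def measure_runlengths_naive(sequence):
    # collapse each homopolymer run: "AAACCG" -> [("A", 3), ("C", 2), ("G", 1)]
    return [(base, sum(1 for _ in run)) for base, run in groupby(sequence)]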
Example #8
    def generate_filter_mask(self, x_pileup, y_pileup, x_repeat, reversal):
        if self.use_gpu:
            self.model.cuda()

        # (c,h,w) shape
        n_channels, height, width = x_pileup.shape

        _, x_pileup_distribution, x_repeat_distribution = \
            DataLoader.convert_pileup_to_distributions(x_pileup, x_repeat, reversal)

        x_pileup_distribution = numpy.expand_dims(x_pileup_distribution,
                                                  axis=0)
        x_repeat_distribution = numpy.expand_dims(x_repeat_distribution,
                                                  axis=0)

        # print(x_pileup_distribution.shape)
        # print(x_repeat_distribution.shape)

        if self.use_gpu:
            x_pileup_distribution = torch.cuda.FloatTensor(
                x_pileup_distribution)
            x_repeat_distribution = torch.cuda.FloatTensor(
                x_repeat_distribution)

        else:
            x_pileup_distribution = torch.FloatTensor(x_pileup_distribution)
            x_repeat_distribution = torch.FloatTensor(x_repeat_distribution)

        # print()
        # print("X PILEUP",x_pileup.shape)
        # print("Y PILEUP",y_pileup.shape)
        # print("X REPEAT",x_repeat.shape)
        # print("REVERSAL",reversal.shape)

        x = torch.cat([x_pileup_distribution, x_repeat_distribution],
                      dim=2).reshape([1, 61, width])

        y_pileup_predict = self.model.forward(x)

        if self.use_gpu:
            y_pileup_predict = y_pileup_predict.cpu()

        gap_filter_mask = (y_pileup_predict.detach().numpy().reshape([width]) >
                           self.threshold)

        return gap_filter_mask
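Downstream, a mask like this is presumably used to drop predicted gap columns from the pileup. A minimal sketch of that masking step, assuming a (channels, height, width) numpy array and a (width,) boolean mask (apply_gap_filter_mask is a hypothetical name):

import numpy

def apply_gap_filter_mask(x_pileup, gap_filter_mask):
    # keep only the columns whose predicted score exceeded the threshold
    return x_pileup[:, :, gap_filter_mask]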
Example #9
def run():
    # directory = "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-2-10-43-22-1-275/NC_003282.8"  # Training Chr4
    directory = "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003279.8"  # Testing Chr1

    model_state_path = "/home/ryan/code/nanopore_assembly/output/training_2018-11-8-10-13-3-3-312/model_checkpoint_5"
    # file_paths = FileManager.get_all_file_paths_by_type(parent_directory_path=directory, file_extension=".npz", sort=False)

    file_paths = [
        "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003279.8/NC_003279.8_9699291_matrix.npz",
        "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003279.8/NC_003279.8_4172039_matrix.npz",
        "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003279.8/NC_003279.8_4552073_matrix.npz",
        "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003279.8/NC_003279.8_7332035_matrix.npz",
        "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003279.8/NC_003279.8_12807084_matrix.npz",
        "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003279.8/NC_003279.8_7773028_matrix.npz"
    ]

    # Architecture parameters
    hidden_size = 512
    input_channels = 7  # channels per pileup column
    output_size = 5 + 1  # '-','A','C','T','G' one-hot plus one extra class
    n_layers = 2

    # Hyperparameters
    learning_rate = 1e-3
    weight_decay = 1e-5
    dropout_rate = 0.1

    # Training parameters
    batch_size_train = 1
    n_batches = None

    scale_by_length = False

    checkpoint_interval = 50

    use_gpu = False

    data_loader = DataLoader(file_paths=file_paths,
                             batch_size=batch_size_train,
                             parse_batches=False,
                             use_gpu=use_gpu)
    model = EncoderDecoder(hidden_size=hidden_size,
                           input_size=input_channels,
                           output_size=output_size,
                           n_layers=n_layers,
                           dropout_rate=dropout_rate)
    results_handler = ResultsHandler()

    model.load_state_dict(torch.load(model_state_path))

    # for alignment/pileup operations + conversions
    consensus_caller = ConsensusCaller(sequence_to_index=sequence_to_index,
                                       sequence_to_float=sequence_to_float)

    # remove gap columns with preprocessing RNN
    gap_filterer = GapFilterer(threshold=0.003)

    if use_gpu:
        model = model.cuda()

    print(model)

    losses = test(model=model,
                  input_channels=input_channels,
                  data_loader=data_loader,
                  n_batches=n_batches,
                  results_handler=results_handler,
                  checkpoint_interval=checkpoint_interval,
                  consensus_caller=consensus_caller,
                  scale_by_length=scale_by_length,
                  use_gpu=use_gpu,
                  gap_filterer=gap_filterer)
Example #10
def run(load_model=False, model_state_path=None):
    directory = "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-9-12-15-25-8-2-255"  # spoa 2 pass arbitray region 2500 windows

    file_paths = FileManager.get_all_file_paths_by_type(
        parent_directory_path=directory, file_extension=".npz", sort=False)

    results_handler = ResultsHandler()

    # Architecture parameters
    hidden_size = 64
    input_channels = 5 * 2  # 5 base channels x 2 (frequencies + repeat counts)
    output_size = 5  # '-','A','C','T','G' one-hot vector
    n_layers = 2

    # Hyperparameters
    learning_rate = 1e-4
    weight_decay = 1e-5
    dropout_rate = 0.1

    # Training parameters
    batch_size_train = 1
    n_batches = 8000

    checkpoint_interval = 200

    data_loader = DataLoader(file_paths=file_paths,
                             batch_size=batch_size_train,
                             convert_to_frequency=True,
                             convert_repeats_to_counts=True)
    model = Decoder(hidden_size=hidden_size,
                    input_size=input_channels,
                    output_size=output_size,
                    n_layers=n_layers,
                    dropout_rate=dropout_rate)

    # Initialize the optimizer with above parameters
    optimizer = optim.Adam(model.parameters(),
                           lr=learning_rate,
                           weight_decay=weight_decay)

    # Define the loss function
    # loss_fn = nn.MSELoss()
    loss_fn = nn.CrossEntropyLoss()

    if load_model:
        # get weight parameters from saved model state
        model.load_state_dict(torch.load(model_state_path))

    # Train and get the resulting loss per iteration
    losses = train(model=model,
                   data_loader=data_loader,
                   optimizer=optimizer,
                   loss_fn=loss_fn,
                   n_batches=n_batches,
                   results_handler=results_handler,
                   checkpoint_interval=checkpoint_interval)

    # test(model=model,
    #      data_loader=data_loader,
    #      n_batches=4)

    results_handler.save_model(model)
    results_handler.save_plot(losses)

    print(model)
Example #11
def run():
    # directory = "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-15-13-10-33-0-288/NC_003283.11"     # one-hot with anchors and reversal matrix Chr5 filtered 2820
    directory = "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-11-12-14-8-24-0-316/gi"  # one-hot with anchors and reversal matrix E. Coli

    file_paths = FileManager.get_all_file_paths_by_type(
        parent_directory_path=directory, file_extension=".npz", sort=False)

    # Architecture parameters
    hidden_size = 256
    input_channels = 61  # concatenated pileup + repeat distribution channels
    output_size = 1  # single sigmoid output per column
    n_layers = 1

    # Hyperparameters
    learning_rate = 5e-4
    weight_decay = 1e-5
    dropout_rate = 0.1

    # Training parameters
    batch_size_train = 1
    n_batches = None

    scale_by_length = True
    checkpoint_interval = 200

    use_gpu = False

    data_loader = DataLoader(file_paths=file_paths,
                             batch_size=batch_size_train,
                             parse_batches=True,
                             convert_to_distributions=True,
                             convert_to_binary=True,
                             use_gpu=use_gpu)
    print(data_loader.y_dtype)

    model = Decoder(hidden_size=hidden_size,
                    input_size=input_channels,
                    output_size=output_size,
                    n_layers=n_layers,
                    dropout_rate=dropout_rate,
                    use_sigmoid=True)
    results_handler = ResultsHandler()

    # Initialize the optimizer with above parameters
    optimizer = optim.Adam(model.parameters(),
                           lr=learning_rate,
                           weight_decay=weight_decay)

    # Define the loss function
    loss_fn_repeat = nn.MSELoss()
    loss_fn_base = nn.BCELoss()

    # for alignment/pileup operations + conversions
    consensus_caller = ConsensusCaller(sequence_to_index=sequence_to_index,
                                       sequence_to_float=sequence_to_float)

    if use_gpu:
        model = model.cuda()

    losses = train(model=model,
                   input_channels=input_channels,
                   data_loader=data_loader,
                   optimizer=optimizer,
                   loss_fn_repeat=loss_fn_repeat,
                   loss_fn_base=loss_fn_base,
                   n_batches=n_batches,
                   results_handler=results_handler,
                   checkpoint_interval=checkpoint_interval,
                   consensus_caller=consensus_caller,
                   scale_by_length=scale_by_length,
                   use_gpu=use_gpu)

    results_handler.save_model(model)
    results_handler.save_plot(losses)
Example #12
def train(model,
          data_loader,
          optimizer,
          input_channels,
          n_batches,
          results_handler,
          checkpoint_interval,
          loss_fn_repeat,
          loss_fn_base,
          consensus_caller,
          gap_filterer=None,
          scale_by_length=False,
          use_gpu=False):
    if use_gpu:
        print("USING GPU :)")
        x_dtype = torch.cuda.FloatTensor
        # y_dtype = torch.cuda.FloatTensor  # for MSE Loss or BCE loss
        y_dtype = torch.cuda.LongTensor  # for CE Loss

    else:
        x_dtype = torch.FloatTensor
        # y_dtype = torch.FloatTensor  # for MSE Loss or BCE loss
        y_dtype = torch.LongTensor  # for CE Loss

    total_sequence_confusion = None
    total_expanded_confusion = None
    total_repeat_confusion = list()
    losses = list()

    for b, batch in enumerate(data_loader):
        # sys.stdout.write("\r %.2f%% COMPLETED  " % (100*b/len(data_loader)))

        paths, x_pileup, y_pileup_unfiltered, x_repeat, y_repeat_unfiltered, reversal = batch

        # print()
        # print("X PILEUP", x_pileup.shape)
        # print("Y PILEUP", y_pileup.shape)
        # print("X REPEAT", x_repeat.shape)
        # print("Y REPEAT", y_repeat.shape)
        # print("REVERSAL", reversal.shape)

        if gap_filterer is not None:
            # filter_batch is assumed to drop the paths entry and return the
            # five filtered arrays in this order
            x_pileup, y_pileup, x_repeat, y_repeat, reversal = gap_filterer.filter_batch(batch)
        else:
            # without filtering, the filtered targets are just the unfiltered ones
            y_pileup, y_repeat = y_pileup_unfiltered, y_repeat_unfiltered

        x_pileup_n = x_pileup[0, :, :, :]
        y_pileup_n = y_pileup[0, :, :, :]
        x_repeat_n = x_repeat[0, :, :, :]
        y_repeat_n = y_repeat[0, :, :, :]
        reversal_n = reversal[0, :, :]

        # (n,c,h,w) shape
        batch_size, n_channels, height, width = x_pileup.shape

        _, x_pileup_distribution, x_repeat_distribution = \
            DataLoader.convert_pileup_to_distributions(x_pileup_n, x_repeat_n, reversal_n)

        x_pileup_distribution = numpy.expand_dims(x_pileup_distribution,
                                                  axis=0)
        x_repeat_distribution = numpy.expand_dims(x_repeat_distribution,
                                                  axis=0)

        x_pileup_distribution = torch.FloatTensor(x_pileup_distribution)
        x_repeat_distribution = torch.FloatTensor(x_repeat_distribution)
        y_pileup = torch.FloatTensor(y_pileup)
        y_repeat = torch.FloatTensor(y_repeat)
        y_pileup_unfiltered = torch.FloatTensor(y_pileup_unfiltered)
        y_repeat_unfiltered = torch.FloatTensor(y_repeat_unfiltered)

        x = torch.cat([x_pileup_distribution, x_repeat_distribution],
                      dim=2).reshape([1, 61, width])

        # print(x.shape)

        loss, base_loss, repeat_loss, y_pileup_predict, y_repeat_predict = train_batch(
            model=model,
            optimizer=optimizer,
            x=x,
            y_pileup=y_pileup,
            y_repeat=y_repeat,
            loss_fn_repeat=loss_fn_repeat,
            loss_fn_base=loss_fn_base,
            scale_by_length=scale_by_length)

        losses.append(loss / width)
        print(b, loss / width)

        # ignore the depth dimension because y always has a coverage of 1
        y_pileup_n = y_pileup[0, :, 0, :]
        y_pileup_unfiltered_n = y_pileup_unfiltered[0, :, 0, :]
        y_pileup_predict_n = y_pileup_predict[0, :, :]
        y_repeat_n = y_repeat[0, :, 0, :]
        y_repeat_unfiltered_n = y_repeat_unfiltered[0, :, 0, :]
        y_repeat_predict_n = y_repeat_predict[0, :, :]

        # print(y_pileup_n.shape)
        # print(y_pileup_n)

        y_pileup_predict_n_flattened = torch.argmax(y_pileup_predict_n,
                                                    dim=0).data.numpy()
        y_repeat_predict_n_flattened = y_repeat_predict_n[0, :].data.numpy()

        y_pileup_n_flattened = torch.argmax(y_pileup_unfiltered_n,
                                            dim=0).data.numpy()
        y_repeat_n_flattened = y_repeat_unfiltered_n[0, :].data.numpy()

        # print(y_pileup_n_flattened)
        # print(y_pileup_predict_n.shape)

        print(paths[0])

        # decode as string to compare with non-runlength version
        expanded_consensus_string = \
            consensus_caller.expand_collapsed_consensus_as_string(consensus_indices=y_pileup_predict_n_flattened,
                                                                  repeat_consensus_integers=y_repeat_predict_n_flattened,
                                                                  ignore_spaces=True)

        # decode as string to compare with non-runlength version
        expanded_reference_string = \
            consensus_caller.expand_collapsed_consensus_as_string(consensus_indices=y_pileup_n_flattened,
                                                                  repeat_consensus_integers=y_repeat_n_flattened,
                                                                  ignore_spaces=True)

        # print(expanded_reference_string)
        # print(expanded_consensus_string)

        if len(expanded_consensus_string) == 0:
            expanded_consensus_string = '-'

        # realign strings to each other and convert to one hot
        y_pileup_predict_expanded, y_pileup_expanded = \
            realign_consensus_to_reference(consensus_sequence=expanded_consensus_string,
                                           ref_sequence=expanded_reference_string,
                                           print_alignment=True)

        y_pileup_predict_expanded = y_dtype(y_pileup_predict_expanded)
        y_pileup_expanded = y_dtype(y_pileup_expanded)

        expanded_confusion, _ = sequential_confusion(
            y_predict=y_pileup_predict_expanded, y=y_pileup_expanded)

        # y_pileup_predict = torch.FloatTensor(y_pileup_predict)
        # y_pileup_n = torch.FloatTensor(y_pileup_n)

        sequence_confusion, mismatches = sequential_confusion(
            y_predict=y_pileup_predict_n, y=y_pileup_n)

        repeat_predict = numpy.round(
            y_repeat_predict.reshape([width]).data.numpy(),
            0).astype(numpy.uint8)
        repeat_target = numpy.round(y_repeat.reshape([width]).data.numpy(),
                                    0).astype(numpy.uint8)

        repeat_confusion = sequential_repeat_confusion(
            y_predict=repeat_predict, y=repeat_target)

        total_repeat_confusion.extend(repeat_confusion)

        if total_sequence_confusion is None:
            total_sequence_confusion = sequence_confusion
        else:
            total_sequence_confusion += sequence_confusion

        if total_expanded_confusion is None:
            total_expanded_confusion = expanded_confusion
        else:
            total_expanded_confusion += expanded_confusion

        # plot_confusion(sequence_confusion)

        if b % checkpoint_interval == 0:
            results_handler.save_model(model)

            repeat_predict = numpy.round(
                y_repeat_predict.reshape([1, width]).data.numpy(),
                0).astype(numpy.uint8)
            repeat_target = numpy.round(
                y_repeat.reshape([1, width]).data.numpy(),
                0).astype(numpy.uint8)
            repeat_comparison = numpy.concatenate(
                [repeat_predict, repeat_target], axis=0).T

            print(repeat_comparison)
            accuracy = calculate_accuracy_from_confusion(
                total_expanded_confusion)
            print("Total accuracy", accuracy)

            # total_sequence_confusion = normalize_confusion_matrix(total_sequence_confusion)
            # total_expanded_confusion = normalize_confusion_matrix(total_expanded_confusion)

            pyplot.plot(losses)
            pyplot.show()
            pyplot.close()

            plot_repeat_confusion(total_repeat_confusion)
            plot_confusion(total_sequence_confusion)
            plot_confusion(total_expanded_confusion)

            total_sequence_confusion = None
            total_expanded_confusion = None
            total_repeat_confusion = list()

            # y_pileup_predict = y_pileup_predict.reshape([1, y_pileup_predict.shape[0]])
            # y_repeat_predict = y_repeat_predict.reshape([1, y_repeat_predict.shape[0]])
            #
            # x_pileup_n_flat = flatten_one_hot_tensor(x_pileup_n)
            # y_pileup_n_flat = flatten_one_hot_tensor(y_pileup_n)
            # y_pileup_predict_flat = flatten_one_hot_tensor(y_pileup_predict)
            #
            # plot_runlength_prediction(x_pileup=x_pileup_n_flat, y_pileup=y_pileup_n_flat, x_repeat=x_repeat_n, y_repeat=y_repeat_predict)

        #     plot_repeat_confusion(total_repeat_confusion)

    print()

    # total_sequence_confusion = normalize_confusion_matrix(total_sequence_confusion)
    # total_expanded_confusion = normalize_confusion_matrix(total_expanded_confusion)

    # accuracy = calculate_accuracy_from_confusion(total_expanded_confusion)

    # print("Total accuracy", accuracy)

    # plot_confusion(total_sequence_confusion)
    # plot_confusion(total_expanded_confusion)
    # plot_repeat_confusion(total_repeat_confusion)

    return losses
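calculate_accuracy_from_confusion is not shown in this example; assuming the accumulated confusion matrices are square numpy count matrices, the accuracy it reports reduces to the diagonal mass over the total, as in this hypothetical sketch:

import numpy

def calculate_accuracy_from_confusion_sketch(confusion):
    # correct predictions sit on the diagonal; accuracy = trace / total count
    return numpy.trace(confusion) / numpy.sum(confusion)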
Example #13
def run():
    # directory = "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-2-10-43-22-1-275/NC_003282.8"  # Training Chr4
    directory = "/home/ryan/code/nanopore_assembly/output/spoa_pileup_generation_2018-10-2-10-43-22-1-275/NC_003279.8"  # Testing Chr1

    model_state_path = "/home/ryan/code/nanopore_assembly/output/training_2018-10-9-17-51-4-1-282/model_checkpoint_10"

    file_paths = FileManager.get_all_file_paths_by_type(
        parent_directory_path=directory, file_extension=".npz", sort=False)

    # Architecture parameters
    hidden_size = 512
    input_channels = 7  # channels per pileup column
    output_size = 5 + 1  # '-','A','C','T','G' one-hot plus one extra class
    n_layers = 1

    # Hyperparameters
    learning_rate = 1e-3
    weight_decay = 1e-5
    dropout_rate = 0.1

    # Training parameters
    batch_size_train = 1
    n_batches = None

    scale_by_length = False

    checkpoint_interval = 50

    use_gpu = False

    data_loader = DataLoader(file_paths=file_paths,
                             batch_size=batch_size_train,
                             parse_batches=True,
                             use_gpu=use_gpu)
    model = EncoderDecoder(hidden_size=hidden_size,
                           input_size=input_channels,
                           output_size=output_size,
                           n_layers=n_layers,
                           dropout_rate=dropout_rate)
    results_handler = ResultsHandler()

    model.load_state_dict(torch.load(model_state_path))

    # for alignment/pileup operations + conversions
    consensus_caller = ConsensusCaller(sequence_to_index=sequence_to_index,
                                       sequence_to_float=sequence_to_float)

    if use_gpu:
        model = model.cuda()

    losses = test(model=model,
                  input_channels=input_channels,
                  data_loader=data_loader,
                  n_batches=n_batches,
                  results_handler=results_handler,
                  checkpoint_interval=checkpoint_interval,
                  consensus_caller=consensus_caller,
                  scale_by_length=scale_by_length,
                  use_gpu=use_gpu)

    results_handler.save_model(model)
    results_handler.save_plot(losses)