コード例 #1
0
 def test_list_dir(self):
     """Check that list_dir finds the canonical fast5 reads, with and without an extension filter."""
     here = os.path.abspath(__file__).rsplit("/", 1)[0]
     canonical = os.path.join(here, "test_files/minion-reads/canonical")
     read_names = [
         "miten_PC_20160820_FNFAD20259_MN17223_sequencing_run_AMS_158_R9_WGA_Ecoli_08_20_16_43623_ch100_read207_strand.fast5",
         "miten_PC_20160820_FNFAD20259_MN17223_sequencing_run_AMS_158_R9_WGA_Ecoli_08_20_16_43623_ch100_read214_strand.fast5",
         "miten_PC_20160820_FNFAD20259_MN17223_sequencing_run_AMS_158_R9_WGA_Ecoli_08_20_16_43623_ch100_read280_strand.fast5",
     ]
     expected_files = sorted(os.path.join(canonical, name) for name in read_names)
     # Every file in the directory is a fast5, so filtering by ext changes nothing.
     self.assertEqual(sorted(list_dir(canonical)), expected_files)
     self.assertEqual(sorted(list_dir(canonical, ext="fast5")),
                      expected_files)
コード例 #2
0
def trim_signal_wrapper(dir, outdir):
    """Trim every .signal file in `dir` that has a matching .label file.

    For each `<name>.signal` in `dir`, looks for `<name>.label` next to it and
    calls trim_signal on the pair, writing results into `outdir`.  Signals with
    no label file (or that fail trimming) are reported to stderr and skipped.

    Args:
        dir: directory containing paired .signal / .label files.
        outdir: directory where trimmed output files are written.

    Returns:
        List of output paths produced by trim_signal.
    """
    signal_files = list_dir(dir, ext='signal')
    out_files = []
    for signal_f in signal_files:
        f_label = os.path.splitext(signal_f)[0] + '.label'
        # Explicit check instead of assert: asserts vanish under `python -O`.
        if not os.path.isfile(f_label):
            print("cannot find {}".format(f_label), file=sys.stderr)
            continue
        try:
            out_files.append(trim_signal(signal_f, f_label, outdir))
        except ValueError as e:
            # fix: a trimming failure used to print "cannot find <label>",
            # which misreported the actual problem
            print("failed to trim {}: {}".format(signal_f, e), file=sys.stderr)
            continue
    return out_files
コード例 #3
0
def find_accuracy(fasta_dir, label_dir):
    """Print alignment summary statistics for each fasta/label pair.

    For every fasta in `fasta_dir`, pairs it with `<name>.label` in
    `label_dir` (name = text before the first '.' in the file name), aligns
    the two, and prints summary stats for the alignment.

    Args:
        fasta_dir: directory of fasta files.
        label_dir: directory of matching .label files.

    Returns:
        True (kept for caller compatibility).
    """
    fastas = list_dir(fasta_dir)
    for fasta in fastas:
        # split on the FIRST '.' so multi-extension names keep only the stem
        name = fasta.split('/')[-1].split('.')[0]
        label_file = os.path.join(label_dir, name + '.label')
        alignment = create_alignment(fasta, label_file)
        # fix: base_counts was captured but never used
        total_counts, _ = alignment_stats(alignment)
        create_summary_stats(total_counts)

    return True
コード例 #4
0
def match_label_fasta(fasta_dir, label_dir):
    """match up label files with fasta files from chiron output"""
    matched = []
    for fasta_path in list_dir(fasta_dir, ext='fasta'):
        stem = os.path.splitext(fasta_path)[0].split('/')[-1]
        label_path = os.path.join(label_dir, stem + '.label')
        # skip (and report) any fasta whose label counterpart is missing
        if not os.path.exists(label_path):
            print("file not found: {}".format(label_path))
            continue
        matched.append([fasta_path, label_path])
    return matched
コード例 #5
0
def _collect_event_lengths(label_dir):
    """Return the event lengths from every .label file in `label_dir`."""
    lengths = []
    for label_f in list_dir(label_dir, ext='label'):
        events = read_label(label_f)
        lengths.extend(events.length)
    return lengths


def main():
    """Plot a histogram comparing RNA vs DNA event lengths from label files."""
    start = timer()
    # fix: the RNA and DNA loops were copy-pasted; use one helper for both
    rna_event_lengths = _collect_event_lengths(
        "/Users/andrewbailey/CLionProjects/nanopore-RNN/nanotensor/visualization/rna_training/training")
    dna_event_lengths = _collect_event_lengths(
        "/Users/andrewbailey/CLionProjects/nanopore-RNN/nanotensor/visualization/dna_training/training")

    outpath = "event_hist.png"
    plot_histogram(rna_event_lengths, dna_event_lengths, outpath)
    stop = timer()
    print("Running Time = {} seconds".format(stop - start), file=sys.stderr)
コード例 #6
0
def create_label_chiron_data_args(fast5dir,
                                  output_dir,
                                  output_name,
                                  verbose=False):
    """Yield one keyword-argument dict per fast5 file for label_chiron_data.

    Args:
        fast5dir: existing directory containing .fast5 reads.
        output_dir: existing directory where label files will be written.
        output_name: prefix for generated names; a running index is appended.
        verbose: passed through to the consumer.

    Yields:
        dict with keys fast5_path, output_dir, name, verbose.
    """
    # NOTE: generator function — these checks fire on first iteration,
    # not at call time (same as the original)
    assert os.path.isdir(fast5dir), "fast5 directory does not exist"
    assert os.path.isdir(output_dir), "output directory does not exist"
    # fix: enumerate replaces the manual counter; `is True` comparisons dropped
    for counter, read in enumerate(list_dir(fast5dir, ext="fast5")):
        yield dict(fast5_path=read,
                   output_dir=output_dir,
                   name=output_name + str(counter),
                   verbose=verbose)
コード例 #7
0
def main():
    """Generate chiron label files for the methylated minion test reads."""
    start = timer()
    # fix: removed the unused `files = list_dir(...)` result and ~60 lines of
    # commented-out experiment code that obscured the live logic
    fast5_dir = ("/Users/andrewbailey/CLionProjects/nanopore-RNN/"
                 "test_files/minion-reads/methylated")
    output_dir = ("/Users/andrewbailey/CLionProjects/nanopore-RNN/"
                  "test_files/minion-reads/test_methylated")
    arg_generator = create_label_chiron_data_args(fast5_dir,
                                                  output_dir,
                                                  "methylated_aligned",
                                                  verbose=True)
    for args in arg_generator:
        label_chiron_data(fast5_path=args["fast5_path"],
                          output_dir=args["output_dir"],
                          name=args["name"])

    stop = timer()
    print("Running Time = {} seconds".format(stop - start), file=sys.stderr)
コード例 #8
0
def main(in_opts=None):
    """Create a training data set, optionally tarball it and upload to S3.

    Args:
        in_opts: optional list of command-line-style options; when None, the
            real command line (or config file) is parsed instead.

    Returns:
        Path of the timestamped log directory, or None when argument checking
        raised Usage before the directory was created.
    """
    start = timer()

    # allow for a command line to be input into main
    if in_opts is None:
        # get arguments from command line or config file
        command_line = CommandLine()
        args = get_arguments(command_line)
    else:
        command_line = CommandLine(in_opts=in_opts)
        args = command_line.args

    # fix: initialize so the final `return` cannot raise NameError when
    # check_args raises Usage before the log directory exists
    log_dir_path = None
    try:
        # make sure args are the right format
        args = CommandLine.check_args(args)
        # create directory in the output directory
        log_dir_path = create_time_directory(args.output_dir)
        # save config file in log directory
        save_config_file(args, log_dir_path)
        # reset output directory to new log directory so files are written to correct location
        args.output_dir = log_dir_path
        if args.chiron:
            call_nanoraw(args.fast5_dir,
                         args.reference,
                         args.num_cpu,
                         overwrite=args.overwrite)
            arg_generator = create_label_chiron_data_args(
                args.fast5_dir,
                args.output_dir,
                output_name=args.file_prefix,
                verbose=args.verbose)
            target = label_chiron_data_multiprocess_wrapper
        else:
            log_file = args.log_file
            print("Using log file {}".format(log_file), file=sys.stderr)
            arg_generator = create_training_data_args(log_file,
                                                      args.file_prefix, args)
            target = create_training_data

        if args.debug:
            # debug mode runs serially in-process for readable tracebacks
            for arg in arg_generator:
                target(arg)
        else:
            multiprocess_data(args.num_cpu, target, arg_generator)

        # if tar or save files create tar archive
        if args.tar:
            tar_name = get_tar_name("training_data", args.output_dir,
                                    args.nanonet, args.deepnano, args.chiron)
            file_paths = list_dir(args.output_dir)
            print("Creating tarball file\n", file=sys.stderr)
            tar_path = tarball_files(tar_name,
                                     file_paths,
                                     output_dir=args.output_dir)
            print("Finished tarball file : {}\n".format(tar_path),
                  file=sys.stderr)
            if args.save2s3:
                print("Uploading {} to s3 bucket {}".format(
                    tar_path, args.bucket),
                      file=sys.stderr)
                upload_file_to_s3(args.bucket, tar_path, tar_name)

        # fix: this completion banner was printed twice
        print("\n#  nanotensor - finished creating data set\n",
              file=sys.stderr)
        # check how long the whole program took
        stop = timer()
        print("Running Time = {} seconds".format(stop - start),
              file=sys.stderr)

    except Usage as err:
        command_line.do_usage_and_die(err.msg)

    return log_dir_path
コード例 #9
0
ファイル: run_nanotensor.py プロジェクト: kdyslj/nanopore-RNN
    def __init__(self, args):
        """Wire up the dataset/graph classes and prepare the model for one mode.

        Exactly one of args.train / args.test / args.inference is expected to
        be set; each branch creates a different set of attributes, so code
        downstream must only touch the attributes for its own mode.

        Args:
            args: parsed config namespace with nested sections
                `CreateDataset` (dataset, training_dir, validation_dir,
                test_dir, inference_dir) and `BuildGraph` (graph), plus
                mode flags and training hyperparameters.
        """
        self.args = args
        # get correct data input pipeline
        dataset_options = {
            "FullSignalSequence": FullSignalSequence,
            "MotifSequence": MotifSequence,
            "NumpyEventData": NumpyEventData,
            "PostProcessGlove": PostProcessGlove,
            "CharacterEmbedding": CharacterEmbedding,
            "RandomZInput": RandomZInput
        }
        # KeyError here means the config named an unknown dataset class
        self.Dataset = dataset_options[self.args.CreateDataset.dataset]
        # pick graph
        graph_options = {
            "CtcLoss": CtcLoss,
            "CrossEntropy": CrossEntropy,
            "LastLSTMOutput": LastLSTMOutput,
            "Seq2SeqGenerator": Seq2SeqGenerator
        }
        self.Graph = graph_options[self.args.BuildGraph.graph]

        # non-trainable step counter shared by the optimizer and LR schedule
        self.global_step = tf.get_variable(
            'global_step', [],
            initializer=tf.constant_initializer(0),
            trainable=False)

        if self.args.train:
            self.training_files = list_dir(
                self.args.CreateDataset.training_dir)
            self.validation_files = list_dir(
                self.args.CreateDataset.validation_dir)
            # NOTE(review): these string attributes look like placeholders that
            # later setup replaces with real objects — confirm against load_data
            self.training = "CreateDataset"
            self.validation = "CreateDataset"
            self.train_op = "train_op"
            self.training_model = "BuildGraph"
            self.validation_model = "BuildGraph"
            self.cost_diff_summary = None
            self.tower_grads = []
            # decay LR by 4% every 100k steps (staircase: discrete drops)
            learning_rate = tf.train.exponential_decay(self.args.learning_rate,
                                                       self.global_step,
                                                       100000,
                                                       0.96,
                                                       staircase=True)
            self.opt = tf.train.AdamOptimizer(learning_rate=learning_rate)

        elif self.args.test:
            self.test_files = list_dir(self.args.CreateDataset.test_dir)
            self.testing = "CreateDataset"
            self.testing_model = "BuildGraph"
            self.testing_opts = []

        elif self.args.inference:
            # inference is the only mode that filters input files by extension
            self.inference_files = list_dir(
                self.args.CreateDataset.inference_dir, ext=self.args.file_ext)
            self.inference = "CreateDataset"
            self.inference_model = "BuildGraph"
            self.inference_opts = []

        # load data
        log.info("Data Loading Started")
        speed = time_it(self.load_data)
        log.info("Data Loading took {} seconds to complete".format(speed))

        # look for GPU's
        self.gpu_indexes = test_for_nvidia_gpu(self.args.num_gpu)

        # initialize model
        log.info("Initializing Model")
        speed = time_it(self.initialize_model)
        log.info(
            "Model Initialization took {} seconds to complete".format(speed))

        self.summaries = tf.summary.merge_all()
        self.start = datetime.now()
        # either resume from the latest checkpoint or use an explicit path
        if self.args.use_checkpoint:
            self.model_path = tf.train.latest_checkpoint(
                self.args.trained_model)
        else:
            self.model_path = self.args.trained_model_path
コード例 #10
0
ファイル: dataset.py プロジェクト: kdyslj/nanopore-RNN
        elif self.mode == 1:
            dataset = tf.data.Dataset.zip((self.datasetX, self.datasetSeq, self.datasetY))
            dataset = dataset.batch(self.batch_size)
        # inference
        elif self.mode == 2:
            # inference needs to be done per file
            dataset = tf.data.Dataset.from_generator(
                self.load_data_inference, (tf.float32, tf.int32), (tf.TensorShape([self.seq_len]),
                                                                   tf.TensorShape(None)))
            dataset = dataset.batch(self.batch_size)
        dataset = dataset.prefetch(buffer_size=self.prefetch_buffer_size)
        return dataset


if __name__ == "__main__":

    # Smoke-test drivers: construct each dataset class against local test
    # data.  This module is meant to be imported, not run (see message below).
    file_list = list_dir("/Users/andrewbailey/CLionProjects/nanopore-RNN/chiron/data/raw")

    motif = MotifSequence(file_list, mode=0, batch_size=10, verbose=True, seq_len=100,
                          n_epochs=5)
    full = FullSignalSequence(file_list, mode=0, batch_size=10, verbose=True, seq_len=100,
                              n_epochs=5)
    # NumpyEventData reads pre-generated training files, not raw reads
    file_list = list_dir(
        "/Users/andrewbailey/CLionProjects/nanopore-RNN/test_files/create_training_files/07Jul-20-11h-28m")

    full = NumpyEventData(file_list, mode=0, batch_size=10, verbose=True, seq_len=100,
                          n_epochs=5)
    print("This file is just a library", file=sys.stderr)
    raise SystemExit