    def test_rna_reads(self):
        with tempfile.TemporaryDirectory() as tempdir:
            template_model = os.path.join(
                self.HOME, "models/testModelR9p4_5mer_acgt_RNA.model")
            args = create_signalAlignment_args(
                alignment_file=self.rna_bam,
                bwa_reference=self.rna_reference,
                forward_reference=self.rna_reference,
                in_templateHmm=template_model,
                path_to_bin=self.path_to_bin,
                destination=tempdir,
                embed=True,
                delete_tmp=False)

            in_rna_file = os.path.join(
                self.test_dir_rna,
                "DEAMERNANOPORE_20170922_FAH26525_MN16450_sequencing_run_MA_821_R94_NA12878_mRNA_09_22_17_67136_read_36_ch_218_strand.fast5"
            )
            final_args = merge_dicts([args, dict(in_fast5=in_rna_file)])
            handle = SignalAlignment(**final_args)
            handle.run()
            fh = pysam.FastaFile(self.rna_reference)
            f5fh = Fast5(in_rna_file)
            sa_events = f5fh.get_signalalign_events()
            for i, event in enumerate(sa_events):
                kmer = fh.fetch(reference="rna_fake",
                                start=event["reference_index"],
                                end=event["reference_index"] + 5)[::-1]
                self.assertEqual(event["path_kmer"].decode(), kmer)
                self.assertEqual(event["reference_kmer"].decode(), kmer)

            in_rna_file = os.path.join(
                self.test_dir_rna,
                "DEAMERNANOPORE_20170922_FAH26525_MN16450_sequencing_run_MA_821_R94_NA12878_mRNA_09_22_17_67136_read_61_ch_151_strand.fast5"
            )
            final_args = merge_dicts([args, dict(in_fast5=in_rna_file)])
            handle = SignalAlignment(**final_args)
            handle.run()
            rev_c = ReverseComplement()
            f5fh = Fast5(in_rna_file)
            sa_events = f5fh.get_signalalign_events()
            for i, event in enumerate(sa_events):
                kmer = fh.fetch(reference="rna_fake",
                                start=event["reference_index"],
                                end=event["reference_index"] + 5)[::-1]
                rev_kmer = rev_c.reverse_complement(kmer)
                self.assertEqual(event["path_kmer"].decode(), rev_kmer)
                self.assertEqual(event["reference_kmer"].decode(), kmer)
def resegment_reads(fast5_path, params, speedy=False, overwrite=False):
    """Re-segment and create anchor alignment from previously base-called fast5 file
    :param fast5_path: path to fast5 file
    :param params: event detection parameters
    :param speedy: boolean option for speedyStatSplit or minknow
    :param overwrite: overwrite a previous event re-segmented event table
    :param name: name of key where events table will be placed (Analyses/'name'/Events)
    :return True when completed
    """
    assert os.path.isfile(fast5_path), "File does not exist: {}".format(fast5_path)
    name = "ReSegmentBasecall_00{}"
    # create Fast5 object
    f5fh = Fast5(fast5_path, read='r+')
    # gather previous event detection
    old_event_table = f5fh.get_basecall_data()
    # assert check_event_table_time(old_event_table), "Old event is not consistent"
    read_id = bytes.decode(f5fh.raw_attributes['read_id'])
    sampling_freq = f5fh.sample_rate
    start_time = f5fh.raw_attributes['start_time']
    # pick event detection algorithm
    signal = f5fh.get_read(raw=True, scale=True)

    if speedy:
        event_table = create_speedy_event_table(signal, sampling_freq, start_time, **params)
        params = merge_dicts([params, {"event_detection": "speedy_stat_split"}])
    else:
        event_table = create_minknow_event_table(signal, sampling_freq, start_time, **params)
        params = merge_dicts([params, {"event_detection": "minknow_event_detect"}])

    keys = ["nanotensor version", "time_stamp"]
    values = ["0.2.0", TimeStamp().posix_date()]
    attributes = merge_dicts([params, dict(zip(keys, values)), f5fh.raw_attributes])
    if f5fh.is_read_rna():
        old_event_table = index_to_time(old_event_table, sampling_freq=sampling_freq, start_time=start_time)
    # set event table
    new_event_table = create_anchor_kmers(new_events=event_table, old_events=old_event_table)
    f5fh.set_new_event_table(name, new_event_table, attributes, overwrite=overwrite)
    # gather new sequence
    sequence = sequence_from_events(new_event_table)
    if f5fh.is_read_rna():
        sequence = ReverseComplement().reverse(sequence)
        sequence = sequence.replace("T", "U")
    quality_scores = '!'*len(sequence)
    fastq = create_fastq_line(read_id+" :", sequence, quality_scores)
    # set fastq
    f5fh.set_fastq(name, fastq)
    return f5fh
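
# Hedged usage sketch for resegment_reads above (not from the source): the fast5
# path is a placeholder, and get_default_event_detection_params / EVENT_DETECT_MINKNOW
# are borrowed from the other examples on this page and assumed to be in scope here.
def _example_resegment(fast5_path="/path/to/basecalled_read.fast5"):
    minknow_params = get_default_event_detection_params(EVENT_DETECT_MINKNOW)
    f5_handle = resegment_reads(fast5_path, minknow_params, speedy=False, overwrite=True)
    f5_handle.close()
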
    def test_embed_with_both(self):
        signal_file_reads = os.path.join(self.HOME,
                                         "tests/minion_test_reads/pUC/")
        template_model = os.path.join(
            self.HOME, "models/testModelR9_5mer_acegt_template.model")
        complement_model = os.path.join(
            self.HOME, "models/testModelR9_5mer_acegt_complement.model")

        puc_reference = os.path.join(self.HOME,
                                     "tests/test_sequences/pUC19_SspI.fa")
        signal_file_guide_alignment = os.path.join(
            self.HOME, "tests/minion_test_reads/pUC/puc.bam")
        with tempfile.TemporaryDirectory() as tempdir:
            new_dir = os.path.join(tempdir, "new_dir")
            if os.path.exists(new_dir):
                shutil.rmtree(new_dir)
            working_folder = FolderHandler()
            working_folder.open_folder(os.path.join(tempdir, "test_dir"))

            shutil.copytree(signal_file_reads, new_dir)

            args = create_signalAlignment_args(
                alignment_file=signal_file_guide_alignment,
                bwa_reference=puc_reference,
                forward_reference=puc_reference,
                in_templateHmm=template_model,
                path_to_bin=self.path_to_bin,
                destination=working_folder.path,
                embed=True,
                output_format="both",
                filter_reads=0,
                twoD_chemistry=True,
                in_complementHmm=complement_model,
                delete_tmp=True)
            final_args = merge_dicts([
                args,
                dict(in_fast5=os.path.join(
                    new_dir,
                    "makeson_PC_20160807_FNFAD20242_MN17284_sequencing_run_MA_470_R9_pUC_g_PCR_BC_08_07_16_93165_ch1_read176_strand.fast5"
                ))
            ])
            handle = SignalAlignment(**final_args)
            handle.run()
            f5fh = Fast5(
                os.path.join(
                    new_dir,
                    "makeson_PC_20160807_FNFAD20242_MN17284_sequencing_run_MA_470_R9_pUC_g_PCR_BC_08_07_16_93165_ch1_read176_strand.fast5"
                ))
            mea = f5fh.get_signalalign_events(mea=True)
            sam = f5fh.get_signalalign_events(sam=True)
            self.assertEqual(mea[0]["raw_start"], 2879)
            self.assertEqual(sam[0], "0")
            self.assertEqual(len(os.listdir(working_folder.path)), 2)
    def test_merge_dicts(self):
        """Test merge_dicts"""
        with captured_output() as (_, _):
            self.assertRaises(AssertionError, merge_dicts, {"test": 1})
            self.assertRaises(AssertionError, merge_dicts, ["test", 1])
            self.assertRaises(AssertionError, merge_dicts, [{
                "test": 1
            }, ["test"]])

            dict1 = {"a": 1}
            dict2 = {"b": 2}
            dict3 = {"c": 3}
            dict4 = {"d": 4}

            merged_dict = merge_dicts([dict1, dict2, dict3, dict4])
            self.assertEqual(dict1["a"], merged_dict["a"])
            self.assertEqual(dict2["b"], merged_dict["b"])
            self.assertEqual(dict3["c"], merged_dict["c"])
            self.assertEqual(dict4["d"], merged_dict["d"])
    def test_signal_file_and_alignment(self):
        signal_file_reads = os.path.join(
            self.HOME, "tests/minion_test_reads/no_event_data_1D_ecoli")
        template_model = os.path.join(
            self.HOME, "models/testModelR9p4_5mer_acegt_template.model")
        ecoli_reference = os.path.join(
            self.HOME, "tests/test_sequences/E.coli_K12.fasta")
        signal_file_guide_alignment = os.path.join(
            self.HOME, "tests/minion_test_reads/oneD_alignments.sam")

        with tempfile.TemporaryDirectory() as tempdir:
            new_dir = os.path.join(tempdir, "new_dir")
            working_folder = FolderHandler()
            working_folder.open_folder(os.path.join(tempdir, "test_dir"))

            shutil.copytree(signal_file_reads, new_dir)

            args = create_signalAlignment_args(
                alignment_file=signal_file_guide_alignment,
                bwa_reference=ecoli_reference,
                forward_reference=ecoli_reference,
                in_templateHmm=template_model,
                path_to_bin=self.path_to_bin,
                destination=working_folder.path)
            final_args = merge_dicts([
                args,
                dict(in_fast5=os.path.join(
                    new_dir,
                    "LomanLabz_PC_20161025_FNFAB42699_MN17633_sequencing_run_20161025_E_coli_native_450bps_82361_ch6_read347_strand.fast5"
                ))
            ])
            handle = SignalAlignment(**final_args)
            handle.run()
            self.assertEqual(len(os.listdir(working_folder.path)), 1)
            self.assertEqual(
                sorted(os.listdir(working_folder.path))[0],
                "9e4d14b1-8167-44ef-9fdb-5c29dd0763fd.sm.backward.tsv")
def main(args):
    start = timer()

    # parse args
    args = parse_args()
    if args.command == "run":
        if not os.path.exists(args.config):
            print("{config} not found".format(config=args.config))
            exit(1)
        # run training
        config_args = create_dot_dict(load_json(args.config))

        temp_folder = FolderHandler()
        temp_dir_path = temp_folder.open_folder(
            os.path.join(os.path.abspath(config_args.output_dir),
                         "tempFiles_alignment"))
        temp_dir_path = resolvePath(temp_dir_path)
        print(config_args.output_dir)
        print(temp_dir_path)

        sa_args = [
            merge_dicts([
                s, {
                    "quality_threshold": config_args.filter_reads,
                    "workers": config_args.job_count
                }
            ]) for s in config_args.samples
        ]

        samples = [
            SignalAlignSample(working_folder=temp_folder, **s) for s in sa_args
        ]
        copyfile(args.config,
                 os.path.join(temp_dir_path, os.path.basename(args.config)))

        state_machine_type = "threeState"
        if config_args.template_hdp_model_path is not None:
            state_machine_type = "threeStateHdp"

        alignment_args = create_signalAlignment_args(
            destination=temp_dir_path,
            stateMachineType=state_machine_type,
            in_templateHmm=resolvePath(config_args.template_hmm_model),
            in_complementHmm=resolvePath(config_args.complement_hmm_model),
            in_templateHdp=resolvePath(config_args.template_hdp_model),
            in_complementHdp=resolvePath(config_args.complement_hdp_model),
            diagonal_expansion=config_args.diagonal_expansion,
            constraint_trim=config_args.constraint_trim,
            traceBackDiagonals=config_args.traceBackDiagonals,
            twoD_chemistry=config_args.two_d,
            get_expectations=False,
            path_to_bin=resolvePath(config_args.path_to_bin),
            check_for_temp_file_existance=True,
            threshold=config_args.signal_alignment_args.threshold,
            track_memory_usage=config_args.signal_alignment_args.track_memory_usage,
            embed=config_args.signal_alignment_args.embed,
            event_table=config_args.signal_alignment_args.event_table,
            output_format=config_args.signal_alignment_args.output_format,
            filter_reads=config_args.filter_reads,
            delete_tmp=config_args.signal_alignment_args.delete_tmp)

        multithread_signal_alignment_samples(samples,
                                             alignment_args,
                                             config_args.job_count,
                                             trim=None,
                                             debug=config_args.debug)

        print("\n#  signalAlign - finished alignments\n", file=sys.stderr)
        print("\n#  signalAlign - finished alignments\n", file=sys.stdout)
        stop = timer()
    else:
        command_line = " ".join(sys.argv[:])
        print(os.getcwd())

        print("Command Line: {cmdLine}\n".format(cmdLine=command_line),
              file=sys.stderr)
        # get absolute paths to inputs
        args.files_dir = resolvePath(args.files_dir)
        args.forward_reference = resolvePath(args.forward_ref)
        args.backward_reference = resolvePath(args.backward_ref)
        args.out = resolvePath(args.out)
        args.bwa_reference = resolvePath(args.bwa_reference)
        args.in_T_Hmm = resolvePath(args.in_T_Hmm)
        args.in_C_Hmm = resolvePath(args.in_C_Hmm)
        args.templateHDP = resolvePath(args.templateHDP)
        args.complementHDP = resolvePath(args.complementHDP)
        args.fofn = resolvePath(args.fofn)
        args.target_regions = resolvePath(args.target_regions)
        args.ambiguity_positions = resolvePath(args.ambiguity_positions)
        args.alignment_file = resolvePath(args.alignment_file)
        start_message = """
    #   Starting Signal Align
    #   Aligning files from: {fileDir}
    #   Aligning to reference: {reference}
    #   Aligning maximum of {nbFiles} files
    #   Using model: {model}
    #   Using banding: True
    #   Aligning to regions in: {regions}
    #   Non-default template HMM: {inThmm}
    #   Non-default complement HMM: {inChmm}
    #   Template HDP: {tHdp}
    #   Complement HDP: {cHdp}
        """.format(fileDir=args.files_dir,
                   reference=args.bwa_reference,
                   nbFiles=args.nb_files,
                   inThmm=args.in_T_Hmm,
                   inChmm=args.in_C_Hmm,
                   model=args.stateMachineType,
                   regions=args.target_regions,
                   tHdp=args.templateHDP,
                   cHdp=args.complementHDP)

        print(start_message, file=sys.stdout)

        if args.files_dir is None and args.fofn is None:
            print("Need to provide directory with .fast5 files of fofn",
                  file=sys.stderr)
            sys.exit(1)

        if not os.path.isfile(args.bwa_reference):
            print("Did not find valid reference file, looked for it {here}".
                  format(here=args.bwa_reference),
                  file=sys.stderr)
            sys.exit(1)

        # make directory to put temporary files
        if not os.path.isdir(args.out):
            print("Creating output directory: {}".format(args.out),
                  file=sys.stdout)
            os.mkdir(args.out)
        temp_folder = FolderHandler()
        temp_dir_path = temp_folder.open_folder(
            os.path.join(os.path.abspath(args.out), "tempFiles_alignment"))
        temp_dir_path = resolvePath(temp_dir_path)
        print(args.out)
        print(temp_dir_path)

        # generate reference sequence if not specified
        if not args.forward_reference or not args.backward_reference:
            args.forward_reference, args.backward_reference = processReferenceFasta(
                fasta=args.bwa_reference,
                work_folder=temp_folder,
                positions_file=args.ambiguity_positions,
                name="")

        # list of read files
        if args.fofn is not None:
            fast5s = [x for x in parseFofn(args.fofn) if x.endswith(".fast5")]
        else:
            fast5s = [
                "/".join([args.files_dir, x])
                for x in os.listdir(args.files_dir) if x.endswith(".fast5")
            ]

        nb_files = args.nb_files
        if nb_files < len(fast5s):
            shuffle(fast5s)
            fast5s = fast5s[:nb_files]

        # return alignment_args
        alignment_args = {
            "destination": temp_dir_path,
            "stateMachineType": args.stateMachineType,
            "bwa_reference": args.bwa_reference,
            "in_templateHmm": args.in_T_Hmm,
            "in_complementHmm": args.in_C_Hmm,
            "in_templateHdp": args.templateHDP,
            "in_complementHdp": args.complementHDP,
            "output_format": args.outFmt,
            "threshold": args.threshold,
            "diagonal_expansion": args.diag_expansion,
            "constraint_trim": args.constraint_trim,
            "degenerate": getDegenerateEnum(args.degenerate),
            "twoD_chemistry": args.twoD,
            "target_regions": args.target_regions,
            "embed": args.embed,
            "event_table": args.event_table,
            "backward_reference": args.backward_reference,
            "forward_reference": args.forward_reference,
            "alignment_file": args.alignment_file,
            "check_for_temp_file_existance": True,
            "track_memory_usage": False,
            "get_expectations": False,
            "perform_kmer_event_alignment": args.perform_kmer_event_alignment,
            "enforce_supported_versions": args.enforce_supported_versions,
            "filter_reads": 7 if args.filter_reads else None,
            "path_to_bin": args.path_to_bin,
            "delete_tmp": args.delete_tmp
        }
        filter_read_generator = None
        if args.filter_reads is not None and args.alignment_file and args.readdb and args.files_dir:
            print("[runSignalAlign]:NOTICE: Filtering out low quality reads",
                  file=sys.stdout)

            filter_read_generator = filter_reads_to_string_wrapper(
                filter_reads(args.alignment_file,
                             args.readdb, [args.files_dir],
                             quality_threshold=7,
                             recursive=args.recursive))

        print("[runSignalAlign]:NOTICE: Got {} files to align".format(
            len(fast5s)),
              file=sys.stdout)
        # setup workers for multiprocessing
        multithread_signal_alignment(
            alignment_args,
            fast5s,
            args.nb_jobs,
            debug=args.DEBUG,
            filter_reads_to_string_wrapper=filter_read_generator)
        stop = timer()

        print("\n#  signalAlign - finished alignments\n", file=sys.stderr)
        print("\n#  signalAlign - finished alignments\n", file=sys.stdout)

    print("[signalAlign] Complete")
    print("Running Time = {} seconds".format(stop - start))
def load_from_raw2(np_handle,
                   aligned_segment,
                   model_file_location,
                   path_to_bin="./",
                   analysis_identifier=None,
                   write_failed_alignments=False):
    """Load a nanopore read from raw signal and an alignment file. Need a model to create banded alignment.
    :param np_handle: NanoporeRead class object
    :param aligned_segment: pysam aligned_segment object
    :param model_file_location: path to model file
    :param path_to_bin: bath to signalAlign bin where executables are stored
    :param analysis_identifier: identifier for storage of event table and fastq
    :param write_failed_alignments: still write alignments that failed quality checks
    :return: path to events in fast5 file or -1 if the task fails
    """
    assert os.path.isfile(model_file_location), \
        "Model_file_location must be a real path to a SignalAlign HMM model file"
    assert os.path.exists(path_to_bin), \
        "path_to_bin must exist"
    # check if file is open
    if not np_handle.open():
        return False
    # grab read id
    read_id = np_handle.read_label

    # get nucleotides and qualities
    nucleotide_sequence = aligned_segment.query_sequence.upper()
    nucleotide_qualities = aligned_segment.qual

    # check for reverse mapping
    if aligned_segment.is_reverse:
        nucleotide_sequence = reverse_complement(nucleotide_sequence,
                                                 reverse=True,
                                                 complement=True)
        if nucleotide_qualities is not None and len(nucleotide_qualities) != 0:
            nucleotide_qualities = ''.join(reversed(
                list(nucleotide_qualities)))

    if nucleotide_qualities is None:
        nucleotide_qualities = "!" * len(nucleotide_sequence)

    # get fastq (this is saved with the event table)
    fastq = create_fastq_line(read_id, nucleotide_sequence,
                              nucleotide_qualities)

    # get temp location
    tmp_root = np_handle.fastFive.get_analysis_new(EVENT_KMERALIGN_TMP)
    tmp_dest = np_handle.fastFive.get_analysis_events_path_new(
        EVENT_KMERALIGN_TMP)
    assert tmp_dest.startswith(tmp_root), "Invalid analysis path management"
    file_name = np_handle.filename
    np_handle.close()
    tmp_directory = tempfile.mkdtemp()
    # run the c code which does the required stuff
    status = run_kmeralign_exe(file_name,
                               nucleotide_sequence,
                               model_file_location,
                               tmp_dest,
                               path_to_bin,
                               write_failed_alignments=write_failed_alignments,
                               tmp_directory=tmp_directory)
    os.removedirs(tmp_directory)
    # alignment succeeded, save it to the appropriate location
    if status:
        np_handle.open()
        if analysis_identifier is None:
            analysis_identifier = Fast5.__default_basecall_1d_analysis__
        # get attrs
        keys = ["signalAlign version", "time_stamp"]
        values = ["0.2.0", TimeStamp().posix_date()]
        attributes = merge_dicts(
            [dict(zip(keys, values)), np_handle.fastFive.raw_attributes])
        # get events (and delete tmp location)
        events = np_handle.fastFive.get_custom_analysis_events(
            EVENT_KMERALIGN_TMP)
        np_handle.fastFive.delete(tmp_root, ignore=False)
        # save events and fastq
        saved_loc = save_event_table_and_fastq(
            np_handle.fastFive,
            events,
            fastq,
            attributes,
            analysis_identifier=analysis_identifier)
        return saved_loc

    # alignment failed, remove offending location (if it exists) and report
    else:
        print("[load_from_raw] error performing kmeralign", file=sys.stderr)
        np_handle.open()
        np_handle.fastFive.delete(tmp_root, ignore=True)
        return False
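
# Minimal hypothetical driver for load_from_raw2 (not from the source): look up the
# aligned segment for a given read id in a BAM with pysam and hand it to the function.
# The NanoporeRead constructor call is an assumption about its signature.
def _example_load_from_raw2(fast5_path, bam_path, read_id, model_path, bin_path):
    np_handle = NanoporeRead(fast5_path)  # assumed constructor
    with pysam.AlignmentFile(bam_path, "rb") as bam:
        segment = next(seg for seg in bam.fetch(until_eof=True)
                       if seg.query_name == read_id)
    return load_from_raw2(np_handle, segment, model_path, path_to_bin=bin_path)
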
def load_from_raw(np_handle,
                  alignment_file,
                  model_file_location,
                  path_to_bin="./",
                  nucleotide_sequence=None,
                  analysis_identifier=None,
                  write_failed_alignments=False):
    """Load a nanopore read from raw signal and an alignment file. Need a model to create banded alignment.
    :param np_handle: NanoporeRead class object
    :param alignment_file: sam/bam file
    :param model_file_location: path to model file
    :param path_to_bin: bath to signalAlign bin where executables are stored
    :param nucleotide_sequence: nucleotide sequence (needed if no alignment file is available)
    :param analysis_identifier: identifier for storage of event table and fastq
    :param write_failed_alignments: still write alignments that failed quality checks
    :return: path to events in fast5 file or -1 if the task fails
    """
    assert os.path.isfile(model_file_location), \
        "Model_file_location must be a real path to a SignalAlign HMM model file"
    assert os.path.exists(path_to_bin), \
        "path_to_bin must exist"
    if not os.path.isfile(str(alignment_file)) and nucleotide_sequence is None:
        nucleotide_sequence = np_handle.get_template_read(
            initalize_bypass=True)
        assert nucleotide_sequence, "alignment_file must be a real path a SAM/BAM alignment file, or " \
                                    "nucleotide_sequence must be specified (retrieval attempted from fast5). " \
                                    "alignment_file: {}, nucleotide_sequence:{}".format(alignment_file, nucleotide_sequence)

    # check if file is open
    if not np_handle.open():
        return False
    # grab read id
    read_id = np_handle.read_label

    # get nucleotides and qualities
    if nucleotide_sequence is None:
        # get/build nucleotide sequence from alignment file (accounting for hardclipping)
        nucleotide_sequence, nucleotide_qualities, _, _, _ = \
            get_full_nucleotide_read_from_alignment(alignment_file, read_id)
        if nucleotide_sequence is None:
            print("[load_from_raw] nucleotides for {} not found in {}".format(
                read_id, alignment_file),
                  file=sys.stderr)
            return False
    else:
        nucleotide_qualities = None
    if nucleotide_qualities is None:
        nucleotide_qualities = "!" * len(nucleotide_sequence)

    # get fastq (this is saved with the event table)
    fastq = create_fastq_line(read_id, nucleotide_sequence,
                              nucleotide_qualities)

    # get temp location
    tmp_root = np_handle.fastFive.get_analysis_new(EVENT_KMERALIGN_TMP)
    tmp_dest = np_handle.fastFive.get_analysis_events_path_new(
        EVENT_KMERALIGN_TMP)
    assert tmp_dest.startswith(tmp_root), "Invalid analysis path management"
    file_name = np_handle.filename
    np_handle.close()
    tmp_directory = tempfile.mkdtemp()
    # run the c code which does the required stuff
    status = run_kmeralign_exe(file_name,
                               nucleotide_sequence,
                               model_file_location,
                               tmp_dest,
                               path_to_bin,
                               write_failed_alignments=write_failed_alignments,
                               tmp_directory=tmp_directory)
    os.removedirs(tmp_directory)
    # alignment succeeded, save it to the appropriate location
    if status:
        np_handle.open()
        if analysis_identifier is None:
            analysis_identifier = Fast5.__default_basecall_1d_analysis__
        # get attrs
        keys = ["signalAlign version", "time_stamp"]
        values = ["0.2.0", TimeStamp().posix_date()]
        attributes = merge_dicts(
            [dict(zip(keys, values)), np_handle.fastFive.raw_attributes])
        # get events (and delete tmp location)
        events = np_handle.fastFive.get_custom_analysis_events(
            EVENT_KMERALIGN_TMP)
        np_handle.fastFive.delete(tmp_root, ignore=False)
        # save events and fastq
        saved_loc = save_event_table_and_fastq(
            np_handle.fastFive,
            events,
            fastq,
            attributes,
            analysis_identifier=analysis_identifier)
        return saved_loc

    # alignment failed, remove offending location (if it exists) and report
    else:
        print("[load_from_raw] error performing kmeralign", file=sys.stderr)
        np_handle.open()
        np_handle.fastFive.delete(tmp_root, ignore=True)
        return False
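
# Hedged sketch of the no-alignment path of load_from_raw above: when no SAM/BAM is
# available, the basecalled sequence can be passed in directly and placeholder "!"
# qualities are used. The NanoporeRead constructor call is an assumption.
def _example_load_from_raw_no_bam(fast5_path, sequence, model_path, bin_path):
    np_handle = NanoporeRead(fast5_path)  # assumed constructor
    return load_from_raw(np_handle,
                         alignment_file=None,
                         model_file_location=model_path,
                         path_to_bin=bin_path,
                         nucleotide_sequence=sequence)
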
def generate_events_and_alignment(
    fast5_path,
    nucleotide_sequence,
    nucleotide_qualities=None,
    event_detection_params=None,
    event_detection_strategy=None,
    save_to_fast5=True,
    overwrite=False,
    analysis_identifier=Fast5.__default_basecall_1d_analysis__,
):

    assert os.path.isfile(fast5_path), "File does not exist: {}".format(
        fast5_path)

    # create Fast5 object
    f5fh = Fast5(fast5_path, read='r+')
    read_id = bytes.decode(f5fh.raw_attributes['read_id'])
    sampling_freq = f5fh.sample_rate
    start_time = f5fh.raw_attributes['start_time']
    success = False

    # event detection prep
    if event_detection_strategy is None:
        event_detection_strategy = EVENT_DETECT_MINKNOW
    if event_detection_params is None:
        event_detection_params = get_default_event_detection_params(
            event_detection_strategy)

    # detect events
    if event_detection_strategy == EVENT_DETECT_SPEEDY:
        signal = f5fh.get_read(raw=True, scale=True)
        event_table = create_speedy_event_table(signal, sampling_freq,
                                                start_time,
                                                **event_detection_params)
        event_detection_params = merge_dicts(
            [event_detection_params, {
                "event_detection": "speedy_stat_split"
            }])
    elif event_detection_strategy == EVENT_DETECT_MINKNOW:
        signal = f5fh.get_read(raw=True, scale=True)
        event_table = create_minknow_event_table(signal, sampling_freq,
                                                 start_time,
                                                 **event_detection_params)
        event_detection_params = merge_dicts([
            event_detection_params, {
                "event_detection": "minknow_event_detect"
            }
        ])
    elif event_detection_strategy == EVENT_DETECT_SCRAPPIE:
        event_table = create_scrappie_event_table(fast5_path, sampling_freq)
        event_detection_params = merge_dicts([
            event_detection_params, {
                "event_detection": "scrappie_event_detect"
            }
        ])
    else:
        raise Exception(
            "PROGRAMMER ERROR: unknown resegment strat {}: expected {}".format(
                event_detection_strategy, [
                    EVENT_DETECT_SPEEDY, EVENT_DETECT_MINKNOW,
                    EVENT_DETECT_SCRAPPIE
                ]))

    # gather attributes
    keys = ["nanotensor version", "time_stamp"]
    values = ["0.2.0", TimeStamp().posix_date()]
    attributes = merge_dicts(
        [event_detection_params,
         dict(zip(keys, values)), f5fh.raw_attributes])

    # do the alignment
    # todo do_alignment(events, nucleotide_sequence)
    # success = evaluate_success()

    # save to fast5 (if appropriate)
    saved_location = None
    if save_to_fast5:
        fastq = create_fastq_line(
            read_id, nucleotide_sequence,
            "*" if nucleotide_qualities is None else nucleotide_qualities)
        saved_location = save_event_table_and_fastq(
            f5fh,
            event_table,
            fastq,
            attributes=attributes,
            overwrite=overwrite,
            analysis_identifier=analysis_identifier)

    # close
    f5fh.close()

    return success, event_table, saved_location
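
# Hypothetical call into generate_events_and_alignment above (placeholder path and
# sequence). Note the alignment step is still a todo in the function, so `success`
# currently comes back False even when events are detected and saved.
def _example_generate_events(fast5_path="/path/to/read.fast5", sequence="ACGT" * 25):
    success, event_table, saved_location = generate_events_and_alignment(
        fast5_path,
        sequence,
        event_detection_strategy=EVENT_DETECT_MINKNOW,
        save_to_fast5=True,
        overwrite=True)
    return success, event_table, saved_location
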
def resegment_reads(fast5_path,
                    params=None,
                    speedy=False,
                    overwrite=True,
                    analysis_path="ReSegmentBasecall_000"):
    """Re-segment and create anchor alignment from previously base-called fast5 file
    :param fast5_path: path to fast5 file
    :param params: event detection parameters
    :param speedy: boolean option for speedyStatSplit or minknow
    :param overwrite: overwrite a previous event re-segmented event table
    :param analysis_path: name of key where events table will be placed (Analyses/'name'/Events)
    :return True when completed
    """
    assert os.path.isfile(fast5_path), "File does not exist: {}".format(
        fast5_path)
    # create Fast5 object and sanity check
    f5fh = Fast5(fast5_path, read='r+')
    if not f5fh.has_basecall_data():
        f5fh.close()
        return None

    # gather previous event detection
    old_event_table = f5fh.get_basecall_data()

    read_id = bytes.decode(f5fh.raw_attributes['read_id'])
    sampling_freq = f5fh.sample_rate
    start_time = f5fh.raw_attributes['start_time']

    # get params
    if params is None:
        params = get_default_event_detection_params(
            EVENT_DETECT_SPEEDY if speedy else EVENT_DETECT_MINKNOW)

    # pick event detection algorithm
    signal = f5fh.get_read(raw=True, scale=True)
    if speedy:
        event_table = create_speedy_event_table(signal, sampling_freq,
                                                start_time, **params)
        params = merge_dicts(
            [params, {
                "event_detection": "speedy_stat_split"
            }])
    else:
        event_table = create_minknow_event_table(signal, sampling_freq,
                                                 start_time, **params)
        params = merge_dicts(
            [params, {
                "event_detection": "minknow_event_detect"
            }])

    # metadata
    keys = ["nanotensor version", "time_stamp"]
    values = ["0.2.0", TimeStamp().posix_date()]
    attributes = merge_dicts(
        [params, dict(zip(keys, values)), f5fh.raw_attributes])

    # do resegmentation
    if f5fh.is_read_rna():
        old_event_table = index_to_time(old_event_table,
                                        sampling_freq=sampling_freq,
                                        start_time=start_time)
    new_event_table = create_anchor_kmers(new_events=event_table,
                                          old_events=old_event_table)

    # get destination in fast5
    #todo find latest location? ie: save_event_table_and_fastq(..)
    destination = f5fh._join_path(f5fh.__base_analysis__, analysis_path)

    f5fh.set_event_table(destination,
                         new_event_table,
                         attributes,
                         overwrite=overwrite)

    # gather new sequence
    sequence = sequence_from_events(new_event_table)
    if f5fh.is_read_rna():
        sequence = ReverseComplement().reverse(sequence)
        sequence = sequence.replace("T", "U")
    quality_scores = '!' * len(sequence)
    fastq = create_fastq_line(read_id + " :", sequence, quality_scores)

    # set fastq
    f5fh.set_fastq(destination, fastq, overwrite=overwrite)
    return f5fh
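
# Hedged usage of this second resegment_reads variant (placeholder path): with
# params=None the defaults for the chosen detector are used, and the new event
# table and fastq are written under Analyses/ReSegmentBasecall_000.
def _example_resegment_with_defaults(fast5_path="/path/to/basecalled_read.fast5"):
    f5_handle = resegment_reads(fast5_path, speedy=False, overwrite=True)
    if f5_handle is not None:  # None means the fast5 had no basecall data
        f5_handle.close()
    return f5_handle
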
    def setUpClass(cls):
        super(CreateLabelsTest, cls).setUpClass()
        cls.HOME = '/'.join(os.path.abspath(__file__).split("/")[:-4])
        cls.fasta = os.path.join(cls.HOME,
                                 "tests/test_sequences/E.coli_K12.fasta")
        dna_file = os.path.join(cls.HOME,
                                "tests/minion_test_reads/1D/LomanLabz_PC_20161025_FNFAB42699_MN17633_sequencing_run_20161025_E_coli_native_450bps_82361_ch112_read108_strand.fast5")
        rev_dna_file = os.path.join(cls.HOME,
                                    "tests/minion_test_reads/1D/LomanLabz_PC_20161025_FNFAB42699_MN17633_sequencing_run_20161025_E_coli_native_450bps_82361_ch6_read347_strand.fast5")
        rev_rna_file = os.path.join(cls.HOME,
                                "tests/minion_test_reads/RNA_no_events/DEAMERNANOPORE_20170922_FAH26525_MN16450_sequencing_run_MA_821_R94_NA12878_mRNA_09_22_17_67136_read_61_ch_151_strand.fast5")
        forward_rna_file = os.path.join(cls.HOME,
                                "tests/minion_test_reads/RNA_no_events/DEAMERNANOPORE_20170922_FAH26525_MN16450_sequencing_run_MA_821_R94_NA12878_mRNA_09_22_17_67136_read_36_ch_218_strand.fast5")

        rna_reference = os.path.join(cls.HOME, "tests/test_sequences/fake_rna_ref.fa")
        ecoli_dna_reference = os.path.join(cls.HOME, "tests/test_sequences/E.coli_K12.fasta")
        cls.dna_reference_handle = pysam.FastaFile(ecoli_dna_reference)
        cls.rna_reference_handle = pysam.FastaFile(rna_reference)
        cls.tmp_directory = tempfile.mkdtemp()

        # get file locations
        cls.tmp_dna_file = os.path.join(str(cls.tmp_directory), 'test_dna.fast5')
        cls.tmp_dna_file2 = os.path.join(str(cls.tmp_directory), 'test_dna2.fast5')

        cls.tmp_rna_file1 = os.path.join(str(cls.tmp_directory), 'test_rna.fast5')
        cls.tmp_rna_file2 = os.path.join(str(cls.tmp_directory), 'test_rna2.fast5')

        # run signalAlign on one file
        cls.rna_model_file = os.path.join(cls.HOME, "models/testModelR9p4_5mer_acgt_RNA.model")
        cls.dna_model_file_94 = os.path.join(cls.HOME, "models/testModelR9p4_5mer_acegt_template.model")
        cls.rna_sam = os.path.join(cls.HOME, "tests/minion_test_reads/RNA_edge_cases/rna_reads.bam")
        cls.dna_sam = os.path.join(cls.HOME, "tests/minion_test_reads/oneD.bam")
        cls.bin_path = os.path.join(cls.HOME, "bin")
        # kmer index
        cls.kmer_index = 2

        # copy file to tmp directory
        shutil.copy(dna_file, cls.tmp_dna_file)
        shutil.copy(rev_dna_file, cls.tmp_dna_file2)

        shutil.copy(forward_rna_file, cls.tmp_rna_file1)
        shutil.copy(rev_rna_file, cls.tmp_rna_file2)

        args = create_signalAlignment_args(destination=cls.tmp_directory,
                                           in_templateHmm=cls.rna_model_file,
                                           alignment_file=cls.rna_sam,
                                           forward_reference=rna_reference,
                                           embed=True,
                                           path_to_bin=cls.bin_path,
                                           diagonal_expansion=5,
                                           delete_tmp=False)
        sa_h = SignalAlignment(**merge_dicts([args, {'in_fast5': cls.tmp_rna_file1}]))
        sa_h.run()

        sa_h = SignalAlignment(**merge_dicts([args, {'in_fast5': cls.tmp_rna_file2}]))
        sa_h.run()

        args = create_signalAlignment_args(destination=cls.tmp_directory,
                                           in_templateHmm=cls.dna_model_file_94,
                                           alignment_file=cls.dna_sam,
                                           forward_reference=ecoli_dna_reference,
                                           embed=True,
                                           path_to_bin=cls.bin_path,
                                           diagonal_expansion=10,
                                           traceBackDiagonals=100,
                                           constraint_trim=3)
        sa_h = SignalAlignment(**merge_dicts([args, {'in_fast5': cls.tmp_dna_file}]))
        sa_h.run()

        sa_h = SignalAlignment(**merge_dicts([args, {'in_fast5': cls.tmp_dna_file2}]))
        sa_h.run()

        cls.dna_handle = CreateLabels(cls.tmp_dna_file, kmer_index=cls.kmer_index)
        cls.dna_handle2 = CreateLabels(cls.tmp_dna_file2, kmer_index=cls.kmer_index)

        cls.rna1_handle = CreateLabels(cls.tmp_rna_file1, kmer_index=cls.kmer_index)
        cls.rna2_handle = CreateLabels(cls.tmp_rna_file2, kmer_index=cls.kmer_index)
        cls.rev_comp = ReverseComplement()

        cls.tmp_dna_file3 = os.path.join(cls.HOME,
                                         "tests/minion_test_reads/embedded_files/miten_PC_20160820_FNFAD20259_MN17223_sequencing_run_AMS_158_R9_WGA_Ecoli_08_20_16_43623_ch100_read2324_strand.fast5")
        cls.dna3_handle = CreateLabels(cls.tmp_dna_file3, kmer_index=cls.kmer_index)