def test_rna_reads(self):
    with tempfile.TemporaryDirectory() as tempdir:
        template_model = os.path.join(
            self.HOME, "models/testModelR9p4_5mer_acgt_RNA.model")
        args = create_signalAlignment_args(
            alignment_file=self.rna_bam,
            bwa_reference=self.rna_reference,
            forward_reference=self.rna_reference,
            in_templateHmm=template_model,
            path_to_bin=self.path_to_bin,
            destination=tempdir,
            embed=True,
            delete_tmp=False)
        in_rna_file = os.path.join(
            self.test_dir_rna,
            "DEAMERNANOPORE_20170922_FAH26525_MN16450_sequencing_run_MA_821_R94_NA12878_mRNA_09_22_17_67136_read_36_ch_218_strand.fast5"
        )
        final_args = merge_dicts([args, dict(in_fast5=in_rna_file)])
        handle = SignalAlignment(**final_args)
        handle.run()
        fh = pysam.FastaFile(self.rna_reference)
        f5fh = Fast5(in_rna_file)
        sa_events = f5fh.get_signalalign_events()
        for i, event in enumerate(sa_events):
            kmer = fh.fetch(reference="rna_fake",
                            start=event["reference_index"],
                            end=event["reference_index"] + 5)[::-1]
            self.assertEqual(event["path_kmer"].decode(), kmer)
            self.assertEqual(event["reference_kmer"].decode(), kmer)

        in_rna_file = os.path.join(
            self.test_dir_rna,
            "DEAMERNANOPORE_20170922_FAH26525_MN16450_sequencing_run_MA_821_R94_NA12878_mRNA_09_22_17_67136_read_61_ch_151_strand.fast5"
        )
        final_args = merge_dicts([args, dict(in_fast5=in_rna_file)])
        handle = SignalAlignment(**final_args)
        handle.run()
        rev_c = ReverseComplement()
        f5fh = Fast5(in_rna_file)
        sa_events = f5fh.get_signalalign_events()
        for i, event in enumerate(sa_events):
            kmer = fh.fetch(reference="rna_fake",
                            start=event["reference_index"],
                            end=event["reference_index"] + 5)[::-1]
            rev_kmer = rev_c.reverse_complement(kmer)
            self.assertEqual(event["path_kmer"].decode(), rev_kmer)
            self.assertEqual(event["reference_kmer"].decode(), kmer)
def resegment_reads(fast5_path, params, speedy=False, overwrite=False):
    """Re-segment and create anchor alignment from previously base-called fast5 file

    :param fast5_path: path to fast5 file
    :param params: event detection parameters
    :param speedy: boolean option for speedyStatSplit or minknow
    :param overwrite: overwrite a previous event re-segmented event table
    :return: Fast5 file handle when completed
    """
    assert os.path.isfile(fast5_path), "File does not exist: {}".format(fast5_path)
    name = "ReSegmentBasecall_00{}"

    # create Fast5 object
    f5fh = Fast5(fast5_path, read='r+')

    # gather previous event detection
    old_event_table = f5fh.get_basecall_data()
    # assert check_event_table_time(old_event_table), "Old event is not consistent"
    read_id = bytes.decode(f5fh.raw_attributes['read_id'])
    sampling_freq = f5fh.sample_rate
    start_time = f5fh.raw_attributes['start_time']

    # pick event detection algorithm
    signal = f5fh.get_read(raw=True, scale=True)
    if speedy:
        event_table = create_speedy_event_table(signal, sampling_freq, start_time, **params)
        params = merge_dicts([params, {"event_detection": "speedy_stat_split"}])
    else:
        event_table = create_minknow_event_table(signal, sampling_freq, start_time, **params)
        params = merge_dicts([params, {"event_detection": "minknow_event_detect"}])

    keys = ["nanotensor version", "time_stamp"]
    values = ["0.2.0", TimeStamp().posix_date()]
    attributes = merge_dicts([params, dict(zip(keys, values)), f5fh.raw_attributes])

    if f5fh.is_read_rna():
        old_event_table = index_to_time(old_event_table,
                                        sampling_freq=sampling_freq,
                                        start_time=start_time)

    # set event table
    new_event_table = create_anchor_kmers(new_events=event_table, old_events=old_event_table)
    f5fh.set_new_event_table(name, new_event_table, attributes, overwrite=overwrite)

    # gather new sequence
    sequence = sequence_from_events(new_event_table)
    if f5fh.is_read_rna():
        sequence = ReverseComplement().reverse(sequence)
        sequence = sequence.replace("T", "U")
    quality_scores = '!' * len(sequence)
    fastq = create_fastq_line(read_id + " :", sequence, quality_scores)

    # set fastq
    f5fh.set_fastq(name, fastq)
    return f5fh
def test_embed_with_both(self):
    signal_file_reads = os.path.join(self.HOME, "tests/minion_test_reads/pUC/")
    template_model = os.path.join(
        self.HOME, "models/testModelR9_5mer_acegt_template.model")
    complement_model = os.path.join(
        self.HOME, "models/testModelR9_5mer_acegt_complement.model")
    puc_reference = os.path.join(self.HOME, "tests/test_sequences/pUC19_SspI.fa")
    signal_file_guide_alignment = os.path.join(
        self.HOME, "tests/minion_test_reads/pUC/puc.bam")
    with tempfile.TemporaryDirectory() as tempdir:
        new_dir = os.path.join(tempdir, "new_dir")
        if os.path.exists(new_dir):
            shutil.rmtree(new_dir)
        working_folder = FolderHandler()
        working_folder.open_folder(os.path.join(tempdir, "test_dir"))
        shutil.copytree(signal_file_reads, new_dir)
        args = create_signalAlignment_args(
            alignment_file=signal_file_guide_alignment,
            bwa_reference=puc_reference,
            forward_reference=puc_reference,
            in_templateHmm=template_model,
            path_to_bin=self.path_to_bin,
            destination=working_folder.path,
            embed=True,
            output_format="both",
            filter_reads=0,
            twoD_chemistry=True,
            in_complementHmm=complement_model,
            delete_tmp=True)
        final_args = merge_dicts([
            args,
            dict(in_fast5=os.path.join(
                new_dir,
                "makeson_PC_20160807_FNFAD20242_MN17284_sequencing_run_MA_470_R9_pUC_g_PCR_BC_08_07_16_93165_ch1_read176_strand.fast5"
            ))
        ])
        handle = SignalAlignment(**final_args)
        handle.run()
        f5fh = Fast5(
            os.path.join(
                new_dir,
                "makeson_PC_20160807_FNFAD20242_MN17284_sequencing_run_MA_470_R9_pUC_g_PCR_BC_08_07_16_93165_ch1_read176_strand.fast5"
            ))
        mea = f5fh.get_signalalign_events(mea=True)
        sam = f5fh.get_signalalign_events(sam=True)
        self.assertEqual(mea[0]["raw_start"], 2879)
        self.assertEqual(sam[0], "0")
        self.assertEqual(len(os.listdir(working_folder.path)), 2)
def test_merge_dicts(self):
    """Test merge_dicts"""
    with captured_output() as (_, _):
        self.assertRaises(AssertionError, merge_dicts, {"test": 1})
        self.assertRaises(AssertionError, merge_dicts, ["test", 1])
        self.assertRaises(AssertionError, merge_dicts, [{"test": 1}, ["test"]])
        dict1 = {"a": 1}
        dict2 = {"b": 2}
        dict3 = {"c": 3}
        dict4 = {"d": 4}
        merged_dict = merge_dicts([dict1, dict2, dict3, dict4])
        self.assertEqual(dict1["a"], merged_dict["a"])
        self.assertEqual(dict2["b"], merged_dict["b"])
        self.assertEqual(dict3["c"], merged_dict["c"])
        self.assertEqual(dict4["d"], merged_dict["d"])
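# A minimal sketch of the behavior test_merge_dicts above exercises, assuming
# merge_dicts takes a list of dicts, asserts the input shape, and merges
# left-to-right with later dicts overriding earlier keys on collision. This is an
# illustrative stand-in, not the project's actual implementation.
def merge_dicts_sketch(dictionaries):
    assert isinstance(dictionaries, list), "expected a list of dicts"
    merged = {}
    for d in dictionaries:
        assert isinstance(d, dict), "every element must be a dict"
        merged.update(d)  # assumption: later dicts win on key collisions
    return merged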
def test_signal_file_and_alignment(self):
    signal_file_reads = os.path.join(
        self.HOME, "tests/minion_test_reads/no_event_data_1D_ecoli")
    template_model = os.path.join(
        self.HOME, "models/testModelR9p4_5mer_acegt_template.model")
    ecoli_reference = os.path.join(
        self.HOME, "tests/test_sequences/E.coli_K12.fasta")
    signal_file_guide_alignment = os.path.join(
        self.HOME, "tests/minion_test_reads/oneD_alignments.sam")
    with tempfile.TemporaryDirectory() as tempdir:
        new_dir = os.path.join(tempdir, "new_dir")
        working_folder = FolderHandler()
        working_folder.open_folder(os.path.join(tempdir, "test_dir"))
        shutil.copytree(signal_file_reads, new_dir)
        args = create_signalAlignment_args(
            alignment_file=signal_file_guide_alignment,
            bwa_reference=ecoli_reference,
            forward_reference=ecoli_reference,
            in_templateHmm=template_model,
            path_to_bin=self.path_to_bin,
            destination=working_folder.path)
        final_args = merge_dicts([
            args,
            dict(in_fast5=os.path.join(
                new_dir,
                "LomanLabz_PC_20161025_FNFAB42699_MN17633_sequencing_run_20161025_E_coli_native_450bps_82361_ch6_read347_strand.fast5"
            ))
        ])
        handle = SignalAlignment(**final_args)
        handle.run()
        self.assertEqual(len(os.listdir(working_folder.path)), 1)
        self.assertEqual(
            sorted(os.listdir(working_folder.path))[0],
            "9e4d14b1-8167-44ef-9fdb-5c29dd0763fd.sm.backward.tsv")
def main(args):
    # parse args
    start = timer()
    args = parse_args()

    if args.command == "run":
        if not os.path.exists(args.config):
            print("{config} not found".format(config=args.config))
            exit(1)
        # run training
        config_args = create_dot_dict(load_json(args.config))
        temp_folder = FolderHandler()
        temp_dir_path = temp_folder.open_folder(
            os.path.join(os.path.abspath(config_args.output_dir), "tempFiles_alignment"))
        temp_dir_path = resolvePath(temp_dir_path)
        print(config_args.output_dir)
        print(temp_dir_path)

        sa_args = [
            merge_dicts([
                s, {
                    "quality_threshold": config_args.filter_reads,
                    "workers": config_args.job_count
                }
            ]) for s in config_args.samples
        ]
        samples = [
            SignalAlignSample(working_folder=temp_folder, **s) for s in sa_args
        ]
        copyfile(args.config,
                 os.path.join(temp_dir_path, os.path.basename(args.config)))

        state_machine_type = "threeState"
        if config_args.template_hdp_model_path is not None:
            state_machine_type = "threeStateHdp"

        alignment_args = create_signalAlignment_args(
            destination=temp_dir_path,
            stateMachineType=state_machine_type,
            in_templateHmm=resolvePath(config_args.template_hmm_model),
            in_complementHmm=resolvePath(config_args.complement_hmm_model),
            in_templateHdp=resolvePath(config_args.template_hdp_model),
            in_complementHdp=resolvePath(config_args.complement_hdp_model),
            diagonal_expansion=config_args.diagonal_expansion,
            constraint_trim=config_args.constraint_trim,
            traceBackDiagonals=config_args.traceBackDiagonals,
            twoD_chemistry=config_args.two_d,
            get_expectations=False,
            path_to_bin=resolvePath(config_args.path_to_bin),
            check_for_temp_file_existance=True,
            threshold=config_args.signal_alignment_args.threshold,
            track_memory_usage=config_args.signal_alignment_args.track_memory_usage,
            embed=config_args.signal_alignment_args.embed,
            event_table=config_args.signal_alignment_args.event_table,
            output_format=config_args.signal_alignment_args.output_format,
            filter_reads=config_args.filter_reads,
            delete_tmp=config_args.signal_alignment_args.delete_tmp)

        multithread_signal_alignment_samples(samples,
                                             alignment_args,
                                             config_args.job_count,
                                             trim=None,
                                             debug=config_args.debug)
        print("\n# signalAlign - finished alignments\n", file=sys.stderr)
        print("\n# signalAlign - finished alignments\n", file=sys.stdout)
        stop = timer()

    else:
        command_line = " ".join(sys.argv[:])
        print(os.getcwd())
        print("Command Line: {cmdLine}\n".format(cmdLine=command_line), file=sys.stderr)

        # get absolute paths to inputs
        args.files_dir = resolvePath(args.files_dir)
        args.forward_reference = resolvePath(args.forward_ref)
        args.backward_reference = resolvePath(args.backward_ref)
        args.out = resolvePath(args.out)
        args.bwa_reference = resolvePath(args.bwa_reference)
        args.in_T_Hmm = resolvePath(args.in_T_Hmm)
        args.in_C_Hmm = resolvePath(args.in_C_Hmm)
        args.templateHDP = resolvePath(args.templateHDP)
        args.complementHDP = resolvePath(args.complementHDP)
        args.fofn = resolvePath(args.fofn)
        args.target_regions = resolvePath(args.target_regions)
        args.ambiguity_positions = resolvePath(args.ambiguity_positions)
        args.alignment_file = resolvePath(args.alignment_file)

        start_message = """
#   Starting Signal Align
#   Aligning files from: {fileDir}
#   Aligning to reference: {reference}
#   Aligning maximum of {nbFiles} files
#   Using model: {model}
#   Using banding: True
#   Aligning to regions in: {regions}
#   Non-default template HMM: {inThmm}
#   Non-default complement HMM: {inChmm}
#   Template HDP: {tHdp}
#   Complement HDP: {cHdp}
""".format(fileDir=args.files_dir,
           reference=args.bwa_reference,
           nbFiles=args.nb_files,
           inThmm=args.in_T_Hmm,
           inChmm=args.in_C_Hmm,
           model=args.stateMachineType,
           regions=args.target_regions,
           tHdp=args.templateHDP,
           cHdp=args.complementHDP)

        print(start_message, file=sys.stdout)

        if args.files_dir is None and args.fofn is None:
            print("Need to provide a directory with .fast5 files or a fofn",
                  file=sys.stderr)
            sys.exit(1)

        if not os.path.isfile(args.bwa_reference):
            print("Did not find valid reference file, looked for it here: {here}".format(
                here=args.bwa_reference), file=sys.stderr)
            sys.exit(1)

        # make directory to put temporary files
        if not os.path.isdir(args.out):
            print("Creating output directory: {}".format(args.out), file=sys.stdout)
            os.mkdir(args.out)
        temp_folder = FolderHandler()
        temp_dir_path = temp_folder.open_folder(
            os.path.join(os.path.abspath(args.out), "tempFiles_alignment"))
        temp_dir_path = resolvePath(temp_dir_path)
        print(args.out)
        print(temp_dir_path)

        # generate reference sequence if not specified
        if not args.forward_reference or not args.backward_reference:
            args.forward_reference, args.backward_reference = processReferenceFasta(
                fasta=args.bwa_reference,
                work_folder=temp_folder,
                positions_file=args.ambiguity_positions,
                name="")

        # list of read files
        if args.fofn is not None:
            fast5s = [x for x in parseFofn(args.fofn) if x.endswith(".fast5")]
        else:
            fast5s = [
                "/".join([args.files_dir, x])
                for x in os.listdir(args.files_dir) if x.endswith(".fast5")
            ]

        nb_files = args.nb_files
        if nb_files < len(fast5s):
            shuffle(fast5s)
            fast5s = fast5s[:nb_files]

        # return alignment_args
        alignment_args = {
            "destination": temp_dir_path,
            "stateMachineType": args.stateMachineType,
            "bwa_reference": args.bwa_reference,
            "in_templateHmm": args.in_T_Hmm,
            "in_complementHmm": args.in_C_Hmm,
            "in_templateHdp": args.templateHDP,
            "in_complementHdp": args.complementHDP,
            "output_format": args.outFmt,
            "threshold": args.threshold,
            "diagonal_expansion": args.diag_expansion,
            "constraint_trim": args.constraint_trim,
            "degenerate": getDegenerateEnum(args.degenerate),
            "twoD_chemistry": args.twoD,
            "target_regions": args.target_regions,
            "embed": args.embed,
            "event_table": args.event_table,
            "backward_reference": args.backward_reference,
            "forward_reference": args.forward_reference,
            "alignment_file": args.alignment_file,
            "check_for_temp_file_existance": True,
            "track_memory_usage": False,
            "get_expectations": False,
            "perform_kmer_event_alignment": args.perform_kmer_event_alignment,
            "enforce_supported_versions": args.enforce_supported_versions,
            "filter_reads": 7 if args.filter_reads else None,
            "path_to_bin": args.path_to_bin,
            "delete_tmp": args.delete_tmp
        }

        filter_read_generator = None
        if args.filter_reads is not None and args.alignment_file and args.readdb and args.files_dir:
            print("[runSignalAlign]:NOTICE: Filtering out low quality reads",
                  file=sys.stdout)
            filter_read_generator = filter_reads_to_string_wrapper(
                filter_reads(args.alignment_file,
                             args.readdb, [args.files_dir],
                             quality_threshold=7,
                             recursive=args.recursive))

        print("[runSignalAlign]:NOTICE: Got {} files to align".format(len(fast5s)),
              file=sys.stdout)

        # setup workers for multiprocessing
        multithread_signal_alignment(
            alignment_args,
            fast5s,
            args.nb_jobs,
            debug=args.DEBUG,
            filter_reads_to_string_wrapper=filter_read_generator)

        stop = timer()
        print("\n# signalAlign - finished alignments\n", file=sys.stderr)
        print("\n# signalAlign - finished alignments\n", file=sys.stdout)

    print("[signalAlign] Complete")
    print("Running Time = {} seconds".format(stop - start))
def load_from_raw2(np_handle,
                   aligned_segment,
                   model_file_location,
                   path_to_bin="./",
                   analysis_identifier=None,
                   write_failed_alignments=False):
    """Load a nanopore read from raw signal and an alignment file.
    Need a model to create banded alignment.

    :param np_handle: NanoporeRead class object
    :param aligned_segment: pysam aligned_segment object
    :param model_file_location: path to model file
    :param path_to_bin: path to signalAlign bin where executables are stored
    :param analysis_identifier: identifier for storage of event table and fastq
    :param write_failed_alignments: still write alignments that failed quality checks
    :return: path to events in fast5 file or False if the task fails
    """
    assert os.path.isfile(model_file_location), \
        "Model_file_location must be a real path to a SignalAlign HMM model file"
    assert os.path.exists(path_to_bin), \
        "path_to_bin must exist"

    # check if file is open
    if not np_handle.open():
        return False

    # grab read id
    read_id = np_handle.read_label

    # get nucleotides and qualities
    nucleotide_sequence = aligned_segment.query_sequence.upper()
    nucleotide_qualities = aligned_segment.qual

    # check for reverse mapping
    if aligned_segment.is_reverse:
        nucleotide_sequence = reverse_complement(nucleotide_sequence,
                                                 reverse=True,
                                                 complement=True)
        if nucleotide_qualities is not None and len(nucleotide_qualities) != 0:
            nucleotide_qualities = ''.join(reversed(list(nucleotide_qualities)))
    if nucleotide_qualities is None:
        nucleotide_qualities = "!" * len(nucleotide_sequence)

    # get fastq (this is saved with the event table)
    fastq = create_fastq_line(read_id, nucleotide_sequence, nucleotide_qualities)

    # get temp location
    tmp_root = np_handle.fastFive.get_analysis_new(EVENT_KMERALIGN_TMP)
    tmp_dest = np_handle.fastFive.get_analysis_events_path_new(EVENT_KMERALIGN_TMP)
    assert tmp_dest.startswith(tmp_root), "Invalid analysis path management"
    file_name = np_handle.filename
    np_handle.close()
    tmp_directory = tempfile.mkdtemp()

    # run the c code which does the required stuff
    status = run_kmeralign_exe(file_name,
                               nucleotide_sequence,
                               model_file_location,
                               tmp_dest,
                               path_to_bin,
                               write_failed_alignments=write_failed_alignments,
                               tmp_directory=tmp_directory)
    os.removedirs(tmp_directory)

    # alignment succeeded, save it to the appropriate location
    if status:
        np_handle.open()
        if analysis_identifier is None:
            analysis_identifier = Fast5.__default_basecall_1d_analysis__
        # get attrs
        keys = ["signalAlign version", "time_stamp"]
        values = ["0.2.0", TimeStamp().posix_date()]
        attributes = merge_dicts(
            [dict(zip(keys, values)), np_handle.fastFive.raw_attributes])
        # get events (and delete tmp location)
        events = np_handle.fastFive.get_custom_analysis_events(EVENT_KMERALIGN_TMP)
        np_handle.fastFive.delete(tmp_root, ignore=False)
        # save events and fastq
        saved_loc = save_event_table_and_fastq(
            np_handle.fastFive,
            events,
            fastq,
            attributes,
            analysis_identifier=analysis_identifier)
        return saved_loc

    # alignment failed, remove offending location (if it exists) and report
    else:
        print("[load_from_raw] error performing kmeralign", file=sys.stderr)
        np_handle.open()
        np_handle.fastFive.delete(tmp_root, ignore=True)
        return False
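# Hypothetical usage sketch for load_from_raw2 (not taken from the source): walk a
# BAM with pysam and embed a banded kmer-to-event alignment for each mapped read.
# The fast5_by_read_id lookup and the NanoporeRead(...) constructor call are
# assumptions made for illustration only.
def embed_reads_from_bam_sketch(bam_path, fast5_by_read_id, model_file, path_to_bin):
    with pysam.AlignmentFile(bam_path, "rb") as bam:
        for segment in bam.fetch(until_eof=True):
            if segment.is_unmapped or segment.query_name not in fast5_by_read_id:
                continue
            np_handle = NanoporeRead(fast5_by_read_id[segment.query_name])  # assumed constructor
            saved_loc = load_from_raw2(np_handle, segment, model_file,
                                       path_to_bin=path_to_bin)
            if saved_loc is False:
                print("[sketch] kmeralign failed for {}".format(segment.query_name),
                      file=sys.stderr)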
def load_from_raw(np_handle,
                  alignment_file,
                  model_file_location,
                  path_to_bin="./",
                  nucleotide_sequence=None,
                  analysis_identifier=None,
                  write_failed_alignments=False):
    """Load a nanopore read from raw signal and an alignment file.
    Need a model to create banded alignment.

    :param np_handle: NanoporeRead class object
    :param alignment_file: sam/bam file
    :param model_file_location: path to model file
    :param path_to_bin: path to signalAlign bin where executables are stored
    :param nucleotide_sequence: nucleotide sequence (needed if no alignment file is available)
    :param analysis_identifier: identifier for storage of event table and fastq
    :param write_failed_alignments: still write alignments that failed quality checks
    :return: path to events in fast5 file or False if the task fails
    """
    assert os.path.isfile(model_file_location), \
        "Model_file_location must be a real path to a SignalAlign HMM model file"
    assert os.path.exists(path_to_bin), \
        "path_to_bin must exist"

    if not os.path.isfile(str(alignment_file)) and nucleotide_sequence is None:
        nucleotide_sequence = np_handle.get_template_read(initalize_bypass=True)
        assert nucleotide_sequence, \
            "alignment_file must be a real path to a SAM/BAM alignment file, or " \
            "nucleotide_sequence must be specified (retrieval attempted from fast5). " \
            "alignment_file: {}, nucleotide_sequence: {}".format(alignment_file,
                                                                 nucleotide_sequence)

    # check if file is open
    if not np_handle.open():
        return False

    # grab read id
    read_id = np_handle.read_label

    # get nucleotides and qualities
    if nucleotide_sequence is None:
        # get/build nucleotide sequence from alignment file (accounting for hardclipping)
        nucleotide_sequence, nucleotide_qualities, _, _, _ = \
            get_full_nucleotide_read_from_alignment(alignment_file, read_id)
        if nucleotide_sequence is None:
            print("[load_from_raw] nucleotides for {} not found in {}".format(
                read_id, alignment_file), file=sys.stderr)
            return False
    else:
        nucleotide_qualities = None
    if nucleotide_qualities is None:
        nucleotide_qualities = "!" * len(nucleotide_sequence)

    # get fastq (this is saved with the event table)
    fastq = create_fastq_line(read_id, nucleotide_sequence, nucleotide_qualities)

    # get temp location
    tmp_root = np_handle.fastFive.get_analysis_new(EVENT_KMERALIGN_TMP)
    tmp_dest = np_handle.fastFive.get_analysis_events_path_new(EVENT_KMERALIGN_TMP)
    assert tmp_dest.startswith(tmp_root), "Invalid analysis path management"
    file_name = np_handle.filename
    np_handle.close()
    tmp_directory = tempfile.mkdtemp()

    # run the c code which does the required stuff
    status = run_kmeralign_exe(file_name,
                               nucleotide_sequence,
                               model_file_location,
                               tmp_dest,
                               path_to_bin,
                               write_failed_alignments=write_failed_alignments,
                               tmp_directory=tmp_directory)
    os.removedirs(tmp_directory)

    # alignment succeeded, save it to the appropriate location
    if status:
        np_handle.open()
        if analysis_identifier is None:
            analysis_identifier = Fast5.__default_basecall_1d_analysis__
        # get attrs
        keys = ["signalAlign version", "time_stamp"]
        values = ["0.2.0", TimeStamp().posix_date()]
        attributes = merge_dicts(
            [dict(zip(keys, values)), np_handle.fastFive.raw_attributes])
        # get events (and delete tmp location)
        events = np_handle.fastFive.get_custom_analysis_events(EVENT_KMERALIGN_TMP)
        np_handle.fastFive.delete(tmp_root, ignore=False)
        # save events and fastq
        saved_loc = save_event_table_and_fastq(
            np_handle.fastFive,
            events,
            fastq,
            attributes,
            analysis_identifier=analysis_identifier)
        return saved_loc

    # alignment failed, remove offending location (if it exists) and report
    else:
        print("[load_from_raw] error performing kmeralign", file=sys.stderr)
        np_handle.open()
        np_handle.fastFive.delete(tmp_root, ignore=True)
        return False
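# Hypothetical usage sketch for load_from_raw (not taken from the source): embed
# events for a single fast5 using a SAM/BAM guide alignment. The NanoporeRead(...)
# constructor call and file paths are assumptions made for illustration only.
def load_single_read_sketch(fast5_path, bam_path, model_file, path_to_bin="./bin"):
    np_handle = NanoporeRead(fast5_path)  # assumed constructor
    saved_loc = load_from_raw(np_handle, bam_path, model_file,
                              path_to_bin=path_to_bin,
                              write_failed_alignments=False)
    if saved_loc is False:
        print("[sketch] load_from_raw failed for {}".format(fast5_path), file=sys.stderr)
    return saved_loc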
def generate_events_and_alignment(
        fast5_path,
        nucleotide_sequence,
        nucleotide_qualities=None,
        event_detection_params=None,
        event_detection_strategy=None,
        save_to_fast5=True,
        overwrite=False,
        analysis_identifier=Fast5.__default_basecall_1d_analysis__):
    assert os.path.isfile(fast5_path), "File does not exist: {}".format(fast5_path)

    # create Fast5 object
    f5fh = Fast5(fast5_path, read='r+')
    read_id = bytes.decode(f5fh.raw_attributes['read_id'])
    sampling_freq = f5fh.sample_rate
    start_time = f5fh.raw_attributes['start_time']
    success = False

    # event detection prep
    if event_detection_strategy is None:
        event_detection_strategy = EVENT_DETECT_MINKNOW
    if event_detection_params is None:
        event_detection_params = get_default_event_detection_params(
            event_detection_strategy)

    # detect events
    if event_detection_strategy == EVENT_DETECT_SPEEDY:
        signal = f5fh.get_read(raw=True, scale=True)
        event_table = create_speedy_event_table(signal, sampling_freq, start_time,
                                                **event_detection_params)
        event_detection_params = merge_dicts(
            [event_detection_params, {"event_detection": "speedy_stat_split"}])
    elif event_detection_strategy == EVENT_DETECT_MINKNOW:
        signal = f5fh.get_read(raw=True, scale=True)
        event_table = create_minknow_event_table(signal, sampling_freq, start_time,
                                                 **event_detection_params)
        event_detection_params = merge_dicts(
            [event_detection_params, {"event_detection": "minknow_event_detect"}])
    elif event_detection_strategy == EVENT_DETECT_SCRAPPIE:
        event_table = create_scrappie_event_table(fast5_path, sampling_freq)
        event_detection_params = merge_dicts(
            [event_detection_params, {"event_detection": "scrappie_event_detect"}])
    else:
        raise Exception(
            "PROGRAMMER ERROR: unknown resegment strat {}: expected {}".format(
                event_detection_strategy,
                [EVENT_DETECT_SPEEDY, EVENT_DETECT_MINKNOW, EVENT_DETECT_SCRAPPIE]))

    # gather attributes
    keys = ["nanotensor version", "time_stamp"]
    values = ["0.2.0", TimeStamp().posix_date()]
    attributes = merge_dicts(
        [event_detection_params, dict(zip(keys, values)), f5fh.raw_attributes])

    # do the alignment
    # todo do_alignment(events, nucleotide_sequence)
    #      success = evaluate_success()

    # save to fast5 (if appropriate)
    saved_location = None
    if save_to_fast5:
        fastq = create_fastq_line(
            read_id, nucleotide_sequence,
            "*" if nucleotide_qualities is None else nucleotide_qualities)
        saved_location = save_event_table_and_fastq(
            f5fh,
            event_table,
            fastq,
            attributes=attributes,
            overwrite=overwrite,
            analysis_identifier=analysis_identifier)

    # close
    f5fh.close()
    return success, event_table, saved_location
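# Hypothetical usage sketch for generate_events_and_alignment (not taken from the
# source): run minknow event detection on a raw fast5 with an externally supplied
# nucleotide sequence and write the resulting event table and fastq back into the
# file. The wrapper name and arguments below are illustrative placeholders.
def detect_and_store_events_sketch(fast5_path, nucleotide_sequence):
    success, event_table, saved_location = generate_events_and_alignment(
        fast5_path,
        nucleotide_sequence,
        event_detection_strategy=EVENT_DETECT_MINKNOW,
        save_to_fast5=True,
        overwrite=True)
    # note: the alignment step is still a todo in the source, so `success` only
    # reflects that placeholder logic
    return event_table, saved_location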
def resegment_reads(fast5_path,
                    params=None,
                    speedy=False,
                    overwrite=True,
                    analysis_path="ReSegmentBasecall_000"):
    """Re-segment and create anchor alignment from previously base-called fast5 file

    :param fast5_path: path to fast5 file
    :param params: event detection parameters
    :param speedy: boolean option for speedyStatSplit or minknow
    :param overwrite: overwrite a previous event re-segmented event table
    :param analysis_path: name of key where events table will be placed (Analyses/'name'/Events)
    :return: Fast5 file handle on completion, or None if the file has no basecall data
    """
    assert os.path.isfile(fast5_path), "File does not exist: {}".format(fast5_path)

    # create Fast5 object and sanity check
    f5fh = Fast5(fast5_path, read='r+')
    if not f5fh.has_basecall_data():
        f5fh.close()
        return None

    # gather previous event detection
    old_event_table = f5fh.get_basecall_data()
    read_id = bytes.decode(f5fh.raw_attributes['read_id'])
    sampling_freq = f5fh.sample_rate
    start_time = f5fh.raw_attributes['start_time']

    # get params
    if params is None:
        params = get_default_event_detection_params(
            EVENT_DETECT_SPEEDY if speedy else EVENT_DETECT_MINKNOW)

    # pick event detection algorithm
    signal = f5fh.get_read(raw=True, scale=True)
    if speedy:
        event_table = create_speedy_event_table(signal, sampling_freq, start_time, **params)
        params = merge_dicts([params, {"event_detection": "speedy_stat_split"}])
    else:
        event_table = create_minknow_event_table(signal, sampling_freq, start_time, **params)
        params = merge_dicts([params, {"event_detection": "minknow_event_detect"}])

    # metadata
    keys = ["nanotensor version", "time_stamp"]
    values = ["0.2.0", TimeStamp().posix_date()]
    attributes = merge_dicts([params, dict(zip(keys, values)), f5fh.raw_attributes])

    # do resegmentation
    if f5fh.is_read_rna():
        old_event_table = index_to_time(old_event_table,
                                        sampling_freq=sampling_freq,
                                        start_time=start_time)
    new_event_table = create_anchor_kmers(new_events=event_table,
                                          old_events=old_event_table)

    # get destination in fast5
    # todo find latest location? ie: save_event_table_and_fastq(..)
    destination = f5fh._join_path(f5fh.__base_analysis__, analysis_path)
    f5fh.set_event_table(destination, new_event_table, attributes, overwrite=overwrite)

    # gather new sequence
    sequence = sequence_from_events(new_event_table)
    if f5fh.is_read_rna():
        sequence = ReverseComplement().reverse(sequence)
        sequence = sequence.replace("T", "U")
    quality_scores = '!' * len(sequence)
    fastq = create_fastq_line(read_id + " :", sequence, quality_scores)

    # set fastq
    f5fh.set_fastq(destination, fastq, overwrite=overwrite)
    return f5fh
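# Hypothetical usage sketch for resegment_reads (not taken from the source):
# re-segment a previously basecalled fast5 with the default minknow parameters and
# close the handle afterwards. The wrapper name is an illustrative placeholder.
def resegment_one_read_sketch(fast5_path):
    f5fh = resegment_reads(fast5_path, params=None, speedy=False, overwrite=True)
    if f5fh is None:
        print("[sketch] no basecall data in {}".format(fast5_path), file=sys.stderr)
        return False
    f5fh.close()
    return True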
def setUpClass(cls):
    super(CreateLabelsTest, cls).setUpClass()
    cls.HOME = '/'.join(os.path.abspath(__file__).split("/")[:-4])
    cls.fasta = os.path.join(cls.HOME, "tests/test_sequences/E.coli_K12.fasta")
    dna_file = os.path.join(
        cls.HOME,
        "tests/minion_test_reads/1D/LomanLabz_PC_20161025_FNFAB42699_MN17633_sequencing_run_20161025_E_coli_native_450bps_82361_ch112_read108_strand.fast5")
    rev_dna_file = os.path.join(
        cls.HOME,
        "tests/minion_test_reads/1D/LomanLabz_PC_20161025_FNFAB42699_MN17633_sequencing_run_20161025_E_coli_native_450bps_82361_ch6_read347_strand.fast5")
    rev_rna_file = os.path.join(
        cls.HOME,
        "tests/minion_test_reads/RNA_no_events/DEAMERNANOPORE_20170922_FAH26525_MN16450_sequencing_run_MA_821_R94_NA12878_mRNA_09_22_17_67136_read_61_ch_151_strand.fast5")
    forward_rna_file = os.path.join(
        cls.HOME,
        "tests/minion_test_reads/RNA_no_events/DEAMERNANOPORE_20170922_FAH26525_MN16450_sequencing_run_MA_821_R94_NA12878_mRNA_09_22_17_67136_read_36_ch_218_strand.fast5")
    rna_reference = os.path.join(cls.HOME, "tests/test_sequences/fake_rna_ref.fa")
    ecoli_dna_reference = os.path.join(cls.HOME, "tests/test_sequences/E.coli_K12.fasta")
    cls.dna_reference_handle = pysam.FastaFile(ecoli_dna_reference)
    cls.rna_reference_handle = pysam.FastaFile(rna_reference)
    cls.tmp_directory = tempfile.mkdtemp()

    # get file locations
    cls.tmp_dna_file = os.path.join(str(cls.tmp_directory), 'test_dna.fast5')
    cls.tmp_dna_file2 = os.path.join(str(cls.tmp_directory), 'test_dna2.fast5')
    cls.tmp_rna_file1 = os.path.join(str(cls.tmp_directory), 'test_rna.fast5')
    cls.tmp_rna_file2 = os.path.join(str(cls.tmp_directory), 'test_rna2.fast5')

    # run signalAlign on one file
    cls.rna_model_file = os.path.join(cls.HOME, "models/testModelR9p4_5mer_acgt_RNA.model")
    cls.dna_model_file_94 = os.path.join(cls.HOME, "models/testModelR9p4_5mer_acegt_template.model")
    cls.rna_sam = os.path.join(cls.HOME, "tests/minion_test_reads/RNA_edge_cases/rna_reads.bam")
    cls.dna_sam = os.path.join(cls.HOME, "tests/minion_test_reads/oneD.bam")
    cls.bin_path = os.path.join(cls.HOME, "bin")

    # kmer index
    cls.kmer_index = 2

    # copy file to tmp directory
    shutil.copy(dna_file, cls.tmp_dna_file)
    shutil.copy(rev_dna_file, cls.tmp_dna_file2)
    shutil.copy(forward_rna_file, cls.tmp_rna_file1)
    shutil.copy(rev_rna_file, cls.tmp_rna_file2)

    args = create_signalAlignment_args(destination=cls.tmp_directory,
                                       in_templateHmm=cls.rna_model_file,
                                       alignment_file=cls.rna_sam,
                                       forward_reference=rna_reference,
                                       embed=True,
                                       path_to_bin=cls.bin_path,
                                       diagonal_expansion=5,
                                       delete_tmp=False)
    sa_h = SignalAlignment(**merge_dicts([args, {'in_fast5': cls.tmp_rna_file1}]))
    sa_h.run()
    sa_h = SignalAlignment(**merge_dicts([args, {'in_fast5': cls.tmp_rna_file2}]))
    sa_h.run()

    args = create_signalAlignment_args(destination=cls.tmp_directory,
                                       in_templateHmm=cls.dna_model_file_94,
                                       alignment_file=cls.dna_sam,
                                       forward_reference=ecoli_dna_reference,
                                       embed=True,
                                       path_to_bin=cls.bin_path,
                                       diagonal_expansion=10,
                                       traceBackDiagonals=100,
                                       constraint_trim=3)
    sa_h = SignalAlignment(**merge_dicts([args, {'in_fast5': cls.tmp_dna_file}]))
    sa_h.run()
    sa_h = SignalAlignment(**merge_dicts([args, {'in_fast5': cls.tmp_dna_file2}]))
    sa_h.run()

    cls.dna_handle = CreateLabels(cls.tmp_dna_file, kmer_index=cls.kmer_index)
    cls.dna_handle2 = CreateLabels(cls.tmp_dna_file2, kmer_index=cls.kmer_index)
    cls.rna1_handle = CreateLabels(cls.tmp_rna_file1, kmer_index=cls.kmer_index)
    cls.rna2_handle = CreateLabels(cls.tmp_rna_file2, kmer_index=cls.kmer_index)
    cls.rev_comp = ReverseComplement()

    cls.tmp_dna_file3 = os.path.join(
        cls.HOME,
        "tests/minion_test_reads/embedded_files/miten_PC_20160820_FNFAD20259_MN17223_sequencing_run_AMS_158_R9_WGA_Ecoli_08_20_16_43623_ch100_read2324_strand.fast5")
    cls.dna3_handle = CreateLabels(cls.tmp_dna_file3, kmer_index=cls.kmer_index)