def makeNanoporeRead(f5_path):
    """Write one fast5 as a gzipped NanoporeRead file and deliver it to the readstore.

    The names ``job``, ``workdir`` and ``readstore_dir`` are not parameters of
    this function; they must be provided by the surrounding scope.

    :param f5_path: path to the fast5 file to load
    :return: ``(read_label, ledger_line)`` on success; ``None`` when the read
        fails to initialize, fails to write, or cannot be delivered
    """
    # load the NanoporeRead and write it to a local temp file
    nanopore_read = NanoporeRead(fast_five_file=f5_path, twoD=False)  # make this a config arg
    if not nanopore_read.Initialize(job):
        return None
    read_label = nanopore_read.read_label
    temp_path = job.fileStore.getLocalTempFile()
    # the context manager closes the handle even when Write raises
    with open(temp_path, "w") as out_handle:
        ok = nanopore_read.Write(job, out_handle, initialize=False)
    if not ok:
        return None

    # then gzip it, deliver it to the readstore and return the ledger line
    gz_file = LocalFile(workdir=workdir, filename="%s.np.gz" % read_label)
    with open(temp_path, "rb") as raw_handle, \
            gzip.open(gz_file.fullpathGetter(), "wb") as gz_handle:
        shutil.copyfileobj(raw_handle, gz_handle)
    try:
        deliverOutput(job, gz_file, readstore_dir)
    except RuntimeError:
        job.fileStore.logToMaster("[makeNanoporeReadsJobFunction]Read %s failed to upload" % read_label)
        return None
    return (read_label, "%s%s\n" % (readstore_dir, gz_file.filenameGetter()))
def __init__(self, full_data, variants, read_name, forward_mapped):
    """Store the inputs used to marginalize posterior probabilities per position and per read

    :param full_data: full alignment data, indexed here by column name and by boolean mask.
        Documented columns: ['contig', 'reference_index', 'reference_kmer', 'read_file', 'strand',
        'event_index', 'event_mean', 'event_noise', 'event_duration', 'aligned_kmer',
        'scaled_mean_current', 'scaled_noise', 'posterior_probability', 'descaled_event_mean',
        'ont_model_mean', 'path_kmer']
    :param variants: bases to track probabilities (kept sorted)
    :param read_name: name stored as ``self.read_name``
    :param forward_mapped: value stored as ``self.forward_mapped``
    """
    self.read_name = read_name
    self.full_data = full_data
    # rows whose reference kmer contains the character "X"
    contains_x = ["X" in kmer for kmer in self.full_data["reference_kmer"]]
    self.variant_data = self.full_data[contains_x]
    self.variants = sorted(variants)
    self.forward_mapped = forward_mapped

    # output layout for the per position table
    position_prefix = ['read_name', 'contig', 'position', 'strand', 'forward_mapped']
    self.columns = merge_lists([position_prefix, list(self.variants)])

    # contig is taken from the first row of the full data
    first_contig = self.full_data["contig"][0]
    self.contig = NanoporeRead.bytes_to_string(first_contig)

    self.position_probs = pd.DataFrame()
    self.has_data = False
    self.per_read_calls = pd.DataFrame()

    # output layout for the per read table
    read_prefix = ['read_name', 'contig', 'strand', "forward_mapped", "n_sites"]
    self.per_read_columns = merge_lists([read_prefix, list(self.variants)])
def organize_fast5s(fast5_locations):
    """Map fast5 file ids to read ids and collect the files that failed to initialize.

    :param fast5_locations: iterable of fast5 file paths
    :return: tuple ``(fast5_to_read_id, requires_event_calling)`` where the first
        maps the file basename minus its last six characters (the ``.fast5``
        suffix) to the read label, and the second is a list of
        ``(fast5_path, read_id)`` for every read whose ``Initialize()`` was falsy
    """
    # gathered data
    fast5_to_read_id = dict()
    requires_event_calling = list()

    # examine each fast5
    for fast5 in fast5_locations:
        npr = NanoporeRead(fast5)
        try:
            success = npr.Initialize()
            read_id = npr.read_label
        finally:
            # always release the fast5 handle, even when initialization raises
            npr.close()
        fast5_id = os.path.basename(fast5)[:-6]
        fast5_to_read_id[fast5_id] = read_id
        if not success:
            requires_event_calling.append((fast5, read_id))

    return fast5_to_read_id, requires_event_calling
def organize_fast5s(fast5_locations, realign_all=False):
    """Map fast5 file ids to read ids and collect the files needing event calling.

    :param fast5_locations: iterable of fast5 file paths
    :param realign_all: when True every fast5 is queued for event calling,
        regardless of whether its metadata initialized
    :return: tuple ``(fast5_to_read_id, requires_event_calling)`` where the first
        maps the file basename minus its last six characters (the ``.fast5``
        suffix) to the read label, and the second is a list of
        ``(fast5_path, read_id)`` tuples
    """
    # gathered data
    fast5_to_read_id = dict()
    requires_event_calling = list()

    # examine each fast5
    for fast5 in fast5_locations:
        npr = NanoporeRead(fast5, perform_kmer_event_alignment=False)
        try:
            success = npr._initialize_metadata()
            read_id = npr.read_label
        finally:
            # always release the fast5 handle, even when initialization raises
            npr.close()
        fast5_id = os.path.basename(fast5)[:-6]
        fast5_to_read_id[fast5_id] = read_id
        if not success or realign_all:
            requires_event_calling.append((fast5, read_id))

    return fast5_to_read_id, requires_event_calling
def test_run_kmeralign_exe(self):
    """run_kmeralign_exe reports success and writes events at the destination path."""
    path_to_bin = os.path.join(self.HOME, "bin")
    rna_fast5_path = os.path.abspath(self.tmp_rna_file2)
    nuc_sequence = "CAUCCUGCCCUGUGUUAUCCAGUUAUGAGAUAAAAAAUGAAUAUAAGAGUGCUUGUCAUUAUAAAAGUUUUCCUUUUUAUUACCAUCCAAGCCACCAGCUGCCAGCCACCAGCAGCCAGCUGCCAGCACUAGCUUUUUUUUUUUAGCACUUAGUAUUUAGCAGCAUUUAUUAACAGGUACUUUAAGAAUGAUGAAGCAUUGUUUUAAUCUCACUGACUAUGAAGGUUUUAGUUUCUGCUUUUGCAAUUGUGUUUGUGAAAUUUGAAUACUUGCAGGCUUUGUAUGUGAAUAAUUUUAGCGGCUGGUUGGAGAUAAUCCUACGGGAAUUACUUAAAACUGUGCUUUAACUAAAAUGAAUGAGCUUUAAAAUCCCUCCUCCUACUCCAUCAUCAUCCCACUAUUCAUCUUAUCUCAUUAUCAUCAACCUAUCCCACAUCCCUAUCACCACAGCAAUCCAA"
    rna_model_file = self.rna_model_file
    dest = "/Analyses/SignalAlign_Basecall_1D_001/BaseCalled_template"

    np_handle = NanoporeRead(os.path.abspath(self.tmp_rna_file3))
    try:
        np_handle._initialize_metadata()
        # the target fast5 must not be held open while the executable runs on it
        self.rna_handle2.close()
        status = run_kmeralign_exe(rna_fast5_path, nuc_sequence, rna_model_file, dest, path_to_bin)
    finally:
        # do not leak the handle when the run raises
        np_handle.close()

    rna_handle = Fast5(self.tmp_rna_file2, 'r+')
    try:
        # copy the events into memory before the file is closed
        events = np.array(rna_handle[dest])
    finally:
        rna_handle.close()

    self.assertEqual(events[0]["raw_length"], 7)
    self.assertTrue(status)
def get_data(self):
    """Calculate the normalized probability of variant for each nucleotide and across the read

    Fills ``self.position_probs`` (one row per position for each read strand / mapping strand
    combination) and ``self.per_read_calls`` (one row per combination, averaged over its
    positions), then sets ``self.has_data``.

    :return: ``self.position_probs``
    """
    # final location of per position data and per read data
    position_rows = []
    read_rows = []
    n_variants = len(self.variants)

    for strand_key in (b"t", b"c"):
        on_read_strand = self.variant_data[self.variant_data["strand"] == strand_key]
        read_strand = strand_key.decode("utf-8")
        if len(on_read_strand) == 0:
            continue

        for forward_mapped in set(self.variant_data["forward_mapped"]):
            mapping_strand = "+" if forward_mapped == b"forward" else "-"
            subset = on_read_strand[on_read_strand["forward_mapped"] == forward_mapped]
            if len(subset) == 0:
                continue

            # get positions on strand
            positions = set(subset["reference_position"])
            n_positions = len(positions)
            read_totals = [0] * n_variants

            # marginalize probabilities for each position
            for pos in positions:
                at_position = subset[subset["reference_position"] == pos]
                total_prob = 0
                prob_by_nuc = {variant: 0.0 for variant in self.variants}

                # Get total probability for each nucleotide
                for nuc in set(at_position["base"]):
                    rows_for_nuc = at_position[at_position["base"] == nuc]
                    nuc_prob = sum(rows_for_nuc["posterior_probability"])
                    total_prob += nuc_prob
                    prob_by_nuc[NanoporeRead.bytes_to_string(nuc)] = nuc_prob

                # normalize probabilities over each position
                normalized = [0] * n_variants
                for nuc, raw_prob in prob_by_nuc.items():
                    index = self.variants.index(nuc)
                    normalized[index] = raw_prob / total_prob
                    read_totals[index] += normalized[index]

                row_prefix = [self.read_name, self.contig, pos, read_strand, mapping_strand]
                position_rows.append(merge_lists([row_prefix, normalized]))

            if n_positions > 0:
                read_prefix = [self.read_name, self.contig, read_strand, mapping_strand, n_positions]
                averaged = [prob / n_positions for prob in read_totals]
                read_rows.append(merge_lists([read_prefix, averaged]))

    self.position_probs = pd.DataFrame(position_rows, columns=self.columns)
    self.per_read_calls = pd.DataFrame(read_rows, columns=self.per_read_columns)
    self.has_data = True
    return self.position_probs
def __init__(self, variant_data, variants, read_name):
    """Store the inputs used to marginalize posterior probabilities per position and per read

    :param variant_data: variant data; its "contig" column is read here
    :param variants: bases to track probabilities (kept sorted)
    :param read_name: name stored as ``self.read_name``
    """
    self.read_name = read_name
    self.variant_data = variant_data
    self.variants = sorted(variants)

    # output layout for the per position table
    position_prefix = ['read_name', 'contig', 'position', 'strand', 'forward_mapped']
    self.columns = merge_lists([position_prefix, list(self.variants)])

    # contig is taken from the first row of the variant data
    first_contig = self.variant_data["contig"][0]
    self.contig = NanoporeRead.bytes_to_string(first_contig)

    self.position_probs = pd.DataFrame()
    self.has_data = False
    self.per_read_calls = pd.DataFrame()

    # output layout for the per read table
    read_prefix = ['read_name', 'contig', 'strand', "forward_mapped", "n_sites"]
    self.per_read_columns = merge_lists([read_prefix, list(self.variants)])
def test_load_from_raw(self):
    """load_from_raw writes events and a Fastq entry and returns the basecall location."""
    path_to_bin = os.path.join(self.HOME, "bin")
    fast5_path = os.path.abspath(self.tmp_rna_file3)
    alignment_file = os.path.join(
        self.HOME, "tests/minion_test_reads/RNA_edge_cases/rna_reads.sam")

    np_handle = NanoporeRead(fast5_path)
    try:
        np_handle._initialize_metadata()
        saved_location = load_from_raw(np_handle, alignment_file,
                                       self.rna_model_file, path_to_bin)
    finally:
        # close even when loading raises so the temp fast5 is not left open
        np_handle.close()

    # reopen, then get events and validate
    np_handle = NanoporeRead(fast5_path)
    try:
        events = np.array(
            np_handle.fastFive["/Analyses/Basecall_1D_001/BaseCalled_template/Events"])
        self.assertEqual(events[0]["raw_length"], 11)
        self.assertIn("/Analyses/Basecall_1D_001/BaseCalled_template/Fastq",
                      np_handle.fastFive)
    finally:
        np_handle.close()
    self.assertEqual(saved_location, "/Analyses/Basecall_1D_001")
def check_alignments(self, true_alignments, reads, reference, kmer_length, contig_name, extra_args=None, rna=False):
    """Run signalAlign over a directory of fast5 reads and sanity-check its output.

    Verifies that the command exits with 0, that one ``.tsv`` alignment is produced per
    input fast5, that every observed kmer equals the reference kmer at its position
    (reversed when ``rna`` is set), and that each alignment has between 1x and 3x the
    read's template event count.

    Side effect: changes the working directory to ``BIN_PATH``.

    :param true_alignments: no longer used; see the TODO below
    :param reads: directory searched for ``*.fast5`` files
    :param reference: path to the reference fasta
    :param kmer_length: length of the kmers sliced from the reference
    :param contig_name: key of the contig in the parsed reference
    :param extra_args: optional string appended verbatim to the command line
    :param rna: when True the expected kmer is reversed before comparison
    """
    # TODO remove this from the framework and code
    # Deliberately overwritten: the replacement raises ZeroDivisionError when called.
    # The comparison against expected alignment files in this directory is disabled.
    true_alignments = lambda x: 1 / 0

    input_fast5s = glob.glob(os.path.join(reads, "*.fast5"))
    assert len(input_fast5s) > 0, "Didn't find test MinION reads"
    assert os.path.isfile(reference), "Didn't find reference sequence"

    # it's this or rewrite all the relative locations of the files
    os.chdir(BIN_PATH)

    # prep command
    run_signal_align = os.path.join(BIN_PATH, "runSignalAlign")
    # removed: --debug
    alignment_command = "{runsignalalign} run2 -d={reads} --bwa_reference={ref} -smt=threeState -o={testDir} " \
                        "".format(runsignalalign=run_signal_align, reads=reads, ref=reference,
                                  testDir="./signalAlign_unittest/")
    if extra_args is not None:
        alignment_command += extra_args

    # run signalAlign
    result = call(alignment_command, shell=True, bufsize=-1)
    self.assertEqual(
        result, 0,
        "Error running signalAlign. Command was {}".format(alignment_command))

    # get alignments
    test_alignments = glob.glob(
        "./signalAlign_unittest/tempFiles_alignment/*.tsv")
    self.assertEqual(
        len(test_alignments), len(input_fast5s),
        "Didn't make all alignments got {got} should be {should}".format(
            got=len(test_alignments), should=len(input_fast5s)))

    # prep for verification (loaded after the chdir, as before)
    reference_sequence = getFastaDictionary(reference)[contig_name]

    def get_kmer(start):
        # reference slice as str, decoding when the sequence is stored as bytes
        kmer = reference_sequence[start:start + kmer_length]
        if isinstance(kmer, str):
            return kmer
        return bytes.decode(kmer)

    # map every output alignment to the template event count of its source read
    alignment2events = dict()
    for fast5 in input_fast5s:
        with closing(NanoporeRead(fast5, initialize=True)) as read:
            event_count = len(read.get_template_events())
            read_id = read.read_label
        self.assertGreater(
            event_count, 0,
            "Got no events for fast5 {} with read_id {}".format(fast5, read_id))
        for alignment in test_alignments:
            if os.path.basename(alignment).startswith(read_id):
                self.assertNotIn(
                    alignment, alignment2events,
                    "Fast5 {} matched read_id {} with multiple output alignments"
                    .format(fast5, read_id))
                alignment2events[alignment] = event_count

    for alignment in test_alignments:
        # fail with a readable message instead of a bare KeyError further down
        self.assertIn(
            alignment, alignment2events,
            "Output alignment {} did not match any input fast5".format(alignment))
        obs = parse_alignment_full(alignment)
        for row in obs.itertuples():
            ref_pos = row[1]
            obs_kmer = row[2]
            strand = row[3]
            exp_kmer = get_kmer(ref_pos)
            if rna:
                exp_kmer = exp_kmer[::-1]
            self.assertEqual(
                obs_kmer, exp_kmer,
                msg="kmer at index {idx} on strand {strand} is {obs} "
                    "should be {exp}, file {f}".format(idx=ref_pos, strand=strand,
                                                       obs=obs_kmer, exp=exp_kmer,
                                                       f=alignment))
        signal_align_event_count = len(obs)
        initial_event_count = alignment2events[alignment]
        self.assertGreaterEqual(
            signal_align_event_count, initial_event_count,
            "SignalAlign produced {} events, less than inital count {}".format(
                signal_align_event_count, initial_event_count))
        # this is a magic number
        self.assertLessEqual(
            signal_align_event_count, initial_event_count * 3,
            "SignalAlign produced {} events, more than 3x the initial count {}"
            .format(signal_align_event_count, initial_event_count))
def run(self):
    """Align one fast5 read with the signalMachine executable.

    Steps, in order: write the NanoporeRead temp file, write the nucleotide read,
    obtain a guide alignment (from ``self.alignment_file`` first, then BWA) and
    write its cigar, assemble the command-line flags, run the executable, and
    optionally embed the result into the fast5.

    :return: True on success; False after a handled failure (missing fast5, failed
        initial checks, no or invalid guide alignment, unexpected strand, missing
        complement HMM for 2D, illegal output format)
    :raises Exception: any exception raised while running the executable is printed
        and re-raised
    """
    print("[SignalAlignment.run] INFO: Starting on {read}".format(
        read=self.in_fast5))
    # NOTE(review): nesting of the complement assert under get_expectations is
    # inferred from its message ("for model training") - confirm.
    if self.get_expectations:
        assert self.in_templateHmm is not None, "Need template HMM files for model training"
        if self.twoD_chemistry:
            assert self.in_complementHmm is not None, "Need compement HMM files for model training"
    if not os.path.isfile(self.in_fast5):
        print("[SignalAlignment.run] ERROR: Did not find .fast5 at{file}".
              format(file=self.in_fast5))
        return False

    # prep
    self.openTempFolder("tempFiles_%s" % self.read_name)
    if self.twoD_chemistry:
        npRead = NanoporeRead2D(fast_five_file=self.in_fast5, event_table=self.event_table, initialize=True)
    else:
        npRead = NanoporeRead(fast_five_file=self.in_fast5, event_table=self.event_table, initialize=True)

    # TODO need to validate / generate events and nucleotide read

    # read label
    read_label = npRead.read_label  # use this to identify the read throughout
    self.read_label = read_label

    # nanopore read (event table, etc)
    # the temp file is regenerated unless reuse is enabled and it already exists
    npRead_ = self.addTempFilePath("temp_%s.npRead" % self.read_name)
    if not (self.check_for_temp_file_existance and os.path.isfile(npRead_)):
        # TODO is this broken for RNA because of 3'-5' mapping?
        fH = open(npRead_, "w")
        ok = npRead.Write(out_file=fH, initialize=True)
        fH.close()
        if not ok:
            self.failStop(
                "[SignalAlignment.run] File: %s did not pass initial checks" %
                self.read_name, npRead)
            return False

    # nucleotide read (a failure here is reported but does not stop the run)
    read_fasta_ = self.addTempFilePath("temp_seq_%s.fa" % read_label)
    ok = self.write_nucleotide_read(npRead, read_fasta_)
    if not ok:
        print(
            "[SignalAlignment.run] Failed to write nucleotide read. Continuing execution."
        )

    # alignment info
    cigar_file_ = self.addTempFilePath("temp_cigar_%s.txt" % read_label)
    temp_samfile_ = self.addTempFilePath("temp_sam_file_%s.sam" % read_label)
    strand = None
    reference_name = None
    if not (self.check_for_temp_file_existance and os.path.isfile(cigar_file_)):
        # need guide alignment to generate cigar file
        guide_alignment = None

        # get from alignment file
        if self.alignment_file is not None:
            guide_alignment = getGuideAlignmentFromAlignmentFile(
                self.alignment_file, read_name=read_label)
            if guide_alignment is None:
                print(
                    "[SignalAlignment.run] read {} not found in {}".format(
                        read_label, self.alignment_file))

        # get from bwa (only when the alignment file gave nothing)
        if guide_alignment is None and self.bwa_reference is not None:
            guide_alignment = generateGuideAlignment(
                reference_fasta=self.bwa_reference,
                query=read_fasta_,
                temp_sam_path=temp_samfile_,
                target_regions=self.target_regions)
            if guide_alignment is None:
                print(
                    "[SignalAlignment.run] read {} could not be aligned with BWA"
                    .format(read_label))

        # could not map
        if guide_alignment is None:
            self.failStop(
                "[SignalAlignment.run] ERROR getting guide alignment",
                npRead)
            return False

        # ensure valid
        if not guide_alignment.validate():
            self.failStop(
                "[SignalAlignment.run] ERROR invalid guide alignment",
                npRead)
            return False
        strand = guide_alignment.strand
        reference_name = guide_alignment.reference_name

        # write cigar to file
        cig_handle = open(cigar_file_, "w")
        cig_handle.write(guide_alignment.cigar + "\n")
        cig_handle.close()

    # otherwise, get strand from file
    else:
        strand, reference_name = getInfoFromCigarFile(cigar_file_)

    # add an indicator for the model being used
    if self.stateMachineType == "threeState":
        model_label = ".sm"
        stateMachineType_flag = ""
    elif self.stateMachineType == "threeStateHdp":
        model_label = ".sm3Hdp"
        stateMachineType_flag = "--sm3Hdp "
        if self.twoD_chemistry:
            assert (self.in_templateHdp is not None) and (self.in_complementHdp is not None), "Need to provide HDPs"
        else:
            assert self.in_templateHdp is not None, "Need to provide Template HDP"
    else:  # make invalid stateMachine control?
        # any other state machine type silently uses the threeState settings
        model_label = ".sm"
        stateMachineType_flag = ""

    # next section makes the output file name with the format: /directory/for/files/file.model.orientation.tsv
    # only the "full" format encodes the orientation in the file name
    # forward strand
    if strand == "+":
        if self.output_format == "full":
            posteriors_file_path = os.path.join(
                self.destination,
                read_label + model_label + ".forward.tsv")
        elif self.output_format == "variantCaller":
            posteriors_file_path = os.path.join(
                self.destination, read_label + model_label + ".tsv")
        else:
            posteriors_file_path = os.path.join(
                self.destination,
                read_label + model_label + ".assignments.tsv")

    # backward strand
    elif strand == "-":
        if self.output_format == "full":
            posteriors_file_path = os.path.join(
                self.destination,
                read_label + model_label + ".backward.tsv")
        elif self.output_format == "variantCaller":
            posteriors_file_path = os.path.join(
                self.destination, read_label + model_label + ".tsv")
        else:
            posteriors_file_path = os.path.join(
                self.destination,
                read_label + model_label + ".assignments.tsv")

    # sanity check
    else:
        self.failStop(
            "[SignalAlignment.run] ERROR Unexpected strand {}".format(
                strand), npRead)
        return False

    # flags

    # input (match) models; missing models fall back to a default chosen by read version
    if self.in_templateHmm is None:
        self.in_templateHmm = defaultModelFromVersion(
            strand="template", version=npRead.version)
    if self.twoD_chemistry and self.in_complementHmm is None:
        pop1_complement = npRead.complement_model_id == "complement_median68pA_pop1.model"
        self.in_complementHmm = defaultModelFromVersion(
            strand="complement",
            version=npRead.version,
            pop1_complement=pop1_complement)

    assert self.in_templateHmm is not None
    if self.twoD_chemistry:
        if self.in_complementHmm is None:
            self.failStop(
                "[SignalAlignment.run] ERROR Need to have complement HMM for 2D analysis",
                npRead)
            return False

    template_model_flag = "-T {} ".format(self.in_templateHmm)
    if self.twoD_chemistry:
        complement_model_flag = "-C {} ".format(self.in_complementHmm)
    else:
        complement_model_flag = ""

    print(
        "[SignalAlignment.run] NOTICE: template model {t} complement model {c}"
        "".format(t=self.in_templateHmm, c=self.in_complementHmm))

    # reference sequences
    assert os.path.isfile(self.forward_reference)
    forward_ref_flag = "-f {f_ref} ".format(f_ref=self.forward_reference)
    if self.backward_reference:
        assert os.path.isfile(self.backward_reference)
        backward_ref_flag = "-b {b_ref} ".format(
            b_ref=self.backward_reference)
    else:
        backward_ref_flag = ""

    # input HDPs
    # NOTE(review): when only the complement HDP is set this emits "-v None " - confirm intended
    if (self.in_templateHdp is not None) or (self.in_complementHdp is not None):
        hdp_flags = "-v {tHdp_loc} ".format(tHdp_loc=self.in_templateHdp)
        if self.twoD_chemistry and self.in_complementHdp is not None:
            hdp_flags += "-w {cHdp_loc} ".format(
                cHdp_loc=self.in_complementHdp)
    else:
        hdp_flags = ""

    # threshold
    if self.threshold is not None:
        threshold_flag = "-D {threshold} ".format(threshold=self.threshold)
    else:
        threshold_flag = ""

    # diagonal expansion
    if self.diagonal_expansion is not None:
        diag_expansion_flag = "-x {expansion} ".format(
            expansion=self.diagonal_expansion)
    else:
        diag_expansion_flag = ""

    # constraint trim
    if self.constraint_trim is not None:
        trim_flag = "-m {trim} ".format(trim=self.constraint_trim)
    else:
        trim_flag = ""

    # output format
    # NOTE(review): unlike the other failStop calls, this one does not pass npRead - confirm
    if self.output_format not in list(self.output_formats.keys()):
        self.failStop(
            "[SignalAlignment.run] ERROR illegal output format selected %s"
            % self.output_format)
        return False
    out_fmt = "-s {fmt} ".format(
        fmt=self.output_formats[self.output_format])

    # degenerate nucleotide information
    if self.degenerate is not None:
        degenerate_flag = "-o {} ".format(self.degenerate)
    else:
        degenerate_flag = ""

    # twoD flag
    if self.twoD_chemistry:
        twoD_flag = "--twoD"
    else:
        twoD_flag = ""

    # commands
    # expectations mode writes template/complement expectation files (-t/-c);
    # otherwise the posteriors file is written (-u)
    if self.get_expectations:
        template_expectations_file_path = os.path.join(
            self.destination, read_label + ".template.expectations.tsv")
        complement_expectations_file_path = os.path.join(
            self.destination, read_label + ".complement.expectations.tsv")

        command = \
            "{vA} {td} {degen}{sparse}{model} -q {npRead} " \
            "{t_model}{c_model}{thresh}{expansion}{trim} {hdp}-L {readLabel} -p {cigarFile} " \
            "-t {templateExpectations} -c {complementExpectations} -n {seq_name} {f_ref_fa} {b_ref_fa}" \
            .format(vA=self.path_to_signalMachine, model=stateMachineType_flag,
                    cigarFile=cigar_file_,
                    npRead=npRead_, readLabel=read_label, td=twoD_flag,
                    templateExpectations=template_expectations_file_path, hdp=hdp_flags,
                    complementExpectations=complement_expectations_file_path, t_model=template_model_flag,
                    c_model=complement_model_flag, thresh=threshold_flag, expansion=diag_expansion_flag,
                    trim=trim_flag, degen=degenerate_flag, sparse=out_fmt, seq_name=reference_name,
                    f_ref_fa=forward_ref_flag, b_ref_fa=backward_ref_flag)
    else:
        command = \
            "{vA} {td} {degen}{sparse}{model} -q {npRead} " \
            "{t_model}{c_model}{thresh}{expansion}{trim} -p {cigarFile} " \
            "-u {posteriors} {hdp}-L {readLabel} -n {seq_name} {f_ref_fa} {b_ref_fa}" \
            .format(vA=self.path_to_signalMachine, model=stateMachineType_flag, sparse=out_fmt,
                    cigarFile=cigar_file_,
                    readLabel=read_label, npRead=npRead_, td=twoD_flag,
                    t_model=template_model_flag, c_model=complement_model_flag,
                    posteriors=posteriors_file_path, thresh=threshold_flag, expansion=diag_expansion_flag,
                    trim=trim_flag, hdp=hdp_flags, degen=degenerate_flag, seq_name=reference_name,
                    f_ref_fa=forward_ref_flag, b_ref_fa=backward_ref_flag)

    # run
    print("[SignalAlignment.run] running command: ", command, end="\n")
    try:
        # the command is split on whitespace, so paths containing spaces would break it
        command = command.split()

        if self.track_memory_usage:
            mem_command = ['/usr/bin/time', '-f', '\\nDEBUG_MAX_MEM:%M\\n']
            print(
                "[SignalAlignment.run] Prepending command to track mem usage: {}"
                .format(mem_command))
            mem_command.extend(command)
            command = mem_command

        output = subprocess.check_output(command, stderr=subprocess.STDOUT)
        # str() of the captured output is split on the two-character sequence
        # backslash + "n", not on a real newline
        output = str(output).split("\\n")
        for line in output:
            print("[SignalAlignment.run] {}: {}".format(
                read_label, line))
            if line.startswith("DEBUG_MAX_MEM"):
                self.max_memory_usage_kb = int(line.split(":")[1])

    except Exception as e:
        print(
            "[SignalAlignment.run] exception ({}) running signalAlign: {}".
            format(type(e), e))
        raise e

    # save to fast5 file (if appropriate)
    if self.embed:
        print("[SignalAlignment.run] embedding into Fast5 ")

        data = self.read_in_signal_align_tsv(posteriors_file_path,
                                             file_type=self.output_format)
        # a fresh NanoporeRead replaces the one created at the top of this method
        npRead = NanoporeRead(fast_five_file=self.in_fast5,
                              twoD=self.twoD_chemistry,
                              event_table=self.event_table)
        npRead.Initialize(None)
        signal_align_path = npRead.get_latest_basecall_edition(
            "/Analyses/SignalAlign_00{}", new=False)
        assert signal_align_path, "There is no path in Fast5 file: {}".format(
            "/Analyses/SignalAlign_00{}")
        output_path = npRead._join_path(signal_align_path,
                                        self.output_format)
        npRead.write_data(data, output_path)

        # Todo add attributes to signalalign output
        # NOTE(review): the extent of this "full" branch (in particular whether the
        # sam embedding below belongs to it) was inferred - confirm against the
        # original layout.
        if self.output_format == "full":
            print(
                "[SignalAlignment.run] writing maximum expected alignment "
            )
            alignment = mea_alignment_from_signal_align(None, events=data)
            mae_path = npRead._join_path(signal_align_path,
                                         "MEA_alignment_labels")
            events = npRead.get_template_events()
            # NOTE(review): relies on the truthiness of `events`; verify this is
            # well-defined for the type get_template_events returns
            if events:
                if strand == "-":
                    minus = True
                else:
                    minus = False
                labels = match_events_with_signalalign(
                    sa_events=alignment,
                    event_detections=np.asanyarray(npRead.template_events),
                    minus=minus,
                    rna=npRead.is_read_rna())
                npRead.write_data(labels, mae_path)
            # embed the temp sam file contents (empty string when the file is absent)
            sam_string = str()
            if os.path.isfile(temp_samfile_):
                with open(temp_samfile_, 'r') as test:
                    for line in test:
                        sam_string += line
            sam_path = npRead._join_path(signal_align_path, "sam")
            # print(sam_string)
            npRead.write_data(data=sam_string,
                              location=sam_path,
                              compression=None)

    # self.temp_folder.remove_folder()
    return True
def fast5_file_organization_service(work_queue, done_queue, output_dir_count, output_base, output_index_base, copy_files, service_name="fast5_file_organization"):
    """Worker loop that distributes fast5 files over output directories and indexes them.

    Items are taken from ``work_queue`` until the ``'STOP'`` sentinel. Each file is
    moved (or copied) into one of ``output_dir_count`` directories in round-robin
    order, and a ``destination<TAB>read_id<TAB>run_id`` line is appended to that
    directory's index file. Per-item errors are reported on ``done_queue`` and
    counted; the totals are always put on ``done_queue`` at the end.

    :param work_queue: queue of work items; ``item[FAST5_SRC_LOCATION]`` is the source path
    :param done_queue: queue receiving error messages and the final totals
    :param output_dir_count: number of output directories / index files
    :param output_base: passed to ``get_output_directory`` with the directory index
    :param output_index_base: prefix of the index file names
    :param copy_files: copy the files when truthy, otherwise move them
    :param service_name: label used in log and error messages
    """
    # prep
    total_handled = 0
    failure_count = 0
    name = current_process().name
    index_files = {}

    # catch overall exceptions
    try:
        # each process and output dir gets its own index file; the header is only
        # written when the file does not exist yet
        for i in range(output_dir_count):
            index_file = "{}{}_{}.tsv".format(output_index_base, i, name)
            write_header = not os.path.isfile(index_file)
            index_files[i] = open(index_file, 'a')
            if write_header:
                index_files[i].write("##{}:{}\n".format(
                    FAST5_ROOT, get_output_directory(output_base, i)))
                index_files[i].write("#{}\t{}\t{}\n".format(
                    FAST5_LOCATION, READ_ID, RUN_NAME))
        assert len(index_files) == output_dir_count, \
            "unexpected count of index files {} (expected {})".format(
                len(index_files), output_dir_count)

        idx = -1
        for f in iter(work_queue.get, 'STOP'):
            # place files in the output directories in round-robin order
            idx = (idx + 1) % output_dir_count
            read = None
            try:
                # sanity check
                assert 0 <= idx < output_dir_count

                # file organization
                destination_dir = get_output_directory(output_base, idx)
                source = f[FAST5_SRC_LOCATION]
                filename = os.path.basename(source)
                destination = os.path.join(destination_dir, filename)
                action = shutil.copy if copy_files else shutil.move

                # fast5 organization
                read = NanoporeRead(source)
                if not read._initialize_metadata():
                    failure_count += 1
                    continue
                read_id = read.read_label
                run_id = read.run_id
                assert None not in [read_id, run_id], \
                    "Missing read or run id for {}".format(source)

                # release the fast5 handle before the file is moved or copied
                read.close()
                read = None

                # move or copy the file
                action(source, destination)

                # write the contents to the index
                index_files[idx].write("{}\t{}\t{}\n".format(
                    destination, read_id, run_id))

            except Exception as e:
                # get error and log it
                message = "{}:{}".format(type(e), str(e))
                error = "{} '{}' failed with: {}".format(
                    service_name, current_process().name, message)
                print("[{}] ".format(service_name) + error)
                done_queue.put(error)
                failure_count += 1

            finally:
                # increment total handling
                total_handled += 1
                # close the handle on every early exit (continue / exception)
                if read is not None:
                    read.close()

    except Exception as e:
        # get error and log it
        message = "{}:{}".format(type(e), str(e))
        error = "{} '{}' critically failed with: {}".format(
            service_name, current_process().name, message)
        print("[{}] ".format(service_name) + error)
        done_queue.put(error)

    finally:
        # close all index files
        for index_file in index_files.values():
            if index_file is not None:
                index_file.close()

        # logging and final reporting
        print("[%s] '%s' completed %d calls with %d failures" %
              (service_name, current_process().name, total_handled,
               failure_count))
        done_queue.put("{}:{}".format(TOTAL_KEY, total_handled))
        done_queue.put("{}:{}".format(FAILURE_KEY, failure_count))
def get_alignment_summary_info(fast5s, alignment_file, pass_threshold=7, gap_size=10, verbose=False, max_reads=100, number=0):
    """Filter fast5 files based on a quality threshold and whether there is an alignment

    :param fast5s: iterable of fast5 paths; every path must exist
    :param alignment_file: alignment file, opened in 'rb' mode when the name ends with "bam", else 'r'
    :param pass_threshold: average q score compared against for the filter and the "pass" flag
    :param gap_size: passed to ``flag_large_gaps``
    :param verbose: passed to ``flag_large_gaps``
    :param max_reads: iteration stops once more than this many distinct reads were seen
    :param number: cast to int and passed to the label helpers
    :return: the rows of the summary table whose "seen" column equals 1
    """
    # collect for every read
    # defaultdict() without a factory raises KeyError on missing keys like a plain dict
    fast5_dict = defaultdict()
    # loop through fast5s
    for fast5_path in fast5s:
        assert os.path.exists(fast5_path), "fast5 path does not exist: {}".format(fast5_path)
        # NOTE(review): f5h is never closed in this function
        f5h = NanoporeRead(fast5_path)
        f5h._initialize_metadata()
        read_name = f5h.read_label
        fast5_dict[read_name] = fast5_path
    print("Created read_id to fast5_path mapping")
    # summary data stored here
    mapped_reads = get_summary_info_table(list(fast5_dict.keys()))
    # grab aligned segment
    seen_counter = 0
    reads_seen = set()
    print("first_len reads_seen: {}".format(len(reads_seen)), file=sys.stderr)

    with closing(pysam.AlignmentFile(alignment_file, 'rb' if alignment_file.endswith("bam") else 'r')) as aln:
        for aligned_segment in aln.fetch(until_eof=True):
            # strict ">" lets up to max_reads + 1 distinct reads through
            if seen_counter > max_reads:
                break
            try:
                print("reads_seen: {}".format(len(reads_seen)), file=sys.stderr)
                # read name is the part of the query name before the first "_"
                read_name = aligned_segment.qname.split("_")[0]
                # a read without a known fast5 raises KeyError, reported by the outer except
                fast5_path = fast5_dict[read_name]
                # NOTE(review): the extent of this first-time-seen block was inferred from
                # the collapsed source. As laid out here, cl_handle and q_score_average keep
                # the values of the most recently *new* read for later records - confirm.
                if read_name not in reads_seen:
                    reads_seen |= {read_name}
                    seen_counter += 1
                    mapped_reads["seen"][read_name] = 1
                    print(fast5_path)
                    cl_handle = CreateLabels(fast5_path, kmer_index=2)
                    seq_start_time = cl_handle.raw_attributes['start_time']
                    q_score_average = 0
                    if aligned_segment.query_qualities is None:
                        print("Alignment done with fasta instead of fastq so read qualities will not be reported")
                    else:
                        q_score_average = np.mean(aligned_segment.query_qualities)
                    mapped_reads["q_score_average"][read_name] = q_score_average
                    mapped_reads["seq_start_time"][read_name] = seq_start_time

                # records that are secondary, unmapped, chimeric or below the quality
                # threshold are only counted; all others get the full analysis
                if aligned_segment.is_secondary or aligned_segment.is_unmapped \
                        or aligned_segment.is_supplementary or aligned_segment.has_tag("SA") \
                        or q_score_average < pass_threshold:
                    if aligned_segment.is_secondary:
                        mapped_reads["num_secondary_mappings"][read_name] += 1
                    if aligned_segment.is_unmapped:
                        mapped_reads["no_mapping"][read_name] = 1
                    if aligned_segment.is_supplementary or aligned_segment.has_tag("SA"):
                        mapped_reads["chimera_mapping"][read_name] += 1
                else:
                    mapped_reads["map_q"][read_name] = aligned_segment.mapq
                    # fraction of the query sequence outside the aligned portion
                    soft_clipped_percentage = \
                        1 - float(len(aligned_segment.query_alignment_sequence)) / len(aligned_segment.query_sequence)
                    mapped_reads["soft_clipped_percentage"][read_name] = soft_clipped_percentage
                    handle = AlignmentSegmentWrapper(aligned_segment)
                    handle.initialize()
                    accuracy = handle.alignment_accuracy()
                    mapped_reads["basecalled_accuracy"][read_name] = accuracy
                    try:
                        mea = cl_handle.add_mea_labels(number=int(number))
                        sa_full = cl_handle.add_signal_align_predictions(number=int(number), add_basecall=True)
                        # gather every prediction whose name contains "guide"
                        all_basecall_data = []
                        for name, basecall_data in cl_handle.aligned_signal.prediction.items():
                            if "guide" in name:
                                all_basecall_data.extend(basecall_data)
                        alignment_summary = analyze_event_skips(mea, sa_full, all_basecall_data, generate_plot=False)
                        flagged_gaps_summary = flag_large_gaps(alignment_summary, gap_size, verbose=verbose)
                        counter = 0
                        total_distance = 0
                        # the threshold of 10 here is hard-coded and independent of gap_size
                        for gap in flagged_gaps_summary:
                            if gap["mea_peak_distance"] > 10:
                                counter += 1
                                total_distance += gap["mea_peak_distance"]
                        if counter > 0:
                            mapped_reads["num_flagged_gaps"][read_name] = counter
                            mapped_reads["avg_flagged_gap_size"][read_name] = float(total_distance) / counter
                        # NOTE(review): placement of this check relative to the
                        # `counter > 0` branch was inferred - confirm.
                        if mapped_reads["q_score_average"][read_name] > pass_threshold:
                            mapped_reads["pass"][read_name] = 1

                    except KeyError:
                        mapped_reads["other_errors"][read_name] = 1
            except Exception as e:
                # best effort: report the problem and continue with the next record
                print(e, file=sys.stderr)

    return mapped_reads[mapped_reads["seen"] == 1]
def event_detection(work_queue, done_queue, alignment_file, model_file_location, event_detection_strategy=None, event_detection_params=None, tmp_directory=None, write_failed_alignments=True, service_name="event_detection"):
    """Worker loop that runs ``load_from_raw`` on every fast5 taken from ``work_queue``.

    Items are consumed until the ``'STOP'`` sentinel. Failures on a single item are
    reported on ``done_queue`` and counted; an exception outside the per-item
    handling ends the loop and is reported as a critical failure. The totals are
    always put on ``done_queue`` at the end.

    :param work_queue: queue of work items; ``item['fast5']`` unpacks to ``(fast5, read_id)``
    :param done_queue: queue receiving error messages and the final totals
    :param alignment_file: passed to ``load_from_raw``
    :param model_file_location: passed to ``load_from_raw``
    :param event_detection_strategy: not used in this function
    :param event_detection_params: not used in this function
    :param tmp_directory: not used in this function
    :param write_failed_alignments: passed to ``load_from_raw``
    :param service_name: label used in log and error messages
    """
    handled = 0
    failures = 0

    def report(exc, template):
        # format the error, echo it and hand it to the done queue
        detail = "{}:{}".format(type(exc), str(exc))
        text = template.format(service_name, current_process().name, detail)
        print("[{}] ".format(service_name) + text)
        done_queue.put(text)

    # catch overall exceptions
    try:
        for item in iter(work_queue.get, 'STOP'):
            # get data from iterator
            fast5, read_id = item['fast5']
            np_handle = None

            # catch exceptions on each element
            try:
                np_handle = NanoporeRead(fast5, initialize=False)
                loaded = load_from_raw(
                    np_handle,
                    alignment_file,
                    model_file_location,
                    write_failed_alignments=write_failed_alignments)
                if not loaded:
                    raise Exception(
                        "load_from_raw failed on read {} in {}".format(
                            read_id, fast5))
            except Exception as e:
                report(e, "{} '{}' failed with: {}")
                failures += 1
            finally:
                if np_handle is not None:
                    np_handle.close()
                # increment total handling
                handled += 1

    except Exception as e:
        report(e, "{} '{}' critically failed with: {}")

    finally:
        # logging and final reporting
        worker = current_process().name
        print("[%s] '%s' completed %d calls with %d failures" %
              (service_name, worker, handled, failures))
        done_queue.put("{}:{}".format(TOTAL_KEY, handled))
        done_queue.put("{}:{}".format(FAILURE_KEY, failures))
def run(self, get_expectations=False):
    """Align the read in ``self.in_fast5`` by building and running ``./signalMachine``.

    Steps: write the read to a temporary ``.npRead`` file, prepare the read
    sequence, generate and validate a guide alignment, assemble the command
    line from the instance's settings, run it with ``os.system`` and remove
    the temporary folder.

    :param get_expectations: when True both HMM paths must already be set, and
        the command writes template/complement ``.expectations`` files instead
        of a posteriors file
    :return: False when the fast5 is missing or a preparation step fails;
        True once the command has been run (a non-zero exit status is only
        logged, it does not change the return value)
    :raises AssertionError: when required HMM/HDP settings or the reference
        files for the aligned contig are missing
    """
    print("[SignalAlignment.run]INFO: Starting on {read}".format(
        read=self.in_fast5), file=sys.stderr)
    if get_expectations:
        assert self.in_templateHmm is not None and self.in_complementHmm is not None,\
            "Need HMM files for model training"

    # file checks
    if not os.path.isfile(self.in_fast5):
        print("[SignalAlignment.run]ERROR: Did not find .fast5 at{file}".
              format(file=self.in_fast5))
        return False

    self.openTempFolder("tempFiles_%s" % self.read_name)
    npRead_ = self.addTempFilePath("temp_%s.npRead" % self.read_name)
    npRead = NanoporeRead(fast_five_file=self.in_fast5,
                          twoD=self.twoD_chemistry)
    # Context manager so the handle is closed even if Write raises.
    with open(npRead_, "w") as fH:
        ok = npRead.Write(parent_job=None, out_file=fH, initialize=True)
    if not ok:
        self.failStop(
            "[SignalAlignment.run]File: %s did not pass initial checks" %
            self.read_name, npRead)
        return False

    read_label = npRead.read_label  # use this to identify the read throughout
    read_fasta_ = self.addTempFilePath("temp_seq_%s.fa" % read_label)
    temp_samfile_ = self.addTempFilePath("temp_sam_file_%s.sam" % read_label)
    cigar_file_ = self.addTempFilePath("temp_cigar_%s.txt" % read_label)

    if self.twoD_chemistry:
        ok, version, pop1_complement = self.prepare_twod(
            nanopore_read=npRead, twod_read_path=read_fasta_)
    else:
        ok, version, _ = self.prepare_oned(nanopore_read=npRead,
                                           oned_read_path=read_fasta_)
        pop1_complement = None
    # The status flag used to be overwritten without being looked at.
    if not ok:
        self.failStop(
            "[SignalAlignment.run]ERROR preparing read sequence for %s" %
            self.read_name, npRead)
        return False

    # add an indicator for the model being used
    if self.stateMachineType == "threeStateHdp":
        model_label = ".sm3Hdp"
        stateMachineType_flag = "--sm3Hdp "
        if self.twoD_chemistry:
            assert (self.in_templateHdp is not None) and (self.in_complementHdp is not None), "Need to provide HDPs"
        else:
            assert self.in_templateHdp is not None, "Need to provide Template HDP"
    else:
        # "threeState" and any unrecognised type both use the plain model
        model_label = ".sm"
        stateMachineType_flag = ""

    guide_alignment = generateGuideAlignment(
        bwa_index=self.bwa_index,
        query=read_fasta_,
        temp_sam_path=temp_samfile_,
        target_regions=self.target_regions)
    ok = guide_alignment.validate(self.reference_map.keys())
    if not ok:
        self.failStop("[SignalAlignment.run]ERROR getting guide alignment",
                      npRead)
        return False

    with open(cigar_file_, "w") as cig_handle:
        cig_handle.write(guide_alignment.cigar + "\n")

    # output file name format: /directory/for/files/file.model.orientation.tsv
    # (only the "full" format carries the orientation; the path stays empty
    # when the strand is neither "+" nor "-")
    posteriors_file_path = ''
    orientation = {"+": "forward", "-": "backward"}.get(guide_alignment.strand)
    if orientation is not None:
        if self.output_format == "full":
            suffix = ".{}.tsv".format(orientation)
        elif self.output_format == "variantCaller":
            suffix = ".tsv"
        else:
            suffix = ".assignments"
        posteriors_file_path = self.destination + read_label + model_label + suffix

    # Alignment/Expectations routine
    path_to_signalAlign = "./signalMachine"

    # flags

    # input (match) models
    if self.in_templateHmm is None:
        self.in_templateHmm = defaultModelFromVersion(strand="template",
                                                      version=version)
    if self.twoD_chemistry and self.in_complementHmm is None:
        self.in_complementHmm = defaultModelFromVersion(
            strand="complement",
            version=version,
            pop1_complement=pop1_complement)

    assert self.in_templateHmm is not None
    if self.twoD_chemistry and self.in_complementHmm is None:
        self.failStop(
            "[SignalAlignment.run]ERROR Need to have complement HMM for 2D analysis",
            npRead)
        return False

    template_model_flag = "-T {} ".format(self.in_templateHmm)
    if self.twoD_chemistry:
        complement_model_flag = "-C {} ".format(self.in_complementHmm)
    else:
        complement_model_flag = ""

    print(
        "[SignalALignment.run]NOTICE: template model {t} complement model {c}"
        "".format(t=self.in_templateHmm, c=self.in_complementHmm),
        file=sys.stderr)

    # reference sequences
    contig_references = self.reference_map[guide_alignment.reference_name]
    assert contig_references["forward"] is not None
    assert contig_references["backward"] is not None
    forward_reference = contig_references["forward"]
    backward_reference = contig_references["backward"]
    assert os.path.isfile(forward_reference)
    assert os.path.isfile(backward_reference)
    forward_ref_flag = "-f {f_ref} ".format(f_ref=forward_reference)
    backward_ref_flag = "-b {b_ref} ".format(b_ref=backward_reference)

    # input HDPs: each flag is emitted only when its own path is set, so a
    # complement-only configuration no longer produces "-v None"
    hdp_flags = ""
    if self.in_templateHdp is not None:
        hdp_flags += "-v {tHdp_loc} ".format(tHdp_loc=self.in_templateHdp)
    if self.twoD_chemistry and self.in_complementHdp is not None:
        hdp_flags += "-w {cHdp_loc} ".format(cHdp_loc=self.in_complementHdp)

    # threshold
    if self.threshold is not None:
        threshold_flag = "-D {threshold} ".format(threshold=self.threshold)
    else:
        threshold_flag = ""

    # diagonal expansion
    if self.diagonal_expansion is not None:
        diag_expansion_flag = "-x {expansion} ".format(
            expansion=self.diagonal_expansion)
    else:
        diag_expansion_flag = ""

    # constraint trim
    if self.constraint_trim is not None:
        trim_flag = "-m {trim} ".format(trim=self.constraint_trim)
    else:
        trim_flag = ""

    # output format
    if self.output_format not in self.output_formats:
        self.failStop(
            "[SignalAlignment.run]ERROR illegal output format selected %s" %
            self.output_format)
        return False
    out_fmt = "-s {fmt} ".format(
        fmt=self.output_formats[self.output_format])

    # degenerate nucleotide information
    if self.degenerate is not None:
        degenerate_flag = "-o {} ".format(self.degenerate)
    else:
        degenerate_flag = ""

    twoD_flag = "--twoD" if self.twoD_chemistry else ""

    # commands
    if get_expectations:
        template_expectations_file_path = self.destination + read_label + ".template.expectations"
        complement_expectations_file_path = self.destination + read_label + ".complement.expectations"

        command = \
            "{vA} {td} {degen}{sparse}{model}{f_ref}{b_ref} -q {npRead} " \
            "{t_model}{c_model}{thresh}{expansion}{trim} {hdp}-L {readLabel} -p {cigarFile} " \
            "-t {templateExpectations} -c {complementExpectations}"\
            .format(vA=path_to_signalAlign, model=stateMachineType_flag,
                    f_ref=forward_ref_flag, b_ref=backward_ref_flag,
                    cigarFile=cigar_file_, npRead=npRead_,
                    readLabel=read_label, td=twoD_flag,
                    templateExpectations=template_expectations_file_path,
                    hdp=hdp_flags,
                    complementExpectations=complement_expectations_file_path,
                    t_model=template_model_flag,
                    c_model=complement_model_flag, thresh=threshold_flag,
                    expansion=diag_expansion_flag, trim=trim_flag,
                    degen=degenerate_flag, sparse=out_fmt)
    else:
        print("read_label", read_label)
        command = \
            "{vA} {td} {degen}{sparse}{model}{f_ref}{b_ref} -q {npRead} " \
            "{t_model}{c_model}{thresh}{expansion}{trim} -p {cigarFile} " \
            "-u {posteriors} {hdp}-L {readLabel}"\
            .format(vA=path_to_signalAlign, model=stateMachineType_flag,
                    sparse=out_fmt, f_ref=forward_ref_flag,
                    b_ref=backward_ref_flag, cigarFile=cigar_file_,
                    readLabel=read_label, npRead=npRead_, td=twoD_flag,
                    t_model=template_model_flag,
                    c_model=complement_model_flag,
                    posteriors=posteriors_file_path, thresh=threshold_flag,
                    expansion=diag_expansion_flag, trim=trim_flag,
                    hdp=hdp_flags, degen=degenerate_flag)

    # run
    print("signalAlign - running command: ", command, end="\n", file=sys.stderr)
    exit_status = os.system(command)
    if exit_status != 0:
        # Surface the failure; the return value is kept for existing callers.
        print("[SignalAlignment.run]WARNING: command exited with status {}".format(
            exit_status), file=sys.stderr)
    self.temp_folder.remove_folder()
    return True
def run(self, get_expectations=False):
    """Align the read in ``self.in_fast5`` with ``./signalMachine`` and optionally embed the result.

    Steps: write the read to a temporary ``.npRead`` file, prepare the read
    sequence, generate and validate a guide alignment, assemble the command
    line from the instance's settings and run it with ``os.system``. When
    ``self.embed`` is set, the posteriors file is read back and written into
    the fast5; for the "full" output format the MEA alignment labels and the
    guide SAM text are written as well.

    The temporary folder is not removed by this method.

    :param get_expectations: when True both HMM paths must already be set, and
        the command writes template/complement ``.expectations`` files instead
        of a posteriors file
    :return: False when the fast5 is missing or a preparation step fails;
        True once the command (and optional embedding) has run. A non-zero
        exit status is only logged, it does not change the return value.
    :raises AssertionError: when required HMM/HDP settings or reference files
        are missing, or no SignalAlign path is found in the fast5 when embedding
    """
    print("[SignalAlignment.run]INFO: Starting on {read}".format(
        read=self.in_fast5), file=sys.stderr)
    if get_expectations:
        assert self.in_templateHmm is not None and self.in_complementHmm is not None, \
            "Need HMM files for model training"

    # file checks
    if not os.path.isfile(self.in_fast5):
        print("[SignalAlignment.run]ERROR: Did not find .fast5 at{file}".
              format(file=self.in_fast5))
        return False

    self.openTempFolder("tempFiles_%s" % self.read_name)
    npRead_ = self.addTempFilePath("temp_%s.npRead" % self.read_name)
    # TODO: check whether this is correct for RNA given 3'-5' mapping
    npRead = NanoporeRead(fast_five_file=self.in_fast5,
                          twoD=self.twoD_chemistry,
                          event_table=self.event_table)
    # Context manager so the handle is closed even if Write raises.
    with open(npRead_, "w") as fH:
        ok = npRead.Write(parent_job=None, out_file=fH, initialize=True)
    if not ok:
        self.failStop(
            "[SignalAlignment.run]File: %s did not pass initial checks" %
            self.read_name, npRead)
        return False

    read_label = npRead.read_label  # use this to identify the read throughout
    read_fasta_ = self.addTempFilePath("temp_seq_%s.fa" % read_label)
    temp_samfile_ = self.addTempFilePath("temp_sam_file_%s.sam" % read_label)
    cigar_file_ = self.addTempFilePath("temp_cigar_%s.txt" % read_label)

    if self.twoD_chemistry:
        ok, version, pop1_complement = self.prepare_twod(
            nanopore_read=npRead, twod_read_path=read_fasta_)
    else:
        ok, version, _ = self.prepare_oned(nanopore_read=npRead,
                                           oned_read_path=read_fasta_)
        pop1_complement = None
    # The status flag used to be overwritten without being looked at.
    if not ok:
        self.failStop(
            "[SignalAlignment.run]ERROR preparing read sequence for %s" %
            self.read_name, npRead)
        return False

    # add an indicator for the model being used
    if self.stateMachineType == "threeStateHdp":
        model_label = ".sm3Hdp"
        stateMachineType_flag = "--sm3Hdp "
        if self.twoD_chemistry:
            assert (self.in_templateHdp is not None) and (self.in_complementHdp is not None), "Need to provide HDPs"
        else:
            assert self.in_templateHdp is not None, "Need to provide Template HDP"
    else:
        # "threeState" and any unrecognised type both use the plain model
        model_label = ".sm"
        stateMachineType_flag = ""

    guide_alignment = generateGuideAlignment(
        bwa_index=self.bwa_index,
        query=read_fasta_,
        temp_sam_path=temp_samfile_,
        target_regions=self.target_regions)
    ok = guide_alignment.validate()
    if not ok:
        self.failStop("[SignalAlignment.run]ERROR getting guide alignment",
                      npRead)
        return False

    with open(cigar_file_, "w") as cig_handle:
        cig_handle.write(guide_alignment.cigar + "\n")

    # output file name format: /directory/for/files/file.model.orientation.tsv
    # (only the "full" format carries the orientation; the path stays empty
    # when the strand is neither "+" nor "-")
    posteriors_file_path = ''
    orientation = {"+": "forward", "-": "backward"}.get(guide_alignment.strand)
    if orientation is not None:
        if self.output_format == "full":
            suffix = ".{}.tsv".format(orientation)
        elif self.output_format == "variantCaller":
            suffix = ".tsv"
        else:
            suffix = ".assignments"
        posteriors_file_path = self.destination + read_label + model_label + suffix

    # Alignment/Expectations routine
    path_to_signalAlign = "./signalMachine"

    # flags

    # input (match) models
    if self.in_templateHmm is None:
        self.in_templateHmm = defaultModelFromVersion(strand="template",
                                                      version=version)
    if self.twoD_chemistry and self.in_complementHmm is None:
        self.in_complementHmm = defaultModelFromVersion(
            strand="complement",
            version=version,
            pop1_complement=pop1_complement)

    assert self.in_templateHmm is not None
    if self.twoD_chemistry and self.in_complementHmm is None:
        self.failStop(
            "[SignalAlignment.run]ERROR Need to have complement HMM for 2D analysis",
            npRead)
        return False

    template_model_flag = "-T {} ".format(self.in_templateHmm)
    if self.twoD_chemistry:
        complement_model_flag = "-C {} ".format(self.in_complementHmm)
    else:
        complement_model_flag = ""

    print(
        "[SignalALignment.run]NOTICE: template model {t} complement model {c}"
        "".format(t=self.in_templateHmm, c=self.in_complementHmm),
        file=sys.stderr)

    # reference sequences (the backward reference is optional)
    assert os.path.isfile(self.forward_reference)
    forward_ref_flag = "-f {f_ref} ".format(f_ref=self.forward_reference)
    if self.backward_reference:
        assert os.path.isfile(self.backward_reference)
        backward_ref_flag = "-b {b_ref} ".format(
            b_ref=self.backward_reference)
    else:
        backward_ref_flag = ""

    # input HDPs: each flag is emitted only when its own path is set, so a
    # complement-only configuration no longer produces "-v None"
    hdp_flags = ""
    if self.in_templateHdp is not None:
        hdp_flags += "-v {tHdp_loc} ".format(tHdp_loc=self.in_templateHdp)
    if self.twoD_chemistry and self.in_complementHdp is not None:
        hdp_flags += "-w {cHdp_loc} ".format(cHdp_loc=self.in_complementHdp)

    # threshold
    if self.threshold is not None:
        threshold_flag = "-D {threshold} ".format(threshold=self.threshold)
    else:
        threshold_flag = ""

    # diagonal expansion
    if self.diagonal_expansion is not None:
        diag_expansion_flag = "-x {expansion} ".format(
            expansion=self.diagonal_expansion)
    else:
        diag_expansion_flag = ""

    # constraint trim
    if self.constraint_trim is not None:
        trim_flag = "-m {trim} ".format(trim=self.constraint_trim)
    else:
        trim_flag = ""

    # output format
    if self.output_format not in self.output_formats:
        self.failStop(
            "[SignalAlignment.run]ERROR illegal output format selected %s" %
            self.output_format)
        return False
    out_fmt = "-s {fmt} ".format(
        fmt=self.output_formats[self.output_format])

    # degenerate nucleotide information
    if self.degenerate is not None:
        degenerate_flag = "-o {} ".format(self.degenerate)
    else:
        degenerate_flag = ""

    twoD_flag = "--twoD" if self.twoD_chemistry else ""

    # commands
    if get_expectations:
        template_expectations_file_path = self.destination + read_label + ".template.expectations"
        complement_expectations_file_path = self.destination + read_label + ".complement.expectations"

        command = \
            "{vA} {td} {degen}{sparse}{model} -q {npRead} " \
            "{t_model}{c_model}{thresh}{expansion}{trim} {hdp}-L {readLabel} -p {cigarFile} " \
            "-t {templateExpectations} -c {complementExpectations} -n {seq_name} {f_ref_fa} {b_ref_fa}" \
            .format(vA=path_to_signalAlign, model=stateMachineType_flag,
                    cigarFile=cigar_file_, npRead=npRead_,
                    readLabel=read_label, td=twoD_flag,
                    templateExpectations=template_expectations_file_path,
                    hdp=hdp_flags,
                    complementExpectations=complement_expectations_file_path,
                    t_model=template_model_flag,
                    c_model=complement_model_flag, thresh=threshold_flag,
                    expansion=diag_expansion_flag, trim=trim_flag,
                    degen=degenerate_flag, sparse=out_fmt,
                    seq_name=guide_alignment.reference_name,
                    f_ref_fa=forward_ref_flag, b_ref_fa=backward_ref_flag)
    else:
        command = \
            "{vA} {td} {degen}{sparse}{model} -q {npRead} " \
            "{t_model}{c_model}{thresh}{expansion}{trim} -p {cigarFile} " \
            "-u {posteriors} {hdp}-L {readLabel} -n {seq_name} {f_ref_fa} {b_ref_fa}" \
            .format(vA=path_to_signalAlign, model=stateMachineType_flag,
                    sparse=out_fmt, cigarFile=cigar_file_,
                    readLabel=read_label, npRead=npRead_, td=twoD_flag,
                    t_model=template_model_flag,
                    c_model=complement_model_flag,
                    posteriors=posteriors_file_path, thresh=threshold_flag,
                    expansion=diag_expansion_flag, trim=trim_flag,
                    hdp=hdp_flags, degen=degenerate_flag,
                    seq_name=guide_alignment.reference_name,
                    f_ref_fa=forward_ref_flag, b_ref_fa=backward_ref_flag)

    # run
    print("signalAlign - running command: ", command, end="\n", file=sys.stderr)
    exit_status = os.system(command)
    if exit_status != 0:
        # Surface the failure; the return value is kept for existing callers.
        print("[SignalAlignment.run]WARNING: command exited with status {}".format(
            exit_status), file=sys.stderr)

    if self.embed:
        print("signalAlign - embedding into Fast5 ", file=sys.stderr)

        data = self.read_in_signal_align_tsv(posteriors_file_path,
                                             file_type=self.output_format)
        npRead = NanoporeRead(fast_five_file=self.in_fast5,
                              twoD=self.twoD_chemistry,
                              event_table=self.event_table)
        npRead.Initialize(None)
        signal_align_path = npRead.get_latest_basecall_edition(
            "/Analyses/SignalAlign_00{}", new=False)
        assert signal_align_path, "There is no path in Fast5 file: {}".format(
            "/Analyses/SignalAlign_00{}")
        output_path = npRead._join_path(signal_align_path, self.output_format)
        npRead.write_data(data, output_path)

        # Todo add attributes to signalalign output
        if self.output_format == "full":
            print("signalAlign - writing maximum expected alignment ",
                  file=sys.stderr)
            alignment = mea_alignment_from_signal_align(None, events=data)
            mae_path = npRead._join_path(signal_align_path,
                                         "MEA_alignment_labels")
            events = npRead.get_template_events()
            if events:
                labels = match_events_with_signalalign(
                    sa_events=alignment,
                    event_detections=np.asanyarray(npRead.template_events),
                    minus=(guide_alignment.strand == "-"),
                    rna=npRead.is_read_rna())
                npRead.write_data(labels, mae_path)

            # Store the guide alignment SAM text next to the MEA labels;
            # one read() replaces the line-by-line string concatenation.
            with open(temp_samfile_, 'r') as sam_handle:
                sam_string = sam_handle.read()
            sam_path = npRead._join_path(signal_align_path, "sam")
            npRead.write_data(data=sam_string, location=sam_path,
                              compression=None)

    # NOTE(review): temp folder cleanup is disabled here, so temp files
    # accumulate -- confirm whether this is intentional.
    # self.temp_folder.remove_folder()
    return True
def get_data(self):
    """Calculate the normalized probability of variant for each nucleotide and across the read

    For each read strand ("t" then "c") that has rows in ``self.variant_data``,
    every reference position whose first ``aligned_kmer`` has "X" at the last
    index of the k-mer is processed. The posterior probabilities of the rows
    whose ``path_kmer`` carries a given variant at that index are summed, then
    normalized across all variants. Per-position rows go into
    ``self.position_probs``; the per-strand averages over the processed
    positions go into ``self.per_read_calls``.

    Sets ``self.has_data`` to True when ``self.variant_data`` has rows, and to
    False otherwise (in which case the two frames are left untouched).

    :return: ``self.position_probs``
    :raises AssertionError: if no probability mass is found for any variant
        at a processed position
    """
    # final location of per position data and per read data
    data = []
    per_read_data = []
    # First entry belongs to the template strand, second to the complement.
    if self.forward_mapped:
        mapping_strands = ["+", "-"]
    else:
        mapping_strands = ["-", "+"]

    if len(self.variant_data) == 0:
        self.has_data = False
        return self.position_probs

    kmer_len_1 = len(self.variant_data["reference_kmer"].iloc[0]) - 1
    n_variants = len(self.variants)

    # Pair each read strand with its mapping strand directly. The previous
    # running index was not advanced when the template strand had no rows,
    # which labelled complement data with the template's mapping strand.
    for read_strand, mapping_strand in zip(("t", "c"), mapping_strands):
        strand_data = self.variant_data[self.variant_data["strand"] == read_strand]
        if len(strand_data) == 0:
            continue

        # get positions on strand (descending for the "-" mapping strand)
        positions = sorted(set(strand_data["reference_index"]),
                           reverse=(mapping_strand == "-"))
        strand_read_nuc_data = [0] * n_variants

        # marginalize probabilities for each position
        n_positions = 0
        for pos in positions:
            pos_data = strand_data[strand_data["reference_index"] == pos]
            if pos_data["aligned_kmer"].iloc[0][kmer_len_1] != "X":
                continue
            n_positions += 1
            total_prob = 0
            position_nuc_dict = {x: 0.0 for x in self.variants}

            # Get total probability for each nucleotide
            for nuc in self.variants:
                nuc_rows = pos_data[[nuc == kmer[kmer_len_1]
                                     for kmer in pos_data["path_kmer"]]]
                nuc_prob = sum(nuc_rows["posterior_probability"])
                total_prob += nuc_prob
                position_nuc_dict[NanoporeRead.bytes_to_string(nuc)] = nuc_prob

            # normalize probabilities over each position; the check is loop
            # invariant, so it runs once instead of once per variant
            if n_variants > 0:
                assert total_prob > 0, "Check 'variants' parameter. There seems to be no kmers with those " \
                                       "variant characters"
            nuc_data = [position_nuc_dict[nuc] / total_prob for nuc in self.variants]
            for index, prob in enumerate(nuc_data):
                strand_read_nuc_data[index] += prob

            data.append(merge_lists([[self.read_name, self.contig, pos,
                                      read_strand, mapping_strand],
                                     nuc_data]))

        if n_positions > 0:
            per_read_data.append(merge_lists(
                [[self.read_name, self.contig, read_strand, mapping_strand,
                  n_positions],
                 [prob / n_positions for prob in strand_read_nuc_data]]))

    self.position_probs = pd.DataFrame(data, columns=self.columns)
    self.per_read_calls = pd.DataFrame(per_read_data,
                                       columns=self.per_read_columns)
    self.has_data = True
    return self.position_probs