def helper(hashTable, reads, infoString):
    """Sum ``hashTable`` counts over level-string k-mers of reads and print a summary.

    Reads are processed in order until ``kmerNum`` k-mer windows have been
    looked up. Each read is aligned with ``referenceIdx.map``; only
    alignments whose query span exceeds 95% of the read length and whose
    ``strand`` equals 1 are kept as hits.

    Args:
        hashTable: Mapping queried with ``hashTable.get(kmer, 0)``.
        reads: Iterable of read files, passed to ``getSeqfromRead`` and
            ``getSignalFromRead``.
        infoString: ``"+"`` keeps only reads with exactly one hit, ``"-"``
            keeps only reads with no hit; any other value applies no
            hit-count filter. It is also echoed in the printed summary.

    Reads the module-level names ``kmerNum``, ``referenceIdx``, ``fromRead``,
    ``toRead``, ``levels`` and ``k``.

    NOTE(review): the original indentation was lost; the summary ``print`` is
    assumed to run once, after the loop over reads.
    """
    # NOTE(review): ``counter`` is incremented but never read in the visible
    # code; kept in case the function continues beyond this excerpt.
    counter, total, totalNum = 0, 0, 0

    for readFile in reads:
        if totalNum == kmerNum:
            break

        try:
            readFastq, _ = getSeqfromRead(readFile)
        except Exception:
            # Skip reads that cannot be loaded, but do not swallow
            # KeyboardInterrupt / SystemExit as a bare ``except`` would.
            continue

        hits = [
            aln
            for aln in referenceIdx.map(readFastq)
            if aln.q_en - aln.q_st > 0.95 * len(readFastq) and aln.strand == 1
        ]
        if infoString == "+" and len(hits) != 1:
            continue
        if infoString == "-" and len(hits) != 0:
            continue

        readSignal = getSignalFromRead(readFile)
        if len(readSignal) <= toRead:
            # Signal is too short to provide the [fromRead:toRead] window.
            continue

        counter += 1
        readSignal = readSignal[fromRead:toRead]
        readString = getLevelStr(readSignal, levels)

        # NOTE(review): the last k - 1 windows are shorter than k characters
        # and are still counted in totalNum; confirm this is intended.
        for i in range(len(readString)):
            if kmerNum == totalNum:
                break
            kmer = readString[i:i + k]
            totalNum += 1
            total += hashTable.get(kmer, 0)

    print(f"{infoString} k {k} l {levels} -> {total} / {totalNum}")
readString = getLevelString(readSignal, smoothParam, levels, overflow) readDict = buildDictionary(readString, kmerLength) return overlap(readDict, hashTable) ######################################## data = [] for filePath in posFast5: if posTestCases == 0: break try: readSeq, basecallTable = getSeqfromRead(filePath) except: continue if len(readSeq) < (toRead // repeatSignal): continue hits = [ aln for aln in referenceIdx.map(readSeq) if aln.q_en - aln.q_st > 0.95 * len(readSeq) and aln.strand == 1 and aln.ctg == workingContig ] if len(hits) == 0: continue hit = hits[0] posTestCases -= 1 lvlStringHits = processRead(filePath, workingContig)
### get corresponding part of the reference using minimap2
referenceIdx = mp.Aligner(refFile)
if not referenceIdx:
    # Explicit raise instead of ``assert`` so the check survives ``python -O``.
    raise RuntimeError("failed to load/build reference index")

mod = KmerModel.load_from_hdf5(kmerModelFilePath)

posReads = getReadsInFolder(readsPosFilePath, minSize=1000000)
negReads = getReadsInFolder(readsNegFilePath, minSize=1000000)

goodK, totalK = 0, 0
# "dobre" / "zle" are Slovak for "good" / "bad".
dobre, zle = 0, 0

# Slicing already clamps to the list length, so no explicit min() is needed.
for readFile in posReads[:maxTests]:
    try:
        readFastq, readEvents = getSeqfromRead(readFile)
    except Exception:
        # Skip unreadable files without swallowing KeyboardInterrupt /
        # SystemExit as a bare ``except`` would.
        continue

    readSeq = seqSignalCor(signalFrom, signalTo, readEvents)

    # Keep alignments whose query span exceeds 95% of the sequence length.
    hits = [
        aln
        for aln in referenceIdx.map(readSeq)
        if aln.q_en - aln.q_st > 0.95 * len(readSeq)
    ]
    if len(hits) != 1:
        # Too many or too few hits: skip the read.
        continue

    print(readFile)
    hit = hits[0]
    successfulReads += 1
# NOTE(review): the next two statements look like the end of a loop body over
# parsed lines whose header lies above the visible excerpt; restore their
# indentation against the original file.
levelStr = line[3]
storeContig[line[0]] = levelStr

# One k-mer table per contig plus a single table that pools all contigs.
hashTable = {}
hashTables = {}
for contigName, contigLevelStr in storeContig.items():
    hashTables[contigName] = {}
    buildDictionarySpecial(hashTables[contigName], contigLevelStr, kmerLen)
    buildDictionarySpecial(hashTable, contigLevelStr, kmerLen)

print("Preparation done!")

# Only the first 400 positive read paths are examined.
for sample in posReadsPaths[:400]:
    try:
        readFastq, readEvents = getSeqfromRead(sample)
    except Exception:
        # Skip unreadable files without swallowing KeyboardInterrupt /
        # SystemExit as a bare ``except`` would.
        continue

    # Keep alignments covering more than 95% of the read, with strand == 1,
    # that lie on the contig under test.
    hits = [
        aln
        for aln in referenceIdx.map(readFastq)
        if aln.q_en - aln.q_st > 0.95 * len(readFastq)
        and aln.strand == 1
        and aln.ctg == workingContig
    ]
    if len(hits) != 1:
        continue

    hit = hits[0]
    # Alignment start on the reference divided by len(ref[hit.ctg]);
    # presumably `ref` maps contig name to its sequence -- TODO confirm.
    refPosition = hit.r_st / len(ref[hit.ctg])
if not referenceIdx:
    # Explicit raise instead of ``assert`` so the check survives ``python -O``.
    raise RuntimeError("failed to load/build reference index")

posReads = getReadsInFolder(readsPosFilePath, minSize=0)
negReads = getReadsInFolder(readsNegFilePath, minSize=0)

################################################################################

# negHitsByRatios[i] counts the negative reads whose longest aligned query
# span is at least ratios[i] times the read length.
negHitsByRatios = [0] * len(ratios)
totalCount = 0
for readFile in negReads:
    if totalCount == readCount:
        break

    try:
        readFastq, _ = getSeqfromRead(readFile)
    except Exception:
        # Skip unreadable files without swallowing KeyboardInterrupt /
        # SystemExit as a bare ``except`` would.
        continue

    # Aligned query-span length of every alignment of this read.
    hits = [aln.q_en - aln.q_st for aln in referenceIdx.map(readFastq)]
    totalCount += 1

    # Longest span, or 0 when the read did not align at all; it does not
    # depend on the ratio, so it is computed once per read.
    hit = max(hits, default=0)
    # NOTE(review): a `break` after the increment was commented out in the
    # original, so every threshold the read satisfies is counted.
    for i in reversed(range(len(ratios))):
        if hit >= ratios[i] * len(readFastq):
            negHitsByRatios[i] += 1

posHitsByRatios = [0] * len(ratios)
################################################################################

# Per-level (and per-k-mer-length) accumulators, filled later in the script.
# "pomery" is Slovak for "ratios".
pomery = [[[] for _ in kmerLen] for _ in levels]
overlap = [[[] for _ in kmerLen] for _ in levels]
goodDash = [[] for _ in levels]
badDash = [[] for _ in levels]
alignLenRead = [0 for _ in levels]
alignLenFake = [0 for _ in levels]

readCounter = 0
for posRead in posReads:
    if readCounter == readNum:
        break

    try:
        readFastq, readEvents = getSeqfromRead(posRead)
    except Exception:
        # Skip unreadable files without swallowing KeyboardInterrupt /
        # SystemExit as a bare ``except`` would.
        continue

    readSeq = seqSignalCor(signalFrom, signalTo, readEvents)

    # Keep alignments whose query span exceeds 95% of the sequence length.
    hits = [
        aln
        for aln in referenceIdx.map(readSeq)
        if aln.q_en - aln.q_st > 0.95 * len(readSeq)
    ]
    if len(hits) != 1:
        # Too many or too few hits: skip the read.
        continue

    hit = hits[0]
    print("Working on", posRead)
    print(f"So far done {readCounter} reads")
import numpy as np

# The helper module lives one directory up, so extend the path before
# importing it.
sys.path.append("../")
from signalHelper import getReadsInFolder, getSignalFromRead, getSeqfromRead

reads = getReadsInFolder(readsFilePath, minSize=0)

# Collect raw-signal and sequence lengths for up to `maxReads` readable reads.
signalLengths, seqLengths = [], []
for read in reads:
    if maxReads == 0:
        break

    try:
        signal = getSignalFromRead(read)
        seq, _ = getSeqfromRead(read)
    except Exception:
        # Skip reads missing either the signal or the sequence, without
        # swallowing KeyboardInterrupt / SystemExit as a bare ``except`` would.
        continue

    maxReads -= 1
    signalLengths.append(len(signal))
    seqLengths.append(len(seq))

meanSignal = np.mean(signalLengths)
medianSignal = np.median(signalLengths)
meanSeq = np.mean(seqLengths)
medianSeq = np.median(seqLengths)

# `maxReads` now holds the part of the requested quota that was not filled.
print(f"Not found {maxReads} reads!")