def processRead(path, contig_name=None, refPosition=-1):
    """Map one read's level string against ``index`` and print every hit.

    The raw signal of the read is cut to ``fromRead:toRead``, converted to a
    level string with ``getGlobalString``, re-encoded over the A/C/G/T
    alphabet and passed to ``index.map``.

    Args:
        path: Read file handed to ``getSignalFromRead``.
        contig_name: Expected contig name; ``None`` disables the contig check.
        refPosition: Expected hit position as a fraction of the contig length.

    Returns:
        Always ``False``. The early "success" returns of this variant are
        disabled, so the function only reports what it found.
    """
    print(path)
    readSignal = np.array(getSignalFromRead(path), dtype=float)
    readSignal = readSignal[fromRead:toRead]
    readLevelString = getGlobalString(readSignal)

    # Re-encode the level letters as nucleotides before calling index.map.
    helperDict = {"a": "A", "b": "C", "c": "G", "d": "T"}
    helperString = "".join(helperDict[i] for i in readLevelString)

    levelHits = list(index.map(helperString))
    if not levelHits:
        return False

    for hit in levelHits:
        print("{0}: {1} vs {2}".format(hit.ctg, hit.r_en - hit.r_st,
                                       hit.q_en - hit.q_st))
        # Hit start expressed relative to the length of its contig.
        hitPosition = hit.r_st / lengths[hit.ctg]
        print(f"Position of hit is {hitPosition}")
        if abs(hitPosition - refPosition) <= 0.001:
            if contig_name is not None and hit.ctg != contig_name:
                print("Zle urceny contig!")
                print("Return False.")

    print("Return False.")
    return False
def processRead(path, contigName):
    """Score one read against the global ``hashTable``.

    The read signal is cut to ``fromRead:toRead``, turned into a level string
    by ``getLevelString``, indexed by ``buildDictionary`` and compared with
    ``hashTable`` through ``overlap``.

    Args:
        path: Read file handed to ``getSignalFromRead``.
        contigName: Accepted for signature compatibility; not used here.

    Returns:
        The value returned by ``overlap`` for this read.
    """
    rawSignal = getSignalFromRead(path)
    window = np.array(rawSignal, dtype=float)[fromRead:toRead]
    levelString = getLevelString(window, smoothParam, levels, overflow)
    return overlap(buildDictionary(levelString, kmerLength), hashTable)
def processRead(path, contig_name=None):
    """Time the level-string conversion and two index lookups for one read.

    Samples 500..7000 of the read signal are converted with
    ``getGlobalString``; the first and the last 1000 characters of the
    re-encoded string are then mapped with ``index.map`` and the relative
    position of every hit is printed.

    Args:
        path: Read file handed to ``getSignalFromRead``.
        contig_name: Accepted for signature compatibility; not used here.

    Returns:
        Always ``False``.
    """
    print(path)
    readSignal = np.array(getSignalFromRead(path), dtype=float)[500:7000]

    conversionStart = time.time()
    readLevelString = getGlobalString(readSignal)
    conversionElapsed = time.time() - conversionStart
    print(f"--- Getting global string takes {conversionElapsed} seconds ---")

    letterToBase = {"a": "A", "b": "C", "c": "G", "d": "T"}
    helperString = "".join(map(letterToBase.__getitem__, readLevelString))
    print(f"Len of signal is {len(readSignal)}")
    print(f"Len of helper string is {len(helperString)}")

    searchStart = time.time()
    # Each label is printed before its part of the string is sliced and mapped.
    for label, part in (("Fir", slice(None, 1000)), ("Sec", slice(-1000, None))):
        print(label)
        for hit in index.map(helperString[part]):
            relativePosition = hit.r_st / lengths[hit.ctg]
            print(f"Position of hit is {relativePosition}")
    print(f"--- Searching string takes {time.time() - searchStart} seconds ---")
    return False
def processRead(path, contigName, goodTable=-1):
    """Compare one read with every hash table of a contig and tally the result.

    The read signal is cut to ``fromRead:toRead``, converted to a level string
    and indexed with ``buildDictionary``. ``overlap`` is then evaluated against
    each table in ``hashTables[contigName]`` and the scores are printed on one
    line, marking the expected table with ``->`` and maximal scores with
    ``max->``.

    Args:
        path: Read file handed to ``getSignalFromRead``.
        contigName: Key into the global ``hashTables``.
        goodTable: Index of the table expected to score best; ``-1`` disables
            the good/bad bookkeeping.

    Returns:
        None. Updates the global counters ``good`` and ``bad`` when
        ``goodTable`` is given.
    """
    global good, bad

    readSignal = np.array(getSignalFromRead(path), dtype=float)
    readSignal = readSignal[fromRead:toRead]
    readString = getLevelString(readSignal, smoothParam, levels, overflow)
    readDict = buildDictionary(readString, kmerLength)

    hits = [overlap(readDict, hashTable) for hashTable in hashTables[contigName]]
    # default=None keeps a contig with no tables from raising ValueError.
    max_hits = max(hits, default=None)

    max_i = -1
    for i, hitCount in enumerate(hits):
        if i == goodTable:
            print("->", end='')
        if hitCount == max_hits:
            print("max->", end='')
            # Ties resolve to the last table that reaches the maximum.
            max_i = i
        print(hitCount, end=' ')
    print()

    if goodTable != -1:
        if max_i == goodTable:
            print("Good match!")
            good += 1
        else:
            bad += 1
def processRead(path, contig_name=None, refPosition=-1):
    """Check whether a read maps close to an expected reference position.

    The read signal is cut to ``fromRead:toRead``, converted to a level string
    with ``getGlobalString``, re-encoded over the A/C/G/T alphabet and mapped
    with ``index.map``.

    Args:
        path: Read file handed to ``getSignalFromRead``.
        contig_name: Expected contig name; ``None`` disables the contig check.
        refPosition: Expected hit position as a fraction of the contig length.

    Returns:
        ``True`` when the first hit within 0.001 of ``refPosition`` lies on
        the expected contig (or no contig was given); ``False`` when that hit
        is on another contig, when no hit is close enough, or when there are
        no hits at all.
    """
    print(path)
    readSignal = np.array(getSignalFromRead(path), dtype=float)
    readSignal = readSignal[fromRead:toRead]
    readLevelString = getGlobalString(readSignal)

    # Re-encode the level letters as nucleotides before calling index.map.
    helperDict = {"a": "A", "b": "C", "c": "G", "d": "T"}
    helperString = "".join(helperDict[i] for i in readLevelString)

    levelHits = list(index.map(helperString))
    if not levelHits:
        # No hits at all: return silently, without the trailing message.
        return False

    for hit in levelHits:
        # Hit start expressed relative to the length of its contig.
        hitPosition = hit.r_st / lengths[hit.ctg]
        print(f"Hit position is {hitPosition}")
        if abs(hitPosition - refPosition) <= 0.001:
            if contig_name is not None and hit.ctg != contig_name:
                print("Zle urceny contig!")
                print("Return False.")
                return False
            return True

    print("Return False.")
    return False
def processRead(path, contigName, goodTable=-1):
    """Compare one read with every hash table of a contig and tally the result.

    The read signal is cut to ``fromRead:toRead`` and indexed with
    ``getDictFromSequence``. ``overlap`` is then evaluated against each table
    in ``hashTables[contigName]`` and the scores are printed on one line,
    marking the expected table with ``->`` and maximal scores with ``max->``.

    Args:
        path: Read file handed to ``getSignalFromRead``.
        contigName: Key into the global ``hashTables``.
        goodTable: Index of the table expected to score best; ``-1`` disables
            the good/bad bookkeeping.

    Returns:
        None. Updates the global counters ``good`` and ``bad`` when
        ``goodTable`` is given.
    """
    global good, bad

    readSignal = np.array(getSignalFromRead(path), dtype=float)
    readSignal = readSignal[fromRead:toRead]
    readDict = getDictFromSequence(readSignal, refWindowSize, refWindowJump)

    hits = [overlap(readDict, hashTable) for hashTable in hashTables[contigName]]
    # default=None keeps a contig with no tables from raising ValueError.
    max_hits = max(hits, default=None)

    max_i = -1
    for i, hitCount in enumerate(hits):
        if i == goodTable:
            print("->", end='')
        if hitCount == max_hits:
            print("max->", end='')
            # Ties resolve to the last table that reaches the maximum.
            max_i = i
        print(hitCount, end=' ')
    print()

    if goodTable != -1:
        if max_i == goodTable:
            print("Good match!")
            good += 1
        else:
            bad += 1
def processRead(path, readFromRef=False):
    """Score one read against the global ``hashTable`` and print the score.

    Args:
        path: Read file handed to ``getSignalFromRead``.
        readFromRef: Accepted for signature compatibility; not used here.

    Returns:
        The value returned by ``overlap`` for this read.
    """
    window = np.array(getSignalFromRead(path), dtype=float)[fromRead:toRead]
    hits = overlap(
        getDictFromSequence(window, refWindowSize, refWindowJump),
        hashTable,
    )
    print(f"Number of hits is {hits}")
    return hits
def processRead(path, readFromRef=False):
    """Estimate by random seeding how often a read can be placed in ``hashTable``.

    One hundred times, a random k-mer of the read string is looked up in
    ``hashTable``. While more than one candidate remains, a second k-mer is
    drawn from the same ``operateRange``-wide window, and only candidates with
    an entry of the second k-mer within ``1.3 * operateRange`` are kept. A
    trial counts as a hit when at least one candidate survives.

    Args:
        path: Read file handed to ``getSignalFromRead``.
        readFromRef: Accepted for signature compatibility; not used here.

    Returns:
        Number of trials (out of 100) that ended with a non-empty candidate
        list; 0 when the read string is shorter than one window.
    """
    readSignal = np.array(getSignalFromRead(path), dtype=float)
    readSignal = readSignal[fromRead:toRead]
    readString = getDictFromSequence(readSignal, l=True)

    myHits = 0
    jump = 2 * kmerLength
    operateRange = 10 * kmerLength
    tolerance = operateRange * 1.30

    # random.randint includes both ends, so the last valid window start is
    # len - operateRange; the former "+ 1" let windows run past the string.
    lastStart = len(readString) - operateRange
    if lastStart < 0:
        # Read string shorter than one window: nothing can be sampled.
        print(f"My hits is {myHits}")
        return myHits

    for _ in range(100):
        beg = random.randint(0, lastStart)
        w = readString[beg:beg + kmerLength]
        candidates = hashTable.get(w, [])
        # NOTE(review): this loop only ends once the candidates shrink to at
        # most one; confirm that repetitive regions cannot keep it spinning.
        while len(candidates) > 1:
            j = random.randint(beg + jump, beg + operateRange - kmerLength)
            w = readString[j:j + kmerLength]
            positions = hashTable.get(w, [])
            candidates = [
                k for k in candidates
                if any(abs(k - p) < tolerance for p in positions)
            ]
        if len(candidates) != 0:
            myHits += 1
        # NOTE(review): nesting of these two debug prints was reconstructed
        # from whitespace-collapsed source; confirm they belong per trial.
        print(f"Candidates len is {len(candidates)}")
        print(str(candidates)[:200])

    print(f"My hits is {myHits}")
    return myHits
def helper(hashTable, reads, infoString):
    """Sum ``hashTable`` values over k-mers of the reads and print the total.

    A read is used only if ``getSeqfromRead`` succeeds, its raw signal is
    longer than ``toRead``, and its forward-strand alignments covering more
    than 95 % of the sequence number exactly one for ``infoString == "+"``
    or zero for ``infoString == "-"``. Processing stops once ``kmerNum``
    k-mers have been counted.

    Args:
        hashTable: Mapping from k-mer to a numeric value; a missing k-mer
            counts as 0.
        reads: Iterable of read files.
        infoString: ``"+"`` or ``"-"``; selects the alignment filter and
            labels the printed summary.

    Returns:
        None. Prints one summary line with the sum and the k-mer count.
    """
    total, totalNum = 0, 0
    for readFile in reads:
        if totalNum == kmerNum:
            break
        try:
            readFastq, _ = getSeqfromRead(readFile)
        except Exception:
            # Unreadable read: skip it, but let KeyboardInterrupt and
            # SystemExit propagate.
            continue

        hits = [
            aln for aln in referenceIdx.map(readFastq)
            if aln.q_en - aln.q_st > 0.95 * len(readFastq) and aln.strand == 1
        ]
        if infoString == "+" and len(hits) != 1:
            continue
        if infoString == "-" and hits:
            continue

        readSignal = getSignalFromRead(readFile)
        if len(readSignal) <= toRead:
            continue

        readString = getLevelStr(readSignal[fromRead:toRead], levels)
        # Only full-length windows are k-mers; the trailing windows shorter
        # than k used to be counted in totalNum as well.
        for i in range(len(readString) - k + 1):
            if totalNum == kmerNum:
                break
            total += hashTable.get(readString[i:i + k], 0)
            totalNum += 1

    print(f"{infoString} k {k} l {levels} -> {total} / {totalNum}")
continue # require a single hit with at least 95% coverage of length hits = [ aln for aln in referenceIdx.map(readFastq) if aln.q_en - aln.q_st > 0.95 * len(readFastq) and aln.strand == 1 and (aln.ctg == "contig1") ] if len(hits) != 1: #print("Too many or too few hits, skipping read.") continue counter += 1 print(readFile) readSignal = np.array(getSignalFromRead(readFile), float) readString = getGlobalString(readSignal) readString = "".join(helperDict[i] for i in readString) readString = readString[:1500] levelHits = list(refLevelIdx.map(readString, cs = True)) levelHits = [i for i in levelHits if i.strand == 1] if len(levelHits) == 0: print("No hits!") continue if abs((levelHits[0].r_st/len(refString))-(hits[0].r_st/len(contig)))<0.001: goodPosReads += 1
assert sequenceIndex, "failed to load/build reference index"

# --- nadavca: align the sample read's signal to the reference ---------------
nadavca_align = nadavca.align_signal(
    refFilePath,
    [sampleRead],
    bwa_executable='./bwa/bwa',
)
assert len(nadavca_align) == 1, "Error! More than one alignment!"
nadavca_align = nadavca_align[0]
assert nadavca_align[0].reverse_complement == False, "Error! Reverse strand!"

refStr = str(ref[nadavca_align[0].contig_name])
fromSignal, toSignal = nadavca_align[0].signal_range

# Raw signal of the same read, as floats.
originalSignal = np.array(getSignalFromRead(sampleRead), dtype=float)

# Only the first 40 alignment rows / reference characters are kept.
table = nadavca_align[1][:40]
refSeq = "".join(nadavca_align[0].reference_part)[:40]

x, y = [], []
for entry in table:
    # entry is a list of [ref_index, signal_start, signal_end]
    x.append(str(refStr[entry[0]]))
    for i in range(entry[1], entry[2]):
        # One blank label and one signal sample per aligned signal point.
        x.append(" ")
        y.append(originalSignal[i])
#plt.plot(y)
#plt.xticks(y_pos, x, color='orange', rotation=45, fontweight='bold', horizontalalignment='right')
] if len(hits) != 1: # print("Too many or too few hits, skipping read.") continue print(readFile) hit = hits[0] successfulReads += 1 if hit.strand == 1: refSeq = str(Fasta(refFile)[hit.ctg][hit.r_st:hit.r_en]) fakeSeq = str(-Fasta(refFile)[hit.ctg][hit.r_st:hit.r_en]) else: refSeq = str(-Fasta(refFile)[hit.ctg][hit.r_st:hit.r_en]) fakeSeq = str(Fasta(refFile)[hit.ctg][hit.r_st:hit.r_en]) readSignal = np.array(getSignalFromRead(readFile)[signalFrom:signalTo], dtype=float) refSignal = np.array( stringToSignal(refSeq, mod, repeatSignal=repeatSignal), float) # fakeSignal = np.array(stringToSignal(fakeSeq, mod, repeatSignal = repeatSignal), # float) fakeSignal = [] fakeIndex = -1 while len(fakeSignal) <= signalTo: fakeIndex = random.randint(0, len(negReads) - 1) fakeSignal = np.array(getSignalFromRead(negReads[fakeIndex]), dtype=float) fakeSignal = fakeSignal[signalFrom:signalTo] readSignalSm = smoothSignal(readSignal, 5) refSignalSm = smoothSignal(refSignal, 5)
if (toSignal - fromSignal) < workingLen: continue #print(f"Signal alligned from {fromSignal} to {toSignal}") print("Working on", posRead) print(f"So far done {readCounter} reads") readCounter += 1 if strand == 1: refSeq = str(Fasta(refFilePath)[ctg][fromRef:toRef]) else: refSeq = str(-Fasta(refFilePath)[ctg][fromRef:toRef]) refSignal = np.array( stringToSignal(refSeq, mod, repeatSignal=repeatSignal), float) readSignal = np.array(getSignalFromRead(posRead), dtype=float) readSignal = readSignal[fromSignal:toSignal] readSignal = readSignal[:workingLen] refSignal = refSignal[:workingLen] readSignalSm = smoothSignal(readSignal, smoothParam) refSignalSm = smoothSignal(refSignal, smoothParam) readShift, readScale = computeNorm(readSignal, 0, len(readSignal)) readShiftSm, readScaleSm = computeNorm(readSignalSm, 0, len(readSignalSm)) refShift, refScale = computeNorm(refSignal, 0, len(refSignal)) refShiftSm, refScaleSm = computeNorm(refSignalSm, 0, len(refSignalSm)) readStrings, readStringsSm, refStrings, refStringsSm = {}, {}, {}, {}
if (toSignal - fromSignal) < workingLen: continue # print(f"Signal alligned from {fromSignal} to {toSignal}") print("Working on", posRead) print(f"So far done {readCounter} reads") readCounter += 1 if strand == 1: refSeq = str(Fasta(refFilePath)[ctg][fromRef:toRef]) else: refSeq = str(-Fasta(refFilePath)[ctg][fromRef:toRef]) refSignal = np.array( stringToSignal(refSeq, mod, repeatSignal=repeatSignal), float) readSignal = np.array(getSignalFromRead(posRead), dtype=float) readSignal = readSignal[fromSignal:toSignal] fakeSignal = [] fakeIndex = -1 while len(fakeSignal) <= toSignal: fakeIndex = random.randint(0, len(negReadsPaths) - 1) fakeSignal = np.array(getSignalFromRead(negReadsPaths[fakeIndex]), dtype=float) fakeSignal = fakeSignal[fromSignal:toSignal] readSignal = readSignal[:workingLen] refSignal = refSignal[:workingLen] fakeSignal = fakeSignal[:workingLen] readSignal = smoothSignal(readSignal, smoothParam) refSignal = smoothSignal(refSignal, smoothParam)
posReads = getReadsInFolder(readsPosFilePath, minSize=0)
negReads = getReadsInFolder(readsNegFilePath, minSize=0)

################################################################################

# Accumulate |mean - median| of the raw signal over at most readCount
# readable positive reads.
totalCount = 0
suma = 0
for readFile in posReads:
    if totalCount == readCount:
        break
    try:
        readSignal = getSignalFromRead(readFile)
    except Exception:
        # Unreadable read: skip it, but let KeyboardInterrupt and SystemExit
        # propagate.
        continue
    mean = np.mean(readSignal)
    median = np.median(readSignal)
    stdev = np.std(readSignal)
    totalCount += 1
    # print(f"Mean is {mean}")
    # print(f"Median is {median}")
    # print(f"Stdeviation is {stdev}")
    suma += abs(mean - median)
### get corresponding part of the reference using minimap2
referenceIdx = mp.Aligner(refFile)
# Explicit check instead of assert, which is stripped under "python -O".
if not referenceIdx:
    raise RuntimeError("failed to load/build reference index")

for readFile in posReads:
    if successfulReads == maxTests:
        break
    print(readFile)

    ### read read
    try:
        readFastq, readEvents = getSeqfromRead(readFile)
    except Exception:
        # Unreadable read: report and skip it, but let KeyboardInterrupt and
        # SystemExit propagate.
        print("Bad read!")
        continue

    # readSeq - sequence cut out from read
    # readSignal - corresponding signal section
    readSeq = seqSignalCor(signalFrom, signalTo, readEvents)
    readSignal = np.array(getSignalFromRead(readFile)[signalFrom:signalTo],
                          dtype=float)

    # require a single hit with at least 95% coverage of length
    hits = [
        aln for aln in referenceIdx.map(readSeq)
        if aln.q_en - aln.q_st > 0.95 * len(readSeq)
    ]
    if len(hits) != 1:
        print("Too many or too few hits, skipping read.")
        continue

    hit = hits[0]
    successfulReads += 1
import sys

import numpy as np
import matplotlib.pyplot as plt

# signalHelper lives outside the package; extend the path before importing it.
sys.path.append("../helpers/hypothesis")
from signalHelper import getSignalFromRead

read = "../data/pos-basecalled/magnu_20181010_FAH93149_MN26672_sequencing_run_sapIng_19842_read_1706_ch_249_strand.fast5"
# fromSignal, toSignal = 10050, 10110
# fromSignal, toSignal = 10050, 10200
fromSignal, toSignal = 10150, 10200
levels = 6

signal = getSignalFromRead(read)[fromSignal:toSignal]

# Split the value range [mini, maxi) into `levels` equally tall bands; the
# +5 margin keeps the maximum sample inside the top band.
mini, maxi = min(signal), max(signal) + 5
levelSize = (maxi - mini) / levels

y_values = [chr(ord("a") + i) for i in range(levels)]
y_axis = np.arange(0, levels, 1)

# One horizontal line per band boundary.
for a in np.arange(mini, maxi + levelSize, levelSize):
    plt.axhline(y=a, color="r", linewidth=2)

# Letter of the band each sample falls into ("a" is the lowest band).
helper = [chr(ord("a") + int((i - mini) / levelSize)) for i in signal]
signalLevels = [" "] * len(helper)
events = []
begg = 0
# Command line: <read file> <reference file> <signal from> <signal to>
readFile = sys.argv[1]
refFile = sys.argv[2]
signalFrom = int(sys.argv[3])
signalTo = int(sys.argv[4])

levels = 6
repeatSignal = 10
kmerModelFilePath = "../../../data/kmer_model.hdf5"
mod = KmerModel.load_from_hdf5(kmerModelFilePath)

### read read
# readSeq - sequence cut out from read
# readSignal - corresponding signal section
readFastq, readEvents = getSeqfromRead(readFile)
readSeq = seqSignalCor(signalFrom, signalTo, readEvents)
readSignal = np.array(getSignalFromRead(readFile)[signalFrom:signalTo],
                      dtype=float)

### get corresponding part of the reference using minimap2
referenceIdx = mp.Aligner(refFile)
# Explicit check instead of assert, which is stripped under "python -O".
if not referenceIdx:
    raise RuntimeError("failed to load/build reference index")

# require a single hit with at least 95% coverage of length
hits = [
    aln for aln in referenceIdx.map(readSeq)
    if aln.q_en - aln.q_st > 0.95 * len(readSeq)
]
if len(hits) != 1:
    print("Too many or too few hits, skipping read.")
    # sys.exit is always available; the bare exit() helper is injected by
    # the site module and may be missing. Exit status stays 0.
    sys.exit(0)
def normal(signal): newsignal = signal - np.mean(signal) newsignal /= np.std(newsignal) return newsignal def f(signal, mini, levelSize): helper = [chr(ord("a") + int((i - mini) / levelSize)) for i in signal] helper = "".join(helper) helper = "".join([k for k, g in groupby(helper)]) return helper signal = getSignalFromRead(read) signal1, signal2 = [], [] found = False levelSize = 0, 0, 0 signal = normal(signal) signal[signal > maxi] = maxi signal[signal < mini] = mini helper = {} counter = 3 for i in range(0, len(signal) - 2 * workLen + 1, 30):
refPosition = hit.r_st / len(ref[hit.ctg]) print(f"I am in ctg {hit.ctg} in around {refPosition}") if hit.strand == 1: refSeq = str(ref[hit.ctg][hit.r_st : hit.r_en]) refSignal = stringToSignal(refSeq, mod, repeatSignal) refSignal = smoothSignal(refSignal, smoothParam) #refShift, refScale = computeNorm(refSignal, 0, len(refSignal)) refShift, refScale = globalNorms[hit.ctg][0], globalNorms[hit.ctg][1] refString = computeString( refSignal, 0, len(refSignal), refShift, refScale, level, overflow=overflow, ) refString = refString[5:-5] readSignal = getSignalFromRead(sample) readSignalLen = len(readSignal) readSignal = readSignal[readSignalBeg:readSignalEnd] readString = getLevelString(readSignal, smoothParam, level, overflow) found = None for i in range(len(storeContig[hit.ctg]) - len(refString) + 1): w = storeContig[hit.ctg][i : i + len(refString)] if w == refString: found = i break if found == None: print("Problem") exit(0)