x += 1 X.append(x) Y.append(y) return X, Y hashTable = {} processed = [] for contig in Fasta(refFilePath): if contig.name != workingContig: continue processed.append(contig.name) contigStr = str(contig) contigSignal = stringToSignal(contigStr, mod, repeatSignal=repeatSignal) hashTable = getDictFromSequence(contigSignal, refWindowSize, refWindowJump) #print("Hashtable readyfor {0} nums!".format(contigNum)) ####################################### ''' print("Overlap is:") for i in range(len(hashTables)): for j in range(len(hashTables)): if i != j: counter = 0 for k in hashTables[i]: if k in hashTables[j]: counter += 1 print("{0} {1}: {2} with sizes {3} {4}".format(i, j, counter, len(hashTables[i]), len(hashTables[j]))) '''
if (toSignal - fromSignal) < workingLen: continue # print(f"Signal alligned from {fromSignal} to {toSignal}") print("Working on", posRead) print(f"So far done {readCounter} reads") readCounter += 1 if strand == 1: refSeq = str(Fasta(refFilePath)[ctg][fromRef:toRef]) else: refSeq = str(-Fasta(refFilePath)[ctg][fromRef:toRef]) refSignal = np.array( stringToSignal(refSeq, mod, repeatSignal=repeatSignal), float) readSignal = np.array(getSignalFromRead(posRead), dtype=float) readSignal = readSignal[fromSignal:toSignal] fakeSignal = [] fakeIndex = -1 while len(fakeSignal) <= toSignal: fakeIndex = random.randint(0, len(negReadsPaths) - 1) fakeSignal = np.array(getSignalFromRead(negReadsPaths[fakeIndex]), dtype=float) fakeSignal = fakeSignal[fromSignal:toSignal] readSignal = readSignal[:workingLen] refSignal = refSignal[:workingLen] fakeSignal = fakeSignal[:workingLen] readSignal = smoothSignal(readSignal, smoothParam)
# load filenames of all positive and negative reads posFast5 = glob.glob(readsPosFilePath + '/*.fast5', recursive=True) negFast5 = glob.glob(readsNegFilePath + '/*.fast5', recursive=True) assert len(posFast5) >= posTestCases, "Not enough positive testcases!" assert len(negFast5) >= negTestCases, "Not enough negative testcases!" ################################################################################ hashTables = {} processed = [] for contig in Fasta(refFilePath): processed.append(contig.name) hashTables[contig.name] = [] contigSignal = stringToSignal(str(contig), mod, repeatSignal=repeatSignal) for i in range(0, len(contigSignal) - hashWinSize + 1, hashWinJump): hashTables[contig.name].append( getDictFromSequence(contigSignal[i:i + hashWinSize], refWindowSize, refWindowJump)) contigNum -= 1 if contigNum == 0: break print("Hashtable readyfor {0} nums!".format(contigNum)) ####################################### ''' print("Overlap is:") for i in range(len(hashTables)): for j in range(len(hashTables)): if i != j:
# Work with the first 40 alignment entries and the first 40 reference bases.
table = nadavca_align[1][:40]
refSeq = "".join(nadavca_align[0].reference_part)[:40]

# x collects tick labels (one base followed by a blank per signal sample),
# y collects the raw signal samples covered by the alignment entries.
x, y = [], []
for entry in table:
    #entry is list of [ref_index, signal_start, signal_end]
    # NOTE(review): `refStr` is not defined in this fragment -- confirm it is
    # not meant to be `refSeq`.
    x.append(str(refStr[entry[0]]))
    for i in range(entry[1], entry[2]):
        x.append(" ")
        y.append(originalSignal[i])

#plt.plot(y)
#plt.xticks(y_pos, x, color='orange', rotation=45, fontweight='bold', horizontalalignment='right')
#plt.tick_params(labelbottom='off')

refSignal = stringToSignal(refSeq, mod, repeatSignal=repeatSignal)

# One label per expected-signal value: the base itself, then
# repeatSignal - 1 "_" placeholders.
refSeqHelper = []
for i in refSeq:
    refSeqHelper.append(i)
    for k in range(repeatSignal - 1):
        refSeqHelper.append("_")

y = smoothSignal(y, 5)
refSignal = smoothSignal(refSignal, 5)

ySignalShift, ySignalScale = computeNorm(y, 0, len(y))
#y -= ySignalShift
#y /= ySignalScale
refSignalShift, refSignalScale = computeNorm(refSignal, 0, len(refSignal))
# kmer model
kmerModelFilePath = "../data/kmer_model.hdf5"

# repeatSignal is passed to stringToSignal below; fromRef/toRef select the
# reference slice of `contig` that gets plotted.
repeatSignal = 10
fromRef, toRef = 100000, 100050
contig = "contig1"

import sys

import numpy as np
from pyfaidx import Fasta
from nadavca.dtw import KmerModel

# signalHelper lives outside the package path, so extend sys.path first.
sys.path.append("../helpers/hypothesis")
from signalHelper import stringToSignal
import matplotlib.pyplot as plt

################################################################################

# load the reference sequence and the k-mer model
ref = Fasta(refFilePath)
mod = KmerModel.load_from_hdf5(kmerModelFilePath)

# Convert the selected reference slice to a signal and plot it. The configured
# repeatSignal is used here instead of a second hard-coded 10, so changing the
# constant above actually takes effect.
sequence = str(ref[contig][fromRef:toRef])
signal = stringToSignal(sequence, mod, repeatSignal=repeatSignal)
plt.plot(signal)
plt.show()
ctg = index[readName][0] if (toSignal - fromSignal) < workingLen: continue #print(f"Signal alligned from {fromSignal} to {toSignal}") print("Working on", posRead) print(f"So far done {readCounter} reads") readCounter += 1 if strand == 1: refSeq = str(Fasta(refFilePath)[ctg][fromRef:toRef]) else: refSeq = str(-Fasta(refFilePath)[ctg][fromRef:toRef]) refSignal = np.array(stringToSignal(refSeq, mod, repeatSignal=repeatSignal), float) readSignal = np.array(getSignalFromRead(posRead), dtype=float) readSignal = readSignal[fromSignal:toSignal] fakeSignal = [] fakeIndex = -1 while len(fakeSignal) <= toSignal: fakeIndex = random.randint(0, len(negReadsPaths) - 1) fakeSignal = np.array(getSignalFromRead(negReadsPaths[fakeIndex]), dtype=float) fakeSignal = fakeSignal[fromSignal:toSignal] readSignal = readSignal[:workingLen] refSignal = refSignal[:workingLen] fakeSignal = fakeSignal[:workingLen] #readSignal = smoothSignal(readSignal, smoothParam) #refSignal = smoothSignal(refSignal, smoothParam)
] posFast5 = [data[i] for i in range(0, len(data), 2)] basecalledFast5 = [data[i] for i in range(1, len(data), 2)] negFast5 = glob.glob(readsNegFilePath + "/*.fast5", recursive=True) ################################################################################ hashTable = {} for contig in Fasta(refFilePath): if contig.name != targetContig: continue ref = str(contig) ref = ref[targetBeg:targetEnd] contigSignal = stringToSignal(ref, mod, repeatSignal=repeatSignal) hashTable = getDictFromSequence(contigSignal) break #for k in sorted(hashTable, key=hashTable.get, reverse=True)[:100]: # pass # # print("{0} {1}".format(k, hashTable[k])) # # del hashTable[k] def processRead(path, readFromRef=False): readSignal = np.array(getSignalFromRead(path), dtype=float) readSignal = readSignal[fromRead:toRead] readString = getDictFromSequence(readSignal, l=True)
aln for aln in referenceIdx.map(readSeq) if aln.q_en - aln.q_st > 0.95 * len(readSeq) ] if len(hits) != 1: print("Too many or too few hits, skipping read.") exit(0) hit = hits[0] if (hit.strand == 1): refSeq = str(Fasta(refFile)[hit.ctg][hit.r_st:hit.r_en]) fakeSeq = str(-Fasta(refFile)[hit.ctg][hit.r_st:hit.r_en]) else: refSeq = str(-Fasta(refFile)[hit.ctg][hit.r_st:hit.r_en]) fakeSeq = str(Fasta(refFile)[hit.ctg][hit.r_st:hit.r_en]) refSignal = np.array(stringToSignal(refSeq, mod, repeatSignal=repeatSignal), float) fakeSignal = np.array(stringToSignal(fakeSeq, mod, repeatSignal=repeatSignal), float) print(readSeq) print(refSeq) print(fakeSeq) # refSeq - part of the reference sequence corresponding to the read segment readSignalSm = smoothSignal(readSignal, 5) refSignalSm = smoothSignal(refSignal, 5) fakeSignalSm = smoothSignal(fakeSignal, 5) readShift, readScale = computeNorm(readSignal, 0, len(readSignal)) refShift, refScale = computeNorm(refSignal, 0, len(refSignal))
import numpy as np from pyfaidx import Fasta from nadavca.dtw import KmerModel from signalHelper import stringToSignal from signalHelper import smoothSignal, computeNorm, computeString ref = Fasta(refFilePath) mod = KmerModel.load_from_hdf5(kmerModelFilePath) for contig in ref: refSeqPos = str(contig[:]) refSeqNeg = str(contig[:].complement) refSignalPos = np.array( stringToSignal(refSeqPos, mod, repeatSignal=repeatSignal), float) refSignalNeg = np.array( stringToSignal(refSeqNeg, mod, repeatSignal=repeatSignal), float) refSignalPos = smoothSignal(refSignalPos, smoothParam) refSignalNeg = smoothSignal(refSignalNeg, smoothParam) refSignalPosShift, refSignalPosScale = computeNorm(refSignalPos, 0, len(refSignalPos)) refSignalNegShift, refSignalNegScale = computeNorm(refSignalNeg, 0, len(refSignalNeg)) for l in levels: refStringPos = computeString( refSignalPos, 0,