Beispiel #1
0
            x += 1
        X.append(x)
        Y.append(y)
    return X, Y


# Build the hash table for the one contig whose name equals workingContig.
hashTable = {}
processed = []

for contig in Fasta(refFilePath):
    if contig.name == workingContig:
        processed.append(contig.name)

        # Convert the contig text to a signal and pass it to
        # getDictFromSequence together with the window size and jump.
        contigStr = str(contig)
        contigSignal = stringToSignal(contigStr,
                                      mod,
                                      repeatSignal=repeatSignal)
        hashTable = getDictFromSequence(contigSignal, refWindowSize,
                                        refWindowJump)

#print("Hashtable readyfor {0} nums!".format(contigNum))
#######################################
'''
print("Overlap is:")
for i in range(len(hashTables)):
    for j in range(len(hashTables)):
        if i != j:
            counter = 0
            for k in hashTables[i]:
                if k in hashTables[j]:
                    counter += 1
            print("{0} {1}: {2} with sizes {3} {4}".format(i, j, counter, len(hashTables[i]), len(hashTables[j])))
'''
Beispiel #2
0
    # Skip this iteration when the span fromSignal..toSignal is shorter than
    # workingLen samples.
    if (toSignal - fromSignal) < workingLen:
        continue

    # print(f"Signal alligned from {fromSignal} to {toSignal}")
    # Progress output; readCounter is the number of reads handled before
    # this one.
    print("Working on", posRead)
    print(f"So far done {readCounter} reads")
    readCounter += 1

    # Fetch the reference segment [fromRef:toRef] of contig ctg. For any
    # strand other than 1 the unary minus is applied to the sliced sequence
    # (presumably pyfaidx's reverse complement -- confirm).
    # NOTE(review): the FASTA file is re-opened on every iteration.
    if strand == 1:
        refSeq = str(Fasta(refFilePath)[ctg][fromRef:toRef])
    else:
        refSeq = str(-Fasta(refFilePath)[ctg][fromRef:toRef])

    # Signal derived from the reference segment, as a float array.
    refSignal = np.array(
        stringToSignal(refSeq, mod, repeatSignal=repeatSignal), float)
    # Signal of the positive read, restricted to [fromSignal:toSignal].
    readSignal = np.array(getSignalFromRead(posRead), dtype=float)
    readSignal = readSignal[fromSignal:toSignal]
    # Draw random negative reads until one has more than toSignal samples,
    # then cut the same window out of it.
    # NOTE(review): this never terminates if no negative read is that long.
    fakeSignal = []
    fakeIndex = -1
    while len(fakeSignal) <= toSignal:
        fakeIndex = random.randint(0, len(negReadsPaths) - 1)
        fakeSignal = np.array(getSignalFromRead(negReadsPaths[fakeIndex]),
                              dtype=float)
    fakeSignal = fakeSignal[fromSignal:toSignal]

    # Truncate all three signals to at most workingLen samples.
    readSignal = readSignal[:workingLen]
    refSignal = refSignal[:workingLen]
    fakeSignal = fakeSignal[:workingLen]

    # Smooth the read signal with the configured smoothParam.
    readSignal = smoothSignal(readSignal, smoothParam)
Beispiel #3
0
# load filenames of all positive and negative reads
# NOTE: glob's recursive=True only has an effect when the pattern contains
# "**"; with these patterns only the top level of each directory is searched.
posFast5 = glob.glob(readsPosFilePath + '/*.fast5', recursive=True)
negFast5 = glob.glob(readsNegFilePath + '/*.fast5', recursive=True)

assert len(posFast5) >= posTestCases, "Not enough positive testcases!"
assert len(negFast5) >= negTestCases, "Not enough negative testcases!"

################################################################################

# hashTables maps each contig name to a list with one getDictFromSequence
# result per window of hashWinSize samples, taken every hashWinJump samples.
hashTables = {}
processed = []

for contig in Fasta(refFilePath):
    processed.append(contig.name)
    hashTables[contig.name] = []
    contigSignal = stringToSignal(str(contig), mod, repeatSignal=repeatSignal)
    for i in range(0, len(contigSignal) - hashWinSize + 1, hashWinJump):
        hashTables[contig.name].append(
            getDictFromSequence(contigSignal[i:i + hashWinSize], refWindowSize,
                                refWindowJump))
    # contigNum counts down the contigs still to process; the loop stops when
    # it reaches exactly zero.
    contigNum -= 1
    if contigNum == 0:
        break

# Report how many contigs were actually processed. contigNum has been counted
# down by the loop above, so printing it would show the leftover counter
# (0 after the break) rather than the amount of work done.
print("Hashtable ready for {0} nums!".format(len(processed)))
#######################################
'''
print("Overlap is:")
for i in range(len(hashTables)):
    for j in range(len(hashTables)):
        if i != j:
# Keep only the first 40 alignment entries / reference characters.
table = nadavca_align[1][:40]
refSeq = "".join(nadavca_align[0].reference_part)[:40]

x = []
y = []

# Each entry is a list of [ref_index, signal_start, signal_end].
for entry in table:
    x.append(str(refStr[entry[0]]))
    signalSpan = range(entry[1], entry[2])
    # One blank label and one signal sample per index covered by the entry.
    x.extend(" " for _ in signalSpan)
    y.extend(originalSignal[pos] for pos in signalSpan)

#plt.plot(y)
#plt.xticks(y_pos, x, color='orange', rotation=45, fontweight='bold', horizontalalignment='right')
#plt.tick_params(labelbottom='off')

refSignal = stringToSignal(refSeq, mod, repeatSignal=repeatSignal)

# Every reference character followed by (repeatSignal - 1) "_" placeholders.
refSeqHelper = []
for i in refSeq:
    refSeqHelper.append(i)
    refSeqHelper.extend("_" * (repeatSignal - 1))

# Smooth both signals with the same parameter (5).
y = smoothSignal(y, 5)
refSignal = smoothSignal(refSignal, 5)

yNorm = computeNorm(y, 0, len(y))
ySignalShift, ySignalScale = yNorm
#y -= ySignalShift
#y /= ySignalScale

refNorm = computeNorm(refSignal, 0, len(refSignal))
refSignalShift, refSignalScale = refNorm
Beispiel #5
0
# Path to the k-mer model file (HDF5).
kmerModelFilePath = "../data/kmer_model.hdf5"

# Value for stringToSignal's repeatSignal parameter.
repeatSignal = 10

# Reference window bounds and the name of the contig they refer to.
fromRef = 100_000
toRef = 100_050
contig = "contig1"

import sys
import numpy as np
from pyfaidx import Fasta
from nadavca.dtw import KmerModel

sys.path.append("../helpers/hypothesis")
from signalHelper import stringToSignal

import matplotlib.pyplot as plt

################################################################################
# load reference sequence and create index for fast mapping

# Open the reference FASTA and load the k-mer model.
ref = Fasta(refFilePath)
mod = KmerModel.load_from_hdf5(kmerModelFilePath)

# Text of the slice [fromRef:toRef] of the selected contig.
sequence = str(ref[contig][fromRef:toRef])

# Pass the configured repeatSignal instead of a second hard-coded 10, so the
# setting defined with the other constants actually controls the conversion.
signal = stringToSignal(sequence, mod, repeatSignal=repeatSignal)

plt.plot(signal)
plt.show()
Beispiel #6
0
    # First element of this read's index entry, used below as the contig key.
    ctg = index[readName][0]

    # Skip this iteration when the span fromSignal..toSignal is shorter than
    # workingLen samples.
    if (toSignal - fromSignal) < workingLen:
        continue

    #print(f"Signal alligned from {fromSignal} to {toSignal}")
    # Progress output; readCounter is the number of reads handled before
    # this one.
    print("Working on", posRead)
    print(f"So far done {readCounter} reads")
    readCounter += 1

    # Fetch the reference segment [fromRef:toRef] of contig ctg. For any
    # strand other than 1 the unary minus is applied to the sliced sequence
    # (presumably pyfaidx's reverse complement -- confirm).
    # NOTE(review): the FASTA file is re-opened on every iteration.
    if strand == 1:
        refSeq = str(Fasta(refFilePath)[ctg][fromRef:toRef])
    else:
        refSeq = str(-Fasta(refFilePath)[ctg][fromRef:toRef])

    # Reference-derived signal and the positive read's signal as float
    # arrays; the read is restricted to [fromSignal:toSignal].
    refSignal = np.array(stringToSignal(refSeq, mod, repeatSignal=repeatSignal), float)
    readSignal = np.array(getSignalFromRead(posRead), dtype=float)
    readSignal = readSignal[fromSignal:toSignal]
    # Draw random negative reads until one has more than toSignal samples,
    # then cut the same window out of it.
    # NOTE(review): this never terminates if no negative read is that long.
    fakeSignal = []
    fakeIndex = -1
    while len(fakeSignal) <= toSignal:
        fakeIndex = random.randint(0, len(negReadsPaths) - 1)
        fakeSignal = np.array(getSignalFromRead(negReadsPaths[fakeIndex]), dtype=float)
    fakeSignal = fakeSignal[fromSignal:toSignal]

    # Truncate all three signals to at most workingLen samples.
    readSignal = readSignal[:workingLen]
    refSignal = refSignal[:workingLen]
    fakeSignal = fakeSignal[:workingLen]

    # Smoothing is currently disabled for this variant.
    #readSignal = smoothSignal(readSignal, smoothParam)
    #refSignal = smoothSignal(refSignal, smoothParam)
Beispiel #7
0
]
# De-interleave `data`: even positions go to posFast5, odd positions to
# basecalledFast5. Extended slices replace the index-based comprehensions;
# list() keeps the results plain lists whatever sequence type `data` is.
posFast5 = list(data[0::2])
basecalledFast5 = list(data[1::2])

# NOTE: glob's recursive=True only has an effect when the pattern contains
# "**"; with this pattern only the top level of the directory is searched.
negFast5 = glob.glob(readsNegFilePath + "/*.fast5", recursive=True)

################################################################################

# Hash table of the region [targetBeg:targetEnd] of the contig named
# targetContig; stays empty when no contig has that name.
hashTable = {}

for contig in Fasta(refFilePath):
    if contig.name == targetContig:
        ref = str(contig)[targetBeg:targetEnd]
        contigSignal = stringToSignal(ref, mod, repeatSignal=repeatSignal)
        hashTable = getDictFromSequence(contigSignal)
        # Only the first matching contig is used.
        break

#for k in sorted(hashTable, key=hashTable.get, reverse=True)[:100]:
#    pass
#    # print("{0} {1}".format(k, hashTable[k]))
#    # del hashTable[k]


def processRead(path, readFromRef=False):
    readSignal = np.array(getSignalFromRead(path), dtype=float)
    readSignal = readSignal[fromRead:toRead]

    readString = getDictFromSequence(readSignal, l=True)
Beispiel #8
0
    aln for aln in referenceIdx.map(readSeq)
    if aln.q_en - aln.q_st > 0.95 * len(readSeq)
]
# Exactly one qualifying alignment is required; otherwise stop the script.
if len(hits) != 1:
    print("Too many or too few hits, skipping read.")
    # `exit` is injected by the `site` module for interactive sessions and is
    # not guaranteed to exist (e.g. under `python -S`); raising SystemExit
    # ends the script with the same exit status.
    raise SystemExit(0)
hit = hits[0]

# Open the reference once and slice the aligned region a single time. Unary
# minus on the sliced pyfaidx sequence gives the opposite-strand sequence
# used for the other orientation.
refRegion = Fasta(refFile)[hit.ctg][hit.r_st:hit.r_en]
forwardSeq = str(refRegion)
reverseSeq = str(-refRegion)

# refSeq follows the strand of the hit; fakeSeq is the other orientation.
if hit.strand == 1:
    refSeq, fakeSeq = forwardSeq, reverseSeq
else:
    refSeq, fakeSeq = reverseSeq, forwardSeq

refSignal = np.array(stringToSignal(refSeq, mod, repeatSignal=repeatSignal),
                     float)
fakeSignal = np.array(stringToSignal(fakeSeq, mod, repeatSignal=repeatSignal),
                      float)

print(readSeq)
print(refSeq)
print(fakeSeq)
# refSeq - part of the reference sequence corresponding to the read segment

# Smoothed copies of the three signals (smoothing parameter 5).
readSignalSm = smoothSignal(readSignal, 5)
refSignalSm = smoothSignal(refSignal, 5)
fakeSignalSm = smoothSignal(fakeSignal, 5)

# NOTE(review): the norms are computed from the unsmoothed signals, not from
# the *Sm variants above -- confirm this is intended.
readShift, readScale = computeNorm(readSignal, 0, len(readSignal))
refShift, refScale = computeNorm(refSignal, 0, len(refSignal))
Beispiel #9
0
import numpy as np

from pyfaidx import Fasta
from nadavca.dtw import KmerModel
from signalHelper import stringToSignal
from signalHelper import smoothSignal, computeNorm, computeString

# Open the reference FASTA located at refFilePath.
ref = Fasta(refFilePath)
# Load the k-mer model from the HDF5 file at kmerModelFilePath.
mod = KmerModel.load_from_hdf5(kmerModelFilePath)

for contig in ref:
    refSeqPos = str(contig[:])
    refSeqNeg = str(contig[:].complement)

    refSignalPos = np.array(
        stringToSignal(refSeqPos, mod, repeatSignal=repeatSignal), float)
    refSignalNeg = np.array(
        stringToSignal(refSeqNeg, mod, repeatSignal=repeatSignal), float)

    refSignalPos = smoothSignal(refSignalPos, smoothParam)
    refSignalNeg = smoothSignal(refSignalNeg, smoothParam)

    refSignalPosShift, refSignalPosScale = computeNorm(refSignalPos, 0,
                                                       len(refSignalPos))
    refSignalNegShift, refSignalNegScale = computeNorm(refSignalNeg, 0,
                                                       len(refSignalNeg))

    for l in levels:
        refStringPos = computeString(
            refSignalPos,
            0,