コード例 #1
0
def processRead(path, contig_name=None, refPosition=-1):
    """Map the level string of one read with ``index`` and print its hits.

    The signal of the read at ``path`` is cut to ``fromRead:toRead``, turned
    into a level string by ``getGlobalString`` and re-encoded letter by letter
    (a->A, b->C, c->G, d->T) before being passed to ``index.map``.

    Args:
        path: Path of the read file; printed and handed to
            ``getSignalFromRead``.
        contig_name: Expected contig of the read, or ``None`` to skip the
            contig check.
        refPosition: Expected relative position of the read (hit start
            divided by contig length). Hits within 0.001 of it are checked
            against ``contig_name``.

    Returns:
        Always ``False``. This variant only prints diagnostics; no code path
        returns ``True``.
    """
    print(path)
    readSignal = np.array(getSignalFromRead(path), dtype=float)
    readSignal = readSignal[fromRead:toRead]
    readLevelString = getGlobalString(readSignal)

    # Re-encode the level letters as nucleotide letters for index.map.
    helperDict = {"a": "A", "b": "C", "c": "G", "d": "T"}
    helperString = "".join(helperDict[i] for i in readLevelString)
    levelHits = list(index.map(helperString))

    if not levelHits:
        return False

    for hit in levelHits:
        # Reference span vs. query span of the hit.
        print("{0}: {1} vs {2}".format(hit.ctg, hit.r_en - hit.r_st,
                                       hit.q_en - hit.q_st))
        hitPosition = hit.r_st / lengths[hit.ctg]
        print(f"Position of hit is {hitPosition}")
        if abs(hitPosition - refPosition) <= 0.001:
            if contig_name is not None and hit.ctg != contig_name:
                # Diagnostic only: the loop keeps going after this message.
                print("Zle urceny contig!")
                print("Return False.")
    print("Return False.")
    return False
コード例 #2
0
def processRead(path, contigName):
    """Return the overlap between one read's k-mer dictionary and ``hashTable``.

    The read signal is cut to ``fromRead:toRead``, converted to a level string
    and split into a dictionary by ``buildDictionary``. ``contigName`` is
    accepted but not used here.
    """
    rawSignal = getSignalFromRead(path)
    windowSignal = np.array(rawSignal, dtype=float)[fromRead:toRead]

    levelString = getLevelString(windowSignal, smoothParam, levels, overflow)
    kmerDict = buildDictionary(levelString, kmerLength)
    return overlap(kmerDict, hashTable)
コード例 #3
0
def processRead(path, contig_name=None):
    """Time level-string extraction and mapping for one read.

    Samples 500..7000 of the read signal are converted to a level string,
    re-encoded as nucleotide letters, and the first and last 1000 symbols are
    mapped with ``index.map``. The relative position of every hit is printed
    together with wall-clock timings. ``contig_name`` is not used.

    Returns:
        Always ``False``.
    """
    print(path)
    signalWindow = np.array(getSignalFromRead(path), dtype=float)[500:7000]

    tic = time.time()
    levelString = getGlobalString(signalWindow)
    print("--- Getting global string takes %s seconds ---" %
          (time.time() - tic))

    toNucleotide = {"a": "A", "b": "C", "c": "G", "d": "T"}
    helperString = "".join([toNucleotide[symbol] for symbol in levelString])

    print("Len of signal is {}".format(len(signalWindow)))
    print("Len of helper string is {}".format(len(helperString)))

    tic = time.time()

    # Same treatment for the prefix ("Fir") and the suffix ("Sec").
    for label, fragment in (("Fir", helperString[:1000]),
                            ("Sec", helperString[-1000:])):
        print(label)
        for hit in index.map(fragment):
            relativeStart = hit.r_st / lengths[hit.ctg]
            print("Position of hit is {0}".format(relativeStart))

    print("--- Searching string takes %s seconds ---" %
          (time.time() - tic))

    return False
コード例 #4
0
def processRead(path, contigName, goodTable=-1):
    """Score one read against every hash table of ``contigName``.

    Prints all overlap scores on one line, marking the table with index
    ``goodTable`` with ``->`` and every maximal score with ``max->``. When
    ``goodTable`` is not -1, the module-level counter ``good`` or ``bad`` is
    incremented depending on whether the last maximal score sits at
    ``goodTable``.

    Returns:
        None.
    """
    global good, bad

    windowSignal = np.array(getSignalFromRead(path),
                            dtype=float)[fromRead:toRead]
    levelString = getLevelString(windowSignal, smoothParam, levels, overflow)
    kmerDict = buildDictionary(levelString, kmerLength)

    scores = []
    for table in hashTables[contigName]:
        scores.append(overlap(kmerDict, table))
    bestScore = max(scores)
    bestIndex = -1

    for position, score in enumerate(scores):
        if position == goodTable:
            print("->", end='')
        if score == bestScore:
            print("max->", end='')
            # On ties the last maximal index wins.
            bestIndex = position
        print(score, end=' ')
    print()

    if goodTable == -1:
        return
    if bestIndex == goodTable:
        print("Good match!")
        good += 1
    else:
        bad += 1
    return
コード例 #5
0
ファイル: select.py プロジェクト: Aj0SK/bachelor_thesis
def processRead(path, contig_name=None, refPosition=-1):
    """Decide whether a read's level string maps to the expected position.

    The signal of the read at ``path`` is cut to ``fromRead:toRead``, turned
    into a level string by ``getGlobalString`` and re-encoded letter by letter
    (a->A, b->C, c->G, d->T) before being passed to ``index.map``.

    Args:
        path: Path of the read file; printed and handed to
            ``getSignalFromRead``.
        contig_name: Expected contig, or ``None`` to accept any contig.
        refPosition: Expected relative position (hit start divided by contig
            length).

    Returns:
        ``True`` when the first hit lying within 0.001 of ``refPosition`` is
        on the expected contig (or no contig was given). ``False`` when there
        are no hits, no hit is close enough, or that first close hit is on
        another contig.
    """
    print(path)
    readSignal = np.array(getSignalFromRead(path), dtype=float)
    readSignal = readSignal[fromRead:toRead]
    readLevelString = getGlobalString(readSignal)

    # Re-encode the level letters as nucleotide letters for index.map.
    helperDict = {"a": "A", "b": "C", "c": "G", "d": "T"}
    helperString = "".join(helperDict[i] for i in readLevelString)
    levelHits = list(index.map(helperString))

    if not levelHits:
        return False

    for hit in levelHits:
        hitPosition = hit.r_st / lengths[hit.ctg]

        print(f"Hit position is {hitPosition}")

        if abs(hitPosition - refPosition) <= 0.001:
            if contig_name is not None and hit.ctg != contig_name:
                print("Zle urceny contig!")
                print("Return False.")
                return False
            return True

    print("Return False.")
    return False
コード例 #6
0
def processRead(path, contigName, goodTable=-1):
    """Score one read's window dictionary against each table of ``contigName``.

    Prints all overlap scores on one line, marking index ``goodTable`` with
    ``->`` and every maximal score with ``max->``. When ``goodTable`` is not
    -1, the module-level counter ``good`` or ``bad`` is incremented depending
    on whether the last maximal score sits at ``goodTable``.

    Returns:
        None.
    """
    global good, bad

    fullSignal = np.array(getSignalFromRead(path), dtype=float)
    windowDict = getDictFromSequence(fullSignal[fromRead:toRead],
                                     refWindowSize, refWindowJump)

    scores = []
    for table in hashTables[contigName]:
        scores.append(overlap(windowDict, table))
    topScore = max(scores)
    topIndex = -1

    for idx, value in enumerate(scores):
        if idx == goodTable:
            print("->", end='')
        if value == topScore:
            print("max->", end='')
            # On ties the last maximal index wins.
            topIndex = idx
        print(value, end=' ')
    print()

    if goodTable != -1:
        if topIndex == goodTable:
            print("Good match!")
            good += 1
        else:
            bad += 1
    return
コード例 #7
0
def processRead(path, readFromRef=False):
    """Print and return the overlap of one read's dictionary with ``hashTable``.

    The read signal is cut to ``fromRead:toRead`` and converted by
    ``getDictFromSequence``. ``readFromRef`` is accepted but not used here.
    """
    fullSignal = np.array(getSignalFromRead(path), dtype=float)
    windowSignal = fullSignal[fromRead:toRead]

    windowDict = getDictFromSequence(windowSignal, refWindowSize,
                                     refWindowJump)
    hitCount = overlap(windowDict, hashTable)

    print("Number of hits is {0}".format(hitCount))

    return hitCount
コード例 #8
0
ファイル: debug.py プロジェクト: Aj0SK/bachelor_thesis
def processRead(path, readFromRef=False):
    """Estimate by random sampling how often a read can be placed via ``hashTable``.

    100 times, a random seed k-mer is taken from the read's level string and
    looked up in ``hashTable``. While more than one candidate position
    remains, another random k-mer from the same ``operateRange``-long window
    is looked up, and only candidates with an occurrence of that k-mer closer
    than ``1.3 * operateRange`` are kept. A trial counts as a hit when at
    least one candidate survives.

    Args:
        path: Path of the read file handed to ``getSignalFromRead``.
        readFromRef: Accepted but not used here.

    Returns:
        Number of trials (0..100) that ended with a non-empty candidate list.
        0 when the level string is shorter than one sampling window.
    """
    readSignal = np.array(getSignalFromRead(path), dtype=float)
    readSignal = readSignal[fromRead:toRead]

    readString = getDictFromSequence(readSignal, l=True)

    myHits = 0
    jump = 2 * kmerLength
    operateRange = 10 * kmerLength

    # random.randint includes both end points, so the last admissible window
    # start is len - operateRange (the original "+ 1" let windows overrun the
    # string and produced truncated k-mers that can never match).
    lastBeg = len(readString) - operateRange
    if lastBeg < 0:
        # Read too short for a single window: report zero hits, don't crash.
        print(f"My hits is {myHits}")
        return myHits

    for _ in range(100):
        beg = random.randint(0, lastBeg)
        w = readString[beg:beg + kmerLength]
        candidates = hashTable.get(w, [])

        # NOTE(review): nothing bounds this loop. If two or more candidates
        # keep being confirmed (e.g. a repeated region), it never ends.
        while len(candidates) > 1:
            # Inclusive upper bound chosen so that j + kmerLength stays inside
            # the window [beg, beg + operateRange).
            j = random.randint(beg + jump, beg + operateRange - kmerLength)

            w = readString[j:j + kmerLength]
            # The lookup does not depend on the candidate; do it once.
            occurrences = hashTable.get(w, [])
            candidates = [
                k for k in candidates
                if any(abs(k - pos) < operateRange * 1.30
                       for pos in occurrences)
            ]

        if len(candidates) != 0:
            myHits += 1
            print(f"Candidates len is {len(candidates)}")
            print(str(candidates)[:200])

    print(f"My hits is {myHits}")

    return myHits
コード例 #9
0
def helper(hashTable, reads, infoString):
    """Sum ``hashTable`` values over the level k-mers of selected reads.

    Reads are processed until ``kmerNum`` k-mers were looked up. A read is
    used only if its signal is longer than ``toRead`` samples and its
    alignments (strand 1, covering more than 95% of the read) number exactly
    one for ``infoString == "+"`` or zero for ``infoString == "-"``.

    Args:
        hashTable: Mapping from k-mer to a number; missing k-mers count as 0.
        reads: Iterable of read file paths.
        infoString: ``"+"`` or ``"-"``; selects the filter above and labels
            the printed summary.

    Returns:
        None. Prints ``total / totalNum`` with the ``k`` and ``levels`` used.
    """
    total, totalNum = 0, 0

    for readFile in reads:
        if totalNum == kmerNum:
            break
        try:
            readFastq, _ = getSeqfromRead(readFile)
        except Exception:
            # Best effort: skip reads that cannot be loaded, but let
            # KeyboardInterrupt/SystemExit through.
            continue

        hits = [
            aln for aln in referenceIdx.map(readFastq)
            if aln.q_en - aln.q_st > 0.95 * len(readFastq) and aln.strand == 1
        ]

        if infoString == "+" and len(hits) != 1:
            continue
        if infoString == "-" and len(hits) != 0:
            continue

        readSignal = getSignalFromRead(readFile)

        if len(readSignal) <= toRead:
            continue

        readSignal = readSignal[fromRead:toRead]
        readString = getLevelStr(readSignal, levels)

        # Only full-length k-mers: stopping at len - k avoids counting the
        # truncated windows at the end of the string in totalNum.
        for i in range(len(readString) - k + 1):
            if kmerNum == totalNum:
                break
            kmer = readString[i:i + k]
            totalNum += 1
            total += hashTable.get(kmer, 0)
    print(f"{infoString} k {k} l {levels} -> {total} / {totalNum}")
コード例 #10
0
ファイル: testAlligner.py プロジェクト: Aj0SK/bachelor_thesis
        continue

    # require a single hit with at least 95% coverage of length
    # (additionally restricted to strand == 1 and the contig named "contig1")
    hits = [
        aln for aln in referenceIdx.map(readFastq)
        if aln.q_en - aln.q_st > 0.95 *
        len(readFastq) and aln.strand == 1 and (aln.ctg == "contig1")
    ]
    if len(hits) != 1:
        #print("Too many or too few hits, skipping read.")
        continue

    counter += 1
    print(readFile)

    # Convert the raw signal to a level string and re-encode it via helperDict.
    readSignal = np.array(getSignalFromRead(readFile), float)
    readString = getGlobalString(readSignal)

    readString = "".join(helperDict[i] for i in readString)
    # Only the first 1500 symbols are mapped.
    readString = readString[:1500]
    
    # Map the encoded string against the level index; keep strand == 1 hits.
    levelHits = list(refLevelIdx.map(readString, cs = True))
    levelHits = [i for i in levelHits if i.strand == 1]
    
    if len(levelHits) == 0:
        print("No hits!")
        continue
    
    # The read counts as well placed when the first level hit and the sequence
    # hit start within 0.001 of each other, each start being divided by the
    # length of its own reference (refString resp. contig).
    if abs((levelHits[0].r_st/len(refString))-(hits[0].r_st/len(contig)))<0.001:
        goodPosReads += 1
コード例 #11
0
assert sequenceIndex, "failed to load/build reference index"
################################################################################
# nadavca
# Align the signal of sampleRead against the reference file.
nadavca_align = nadavca.align_signal(refFilePath, [sampleRead],
                                     bwa_executable='./bwa/bwa')

# Exactly one alignment is required; note that the assert also fires when
# there is none, although the message only mentions "more than one".
assert (len(nadavca_align) == 1), "Error! More than one alignment!"
nadavca_align = nadavca_align[0]
# Element 0 of the alignment carries the metadata used below
# (reverse_complement, contig_name, signal_range, reference_part).
assert (nadavca_align[0].reverse_complement == False), "Error! Reverse strand!"
################################################################################

refStr = str(ref[nadavca_align[0].contig_name])
fromSignal, toSignal = nadavca_align[0].signal_range

# load original signal from read
originalSignal = getSignalFromRead(sampleRead)
originalSignal = np.array(originalSignal, dtype=float)

# Only the first 40 table entries / reference characters are used.
table = nadavca_align[1][:40]
refSeq = "".join(nadavca_align[0].reference_part)[:40]

# x collects labels (a reference base followed by one blank per signal
# sample), y collects the signal samples themselves.
# NOTE(review): x gets one extra element per table entry, so len(x) != len(y).
x, y = [], []

for entry in table:  #entry is list of [ref_index, signal_start, signal_end]
    x.append(str(refStr[entry[0]]))
    for i in range(entry[1], entry[2]):
        x.append(" ")
        y.append(originalSignal[i])

#plt.plot(y)
#plt.xticks(y_pos, x, color='orange', rotation=45, fontweight='bold', horizontalalignment='right')
コード例 #12
0
    ]
    # Exactly one hit is required; otherwise the read is skipped.
    if len(hits) != 1:
        # print("Too many or too few hits, skipping read.")
        continue
    print(readFile)
    hit = hits[0]
    successfulReads += 1

    # refSeq follows the strand of the hit, fakeSeq is the opposite one.
    # The unary minus is applied to the sliced Fasta record — presumably
    # pyfaidx's reverse complement; TODO confirm.
    if hit.strand == 1:
        refSeq = str(Fasta(refFile)[hit.ctg][hit.r_st:hit.r_en])
        fakeSeq = str(-Fasta(refFile)[hit.ctg][hit.r_st:hit.r_en])
    else:
        refSeq = str(-Fasta(refFile)[hit.ctg][hit.r_st:hit.r_en])
        fakeSeq = str(Fasta(refFile)[hit.ctg][hit.r_st:hit.r_en])

    readSignal = np.array(getSignalFromRead(readFile)[signalFrom:signalTo],
                          dtype=float)
    # Signal synthesised from the reference sequence with the k-mer model.
    refSignal = np.array(
        stringToSignal(refSeq, mod, repeatSignal=repeatSignal), float)
    # fakeSignal = np.array(stringToSignal(fakeSeq, mod, repeatSignal = repeatSignal),
    #                float)
    # Instead of a synthetic fake signal, draw random negative reads until one
    # has more than signalTo samples, then cut the same window from it.
    fakeSignal = []
    fakeIndex = -1
    while len(fakeSignal) <= signalTo:
        fakeIndex = random.randint(0, len(negReads) - 1)
        fakeSignal = np.array(getSignalFromRead(negReads[fakeIndex]),
                              dtype=float)
    fakeSignal = fakeSignal[signalFrom:signalTo]

    readSignalSm = smoothSignal(readSignal, 5)
    refSignalSm = smoothSignal(refSignal, 5)
コード例 #13
0
    # Skip reads whose aligned signal range is shorter than workingLen.
    if (toSignal - fromSignal) < workingLen:
        continue

    #print(f"Signal alligned from {fromSignal} to {toSignal}")
    print("Working on", posRead)
    print(f"So far done {readCounter} reads")
    readCounter += 1

    # Reference sequence under the alignment; for strand != 1 the unary minus
    # is applied to the slice — presumably pyfaidx's reverse complement;
    # TODO confirm.
    if strand == 1:
        refSeq = str(Fasta(refFilePath)[ctg][fromRef:toRef])
    else:
        refSeq = str(-Fasta(refFilePath)[ctg][fromRef:toRef])

    # Signal synthesised from the reference vs. the measured signal of the read.
    refSignal = np.array(
        stringToSignal(refSeq, mod, repeatSignal=repeatSignal), float)
    readSignal = np.array(getSignalFromRead(posRead), dtype=float)
    readSignal = readSignal[fromSignal:toSignal]

    # Both signals are truncated to at most workingLen samples.
    readSignal = readSignal[:workingLen]
    refSignal = refSignal[:workingLen]

    readSignalSm = smoothSignal(readSignal, smoothParam)
    refSignalSm = smoothSignal(refSignal, smoothParam)

    # Shift/scale pairs computed over the whole of each raw and smoothed signal.
    readShift, readScale = computeNorm(readSignal, 0, len(readSignal))
    readShiftSm, readScaleSm = computeNorm(readSignalSm, 0, len(readSignalSm))
    refShift, refScale = computeNorm(refSignal, 0, len(refSignal))
    refShiftSm, refScaleSm = computeNorm(refSignalSm, 0, len(refSignalSm))

    readStrings, readStringsSm, refStrings, refStringsSm = {}, {}, {}, {}
コード例 #14
0
    # Skip reads whose aligned signal range is shorter than workingLen.
    if (toSignal - fromSignal) < workingLen:
        continue

    # print(f"Signal alligned from {fromSignal} to {toSignal}")
    print("Working on", posRead)
    print(f"So far done {readCounter} reads")
    readCounter += 1

    # Reference sequence under the alignment; for strand != 1 the unary minus
    # is applied to the slice — presumably pyfaidx's reverse complement;
    # TODO confirm.
    if strand == 1:
        refSeq = str(Fasta(refFilePath)[ctg][fromRef:toRef])
    else:
        refSeq = str(-Fasta(refFilePath)[ctg][fromRef:toRef])

    refSignal = np.array(
        stringToSignal(refSeq, mod, repeatSignal=repeatSignal), float)
    readSignal = np.array(getSignalFromRead(posRead), dtype=float)
    readSignal = readSignal[fromSignal:toSignal]
    # Draw random negative reads until one has more than toSignal samples,
    # then cut the same signal range from it as a negative control.
    fakeSignal = []
    fakeIndex = -1
    while len(fakeSignal) <= toSignal:
        fakeIndex = random.randint(0, len(negReadsPaths) - 1)
        fakeSignal = np.array(getSignalFromRead(negReadsPaths[fakeIndex]),
                              dtype=float)
    fakeSignal = fakeSignal[fromSignal:toSignal]

    # All three signals are truncated to at most workingLen samples.
    readSignal = readSignal[:workingLen]
    refSignal = refSignal[:workingLen]
    fakeSignal = fakeSignal[:workingLen]

    readSignal = smoothSignal(readSignal, smoothParam)
    refSignal = smoothSignal(refSignal, smoothParam)
コード例 #15
0
# Collect the read files of the positive and the negative folder.
posReads = getReadsInFolder(readsPosFilePath, minSize=0)
negReads = getReadsInFolder(readsNegFilePath, minSize=0)

################################################################################

# Number of positive reads whose signal could be loaded so far.
totalCount = 0

# Running sum of |mean - median| over those reads.
suma = 0

for readFile in posReads:
    if totalCount == readCount:
        break

    try:
        readSignal = getSignalFromRead(readFile)
    except Exception:
        # Best effort: skip reads whose signal cannot be loaded, but let
        # KeyboardInterrupt/SystemExit through.
        continue

    mean = np.mean(readSignal)
    median = np.median(readSignal)
    stdev = np.std(readSignal)

    totalCount += 1

    suma += abs(mean - median)
コード例 #16
0
ファイル: globalScorer.py プロジェクト: Aj0SK/bachelor_thesis
### get corresponding part of the reference using minimap2
referenceIdx = mp.Aligner(refFile)
assert referenceIdx, "failed to load/build reference index"

# Go through the positive reads until maxTests of them gave a unique hit.
for readFile in posReads:
    if successfulReads == maxTests:
        break
    print(readFile)
    ### read read
    try:
        readFastq, readEvents = getSeqfromRead(readFile)
    except Exception:
        # Best effort: report and skip unreadable reads, but let
        # KeyboardInterrupt/SystemExit through.
        print("Bad read!")
        continue
    readSeq = seqSignalCor(signalFrom, signalTo, readEvents)
    readSignal = np.array(getSignalFromRead(readFile)[signalFrom:signalTo],
                          dtype=float)
    # readSeq - sequence cut out from read
    # readSignal - corresponding signal section

    # require a single hit with at least 95% coverage of length
    hits = [
        aln for aln in referenceIdx.map(readSeq)
        if aln.q_en - aln.q_st > 0.95 * len(readSeq)
    ]
    if len(hits) != 1:
        print("Too many or too few hits, skipping read.")
        continue
    hit = hits[0]
    successfulReads += 1
コード例 #17
0
# Read file whose signal is discretised and plotted below.
read = "../data/pos-basecalled/magnu_20181010_FAH93149_MN26672_sequencing_run_sapIng_19842_read_1706_ch_249_strand.fast5"

# fromSignal, toSignal = 10050, 10110
# fromSignal, toSignal = 10050, 10200
# Sample range of the signal that is inspected, and number of levels.
fromSignal, toSignal = 10150, 10200
levels = 6

import sys
import numpy as np

# signalHelper is loaded from the helpers directory added to sys.path here.
sys.path.append("../helpers/hypothesis")
from signalHelper import getSignalFromRead

import matplotlib.pyplot as plt

signal = getSignalFromRead(read)[fromSignal:toSignal]

# The +5 head-room keeps (max(signal) - mini) / levelSize strictly below
# `levels`, so the largest sample still maps to one of the `levels` letters.
mini, maxi = min(signal), max(signal) + 5
levelSize = (maxi - mini) / levels

# One letter ("a", "b", ...) and one integer position per level.
y_values = [chr(ord("a") + i) for i in range(levels)]
y_axis = np.arange(0, levels, 1)

# Horizontal red lines, one every levelSize starting at mini.
for a in np.arange(mini, maxi + levelSize, levelSize):
    plt.axhline(y=a, color="r", linewidth="2")

# Level letter of every sample.
helper = [chr(ord("a") + int((i - mini) / levelSize)) for i in signal]
signalLevels = [" "] * len(helper)

events = []
begg = 0
コード例 #18
0
ファイル: vinkotester.py プロジェクト: Aj0SK/bachelor_thesis
# Command line: <read file> <reference file> <signal from> <signal to>
readFile = sys.argv[1]
refFile = sys.argv[2]
signalFrom = int(sys.argv[3])
signalTo = int(sys.argv[4])

levels = 6

repeatSignal = 10
kmerModelFilePath = "../../../data/kmer_model.hdf5"
mod = KmerModel.load_from_hdf5(kmerModelFilePath)

### read read
readFastq, readEvents = getSeqfromRead(readFile)
readSeq = seqSignalCor(signalFrom, signalTo, readEvents)
readSignal = np.array(getSignalFromRead(readFile)[signalFrom:signalTo],
                      dtype=float)
# readSeq - sequence cut out from read
# readSignal - corresponding signal section

### get corresponding part of the reference using minimap2
referenceIdx = mp.Aligner(refFile)
assert referenceIdx, "failed to load/build reference index"
# require a single hit with at least 95% coverage of length
hits = [
    aln for aln in referenceIdx.map(readSeq)
    if aln.q_en - aln.q_st > 0.95 * len(readSeq)
]
if len(hits) != 1:
    print("Too many or too few hits, skipping read.")
    # sys.exit instead of the interactive-only exit() helper installed by
    # the site module; the exit status stays 0 as before.
    sys.exit(0)
コード例 #19
0

def normal(signal):
    """Return ``signal`` shifted to zero mean and divided by its standard deviation.

    The input is not modified; a new array is returned.
    """
    centered = signal - np.mean(signal)
    spread = np.std(centered)
    centered /= spread
    return centered


def f(signal, mini, levelSize):
    """Encode ``signal`` as level letters with consecutive repeats collapsed.

    Each sample becomes the letter ``chr(ord("a") + int((sample - mini) /
    levelSize))``; runs of the same letter are then reduced to one letter.
    """
    base = ord("a")
    letters = (chr(base + int((sample - mini) / levelSize))
               for sample in signal)
    return "".join(symbol for symbol, _ in groupby(letters))


# Load the whole signal of `read`.
signal = getSignalFromRead(read)
signal1, signal2 = [], []
found = False

# NOTE(review): this binds levelSize to the tuple (0, 0, 0), not to a number.
levelSize = 0, 0, 0

# Zero mean, unit standard deviation (see normal()).
signal = normal(signal)

# Clamp the normalised signal into [mini, maxi].
signal[signal > maxi] = maxi
signal[signal < mini] = mini

helper = {}

counter = 3

for i in range(0, len(signal) - 2 * workLen + 1, 30):
コード例 #20
0
    # Relative start of the hit inside its contig.
    refPosition = hit.r_st / len(ref[hit.ctg])
    print(f"I am in ctg {hit.ctg} in around {refPosition}")

    if hit.strand == 1:
        refSeq = str(ref[hit.ctg][hit.r_st : hit.r_en])

        # Synthesise and smooth the signal of the reference section, then
        # discretise it with the contig's precomputed shift/scale.
        refSignal = stringToSignal(refSeq, mod, repeatSignal)
        refSignal = smoothSignal(refSignal, smoothParam)
        #refShift, refScale = computeNorm(refSignal, 0, len(refSignal))
        refShift, refScale = globalNorms[hit.ctg][0], globalNorms[hit.ctg][1]
        refString = computeString(
            refSignal, 0, len(refSignal), refShift, refScale, level, overflow=overflow,
        )
        # Drop 5 symbols at each end of the reference string.
        refString = refString[5:-5]

        readSignal = getSignalFromRead(sample)
        readSignalLen = len(readSignal)
        readSignal = readSignal[readSignalBeg:readSignalEnd]
        readString = getLevelString(readSignal, smoothParam, level, overflow)

        # Look for the first offset at which refString occurs verbatim in the
        # stored string of the whole contig.
        found = None
        for i in range(len(storeContig[hit.ctg]) - len(refString) + 1):
            w = storeContig[hit.ctg][i : i + len(refString)]
            if w == refString:
                found = i
                break

        # Not finding it is treated as fatal: the script stops (exit status 0).
        if found == None:
            print("Problem")
            exit(0)