def allTLSeqCounter():
    """Record the sequence counts and the barcode value of every run.

    For each run index from 1 to ``numofruns`` the run's entry in
    ``filenames`` is passed to ``TSeqCounter``, ``LSeqCounter`` and
    ``barcodechecker``.  The three results are then stored under the run
    index in ``TSeqNums``, ``LSeqNums`` and ``Barcodevalues`` respectively.
    """
    for run in range(1, numofruns + 1):
        path = filenames[run]
        # All three measurements are taken before anything is stored, so a
        # failure in any of them leaves the result tables untouched for
        # this run.
        measured = (
            TSeqCounter(path),
            LSeqCounter(path),
            barcodechecker(path),
        )
        tables = (TSeqNums, LSeqNums, Barcodevalues)
        for table, value in zip(tables, measured):
            table.update({run: value})
def listinit(mink, maxk):
    """Create empty per-position k-mer lists for every run.

    Appends one dict per index ``0..numofruns`` to ``poslist`` and
    ``rposlist``.  For every index whose barcode length can be determined,
    each k in ``mink..maxk`` gets a dict mapping every window start offset
    to a fresh empty list.

    NOTE(review): a zero-argument ``listinit`` is defined later in this
    file; if both live in the same scope the later definition replaces this
    one.  Confirm whether this version is still needed.

    Args:
        mink: Smallest k-mer length to prepare (inclusive).
        maxk: Largest k-mer length to prepare (inclusive).
    """
    # Every run is sized with the sequence length recorded for run 1.
    seq_len = lvalues[1]
    for run in range(numofruns + 1):
        poslist.append({})
        rposlist.append({})
        try:
            barcode_len = barcodechecker(filenames[run])
        except Exception:
            # Best effort: an index without a usable file keeps its empty
            # placeholder dicts.  Only ordinary errors are skipped, so
            # KeyboardInterrupt / SystemExit still propagate.
            continue
        for k in range(mink, maxk + 1):
            # Number of k-mer windows left after trimming a barcode of
            # ``barcode_len`` bases from both ends.
            n_windows = seq_len + 1 - k - 2 * barcode_len
            poslist[run][k] = {x: [] for x in range(n_windows)}
            rposlist[run][k] = {x: [] for x in range(n_windows)}
def listinit():
    """Append and initialise the per-run result containers.

    One empty dict per index ``0..numofruns`` is appended to ``poslist``,
    ``rposlist``, ``consensuslist``, ``cooccurencelist``,
    ``cooccurenceindexlist``, ``gaplist`` and ``gapstdev``.  Then, for every
    run ``1..numofruns``, every k in the module-level ``mink..maxk`` and
    every motif ``1..nmotifs``:

    * ``consensuslist``, ``cooccurencelist`` and ``cooccurenceindexlist``
      get an empty dict,
    * ``gaplist`` gets an empty list,
    * ``gapstdev`` gets ``{"mean": 0, "stdev": 0}``,
    * ``poslist`` / ``rposlist`` get, when ``extype == "SELEX"``, a dict
      mapping every window start offset to ``0``; otherwise an empty list.
    """
    containers = (
        poslist,
        rposlist,
        consensuslist,
        cooccurencelist,
        cooccurenceindexlist,
        gaplist,
        gapstdev,
    )
    for _ in range(numofruns + 1):
        for container in containers:
            container.append({})

    is_selex = extype == "SELEX"
    for run in range(1, numofruns + 1):
        if is_selex:
            seq_len = lvalues[run]
            # NOTE(review): the barcode lengths are only read when sizing
            # the SELEX position tables below, so they are computed only
            # for SELEX runs.
            if knownbarcode:
                trim5 = len(barcodeprimers53[run][0])
                trim3 = len(barcodeprimers53[run][1])
            elif nobarcode:
                trim5 = trim3 = 0
            else:
                # Unknown barcode: the same estimated length is used for
                # both ends.
                trim5 = trim3 = barcodechecker(filenames[run])

        for k in range(mink, maxk + 1):
            for container in containers:
                container[run][k] = {}

            for motif in range(1, nmotifs + 1):
                consensuslist[run][k][motif] = {}
                cooccurencelist[run][k][motif] = {}
                cooccurenceindexlist[run][k][motif] = {}
                gaplist[run][k][motif] = []
                gapstdev[run][k][motif] = {"mean": 0, "stdev": 0}

                if is_selex:
                    # One counter per k-mer window that fits between the
                    # trimmed 5' and 3' ends.
                    n_windows = seq_len + 1 - k - (trim5 + trim3)
                    poslist[run][k][motif] = {
                        x: 0 for x in range(n_windows)
                    }
                    rposlist[run][k][motif] = {
                        x: 0 for x in range(n_windows)
                    }
                else:
                    poslist[run][k][motif] = []
                    rposlist[run][k][motif] = []
def seqwtfbsfinder(FileName, k):
    """Count the sequences in a file that contain at least one accepted k-mer.

    Lines starting with ``>`` are skipped.  A remaining line is examined
    only if its length equals ``lvalues`` for this file's run and it
    contains no ``"N"``.  Within such a line every k-mer window between the
    trimmed ends (``barcodechecker(FileName)`` bases are skipped on each
    side) is tested:

    * forward: the k-mer occurs exactly once in the line and its
      ``kmer2hash`` value is in ``hamminglist2[run][k]``;
    * reverse (only when ``revcompwanted`` is set): the k-mer and its
      reverse complement together occur exactly once in the line and the
      hash of the reverse complement is in ``hamminglist2[run][k]``.

    A line is counted once no matter how many windows match.

    Args:
        FileName: Path of the sequence file; also the key into
            ``ufilenames`` that yields the run index.
        k: k-mer length.

    Returns:
        The number of lines with at least one matching window.
    """
    run = ufilenames[FileName]
    expected_len = lvalues[run]
    barcode_len = barcodechecker(FileName)
    hits = 0

    with open(FileName, "r") as handle:
        for raw in handle:
            line = raw.strip()
            if line.startswith(">"):
                continue
            if len(line) != expected_len or "N" in line:
                continue

            found = False
            for x in range(len(line) + 1 - k - 2 * barcode_len):
                kmer = line[x + barcode_len:x + k + barcode_len]
                if not kmer:
                    continue
                forward_count = line.count(kmer)

                if forward_count == 1:
                    if kmer2hash(kmer) in hamminglist2[run][k]:
                        found = True

                if not found and revcompwanted:
                    # The uniqueness test uses the k-mer of the *current*
                    # window (as CreatePosList does), not whatever k-mer a
                    # previous loop happened to leave behind.
                    rkmer = revComp(kmer)
                    if rkmer and forward_count + line.count(rkmer) == 1:
                        if kmer2hash(rkmer) in hamminglist2[run][k]:
                            found = True

                if found:
                    # One hit is enough to count this sequence.
                    break

            if found:
                hits += 1

    return hits
def CreatePosList(FileName, k, runnum):
    """Record, per window offset, the accepted k-mers found in a file.

    Lines starting with ``>`` are skipped.  A remaining line is examined
    only if its length equals ``lvalues[runnum]`` and it contains no
    ``"N"``.  ``barcodechecker(FileName)`` bases are skipped at each end;
    for every k-mer window ``x`` in between:

    * if the k-mer occurs exactly once in the line and its ``kmer2hash``
      value is in ``hamminglist2[runnum][k]``, the hash is appended to
      ``poslist[runnum][k][x]``;
    * if ``revcompwanted`` is set, and the k-mer and its reverse complement
      together occur exactly once in the line, and the hash of the reverse
      complement is in ``hamminglist2[runnum][k]``, that hash is appended
      to ``rposlist[runnum][k][x]``.

    Args:
        FileName: Path of the sequence file to scan.
        k: k-mer length.
        runnum: Run index used for ``lvalues``, ``hamminglist2``,
            ``poslist`` and ``rposlist``.
    """
    barcode_len = barcodechecker(FileName)

    # ``with`` guarantees the handle is closed even if a lookup below fails.
    with open(FileName, "r") as handle:
        for raw in handle:
            line = raw.strip()
            if line.startswith(">"):
                continue
            if len(line) != lvalues[runnum] or "N" in line:
                continue

            for x in range(len(line) + 1 - k - 2 * barcode_len):
                kmer = line[x + barcode_len:x + k + barcode_len]
                if not kmer:
                    continue
                forward_count = line.count(kmer)

                if forward_count == 1:
                    hashed = kmer2hash(kmer)
                    if hashed in hamminglist2[runnum][k]:
                        poslist[runnum][k][x].append(hashed)

                if revcompwanted:
                    rkmer = revComp(kmer)
                    if rkmer and forward_count + line.count(rkmer) == 1:
                        rhashed = kmer2hash(rkmer)
                        if rhashed in hamminglist2[runnum][k]:
                            rposlist[runnum][k][x].append(rhashed)
def _closest_site_gap(forward_starts, reverse_starts, k, limit=1000):
    """Return the smallest forward/reverse start distance that exceeds ``k``.

    Walks the two offset lists with two cursors, always advancing the one
    pointing at the smaller offset, and keeps the smallest absolute
    difference that is greater than ``k`` and below ``limit``.  Returns
    ``limit`` when no such pair is met.  The lists are filled while scanning
    a sequence left to right, so they are in ascending order.
    """
    best = limit
    a = b = 0
    while a < len(forward_starts) and b < len(reverse_starts):
        distance = abs(forward_starts[a] - reverse_starts[b])
        if k < distance < best:
            best = distance
        if forward_starts[a] < reverse_starts[b]:
            a += 1
        else:
            b += 1
    return best


def CreatePosListNORMAL(FileName, k, runnum):
    """Collect relative motif positions and co-occurrence counts for a file.

    The file format is taken from the first line (``>`` means FASTA, ``@``
    means FASTQ) and the records are read with ``SeqIO.parse``.  The file is
    parsed once per motif ``1..nmotifs``, and ``LSeqNums[runnum]`` and
    ``TSeqNums[runnum]`` are incremented for every record on every pass.

    For each record, every k-mer window between the trimmed 5' and 3' ends
    is hashed with ``kmer2hash``:

    * a hash found in ``hamminglist2[runnum][k][n]`` appends the window's
      relative position (offset / number of windows) to
      ``poslist[runnum][k][n]``;
    * a hash found in ``rhamminglist2[runnum][k][n]`` appends it to
      ``rposlist[runnum][k][n]``;
    * the first such hit in a record increments ``numoftfbsseq[runnum][k]``.

    After the record has been scanned, ``cooccurencelist[runnum][k][n]`` is
    updated: ``"fr"`` when both the consensus hash and its reverse
    complement hash were recorded as hits and their closest start distance
    ``d`` satisfies ``k < d <= k + 10`` (``d - k`` is then appended to
    ``gaplist[runnum][k][n]``), ``"f"`` when only the consensus was hit, and
    ``"r"`` when only the reverse complement was hit.

    Args:
        FileName: Path of the FASTA/FASTQ file.
        k: k-mer length.
        runnum: Run index used for all per-run tables.

    Raises:
        ValueError: If records have to be parsed but the first line starts
            with neither ``>`` nor ``@``.
    """
    # Only the first line is needed to recognise the format; close the
    # handle right away instead of leaking it.
    with open(FileName, "r") as handle:
        firstline = handle.readline().strip()

    if knownbarcode:
        trim5 = len(barcodeprimers53[runnum][0])
        trim3 = len(barcodeprimers53[runnum][1])
    elif nobarcode:
        trim5 = trim3 = 0
    else:
        # Unknown barcode: the same estimated length is used for both ends.
        trim5 = trim3 = barcodechecker(FileName)
    trimmed = trim5 + trim3

    filetype = None
    if firstline.startswith(">"):
        filetype = "fasta"
    if firstline.startswith("@"):
        filetype = "fastq"

    for n in range(1, nmotifs + 1):
        consensus = consensuslist[runnum][k][n]
        seq1 = kmer2hash(consensus)
        seq2 = kmer2hash(revComp(consensus))

        if filetype is None:
            raise ValueError(
                "cannot determine file type of %r: first line starts with "
                "neither '>' nor '@'" % (FileName,)
            )

        for record in SeqIO.parse(FileName, filetype):
            line = str(record.seq)
            n_windows = len(line) + 1 - k - trimmed
            LSeqNums[runnum] += 1
            TSeqNums[runnum] += 1

            counted = False      # record already added to numoftfbsseq?
            done = set()         # hashes that hit either hamming list
            forward_starts = []  # offsets whose hash equals seq1
            reverse_starts = []  # offsets whose hash equals seq2

            for x in range(n_windows):
                try:
                    kmer = kmer2hash(line[x + trim5:x + k + trim5])
                    if kmer in hamminglist2[runnum][k][n]:
                        poslist[runnum][k][n].append(x / n_windows)
                        done.add(kmer)
                        if not counted:
                            numoftfbsseq[runnum][k] += 1
                            counted = True
                    if kmer in rhamminglist2[runnum][k][n]:
                        rposlist[runnum][k][n].append(x / n_windows)
                        done.add(kmer)
                        if not counted:
                            numoftfbsseq[runnum][k] += 1
                            counted = True
                    if kmer == seq1:
                        forward_starts.append(x)
                    if kmer == seq2:
                        reverse_starts.append(x)
                except Exception:
                    # Best effort: a window that cannot be processed is
                    # skipped.  KeyboardInterrupt / SystemExit still
                    # propagate.
                    continue

            # Summarise once per record, after every window has been seen,
            # so a failure on the last window cannot skip the summary.
            # Written as two membership tests on purpose:
            # ``seq1 & seq2 in done`` parses as ``(seq1 & seq2) in done``.
            if seq1 in done and seq2 in done:
                gap = _closest_site_gap(forward_starts, reverse_starts, k)
                if k < gap <= k + 10:
                    cooccurencelist[runnum][k][n]["fr"] += 1
                    gaplist[runnum][k][n].append(gap - k)
            elif seq1 in done:
                cooccurencelist[runnum][k][n]["f"] += 1
            elif seq2 in done:
                cooccurencelist[runnum][k][n]["r"] += 1