def prepareNextBatch(self):
    """Read the next batch from ``self.curFH`` into ``nextBatchX``/``nextBatchY``.

    Each data line is tab-separated. Field 0 is parsed as a float and stored
    in ``self.nextBatchY[b]``. Every following field ``t`` is a
    ";"-separated list of floats; the natural log of each value is stored in
    ``self.nextBatchX[b, 0:k, t - 1]`` where ``k`` is the smaller of
    ``self.numKds`` and the number of values present. Slots that are not
    overwritten keep the fill value ``np.log(99999.0)``.

    Lines beginning with "#" are skipped, as are empty or whitespace-only
    lines (previously these raised IndexError/ValueError).

    At end of file:
      * if ``self.numRuns == 1`` the batch arrays are truncated to the rows
        filled so far, ``self.numRuns`` is decremented and the method returns;
      * otherwise the file is closed, reopened from ``self.inFP`` with
        ``MYUTILS.smartGZOpen``, ``self.numRuns`` is decremented and reading
        continues from the top of the file.

    Returns:
        None. Results are left in ``self.nextBatchX`` and ``self.nextBatchY``.

    Raises:
        ValueError: if a field cannot be parsed as a float.
        IndexError: if a line has more value fields than ``self.numTFs``.
    """
    # Every slot starts at the fill value log(99999.0).
    self.nextBatchX = np.zeros(
        (self.batchSize, self.numKds, self.numTFs)) + np.log(99999.0)
    self.nextBatchY = np.zeros((self.batchSize))
    b = 0
    while b < self.batchSize:
        line = self.curFH.readline()
        if line == "":
            # readline() returns "" only at end of file.
            if self.numRuns == 1:
                # Last pass over the file: hand back a (possibly short) batch.
                self.nextBatchX = self.nextBatchX[0:b, :, :]
                self.nextBatchY = self.nextBatchY[0:b]
                self.numRuns -= 1
                return
            # More passes requested: rewind by reopening the input.
            self.curFH.close()
            self.curFH = MYUTILS.smartGZOpen(self.inFP, 'r')
            self.numRuns -= 1
            line = self.curFH.readline()
        # The strip() test also covers "" (a reopened file with no lines), so
        # the line[0] lookup below can never raise IndexError.
        if not line.strip() or line[0] == "#":
            continue
        curData = line.rstrip("\r\n").split("\t")
        self.nextBatchY[b] = float(curData[0])
        for tfIdx, field in enumerate(curData[1:]):
            curKds = [np.log(float(x)) for x in field.split(";")]
            # Keep at most numKds values per column; surplus values are dropped.
            numKept = min(self.numKds, len(curKds))
            self.nextBatchX[b, 0:numKept, tfIdx] = curKds[0:numKept]
        b += 1
def makeBowtieDB():
    """Write the promoter sequences to a FASTA file and index it with bowtie2-build.

    Reads the module-level ``args`` (``tempFilePre``, ``bowtieBuildParams``,
    ``verbose``) and ``promoterSeqs``. The FASTA file is written to
    ``<tempFilePre>.seqs.fasta`` and the index prefix passed to bowtie2-build
    is ``<tempFilePre>.bowtie2``.

    Raises:
        subprocess.CalledProcessError: if bowtie2-build exits with a non-zero
            status (previously the failure was silently ignored).
        OSError: if the bowtie2-build executable cannot be started.
    """
    global args  ## Uses global keyword to access variables defined outside the function.
    global promoterSeqs
    fastaFP = args.tempFilePre + ".seqs.fasta"
    fastaOut = MYUTILS.smartGZOpen(fastaFP, 'w')  ## Define the output FASTA file.
    try:
        for i in range(0, len(promoterSeqs)):
            fastaOut.write(">%i\n%s\n" % (i, promoterSeqs[i]))
            ## Prints the promoter seqs in FASTA format (record names are the
            ## 0-based positions in promoterSeqs):
            ## >0
            ## ATCGATCGATCGTCAGTAGCTCGTACGTAGCGACTGCTCGTAGC...
            ## >1
            ## ATCGATCGCTACGATGCTAGATGCTCGATCGTCGTACGTACGTA...
    finally:
        fastaOut.close()  ## Close the file, even if a write failed.
    #subprocess.check_call(["bowtie2-build","%s.seqs.fasta"%args.tempFilePre,"%s.bowtie2"%args.tempFilePre], stdout=subprocess.PIPE, stdin=subprocess.PIPE);
    cmd = (["bowtie2-build"] + args.bowtieBuildParams.split() +
           [fastaFP, "%s.bowtie2" % args.tempFilePre])
    ## universal_newlines=True makes communicate() return text rather than
    ## bytes, so the output can be written to sys.stderr on Python 3 as well.
    p = subprocess.Popen(cmd,
                         stdout=subprocess.PIPE,
                         stdin=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         universal_newlines=True)
    if args.verbose > 0:
        sys.stderr.write("Running bowtie2-build...")
    (curStdout, stderrData) = p.communicate()
    ## Popen.communicate interacts with the process:
    ## sends data to stdin, reads data from stdout and stderr
    ## until end-of-file is reached, then waits for the process to exit.
    if args.verbose > 0:
        sys.stderr.write(curStdout)
        sys.stderr.write(stderrData)
    if p.returncode != 0:
        ## Show why the build failed even when not running verbosely.
        if args.verbose <= 0:
            sys.stderr.write(stderrData)
        raise subprocess.CalledProcessError(p.returncode, cmd, curStdout)
    if args.verbose > 0:
        sys.stderr.write("done!\n")
def prepareNextBatch(self):
    """Read the next batch of k-mer index rows into ``nextBatchX``/``nextBatchY``.

    Each data line is tab-separated: field 0 is parsed as a float into
    ``self.nextBatchY[b]`` and field 1 is a sequence string. The sequence is
    left-padded with "N" up to ``self.seqLen`` or trimmed to its last
    ``self.seqLen`` characters, then every window of ``self.wordLen``
    characters is looked up in ``self.kmer2index`` and stored in
    ``self.nextBatchX[b, si]`` (dtype int32).

    Lines beginning with "#" are skipped, as are empty or whitespace-only
    lines.

    At end of file:
      * if ``self.numRuns == 1`` the batch arrays are truncated to the rows
        filled so far, ``self.numRuns`` is decremented and the method returns;
      * otherwise the file is closed, reopened from ``self.inFP`` with
        ``MYUTILS.smartGZOpen``, ``self.numRuns`` is decremented and reading
        continues from the top of the file.

    Returns:
        None. Results are left in ``self.nextBatchX`` and ``self.nextBatchY``.

    Raises:
        KeyError: if a window is not a key of ``self.kmer2index``.
            NOTE(review): padded sequences produce windows containing "N";
            confirm the table built by the caller includes them.
    """
    numWords = self.seqLen - self.wordLen + 1
    self.nextBatchX = np.zeros((self.batchSize, numWords)).astype("int32")
    self.nextBatchY = np.zeros((self.batchSize))
    b = 0
    while b < self.batchSize:
        line = self.curFH.readline()
        if line == "":
            # readline() returns "" only at end of file.
            if self.numRuns == 1:
                # nextBatchX is 2-D here; the previous three-index slice
                # ([0:b, :, :]) raised IndexError on the final short batch.
                self.nextBatchX = self.nextBatchX[0:b, :]
                self.nextBatchY = self.nextBatchY[0:b]
                self.numRuns -= 1
                return
            # More passes requested: rewind by reopening the input.
            self.curFH.close()
            self.curFH = MYUTILS.smartGZOpen(self.inFP, 'r')
            self.numRuns -= 1
            line = self.curFH.readline()
        # The strip() test also covers "" (a reopened file with no lines), so
        # the line[0] lookup below can never raise IndexError.
        if not line.strip() or line[0] == "#":
            continue
        curData = line.rstrip().split("\t")
        self.nextBatchY[b] = float(curData[0])
        curSeq = curData[1]
        if len(curSeq) < self.seqLen:
            # prepend Ns if the sequence is too short
            curSeq = "N" * (self.seqLen - len(curSeq)) + curSeq
        # keep only the last seqLen characters if the sequence is too long
        curSeq = curSeq[len(curSeq) - self.seqLen:]
        for si in range(0, numWords):
            # fill X with the indices of the successive k-mers
            self.nextBatchX[b, si] = self.kmer2index[curSeq[si:si + self.wordLen]]
        b += 1
def __init__(self, inFP, batchSize, numRuns, seqLen):
    """Store the loader settings, open the input and start loading a batch.

    Args:
        inFP: path handed to ``MYUTILS.smartGZOpen`` for reading.
        batchSize: stored as ``self.batchSize``.
        numRuns: stored as ``self.numRuns``.
        seqLen: stored as ``self.seqLen``.

    A ``Thread`` targeting ``self.prepareNextBatch`` is created, kept in
    ``self.curThread`` and started before the constructor returns.
    """
    self.inFP, self.batchSize = inFP, batchSize
    self.numRuns, self.seqLen = numRuns, seqLen
    self.curFH = MYUTILS.smartGZOpen(self.inFP, 'r')
    # Keep a handle on the worker before starting it.
    loader = Thread(target=self.prepareNextBatch)
    self.curThread = loader
    loader.start()
def saveMatrix(outFileName, rowLabs, colLabs, dataMatrix):
    """Write a labelled matrix to ``outFileName`` as tab-delimited text.

    The first line holds the column labels joined by tabs (there is no
    placeholder for the row-label column). Each following line holds one row
    label followed by that row's values, each formatted with ``%g``.

    Args:
        outFileName: path handed to ``MYUTILS.smartGZOpen`` for writing.
        rowLabs: sequence of row-label strings; one output row per label.
        colLabs: sequence of column-label strings for the header line.
        dataMatrix: object exposing ``shape[1]`` and ``[i, j]`` indexing.

    The file is closed even if a write or a value lookup raises.
    """
    outFile = MYUTILS.smartGZOpen(outFileName, "w")
    try:
        outFile.write("\t".join(colLabs) + "\n")
        for i, rowLabel in enumerate(rowLabs):
            # Build the whole row first so there is one write per row.
            cells = ["\t%g" % dataMatrix[i, j]
                     for j in range(0, dataMatrix.shape[1])]
            outFile.write(rowLabel + "".join(cells) + "\n")
    finally:
        outFile.close()
def __init__(self, inFP, batchSize, numRuns, numTFs, numKds):
    """Store the loader settings, open the input and start loading a batch.

    Args:
        inFP: path handed to ``MYUTILS.smartGZOpen`` for reading.
        batchSize: stored as ``self.batchSize``.
        numRuns: stored as ``self.numRuns``.
        numTFs: stored as ``self.numTFs``.
        numKds: stored as ``self.numKds``.

    A ``Thread`` targeting ``self.prepareNextBatch`` is created, kept in
    ``self.curThread`` and started before the constructor returns.
    """
    self.inFP, self.batchSize, self.numRuns = inFP, batchSize, numRuns
    self.numTFs, self.numKds = numTFs, numKds
    self.curFH = MYUTILS.smartGZOpen(self.inFP, 'r')
    # Keep a handle on the worker before starting it.
    loader = Thread(target=self.prepareNextBatch)
    self.curThread = loader
    loader.start()
def __init__(self, inFP, batchSize, numRuns, seqLen, kmer2index, wordLen):
    """Store the loader settings, open the input and start loading a batch.

    Args:
        inFP: path handed to ``MYUTILS.smartGZOpen`` for reading.
        batchSize: stored as ``self.batchSize``.
        numRuns: stored as ``self.numRuns``.
        seqLen: stored as ``self.seqLen``.
        kmer2index: stored as ``self.kmer2index``.
        wordLen: stored as ``self.wordLen``.

    A ``Thread`` targeting ``self.prepareNextBatch`` is created, kept in
    ``self.curThread`` and started before the constructor returns.
    """
    self.inFP, self.batchSize, self.numRuns = inFP, batchSize, numRuns
    self.seqLen, self.wordLen = seqLen, wordLen
    self.kmer2index = kmer2index
    self.curFH = MYUTILS.smartGZOpen(self.inFP, 'r')
    # Keep a handle on the worker before starting it.
    loader = Thread(target=self.prepareNextBatch)
    self.curThread = loader
    loader.start()
def prepareNextBatch(self):
    """Read the next batch of 4 x seqLen rows into ``nextBatchX``/``nextBatchY``.

    Each data line is a tab-separated list of numbers. The first number is
    stored in ``self.nextBatchY[b]``; the remaining ``4 * self.seqLen``
    numbers are reshaped to ``(4, self.seqLen)`` and stored in
    ``self.nextBatchX[b, :, :, 0]``.
    NOTE(review): presumably a one-hot sequence encoding; confirm against the
    program that writes the input file.

    Lines beginning with "#" are skipped, as are empty or whitespace-only
    lines (previously these raised IndexError).

    At end of file:
      * if ``self.numRuns == 1`` the batch arrays are truncated to the rows
        filled so far, ``self.numRuns`` is decremented and the method returns;
      * otherwise the file is closed, reopened from ``self.inFP`` with
        ``MYUTILS.smartGZOpen``, ``self.numRuns`` is decremented and reading
        continues from the top of the file.

    Returns:
        None. Results are left in ``self.nextBatchX`` and ``self.nextBatchY``.

    Raises:
        ValueError: if a line does not hold exactly ``1 + 4 * self.seqLen``
            numbers (raised by ``reshape``).
    """
    self.nextBatchX = np.zeros((self.batchSize, 4, self.seqLen, 1))
    self.nextBatchY = np.zeros((self.batchSize))
    b = 0
    while b < self.batchSize:
        line = self.curFH.readline()
        if line == "":
            # readline() returns "" only at end of file.
            if self.numRuns == 1:
                # Last pass over the file: hand back a (possibly short) batch.
                self.nextBatchX = self.nextBatchX[0:b, :, :, :]
                self.nextBatchY = self.nextBatchY[0:b]
                self.numRuns -= 1
                return
            # More passes requested: rewind by reopening the input.
            self.curFH.close()
            self.curFH = MYUTILS.smartGZOpen(self.inFP, 'r')
            self.numRuns -= 1
            line = self.curFH.readline()
        # The strip() test also covers "" (a reopened file with no lines), so
        # the line[0] lookup below can never raise IndexError.
        if not line.strip() or line[0] == "#":
            continue
        # Drop the line terminator so the parser only sees numbers and tabs.
        curData = np.fromstring(line.rstrip(), dtype=float, sep="\t")
        self.nextBatchY[b] = curData[0]
        self.nextBatchX[b, :, :, 0] = curData[1:].reshape((4, self.seqLen))
        b += 1
def prepareNextBatch(self):
    """Read the next batch of numKds x numTFs rows into ``nextBatchX``/``nextBatchY``.

    Each data line is a tab-separated list of numbers. The first number is
    stored in ``self.nextBatchY[b]``; the remaining
    ``self.numTFs * self.numKds`` numbers are reshaped to
    ``(numTFs, numKds)``, transposed to ``(numKds, numTFs)`` and stored in
    ``self.nextBatchX[b, :, :]``. Values are stored exactly as read (no log
    is taken here); rows never filled keep the fill value ``np.log(99999.0)``.

    Lines beginning with "#" are skipped, as are empty or whitespace-only
    lines (previously these raised IndexError).

    At end of file:
      * if ``self.numRuns == 1`` the batch arrays are truncated to the rows
        filled so far, ``self.numRuns`` is decremented and the method returns;
      * otherwise the file is closed, reopened from ``self.inFP`` with
        ``MYUTILS.smartGZOpen``, ``self.numRuns`` is decremented and reading
        continues from the top of the file.

    Returns:
        None. Results are left in ``self.nextBatchX`` and ``self.nextBatchY``.

    Raises:
        ValueError: if a line does not hold exactly
            ``1 + self.numTFs * self.numKds`` numbers (raised by ``reshape``).
    """
    self.nextBatchX = np.zeros(
        (self.batchSize, self.numKds, self.numTFs)) + np.log(99999.0)
    self.nextBatchY = np.zeros((self.batchSize))
    b = 0
    while b < self.batchSize:
        line = self.curFH.readline()
        if line == "":
            # readline() returns "" only at end of file.
            if self.numRuns == 1:
                # Last pass over the file: hand back a (possibly short) batch.
                self.nextBatchX = self.nextBatchX[0:b, :, :]
                self.nextBatchY = self.nextBatchY[0:b]
                self.numRuns -= 1
                return
            # More passes requested: rewind by reopening the input.
            self.curFH.close()
            self.curFH = MYUTILS.smartGZOpen(self.inFP, 'r')
            self.numRuns -= 1
            line = self.curFH.readline()
        # The strip() test also covers "" (a reopened file with no lines), so
        # the line[0] lookup below can never raise IndexError.
        if not line.strip() or line[0] == "#":
            continue
        # Drop the line terminator so the parser only sees numbers and tabs.
        curData = np.fromstring(line.rstrip(), dtype=float, sep="\t")
        self.nextBatchY[b] = curData[0]
        # The file is laid out TF-major; the batch tensor is Kd-major.
        perTF = curData[1:].reshape((self.numTFs, self.numKds))
        self.nextBatchX[b, :, :] = np.transpose(perTF)
        b += 1
def makeBowtieDB():
    """Write the promoter sequences to a FASTA file and index it with bowtie2-build.

    Reads the module-level ``args`` (``tempFilePre``, ``bowtieBuildParams``,
    ``verbose``) and ``promoterSeqs``. Each sequence is written as a FASTA
    record named by its 0-based position in ``promoterSeqs``. The FASTA file
    is ``<tempFilePre>.seqs.fasta`` and the index prefix passed to
    bowtie2-build is ``<tempFilePre>.bowtie2``.

    Raises:
        subprocess.CalledProcessError: if bowtie2-build exits with a non-zero
            status (previously the failure was silently ignored).
        OSError: if the bowtie2-build executable cannot be started.
    """
    global args
    global promoterSeqs
    fastaFP = args.tempFilePre + ".seqs.fasta"
    fastaOut = MYUTILS.smartGZOpen(fastaFP, 'w')
    try:
        for i in range(0, len(promoterSeqs)):
            fastaOut.write(">%i\n%s\n" % (i, promoterSeqs[i]))
    finally:
        # Close the handle even if a write failed.
        fastaOut.close()
    #subprocess.check_call(["bowtie2-build","%s.seqs.fasta"%args.tempFilePre,"%s.bowtie2"%args.tempFilePre], stdout=subprocess.PIPE, stdin=subprocess.PIPE);
    cmd = (["bowtie2-build"] + args.bowtieBuildParams.split() +
           [fastaFP, "%s.bowtie2" % args.tempFilePre])
    # universal_newlines=True makes communicate() return text rather than
    # bytes, so the output can be written to sys.stderr on Python 3 as well.
    p = subprocess.Popen(cmd,
                         stdout=subprocess.PIPE,
                         stdin=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         universal_newlines=True)
    if args.verbose > 0:
        sys.stderr.write("Running bowtie2-build...")
    # communicate() drains both pipes and waits for the process to exit.
    (curStdout, stderrData) = p.communicate()
    if args.verbose > 0:
        sys.stderr.write(curStdout)
        sys.stderr.write(stderrData)
    if p.returncode != 0:
        # Show why the build failed even when not running verbosely.
        if args.verbose <= 0:
            sys.stderr.write(stderrData)
        raise subprocess.CalledProcessError(p.returncode, cmd, curStdout)
    if args.verbose > 0:
        sys.stderr.write("done!\n")
required=False)  # tail of an add_argument call that begins above this chunk
# -l: optional path for errors/warnings; -v: repeatable flag, counted.
parser.add_argument('-l',
                    dest='logFP',
                    metavar='<logFile>',
                    help='Where to output errors/warnings [default=stderr]',
                    required=False)
parser.add_argument('-v',
                    dest='verbose',
                    action='count',
                    help='Verbose output?',
                    required=False,
                    default=0)

args = parser.parse_args()

# Open both inputs for reading through the project helper.
inFileDict = MYUTILS.smartGZOpen(args.inFPDict, 'r')
inFileSeqs = MYUTILS.smartGZOpen(args.inFPSeqs, 'r')
# When a log path was given, everything written to sys.stderr from here on
# goes to that file instead of the terminal.
if (args.logFP is not None):
    logFile = MYUTILS.smartGZOpen(args.logFP, 'w')
    sys.stderr = logFile

# Results go to stdout unless an output path was given.
if (args.outFP is None):
    outFile = sys.stdout
else:
    if args.verbose > 0:
        sys.stderr.write("Outputting to file " + args.outFP + "\n")
    outFile = MYUTILS.smartGZOpen(args.outFP, 'w')

# Starts empty; it is filled by code outside this chunk.
translationDict = {}
#raise Exception("Reached bad state=%d for '%s.%d' '%s' at line '%s'" %(state,mid,ver,tfid,line));
required=False)  # tail of an add_argument call that begins above this chunk
# -l: optional path for errors/warnings; -v: repeatable flag, counted.
parser.add_argument('-l',
                    dest='logFP',
                    metavar='<logFile>',
                    help='Where to output errors/warnings [default=stderr]',
                    required=False)
parser.add_argument('-v',
                    dest='verbose',
                    action='count',
                    help='Verbose output?',
                    required=False,
                    default=0)

args = parser.parse_args()

# Open both inputs for reading through the project helper.
inFile1 = MYUTILS.smartGZOpen(args.inFP1, 'r')
inFile2 = MYUTILS.smartGZOpen(args.inFP2, 'r')

# When a log path was given, everything written to sys.stderr from here on
# goes to that file instead of the terminal.
if (args.logFP is not None):
    logFile = MYUTILS.smartGZOpen(args.logFP, 'w')
    sys.stderr = logFile

# Results go to stdout unless an output path was given.
if (args.outFP is None):
    outFile = sys.stdout
else:
    if args.verbose > 0:
        # Reported through the warnings module rather than written directly.
        warnings.warn("Outputting to file " + args.outFP)
    outFile = MYUTILS.smartGZOpen(args.outFP, 'w')


# Only the first statement of this function is visible in this chunk: it reads
# one line from inFile and strips trailing whitespace into `name`.
def getNextRead(inFile):
    name = inFile.readline().rstrip()
## Can execute 'python seqToOHC.py --help' from command line to see options. ## Supporting information at https://docs.python.org/3/library/argparse.html parser = argparse.ArgumentParser(description='Converts a set of sequences into a one-hot-code (binary) representation - excludes non [ATGC] chars. Output in ACGT order, one line per sequence, base then position.'); parser.add_argument('-i',dest='inFP', metavar='<inFile>', help='Input file of sequences with a value in the second column that will preceed the OHC output on each line, separated by a tab', required=True); parser.add_argument('-m',dest='maxLen', metavar='<maxSeqLen>',help='The maximum sequence length to consider (truncated after this point)', required=True); parser.add_argument('-b',dest='orientBack', action='count',help='Align sequences of different sizes to back [default=front]?', required=False, default=0); parser.add_argument('-o',dest='outFP', metavar='<outFile>',help='Where to output results [default=stdout]', required=False); parser.add_argument('-l',dest='logFP', metavar='<logFile>',help='Where to output errors/warnings [default=stderr]', required=False); parser.add_argument('-v',dest='verbose', action='count',help='Verbose output?', required=False, default=0); ## initialize parser args = parser.parse_args(); ## Uses the smartGZOpen function in MYUTILS to read/parse through a file ## (indicated by 'r' argument). inFile=MYUTILS.smartGZOpen(args.inFP,'r'); ## Initialize max length integer maxSeqLen = int(args.maxLen); ## Creates log file of errors/warnings ## logFile = flexible framework to emit log messages ## (logging = tracking events when software runs) if (args.logFP is not None): logFile=MYUTILS.smartGZOpen(args.logFP,'w'); sys.stderr=logFile; ## Creates output directions (inculding warnings) if (args.outFP is None): ## system specific function - standard output, if output outFile= sys.stdout; else:
dest='skipAlignment',  # continuation of an add_argument call that begins above this chunk
action='count',
help=
'Skip the alignment step (e.g. was already done)? - also skip DB creation',
required=False,
default=0)
args = parser.parse_args()

## 'verbose' is a module-level alias for args.verbose.
verbose = args.verbose

## If a log file was requested, open it for writing and point sys.stderr at
## it, so errors/warnings written to stderr from here on go to that file.
if (args.logFP is not None):
    logFile = MYUTILS.smartGZOpen(args.logFP, 'w')
    sys.stderr = logFile

#test if bowtie2 exists
## subprocess.call runs 'which bowtie2' and returns its exit status, so p is
## that status: 0 when the command succeeded, non-zero otherwise, in which
## case an Exception is raised. stdout is not redirected here, so anything
## 'which' prints goes to this process's stdout.
p = subprocess.call(["which", "bowtie2"],
                    stdin=subprocess.PIPE,
                    stderr=subprocess.PIPE)
if p != 0:
    raise Exception("could not find bowtie2. Did you use Bowtie2 ?")

## Uses the smartGZOpen function in MYUTILS to open a file for writing
## (indicated by the 'w' argument). The file name is args.outFPre followed by
## the suffix "_map.txt.gz".
outFileMap = MYUTILS.smartGZOpen(args.outFPre + "_map.txt.gz", 'w')