Beispiel #1
0
 def prepareNextBatch(self):
     """Load the next batch of examples into ``nextBatchX`` / ``nextBatchY``.

     ``nextBatchX`` starts as a ``(batchSize, numKds, numTFs)`` array filled
     with ``log(99999.0)`` and ``nextBatchY`` as ``batchSize`` zeros.  Each
     data line read from ``self.curFH`` is tab-separated: the first field is
     stored in ``nextBatchY``; field ``t`` (``t >= 1``) is a ``;``-separated
     list of numbers whose natural logs fill the first ``numKds`` rows of
     column ``t - 1``.  Lines starting with ``#`` and blank lines are skipped.

     At end of file ``numRuns`` is decremented.  If it was 1, or if the
     reopened file yields no data at all, both arrays are truncated to the
     rows filled so far and the method returns; otherwise reading continues
     from the start of the reopened file.
     """
     self.nextBatchX = np.zeros(
         (self.batchSize, self.numKds, self.numTFs)) + np.log(99999.0)
     self.nextBatchY = np.zeros((self.batchSize))
     b = 0
     while b < self.batchSize:
         line = self.curFH.readline()
         if line == "":
             # readline() signals EOF with "" (never None).
             lastPass = self.numRuns == 1
             if not lastPass:
                 self.curFH.close()
                 self.curFH = MYUTILS.smartGZOpen(self.inFP, 'r')
                 line = self.curFH.readline()
             self.numRuns -= 1
             if lastPass or line == "":
                 # Final pass finished, or the input is completely empty:
                 # hand back only the rows that were actually filled.
                 self.nextBatchX = self.nextBatchX[0:b, :, :]
                 self.nextBatchY = self.nextBatchY[0:b]
                 return
         if not line.strip() or line[0] == "#":
             continue
         curData = line.split("\t")
         self.nextBatchY[b] = float(curData[0])
         for t in range(1, len(curData)):
             curKds = [np.log(float(x)) for x in curData[t].split(";")]
             # Surplus values beyond numKds are dropped; missing ones keep
             # the log(99999.0) fill value.
             numUsed = min(self.numKds, len(curKds))
             self.nextBatchX[b, 0:numUsed, t - 1] = curKds[0:numUsed]
         b += 1
Beispiel #2
0
def makeBowtieDB():
    """Write the promoter sequences to FASTA and index them with bowtie2-build.

    Reads the module-level ``args`` (``tempFilePre``, ``bowtieBuildParams``,
    ``verbose``) and ``promoterSeqs``.  Sequence ``i`` is written as::

        >i
        <sequence>

    to ``<tempFilePre>.seqs.fasta``; ``bowtie2-build`` is then run on that
    file with ``<tempFilePre>.bowtie2`` as the index base name.  When
    ``args.verbose > 0`` the tool's stdout/stderr are echoed to
    ``sys.stderr``.

    Raises:
        Exception: if ``bowtie2-build`` exits with a non-zero status.
    """
    global args
    global promoterSeqs
    fastaOut = MYUTILS.smartGZOpen(args.tempFilePre + ".seqs.fasta", 'w')
    try:
        for i in range(0, len(promoterSeqs)):
            fastaOut.write(">%i\n%s\n" % (i, promoterSeqs[i]))
    finally:
        # Close the FASTA file even if a write fails.
        fastaOut.close()
    p = subprocess.Popen(
        ["bowtie2-build"] + args.bowtieBuildParams.split() +
        ["%s.seqs.fasta" % args.tempFilePre,
         "%s.bowtie2" % args.tempFilePre],
        stdout=subprocess.PIPE,
        stdin=subprocess.PIPE,
        stderr=subprocess.PIPE,
        # Text mode: communicate() then returns str, which is what
        # sys.stderr.write() requires on Python 3.
        universal_newlines=True)
    if args.verbose > 0:
        sys.stderr.write("Running bowtie2-build...")

    # communicate() reads stdout and stderr until end-of-file and waits for
    # the process to exit.
    (curStdout, stderrData) = p.communicate()
    if args.verbose > 0:
        sys.stderr.write(curStdout)
        sys.stderr.write(stderrData)
    if p.returncode != 0:
        # Do not continue with a missing or partial index.
        raise Exception("bowtie2-build failed with exit code %i:\n%s" %
                        (p.returncode, stderrData))
    if args.verbose > 0:
        sys.stderr.write("done!\n")
Beispiel #3
0
 def prepareNextBatch(self):
     """Load the next batch of k-mer index rows into ``nextBatchX`` / ``nextBatchY``.

     ``nextBatchX`` is an int32 array of shape
     ``(batchSize, seqLen - wordLen + 1)`` and ``nextBatchY`` holds
     ``batchSize`` floats.  Each data line read from ``self.curFH`` is
     tab-separated: field 0 is stored in ``nextBatchY`` and field 1 is the
     sequence.  The sequence is left-padded with ``N`` up to ``seqLen`` or
     cut down to its last ``seqLen`` characters, then every window of
     ``wordLen`` characters is looked up in ``self.kmer2index``.  Lines
     starting with ``#`` and blank lines are skipped.

     At end of file ``numRuns`` is decremented.  If it was 1, or if the
     reopened file yields no data at all, both arrays are truncated to the
     rows filled so far and the method returns; otherwise reading continues
     from the start of the reopened file.
     """
     numWords = self.seqLen - self.wordLen + 1
     self.nextBatchX = np.zeros((self.batchSize, numWords)).astype("int32")
     self.nextBatchY = np.zeros((self.batchSize))
     b = 0
     while b < self.batchSize:
         line = self.curFH.readline()
         if line == "":
             # readline() signals EOF with "" (never None).
             lastPass = self.numRuns == 1
             if not lastPass:
                 self.curFH.close()
                 self.curFH = MYUTILS.smartGZOpen(self.inFP, 'r')
                 line = self.curFH.readline()
             self.numRuns -= 1
             if lastPass or line == "":
                 # nextBatchX is 2-D here, so only two indices are valid.
                 self.nextBatchX = self.nextBatchX[0:b, :]
                 self.nextBatchY = self.nextBatchY[0:b]
                 return
         if not line.strip() or line[0] == "#":
             continue
         curData = line.rstrip().split("\t")
         self.nextBatchY[b] = float(curData[0])
         curSeq = curData[1]
         if len(curSeq) < self.seqLen:
             # prepend Ns if the sequence is too short
             curSeq = "N" * (self.seqLen - len(curSeq)) + curSeq
         # keep only the last seqLen characters if the sequence is too long
         curSeq = curSeq[len(curSeq) - self.seqLen:]
         for si in range(0, numWords):
             # fill X with the indices of the successive k-mers
             self.nextBatchX[b, si] = self.kmer2index[curSeq[si:si + self.wordLen]]
         b += 1
Beispiel #4
0
 def __init__(self, inFP, batchSize, numRuns, seqLen):
     """Store the batching settings, open the input and start loading.

     The arguments are stored on the instance under the same names.  The
     input path is opened for reading with ``MYUTILS.smartGZOpen`` and a
     thread running ``self.prepareNextBatch`` is started immediately.
     """
     # Plain configuration, kept under the same names as the parameters.
     self.inFP, self.batchSize = inFP, batchSize
     self.numRuns, self.seqLen = numRuns, seqLen
     # Handle that prepareNextBatch reads from.
     self.curFH = MYUTILS.smartGZOpen(inFP, 'r')
     # Begin preparing the first batch in the background.
     loader = Thread(target=self.prepareNextBatch)
     self.curThread = loader
     loader.start()
Beispiel #5
0
def saveMatrix(outFileName, rowLabs, colLabs, dataMatrix):
    """Write a labelled matrix as tab-separated text.

    The first line holds the column labels joined by tabs.  Each following
    line holds one row label followed by that row's values, each formatted
    with ``%g`` and preceded by a tab.

    Args:
        outFileName: path passed to ``MYUTILS.smartGZOpen`` for writing.
        rowLabs: sequence of row label strings, one per written row.
        colLabs: sequence of column label strings.
        dataMatrix: 2-D array indexed as ``dataMatrix[i, j]``; its
            ``shape[1]`` gives the number of values written per row.
    """
    outFile = MYUTILS.smartGZOpen(outFileName, "w")
    try:
        outFile.write("\t".join(colLabs) + "\n")
        numCols = dataMatrix.shape[1]
        for i in range(0, len(rowLabs)):
            cells = ["\t%g" % dataMatrix[i, j] for j in range(0, numCols)]
            # One write per row instead of one per cell.
            outFile.write(rowLabs[i] + "".join(cells) + "\n")
    finally:
        # Release the handle even if formatting or writing fails part-way.
        outFile.close()
Beispiel #6
0
	def __init__(self, inFP, batchSize, numRuns,numTFs, numKds):
		"""Store the batching settings, open the input and start loading.

		The arguments are stored on the instance under the same names.  The
		input path is opened for reading with ``MYUTILS.smartGZOpen`` and a
		thread running ``self.prepareNextBatch`` is started immediately.
		"""
		# Plain configuration, kept under the same names as the parameters.
		self.inFP, self.batchSize, self.numRuns = inFP, batchSize, numRuns
		self.numTFs, self.numKds = numTFs, numKds
		# Handle that prepareNextBatch reads from.
		self.curFH = MYUTILS.smartGZOpen(inFP, 'r')
		# Begin preparing the first batch in the background.
		loader = Thread(target=self.prepareNextBatch)
		self.curThread = loader
		loader.start()
Beispiel #7
0
	def __init__(self, inFP, batchSize, numRuns,seqLen, kmer2index, wordLen):
		"""Store the batching settings, open the input and start loading.

		The arguments are stored on the instance under the same names.  The
		input path is opened for reading with ``MYUTILS.smartGZOpen`` and a
		thread running ``self.prepareNextBatch`` is started immediately.
		"""
		# Plain configuration, kept under the same names as the parameters.
		self.inFP, self.batchSize, self.numRuns = inFP, batchSize, numRuns
		self.seqLen, self.wordLen = seqLen, wordLen
		self.kmer2index = kmer2index
		# Handle that prepareNextBatch reads from.
		self.curFH = MYUTILS.smartGZOpen(inFP, 'r')
		# Begin preparing the first batch in the background.
		loader = Thread(target=self.prepareNextBatch)
		self.curThread = loader
		loader.start()
Beispiel #8
0
 def prepareNextBatch(self):
     """Load the next batch of numeric rows into ``nextBatchX`` / ``nextBatchY``.

     ``nextBatchX`` is a zero array of shape ``(batchSize, 4, seqLen, 1)``
     and ``nextBatchY`` holds ``batchSize`` zeros.  Each data line read from
     ``self.curFH`` is parsed as tab-separated floats: the first value is
     stored in ``nextBatchY`` and the remaining values are reshaped to
     ``(4, seqLen)`` and stored in ``nextBatchX[b, :, :, 0]``.  Lines
     starting with ``#`` and blank lines are skipped.

     At end of file ``numRuns`` is decremented.  If it was 1, or if the
     reopened file yields no data at all, both arrays are truncated to the
     rows filled so far and the method returns; otherwise reading continues
     from the start of the reopened file.
     """
     self.nextBatchX = np.zeros((self.batchSize, 4, self.seqLen, 1))
     self.nextBatchY = np.zeros((self.batchSize))
     b = 0
     while b < self.batchSize:
         line = self.curFH.readline()
         if line == "":
             # readline() signals EOF with "" (never None).
             lastPass = self.numRuns == 1
             if not lastPass:
                 self.curFH.close()
                 self.curFH = MYUTILS.smartGZOpen(self.inFP, 'r')
                 line = self.curFH.readline()
             self.numRuns -= 1
             if lastPass or line == "":
                 # Final pass finished, or the input is completely empty:
                 # hand back only the rows that were actually filled.
                 self.nextBatchX = self.nextBatchX[0:b, :, :, :]
                 self.nextBatchY = self.nextBatchY[0:b]
                 return
         if not line.strip() or line[0] == "#":
             continue
         curData = np.fromstring(line, dtype=float, sep="\t")
         self.nextBatchY[b] = curData[0]
         self.nextBatchX[b, :, :, 0] = curData[1:].reshape((4, self.seqLen))
         b += 1
Beispiel #9
0
	def prepareNextBatch(self):
		"""Load the next batch of numeric rows into ``nextBatchX`` / ``nextBatchY``.

		``nextBatchX`` starts as a ``(batchSize, numKds, numTFs)`` array filled
		with ``log(99999.0)`` and ``nextBatchY`` as ``batchSize`` zeros.  Each
		data line read from ``self.curFH`` is parsed as tab-separated floats:
		the first value is stored in ``nextBatchY`` and the remaining values
		are reshaped to ``(numTFs, numKds)``, transposed and stored in
		``nextBatchX[b]``.  Lines starting with ``#`` and blank lines are
		skipped.

		At end of file ``numRuns`` is decremented.  If it was 1, or if the
		reopened file yields no data at all, both arrays are truncated to the
		rows filled so far and the method returns; otherwise reading continues
		from the start of the reopened file.
		"""
		self.nextBatchX = np.zeros((self.batchSize, self.numKds, self.numTFs)) + np.log(99999.0)
		self.nextBatchY = np.zeros((self.batchSize))
		b = 0
		while b < self.batchSize:
			line = self.curFH.readline()
			if line == "":
				# readline() signals EOF with "" (never None).
				lastPass = self.numRuns == 1
				if not lastPass:
					self.curFH.close()
					self.curFH = MYUTILS.smartGZOpen(self.inFP, 'r')
					line = self.curFH.readline()
				self.numRuns -= 1
				if lastPass or line == "":
					# Final pass finished, or the input is completely empty:
					# hand back only the rows that were actually filled.
					self.nextBatchX = self.nextBatchX[0:b, :, :]
					self.nextBatchY = self.nextBatchY[0:b]
					return
			if not line.strip() or line[0] == "#":
				continue
			curData = np.fromstring(line, dtype=float, sep="\t")
			self.nextBatchY[b] = curData[0]
			# Values arrive grouped per TF; transpose to (numKds, numTFs).
			self.nextBatchX[b, :, :] = np.transpose(curData[1:].reshape((self.numTFs, self.numKds)))
			b += 1
Beispiel #10
0
def makeBowtieDB():
    """Write the promoter sequences to FASTA and index them with bowtie2-build.

    Reads the module-level ``args`` (``tempFilePre``, ``bowtieBuildParams``,
    ``verbose``) and ``promoterSeqs``.  Sequence ``i`` is written under the
    header ``>i`` to ``<tempFilePre>.seqs.fasta``; ``bowtie2-build`` is then
    run on that file with ``<tempFilePre>.bowtie2`` as the index base name.
    When ``args.verbose > 0`` the tool's stdout/stderr are echoed to
    ``sys.stderr``.

    Raises:
        Exception: if ``bowtie2-build`` exits with a non-zero status.
    """
    global args
    global promoterSeqs
    fastaOut = MYUTILS.smartGZOpen(args.tempFilePre + ".seqs.fasta", 'w')
    try:
        for i in range(0, len(promoterSeqs)):
            fastaOut.write(">%i\n%s\n" % (i, promoterSeqs[i]))
    finally:
        # Close the FASTA file even if a write fails.
        fastaOut.close()
    p = subprocess.Popen(
        ["bowtie2-build"] + args.bowtieBuildParams.split() +
        ["%s.seqs.fasta" % args.tempFilePre,
         "%s.bowtie2" % args.tempFilePre],
        stdout=subprocess.PIPE,
        stdin=subprocess.PIPE,
        stderr=subprocess.PIPE,
        # Text mode: communicate() then returns str, which is what
        # sys.stderr.write() requires on Python 3.
        universal_newlines=True)
    if args.verbose > 0:
        sys.stderr.write("Running bowtie2-build...")
    # communicate() reads both pipes to end-of-file and waits for exit.
    (curStdout, stderrData) = p.communicate()
    if args.verbose > 0:
        sys.stderr.write(curStdout)
        sys.stderr.write(stderrData)
    if p.returncode != 0:
        # Do not continue with a missing or partial index.
        raise Exception("bowtie2-build failed with exit code %i:\n%s" %
                        (p.returncode, stderrData))
    if args.verbose > 0:
        sys.stderr.write("done!\n")
Beispiel #11
0
                    required=False)
# Optional log destination and verbosity counter.
parser.add_argument(
    '-l', dest='logFP', metavar='<logFile>', required=False,
    help='Where to output errors/warnings [default=stderr]')
parser.add_argument(
    '-v', dest='verbose', action='count', required=False, default=0,
    help='Verbose output?')

args = parser.parse_args()

# Both inputs are opened for reading before any redirection happens.
inFileDict = MYUTILS.smartGZOpen(args.inFPDict, 'r')
inFileSeqs = MYUTILS.smartGZOpen(args.inFPSeqs, 'r')

# With -l, everything written to sys.stderr from here on goes to the log file.
if args.logFP is not None:
    logFile = MYUTILS.smartGZOpen(args.logFP, 'w')
    sys.stderr = logFile

# Results go to the -o file when given, otherwise to stdout.
if args.outFP is not None:
    if args.verbose > 0:
        sys.stderr.write("Outputting to file " + args.outFP + "\n")
    outFile = MYUTILS.smartGZOpen(args.outFP, 'w')
else:
    outFile = sys.stdout

translationDict = dict()
#raise Exception("Reached bad state=%d for '%s.%d' '%s' at line '%s'" %(state,mid,ver,tfid,line));
                    required=False)
# Optional log destination and verbosity counter.
parser.add_argument(
    '-l', dest='logFP', metavar='<logFile>', required=False,
    help='Where to output errors/warnings [default=stderr]')
parser.add_argument(
    '-v', dest='verbose', action='count', required=False, default=0,
    help='Verbose output?')

args = parser.parse_args()

# Both inputs are opened for reading before any redirection happens.
inFile1 = MYUTILS.smartGZOpen(args.inFP1, 'r')
inFile2 = MYUTILS.smartGZOpen(args.inFP2, 'r')

# With -l, everything written to sys.stderr from here on goes to the log file.
if args.logFP is not None:
    logFile = MYUTILS.smartGZOpen(args.logFP, 'w')
    sys.stderr = logFile

# Results go to the -o file when given, otherwise to stdout.
if args.outFP is not None:
    if args.verbose > 0:
        warnings.warn("Outputting to file " + args.outFP)
    outFile = MYUTILS.smartGZOpen(args.outFP, 'w')
else:
    outFile = sys.stdout


def getNextRead(inFile):
    name = inFile.readline().rstrip()
Beispiel #13
0
## Run 'python seqToOHC.py --help' on the command line to list the options.
## argparse reference: https://docs.python.org/3/library/argparse.html
parser = argparse.ArgumentParser(
	description='Converts a set of sequences into a one-hot-code (binary) representation - excludes non [ATGC] chars.  Output in ACGT order, one line per sequence, base then position.')
parser.add_argument(
	'-i', dest='inFP', metavar='<inFile>', required=True,
	help='Input file of sequences with a value in the second column that will preceed the OHC output on each line, separated by a tab')
parser.add_argument(
	'-m', dest='maxLen', metavar='<maxSeqLen>', required=True,
	help='The maximum sequence length to consider (truncated after this point)')
parser.add_argument(
	'-b', dest='orientBack', action='count', required=False, default=0,
	help='Align sequences of different sizes to back [default=front]?')
parser.add_argument(
	'-o', dest='outFP', metavar='<outFile>', required=False,
	help='Where to output results [default=stdout]')
parser.add_argument(
	'-l', dest='logFP', metavar='<logFile>', required=False,
	help='Where to output errors/warnings [default=stderr]')
parser.add_argument(
	'-v', dest='verbose', action='count', required=False, default=0,
	help='Verbose output?')

## Parse the command line into the args namespace.
args = parser.parse_args()

## Open the input file for reading via MYUTILS.smartGZOpen.
inFile = MYUTILS.smartGZOpen(args.inFP, 'r')
## -m arrives as a string; convert it to an integer once.
maxSeqLen = int(args.maxLen)

## With -l, everything written to sys.stderr from here on goes to the
## log file instead.
if args.logFP is not None:
	logFile = MYUTILS.smartGZOpen(args.logFP, 'w')
	sys.stderr = logFile

## Creates output directions (inculding warnings)
if (args.outFP is None):
		## system specific function - standard output, if output
	outFile= sys.stdout;
else:
Beispiel #14
0
    dest='skipAlignment',
    action='count',
    help=
    'Skip the alignment step (e.g. was already done)? - also skip DB creation',
    required=False,
    default=0)

args = parser.parse_args()

## Keep the -v count in a module-level name of its own.
verbose = args.verbose

## With -l, everything written to sys.stderr from here on goes to the
## log file instead.
if args.logFP is not None:
    logFile = MYUTILS.smartGZOpen(args.logFP, 'w')
    sys.stderr = logFile

## Check that bowtie2 is on the PATH: run 'which bowtie2' and keep its exit
## status in p. A non-zero status means the lookup failed, so stop here.
p = subprocess.call(
    ["which", "bowtie2"],
    stdin=subprocess.PIPE,
    stderr=subprocess.PIPE)
if p:
    raise Exception("could not find bowtie2.  Did you use Bowtie2 ?")

## Open the map output for writing via MYUTILS.smartGZOpen. Its name is the
## prefix given with '-o' followed by "_map.txt.gz".
outFileMap = MYUTILS.smartGZOpen(args.outFPre + "_map.txt.gz", 'w')