Ejemplo n.º 1
0
	def __init__(self,file):
		"""Load a two-column, tab-delimited table into ``self.chrs``.

		Every non-blank line must hold exactly two tab-separated fields:
		a key and an integer. The integer is stored under the key in
		``self.chrs``.

		file: path passed to ``MYUTILS.smartGZOpen`` (presumably handles
		      plain and gzip-compressed files -- confirm against MYUTILS).

		Raises ValueError if a line does not have exactly two fields or
		the second field is not an integer.
		"""
		self.chrs = {}
		inFile = MYUTILS.smartGZOpen(file, "r")
		try:
			for l in inFile:
				l = l.rstrip()
				if not l:
					# a blank (often trailing) line carries no record
					continue
				# "size" rather than "len" so the builtin is not shadowed
				c, size = l.split("\t")
				self.chrs[c] = int(size)
		finally:
			# release the handle even when a malformed line raises
			inFile.close()
Ejemplo n.º 2
0
 def prepareNextBatch(self):
     """Fill ``self.nextBatchX`` / ``self.nextBatchY`` from ``self.curFH``.

     Each data line is ``<label>\\t<sequence>``. The label is stored as a
     float in ``nextBatchY``. The sequence is left-padded with "N" or
     trimmed to its last ``self.seqLen`` characters, and every
     ``self.wordLen``-long window is translated through
     ``self.kmer2index`` into one row of ``nextBatchX``. Lines starting
     with "#" are skipped.

     At end of file: when ``self.numRuns == 1`` both arrays are truncated
     to the rows filled so far, ``numRuns`` is decremented and the method
     returns; otherwise ``self.inFP`` is reopened, ``numRuns`` is
     decremented and reading continues.

     Raises KeyError if a window is not a key of ``self.kmer2index``.
     """
     numWords = self.seqLen - self.wordLen + 1
     self.nextBatchX = np.zeros(
         (self.batchSize, numWords)).astype("int32")
     self.nextBatchY = np.zeros((self.batchSize))
     b = 0
     while b < self.batchSize:
         line = self.curFH.readline()
         if line == "":
             if self.numRuns == 1:
                 # nextBatchX is 2-D (batch x words); slicing it with
                 # three indices raised IndexError on the final batch
                 self.nextBatchX = self.nextBatchX[0:b, :]
                 self.nextBatchY = self.nextBatchY[0:b]
                 self.numRuns -= 1
                 return
             # start another pass over the input
             self.curFH.close()
             self.curFH = MYUTILS.smartGZOpen(self.inFP, 'r')
             self.numRuns -= 1
             line = self.curFH.readline()
         if line is None or line[0] == "#": continue
         curData = line.rstrip().split("\t")
         self.nextBatchY[b] = float(curData[0])
         curSeq = curData[1]
         if len(curSeq) < self.seqLen:
             # prepend Ns if the sequence is too short
             curSeq = "N" * (self.seqLen - len(curSeq)) + curSeq
         # trim distal bases if too long (keep the last seqLen characters)
         curSeq = curSeq[(len(curSeq) - self.seqLen):len(curSeq)]
         for si in range(0, numWords):
             # fill X with the indices of the various k-mers
             self.nextBatchX[b, si] = self.kmer2index[
                 curSeq[si:(si + self.wordLen)]]
         b += 1
Ejemplo n.º 3
0
 def prepareNextBatch(self):
     """Build the next (X, Y) batch from the lines of ``self.curFH``.

     ``nextBatchX`` has shape (batchSize, numKds, numTFs) and is
     pre-filled with ``log(99999.0)``. For each data line the first
     tab-separated field becomes the float label in ``nextBatchY``; every
     following field is a ";"-separated list of numbers whose natural
     logs overwrite the first ``min(numKds, count)`` entries of the
     matching column. Lines starting with "#" are skipped.

     At end of file: with ``numRuns == 1`` the arrays are cut down to the
     rows filled so far and the method returns; otherwise ``self.inFP``
     is reopened and reading continues. ``numRuns`` is decremented in
     both cases.
     """
     padValue = np.log(99999.0)
     self.nextBatchX = np.zeros(
         (self.batchSize, self.numKds, self.numTFs)) + padValue
     self.nextBatchY = np.zeros((self.batchSize))
     filled = 0
     while filled < self.batchSize:
         line = self.curFH.readline()
         if line == "":
             if self.numRuns == 1:
                 # last pass: hand back only the rows that were filled
                 self.nextBatchX = self.nextBatchX[0:filled, :, :]
                 self.nextBatchY = self.nextBatchY[0:filled]
                 self.numRuns -= 1
                 return
             # more passes remain: rewind by reopening the input
             self.curFH.close()
             self.curFH = MYUTILS.smartGZOpen(self.inFP, 'r')
             self.numRuns -= 1
             line = self.curFH.readline()
         if line is None or line[0] == "#":
             continue
         fields = line.split("\t")
         self.nextBatchY[filled] = float(fields[0])
         for tfIdx, kdField in enumerate(fields[1:]):
             logKds = [np.log(float(x)) for x in kdField.split(";")]
             nKeep = min(self.numKds, len(logKds))
             self.nextBatchX[filled, 0:nKeep, tfIdx] = logKds[0:nKeep]
         filled += 1
Ejemplo n.º 4
0
 def __init__(self, inFP, batchSize, numRuns, seqLen):
     """Store the batch settings, open the input and start loading.

     inFP:      path handed to ``MYUTILS.smartGZOpen``.
     batchSize: stored as ``self.batchSize``.
     numRuns:   stored as ``self.numRuns``.
     seqLen:    stored as ``self.seqLen``.

     A ``Thread`` running ``self.prepareNextBatch`` is started before the
     constructor returns and kept in ``self.curThread``.
     """
     self.inFP, self.batchSize = inFP, batchSize
     self.numRuns, self.seqLen = numRuns, seqLen
     self.curFH = MYUTILS.smartGZOpen(self.inFP, 'r')
     # every attribute the worker reads must be set before it starts
     loader = Thread(target=self.prepareNextBatch)
     self.curThread = loader
     loader.start()
Ejemplo n.º 5
0
	def readGFF(self,gffFP): #readGFF
		"""Read the records of a tab-delimited file into ``self.allData``.

		Blank lines and lines starting with "#", ">" or "track name="
		are skipped. Each remaining line is split on tabs and reduced to
		the nine columns selected by the module-level GFF_* indices, in
		the order chromosome, start, end, name, strand, score, RF,
		source, type.

		gffFP: path handed to ``MYUTILS.smartGZForeach``.
		"""
		self.allData = []
		wantedCols = (GFF_CHR, GFF_ST, GFF_EN, GFF_NAME, GFF_STR,
		              GFF_SCORE, GFF_RF, GFF_SOURCE, GFF_TYPE)
		for rawLine in MYUTILS.smartGZForeach(gffFP):
			entry = rawLine.rstrip()
			# skip blanks, comments, FASTA-style headers and track lines
			if entry == "" or entry[0] in ("#", ">") or entry[0:11] == "track name=":
				continue
			fields = entry.split("\t")
			self.allData.append([fields[col] for col in wantedCols])
Ejemplo n.º 6
0
def saveMatrix(outFileName, rowLabs, colLabs, dataMatrix):
    """Write a labelled 2-D matrix as tab-delimited text.

    The first line is the column labels joined by tabs. Each following
    line is a row label followed by that row's values formatted with
    ``%g``.

    outFileName: path handed to ``MYUTILS.smartGZOpen`` in "w" mode.
    rowLabs:     one string label per written row.
    colLabs:     string labels for the header line.
    dataMatrix:  indexed as ``dataMatrix[i, j]``; ``shape[1]`` gives the
                 number of columns written per row.
    """
    outFile = MYUTILS.smartGZOpen(outFileName, "w")
    try:
        outFile.write("\t".join(colLabs) + "\n")
        numCols = dataMatrix.shape[1]
        for i, rowLab in enumerate(rowLabs):
            cells = "".join(
                "\t%g" % dataMatrix[i, j] for j in range(numCols))
            outFile.write(rowLab + cells + "\n")
    finally:
        # close the handle even if formatting or indexing fails part-way
        outFile.close()
Ejemplo n.º 7
0
def saveMatrix(outFileName, rowLabs, colLabs, dataMatrix):
	"""Write a labelled 2-D matrix as tab-delimited text.

	The first line is the column labels joined by tabs. Each following
	line is a row label followed by that row's values formatted with
	``%g``.

	outFileName: path handed to ``MYUTILS.smartGZOpen`` in "w" mode.
	rowLabs:     one string label per written row.
	colLabs:     string labels for the header line.
	dataMatrix:  indexed as ``dataMatrix[i, j]``; ``shape[1]`` gives the
	             number of columns written per row.
	"""
	outFile = MYUTILS.smartGZOpen(outFileName, "w")
	try:
		outFile.write("\t".join(colLabs) + "\n")
		numCols = dataMatrix.shape[1]
		for i, rowLab in enumerate(rowLabs):
			cells = "".join("\t%g" % dataMatrix[i, j] for j in range(numCols))
			outFile.write(rowLab + cells + "\n")
	finally:
		# close the handle even if formatting or indexing fails part-way
		outFile.close()
Ejemplo n.º 8
0
 def prepareNextBatch(self):
     """Build the next (X, Y) batch from the lines of ``self.curFH``.

     Each data line is a tab-separated row of numbers. The first number
     is the label stored in ``nextBatchY``; the rest are reshaped to
     (4, seqLen) and stored in ``nextBatchX[row, :, :, 0]``. Lines
     starting with "#" are skipped.

     At end of file: with ``numRuns == 1`` the arrays are cut down to the
     rows filled so far and the method returns; otherwise ``self.inFP``
     is reopened and reading continues. ``numRuns`` is decremented in
     both cases.
     """
     self.nextBatchX = np.zeros((self.batchSize, 4, self.seqLen, 1))
     self.nextBatchY = np.zeros((self.batchSize))
     row = 0
     while row < self.batchSize:
         line = self.curFH.readline()
         if line == "":
             if self.numRuns == 1:
                 # last pass: hand back only the rows that were filled
                 self.nextBatchX = self.nextBatchX[:row]
                 self.nextBatchY = self.nextBatchY[:row]
                 self.numRuns -= 1
                 return
             # more passes remain: rewind by reopening the input
             self.curFH.close()
             self.curFH = MYUTILS.smartGZOpen(self.inFP, 'r')
             self.numRuns -= 1
             line = self.curFH.readline()
         if line is None or line[0] == "#":
             continue
         values = np.fromstring(line, dtype=float, sep="\t")
         label, encoded = values[0], values[1:]
         self.nextBatchY[row] = label
         self.nextBatchX[row, :, :, 0] = encoded.reshape((4, self.seqLen))
         row += 1
Ejemplo n.º 9
0
	def readBED(self,bedFP): #readBED
		"""Read the records of a tab-delimited file into ``self.allData``.

		Blank lines and lines starting with "#", ">" or "track name="
		are skipped. Each stored record is a list: chromosome, start,
		end, then name, strand and score (each replaced by "" when the
		line is too short to contain that column). When the line has at
		least ``BED_SCORE + 2`` fields, a final element holds the list
		of fields from index 6 onwards.

		bedFP: path handed to ``MYUTILS.smartGZForeach``.
		"""
		self.allData = []
		for rawLine in MYUTILS.smartGZForeach(bedFP):
			entry = rawLine.rstrip()
			# skip blanks, comments, FASTA-style headers and track lines
			if entry == "" or entry[0] in ("#", ">") or entry[0:11] == "track name=":
				continue
			fields = entry.split("\t")
			nFields = len(fields)
			record = [fields[BED_CHR], fields[BED_ST], fields[BED_EN]]
			# optional columns default to the empty string when absent
			for col in (BED_NAME, BED_STR, BED_SCORE):
				record.append(fields[col] if nFields >= (col + 1) else "")
			if nFields >= (BED_SCORE + 2):
				record.append(fields[6:])  # the rest
			self.allData.append(record)
Ejemplo n.º 10
0
parser.add_argument(
    '-l',
    dest='logFP',
    metavar='<logFile>',
    required=False,
    help='Where to output errors/warnings [default=stderr]')
parser.add_argument(
    '-v',
    dest='verbose',
    action='count',
    default=0,
    required=False,
    help='Verbose output?')

args = parser.parse_args()

# When a log file is requested, everything written to sys.stderr goes there.
if args.logFP is not None:
    logFile = MYUTILS.smartGZOpen(args.logFP, 'w')
    sys.stderr = logFile

#raise Exception("Reached bad state=%d for '%s.%d' '%s' at line '%s'" %(state,mid,ver,tfid,line));

# Consecutive 0-based field indices (presumably columns of the input
# records -- the code that uses them is outside this view).
(I_VARID, I_SNP, I_CHR, I_POS, I_GC, I_LEN, I_J1, I_REFA, I_ALTA,
 I_INCSNP, I_SEQL) = range(11)
Ejemplo n.º 11
0
# Command-line options. The numeric options are parsed as strings and
# converted to int right after parse_args().
parser.add_argument('-it', dest='inTagMap', metavar='<inTagMap>', help='Input file of the mRNA tag -enhancer map', required=True)
parser.add_argument('-iq', dest='inFastq', metavar='<inFastq>', help='Input file of fastq barcodes', required=True)
parser.add_argument('-c', dest='constT', metavar='<constT>', help='the constant region following the reads [default=None]', required=False)
parser.add_argument('-mc', dest='mmc', metavar='<mismatchesConst>', help='the number of mismatches to allow in constant region [default=1]', required=False, default=1)
# default=0 matches the help text; without it args.mmt was None whenever
# -mt was omitted and int(args.mmt) below raised TypeError.
parser.add_argument('-mt', dest='mmt', metavar='<mismatchesTag>', help='the number of mismatches to allow in tag region [default=0]', required=False, default=0)
parser.add_argument('-o', dest='outFPre', metavar='<outFilePre>', help='Where to output results, prefix', required=True)
parser.add_argument('-l', dest='logFP', metavar='<logFile>', help='Where to output errors/warnings [default=stderr]', required=False)
parser.add_argument('-nc', dest='noConstCheck', action='count', help='Ignore constant region matching', required=False, default=0)
parser.add_argument('-v', dest='verbose', action='count', help='Verbose output?', required=False, default=0)

args = parser.parse_args()
args.mmc = int(args.mmc)
args.mmt = int(args.mmt)

# When a log file is requested, everything written to sys.stderr goes there.
if args.logFP is not None:
	logFile = MYUTILS.smartGZOpen(args.logFP, 'w')
	sys.stderr = logFile

sys.stderr.write("Compiling possible mismatches to constant region...\n")
#adds mismatches to tag2tag hash
def addMMToTags(myHash, baseSeq, numToAdd, alphabet, ref):
	"""Register ``baseSeq`` and its substitution variants in ``myHash``.

	With ``numToAdd`` > 0, every single-position replacement of
	``baseSeq`` by a letter of ``alphabet`` (including the unchanged
	letter) is recursed on with ``numToAdd - 1``.

	With ``numToAdd`` <= 0, ``baseSeq`` is mapped to ``ref`` unless it is
	already mapped to a different value. In that case the existing entry
	is kept if it maps the sequence to itself (an exact match), and is
	otherwise overwritten with "NA".

	myHash is modified in place; nothing is returned.
	"""
	if numToAdd > 0:
		for pos in range(len(baseSeq)):
			prefix, suffix = baseSeq[:pos], baseSeq[pos + 1:]
			for letter in alphabet:
				addMMToTags(myHash, prefix + letter + suffix, numToAdd - 1, alphabet, ref)
		return
	collides = baseSeq in myHash and myHash[baseSeq] != ref
	if not collides:
		myHash[baseSeq] = ref
	elif baseSeq != myHash[baseSeq]:
		# claimed by another reference and not an exact match: ambiguous
		myHash[baseSeq] = "NA"
Ejemplo n.º 12
0
import MYUTILS
import GENOMEDATA
import sys

# Load the loci to scan: BED when args.inBED is set, GFF otherwise.
if args.inBED > 0:
	scanThese = GENOMEDATA.BED(args.lociFile)
else:  # GFF
	scanThese = GENOMEDATA.GFF(args.lociFile)

# inclusive is 1 unless args.exclusive is set
inclusive = 0 if args.exclusive > 0 else 1


# When a log file is requested, everything written to sys.stderr goes there.
if args.logFP is not None:
	logFile = MYUTILS.smartGZOpen(args.logFP, 'w')
	sys.stderr = logFile

outFile = MYUTILS.smartGZOpen(args.outFP, 'w')

#reverse loci if req
if args.rev > 0:
	scanThese.flipStrands_()

if scanThese.length() == 0:
	# nothing to scan: leave an empty output file and stop
	outFile.write("")
	outFile.close()
	# sys.exit() rather than quit(): quit is injected by the site module
	# for interactive use and is absent when site is not loaded
	sys.exit()
#calculate average and max length of GFF entries
scanThese.coord_to_i_()
lengthMax = 0
Ejemplo n.º 13
0
# this is for debugging in interactive mode - comment out normally
#args = lambda: None
#setattr(args,"sample","0.01")
#setattr(args,"dim","3")
#setattr(args,"inFP","calcPOpenTestData.txt")
#setattr(args,"outFPre","calcPOpenTestData_out")
#setattr(args,"chrsFile","/home/unix/cgdeboer/genomes/sc/20110203_R64/chrom.sizes");
#setattr(args,"logFP",None);
	

# numeric options are parsed as strings; convert them once here
args.sample = float(args.sample)
args.dim = float(args.dim)

# When a log file is requested, everything written to sys.stderr goes there.
if args.logFP is not None:
	logFile = MYUTILS.smartGZOpen(args.logFP, 'w')
	sys.stderr = logFile


# Accumulators, each a separate list.
IDs, files, smoothings = [], [], []
isOpenness, defaultVal, doLog, clusterInit = [], [], [], []
inFile = MYUTILS.smartGZOpen(args.inFP, 'r')
for line in inFile:
	# skip empty lines and "#" comments
	if not line or line[0] == "#":
		continue
	data = line.rstrip().split("\t")
	IDs.append(data[0])
Ejemplo n.º 14
0
parser.add_argument(
	'-is', dest='inSAM', metavar='<inFile>', required=True,
	help='Input file of aligned reads to enhancers in sam format. must be sorted (unix sort)')
parser.add_argument(
	'-ib', dest='inBarcodes', metavar='<inFile>', required=True,
	help='Input file of barcodes in read\tbarcode format (must also be sorted)')
parser.add_argument(
	'-o', dest='outFP', metavar='<outFile>', required=False,
	help='Where to output results [default=stdout]')
parser.add_argument(
	'-l', dest='logFP', metavar='<logFile>', required=False,
	help='Where to output errors/warnings [default=stderr]')
parser.add_argument(
	'-v', dest='verbose', action='count', required=False, default=0,
	help='Verbose output?')

args = parser.parse_args()

# 0-based indices into a tab-split SAM line, listed in column order
SAM_ID = 0
SAM_REF = 2
SAM_STARTPOS = 3
SAM_SCORE = 4
SAM_CIGAR = 5
SAM_TLEN = 8

inSAM = MYUTILS.smartGZOpen(args.inSAM, 'r')
inBC = MYUTILS.smartGZOpen(args.inBarcodes, 'r')


# When a log file is requested, everything written to sys.stderr goes there.
if args.logFP is not None:
	logFile = MYUTILS.smartGZOpen(args.logFP, 'w')
	sys.stderr = logFile

# Results go to the named file, or to stdout when -o was not given.
if args.outFP is not None:
	if args.verbose > 0:
		warnings.warn("Outputting to file " + args.outFP)
	outFile = MYUTILS.smartGZOpen(args.outFP, 'w')
else:
	outFile = sys.stdout


# prime the reader with the first SAM record, split into its fields
nextSAMLine = inSAM.readline().rstrip().split("\t")
import numpy as np
import scipy as sp
#from scipy.ndimage.filters import gaussian_filter;
#from scipy import linalg
#import matplotlib
#matplotlib.use('Agg')
#import matplotlib.pyplot as plt
#from sklearn import mixture
import sys
from bx.intervals.io import GenomicIntervalReader
from bx.bbi.bigwig_file import BigWigFile


# this is for debugging in interactive mode - comment out normally
# When a log file is requested, everything written to sys.stderr goes there.
if args.logFP is not None:
	logFile = MYUTILS.smartGZOpen(args.logFP, 'w')
	sys.stderr = logFile

# oldToNew maps every name on a line (including the first) to that line's
# first name; transChrs collects every name seen, in file order.
oldToNew = {}
transChrs = []
inFile = MYUTILS.smartGZOpen(args.inMap, 'r')
for line in inFile:
	# skip empty lines and "#" comments
	if not line or line[0] == "#":
		continue
	data = line.rstrip().split("\t")
	newName = data[0]
	for oldName in data:
		oldToNew[oldName] = newName
		transChrs.append(oldName)

inFile.close()

chromSizesFile = MYUTILS.smartGZOpen(args.chrsFile, 'r')
Ejemplo n.º 16
0
import re
import sys
import argparse
parser = argparse.ArgumentParser(
  description='Goes through the provided set of genomic coordinates and throws out any that are for invalid chromosomes, or are outside the bounds of a chromosome.')
parser.add_argument(
  '-i', dest='inFP', metavar='<inFile>', required=True,
  help='Input file of coordinates')
parser.add_argument(
  '-s', dest='chromSizes', metavar='<chromSizes>', required=True,
  help='Input file of chrom.sizes')
parser.add_argument(
  '-o', dest='outFP', metavar='<outFile>', required=False,
  help='Where to output results [default=stdout]')
parser.add_argument(
  '-l', dest='logFP', metavar='<logFile>', required=False,
  help='Where to output errors/warnings [default=stderr]')
parser.add_argument(
  '-v', dest='verbose', action='count', required=False, default=0,
  help='Verbose output?')

args = parser.parse_args()



# first column -> integer second column, read from the chrom.sizes file
chromSizes = {}
chromSizesFile = MYUTILS.smartGZOpen(args.chromSizes, 'r')
for line in chromSizesFile:
  # skip empty lines and "#" comments
  if not line or line[0] == "#":
    continue
  fields = line.rstrip().split("\t")
  chromSizes[fields[0]] = int(fields[1])


# When a log file is requested, everything written to sys.stderr goes there.
if args.logFP is not None:
  logFile = MYUTILS.smartGZOpen(args.logFP, 'w')
  sys.stderr = logFile

# Results go to the named file, or to stdout when -o was not given.
if args.outFP is not None:
  if args.verbose > 0:
    warnings.warn("Outputting to file " + args.outFP)
  outFile = MYUTILS.smartGZOpen(args.outFP, 'w')
else:
  outFile = sys.stdout
Ejemplo n.º 17
0
import warnings
import MYUTILS
import sys
import argparse
parser = argparse.ArgumentParser(description='DESCRIPTION.')
parser.add_argument(
	'-i', dest='inFP', metavar='<inFile>', required=True,
	help='Input file of fastq')
parser.add_argument(
	'-s', dest='startPos', metavar='<startPos>', required=True,
	help='Where the barcode starts in fastq')
parser.add_argument(
	'-n', dest='numBases', metavar='<numBases>', required=True,
	help='Length of barcode')
parser.add_argument(
	'-o', dest='outFP', metavar='<outFile>', required=False,
	help='Where to output results [default=stdout]')
parser.add_argument(
	'-l', dest='logFP', metavar='<logFile>', required=False,
	help='Where to output errors/warnings [default=stderr]')
parser.add_argument(
	'-v', dest='verbose', action='count', required=False, default=0,
	help='Verbose output?')

args = parser.parse_args()


inFile = MYUTILS.smartGZOpen(args.inFP, 'r')

# numeric options are parsed as strings; convert them once here
args.startPos = int(args.startPos)
args.numBases = int(args.numBases)

# When a log file is requested, everything written to sys.stderr goes there.
if args.logFP is not None:
	logFile = MYUTILS.smartGZOpen(args.logFP, 'w')
	sys.stderr = logFile

# Results go to the named file, or to stdout when -o was not given.
if args.outFP is not None:
	if args.verbose > 0:
		warnings.warn("Outputting to file " + args.outFP)
	outFile = MYUTILS.smartGZOpen(args.outFP, 'w')
else:
	outFile = sys.stdout

Ejemplo n.º 18
0
                    required=True)
parser.add_argument(
    '-l',
    dest='logFP',
    metavar='<logFile>',
    required=False,
    help='Where to output errors/warnings [default=stderr]')
parser.add_argument(
    '-v',
    dest='verbose',
    action='count',
    default=0,
    required=False,
    help='Verbose output?')

args = parser.parse_args()

inFile = MYUTILS.smartGZOpen(args.inFP, 'r')

# When a log file is requested, everything written to sys.stderr goes there.
if args.logFP is not None:
    logFile = MYUTILS.smartGZOpen(args.logFP, 'w')
    sys.stderr = logFile

#raise Exception("Reached bad state=%d for '%s.%d' '%s' at line '%s'" %(state,mid,ver,tfid,line));
# Consecutive 0-based field indices: a read id followed by
# ref/mapq/cigar/start for R1 and then for R2 (presumably columns of the
# input records -- the code that uses them is outside this view).
(F_RID,
 F_R1_REF, F_R1_MAPQ, F_R1_CIGAR, F_R1_START,
 F_R2_REF, F_R2_MAPQ, F_R2_CIGAR, F_R2_START) = range(9)
Ejemplo n.º 19
0
import sys
from bx.intervals.io import GenomicIntervalReader
from bx.bbi.bigwig_file import BigWigFile

# options are parsed as strings; convert them once here
args.sample = float(args.sample)
args.dim = float(args.dim)
args.iterations = int(args.iterations)
args.components = int(args.components)
args.chunks = int(args.chunks)
args.scaleTo = args.scaleTo.upper()

# -1 means "components not given", which is only accepted for PCA
if args.components == -1 and args.approach != "PCA":
	raise Exception("components has no default for NMF/ICA - must specify with -x")

# When a log file is requested, everything written to sys.stderr goes there.
if args.logFP is not None:
	logFile = MYUTILS.smartGZOpen(args.logFP, 'w')
	sys.stderr = logFile


# Accumulators, each a separate list.
IDs, files, smoothings = [], [], []
defaultVal, doLog = [], []
inFile = MYUTILS.smartGZOpen(args.inFP, 'r')
for line in inFile:
	# skip empty or whitespace-only lines and "#" comments
	if not line or line.rstrip() == "" or line[0] == "#":
		continue
	data = line.rstrip().split("\t")
	if len(data) != 5:
		raise Exception("Incorrect number of fields in input: %s\n" % (line))
	IDs.append(data[0])
	files.append(data[1])
Ejemplo n.º 20
0
import scipy as sp
from scipy.ndimage.filters import gaussian_filter;
from scipy.ndimage.filters import uniform_filter1d;
#from scipy import linalg
#import matplotlib
#matplotlib.use('Agg')
#import matplotlib.pyplot as plt
#from sklearn import mixture
import re
import sys
from bx.intervals.io import GenomicIntervalReader
from bx.bbi.bigwig_file import BigWigFile

# this is for debugging in interactive mode - comment out normally
# When a log file is requested, everything written to sys.stderr goes there.
if args.logFP is not None:
	logFile = MYUTILS.smartGZOpen(args.logFP, 'w')
	sys.stderr = logFile

# options are parsed as strings; convert the ones that were supplied
args.chunks = int(args.chunks)
if args.lessThan is not None:
	args.lessThan = float(args.lessThan)
if args.greaterThan is not None:
	args.greaterThan = float(args.greaterThan)
# at least one of the two thresholds is required
if not (args.lessThan is not None or args.greaterThan is not None):
	raise Exception("must specify one of -gt or -lt!")





chromSizesFile = MYUTILS.smartGZOpen(args.chrsFile, 'r')