def readFastaSequences(fileName, out_type='DataSet'):
    """
    Reads a file in fasta format and returns the sequences in a DataSet object.

    Every line starting with '>' opens a new record; the remainder of that
    line (passed through mixture.chomp) is used as the record ID. The lines
    up to the next header are concatenated, a trailing '//' terminator is
    removed, and the result is upper-cased and split into single symbols.

    Text preceding the first header is dropped. The sequence accumulated at
    end of file is always emitted, so a file without any header still yields
    one sequence that has no matching ID.

    NOTE(review): this definition is shadowed by a second definition of
    readFastaSequences with the same logic further down in the file.

    @param fileName: Name of the input file
    @param out_type: type of output object: 'DataSet' or 'ConstrainedDataSet'
    @return: the populated mixture.DataSet or mixture.ConstrainedDataSet
    @raise TypeError: if out_type is not one of the two supported names
    """
    def _finish(raw):
        # Strip the optional '//' terminator; upper case letters by convention.
        if raw.endswith("//"):
            raw = raw[:-2]
        return list(raw.upper())

    nameReg = re.compile(r"^\>(.*)")
    seqM = []
    nameList = []
    chunks = []          # pieces of the record currently being read
    seen_header = False

    # The context manager closes the file even if parsing raises.
    with open(fileName, "r") as f:
        for line in f:
            s = nameReg.search(line)
            if s:
                # A new header terminates the previous record, if there was one.
                if seen_header:
                    seqM.append(_finish("".join(chunks)))
                chunks = []
                seen_header = True
                nameList.append(mixture.chomp(s.group(1)))
            else:
                chunks.append(mixture.chomp(line))

    # The last record has no following header to flush it.
    seqM.append(_finish("".join(chunks)))

    if out_type == 'DataSet':
        data = mixture.DataSet()
    elif out_type == 'ConstrainedDataSet':
        data = mixture.ConstrainedDataSet()
    else:
        raise TypeError('Invalid output type ' + str(out_type))

    data.fromList(seqM, IDs=nameList)
    return data
def readFastaSequences(fileName, out_type='DataSet'):
    """
    Reads a file in fasta format and returns the sequences in a DataSet object.

    Header lines start with '>'; the text after the marker (passed through
    mixture.chomp) becomes the ID of the record. All non-header lines that
    follow are joined into one string, a trailing '//' is cut off, and the
    string is upper-cased and turned into a list of single characters.

    Anything before the first header is discarded. Whatever has been
    accumulated when the file ends is always appended as a sequence, even if
    the file contained no header at all (that sequence then has no ID).

    NOTE(review): an earlier definition of readFastaSequences with the same
    logic exists above in this file; this later one is the binding in effect.

    @param fileName: Name of the input file
    @param out_type: type of output object: 'DataSet' or 'ConstrainedDataSet'
    @return: the populated mixture.DataSet or mixture.ConstrainedDataSet
    @raise TypeError: if out_type is not one of the two supported names
    """
    def _to_symbols(text):
        # Remove the optional '//' end marker; upper case letters by convention.
        if text.endswith("//"):
            text = text[:-2]
        return list(text.upper())

    nameReg = re.compile(r"^\>(.*)")
    seqM = []
    nameList = []
    parts = []             # line fragments of the current record
    have_record = False    # becomes True once the first header was read

    # 'with' guarantees the handle is released on success and on error.
    with open(fileName, "r") as f:
        for line in f:
            hit = nameReg.search(line)
            if hit is None:
                parts.append(mixture.chomp(line))
                continue
            # Header line: flush the record that just ended, start a new one.
            if have_record:
                seqM.append(_to_symbols("".join(parts)))
            parts = []
            have_record = True
            nameList.append(mixture.chomp(hit.group(1)))

    # End of file closes the final record.
    seqM.append(_to_symbols("".join(parts)))

    if out_type == 'DataSet':
        data = mixture.DataSet()
    elif out_type == 'ConstrainedDataSet':
        data = mixture.ConstrainedDataSet()
    else:
        raise TypeError('Invalid output type ' + str(out_type))

    data.fromList(seqM, IDs=nameList)
    return data
def readUCSCPrior(filename):
    """
    Reads files in the UCSC Dirichlet Mixture prior (DMP) format
    (http://www.soe.ucsc.edu/compbio/dirichlets/) and converts them into PyMix
    DirichletMixturePrior objects.

    Three kinds of lines are evaluated, all others are ignored:
      - 'Mixture= <w>' : mixture weight of one component
      - 'Order = ...'  : the alphabet symbols, separated by single blanks
      - 'Alpha= ...'   : blank-separated numbers; the first one is the sum of
                         the others and is dropped, the rest are the Dirichlet
                         parameters of one component

    Note that the alphabet in the UCSC priors does not contain the gap symbol.
    For the output DirichletMixturePrior the gap symbol '-' is appended to the
    alphabet with a parameter value of 0.01 in all components.

    @param filename: file in UCSC DMP format
    @return: tuple (mixture.Alphabet, mixture.DirichletMixturePrior)
    @raise ValueError: if the file has no 'Order' line, or an 'Alpha' entry
        cannot be converted to float
    """
    # Patterns are unchanged from the original; only written as raw strings.
    # NOTE(review): the '.' in ex1 is unescaped and matches any character.
    ex1 = re.compile(r'Mixture=\s(\d+.\d+)')
    ex2 = re.compile(r'Order\s*=\s+([A-Z\s]+)')
    ex3 = re.compile(r'Alpha=\s+([\d+.\d+\s+,\d+e-]+)')

    pi = []
    sigma = None
    alpha_mat = []

    # The context manager closes the file even if a line fails to parse.
    with open(filename, 'r') as f:
        for l in f:
            l = mixture.chomp(l)

            m1 = ex1.match(l)
            if m1:
                pi.append(float(m1.group(1)))

            m2 = ex2.match(l)
            if m2:
                sigma = m2.group(1).split(' ')

            m3 = ex3.match(l)
            if m3:
                values = [float(v) for v in m3.group(1).split(' ')]
                # first entry is the sum of the others -> remove
                alpha_mat.append(values[1:])

    if sigma is None:
        # Previously this surfaced as an AttributeError on None below.
        raise ValueError("No 'Order' line found in UCSC prior file "
                         + str(filename))

    # integrate gap character '-' into the alphabet
    sigma.append('-')
    alphabet = mixture.Alphabet(sigma)

    # 21 is hard-coded as in the original; presumably 20 symbols plus the
    # gap character -- TODO confirm against the alphabet actually read.
    dComp = []
    for alpha in alpha_mat:
        alpha.append(0.01)  # add hyper parameter for '-'
        dComp.append(mixture.DirichletPrior(21, alpha))

    prior = mixture.DirichletMixturePrior(len(dComp), 21, pi, dComp)
    return alphabet, prior
def readSites(fileName):
    """
    Flat file parser for the JASPAR .sites format. The files are essentially
    fasta but there is a count matrix at the end of the file.

    Header lines start with '>' and are split at tab characters; the ID of a
    site is built from the second and third field as '<field1>_<field2>'.
    Reading stops at the first line that looks like the start of the count
    matrix (a nucleotide letter followed by '['). From every other non-empty
    line only the upper case characters are kept.

    @param fileName: File name of .sites file
    @return: DataSet object
    @raise IndexError: if a header line has fewer than three tab-separated
        fields
    """
    seq_head = re.compile(r"^\>(.*)")
    end = re.compile(r"^[A,C,G,T,a,c,g,t]\s*\[")

    seq = []
    ids = []

    # The context manager closes the file also when we break out early.
    with open(fileName, "r") as f:
        for line in f:
            line = mixture.chomp(line)

            s = seq_head.search(line)
            if s:
                tl = s.group(1).split('\t')
                ids.append(str(tl[1]) + '_' + str(tl[2]))
            elif end.search(line):
                # the count matrix begins here, no more sites follow
                break
            elif len(line) > 0:
                # remove lower case letters, only upper case letters are
                # part of the binding site
                seq.append([ch for ch in line if ch.isupper()])

    data = mixture.DataSet()
    data.fromList(seq, IDs=ids)
    return data
def readAlnData(fn, reg_str=None, out_type='DataSet'):
    """
    Parses a CLUSTALW format .aln multiple alignment file and returns a
    mixture.DataSet object.

    The first line of the file is skipped. Every following line is searched
    with the parsing expression; group 1 is taken as the sequence key and
    group 2 as a piece of that sequence. Pieces with the same key are
    concatenated in the order they occur. Lines that do not match are
    ignored. The order of the sequences in the result follows the iteration
    order of the internal dictionary.

    NOTE(review): this definition is shadowed by a second definition of
    readAlnData with the same logic further down in the file.

    @param fn: name of the .aln file
    @param reg_str: regular expression for sequence parsing; must provide two
        groups (key, sequence piece). If empty or None a default is used.
    @param out_type: type of output object: 'DataSet' or 'ConstrainedDataSet'
    @return: DataSet object
    @raise TypeError: if out_type is not one of the two supported names
    """
    if reg_str:
        parse = re.compile(reg_str)
    else:
        parse = re.compile(r"(\w+\|\w+)\s+([\w,\-,.]+)")

    # key -> list of sequence pieces; joined once at the end instead of
    # growing a string on every block of the alignment
    d = {}

    # The context manager closes the file even if parsing raises.
    with open(fn, 'r') as f:
        f.readline()  # remove first line
        for l in f:
            pat = parse.search(mixture.chomp(l))
            if pat:
                d.setdefault(pat.group(1), []).append(pat.group(2))

    if out_type == 'DataSet':
        data = mixture.DataSet()
    elif out_type == 'ConstrainedDataSet':
        data = mixture.ConstrainedDataSet()
    else:
        raise TypeError('Invalid output type ' + str(out_type))

    # Materialise the keys once so IDs and rows are guaranteed to line up
    # (and IDs is a real list on Python 3 as well).
    sIDs = list(d)
    dMatrix = [list("".join(d[z])) for z in sIDs]

    data.fromList(dMatrix, IDs=sIDs)
    return data
def readAlnData(fn, reg_str=None, out_type='DataSet'):
    """
    Parses a CLUSTALW format .aln multiple alignment file and returns a
    mixture.DataSet object.

    After skipping the first line, each line is searched with the parsing
    expression. Group 1 identifies the sequence, group 2 is the fragment of
    that sequence on the current line; fragments belonging to the same
    identifier are appended to each other in file order. Non-matching lines
    are skipped. Sequences appear in the result in the iteration order of the
    internal dictionary.

    NOTE(review): an earlier definition of readAlnData with the same logic
    exists above in this file; this later one is the binding in effect.

    @param fn: name of the .aln file
    @param reg_str: regular expression for sequence parsing; must provide two
        groups (key, sequence piece). If empty or None a default is used.
    @param out_type: type of output object: 'DataSet' or 'ConstrainedDataSet'
    @return: DataSet object
    @raise TypeError: if out_type is not one of the two supported names
    """
    default_expr = r"(\w+\|\w+)\s+([\w,\-,.]+)"
    parse = re.compile(reg_str if reg_str else default_expr)

    # identifier -> list of fragments, concatenated once after reading
    fragments = {}

    # 'with' releases the file handle on success and on error.
    with open(fn, 'r') as f:
        f.readline()  # remove first line
        for l in f:
            pat = parse.search(mixture.chomp(l))
            if pat is None:
                continue
            key = pat.group(1)
            if key in fragments:
                fragments[key].append(pat.group(2))
            else:
                fragments[key] = [pat.group(2)]

    if out_type == 'DataSet':
        data = mixture.DataSet()
    elif out_type == 'ConstrainedDataSet':
        data = mixture.ConstrainedDataSet()
    else:
        raise TypeError('Invalid output type ' + str(out_type))

    # Take one snapshot of the keys so that IDs and rows share the same
    # order and IDs is a plain list on Python 3 too.
    sIDs = list(fragments)
    dMatrix = []
    for key in sIDs:
        dMatrix.append(list("".join(fragments[key])))

    data.fromList(dMatrix, IDs=sIDs)
    return data
def readJASPAR(fileName):
    """
    Reads a flat file of JASPAR binding sites matrices. JASPAR files are
    essentially fasta, but only upper case letters are part of the binding
    site proper. Lower case letters are discarded.

    The content of the header lines is not used; sites are named 'seq1',
    'seq2', ... in the order their headers occur. Reading stops at the first
    line that looks like the start of the count matrix (a nucleotide letter
    followed by '[').

    NOTE(review): this definition is shadowed by a second definition of
    readJASPAR with the same logic further down in the file.

    @param fileName: name of the JASPAR flat file
    @return: DataSet object
    """
    seq_head = re.compile(r"^\>(.*)")
    end = re.compile(r"^[A,C,G,T,a,c,g,t]\s*\[")

    seq = []
    ids = []
    count = 1

    # The context manager closes the file also when we break out early.
    with open(fileName, "r") as f:
        for line in f:
            line = mixture.chomp(line)

            if seq_head.search(line):
                ids.append('seq' + str(count))
                count += 1
            elif end.search(line):
                # the count matrix begins here, no more sites follow
                break
            elif len(line) > 0:
                # remove lower case letters, only upper case letters are
                # part of the binding site
                seq.append([ch for ch in line if ch.isupper()])

    data = mixture.DataSet()
    data.fromList(seq, IDs=ids)
    return data
def readJASPAR(fileName):
    """
    Reads a flat file of JASPAR binding sites matrices. JASPAR files are
    essentially fasta, but only upper case letters are part of the binding
    site proper. Lower case letters are discarded.

    Header text is ignored: each header merely produces the next running ID
    ('seq1', 'seq2', ...). Parsing ends as soon as a line matches the start
    of the trailing count matrix (a nucleotide letter followed by '[').

    NOTE(review): an earlier definition of readJASPAR with the same logic
    exists above in this file; this later one is the binding in effect.

    @param fileName: name of the JASPAR flat file
    @return: DataSet object
    """
    seq_head = re.compile(r"^\>(.*)")
    end = re.compile(r"^[A,C,G,T,a,c,g,t]\s*\[")

    seq = []
    ids = []

    # 'with' releases the file handle on normal exit, on break and on error.
    with open(fileName, "r") as f:
        for raw in f:
            line = mixture.chomp(raw)

            if seq_head.search(line):
                # IDs are numbered from 1 in order of appearance
                ids.append('seq' + str(len(ids) + 1))
                continue

            if end.search(line):
                # count matrix reached, the site section is over
                break

            if not line:
                continue

            # remove lower case letters, only upper case letters are part
            # of the binding site
            seq.append([ch for ch in line if ch.isupper()])

    data = mixture.DataSet()
    data.fromList(seq, IDs=ids)
    return data