def parseGetorfOutput(content):
    """
    Parse getorf output file(handle), returns dictionary with orfs

    @type  content: open filehandle
    @param content: filehandle of a EMBOSS getorf output fasta file; it is
                    handed unchanged to parseFasta()

    @rtype:  dictionary
    @return: dictionary with keys (original fastaheader, orf-start, orf-stop)
             and sequence as values

    @attention: content can be: sys.stdin.readlines()
    @attention: content can be: fh.readlines()
    @attention: a ValueError is raised for a header that lacks a
                `[start - stop]` coordinate part
    """
    # local import: the file's top-level import block is not visible from here
    import re

    # getorf headers look like `<id>_<orfnumber> [<start> - <stop>] ...`
    coords_pattern = re.compile(r"\[(\d+) - (\d+)\]")

    orfs = {}
    # .items() instead of .iteritems(): works on both Python 2 and Python 3
    for header, sequence in parseFasta(content).items():
        # The ORF counter is appended after the LAST underscore of the first
        # token; splitting there keeps underscores that belong to the
        # original fasta header (e.g. `MGG_0001_3` -> `MGG_0001`).
        orf_id = header.split(" ", 1)[0]
        fref = orf_id.rsplit("_", 1)[0]

        # Search for the bracketed coordinates instead of slicing fixed
        # positions, so text after the brackets (such as getorf's
        # `(REVERSE SENSE)` marker) does not break the integer conversion.
        match = coords_pattern.search(header)
        if match is None:
            raise ValueError(
                "no getorf `[start - stop]` coordinates in header: %r" % (header,))
        start, stop = int(match.group(1)), int(match.group(2))

        orfs[(fref, start, stop)] = sequence

    # return fasta dictionary with openreadingframe info
    return orfs
def parse_cexpander(cexpanderdata, fname_fasta):
    """
    Parse the cexpander_dr output into a CexpanderOutput class object

    @type  cexpanderdata: string
    @param cexpanderdata: full text of the cexpander_dr output (the text
                          itself, not a path; it is split on blank lines)

    @type  fname_fasta: string
    @param fname_fasta: (absolute) path to fasta input file

    @rtype:  CexpanderOutput object
    @return: CexpanderOutput object

    @attention: a length mismatch between a sequence and its parsed values
                is only reported on STDOUT; the transfer block is still added
    """
    # initialize empty CexpanderOutput object; read the fasta file inside a
    # `with` block so the file handle is closed instead of leaked
    cxpOut = CexpanderOutput()
    with open(fname_fasta) as fh:
        cxpOut.sequences = parseFasta(fh.readlines())

    # blocks are separated by an empty line; whatever follows the final
    # empty line is discarded
    data = cexpanderdata.split("\n\n")[0:-1]

    # generate header list; omit first 5 lines (cexpander STDOUT messages)
    header_block = data.pop(0)
    headers = [line.split("\t")[2].replace(">", "")
               for line in header_block.split("\n")[5:]]

    # loop over the `transfer blocks` in the file
    for pos, block in enumerate(data):
        cxpTrfblck = CexpanderTransferBlock()

        # skip the first character of the block and its first line; the
        # remaining lines are the tab-separated data rows
        rows = block[1:].split("\n")[1:]

        # mode detection looks at the second column of every row only
        is_binary = not set(
            row.split("\t")[1] for row in rows).difference(['0', '1'])

        # every cell after the first column of every row, in file order
        cells = [cell for row in rows for cell in row.strip().split("\t")[1:]]

        if is_binary:
            # cexpander in binary mode
            # single-line string of zeros (0) and ones (1)
            mode = "binary"
            cxpTrfblck.binarystring = "".join(cells)
        else:
            # cexpander in float mode
            mode = "float"
            cxpTrfblck.binarystring = [float(cell) for cell in cells]

        # assignment order is kept as-is: the helper methods called below are
        # defined elsewhere and may read the attributes set before them
        cxpTrfblck.header = headers[pos]
        cxpTrfblck.sequence = cxpOut.sequences[cxpTrfblck.header]
        cxpTrfblck.positions = len(cxpTrfblck.sequence)
        cxpTrfblck.uniform = cxpTrfblck.get_uniform_positions()
        cxpTrfblck.score = len(cxpTrfblck.uniform)
        cxpTrfblck.ratio = cxpTrfblck._binarystring2matchratio(
            cxpTrfblck.binarystring)
        cxpTrfblck.mode = mode

        if cxpTrfblck.positions != len(cxpTrfblck.binarystring):
            # single pre-formatted strings print identically under the
            # Python 2 print statement and the Python 3 print function
            print("CEXPANDER-PARSE-ERROR: %s != %s" % (
                cxpTrfblck.positions, len(cxpTrfblck.binarystring)))
            print("CEXPANDER-PARSE-ERROR: %s %s" % (cxpTrfblck.header, mode))
            print("CEXPANDER-PARSE-ERROR: %s" % (cxpTrfblck.sequence,))
            print("CEXPANDER-PARSE-ERROR: %s" % (cxpTrfblck.binarystring,))
            print("#" * 40)
            print(block)
            print("#" * 40)

        # add CexpanderTransferBlock to CexpanderOutput object
        cxpOut.add_transferblock(cxpTrfblck)

    # return the created object
    return cxpOut
import unittest
import sys
from os.path import join, abspath, dirname

# Directory holding this script; its parent is put on sys.path before the
# MOODS and fasta imports below (presumably so they resolve from there --
# confirm against the project layout).
LOCAL_DIR = abspath(dirname(__file__))
_parent_dir = abspath(join(LOCAL_DIR, '..'))
sys.path.append(_parent_dir)

import MOODS
import fasta

# Two directory levels above LOCAL_DIR; printed so the resolved location is
# visible when the script runs.
DIST_DIR = abspath(dirname(dirname(LOCAL_DIR)))
print(DIST_DIR)

# Load the example sequence file and take the second field of its first record.
fasta_filepath = join(DIST_DIR, "examples/data/sequence/dnaACGT.txt")
records = fasta.parseFasta(fasta_filepath)
_first_record = records[0]
seq = _first_record[1]

# Two four-row matrices handed to MOODS.search.
matrix1 = [
    [0, 1, 0, 0, 0, 0, 0, 1, 1, 0],
    [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 1, 1, 1, 1, 1, 0, 0, 1],
]
# NOTE(review): the first row has 6 entries while the other rows have 7 --
# confirm this is intended.
matrix2 = [
    [10, 0, 10, 3, 5, 5],
    [0, 5, 0, 3, 5, 0, 5],
    [0, 1, 0, 3, 0, 5, 0],
    [0, 4, 0, 1, 0, 0, 5],
]

# Search the sequence with both matrices; 0.011 is the third argument
# (presumably a threshold -- confirm against the MOODS.search documentation).
_search_matrices = [matrix1, matrix2]
results = MOODS.search(seq, _search_matrices, 0.011)
import unittest
import sys
from os.path import join, abspath, dirname

# Directory holding this script; its parent is put on sys.path before the
# MOODS and fasta imports below (presumably so they resolve from there --
# confirm against the project layout).
LOCAL_DIR = abspath(dirname(__file__))
_parent_dir = abspath(join(LOCAL_DIR, '..'))
sys.path.append(_parent_dir)

import MOODS
import fasta

# Two directory levels above LOCAL_DIR; printed so the resolved location is
# visible when the script runs.
DIST_DIR = abspath(dirname(dirname(LOCAL_DIR)))
print(DIST_DIR)

# Load the example sequence file and take the second field of its first record.
fasta_filepath = join(DIST_DIR, "examples/data/sequence/dnaACGT.txt")
records = fasta.parseFasta(fasta_filepath)
_first_record = records[0]
seq = _first_record[1]

# Two four-row matrices handed to MOODS.search.
matrix1 = [
    [0, 1, 0, 0, 0, 0, 0, 1, 1, 0],
    [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 1, 1, 1, 1, 1, 0, 0, 1],
]
# NOTE(review): the first row has 6 entries while the other rows have 7 --
# confirm this is intended.
matrix2 = [
    [10, 0, 10, 3, 5, 5],
    [0, 5, 0, 3, 5, 0, 5],
    [0, 1, 0, 3, 0, 5, 0],
    [0, 4, 0, 1, 0, 0, 5],
]

# Search the sequence with both matrices; 0.011 is the third argument
# (presumably a threshold -- confirm against the MOODS.search documentation).
_search_matrices = [matrix1, matrix2]
results = MOODS.search(seq, _search_matrices, 0.011)

# Report the number of entries returned for each matrix.
_hits_matrix1 = len(results[0])
_hits_matrix2 = len(results[1])
print("Matrix 1 results: " + str(_hits_matrix1))
print("Matrix 2 results: " + str(_hits_matrix2))

# Inputs kept as module-level names after the search.
matrices = [matrix1, matrix2]
thresholds = [0.011, 0.011]

# Second argument 0.1 is passed straight to MOODS.bg_from_sequence
# (meaning not visible here -- confirm against the MOODS documentation).
bg = MOODS.bg_from_sequence(seq, 0.1)
def parse_cexpander(cexpanderdata, fname_fasta):
    """
    Parse the cexpander_dr output into a CexpanderOutput class object

    @type  cexpanderdata: string
    @param cexpanderdata: full text of the cexpander_dr output (the text
                          itself, not a path; it is split on blank lines)

    @type  fname_fasta: string
    @param fname_fasta: (absolute) path to fasta input file

    @rtype:  CexpanderOutput object
    @return: CexpanderOutput object

    @attention: a length mismatch between a sequence and its parsed values
                is only reported on STDOUT; the transfer block is still added
    """
    # initialize empty CexpanderOutput object; read the fasta file inside a
    # `with` block so the file handle is closed instead of leaked
    cxpOut = CexpanderOutput()
    with open(fname_fasta) as fh:
        cxpOut.sequences = parseFasta(fh.readlines())

    # blocks are separated by an empty line; whatever follows the final
    # empty line is discarded
    data = cexpanderdata.split("\n\n")[0:-1]

    # generate header list; omit first 5 lines (cexpander STDOUT messages)
    header_block = data.pop(0)
    headers = [line.split("\t")[2].replace(">", "")
               for line in header_block.split("\n")[5:]]

    # loop over the `transfer blocks` in the file
    for pos, block in enumerate(data):
        cxpTrfblck = CexpanderTransferBlock()

        # skip the first character of the block and its first line; the
        # remaining lines are the tab-separated data rows
        rows = block[1:].split("\n")[1:]

        # mode detection looks at the second column of every row only
        is_binary = not set(
            row.split("\t")[1] for row in rows).difference(['0', '1'])

        # every cell after the first column of every row, in file order
        cells = [cell for row in rows for cell in row.strip().split("\t")[1:]]

        if is_binary:
            # cexpander in binary mode
            # single-line string of zeros (0) and ones (1)
            mode = "binary"
            cxpTrfblck.binarystring = "".join(cells)
        else:
            # cexpander in float mode
            mode = "float"
            cxpTrfblck.binarystring = [float(cell) for cell in cells]

        # assignment order is kept as-is: the helper methods called below are
        # defined elsewhere and may read the attributes set before them
        cxpTrfblck.header = headers[pos]
        cxpTrfblck.sequence = cxpOut.sequences[cxpTrfblck.header]
        cxpTrfblck.positions = len(cxpTrfblck.sequence)
        cxpTrfblck.uniform = cxpTrfblck.get_uniform_positions()
        cxpTrfblck.score = len(cxpTrfblck.uniform)
        cxpTrfblck.ratio = cxpTrfblck._binarystring2matchratio(
            cxpTrfblck.binarystring)
        cxpTrfblck.mode = mode

        if cxpTrfblck.positions != len(cxpTrfblck.binarystring):
            # single pre-formatted strings print identically under the
            # Python 2 print statement and the Python 3 print function
            print("CEXPANDER-PARSE-ERROR: %s != %s" % (
                cxpTrfblck.positions, len(cxpTrfblck.binarystring)))
            print("CEXPANDER-PARSE-ERROR: %s %s" % (cxpTrfblck.header, mode))
            print("CEXPANDER-PARSE-ERROR: %s" % (cxpTrfblck.sequence,))
            print("CEXPANDER-PARSE-ERROR: %s" % (cxpTrfblck.binarystring,))
            print("#" * 40)
            print(block)
            print("#" * 40)

        # add CexpanderTransferBlock to CexpanderOutput object
        cxpOut.add_transferblock(cxpTrfblck)

    # return the created object
    return cxpOut