def RnaPlotParser(lines):
    """Parse RNAplot postscript lines into (sequence, coordinates, pairs).

    lines: iterable of postscript lines from RNAplot output.

    Returns ('', [], []) when lines is empty.
    """
    sequence = ''
    coordinates = []
    pairs = []
    if lines:
        # Successively carve the input at the /sequence, /coor and /pairs
        # labels; each finder yields [before-label, label-onward].
        split_on_sequence = LabeledRecordFinder(
            is_label_line=lambda x: x.startswith('/sequence'))
        _, seq_onward = list(split_on_sequence(lines))

        split_on_coord = LabeledRecordFinder(
            is_label_line=lambda x: x.startswith('/coor'))
        sequence_block, coord_onward = list(split_on_coord(seq_onward))

        split_on_pairs = LabeledRecordFinder(
            is_label_line=lambda x: x.startswith('/pairs'))
        coordinate_block, pairs_block = list(split_on_pairs(coord_onward))

        sequence = get_sequence(sequence_block)
        coordinates = get_coordinates(coordinate_block)
        pairs = get_pairs(pairs_block)
    return sequence, coordinates, pairs
def extractCommandLineData(command_block):
    """Return MEME command-line summary data as a list of strings.

    command_block: lines of the MEME "COMMAND LINE SUMMARY" section.

    The lines from the "model:" label up to (but excluding) the last four
    lines of the block are returned unparsed.
    """
    # Separator lines in MEME output are runs of asterisks; skip them.
    ignore = lambda x: x.startswith('*')
    meme_model = LabeledRecordFinder(lambda x: 'model:' in x, ignore=ignore)
    cmd_data = list(meme_model(command_block))
    # First record is everything before the "model:" label; keep the rest.
    cmd_data = cmd_data[1]
    # The trailing four lines are not command-line data — drop them.
    cmd_data = cmd_data[:-4]
    # Just return list of strings rather than parse data.
    return cmd_data
def MicroarrayParser(lines):
    """Returns tuple: ([ProbeNames],[GeneNames],[LogRatios]) for all dots
    in microarray file.
    """
    probes = []
    genes = []
    ratios = []
    # Empty input yields three empty lists.
    if lines:
        # Keep only the record starting at the FEATURES label; the first
        # record (everything before it) is discarded.
        finder = LabeledRecordFinder(lambda x: x.startswith('FEATURES'))
        block = list(finder(lines))[1]
        # The header row fixes the column positions of the wanted fields.
        header = block[0].split('\t')
        probe_col = header.index('ProbeName')
        gene_col = header.index('GeneName')
        ratio_col = header.index('LogRatio')
        for row in block[1:]:
            fields = row.split('\t')
            probes.append(fields[probe_col].upper())
            genes.append(fields[gene_col].upper())
            ratios.append(float(fields[ratio_col]))
    return (probes, genes, ratios)
def get_motif_p_value(lines):
    """Return the motif p-value parsed from a motif block.

    The last record after a 'Log Motif' label line carries the log of the
    motif p-value as the final whitespace-separated token of its first line.
    """
    finder = LabeledRecordFinder(lambda x: x.startswith('Log Motif'))
    log_block = list(finder(lines))[-1]
    log_p = float(log_block[0].split()[-1])
    # Undo the log to recover the p-value itself.
    return exp(log_p)
def get_motif_sequences(lines):
    """Returns list of tuples with motif sequence information given motif block.

        - result is list of tuples :
            [(seq_num, motif_start, motif_seq, motif_sig),]
    """
    motif_list = []
    # The motif table is the last record following a line containing 'columns'.
    motif_seq_finder = LabeledRecordFinder(lambda x: 'columns' in x)
    motifs = list(motif_seq_finder(lines))[-1]
    # Skip the first two lines of the record (label + header); table rows
    # contain a comma — presumably in the sequence-number field (TODO confirm
    # against Gibbs output format).
    for m in motifs[2:]:
        if ',' in m:
            curr = m.strip().split()
            motif_num = curr[1]
            seq_num = curr[0].split(',')[0]
            # Positions in the report are 1-based; convert to 0-based.
            motif_start = int(curr[2]) - 1
            #If motif does not start at beginning of sequence:
            if motif_start > 0:
                # Field 3 is leading context, field 4 is the motif itself.
                motif_seq = curr[4]
            #Motif starts at beginning of sequence, no context before motif
            else:
                motif_seq = curr[3]
            motif_sig = float(curr[-3])
            motif_list.append(
                (seq_num, motif_start, motif_seq, motif_sig, motif_num))
        else:
            # First comma-less line ends the motif table.
            break
    return motif_list
def getSummaryBlock(module_blocks):
    """Return the SUMMARY-of-motifs record from module_blocks.

    The first record (everything before the SUMMARY label) is discarded.
    """
    finder = LabeledRecordFinder(
        lambda x: x.startswith('SUMMARY'),
        constructor=None,
        ignore=lambda x: x.startswith(' '))
    return list(finder(module_blocks))[1]
def get_sequence_and_motif_blocks(lines):
    """Split Gibbs output into (sequence block, motif block).

    The split point is the line containing 'MAP MAXIMIZATION RESULTS'.
    """
    splitter = LabeledRecordFinder(
        lambda x: 'MAP MAXIMIZATION RESULTS' in x)
    sequence_part, motif_part = list(splitter(lines))
    return sequence_part, motif_part
def getDataBlock(lines):
    """Return (main data block, alphabet) from MEME output lines.

    The main data block is everything following "COMMAND LINE SUMMARY";
    the alphabet (molecule type) is read from the block preceding it.
    """
    splitter = LabeledRecordFinder(lambda x: x.startswith('COMMAND'))
    blocks = list(splitter(lines))
    alphabet = getMolType(blocks[0])
    return blocks[1], alphabet
def getModuleDataBlocks(module_blocks):
    """Return a list of data-block lists, one per module.

    Each module is split at its 'Motif' label lines.
    """
    splitter = LabeledRecordFinder(lambda x: x.startswith('Motif'))
    return [list(splitter(module)) for module in module_blocks]
def lazy_parse_sff_handle(handle):
    """Return (flowgram generator, header) for an sff.txt handle.

    Flowgrams are produced one at a time, so the file is never fully
    materialized in memory.
    """
    record_finder = LabeledRecordFinder(is_fasta_label, constructor=strip)
    records = record_finder(handle)
    # The first record from the finder is the file header.
    header = get_header_info(next(records))
    return (_sff_parser(records, header), header)
def getCommandModuleBlocks(main_block):
    """Return (command line summary block, list of module blocks).

    main_block is split at each 'MOTIF' label; the leading record is the
    command-line summary and any remaining records are module blocks.
    """
    splitter = LabeledRecordFinder(lambda x: x.startswith('MOTIF'))
    records = list(splitter(main_block))
    command_block = records[0]
    # No MOTIF labels present -> no module blocks.
    module_blocks = records[1:] if len(records) > 1 else []
    return command_block, module_blocks
def test_parsers_ignore(self):
    """LabeledRecordFinder should skip lines to ignore."""
    def never(line):
        # Ignore nothing: every line is kept, including blanks.
        return False
    def ignore_labels(line):
        # Ignore blank lines and '#' comment lines.
        return (not line) or line.isspace() or line.startswith('#')
    def is_start(line):
        # Records start at '>' label lines.
        return line.startswith('>')
    lines = ['>abc','\n','1','>def','#ignore','2']
    # Default ignore drops the blank line but keeps the '#' line.
    self.assertEqual(list(LabeledRecordFinder(is_start)(lines)),
        [['>abc', '1'],['>def','#ignore','2']])
    # With never(), nothing is dropped; the blank line appears as ''
    # (presumably stripped by the finder's default constructor —
    # TODO confirm).
    self.assertEqual(list(LabeledRecordFinder(is_start,
        ignore=never)(lines)),
        [['>abc', '', '1'],['>def','#ignore','2']])
    # ignore_labels drops both the blank line and the '#' comment.
    self.assertEqual(list(LabeledRecordFinder(is_start,
        ignore=ignore_labels)(lines)),
        [['>abc','1'],['>def','2']])
def RnaFoldParser(lines):
    """Returns a tuple containing sequence and dot plot indices.

       (sequence, (index1, index2, pair probability))
    """
    sequence = ''
    indices = []
    # Empty input yields the empty defaults.
    if lines:
        finder = LabeledRecordFinder(lambda x: x.startswith('/sequence'))
        # Only the second record — from '/sequence' onward — is of interest.
        seq_block = list(finder(lines))[1]
        sequence = getSequence(seq_block)
        indices = getIndices(seq_block)
    return (sequence, indices)
def get_sequence_map(lines):
    """Returns dict mapping Gibbs sequence number to sequence ID.

        - ex: sequence numbers mapping to gis:
            {'1':'1091044', '2':'11467494', '3':'11499727'}
    """
    sequence_map = {}
    finder = LabeledRecordFinder(
        lambda x: x.startswith('Sequences to be Searched:'))
    block = list(finder(lines))[-1]
    # Entries start two lines past the label; the table ends at the first
    # line that does not begin with '#'.
    for entry in block[2:]:
        if not entry.startswith('#'):
            break
        num, label = entry.strip().split(' ', 1)
        # Key is the sequence number with its leading '#' removed.
        sequence_map[num.strip()[1:]] = label.strip()
    return sequence_map
def get_summaries(handle, number_list=None, name_list=None, all_sums=False):
    """Returns specified flowgrams and sequence summaries as generator

    handle can be a list of lines or a file handle

    number_list is a list of the summaries wanted by their index in the sff
    file, starts at 0

    name_list is a list of the summaries wanted by their name in the sff file

    all_sums if true will yield all the summaries in the order they appear
    in the file

    One and only one of the parameters must be set
    """
    sff_info = LabeledRecordFinder(is_fasta_label, constructor=strip)
    sum_gen = sff_info(handle)
    if number_list:
        assert not (name_list or all_sums)
        num = len(number_list)
        for i, s in enumerate(sum_gen):
            # i - 1: the finder's first record is the file header, so
            # summary j of the file is record j + 1 from the generator.
            if i - 1 in number_list:
                yield s
                num -= 1
                # Stop as soon as every requested index has been served.
                if num == 0:
                    break
    elif name_list:
        assert not all_sums
        for s in sum_gen:
            # First line of each summary is its '>name' label.
            if s[0].strip('>') in name_list:
                yield s
    elif all_sums:
        # Skip the header record, then yield everything else in order.
        header = True
        for s in sum_gen:
            if header:
                header = False
                continue
            yield s
    else:
        raise ValueError(
            "number_list, name_list or all_sums must be specified")
def get_rnaplot_postscript(sequence, struct):
    """Return (prefix, suffix) postscript strings for seq and struct.

    A '%PreTextHere' marker is injected via RNAplot's --pre option and the
    output is split at that marker; drawpairs/drawoutline commands are
    removed from the suffix.
    """
    # Params for RNAplot.
    rnaplot_params = {'-t': '0', '--pre': '%PreTextHere'}
    ps_lines = plot_from_seq_and_struct(
        sequence, struct, params=rnaplot_params).split('\n')
    # Split the postscript at the injected marker.
    marker_finder = LabeledRecordFinder(
        is_label_line=lambda x: x.startswith('%PreTextHere'))
    prefix, suffix = list(marker_finder(ps_lines))
    # Drop drawpairs and drawoutline commands from the suffix.
    kept = [line for line in suffix
            if not line.startswith(('drawpairs', 'drawoutline'))]
    return '\n'.join(prefix), '\n'.join(kept)
def is_gde_label(x): """Checks if x looks like a GDE label line.""" return x and x[0] in '%#' def is_blank_or_comment(x): """Checks if x is blank or a FASTA comment line.""" return (not x) or x.startswith('#') or x.isspace() def is_blank(x): """Checks if x is blank.""" return (not x) or x.isspace() FastaFinder = LabeledRecordFinder(is_fasta_label, ignore=is_blank_or_comment) def MinimalFastaParser(infile, strict=True, \ label_to_name=str, finder=FastaFinder, \ is_label=None, label_characters='>'): """Yields successive sequences from infile as (label, seq) tuples. If strict is True (default), raises RecordError when label or seq missing. """ for rec in finder(infile): #first line must be a label line if not rec[0][0] in label_characters: if strict: raise RecordError, "Found Fasta record without label line: %s"%\ rec
def setUp(self):
    """Setup function for meme tests.
    """
    #Meme output data:
    self.meme_file = MEME_FILE.split('\n')
    # Finders for the major sections of a MEME report.
    self.meme_main = LabeledRecordFinder(lambda x: x.startswith('COMMAND'))
    self.meme_command = LabeledRecordFinder(lambda x: x.startswith('MOTIF'))
    self.meme_summary = LabeledRecordFinder(lambda x: x.startswith('SUMMARY'))
    self.meme_module = LabeledRecordFinder(lambda x: x.startswith('Motif'))
    # Split the file into the alphabet header and the main data block.
    self.alphabet_block, self.main_block = \
        list(self.meme_main(self.meme_file))
    # First record is the command-line summary; the rest are module blocks.
    self.cmd_mod_list = list(self.meme_command(self.main_block))
    self.command_block = self.cmd_mod_list[0]
    self.module_blocks = self.cmd_mod_list[1:]
    # SUMMARY section lives at the end of the last module block.
    self.summary_block = \
        list(self.meme_summary(self.module_blocks[-1]))[1]
    self.module_data_blocks = []
    for module in self.module_blocks:
        self.module_data_blocks.append(
            list(self.meme_module(module)))
    #List and Dict for testing dictFromList function
    self.sample_list = ['key1',1,'key2',2,'key3',3,'key4',4]
    self.sample_dict = {'key1':1,
                        'key2':2,
                        'key3':3,
                        'key4':4,
                        }
    #List of command line data
    self.command_line_list = [
        'model: mod= tcm nmotifs= 3 evt= 1e+100',
        'object function= E-value of product of p-values',
        'width: minw= 4 maxw= 10 minic= 0.00',
        'width: wg= 11 ws= 1 endgaps= yes',
        'nsites: minsites= 2 maxsites= 50 wnsites= 0.8',
        'theta: prob= 1 spmap= uni spfuzz= 0.5',
        'em: prior= dirichlet b= 0.01 maxiter= 20',
        'distance= 1e-05',
        'data: n= 597 N= 15',
        'strands: +',
        'sample: seed= 0 seqfrac= 1',
        ]
    #List of dicts which contain general info for each module.
    self.module_info_dicts = [
        {'MOTIF':'1',
         'width':'10',
         'sites':'11',
         'llr':'131',
         'E-value':'1.3e-019',
         },
        {'MOTIF':'2',
         'width':'7',
         'sites':'11',
         'llr':'88',
         'E-value':'2.5e-006',
         },
        {'MOTIF':'3',
         'width':'7',
         'sites':'6',
         'llr':'53',
         'E-value':'5.5e-001',
         },
        ]
    #Summary dict
    # Maps each sequence ID to its combined p-value from the SUMMARY table.
    self.summary_dict = {'CombinedP':{
        '1': float(3.48e-02),
        '11': float(3.78e-05),
        '17': float(2.78e-08),
        '28': float(3.49e-06),
        '105': float(3.98e-06),
        '159': float(1.08e-02),
        '402-C01': float(4.22e-07),
        '407-A07': float(7.32e-08),
        '410-A10': float(4.23e-04),
        '505-D01': float(5.72e-07),
        '507-B04-1': float(1.01e-04),
        '518-D12': float(2.83e-06),
        '621-H01': float(8.69e-07),
        '625-H05': float(8.86e-06),
        '629-C08': float(5.61e-07),
        }
    }
    # Identity remapping of sequence IDs.
    self.remap_dict = {
        '11':'11',
        '1':'1',
        '407-A07':'407-A07',
        '17':'17',
        '159':'159',
        '505-D01':'505-D01',
        '28':'28',
        '507-B04-1':'507-B04-1',
        '402-C01':'402-C01',
        '621-H01':'621-H01',
        '629-C08':'629-C08',
        '410-A10':'410-A10',
        '105':'105',
        '625-H05':'625-H05',
        '518-D12':'518-D12'
        }
    #ModuleInstances and Modules
    # One inner list of expected ModuleInstances per MEME motif.
    self.ModuleInstances = [
        [ModuleInstance('CTATTGGGGC',Location('629-C08',18,28),
            float(1.95e-06)),
         ModuleInstance('CTATTGGGGC',Location('621-H01',45,55),
            float(1.95e-06)),
         ModuleInstance('CTATTGGGGC',Location('505-D01',26,36),
            float(1.95e-06)),
         ModuleInstance('CTATTGGGGC',Location('407-A07',5,15),
            float(1.95e-06)),
         ModuleInstance('CTATTGGGGC',Location('105',0,10),
            float(1.95e-06)),
         ModuleInstance('CTATTGGGGC',Location('28',3,13),
            float(1.95e-06)),
         ModuleInstance('CTATTGGGGC',Location('17',16,26),
            float(1.95e-06)),
         ModuleInstance('CTATTGGGCC',Location('402-C01',24,34),
            float(3.30e-06)),
         ModuleInstance('CTAGTGGGGC',Location('625-H05',2,12),
            float(5.11e-06)),
         ModuleInstance('CTAGTGGGCC',Location('11',15,25),
            float(6.37e-06)),
         ModuleInstance('CTATTGGGGT',Location('518-D12',0,10),
            float(9.40e-06)),
         ],
        [ModuleInstance('CGTTACG',Location('629-C08',37,44),
            float(6.82e-05)),
         ModuleInstance('CGTTACG',Location('621-H01',30,37),
            float(6.82e-05)),
         ModuleInstance('CGTTACG',Location('507-B04-1',8,15),
            float(6.82e-05)),
         ModuleInstance('CGTTACG',Location('410-A10',7,14),
            float(6.82e-05)),
         ModuleInstance('CGTTACG',Location('407-A07',26,33),
            float(6.82e-05)),
         ModuleInstance('CGTTACG',Location('17',0,7),
            float(6.82e-05)),
         ModuleInstance('TGTTACG',Location('625-H05',32,39),
            float(1.74e-04)),
         ModuleInstance('TGTTACG',Location('505-D01',3,10),
            float(1.74e-04)),
         ModuleInstance('CATTACG',Location('518-D12',30,37),
            float(2.14e-04)),
         ModuleInstance('CGGTACG',Location('402-C01',1,8),
            float(2.77e-04)),
         ModuleInstance('TGTTCCG',Location('629-C08',5,12),
            float(6.45e-04)),
         ],
        [ModuleInstance('CTATTGG',Location('629-C08',57,64),
            float(1.06e-04)),
         ModuleInstance('CTATTGG',Location('507-B04-1',42,49),
            float(1.06e-04)),
         ModuleInstance('CTATTGG',Location('410-A10',27,34),
            float(1.06e-04)),
         ModuleInstance('CTATTGG',Location('159',14,21),
            float(1.06e-04)),
         ModuleInstance('CTATTGG',Location('1',18,25),
            float(1.06e-04)),
         ModuleInstance('CTAATGG',Location('507-B04-1',28,35),
            float(1.63e-04)),
         ],
        ]
    # Build a Module object per motif, keyed by (sequence id, start).
    self.Modules = []
    for module, info in zip(self.ModuleInstances, self.module_info_dicts):
        curr_module_data = {}
        for instance in module:
            curr_module_data[(instance.Location.SeqId,
                              instance.Location.Start)] = instance
        temp_module = Module(curr_module_data, MolType=DNA,
                             Evalue=float(info['E-value']),
                             Llr=int(info['llr']))
        self.Modules.append(temp_module)
    # Expected consensus sequence for each of the three motifs.
    self.ConsensusSequences = ['CTATTGGGGC','CGTTACG','CTATTGG']
taxonomy = ' '.join(taxonomy.split()) #separate by semicolons taxa = map(strip, taxonomy.split(';')) #get rid of leading/trailing spaces #delete trailing period if present last = taxa[-1] if last.endswith('.'): taxa[-1] = last[:-1] return species, taxa def is_feature_component_start(line): """Checks if a line starts with '/', ignoring whitespace.""" return line.lstrip().startswith('/') feature_component_iterator = LabeledRecordFinder(is_feature_component_start) _join_with_empty = dict.fromkeys(['translation']) _leave_as_lines = {} def parse_feature(lines): """Parses a feature. Doesn't handle subfeatures. Returns dict containing: 'type': source, gene, CDS, etc. 'location': unparsed location string ...then, key-value pairs for each annotation, e.g. '/gene="MNBH"' -> {'gene':['MNBH']} (i.e. quotes stripped) All relations are assumed 'to many', and order will be preserved. """
def is_cutg_label(x): """Checks if x looks like a CUTG label line.""" return x.startswith('>') def is_cutg_species_label(x): """Checks if x looks like a CUTG label line.""" return ':' in x def is_blank(x): """Checks if x is blank.""" return (not x) or x.isspace() CutgSpeciesFinder = LabeledRecordFinder(is_cutg_species_label, ignore=is_blank) CutgFinder = LabeledRecordFinder(is_cutg_label, ignore=is_blank) codon_order = "CGA CGC CGG CGU AGA AGG CUA CUC CUG CUU UUA UUG UCA UCC UCG UCU AGC AGU ACA ACC ACG ACU CCA CCC CCG CCU GCA GCC GCG GCU GGA GGC GGG GGU GUA GUC GUG GUU AAA AAG AAC AAU CAA CAG CAC CAU GAA GAG GAC GAU UAC UAU UGC UGU UUC UUU AUA AUC AUU AUG UGG UAA UAG UGA".split( ) #NOTE: following field order omits Locus/CDS (first field), which needs further #processing. Use zip(field_order, fields[1:]) and handle first field specially. field_order = "GenBank Location Length GenPept Species Description".split() species_label_splitter = DelimitedSplitter(':', -1) def CutgSpeciesParser(infile, strict=True, constructor=CodonUsage): """Yields successive sequences from infile as CodonUsage objects.
from cogent.parse.record_finder import LabeledRecordFinder
from string import maketrans, strip

__author__ = "Rob Knight"
__copyright__ = "Copyright 2007-2012, The Cogent Project"
__credits__ = ["Rob Knight"]
__license__ = "GPL"
__version__ = "1.5.3"
__maintainer__ = "Rob Knight"
__email__ = "*****@*****.**"
__status__ = "Development"

def ll_start(line):
    """Returns True if line looks like the start of a LocusLink record."""
    return line.startswith('>>')

# Finds whole LocusLink records, delimited by '>>' lines.
LLFinder = LabeledRecordFinder(ll_start)

# Field splitters for the various LocusLink line formats.
pipes = DelimitedSplitter('|', None)
first_pipe = DelimitedSplitter('|')
commas = DelimitedSplitter(',', None)
first_colon = DelimitedSplitter(':', 1)

accession_wrapper = FieldWrapper(['Accession', 'Gi', 'Strain'], pipes)
def _read_accession(line):
    """Reads accession lines: format is Accession | Gi | Strain."""
    return MappedRecord(accession_wrapper(line))

rell_wrapper = FieldWrapper(['Description', 'Id', 'IdType', 'Printable'],
                            pipes)
def _read_rell(line):
    """Reads RELL lines: format is Description|Id|IdType|Printable"""
    return MappedRecord(rell_wrapper(line))
def get_all_summaries(lines):
    """Returns all the flowgrams and sequence summaries in list of lists"""
    finder = LabeledRecordFinder(is_fasta_label, constructor=strip)
    # Drop the leading header record; keep every summary.
    return list(finder(lines))[1:]
WARNING: Only maps the data type if the key is in label_constructors above. """ if not line.startswith("#"): raise ValueError, "Labels must start with a # symbol." if line.find(":") == -1: raise ValueError, "Labels must contain a : symbol." key, value = map(strip, line[1:].split(":", 1)) key = key.upper() if key in label_constructors: value = label_constructors[key](value) return key, value BlatFinder = LabeledRecordFinder(query_finder, constructor=strip, \ ignore=is_blat_junk) BlastFinder = LabeledRecordFinder(query_finder, constructor=strip, \ ignore=is_blast_junk) PsiBlastFinder = LabeledRecordFinder(iter_finder, constructor=strip, \ ignore=is_blast_junk) PsiBlastQueryFinder = LabeledRecordFinder(iteration_set_finder, \ constructor=strip, ignore=is_blast_junk) def GenericBlastParser9(lines, finder, make_col_headers=False): """Yields successive records from lines (props, data list) Infile must in blast9 format
def get_motif_blocks(lines):
    """Return the list of motif blocks from the main block lines.

    The record preceding the first MOTIF label is discarded.
    """
    splitter = LabeledRecordFinder(lambda x: 'MOTIF' in x)
    return list(splitter(lines))[1:]
elif len(attr) == 1: result.State = attr #handle line width elif attr.startswith('width'): result.Width = int(attr[5:]) else: #otherwise assume it's a color label result.Color = attr return result def _is_keyword(line): if line.startswith('@'): return True return False KeywordFinder = LabeledRecordFinder(_is_keyword) def MageParser(infile): """MageParser returns a new kinemage object, created from a string repr. infile: should be an iterable file object The MageParser works only on ONE kinemage object, so files containing more than one kinemage should be split beforehand. This can easily be adjusted if it would be useful in the future. The MageParser handles only certain keywords (@kinemage, @text, @caption, @___group, @____list) and MagePoints at this point in time. All unkown keywords are assumed to be part of the header, so you can find them in the header information. The lists that are part of the Simplex header are treated as normal lists.
def setUp(self):
    """Define a standard LabeledRecordFinder"""
    # Fasta-style records begin at '>' label lines.
    starts_record = lambda x: x.startswith('>')
    self.FastaLike = LabeledRecordFinder(starts_record)