def _summarize(self, cv_res):
    """For each case-estimator, return best param draw from cv results."""
    summary = _dict()
    for case_est, data in cv_res.items():
        # Track the draw with the highest mean test score for this
        # case-estimator pair.
        best_data = None
        for draw_num, draw_data in data.items():
            if best_data is None or \
                    draw_data['test_score_mean'] > best_data['test_score_mean']:
                best_data = draw_data
                # Attach the parameter draw that produced this score.
                best_data['params'] = self.params[case_est][draw_num]
        # Invert the nesting: metric -> case_est -> value.
        for metric, val in best_data.items():
            summary.setdefault(metric, _dict())[case_est] = val
    return summary
def _collect(self):
    """Collect output and format into dicts."""
    # Scores are returned as a list of tuples for each case, est, draw and
    # fold. Aggregate them up to case, est and draw level.
    scores = self._aggregate_scores()

    # Build the cv_results dictionary by aggregating the score lists on
    # the metrics specified.
    cv_res = self._get_results(scores)

    # Summarize best draws for each case-est pair.
    summary = self._summarize(cv_res)

    # Sort case-est keys by their mean test score, best first.
    # Bug fix: sorting the keys with itemgetter(1) ordered by estimator
    # name (second element of the key tuple), not by performance.
    rank = sorted(summary['test_score_mean'],
                  key=lambda case_est: summary['test_score_mean'][case_est],
                  reverse=True)

    # Rebuild each metric dict so iteration order follows the ranking.
    pretty_summary = _dict()
    for metric, data in summary.items():
        pretty_summary[metric] = _dict()
        for case_est in rank:
            pretty_summary[metric][case_est] = data[case_est]

    self.cv_results = cv_res
    self.summary = pretty_summary
def __init__(self, random_access_proxy, key_function, repr, obj_repr):
    """Initialize the class."""
    # key_function=None requests the default (identity) key behaviour.
    self._proxy = random_access_proxy
    self._key_function = key_function
    self._repr = repr
    self._obj_repr = obj_repr
    if key_function:
        offset_iter = (
            (key_function(k), o, l) for (k, o, l) in random_access_proxy)
    else:
        offset_iter = random_access_proxy
    # Only offsets are stored (lengths are deliberately dropped to keep
    # memory usage low; the SQLite backend keeps lengths to speed up
    # get_raw by about 3x).
    offsets = _dict()
    for key, offset, length in offset_iter:
        if key in offsets:
            # Close the underlying handle before raising so the file
            # is not leaked.
            self._proxy._handle.close()
            raise ValueError("Duplicate key '%s'" % key)
        offsets[key] = offset
    self._offsets = offsets
def __init__(self):
    '''
    Initialize the class, should be called once only.
    '''
    current_version = notepad.getPluginVersion()
    # Bug fix: a plain string comparison is lexicographic, so e.g.
    # '1.10.0.0' < '1.5.4.0' would wrongly report an unsupported version.
    # Compare the dotted components numerically instead.
    # (assumes getPluginVersion returns a dotted numeric string — TODO confirm)
    try:
        version_tuple = tuple(int(part) for part in current_version.split('.'))
    except ValueError:
        # Unparseable version string: treat as unsupported.
        version_tuple = ()
    if version_tuple < (1, 5, 4, 0):
        notepad.messageBox(
            'It is needed to run PythonScript version 1.5.4.0 or higher',
            'Unsupported PythonScript version: {}'.format(current_version))
        return

    self.INDICATOR_ID = 0
    self.registered_lexers = _dict()
    self.document_is_of_interest = False
    self.regexes = None
    self.excluded_styles = None

    # Configure the same indicator in both editor views so matches are
    # recoloured regardless of which view shows the document.
    editor1.indicSetStyle(self.INDICATOR_ID, INDICATORSTYLE.TEXTFORE)
    editor1.indicSetFlags(self.INDICATOR_ID, INDICFLAG.VALUEFORE)
    editor2.indicSetStyle(self.INDICATOR_ID, INDICATORSTYLE.TEXTFORE)
    editor2.indicSetFlags(self.INDICATOR_ID, INDICFLAG.VALUEFORE)

    # Synchronous callbacks for UI updates, async for buffer/lang events.
    editor.callbackSync(self.on_updateui, [SCINTILLANOTIFICATION.UPDATEUI])
    editor.callbackSync(self.on_marginclick, [SCINTILLANOTIFICATION.MARGINCLICK])
    notepad.callback(self.on_langchanged, [NOTIFICATION.LANGCHANGED])
    notepad.callback(self.on_bufferactivated, [NOTIFICATION.BUFFERACTIVATED])
def __init__(self, random_access_proxy, key_function, repr, obj_repr):
    """Initialize the class."""
    # Use key_function=None for default value
    self._proxy = random_access_proxy
    self._key_function = key_function
    self._repr = repr
    self._obj_repr = obj_repr
    if key_function:
        entries = ((key_function(k), o, l)
                   for (k, o, l) in random_access_proxy)
    else:
        entries = random_access_proxy
    offsets = _dict()
    for key, offset, _length in entries:
        # Record lengths are discarded on purpose to minimise memory;
        # the SQLite backend keeps them to accelerate get_raw.
        if key in offsets:
            # Release the file handle before reporting the clash.
            self._proxy._handle.close()
            raise ValueError("Duplicate key '%s'" % key)
        offsets[key] = offset
    self._offsets = offsets
def to_ident(self, ):
    """Build a hashable identity for this object from its state and the
    HTTP response headers, falling back through etag, content-length and
    content-disposition."""
    resp = self.response
    state = self.__dict__.copy()
    state.pop('response', None)

    headers = resp.headers
    ident = None
    # Behind varnish the etag is the most stable identifier.
    if 'varnish' in headers.get('via', ''):
        ident = headers.get('etag', None)
    if ident is None:
        ident = headers.get('content-length', None)
    if ident is None:
        ident = headers.get('content-disposition', None)
    if ident is None and not resp.text:
        # No usable header and no body text: cannot identify the response.
        raise Exception('HTTP header is not informative!%s'
                        % json.dumps(_dict(resp.headers), indent=2))

    return [
        sorted(state.items()),
        ('_header_ident', ident),
        ('_text', resp.text)
    ]
def _get_results(self, scores):
    """Return score metrics for each case, est and param draw level."""
    cv_res = _dict()
    for name, case_est_data in scores.items():
        name_res = cv_res.setdefault(name, _dict())
        for draw_num, draw_data in case_est_data.items():
            draw_res = name_res.setdefault(draw_num, _dict())
            for key, values in draw_data.items():
                # Pair each statistic label with its aggregation callable
                # from self.metrics (e.g. mean and std functions).
                for stat, agg in zip(['mean', 'std'], self.metrics):
                    draw_res['%s_%s' % (key, stat)] = agg(values)
    return cv_res
def to_dict(qresults, key_function=lambda rec: rec.id):
    """Turn a QueryResult iterator or list into a dictionary.

    - qresults     - Iterable returning QueryResult objects.
    - key_function - Optional callback function which when given a
                     QueryResult object should return a unique key for the
                     dictionary.  Defaults to using the QueryResult's
                     string ID.

    The callback only changes the key used to retrieve a result; it never
    alters the QueryResult's own ID.  A ValueError is raised if two
    results map to the same key.

    All QueryResult objects are held in memory, so for files with many
    queries prefer `index` or `index_db` instead.  Record order is
    preserved: since Python 3.7 (CPython 3.6) the default dict maintains
    insertion order, and as of Biopython 1.73 an OrderedDict is used
    explicitly on older Pythons.
    """
    result_map = _dict()
    for result in qresults:
        key = key_function(result)
        if key in result_map:
            raise ValueError("Duplicate key %r" % key)
        result_map[key] = result
    return result_map
def to_dict(qresults, key_function=lambda rec: rec.id):
    """Collect QueryResult objects into a dictionary keyed by identifier.

    - qresults     - Iterable returning QueryResult objects.
    - key_function - Optional callback mapping a QueryResult to a unique
                     dictionary key; by default the QueryResult's string
                     ID is used.

    Raises ValueError when two results produce the same key.  Supplying a
    key_function changes only the lookup key, never the QueryResult's ID.
    Everything is loaded into memory, so `index`/`index_db` are better
    suited to search output with very many queries.  Insertion order is
    kept (plain dict on Python 3.7+/CPython 3.6+, OrderedDict before).
    """
    lookup = _dict()
    for qresult in qresults:
        qkey = key_function(qresult)
        if qkey in lookup:
            raise ValueError("Duplicate key %r" % qkey)
        lookup[qkey] = qresult
    return lookup
def _aggregate_scores(self):
    """Aggregate scores to one list per case, est and param draw level.

    Returns an ordered dict keyed by (case, est) — or just the estimator
    name when there is no preprocessing case — mapping each param draw
    number to lists of test scores, train scores and fit times across
    folds.
    """
    scores = _dict()
    for case, est, draw_num, train_sc, test_sc, fit_time in self.scores_:
        # Strip the fold suffix ('__k') from case and estimator names.
        if case is not None:
            name = (case.split('__')[0], est.split('__')[0])
        else:
            name = est.split('__')[0]

        if name not in scores:
            scores[name] = _dict()

        if draw_num not in scores[name]:
            scores[name][draw_num] = _dict(test_score=[],
                                           train_score=[],
                                           fit_time=[])

        scores[name][draw_num]['test_score'].append(test_sc)
        scores[name][draw_num]['train_score'].append(train_sc)
        scores[name][draw_num]['fit_time'].append(fit_time)

    # Bug fix: _collect() consumes this return value
    # (scores = self._aggregate_scores()); previously the method fell off
    # the end and implicitly returned None.
    return scores
def _get_output_file(FNAME, INDEX_FILE=INDEX_FILE):
    """Append a record for FNAME to the shared index file and return FNAME."""
    entry = _dict([
        ("TS", time.time()),
        ("OUTPUT_FILE", FNAME),
        ("RUNTIME_FILE", pymisca.header.name__lookup('__file__', level=-1)),
    ])
    print(json.dumps(entry, indent=4))
    # A lock file serialises appends from concurrent processes.
    with FileLock(INDEX_FILE + '.lock'):
        with open(INDEX_FILE, "a") as f:
            json.dump(entry, f)
            f.write('\n')
    return FNAME
class Dict(_dict):
    """A dict in which the items can be get/set as attributes."""

    # Names defined on the (ordered) dict type; attribute writes to these
    # either keep normal attribute behaviour or are rejected.
    __reserved_names__ = dir(_dict())  # Also from OrderedDict
    # Names shared with the plain builtin dict: never writable as attrs.
    __pure_names__ = dir(dict())

    __slots__ = []

    def __repr__(self):
        with_ident = []
        without_ident = []
        for key, val in self.items():
            if isidentifier(key):
                with_ident.append("%s=%r" % (key, val))
            else:
                without_ident.append("(%r, %r)" % (key, val))
        if without_ident:
            return "Dict([%s], %s)" % (
                ", ".join(without_ident),
                ", ".join(with_ident),
            )
        return "Dict(%s)" % (", ".join(with_ident))

    def __getattribute__(self, key):
        try:
            return object.__getattribute__(self, key)
        except AttributeError:
            # Fall back to item lookup for unknown attributes.
            if key in self:
                return self[key]
            raise

    def __setattr__(self, key, val):
        if key in Dict.__reserved_names__:
            # Either let OrderedDict do its work, or disallow
            if key not in Dict.__pure_names__:
                return _dict.__setattr__(self, key, val)
            raise AttributeError("Reserved name, this key can only " +
                                 "be set via ``d[%r] = X``" % key)
        # if isinstance(val, dict): val = Dict(val) -> no, makes a copy!
        self[key] = val

    def __dir__(self):
        names = [k for k in self.keys() if isidentifier(k)]
        return Dict.__reserved_names__ + names
def register_lexer(self, lexer_name, _regexes, excluded_styles):
    '''
    reformat provided regexes and cache everything
    within the registered_lexers dictionary.

    Args:
        lexer_name = string, expected values as returned by
                     notepad.getLanguageName without the "udf - "
                     if it is an user defined language
        _regexes = dict, in the form of
                   _regexes[(int, (r, g, b))] = (r'', [int])
        excluded_styles = list of integers
    Returns:
        None
    '''
    reformatted = _dict()
    for (indic_id, color), value in _regexes.items():
        # Fold the RGB colour into the indicator value bitfield.
        reformatted[(indic_id, self.rgb(*color) | INDICVALUE.BIT)] = value
    self.registered_lexers[lexer_name.lower()] = (reformatted, excluded_styles)
def get_test_plan_test_case_info():
    """Load cached HPQC test-case JSON files and index them by test id.

    Walks test_case_cache/Subject/Purley_FPGA under the current working
    directory, parses every JSON file found with HPQC_info_parser_tool and
    collects the results into a dict keyed by the parsed '_test_id'.
    NOTE(review): the dict is only printed, never returned — confirm
    callers do not expect a return value.
    """
    # NOTE: Python 2 module — uses print statements below.
    import os
    import json
    global_info_dict = _dict()
    for dirpath, dirnames, filenames in os.walk(os.getcwd() + os.sep + 'test_case_cache' + os.sep + 'Subject' + os.sep + 'Purley_FPGA'):
        # i counts files per directory (only incremented, never reported).
        i = 0
        for eb in filenames:
            i += 1
            print os.path.join(dirpath, eb)
            with open(os.path.join(dirpath, eb), 'r') as p:
                data = json.load(p)
                signal_data_dict = HPQC_info_parser_tool(data)
                print signal_data_dict
                # Index by parsed test id; a duplicate id overwrites the
                # earlier entry.
                global_info_dict[
                    signal_data_dict['_test_id']] = signal_data_dict
    print global_info_dict, len(global_info_dict)
def graph_from_tree(
    lst,
    g=None,
    last=None,
    i=None,
):
    '''
    Accept a list of shape [(node,[(file,[(node,[(file,)])]),]),]

    Recursively renders the alternating node/file tree into a graphviz
    Digraph: each node becomes a plaintext table node whose files are
    grouped in a blue, filled subgraph cluster; each file recurses into
    its child nodes.  Returns the Digraph.
    '''
    this = graph_from_tree  # alias for the recursive call below
    if g is None:
        # Fresh top-level graph; strict=True merges duplicate edges.
        g = Digraph('G', strict=True)
        g.attr(rankdir='TB')
        # g = Digraph('G', strict=0,)
    if not i:
        # Mutable counter shared across all recursion levels.
        i = [0]
    if last is None:
        last = Path('root')
        g.node(last, label=repr(last), shape='diamond')
    out = []
    for node, node_files in (lst):
        i[0] += 1
        # Link the parent (file or root) to this node.
        g.edge(last, node.prefix_named)
        g.node(node.prefix_named, label=node.to_table_node_label(), shape='plaintext')
        with g.subgraph(name='cluster_%s' % node.prefix_named) as c:
            # c.attr(label= node.dotname)
            c.attr(color='blue')
            c.node_attr['style'] = 'filled'
            nout = []
            # First pass: declare file nodes and node->file edges.
            for file, down_nodes in node_files:
                c.node(file, label='%r' % file.basename(), style='filled')
                c.edge(node.prefix_named, file)  # .basename())
            # Second pass: recurse into each file's child nodes.
            for file, down_nodes in node_files:
                res = this(down_nodes, g, file, i)
                nout.append((file.basename(), res))
        out.append((node.prefix_named, _dict(nout)))
    return g
class Dict(_dict):
    """A dict whose keys can be get and set as if they were attributes.

    Very convenient in combination with autocompletion.  This Dict still
    behaves as much as possible as a normal dict, and keys can be anything
    that are otherwise valid keys.  However, keys that are not valid
    identifiers, or that are names of the dict class (such as 'items' and
    'copy'), cannot be get/set as attributes.
    """

    __reserved_names__ = dir(_dict())  # Also from OrderedDict
    __pure_names__ = dir(dict())

    def __getattribute__(self, key):
        try:
            return object.__getattribute__(self, key)
        except AttributeError:
            # Unknown attribute: fall back to item access.
            if key in self:
                return self[key]
            raise

    def __setattr__(self, key, val):
        if key not in Dict.__reserved_names__:
            # if isinstance(val, dict): val = Dict(val) -> no, makes a copy!
            self[key] = val
        elif key not in Dict.__pure_names__:
            # OrderedDict-internal names keep regular attribute behaviour.
            return _dict.__setattr__(self, key, val)
        else:
            raise AttributeError('Reserved name, this key can only ' +
                                 'be set via ``d[%r] = X``' % key)

    def __dir__(self):
        isidentifier = lambda x: bool(re.match(r'[a-z_]\w*$', x, re.I))
        attr_keys = [
            k for k in self.keys()
            if (isinstance(k, string_types) and isidentifier(k))
        ]
        return Dict.__reserved_names__ + attr_keys
def to_dict(sequences, key_function=None):
    """Turn a sequence iterator or list into a dictionary.

    Arguments:
     - sequences - An iterator that returns SeqRecord objects,
       or simply a list of SeqRecord objects.
     - key_function - Optional callback function which when given a
       SeqRecord should return a unique key for the dictionary.  e.g.
       key_function = lambda rec : rec.name
       or, key_function = lambda rec : rec.description.split()[0]

    If key_function is omitted then record.id is used, on the assumption
    that the records objects returned are SeqRecords with a unique id.

    If there are duplicate keys, an error is raised.

    Since Python 3.7, the default dict class maintains key order, meaning
    this dictionary will reflect the order of records given to it.  For
    CPython, this was already implemented in 3.6.  As of Biopython 1.73,
    we explicitly use OrderedDict on older Pythons so that you can always
    assume the record order is preserved.

    This approach is not suitable for very large sets of sequences, as all
    the SeqRecord objects are held in memory.  Instead, consider using the
    Bio.SeqIO.index() function (if it supports your particular file
    format).
    """
    # Use a named function rather than assigning a lambda (PEP 8 / E731);
    # this also matches the sibling implementation of to_dict.
    def _default_key_function(rec):
        return rec.id

    if key_function is None:
        key_function = _default_key_function

    d = _dict()
    for record in sequences:
        key = key_function(record)
        if key in d:
            raise ValueError("Duplicate key '%s'" % key)
        d[key] = record
    return d
def assemble_data(data_list):
    """Build a data dictionary out of a list of entries and data dicts

    Given a list of named tuples of dictionaries, :func:`assemble_data`
    returns a nested ordered dictionary with data keys as outer keys and
    tuple names as inner keys.  The returned dictionary can be printed in
    tabular format by :func:`assemble_table`.

    .. seealso::
        :class:`Data`, :func:`assemble_table`

    Examples
    --------
    >>> from mlens.metrics import assemble_data, assemble_table
    >>> d = [('row-idx-1.row-idx-2.a.b', {'column-1': 0.1, 'column-2': 0.1})]
    >>> print(assemble_table(assemble_data(d)))
                          column-2-m  column-2-s  column-1-m  column-1-s
    row-idx-1  row-idx-2        0.10        0.00        0.10        0.00
    """
    data = _dict()
    tmp = _dict()

    partitions = _get_partitions(data_list)

    # Collect scores per preprocessing case and estimator(s)
    for name, data_dict in data_list:
        if not data_dict:
            continue

        prefix, name = _split(name, '/', a_s='/')

        # Names are either est.i.j or case.est.i.j
        splitted = name.split('.')
        if partitions:
            # Keep the partition index in the display name
            # (est--i or case.est--i).
            name = tuple(splitted[:-1])
            if len(name) == 3:
                name = '%s.%s--%s' % name
            else:
                name = '%s--%s' % name
        else:
            # Drop the trailing partition/fold indices entirely.
            name = '.'.join(splitted[:-2])
        name = '%s%s' % (prefix, name)

        if name not in tmp:
            # Set up data struct for name
            tmp[name] = _dict()
            for k in data_dict.keys():
                tmp[name][k] = list()
                if '%s-m' % k not in data:
                    data['%s-m' % k] = _dict()
                    data['%s-s' % k] = _dict()
                data['%s-m' % k][name] = list()
                data['%s-s' % k][name] = list()

        # collect all data dicts belonging to name
        for k, v in data_dict.items():
            tmp[name][k].append(v)

    # Aggregate to get mean and std
    for name, data_dict in tmp.items():
        for k, v in data_dict.items():
            if not v:
                continue
            try:
                # Purge None values from the main est due to no predict times
                v = [i for i in v if i is not None]
                if v:
                    data['%s-m' % k][name] = np.mean(v)
                    data['%s-s' % k][name] = np.std(v)
            except Exception as exc:
                # Keep going on a bad column; surface the details as a warning.
                warnings.warn(
                    "Aggregating data for %s failed. Raw data:\n%r\n"
                    "Details: %r" % (k, v, exc), MetricWarning)

    # Check if there are empty columns
    discard = list()
    for key, data_dict in data.items():
        empty = True
        for val in data_dict.values():
            if val or val == 0:
                empty = False
        if empty:
            discard.append(key)
    for key in discard:
        data.pop(key)
    return data
def to_dict(sequences, key_function=None):
    """Index SeqRecord objects from an iterator or list into a dictionary.

    Arguments:
     - sequences - An iterator that returns SeqRecord objects, or simply a
       list of SeqRecord objects.
     - key_function - Optional callback which, given a SeqRecord, returns
       a unique dictionary key, e.g.
       key_function = lambda rec : rec.name
       When omitted, record.id is used on the assumption that ids are
       unique.

    Raises ValueError when two records map to the same key.  All records
    are held in memory, so for very large inputs consider the
    Bio.SeqIO.index() function instead (if it supports your file format).
    Record order is preserved: the default dict keeps insertion order on
    Python 3.7+ (CPython 3.6+), and OrderedDict is used explicitly on
    older Pythons.
    """
    # This is to avoid a lambda function:
    def _default_key_function(rec):
        return rec.id

    key_func = _default_key_function if key_function is None else key_function

    indexed = _dict()
    for record in sequences:
        key = key_func(record)
        if key in indexed:
            raise ValueError("Duplicate key '%s'" % key)
        indexed[key] = record
    return indexed
def _parseFile(self, file_path, ignoreCategories, preserve_token_order,
               onlyCategories):
    """Private method that will do the work of parsing the mmCIF data file
    return Dictionary.

    Returns a dict mapping each data_ heading to a dict of
    category -> item -> value(s).  When preserve_token_order is True an
    OrderedDict is used throughout so the original token order survives.
    ignoreCategories / onlyCategories filter which categories are kept.
    """
    if preserve_token_order:
        try:
            from collections import OrderedDict as _dict
        except ImportError:
            # fallback: try to use the ordereddict backport when using python 2.6
            try:
                from ordereddict import OrderedDict as _dict
            except ImportError:
                # backport not installed: use local OrderedDict
                from mmCif.ordereddict import OrderedDict as _dict
    else:
        _dict = dict

    mmcif_like_file = _dict()
    data_block = _dict()
    save_block = _dict()  # NOTE(review): never populated in this method
    data_heading = ""
    line_num = 0
    try:
        with openGzip(file_path, 'r') as f1:
            table_names = []         # item names of the current loop_
            table_values = []        # values gathered from the current line
            table_values_array = []  # all values of the current loop_
            isLoop = False
            multiLineValue = False
            skipCategory = False
            for line in f1:
                line_num += 1
                if skipCategory:
                    # Fast-forward to the next category/save_/data_ token
                    # after a category was filtered out.
                    flag = False
                    while line:
                        check = (line.strip().startswith('_')
                                 or self.loopRE.match(line.strip()[:5])
                                 or self.saveRE.match(line.strip()[:5])
                                 or self.dataRE.match(line.strip()[:5]))
                        if flag:
                            if check:
                                isLoop = False
                                break
                        else:
                            if not check:
                                flag = True
                        if not (self.saveRE.match(line.strip()[:5])
                                or self.dataRE.match(line.strip()[:5])):
                            try:
                                line = next(f1)
                                line_num += 1
                            except StopIteration:
                                break
                        else:
                            break
                    skipCategory = False

                if isLoop is True and table_values_array != [] and (
                        self.loopRE.match(line) is not None
                        or (line.strip().startswith('_'))):
                    # End of a loop_: distribute the flat value buffer
                    # column-wise over the loop's item names.
                    isLoop = False
                    num_item = len(table_names)
                    if len(table_values_array) % num_item != 0:
                        raise MMCIFWrapperSyntaxError(category)
                    for val_index, item in enumerate(table_names):
                        data_block[category][item] = \
                            table_values_array[val_index::num_item]
                    table_values_array = []

                if line.strip() == "":
                    continue
                if line.startswith('#'):
                    continue
                if '\t#' in line or ' #' in line and not line.startswith(';'):
                    # Drop a trailing comment token-wise.
                    new_line = ''
                    for tok in self.dataValueRE.findall(line):
                        if not tok.startswith('#'):
                            new_line += tok + " "
                        else:
                            break
                    # make sure to preserve the fact that ';' was not the
                    # first character
                    line = new_line if not new_line.startswith(';') \
                        else " " + new_line
                    # Fails for entries "3snv", "1kmm", "1ser", "2prg", "3oqd"
                    # line = re.sub(r'\s#.*$', '', line)
                if line.startswith(';'):
                    # Accumulate a semicolon-delimited multi-line value.
                    while '\n;' not in line:
                        try:
                            line += next(f1)
                            line_num += 1
                        except StopIteration:
                            break
                    multiLineValue = True
                if self.dataRE.match(line):
                    # New data_ heading: flush the current block first.
                    if data_block != {}:
                        if table_values_array != []:
                            isLoop = False
                            num_item = len(table_names)
                            if len(table_values_array) % num_item != 0:
                                raise mmCifSyntaxError(category)
                            for val_index, item in enumerate(table_names):
                                data_block[category][item] = \
                                    table_values_array[val_index::num_item]
                            table_names = []
                            table_values_array = []
                        mmcif_like_file[data_heading] = data_block
                        data_block = _dict()
                    data_heading = self.dataRE.match(line).group('data_heading')
                elif self.saveRE.match(line):
                    # Save frames are skipped entirely.
                    while line.strip() != 'save_':
                        try:
                            line = next(f1)
                            line_num += 1
                        except StopIteration:
                            break
                    continue
                elif self.loopRE.match(line):
                    # Save and clear the table_values_array buffer from the
                    # previous loop that was read
                    if table_values_array != []:
                        for itemIndex, name in enumerate(table_names):
                            data_block[category].update(
                                {name: [row[itemIndex]
                                        for row in table_values_array]})
                        table_values_array = []
                    isLoop = True
                    category, item, value = None, None, None
                    # Stores items of a category listed in loop blocks
                    table_names = []
                    # Stores values of items in a loop as a single row
                    table_values = []
                elif self.dataNameRE.match(line):
                    # Match category and item simultaneously
                    m = self.dataNameRE.match(line)
                    category = m.group('data_category')
                    item = m.group('category_item')
                    remainder = m.group('remainder')
                    value = None
                    if isLoop and remainder != '':
                        """Append any data values following the last loop
                        category.item tag should any exist"""
                        table_values += self._tokenizeData(remainder)
                        line = ''
                    else:
                        line = remainder + "\n"
                    if not isLoop:
                        if line.strip() != '':
                            value = self._tokenizeData(line)
                        else:
                            # For cases where values are on the following
                            # line
                            try:
                                line = next(f1)
                                line_num += 1
                            except StopIteration:
                                break
                            while value is None:
                                char_start = 1 if line.startswith(';') else 0
                                while line.startswith(';') and \
                                        not line.rstrip().endswith('\n;'):
                                    try:
                                        line += next(f1)
                                        line_num += 1
                                    except StopIteration:
                                        break
                                value = (line[char_start:
                                              line.rfind('\n;')]).strip()
                            if char_start > 0:
                                value = (line[char_start:
                                              line.rfind('\n;')]).strip()
                            else:
                                value = self._tokenizeData(" " + line)
                        if (ignoreCategories
                                and category in ignoreCategories) or \
                                (onlyCategories
                                 and category not in onlyCategories):
                            pass
                        else:
                            if category in data_block:
                                data_block[category].update(
                                    {item: value if len(value) > 1
                                     else value[0]})
                            else:
                                data_block.setdefault(
                                    category,
                                    _dict({item: value if len(value) > 1
                                           else value[0]}))
                                # OrderedDict here preserves item order
                    else:
                        if (ignoreCategories
                                and category in ignoreCategories) or \
                                (onlyCategories
                                 and category not in onlyCategories):
                            skipCategory = True
                        else:
                            data_block.setdefault(category, _dict())
                            # OrderedDict here preserves item order
                            table_names.append(item)
                else:
                    # Plain data line inside a loop_.
                    if multiLineValue is True:
                        # Close out the multi-line value gathered above.
                        table_values.append(
                            (line[1:line.rfind('\n;')]).strip())
                        multiLineValue = False
                        line = line[line.rfind('\n;') + 2:]
                        if line.strip() != '':
                            table_values += self._tokenizeData(line)
                    else:
                        table_values += self._tokenizeData(line)

                    if table_values != []:
                        table_values_array += table_values
                        table_values = []

            # Flush any loop still open at end of file.
            if isLoop is True and table_values_array != []:
                isLoop = False
                num_item = len(table_names)
                for val_index, item in enumerate(table_names):
                    data_block[category][item] = \
                        table_values_array[val_index::num_item]
                table_values_array = []

            if data_block != {}:
                mmcif_like_file[data_heading] = data_block
        return mmcif_like_file
    except KeyError as key_err:
        print("KeyError [line %i]: %s" % (line_num, str(key_err)))
    except IOError as io_err:
        print("IOException [line %i]: %s" % (line_num, str(io_err)))
class Parameters(_dict):
    """A dict in which the items can be get/set as attributes."""

    __reserved_names__ = dir(_dict())  # Also from OrderedDict
    __pure_names__ = dir(dict())

    __slots__ = []

    def __repr__(self):
        ident = []
        nonident = []
        for key, val in self.items():
            if isidentifier(key):
                ident.append('%s=%r' % (key, val))
            else:
                nonident.append('(%r, %r)' % (key, val))
        if nonident:
            return 'Parameters([%s], %s)' % (', '.join(nonident),
                                             ', '.join(ident))
        return 'Parameters(%s)' % (', '.join(ident))

    def __str__(self):
        # Alignment width: length of the longest key.
        width = max([0] + [len(key) for key in self])
        # Room left for each value so lines stay below 80 characters.
        room = 79 - (width + 6)
        parts = ['<%i parameters>\n' % len(self)]
        for key in self.keys():
            shown = repr(self[key])
            if len(shown) > room:
                shown = shown[:room - 3] + '...'
            parts.append(key.rjust(width + 4) + ": %s\n" % (shown,))
        return ''.join(parts)

    def __getattribute__(self, key):
        try:
            return object.__getattribute__(self, key)
        except AttributeError:
            # Fall back to item lookup for unknown attributes.
            if key in self:
                return self[key]
            raise

    def __setattr__(self, key, val):
        cls = self.__class__
        if key in cls.__reserved_names__:
            # Either let OrderedDict do its work, or disallow
            if key not in cls.__pure_names__:
                return _dict.__setattr__(self, key, val)
            raise AttributeError('Reserved name, this key can only ' +
                                 'be set via ``d[%r] = X``' % key)
        # if isinstance(val, dict): val = Dict(val) -> no, makes a copy!
        self[key] = val

    def __dir__(self):
        names = [k for k in self.keys() if isidentifier(k)]
        return self.__class__.__reserved_names__ + names
def _parseFile(self, file_path, ignoreCategories, preserve_token_order, onlyCategories):
    """Parse an mmCIF/STAR data file into a nested dictionary.

    Parameters:
        file_path: path to the (possibly gzipped) mmCIF file, opened via
            openGzip.
        ignoreCategories: categories to drop from the result (only checked
            when non-empty).
        preserve_token_order: if True, use OrderedDict so category/item
            order from the file is preserved in the result.
        onlyCategories: if non-empty, keep only these categories.

    Returns:
        dict (or OrderedDict) mapping data-block heading ->
        {category -> {item -> value}}. On KeyError/IOError the error is
        printed and the method falls through, returning None.
    """
    if preserve_token_order:
        try:
            # Python >= 2.7: stdlib OrderedDict
            from collections import OrderedDict as _dict
        except ImportError:
            # fallback: try to use the ordereddict backport when using python 2.6
            try:
                from ordereddict import OrderedDict as _dict
            except ImportError:
                # backport not installed: use local OrderedDict
                from mmCif.ordereddict import OrderedDict as _dict
    else:
        _dict = dict

    mmcif_like_file = _dict()   # heading -> data_block
    data_block = _dict()        # category -> {item -> value}
    save_block = _dict()        # NOTE(review): assigned but never used below
    data_heading = ""
    line_num = 0                # current line, for error reporting
    try:
        with openGzip(file_path, "rt") as f1:
            table_names = []         # item names of the current loop_
            table_values = []        # values collected from the current line
            table_values_array = []  # all buffered values of the current loop_
            isLoop = False
            multiLineValue = False
            skipCategory = False
            for line in f1:
                line_num += 1
                # Skip the remainder of an ignored category: consume lines
                # until the next item/loop/save/data boundary.
                if skipCategory:
                    flag = False
                    while line:
                        check = (line.strip().startswith("_")
                                 or self.loopRE.match(line.strip()[:5])
                                 or self.saveRE.match(line.strip()[:5])
                                 or self.dataRE.match(line.strip()[:5]))
                        if flag:
                            if check:
                                isLoop = False
                                break
                        else:
                            if not check:
                                flag = True
                        if not (self.saveRE.match(line.strip()[:5])
                                or self.dataRE.match(line.strip()[:5])):
                            try:
                                line = next(f1)
                                line_num += 1
                            except StopIteration:
                                break
                        else:
                            break
                    skipCategory = False

                # A loop just ended (new item or new loop_): distribute the
                # buffered values column-wise over the loop's item names.
                if (isLoop is True and table_values_array != [] and
                        (self.loopRE.match(line) is not None or
                         (line.strip().startswith("_")))):
                    isLoop = False
                    num_item = len(table_names)
                    if len(table_values_array) % num_item != 0:
                        raise MMCIFWrapperSyntaxError(category)
                    for val_index, item in enumerate(table_names):
                        data_block[category][item] = table_values_array[val_index::num_item]
                    table_values_array = []

                if line.strip() == "":
                    continue
                if line.startswith("#"):
                    continue
                # Strip a trailing comment from the line.
                # NOTE(review): 'and' binds tighter than 'or', so the
                # startswith(";") guard only applies to the ' #' test, not
                # to '\t#' — confirm this is intended.
                if "\t#" in line or " #" in line and not line.startswith(";"):
                    new_line = ""
                    for tok in self.dataValueRE.findall(line):
                        if not tok.startswith("#"):
                            new_line += tok + " "
                        else:
                            break
                    # make sure to preserve the fact that ';' was not the first character
                    line = (new_line if not new_line.startswith(";") else " " + new_line)
                    # Fails for entries "3snv", "1kmm", "1ser", "2prg", "3oqd"
                    # line = re.sub(r'\s#.*$', '', line)
                # Semicolon-delimited multi-line value: accumulate lines
                # until the terminating "\n;".
                if line.startswith(";"):
                    while "\n;" not in line:
                        try:
                            line += next(f1)
                            line_num += 1
                        except StopIteration:
                            break
                    multiLineValue = True
                if self.dataRE.match(line):
                    # New data_ block: flush any pending loop values and
                    # store the finished block under its heading.
                    if data_block != {}:
                        if table_values_array != []:
                            isLoop = False
                            num_item = len(table_names)
                            if len(table_values_array) % num_item != 0:
                                # NOTE(review): the same condition raises
                                # MMCIFWrapperSyntaxError above — confirm
                                # mmCifSyntaxError is actually defined.
                                raise mmCifSyntaxError(category)
                            for val_index, item in enumerate(table_names):
                                data_block[category][item] = table_values_array[val_index::num_item]
                            table_names = []
                            table_values_array = []
                        mmcif_like_file[data_heading] = data_block
                        data_block = _dict()
                    data_heading = self.dataRE.match(line).group("data_heading")
                elif self.saveRE.match(line):
                    # Save frames are skipped entirely (up to 'save_').
                    while line.strip() != "save_":
                        try:
                            line = next(f1)
                            line_num += 1
                        except StopIteration:
                            break
                    continue
                elif self.loopRE.match(line):
                    # Save and clear the table_values_array buffer from the
                    # previous loop that was read
                    if table_values_array != []:
                        for itemIndex, name in enumerate(table_names):
                            data_block[category].update({
                                name: [row[itemIndex] for row in table_values_array]
                            })
                        table_values_array = []
                    isLoop = True
                    category, item, value = None, None, None
                    # Stores items of a category listed in loop blocks
                    table_names = []
                    # Stores values of items in a loop as a single row
                    table_values = []
                elif self.dataNameRE.match(line):
                    # Two step process STAR does not know contept of categories
                    m = self.dataNameRE.match(line)
                    flag = m.group("data_category")
                    tmp_category = self.dataCategoryItem.match(flag)
                    if tmp_category:
                        category = tmp_category.group("data_category")
                        item = tmp_category.group("category_item")
                    else:
                        category = ""
                        item = flag
                    remainder = m.group("remainder")
                    value = None
                    if isLoop and remainder != "":
                        """Append any data values following the last loop category.item tag should any exist"""
                        table_values += self._tokenizeData(remainder)
                        line = ""
                    else:
                        line = remainder + "\n"
                    if not isLoop:
                        if line.strip() != "":
                            value = self._tokenizeData(line)
                        else:
                            # For cases where values are on the following
                            # line
                            try:
                                line = next(f1)
                                line_num += 1
                            except StopIteration:
                                break
                        while value is None:
                            # A leading ';' marks a multi-line value.
                            char_start = 1 if line.startswith(";") else 0
                            while line.startswith(";") and not line.rstrip().endswith("\n;"):
                                try:
                                    line += next(f1)
                                    line_num += 1
                                except StopIteration:
                                    break
                            # NOTE(review): this assignment is dead — it is
                            # immediately overwritten by both branches below.
                            value = (line[char_start:line.rfind("\n;")]).strip()
                            if char_start > 0:
                                value = (line[char_start:line.rfind("\n;")]).strip()
                            else:
                                value = self._tokenizeData(" " + line)
                        if (ignoreCategories and category in ignoreCategories) or (
                                onlyCategories and category not in onlyCategories):
                            pass
                        else:
                            # Single-valued items are unwrapped from their list.
                            if category in data_block:
                                data_block[category].update({
                                    item: value if len(value) > 1 else value[0]
                                })
                            else:
                                data_block.setdefault(
                                    category,
                                    _dict({item: value if len(value) > 1 else value[0]}),
                                )  # OrderedDict here preserves item order
                    else:
                        if (ignoreCategories and category in ignoreCategories) or (
                                onlyCategories and category not in onlyCategories):
                            skipCategory = True
                        else:
                            data_block.setdefault(category, _dict())  # OrderedDict here preserves item order
                            table_names.append(item)
                else:
                    # Plain data values belonging to the current loop.
                    if multiLineValue is True:
                        table_values.append((line[1:line.rfind("\n;")]).strip())
                        multiLineValue = False
                        line = line[line.rfind("\n;") + 2:]
                        if line.strip() != "":
                            table_values += self._tokenizeData(line)
                    else:
                        table_values += self._tokenizeData(line)

                    if table_values != []:
                        table_values_array += table_values
                        table_values = []

            # End of file: flush a still-open loop and the last data block.
            if isLoop is True and table_values_array != []:
                isLoop = False
                num_item = len(table_names)
                for val_index, item in enumerate(table_names):
                    data_block[category][item] = table_values_array[val_index::num_item]
                table_values_array = []
            if data_block != {}:
                mmcif_like_file[data_heading] = data_block
        return mmcif_like_file
    except KeyError as key_err:
        print("KeyError [line %i]: %s" % (line_num, str(key_err)))
    except IOError as io_err:
        print("IOException [line %i]: %s" % (line_num, str(io_err)))
continue layer, k = _split(dat_key, '/') case, k = _split(k, '.') est, part = _split(k, '--', reverse=True) # Header space before column headings items = [i for i in [layer, case, est, part] if i != ''] buffer = max(buffer, len(' '.join(items))) for k, v in zip(row_glossary, [layer, case, est, part]): v_ = len(v) if v_ > max_row_len[k]: max_row_len[k] = v_ dat = _dict() dat['layer'] = layer dat['case'] = case dat['est'] = est dat['part'] = part row_keys.append(dat_key) rows.append(dat) # Check which row name columns we can drop (ex partition number) drop = list() for k, v in max_row_len.items(): if v == 0: drop.append(k) # Header out = " " * (buffer + padding)
# The basic structure always looks like this
#
#     regexes[(a, b)] = (c, d)
#
# regexes = an ordered dictionary which ensures that the regular expressions
#           are always processed in the same order.
# a = an unique number - suggestion, start with 0 and always increase by one (per lexer)
# b = color tuple in the form of (r, g, b). Example (255, 0, 0) for the color red.
# c = raw byte string, describes the regular expression. Example r'\w+'
# d = integer, denotes which match group should be considered

# Example
# builtin lexers - like python

# Built from one ordered list of ((a, b), (c, d)) pairs; insertion order is
# preserved by the ordered-dict constructor.
py_regexes = _dict([
    # cls and self objects - return match 0
    ((0, (224, 108, 117)), (r'\b(cls|self)\b', 0)),
    # function parameters - return match 1
    ((1, (209, 154, 102)), (r'(?:(?:def)\s\w+)\s*\((.+)\):', 1)),
    # args and kwargs - return match 0
    ((2, (86, 182, 194)), (r'(\*|\*\*)(?=\w)', 0)),
    # functions and class instances but not definitions - return match 1
    ((3, (79, 175, 239)), (r'class\s*\w+?(?=\()|def\s*\w+?(?=\()|(\w+?(?=\())', 1)),
    # dunder functions and special keywords - return match 0
    ((4, (86, 182, 194)),
     (r'\b(editor|editor1|editor2|notepad|console|__\w+__|super|object|type|print)\b', 0)),
])
def _emit(key, value, content_handler,
          attr_prefix='@',
          cdata_key='#text',
          depth=0,
          preprocessor=None,
          pretty=False,
          newl='\n',
          indent='\t',
          namespace_separator=':',
          namespaces=None,
          full_document=True,
          expand_iter=None):
    """Serialize one (key, value) pair as XML by firing SAX events.

    Recursively walks ``value`` and calls startElement/characters/
    endElement on ``content_handler``. Dict keys starting with
    ``attr_prefix`` become XML attributes, the ``cdata_key`` entry becomes
    the element's text, all other keys become child elements. A list value
    produces one sibling element per entry.

    Raises:
        ValueError: when ``full_document`` is true and a second element
            would be emitted at depth 0 (multiple roots).
    """
    key = _process_namespace(key, namespaces, namespace_separator, attr_prefix)
    if preprocessor is not None:
        result = preprocessor(key, value)
        if result is None:
            # Preprocessor vetoed this element entirely.
            return
        key, value = result
    # Normalize scalars/strings/dicts into a one-element list so the loop
    # below uniformly handles repeated sibling elements.
    if (not hasattr(value, '__iter__')
            or isinstance(value, _basestring)
            or isinstance(value, dict)):
        value = [value]
    for index, v in enumerate(value):
        if full_document and depth == 0 and index > 0:
            raise ValueError('document with multiple roots')
        if v is None:
            # None -> empty element.
            v = _dict()
        elif isinstance(v, bool):
            # Booleans serialize as the literal strings 'true'/'false'.
            if v:
                v = _unicode('true')
            else:
                v = _unicode('false')
        elif not isinstance(v, dict):
            if expand_iter and hasattr(
                    v, '__iter__') and not isinstance(v, _basestring):
                # Wrap a non-string iterable under the expand_iter key so
                # each entry becomes a repeated child element.
                v = _dict(((expand_iter, v), ))
            else:
                v = _unicode(v)
        if isinstance(v, _basestring):
            # Bare string -> element whose only content is text.
            v = _dict(((cdata_key, v), ))
        cdata = None
        attrs = _dict()
        children = []
        # Split the dict into text content, attributes and child elements.
        for ik, iv in v.items():
            if ik == cdata_key:
                cdata = iv
                continue
            if ik.startswith(attr_prefix):
                ik = _process_namespace(ik, namespaces, namespace_separator,
                                        attr_prefix)
                if ik == '@xmlns' and isinstance(iv, dict):
                    # Expand an xmlns mapping into xmlns / xmlns:prefix
                    # attributes.
                    # NOTE(review): this inner loop shadows the outer `v`;
                    # harmless here because the v.items() iterator is
                    # already bound and `v` is reassigned on the next outer
                    # iteration, but worth renaming.
                    for k, v in iv.items():
                        attr = 'xmlns{}'.format(':{}'.format(k) if k else '')
                        attrs[attr] = _unicode(v)
                    continue
                if not isinstance(iv, _unicode):
                    iv = _unicode(iv)
                attrs[ik[len(attr_prefix):]] = iv
                continue
            children.append((ik, iv))
        if pretty:
            content_handler.ignorableWhitespace(depth * indent)
        content_handler.startElement(key, AttributesImpl(attrs))
        if pretty and children:
            content_handler.ignorableWhitespace(newl)
        for child_key, child_value in children:
            # Recurse one level deeper for each child element.
            _emit(child_key, child_value, content_handler,
                  attr_prefix, cdata_key, depth + 1, preprocessor,
                  pretty, newl, indent,
                  namespaces=namespaces,
                  namespace_separator=namespace_separator,
                  expand_iter=expand_iter)
        if cdata is not None:
            content_handler.characters(cdata)
        if pretty and children:
            content_handler.ignorableWhitespace(depth * indent)
        content_handler.endElement(key)
        if pretty and depth:
            content_handler.ignorableWhitespace(newl)
def __repr__(self):
    """Return a concise debug representation, e.g. ClassName({"method"=...,"url"=...}).

    Attribute values that are not JSON-serializable are rendered via repr().
    """
    # json.dumps' `separators` is documented as an
    # (item_separator, key_separator) tuple. The previous code passed the
    # 2-char string ',=' which only works because a length-2 string happens
    # to unpack like a tuple; the explicit tuple produces identical output:
    # items joined by ',' and keys joined to values by '='.
    fields = _dict([(k, getattr(self, k)) for k in ['method', 'url']])
    return "%s(%s)" % (self.__class__.__name__,
                       json.dumps(fields, default=repr, separators=(',', '=')))
class Dict(_dict):
    """Dictionary whose items double as attributes.

    A lean way to represent structured data that works well with
    autocompletion. Any valid dict key is allowed, but keys that are not
    valid identifiers, or that collide with dict/OrderedDict method names
    (e.g. 'items' or 'copy'), can only be get/set with the classic
    ``d[key]`` syntax.

    Example:

    .. code-block:: python

        >> d = Dict(foo=3)
        >> d.foo
        3
        >> d['foo'] = 4
        >> d.foo
        4
        >> d.bar = 5
        >> d.bar
        5
    """

    __reserved_names__ = dir(_dict())  # Also from OrderedDict
    __pure_names__ = dir(dict())

    __slots__ = []

    def __repr__(self):
        # Identifier keys render as keyword args, others as (key, value) pairs.
        as_kwargs = []
        as_pairs = []
        for key, val in self.items():
            if isidentifier(key):
                as_kwargs.append('%s=%r' % (key, val))
            else:
                as_pairs.append('(%r, %r)' % (key, val))
        if as_pairs:
            return 'Dict([%s], %s)' % (', '.join(as_pairs),
                                       ', '.join(as_kwargs))
        return 'Dict(%s)' % (', '.join(as_kwargs))

    def __getattribute__(self, key):
        # Normal attribute lookup first; fall back to item lookup.
        try:
            return object.__getattribute__(self, key)
        except AttributeError:
            if key not in self:
                raise
            return self[key]

    def __setattr__(self, key, val):
        if key not in Dict.__reserved_names__:
            # Plain data key. NB: dict values are stored as-is (no copy).
            self[key] = val
        elif key not in Dict.__pure_names__:
            # OrderedDict-only name: let the base class handle it.
            return _dict.__setattr__(self, key, val)
        else:
            raise AttributeError('Reserved name, this key can only ' +
                                 'be set via ``d[%r] = X``' % key)

    def __dir__(self):
        # Advertise identifier-shaped keys for autocompletion.
        dynamic = [k for k in self.keys() if isidentifier(k)]
        return Dict.__reserved_names__ + dynamic