def insert_items(self, val_key_pairs, prepare_inserts=r.prepare_inserts):
    keys = [k for (v, k) in val_key_pairs]
    adjustments, new_keys = prepare_inserts(self._slist, keys)

    if adjustments:
        self.num_update_events += 1
        self.num_updated_keys += len(adjustments)
        # Updating items is a bit tricky: we have to do it without violating order (just changing
        # key of an existing item easily might), so we remove items first. And we can only rely on
        # indices if we scan items in a backwards order.
        items = [self._slist.pop(index) for (index, key) in reversed(adjustments)]
        items.reverse()
        for (index, key), item in izip(adjustments, items):
            item.key = key
        self._slist.update(items)

    # Now add the new items.
    self._slist.update(Item(val, new_key)
                       for (val, _), new_key in izip(val_key_pairs, new_keys))

    # For testing, pass along the return value from prepare_inserts.
    return adjustments, new_keys

def safe_str_cmp(a, b):
    """This function compares strings in somewhat constant time.  This
    requires that the length of at least one string is known in advance.

    Returns `True` if the two strings are equal, or `False` if they are not.

    .. versionadded:: 0.7
    """
    if isinstance(a, text_type):
        a = a.encode('utf-8')
    if isinstance(b, text_type):
        b = b.encode('utf-8')

    if _builtin_safe_str_cmp is not None:
        return _builtin_safe_str_cmp(a, b)

    if len(a) != len(b):
        return False

    rv = 0
    if PY2:
        for x, y in izip(a, b):
            rv |= ord(x) ^ ord(y)
    else:
        for x, y in izip(a, b):
            rv |= x ^ y

    return rv == 0

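# Hedged usage sketch (not from the original library): comparing a user-supplied
# token against an expected secret without an early-exit timing difference.
# Assumes safe_str_cmp and its module-level helpers above are importable.
expected = "s3cr3t-token"
provided = "s3cr3t-token"
if safe_str_cmp(provided, expected):
    print("tokens match")
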
def test_adagrad():
    """
    Make sure that learning_rule.AdaGrad obtains the same parameter values as
    with a hand-crafted AdaGrad implementation, given a dummy model and
    learning rate scaler for each parameter.

    Reference:
    "Adaptive subgradient methods for online learning and
    stochastic optimization", Duchi J, Hazan E, Singer Y.
    """
    cost, model, dataset, sgd, state = prepare_adagrad_test()

    def adagrad_manual(model, state):
        rval = []
        for scale, param in izip(scales, model.get_params()):
            pstate = state[param]
            param_val = param.get_value()
            # begin adagrad
            pstate['sg2'] += param_val**2
            dx_t = -(scale * learning_rate / np.sqrt(pstate['sg2'])
                     * param_val)
            rval += [param_val + dx_t]
        return rval

    manual = adagrad_manual(model, state)
    sgd.train(dataset=dataset)
    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in izip(manual, model.get_params()))

    manual = adagrad_manual(model, state)
    sgd.train(dataset=dataset)
    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in izip(manual, model.get_params()))

def todok(self):
    from .dok import dok_matrix

    dok = dok_matrix((self.shape), dtype=self.dtype)
    dok.update(izip(izip(self.row, self.col), self.data))
    return dok

def __getitem__(self, key):
    try:
        if isinstance(key, int) and (key >= 0):
            if key in self.cache:
                return self.cache[key]
            elif key < self.stop:
                self.stop = 0
                self.iterator = iter(self.f())
            delta = key - self.stop
            result = next(islice(self.iterator, delta, delta + 1))
            self.cache[key] = result
            self.stop = key + 1
            return result
        elif isinstance(key, slice):
            if key.start is None and key.stop is None:
                # Whole sequence is asked
                return list(self.f())
            start = key.start or 0
            step = key.step or 1
            indexes = count(start, step)
            index_upd = start
            while (key.stop is None or index_upd < key.stop) and index_upd in self.cache:
                index_upd += step
            if index_upd < self.stop and (key.stop is None or index_upd < key.stop):
                self.iterator = iter(self.f())
                result = list(islice(self.iterator, start, key.stop, step))
                for i, value in izip(indexes, result):
                    self.cache[i] = value
                self.stop = i + 1 if key.stop is None else key.stop
                return result
            else:
                result = [self.cache[i] for i in six.moves.xrange(start, index_upd, step)]
                if key.stop is None:
                    result_upd = list(islice(self.iterator, index_upd - self.stop, None, step))
                elif index_upd < key.stop:
                    result_upd = list(islice(self.iterator, index_upd - self.stop,
                                             key.stop - self.stop, step))
                else:
                    result_upd = []
                for i, value in izip(indexes, result_upd):
                    self.cache[i] = value
                self.stop = key.stop
                return result + result_upd
        else:
            raise KeyError("Key must be non-negative integer or slice, not {}"
                           .format(key))
    except StopIteration:
        self.iterator = self.f()
        self.stop = 0
        raise

def test_adadelta():
    """
    Make sure that learning_rule.AdaDelta obtains the same parameter values as
    with a hand-crafted AdaDelta implementation, given a dummy model and
    learning rate scaler for each parameter.

    Reference:
    "AdaDelta: An Adaptive Learning Rate Method", Matthew D. Zeiler.
    """
    # We include a cost other than SumOfParams so that data is actually
    # queried from the training set, and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfOneHalfParamsSquared(), (0., DummyCost())])

    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)
    decay = 0.95

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=AdaDelta(decay),
              batch_size=1)

    sgd.setup(model=model, dataset=dataset)

    state = {}
    for param in model.get_params():
        param_shape = param.get_value().shape
        state[param] = {}
        state[param]['g2'] = np.zeros(param_shape)
        state[param]['dx2'] = np.zeros(param_shape)

    def adadelta_manual(model, state):
        rval = []
        for scale, param in izip(scales, model.get_params()):
            pstate = state[param]
            param_val = param.get_value()
            # begin adadelta
            pstate['g2'] = decay * pstate['g2'] + (1 - decay) * param_val**2
            rms_g_t = np.sqrt(pstate['g2'] + scale * learning_rate)
            rms_dx_tm1 = np.sqrt(pstate['dx2'] + scale * learning_rate)
            dx_t = -rms_dx_tm1 / rms_g_t * param_val
            pstate['dx2'] = decay * pstate['dx2'] + (1 - decay) * dx_t**2
            rval += [param_val + dx_t]
        return rval

    manual = adadelta_manual(model, state)
    sgd.train(dataset=dataset)
    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in izip(manual, model.get_params()))

    manual = adadelta_manual(model, state)
    sgd.train(dataset=dataset)
    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in izip(manual, model.get_params()))

def get_sample(self, fit, factor=4, num=1):
    vals = numpy.array(fit.model.thawedpars)
    scales = self.scale.get_scales(fit)
    samples = [numpy.random.uniform(val - factor * abs(scale),
                                    val + factor * abs(scale),
                                    int(num))
               for val, scale in izip(vals, scales)]
    return numpy.asarray(samples).T

def get_scales(self, fit, myscales=None):
    scales = []
    thawedpars = [par for par in fit.model.pars if not par.frozen]

    if myscales is None:
        oldestmethod = fit.estmethod

        covar = Covariance()
        covar.config['sigma'] = self.sigma
        fit.estmethod = covar

        try:
            r = fit.est_errors()
        finally:
            fit.estmethod = oldestmethod

        for par, val, lo, hi in izip(thawedpars, r.parvals, r.parmins, r.parmaxes):
            scale = None
            if lo is not None and hi is not None:
                scale = numpy.abs(lo)
            else:
                warning("Covariance failed for '%s', trying Confidence..." %
                        par.fullname)

                conf = Confidence()
                conf.config['sigma'] = self.sigma
                fit.estmethod = conf
                try:
                    t = fit.est_errors(parlist=(par,))
                    if t.parmins[0] is not None and t.parmaxes[0] is not None:
                        scale = numpy.abs(t.parmins[0])
                    elif t.parmins[0] is None and t.parmaxes[0] is not None:
                        scale = numpy.abs(t.parmaxes[0])
                    else:
                        warning('1 sigma bounds for parameter ' +
                                par.fullname +
                                ' could not be found, using soft limit minimum')
                        if 0.0 == numpy.abs(par.min):
                            scale = 1.0e-16
                        else:
                            scale = numpy.abs(par.min)
                finally:
                    fit.estmethod = oldestmethod
            scales.append(scale)

    else:
        if not numpy.iterable(myscales):
            raise TypeError(
                "scales option must be iterable of length %d " % len(thawedpars))
        scales = list(map(abs, myscales))

    scales = numpy.asarray(scales).transpose()
    return scales

def azip(*iterables, **kwargs):
    """Move `axis` (default -1) to the front of ndarrays in `iterables`."""
    from six.moves import map as imap, zip as izip
    return izip(*(
        imap(kwargs.get('func', unmask),
             np.rollaxis(i, kwargs.get('axis', -1), kwargs.get('start', 0)))
        if isinstance(i, np.ndarray) else i
        for i in iterables))

def unpack(self, buff):
    """
    Unpack the given binary buffer into the fields.  The result
    is a dictionary mapping field names to values.
    """
    args = struct.unpack_from(self._fmt, buff[:self._size])
    return dict(izip(self._names, args))

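# Hedged usage sketch (the _Header class and its field layout are illustrative,
# not from the original codebase): a minimal object carrying the _fmt, _size and
# _names attributes that unpack() expects. Assumes struct and izip are imported
# as in the surrounding module.
import struct

class _Header(object):
    _fmt = '!HH'                   # two big-endian unsigned shorts
    _size = struct.calcsize(_fmt)  # 4 bytes
    _names = ('msg_type', 'length')

print(unpack(_Header(), b'\x00\x01\x00\x2a'))  # -> {'msg_type': 1, 'length': 42}
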
def __init__(self, res, hascontent, duration=0, has_payload=False):
    """
    - **snippets**: An optional dictionary of the form {field: snippet_size} for snippet formatting
    """

    self.total = res[0]
    self.duration = duration
    self.docs = []

    step = 1
    if hascontent:
        step = 3 if has_payload else 2
    else:
        # we can't have nocontent and payloads in the same response
        has_payload = False

    for i in xrange(1, len(res), step):
        id = to_string(res[i])
        payload = to_string(res[i + 1]) if has_payload else None
        fields_offset = 2 if has_payload else 1

        fields = {}
        if hascontent:
            fields = dict(izip(map(to_string, res[i + fields_offset][::2]),
                               map(to_string, res[i + fields_offset][1::2])))

        try:
            del fields['id']
        except KeyError:
            pass

        doc = Document(id, payload=payload, **fields)
        self.docs.append(doc)

def _parse_set_weight_values(argvish):
    new_cmd_format, opts, args = validate_args(argvish)

    # We'll either parse the all-in-one-string format or the
    # --options format,
    # but not both. If both are specified, raise an error.
    try:
        devs = []
        if not new_cmd_format:
            if len(args) % 2 != 0:
                print(Commands.set_weight.__doc__.strip())
                exit(EXIT_ERROR)

            devs_and_weights = izip(islice(argvish, 0, len(argvish), 2),
                                    islice(argvish, 1, len(argvish), 2))
            for devstr, weightstr in devs_and_weights:
                devs.extend(builder.search_devs(
                    parse_search_value(devstr)) or [])
                weight = float(weightstr)
                _set_weight_values(devs, weight)
        else:
            if len(args) != 1:
                print(Commands.set_weight.__doc__.strip())
                exit(EXIT_ERROR)

            devs.extend(builder.search_devs(
                parse_search_values_from_opts(opts)) or [])
            weight = float(args[0])
            _set_weight_values(devs, weight)
    except ValueError as e:
        print(e)
        exit(EXIT_ERROR)

def refresh(self, items, consistent=False):
    """
    Overwrite model data with freshest from database

    Parameters
    ----------
    items : list or :class:`~flywheel.models.Model`
        Models to sync
    consistent : bool, optional
        If True, force a consistent read from the db. (default False)

    """
    if isinstance(items, Model):
        items = [items]
    if not items:
        return

    tables = defaultdict(list)
    for item in items:
        tables[item.meta_.ddb_tablename(self.namespace)].append(item)

    for tablename, items in six.iteritems(tables):
        keys = [item.pk_dict_ for item in items]
        results = self.dynamo.batch_get(tablename, keys,
                                        consistent=consistent)
        for item, data in izip(items, results):
            with item.loading_(self):
                for key, val in data.items():
                    item.set_ddb_val_(key, val)

def eval_model_to_fit(self, modelfuncs):
    total_model = []
    for func, data in izip(modelfuncs, self.datasets):
        total_model.append(data.eval_model_to_fit(func))
    return numpy.concatenate(total_model)

def compactify(self):
    """
    Assign new word ids to all words.

    This is done to make the ids more compact, e.g. after some tokens have
    been removed via :func:`filter_tokens` and there are gaps in the id series.
    Calling this method will remove the gaps.
    """
    logger.debug("rebuilding dictionary, shrinking gaps")

    # build mapping from old id -> new id
    idmap = dict(izip(sorted(itervalues(self.token2id)),
                      xrange(len(self.token2id))))

    # reassign mappings to new ids
    self.token2id = {token: idmap[tokenid]
                     for token, tokenid in iteritems(self.token2id)}
    self.id2token = {}
    self.dfs = {idmap[tokenid]: freq
                for tokenid, freq in iteritems(self.dfs)}

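# Hedged illustration with toy data (not from the original library): what the
# old-id -> new-id remapping above does. After pruning, the surviving token ids
# 0, 3, 7 are compacted to 0, 1, 2 while keeping their relative order.
token2id = {'cat': 3, 'dog': 0, 'eel': 7}
idmap = dict(zip(sorted(token2id.values()), range(len(token2id))))
print(idmap)                                       # {0: 0, 3: 1, 7: 2}
print({t: idmap[i] for t, i in token2id.items()})  # {'cat': 1, 'dog': 0, 'eel': 2}
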
def extract_all_features():
    def get_group(block_id):
        group = groupby.get_group(block_id)
        return group

    pool = Pool(processes=56, initializer=init_process,
                initargs=(netatmo_groups, netatmo_anns))
    res = list(pool.imap(sleep_30_sec, xrange(56)))

    group_generator = imap(get_group, groups.keys()[:])
    feature_iterator = pool.imap(extract_features, group_generator)

    X, y, block_ids = [], [], []
    save_id = 0
    for block_id, features in izip(groups.keys()[:], tqdm(feature_iterator)):
        group = groupby.get_group(block_id)
        X.append(features)
        y.append(group.iloc[0]['rain'])
        block_ids.append(block_id + (group.iloc[0]["hours_since"], ))

    X = pd.DataFrame(X)
    y = np.array(y)
    block_ids = pd.DataFrame(
        block_ids,
        columns=["city_code", "sq_x", "sq_y", "hour_hash", "hours_since"])
    return X, y, block_ids

def __init__(self, fname, id2word=None, metadata=True):
    self.metadata = metadata
    MmCorpus.__init__(self, fname=fname)
    self.doc_metadata = {}
    self.metadata = metadata

    if not id2word:
        # build a list of all word types in the corpus (distinct words)
        logger.info("extracting vocabulary from the corpus")
        all_terms = set()
        self.use_wordids = False  # return documents as (word, wordCount) 2-tuples
        for doc in self:
            all_terms.update(word for word, wordCnt in doc)
        all_terms = sorted(all_terms)  # sort the list of all words; rank in that list = word's integer id
        # build a mapping of word id(int) -> word (string)
        self.id2word = dict(izip(xrange(len(all_terms)), all_terms))
    else:
        logger.info("using provided word mapping (%i ids)", len(id2word))
        self.id2word = id2word

    if metadata:
        self.doc_with_meta(fname)

    self.doc_id_to_postgres_id = {}
    self.postgres_id_to_doc_id = {}
    self.__build_relation_dictionaries()

def return_docs(self, return_doc_cb):
    """Return the changed documents and their last change generation
    repeatedly invoking the callback return_doc_cb.

    The final step of a sync exchange.

    :param: return_doc_cb(doc, gen, trans_id): is a callback
            used to return the documents with their last change generation
            to the target replica.
    :return: None
    """
    changes_to_return = self.changes_to_return
    # return docs, including conflicts
    changed_doc_ids = [doc_id for doc_id, _, _ in changes_to_return]
    self._trace('before get_docs')
    docs = self._db.get_docs(
        changed_doc_ids, check_for_conflicts=False, include_deleted=True)

    docs_by_gen = izip(
        docs,
        (gen for _, gen, _ in changes_to_return),
        (trans_id for _, _, trans_id in changes_to_return))
    _outgoing_trace = []  # for tests
    for doc, gen, trans_id in docs_by_gen:
        return_doc_cb(doc, gen, trans_id)
        _outgoing_trace.append((doc.doc_id, doc.rev))
    # for tests
    self._db._last_exchange_log['return'] = {
        'docs': _outgoing_trace,
        'last_gen': self.new_gen}

def get_gradients(self, model, data, **kwargs):
    """
    Provides the gradients of the cost function with respect to the model
    parameters.

    These are not necessarily those obtained by theano.tensor.grad
    --you may wish to use approximate or even intentionally incorrect
    gradients in some cases.

    Parameters
    ----------
    model : a pylearn2 Model instance
    data : a batch in cost.get_data_specs() form
    kwargs : dict
        Optional extra arguments, not used by the base class.

    Returns
    -------
    gradients : OrderedDict
        a dictionary mapping from the model's parameters
        to their gradients
        The default implementation is to compute the gradients
        using T.grad applied to the value returned by expr.
        However, subclasses may return other values for the gradient.
        For example, an intractable cost may return a sampling-based
        approximation to its gradient.
    updates : OrderedDict
        a dictionary mapping shared variables to updates that must
        be applied to them each time these gradients are computed.
        This is to facilitate computation of sampling-based approximate
        gradients.
        The parameters should never appear in the updates dictionary.
        This would imply that computing their gradient changes
        their value, thus making the gradient value outdated.
    """

    try:
        cost = self.expr(model=model, data=data, **kwargs)
    except TypeError:
        # If anybody knows how to add type(self) to the exception message
        # but still preserve the stack trace, please do so
        # The current code does neither
        message = "Error while calling " + str(type(self)) + ".expr"
        reraise_as(TypeError(message))

    if cost is None:
        raise NotImplementedError(str(type(self)) +
                                  " represents an intractable cost and "
                                  "does not provide a gradient "
                                  "approximation scheme.")

    params = list(model.get_params())

    grads = T.grad(cost, params, disconnected_inputs='ignore')

    gradients = OrderedDict(izip(params, grads))

    updates = OrderedDict()

    return gradients, updates

def set_arrays(filename, args, fields=None, ascii=True, clobber=False):

    if os.path.isfile(filename) and not clobber:
        raise IOErr("filefound", filename)

    if not numpy.iterable(args) or len(args) == 0:
        raise IOErr('noarrayswrite')

    if not numpy.iterable(args[0]):
        raise IOErr('noarrayswrite')

    size = len(args[0])
    for arg in args:
        if not numpy.iterable(arg):
            raise IOErr('noarrayswrite')
        elif len(arg) != size:
            raise IOErr('arraysnoteq')

    if ascii and '[' not in filename and ']' not in filename:
        filename += "[opt kernel=text/simple]"

    tbl = pycrates.TABLECrate()

    if fields is None:
        fields = ['col%i' % (ii + 1) for ii in range(len(args))]

    if len(args) != len(fields):
        raise IOErr('toomanycols', str(len(fields)), str(len(args)))

    for val, name in izip(args, fields):
        _set_column(tbl, name, val)

    pycrates.write_file(tbl, filename, clobber=True)
    close_crate_dataset(tbl.get_dataset())

def lazy_load_trees(skeleton_ids, node_properties):
    """ Return a lazy collection of pairs of (long, DiGraph)
    representing (skeleton_id, tree).
    The node_properties is a list of strings, each being a name of a column
    in the django model of the Treenode table that is not the treenode id,
    parent_id or skeleton_id. """

    values_list = ('id', 'parent_id', 'skeleton_id')
    props = tuple(set(node_properties) - set(values_list))
    values_list += props

    ts = Treenode.objects.filter(skeleton__in=skeleton_ids) \
            .order_by('skeleton') \
            .values_list(*values_list)
    skid = None
    tree = None
    for t in ts:
        if t[2] != skid:
            if tree:
                yield (skid, tree)
            # Prepare for the next one
            skid = t[2]
            tree = DiGraph()
        fields = {k: v for k, v in izip(props, islice(t, 3, 3 + len(props)))}
        tree.add_node(t[0], fields)
        if t[1]:
            # From child to parent
            tree.add_edge(t[0], t[1])
    if tree:
        yield (skid, tree)

def _extract_content(self, extraction_page, start_index, end_index,
                     ignored_regions=None, **kwargs):
    """extract content between annotation indexes"""
    if ignored_regions and (
            _int_cmp(start_index, 'le', ignored_regions[0].start_index) and
            _int_cmp(end_index, 'ge', ignored_regions[-1].end_index)):
        starts = [start_index] + [i.end_index for i in ignored_regions
                                  if i.end_index is not None]
        ends = [i.start_index for i in ignored_regions]
        if starts[-1] is not None:
            ends.append(end_index)
        included_regions = izip(starts, ends)
        if ends[0] is None:
            next(included_regions)
        regions = starmap(extraction_page.htmlpage_region_inside,
                          included_regions)
        region = FragmentedHtmlPageRegion(extraction_page.htmlpage,
                                          list(regions))
    else:
        region = extraction_page.htmlpage_region_inside(start_index, end_index)
    validated = self.content_validate(region)
    return [(self.annotation.surrounds_attribute, validated)] if validated else []

def multiupdate_metadata(self, keys, metadatas):
    """ Update the metadata for a collection of keys.

    Where supported by an implementation, this should perform the whole
    collection of sets as a single transaction.

    Like zip(), if keys and metadatas have different lengths, then any excess
    values in the longer list are silently ignored.

    Parameters
    ----------
    keys : iterable of strings
        The keys for the resources in the key-value store.  Each key is a
        unique identifier for a resource within the key-value store.
    metadatas : iterable of dicts
        An iterator that provides the metadata dictionaries for the
        corresponding keys.

    Events
    ------
    StoreSetEvent :
        On successful completion of a transaction, a StoreSetEvent should be
        emitted with the key & metadata for each key that was set.

    """
    with self.transaction('Updating metadata for ' +
                          ', '.join('"%s"' % key for key in keys)):
        for key, metadata in izip(keys, metadatas):
            self.update_metadata(key, metadata)

def par_at_boundary(low, val, high, tol):
    for par_min, par_val, par_max in izip(low, val, high):
        if sao_fcmp(par_val, par_min, tol) == 0:
            return True
        if sao_fcmp(par_val, par_max, tol) == 0:
            return True
    return False

def build_stacked_ae(nvis, nhids, act_enc, act_dec,
                     tied_weights=False, irange=1e-3, rng=None,
                     corruptor=None, contracting=False):
    """
    .. todo::

        WRITEME properly

    Allocate a stack of autoencoders.
    """
    rng = make_np_rng(rng, which_method='randn')
    layers = []
    final = {}
    # "Broadcast" arguments if they are singular, or accept sequences if
    # they are the same length as nhids
    for c in ['corruptor', 'contracting', 'act_enc', 'act_dec',
              'tied_weights', 'irange']:
        if type(locals()[c]) is not str and hasattr(locals()[c], '__len__'):
            assert len(nhids) == len(locals()[c])
            final[c] = locals()[c]
        else:
            final[c] = [locals()[c]] * len(nhids)

    # The number of visible units in each layer is the initial input
    # size and the first k-1 hidden unit sizes.
    nviss = [nvis] + nhids[:-1]
    seq = izip(nhids, nviss,
               final['act_enc'],
               final['act_dec'],
               final['corruptor'],
               final['contracting'],
               final['tied_weights'],
               final['irange'],
               )
    # Create each layer.
    for (nhid, nvis, act_enc, act_dec, corr, cae, tied, ir) in seq:
        args = (nvis, nhid, act_enc, act_dec, tied, ir, rng)
        if cae and corr is not None:
            raise ValueError("Can't specify denoising and contracting "
                             "objectives simultaneously")
        elif cae:
            autoenc = ContractiveAutoencoder(*args)
        elif corr is not None:
            autoenc = DenoisingAutoencoder(corr, *args)
        else:
            autoenc = Autoencoder(*args)
        layers.append(autoenc)

    # Create the stack
    return StackedBlocks(layers)

def calc(self, p, x, xhi=None, *args, **kwargs):
    pha = self.pha

    # TODO: this should probably include AREASCAL

    user_grid = False
    try:
        if self._check_for_user_grid(x, xhi):
            user_grid = True
            self._startup_user_grid(x, xhi)

        # Slow
        if self.table is None:
            # again, fit() never comes in here b/c it calls startup()
            src = self.source
            vals = []
            for model, args in izip(self.models, self.grid):
                elo, ehi = lo, hi = args
                if pha.units == 'wavelength':
                    lo = DataPHA._hc / ehi
                    hi = DataPHA._hc / elo
                vals.append(model(src(lo, hi)))
            self.orders = vals

        # Fast
        else:
            xlo, xhi = self.elo, self.ehi
            if pha.units == 'wavelength':
                xlo, xhi = self.lo, self.hi

            src = self.source(xlo, xhi)  # hi-res grid of all ARF grids

            # Fold summed intervals through the associated response.
            self.orders = \
                [model(sum_intervals(src, interval[0], interval[1]))
                 for model, interval in izip(self.models, self.table)]

        vals = sum(self.orders)
        if self.mask is not None:
            vals = vals[self.mask]

    finally:
        if user_grid:
            self._teardown_user_grid()

    return vals

def calc(self, p, arglist):
    vals = []
    for model, args in izip(self.models, arglist):
        # FIXME: we're not using p here (and therefore assuming that the
        # parameter values have already been updated to match the contents
        # of p)
        vals.append(model(*args))
    return sum(vals)

def populate_connectors(chunkIDs, chunks, cs, connectors):
    # Build up edges via the connectors
    for c in cs:
        # c is (treenode_id, connector_id, relation_id, confidence)
        for chunkID, chunk in izip(chunkIDs, chunks):
            if c[0] in chunk:
                connectors[c[1]][c[2]].append((chunkID, c[3]))
                break

def __str__(self):
    """
    Return a listing of the attributes listed in self._fields and,
    if present, self._extra_fields.
    """
    fields = self._fields + getattr(self, '_extra_fields', ())
    fdict = dict(izip(fields, [getattr(self, f) for f in fields]))
    return print_fields(fields, fdict)

def BulkUpdateRecord(self, table_id, row_ids, columns):
    table_data = self.all_tables[table_id]
    rowid_map = {r: i for i, r in enumerate(table_data.row_ids)}
    table_indices = [rowid_map[r] for r in row_ids]
    for col, values in six.iteritems(columns):
        if col in table_data.columns:
            col_values = table_data.columns[col]
            for i, v in izip(table_indices, values):
                col_values[i] = v

def __new__(cls, *seq):
    if len(seq) != length:
        raise TypeError('Length mismatch')
    for i, j in izip(seq, cls._fields):
        if not isinstance(i, Number):
            raise TypeError(j + ' is not a Number')
    return baseClass.__new__(cls, *seq)

def _yield_text_from_framed_data(framed_data, parse=lambda x: x):
    parts = [parse(x) for x in framed_data.split(BOUNDARY)]
    for text_length, text in izip(parts[1::2], parts[2::2]):
        if text_length != str(len(text)):
            warning = 'invalid declared length=%s for packet_text=%s' % (
                text_length, text)
            _log.warn('[packet error] %s', warning)
            continue
        yield text

def calc(self, p, x, xhi=None, *args, **kwargs):
    pha = self.pha
    user_grid = False
    try:
        if self._check_for_user_grid(x, xhi):
            user_grid = True
            self._startup_user_grid(x, xhi)

        # Slow
        if self.table is None:
            # again, fit() never comes in here b/c it calls startup()
            src = self.source
            vals = []
            for model, args in izip(self.models, self.grid):
                elo, ehi = lo, hi = args
                if pha.units == 'wavelength':
                    lo = DataPHA._hc / ehi
                    hi = DataPHA._hc / elo
                vals.append(model(src(lo, hi)))
            self.orders = vals

        # Fast
        else:
            xlo, xhi = self.elo, self.ehi
            if pha.units == 'wavelength':
                xlo, xhi = self.lo, self.hi

            src = self.source(xlo, xhi)  # hi-res grid of all ARF grids

            # Fold summed intervals through the associated response.
            self.orders = \
                [model(sum_intervals(src, interval[0], interval[1]))
                 for model, interval in izip(self.models, self.table)]

        vals = sum(self.orders)
        if self.mask is not None:
            vals = vals[self.mask]

    finally:
        if user_grid:
            self._teardown_user_grid()

    return vals

def loadBulk(self, oids):
    """
    Storage API to return multiple objects.
    We load a unique set of them, just in case.

    :param list oids: Iterable oids to load at once
    :return: Loaded oid objects
    :rtype: list
    """
    # First, try to get whatever possible from cache
    self._load_lock.acquire()
    try:
        self._lock.acquire()  # for atomic processing of invalidations
        try:
            result = []
            for oid in oids:
                out = self._cache.load(oid)
                if not out:
                    self._load_oids[oid] = 1
                else:
                    result.append(out)
        finally:
            self._lock.release()

        if len(self._load_oids) == 0:
            return result

        # If we ever get here, we need to load some more stuff
        # self._load_oids dictionary is protected by self._load_lock

        if self._server is None:
            raise ClientDisconnected()

        load_oids = list(self._load_oids.keys())
        # [(data, tid), (data, tid), ...]
        bulk_data = self._server.rpc.call("loadBulk", load_oids)

        data_size = 0
        for oid, (data, tid) in izip(load_oids, bulk_data):
            data_size += len(data)
            self._lock.acquire()  # for atomic processing of invalidations
            try:
                if self._load_oids[oid]:
                    # Update cache only when there was no invalidation
                    self._cache.store(oid, tid, None, data)
                del self._load_oids[oid]
                result.append((data, tid))  # XXX shouldn't we provide a recent value from cache then?
            finally:
                self._lock.release()

        logging.debug("Bulk-loaded {0} objects of size {1}".format(
            len(load_oids), data_size))
    finally:
        self._load_lock.release()

    return result

def _histogram(xlo, xhi, y, yerr=None, title=None, xlabel=None, ylabel=None,
               overplot=False, clearwindow=True,
               yerrorbars=False,
               errstyle=None,
               errcolor=None,
               errthickness=None,
               fillcolor=None,
               fillopacity=None,
               fillstyle=None,
               xlog=False,
               ylog=False,
               linestyle=chips.chips_solid,
               linecolor=None,
               linethickness=None,
               symbolangle=None,
               symbolcolor=None,
               symbolfill=None,
               symbolsize=None,
               symbolstyle=chips.chips_none):

    if (not overplot) and clearwindow:
        _clear_window()

    if yerrorbars and yerr is not None:
        chips.add_histogram(xlo, xhi, y, yerr)
    else:
        chips.add_histogram(xlo, xhi, y)

    for var in ('errstyle', 'errcolor', 'errthickness',
                'fillcolor', 'fillopacity', 'fillstyle',
                'linestyle', 'linecolor', 'linethickness',
                'symbolangle', 'symbolcolor', 'symbolfill',
                'symbolsize', 'symbolstyle'):
        val = locals()[var]
        if val is not None:
            if 'color' in var:
                val = _check_hex_color(val)
            getattr(chips.advanced, 'set_histogram_' + var)(val)

    if not overplot:
        for log_axis, axis_id in izip((xlog, ylog),
                                      (chips.X_AXIS, chips.Y_AXIS)):
            if log_axis:
                chips.log_scale(axis_id)
            else:
                chips.linear_scale(axis_id)

    if title:
        ttl = title.replace('_', '\\_')
        chips.set_plot_title(ttl)
    if xlabel:
        xlbl = xlabel.replace('_', '\\_')
        chips.set_plot_xlabel(xlbl)
    if ylabel:
        ylbl = ylabel.replace('_', '\\_')
        chips.set_plot_ylabel(ylbl)

def bigrams(seq):
    """
    Yields bigrams from the given sequence.

    >>> list(bigrams(range(4)))
    [(0, 1), (1, 2), (2, 3)]
    """
    first, second = tee(seq, 2)
    second = islice(second, 1, None)
    return izip(first, second)

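# Hedged usage sketch: the tee()/islice() pairing lets bigrams() accept one-shot
# iterators (e.g. generators), not just re-iterable sequences. Assumes tee,
# islice and izip are imported as in the function above.
words = iter("the quick brown fox".split())
print(list(bigrams(words)))
# [('the', 'quick'), ('quick', 'brown'), ('brown', 'fox')]
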
def compactify(self):
    """Assign new word ids to all words, shrinking gaps."""
    # build mapping from old id -> new id
    idmap = dict(izip(sorted(self.token2id.values()),
                      xrange(len(self.token2id))))

    # reassign mappings to new ids
    self.token2id = {token: idmap[tokenid]
                     for token, tokenid in self.token2id.items()}
    self.id2token = {}
    self.word_freq = {idmap[tokenid]: freq
                      for tokenid, freq in self.word_freq.items()}

def _contour(x0, x1, y, levels=None, title=None, xlabel=None, ylabel=None,
             overcontour=False, clearwindow=True,
             xlog=False,
             ylog=False,
             style=None,
             color=None,
             thickness=None,
             axis_pad=0.05):

    if (not overcontour) and clearwindow:
        _clear_window()

    # Catch NANs before sending to ChIPS
    bad = list(numpy.where(numpy.isnan(y) == True)).pop(0)
    bad_vals = numpy.array(y[bad])
    y[bad] = 0.0

    if levels is None:
        chips.add_contour(x0, x1, y)
    else:
        levels = numpy.asarray(levels, numpy.float_)
        chips.add_contour(x0, x1, y, levels)

    y[bad] = bad_vals

    for var in ('style', 'color', 'thickness'):
        val = locals()[var]
        if val is not None:
            if 'color' in var:
                val = _check_hex_color(val)
            getattr(chips.advanced, 'set_contour_' + var)(val)

    chips.advanced.set_axis_pad(axis_pad)

    chips.set_data_aspect_ratio()
    chips.limits(chips.X_AXIS, x0.min(), x0.max())
    chips.limits(chips.Y_AXIS, x1.min(), x1.max())

    if not overcontour:
        for log_axis, axis_id in izip((xlog, ylog),
                                      (chips.X_AXIS, chips.Y_AXIS)):
            if log_axis:
                chips.log_scale(axis_id)
            else:
                chips.linear_scale(axis_id)

    if title:
        ttl = title.replace('_', '\\_')
        chips.set_plot_title(ttl)
    if xlabel:
        xlbl = xlabel.replace('_', '\\_')
        chips.set_plot_xlabel(xlbl)
    if ylabel:
        ylbl = ylabel.replace('_', '\\_')
        chips.set_plot_ylabel(ylbl)

def __iter__(self):
    """
    Yields
    ------
    list of (int, float)
        Document in BoW format.

    """
    for indprev, indnow in izip(self.sparse.indptr, self.sparse.indptr[1:]):
        yield list(zip(self.sparse.indices[indprev:indnow],
                       self.sparse.data[indprev:indnow]))

def compactify(self):
    """Assign new word ids to all words, shrinking gaps."""
    logger.debug("rebuilding dictionary, shrinking gaps")

    # build mapping from old id -> new id
    idmap = dict(izip(sorted(itervalues(self.token2id)),
                      xrange(len(self.token2id))))

    # reassign mappings to new ids
    self.token2id = {token: idmap[tokenid]
                     for token, tokenid in iteritems(self.token2id)}
    self.id2token = {}
    self.dfs = {idmap[tokenid]: freq
                for tokenid, freq in iteritems(self.dfs)}

def __getitem__(self, query):
    """Get similarities of document `query` to all documents in the corpus.

    **or**

    If `query` is a corpus (iterable of documents), return a matrix of
    similarities of all query documents vs. all corpus documents. This batch
    query is more efficient than computing the similarities one document after
    another.
    """
    self.close_shard()  # no-op if no documents added to index since last query

    # reset num_best and normalize parameters, in case they were changed dynamically
    for shard in self.shards:
        shard.num_best = self.num_best
        shard.normalize = self.norm

    # there are 4 distinct code paths, depending on whether input `query` is
    # a corpus (or numpy/scipy matrix) or a single document, and whether the
    # similarity result should be a full array or only num_best most similar
    # documents.
    pool, shard_results = self.query_shards(query)
    if self.num_best is None:
        # user asked for all documents => just stack the sub-results into a single matrix
        # (works for both corpus / single doc query)
        result = numpy.hstack(shard_results)
    else:
        # the following uses a lot of lazy evaluation and (optionally) parallel
        # processing, to improve query latency and minimize memory footprint.
        offsets = numpy.cumsum([0] + [len(shard) for shard in self.shards])
        convert = lambda doc, shard_no: [(doc_index + offsets[shard_no], sim)
                                         for doc_index, sim in doc]

        is_corpus, query = utils.is_corpus(query)
        is_corpus = is_corpus or hasattr(query, 'ndim') and query.ndim > 1 and query.shape[0] > 1
        if not is_corpus:
            # user asked for num_best most similar and query is a single doc
            results = (convert(result, shard_no)
                       for shard_no, result in enumerate(shard_results))
            result = heapq.nlargest(self.num_best, itertools.chain(*results),
                                    key=lambda item: item[1])
        else:
            # the trickiest combination: returning num_best results when query was a corpus
            results = []
            for shard_no, result in enumerate(shard_results):
                shard_result = [convert(doc, shard_no) for doc in result]
                results.append(shard_result)
            result = []
            for parts in izip(*results):
                merged = heapq.nlargest(self.num_best, itertools.chain(*parts),
                                        key=lambda item: item[1])
                result.append(merged)
    if pool:
        # gc doesn't seem to collect the Pools, eventually leading to
        # "IOError 24: too many open files". so let's terminate it manually.
        pool.terminate()

    return result

def pbkdf2_bin(data, salt, iterations=DEFAULT_PBKDF2_ITERATIONS,
               keylen=None, hashfunc=None):
    """Returns a binary digest for the PBKDF2 hash algorithm of `data`
    with the given `salt`. It iterates `iterations` times and produces a
    key of `keylen` bytes. By default, SHA-1 is used as hash function;
    a different hashlib `hashfunc` can be provided.

    .. versionadded:: 0.9

    :param data: the data to derive.
    :param salt: the salt for the derivation.
    :param iterations: the number of iterations.
    :param keylen: the length of the resulting key.  If not provided
                   the digest size will be used.
    :param hashfunc: the hash function to use.  This can either be the
                     string name of a known hash function or a function
                     from the hashlib module.  Defaults to sha1.
    """
    if isinstance(hashfunc, string_types):
        hashfunc = _hash_funcs[hashfunc]
    elif not hashfunc:
        hashfunc = hashlib.sha1

    data = to_bytes(data)
    salt = to_bytes(salt)

    # If we're on Python with pbkdf2_hmac we can try to use it for
    # compatible digests.
    if _has_native_pbkdf2:
        _test_hash = hashfunc()
        if hasattr(_test_hash, 'name') and \
           _test_hash.name in _hash_funcs:
            return hashlib.pbkdf2_hmac(_test_hash.name,
                                       data, salt, iterations, keylen)

    mac = hmac.HMAC(data, None, hashfunc)
    if not keylen:
        keylen = mac.digest_size

    def _pseudorandom(x, mac=mac):
        h = mac.copy()
        h.update(x)
        return bytearray(h.digest())

    buf = bytearray()
    for block in range(1, -(-keylen // mac.digest_size) + 1):
        rv = u = _pseudorandom(salt + _pack_int(block))
        for i in range_type(iterations - 1):
            u = _pseudorandom(bytes(u))
            rv = bytearray(starmap(xor, izip(rv, u)))
        buf.extend(rv)
    return bytes(buf[:keylen])

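# Hedged usage sketch: derive a 32-byte key with SHA-256 and a random salt.
# Assumes pbkdf2_bin and its module-level helpers above are importable;
# hashlib, os and binascii are imported here only for the demo.
import binascii
import hashlib
import os

salt = os.urandom(16)
key = pbkdf2_bin('correct horse battery staple', salt,
                 iterations=100000, keylen=32, hashfunc=hashlib.sha256)
print(binascii.hexlify(key))
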
def parallel_est(estfunc, limit_parnums, pars, numcores=_ncpus):

    tasks = []

    def worker(out_q, err_q, parids, parnums, parvals, lock):
        results = []
        for parid, singleparnum in izip(parids, parnums):
            try:
                result = estfunc(parid, singleparnum, lock)
                results.append((parid, result))
            except EstNewMin:
                # catch the EstNewMin exception and include the exception
                # class and the modified parameter values to the error queue.
                # These modified parvals determine the new lower statistic.
                # The exception class will be instantiated and re-raised with
                # the parameter values attached.  C++ Python exceptions are
                # not picklable for use in the queue.
                err_q.put(EstNewMin(parvals))
                return
            except Exception as e:
                #err_q.put( e.__class__() )
                err_q.put(e)
                return
        out_q.put(results)

    # The multiprocessing manager provides references to process-safe
    # shared objects like Queue and Lock
    manager = multiprocessing.Manager()
    out_q = manager.Queue()
    err_q = manager.Queue()
    lock = manager.Lock()

    size = len(limit_parnums)
    parids = numpy.arange(size)

    # if len(limit_parnums) is less than numcores, only use length number of
    # processes
    if size < numcores:
        numcores = size

    # group limit_parnums into numcores-worth of chunks
    limit_parnums = numpy.array_split(limit_parnums, numcores)
    parids = numpy.array_split(parids, numcores)

    tasks = [multiprocessing.Process(target=worker,
                                     args=(out_q, err_q, parid, parnum, pars, lock))
             for parid, parnum in izip(parids, limit_parnums)]

    return run_tasks(tasks, out_q, err_q, size)

def query(self, queryobj=None, skip=None, limit=None, prefetch=True, **kw):
    """
    Smart proxy to the catalog's query.
    One can add <field=...> keyword arguments to make queries where fields
    are equal to specified values.

    :param zerodb.catalog.query.Query queryobj: Query object combining all
        sorts of logical, range queries etc
    :param int skip: Offset to start the result iteration from
    :param int limit: Limit number of results to this
    """
    # Catalog's query returns only integers
    # We must be smart here and return objects
    # But no, we must be even smarter and batch-preload objects
    # Most difficult part is preloading TreeSets for index when needed
    # (when we do complex queries which require composite index)
    # We also probably should do something like lazy query(...)[ofs:...]
    # if no limit, skip are used

    # Work needed on skip and limit because zope didn't well support them...
    skip = skip or 0
    if limit:
        kw["limit"] = skip + limit

    eq_args = []
    for k in list(kw.keys()):
        if k not in set(["sort_index", "sort_type", "reverse", "names", "limit"]):
            eq_args.append(Eq(k, kw.pop(k)))

    if queryobj:
        Q = optimize(optimize(queryobj) & And(*eq_args))
    else:
        Q = And(*eq_args)

    q = lambda: self._catalog.query(Q, **kw)

    if limit:
        _, q = q()
        # XXX islice -> [:]
        qids = list(itertools.islice(q, skip, skip + limit))
        objects = [self._objects[uid] for uid in qids]
        if objects and prefetch:
            self._db._connection.prefetch(objects)
        for obj, uid in izip(objects, qids):
            obj._p_uid = uid
        return objects
    else:
        db_list = DBListPrefetch if prefetch else DBList
        return db_list(q, self)
