def _batch_import_urls(cls, tweets):
    with_urls = [t for t in tweets if 'entities' in t and 'urls' in t['entities']]
    urls = list(chain(*[t['entities']['urls'] for t in with_urls]))
    logging.debug('URLs: {0} in {1} tweets'.format(len(urls), len(with_urls)))
    cls._save_urls(urls)
    url_map = dict(map(lambda x: (x.url, x),
                       Url.objects.filter(url__in=pluck('expanded_url', urls))))

    with_media_urls = [t for t in tweets
                       if 'entities' in t and 'media' in t['entities'] and t['entities']['media']]
    media_urls = list(chain(*[t['entities']['media'] for t in with_media_urls]))
    logging.debug('Media URLs: {0} in {1} tweets'.format(len(media_urls), len(with_media_urls)))
    cls._save_media_urls(media_urls)
    media_url_map = dict(map(lambda x: (x.internal_id, x),
                             Media.objects.filter(internal_id__in=pluck('id_str', media_urls))))

    tweet_map = Tweet.objects.in_bulk(pluck('__pk__', tweets))
    logging.debug('URLs saved: {0}'.format(len(url_map)))
    logging.debug('Media URLs saved: {0}'.format(len(media_url_map)))

    for tweet in tweets:
        tweet_urls = [url_map[url['expanded_url']]
                      for url in tweet['entities']['urls']
                      if url['expanded_url'] in url_map]
        if tweet_urls:
            tweet_map[tweet['__pk__']].links = tweet_urls

        if 'media' in tweet['entities'] and tweet['entities']['media']:
            logging.debug(tweet['entities']['media'])
            tweet_media_urls = [media_url_map[media_url['id_str']]
                                for media_url in tweet['entities']['media']
                                if media_url['id_str'] in media_url_map]
            if tweet_media_urls:
                tweet_map[tweet['__pk__']].media = tweet_media_urls
def test_url(self) -> None:
    """Test the generated url and get the required parameters from the service."""
    try:
        resp = self.session.get(self.base_url, {"f": "json"}).json()
        try:
            self.units = resp["units"].replace("esri", "").lower()
        except KeyError:
            self.units = None
        self._max_nrecords = int(resp["maxRecordCount"])
        self.query_formats = resp["supportedQueryFormats"].replace(" ", "").lower().split(",")
        self.valid_fields = list(
            set(
                utils.traverse_json(resp, ["fields", "name"])
                + utils.traverse_json(resp, ["fields", "alias"])
                + ["*"]
            )
        )
        try:
            extent = resp["extent"] if "extent" in resp else resp["fullExtent"]
            bounds = (extent["xmin"], extent["ymin"], extent["xmax"], extent["ymax"])
            crs = extent["spatialReference"]["latestWkid"]
            self.extent = utils.MatchCRS.bounds(bounds, crs, DEF_CRS)
        except KeyError:
            self.extent = None
        try:
            self.feature_types = dict(
                zip(tlz.pluck("id", resp["types"]), tlz.pluck("name", resp["types"]))
            )
        except KeyError:
            self.feature_types = None
    except KeyError:
        raise ServerError(self.base_url)
def _batch_import_keywords(cls, tweets):
    kws = set()
    for kw in pluck('keywords', tweets):
        kws.update(kw)

    keyword_map = dict(map(lambda x: (x.name, x), Keyword.retrieve(kws)))
    tweet_map = Tweet.objects.in_bulk(pluck('__pk__', tweets))
    logging.debug('Keywords: {0} in {1} tweets'.format(len(keyword_map), len(tweet_map)))

    for tweet in tweets:
        tweet_kws = [keyword_map[k] for k in tweet['keywords']]
        tweet_map[tweet['__pk__']].keywords = tweet_kws
def get_files(self, item: str) -> Dict[str, Tuple[str, str]]:
    """Get all the available zip files in an item."""
    url = "https://www.sciencebase.gov/catalog/item"
    payload = {"fields": "files,downloadUri", "format": "json"}
    r = self.session.get(f"{url}/{item}", payload=payload).json()
    files_url = zip(tlz.pluck("name", r["files"]), tlz.pluck("url", r["files"]))
    # TODO: Add units
    meta = "".join(tlz.pluck("metadataHtmlViewUri", r["files"], default=""))
    return {
        f.replace("_CONUS.zip", ""): (u, meta)
        for f, u in files_url
        if ".zip" in f
    }
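# Small sketch (not from the source) of the pluck/zip pattern used above to
# turn a list of file records into a name -> URL mapping; the record below is
# invented for illustration, only toolz.pluck itself is the real API.
import toolz as tlz

items = [{"name": "soil_CONUS.zip", "url": "https://example.com/soil_CONUS.zip"}]
dict(zip(tlz.pluck("name", items), tlz.pluck("url", items)))
# {'soil_CONUS.zip': 'https://example.com/soil_CONUS.zip'}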
def _get_py(self, key):
    if isinstance(key, tuple):
        assert len(key) == 2
        rows, cols = key
        usecols = cols
        ds = self.dshape.subshape[rows, cols]
        usecols = None if isinstance(usecols, slice) else listpack(usecols)
    else:
        rows = key
        ds = self.dshape.subshape[rows]
        usecols = None

    if isinstance(ds, DataShape) and isdimension(ds[0]):
        ds = ds.subshape[0]

    seq = self._iter(usecols=usecols)
    if isinstance(key, tuple) and isinstance(cols, _strtypes + _inttypes):
        seq = pluck(0, seq)
    seq = coerce(ds, seq)

    if isinstance(rows, compatibility._inttypes):
        line = nth(rows, seq)
        try:
            return next(line).item()
        except TypeError:
            try:
                return line.item()
            except AttributeError:
                return line
    elif isinstance(rows, list):
        return nth_list(rows, seq)
    elif isinstance(rows, slice):
        return it.islice(seq, rows.start, rows.stop, rows.step)
    else:
        raise IndexError("key %r is not valid" % rows)
def get_stats(self, per_worker=False):
    individual_stats = _each(self.ctx, yappi.get_func_stats)
    if per_worker:
        return individual_stats
    else:
        stat, *rest = pluck(1, individual_stats)
        # merging adapted from _add_from_YSTAT
        for other in rest:
            for saved_stat in other:
                if saved_stat not in stat:
                    stat._idx_max += 1
                    saved_stat.index = stat._idx_max
                    stat.append(saved_stat)
            # fix children's index values
            for saved_stat in other:
                for saved_child_stat in saved_stat.children:
                    # we know for sure child's index is pointing to a valid stat in saved_stats
                    # so as saved_stat is already in sync. (in above loop), we can safely assume
                    # that we shall point to a valid stat in current_stats with the child's full_name
                    saved_child_stat.index = stat[saved_child_stat.full_name].index
            # merge stats
            for saved_stat in other:
                saved_stat_in_curr = stat[saved_stat.full_name]
                saved_stat_in_curr += saved_stat
        return stat
def _check_contents(self, dset, duplication=1):
    files = sorted(
        dset.icollect(),
        key=lambda file: int(basename(file[0]).replace('test_file_', '').replace('.tmp', '')))
    expected = b''.join(interleave([self.contents] * duplication))
    self.assertEqual(b''.join(pluck(1, files)), expected)
def call_function(func, func_token, args, kwargs, pure=None, nout=None):
    dask_key_name = kwargs.pop('dask_key_name', None)
    pure = kwargs.pop('pure', pure)

    if dask_key_name is None:
        name = '%s-%s' % (funcname(func),
                          tokenize(func_token, *args, pure=pure, **kwargs))
    else:
        name = dask_key_name

    dsk = sharedict.ShareDict()
    args_dasks = list(map(to_task_dask, args))
    for arg, d in args_dasks:
        if isinstance(d, sharedict.ShareDict):
            dsk.update_with_key(d)
        elif isinstance(arg, (str, tuple)):
            dsk.update_with_key(d, key=arg)
        else:
            dsk.update(d)

    args = tuple(pluck(0, args_dasks))

    if kwargs:
        dask_kwargs, dsk2 = to_task_dask(kwargs)
        dsk.update(dsk2)
        task = (apply, func, list(args), dask_kwargs)
    else:
        task = (func,) + args

    dsk.update_with_key({name: task}, key=name)
    nout = nout if nout is not None else None
    return Delayed(name, dsk, length=nout)
def batch_import(self, tweets, parse_json=False):
    logging.info('Received {0} tweets'.format(len(tweets)))
    tweets = list(filter(self.accept_tweet, map(self.pre_func, tweets)))
    logging.info('After Filtering: {0} tweets'.format(len(tweets)))

    user_fn = lambda x: pluck('user', x)
    tweet_fn = lambda x: x

    retweet_sources = [t['retweeted_status'] for t in tweets if 'retweeted_status' in t]
    quote_sources = [t['quoted_status'] for t in tweets if 'quoted_status' in t]

    imported_users = Importer._batch_import(User, tweets, user_fn)
    imported_users.extend(Importer._batch_import(User, retweet_sources, user_fn))
    imported_users.extend(Importer._batch_import(User, quote_sources, user_fn))

    imported = Importer._batch_import(Tweet, tweets, tweet_fn)
    imported.extend(Importer._batch_import(
        Tweet, [t['retweeted_status'] for t in tweets if 'retweeted_status' in t], tweet_fn))
    imported.extend(Importer._batch_import(
        Tweet, [t['quoted_status'] for t in tweets if 'quoted_status' in t], tweet_fn))

    Importer._batch_import_retweets(list(filter(lambda x: 'retweeted_status' in x, tweets)))
    Importer._batch_import_retweets(list(filter(lambda x: 'quoted_status' in x, tweets)),
                                    retweet_key='quoted_status')

    Importer._batch_import_keywords(list(filter(lambda x: x['__created__'], tweets)))
    Importer._batch_import_keywords(list(filter(lambda x: x['__created__'], retweet_sources)))
    Importer._batch_import_keywords(list(filter(lambda x: x['__created__'], quote_sources)))

    Importer._batch_import_urls(list(filter(lambda x: x['__created__'], tweets)))
    Importer._batch_import_urls(list(filter(lambda x: x['__created__'], retweet_sources)))
    Importer._batch_import_urls(list(filter(lambda x: x['__created__'], quote_sources)))

    self.post_func(imported_users, imported)
    logging.debug('Imported Tweets: {0}'.format(len(imported)))

    return imported_users, imported
def build_graph(estimator, cv, scorer, candidate_params, X, y=None, groups=None,
                fit_params=None, iid=True, refit=True, error_score='raise',
                return_train_score=True, cache_cv=True):

    X, y, groups = to_indexable(X, y, groups)
    cv = check_cv(cv, y, is_classifier(estimator))
    # "pairwise" estimators require a different graph for CV splitting
    is_pairwise = getattr(estimator, '_pairwise', False)

    dsk = {}
    X_name, y_name, groups_name = to_keys(dsk, X, y, groups)
    n_splits = compute_n_splits(cv, X, y, groups)

    if fit_params:
        # A mapping of {name: (name, graph-key)}
        param_values = to_indexable(*fit_params.values(), allow_scalars=True)
        fit_params = {k: (k, v) for (k, v) in
                      zip(fit_params, to_keys(dsk, *param_values))}
    else:
        fit_params = {}

    fields, tokens, params = normalize_params(candidate_params)
    main_token = tokenize(normalize_estimator(estimator), fields, params,
                          X_name, y_name, groups_name, fit_params, cv,
                          error_score == 'raise', return_train_score)

    cv_name = 'cv-split-' + main_token
    dsk[cv_name] = (cv_split, cv, X_name, y_name, groups_name,
                    is_pairwise, cache_cv)

    if iid:
        weights = 'cv-n-samples-' + main_token
        dsk[weights] = (cv_n_samples, cv_name)
    else:
        weights = None

    scores = do_fit_and_score(dsk, main_token, estimator, cv_name, fields,
                              tokens, params, X_name, y_name, fit_params,
                              n_splits, error_score, scorer,
                              return_train_score)

    cv_results = 'cv-results-' + main_token
    candidate_params_name = 'cv-parameters-' + main_token
    dsk[candidate_params_name] = (decompress_params, fields, params)
    dsk[cv_results] = (create_cv_results, scores, candidate_params_name,
                       n_splits, error_score, weights)
    keys = [cv_results]

    if refit:
        best_params = 'best-params-' + main_token
        dsk[best_params] = (get_best_params, candidate_params_name, cv_results)
        best_estimator = 'best-estimator-' + main_token
        if fit_params:
            fit_params = (dict, (zip, list(fit_params.keys()),
                                 list(pluck(1, fit_params.values()))))
        dsk[best_estimator] = (fit_best, clone(estimator), best_params,
                               X_name, y_name, fit_params)
        keys.append(best_estimator)

    return dsk, keys, n_splits
def _remote(self):
    contents = pluck(1, super()._remote())
    if self.dset.encoding is not None:
        contents = map(partial(_decode, self.dset.encoding, self.dset.errors), contents)
    lines = map(partial(_splitlines, True), contents)
    return chain.from_iterable(lines)
def take_snapshot(self, per_worker=False):
    snapshots = _each(self.ctx, tracemalloc.take_snapshot)
    if per_worker:
        return snapshots
    snapshot_merged, *snapshots = pluck(1, snapshots)
    traces_merged = snapshot_merged.traces._traces
    for s in snapshots:
        traces_merged.extend(s.traces._traces)
    return snapshot_merged
def _compress_letter_and_initabbr(words):
    if not words:
        return words

    result = []
    for word_len, words_grp in groupby(words, lambda x: len(x[0])):
        # compress single letter words
        if word_len == 1:
            result.append(("".join(pluck(0, words_grp)), None))
        else:
            result.extend(words_grp)

    if (result and 2 <= len(result[0]) <= len(result) - 1 and
            # remove if first word is an abbreviation of following words, e.g. "ABC Aa Bb Cc Company"
            all(char == word[0]
                for char, word in zip(result[0][0], pluck(0, result[1:])))):
        result = result[1:]
    return result
def toolz_max(self, col, rows):
    """
    Max of values for a given column in a list of dictionaries

    Args:
        col (str): Column to process
        rows (list): Records
    """
    return max(list(toolz.pluck(col, rows)))
def toolz_count(self, col, rows):
    """
    Count of unique values for a given column in a list of dictionaries

    Args:
        col (str): Column to process
        rows (list): Records
    """
    return len(set(list(toolz.pluck(col, rows))))
def _group_ids_by_index(index, tokens):
    id_groups = []

    def new_group():
        o = []
        id_groups.append(o)
        return o.append

    _id_groups = defaultdict(new_group)
    for n, t in enumerate(pluck(index, tokens)):
        _id_groups[t](n)
    return id_groups
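# Minimal usage sketch (not from the original source) showing how
# _group_ids_by_index buckets candidate indices by the token found at a
# given position; the token tuples below are made up for illustration.
tokens = [('a', 1), ('b', 1), ('a', 2)]
assert _group_ids_by_index(0, tokens) == [[0, 2], [1]]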
def increment_rt_counts(tweet_pks):
    """
    :param tweet_pks: dictionary {tweet_pk: rt_count}
    :return:
    """
    items = sorted(tweet_pks.items(), key=lambda x: x[1], reverse=True)
    grouped = groupby(lambda x: x[1], items)

    for incr, pairs in grouped.items():
        if incr > 0:
            pks = pluck(0, pairs)
            TweetFeatures.objects.filter(tweet_id__in=pks).update(count_rts=F('count_rts') + incr)
def pre_compute(expr, seq):
    try:
        if isinstance(seq, Iterator):
            first = next(seq)
            seq = concat([[first], seq])
        else:
            first = next(iter(seq))
    except StopIteration:
        return []
    if isinstance(first, dict):
        return pluck(expr.fields, seq)
    else:
        return seq
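# Minimal sketch of the dict fast path above: when the records are dicts,
# pluck(fields, seq) yields plain tuples in field order. The records here are
# made up for illustration.
from toolz import pluck

records = [{"x": 1, "y": 2}, {"x": 3, "y": 4}]
list(pluck(["x", "y"], records))  # [(1, 2), (3, 4)]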
def toolz_sum(self, col, rows, dtype=None):
    """
    Sum of values for a given column in a list of dictionaries

    Args:
        col (str): Column to process
        rows (list): Records
        dtype (type, optional): Cast each value before summing
    """
    values = list(toolz.pluck(col, rows))
    if dtype is not None:
        values = [dtype(v) for v in values]
    return sum(values)
def toolz_avg(self, col, rows):
    """
    Average of values for a given column in a list of dictionaries

    Args:
        col (str): Column to process
        rows (list): Records
    """
    count = len(rows)
    total = sum(list(toolz.pluck(col, rows)))
    return total / count if count > 0 else None
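# Hedged usage sketch for the pluck-based aggregation helpers above; `stats`
# stands in for whatever object defines them (hypothetical), and the records
# are made up -- only toolz.pluck itself is the real API.
import toolz

rows = [{"price": 3, "qty": 1}, {"price": 5, "qty": 2}]
list(toolz.pluck("price", rows))  # [3, 5]
# stats.toolz_max("price", rows)   -> 5
# stats.toolz_sum("price", rows)   -> 8
# stats.toolz_avg("price", rows)   -> 4.0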
def _batch_import_retweets(cls, tweets, retweet_key='retweeted_status'):
    rts = []
    user_map = User.objects.in_bulk(pluck('__pk__', pluck('user', tweets)))

    tweet_ids = list(pluck('__pk__', tweets))
    tweet_ids.extend(pluck('__pk__', pluck(retweet_key, tweets)))
    tweet_map = Tweet.objects.in_bulk(tweet_ids)

    rt_counts = defaultdict(int)
    for current in tweets:
        rts.append(ReTweet(user=user_map[current['user']['__pk__']],
                           tweet_instance=tweet_map[current['__pk__']],
                           source_tweet=tweet_map[current[retweet_key]['__pk__']],
                           datetime=tweet_map[current['__pk__']].datetime))
        rt_counts[current['__pk__']] += 1

    ReTweet.objects.bulk_create(rts)
    logging.debug('RT Increments: {0}'.format(rt_counts))
    increment_rt_counts(rt_counts)
def _into_iter_mongodb(l, coll, columns=None, schema=None):
    """ Into helper function

    Return both a lazy sequence of tuples and a list of column names
    """
    seq = coll.find()
    if not columns and schema:
        columns = schema[0].names
    elif not columns:
        item = next(seq)
        seq = concat([[item], seq])
        columns = sorted(item.keys())
        columns.remove('_id')
    return columns, pluck(columns, seq)
def pre_compute(expr, seq, scope=None, **kwargs):
    try:
        if isinstance(seq, Iterator):
            first = next(seq)
            seq = concat([[first], seq])
        else:
            first = next(iter(seq))
    except StopIteration:
        return []
    if isinstance(first, dict):
        leaf = expr._leaves()[0]
        return pluck(leaf.fields, seq)
    else:
        return seq
def normalize_params(params):
    """Take a list of dictionaries, and tokenize/normalize."""
    # Collect a set of all fields
    fields = set()
    for p in params:
        fields.update(p)
    fields = sorted(fields)

    params2 = list(pluck(fields, params, MISSING))
    # Non-basic types (including MISSING) are unique to their id
    tokens = [tuple(x if isinstance(x, (int, float, str)) else id(x) for x in p)
              for p in params2]

    return fields, tokens, params2
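# Illustrative sketch (names invented) of the pluck call above: with a list of
# fields and a default, toolz.pluck flattens each parameter dict into a tuple
# aligned with `fields`, filling gaps with the default (MISSING in the real code).
from toolz import pluck

params = [{"alpha": 1}, {"alpha": 2, "beta": 0.1}]
fields = sorted({k for p in params for k in p})  # ['alpha', 'beta']
list(pluck(fields, params, None))                # [(1, None), (2, 0.1)]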
def trans_iter():
    for kdnr, groups in groupby(map(dict, pluck(1, trans_dat.iterrows())),
                                key=itemgetter("kdnr")):
        aggs = [
            FrequencyTracker(varname, varfunc, ["timestamp"], 0)
            for varname, varfunc in [
                ("land", itemgetter("ref_land")),
                ("kanal", lambda x: str(x["UTU5_EINGABE_NAME"])[:5]),
            ]
        ] + [TimestampSameCustomer()]
        for row in groups:
            for agg in aggs:
                agg.add(row)
                row.update(agg.value)
            yield row
def _batch_import(base_class, cls, elements, fn):
    logging.debug('Trying to import {1} from {0} elements'.format(len(elements), cls))

    internal_ids = set(pluck('id_str', fn(elements)))
    existing_users = cls.objects.filter(internal_id__in=internal_ids)
    existing_ids = set([u.internal_id for u in existing_users])
    user_pks = dict([(u.internal_id, u.pk) for u in existing_users])
    new_ids = internal_ids - existing_ids

    logging.debug('Existing IDs: {0}'.format(len(existing_ids)))
    logging.debug('New IDs: {0}'.format(len(new_ids)))

    added_keys = set()
    new_elements = []

    for element in fn(elements):
        if element['id_str'] in user_pks:
            element['__pk__'] = user_pks[element['id_str']]
            element['__created__'] = False
        else:
            if not element['id_str'] in added_keys:
                user_model = cls()
                user_model.copy_json(valfilter(lambda x: x, element))
                new_elements.append(user_model)
            element['__created__'] = True
            element['__pk__'] = None
            added_keys.add(element['id_str'])

    cls.objects.bulk_create(new_elements)
    new_models = list(cls.objects.filter(internal_id__in=new_ids))
    logging.debug('New IDs created successfully: {0}'.format(len(new_models)))

    new_pks = dict([(u.internal_id, u.pk) for u in new_models])
    for element in fn(elements):
        if element['id_str'] in new_pks:
            element['__pk__'] = new_pks[element['id_str']]

    return new_models
def _do_fit_step(dsk, next_token, step, cv, fields, tokens, params, Xs, ys,
                 fit_params, n_splits, error_score, step_fields_lk,
                 fit_params_lk, field_to_index, step_name, none_passthrough,
                 is_transform):
    sub_fields, sub_inds = map(list, unzip(step_fields_lk[step_name], 2))
    sub_fit_params = fit_params_lk[step_name]

    if step_name in field_to_index:
        # The estimator may change each call
        new_fits = {}
        new_Xs = {}
        est_index = field_to_index[step_name]

        for ids in _group_ids_by_index(est_index, tokens):
            # Get the estimator for this subgroup
            sub_est = params[ids[0]][est_index]
            if sub_est is MISSING:
                sub_est = step

            # If an estimator is `None`, there's nothing to do
            if sub_est is None:
                nones = dict.fromkeys(ids, None)
                new_fits.update(nones)
                if is_transform:
                    if none_passthrough:
                        new_Xs.update(zip(ids, get(ids, Xs)))
                    else:
                        new_Xs.update(nones)
            else:
                # Extract the proper subset of Xs, ys
                sub_Xs = get(ids, Xs)
                sub_ys = get(ids, ys)
                # Only subset the parameters/tokens if necessary
                if sub_fields:
                    sub_tokens = list(pluck(sub_inds, get(ids, tokens)))
                    sub_params = list(pluck(sub_inds, get(ids, params)))
                else:
                    sub_tokens = sub_params = None

                if is_transform:
                    sub_fits, sub_Xs = do_fit_transform(dsk, next_token, sub_est,
                                                        cv, sub_fields, sub_tokens,
                                                        sub_params, sub_Xs, sub_ys,
                                                        sub_fit_params, n_splits,
                                                        error_score)
                    new_Xs.update(zip(ids, sub_Xs))
                    new_fits.update(zip(ids, sub_fits))
                else:
                    sub_fits = do_fit(dsk, next_token, sub_est, cv, sub_fields,
                                      sub_tokens, sub_params, sub_Xs, sub_ys,
                                      sub_fit_params, n_splits, error_score)
                    new_fits.update(zip(ids, sub_fits))

        # Extract lists of transformed Xs and fit steps
        all_ids = list(range(len(Xs)))
        if is_transform:
            Xs = get(all_ids, new_Xs)
        fits = get(all_ids, new_fits)
    elif step is None:
        # Nothing to do
        fits = [None] * len(Xs)
        if not none_passthrough:
            Xs = fits
    else:
        # Only subset the parameters/tokens if necessary
        if sub_fields:
            sub_tokens = list(pluck(sub_inds, tokens))
            sub_params = list(pluck(sub_inds, params))
        else:
            sub_tokens = sub_params = None

        if is_transform:
            fits, Xs = do_fit_transform(dsk, next_token, step, cv, sub_fields,
                                        sub_tokens, sub_params, Xs, ys,
                                        sub_fit_params, n_splits, error_score)
        else:
            fits = do_fit(dsk, next_token, step, cv, sub_fields, sub_tokens,
                          sub_params, Xs, ys, sub_fit_params, n_splits,
                          error_score)
    return (fits, Xs) if is_transform else (fits, None)
def _do_featureunion(dsk, next_token, est, cv, fields, tokens, params, Xs, ys,
                     fit_params, n_splits, error_score):
    if 'transformer_list' in fields:
        raise NotImplementedError("Setting FeatureUnion.transformer_list "
                                  "in a gridsearch")

    (field_to_index,
     step_fields_lk) = _group_subparams(est.transformer_list, fields,
                                        ignore=('transformer_weights'))
    fit_params_lk = _group_fit_params(est.transformer_list, fit_params)

    token = next_token(est)

    n_samples = _do_n_samples(dsk, token, Xs, n_splits)

    fit_steps = []
    tr_Xs = []
    for (step_name, step) in est.transformer_list:
        fits, out_Xs = _do_fit_step(dsk, next_token, step, cv, fields, tokens,
                                    params, Xs, ys, fit_params, n_splits,
                                    error_score, step_fields_lk, fit_params_lk,
                                    field_to_index, step_name, False, True)
        fit_steps.append(fits)
        tr_Xs.append(out_Xs)

    # Rebuild the FeatureUnions
    step_names = [n for n, _ in est.transformer_list]

    if 'transformer_weights' in field_to_index:
        index = field_to_index['transformer_weights']
        weight_lk = {}
        weight_tokens = list(pluck(index, tokens))
        for i, tok in enumerate(weight_tokens):
            if tok not in weight_lk:
                weights = params[i][index]
                if weights is MISSING:
                    weights = est.transformer_weights
                lk = weights or {}
                weight_list = [lk.get(n) for n in step_names]
                weight_lk[tok] = (weights, weight_list)
        weights = get(weight_tokens, weight_lk)
    else:
        lk = est.transformer_weights or {}
        weight_list = [lk.get(n) for n in step_names]
        weight_tokens = repeat(None)
        weights = repeat((est.transformer_weights, weight_list))

    out = []
    out_append = out.append
    fit_name = 'feature-union-' + token
    tr_name = 'feature-union-concat-' + token
    m = 0
    seen = {}
    for steps, Xs, wt, (w, wl), nsamp in zip(zip(*fit_steps), zip(*tr_Xs),
                                             weight_tokens, weights, n_samples):
        if (steps, wt) in seen:
            out_append(seen[steps, wt])
        else:
            for n in range(n_splits):
                dsk[(fit_name, m, n)] = (feature_union, step_names,
                                         [None if s is None else s + (n,)
                                          for s in steps], w)
                dsk[(tr_name, m, n)] = (feature_union_concat,
                                        [None if x is None else x + (n,)
                                         for x in Xs], nsamp + (n,), wl)
            seen[steps, wt] = m
            out_append(m)
            m += 1
    return [(fit_name, i) for i in out], [(tr_name, i) for i in out]
sns.violinplot(x='diff', y='cond', data=freq_diffs)

#%%
clip = np.percentile(freq_diffs['diff'], [2.5, 97.5])
for cond, data in freq_diffs.groupby('cond'):
    sns.kdeplot(data['diff'], clip=clip, label=cond)
plt.savefig('figures/word_freq_of_suggs.pdf')

#%%
import random

acceptability_frames = []
for group in cytoolz.partition_all(len(conditions), samples):
    context = group[0]['context']
    meta = list(cytoolz.pluck(
        ['review_idx', 'sent_idx', 'word_idx', 'true_follows'], group))
    assert len(set(meta)) == 1
    meta = list(meta[0])
    true_follows = meta.pop(-1)
    options = [('true', group[0]['true_follows'])]
    for sample in group:
        for sugg in sample['suggs'].split('\n')[:1]:
            options.append((sample['cond'], sugg))
    random.shuffle(options)
    acceptability_frames.append(
        dict(meta=meta, context=group[0]['context'], options=options))

import json
json.dump(acceptability_frames, open('acceptability_frames.json', 'w'))
def pluck(self, ind):
    if cytoolz.isiterable(ind):
        return self.__class__(itertools.imap(flist, cytoolz.pluck(ind, self)))
    else:
        return self.__class__(cytoolz.pluck(ind, self))
def stage_data(self) -> pd.DataFrame:
    """Stage the NHDPlus Attributes database and save to nhdplus_attrs.feather."""
    r = self.get_children(self.nhd_attr_item)

    titles = tlz.pluck("title", r["items"])
    titles = tlz.concat(tlz.map(tlz.partial(re.findall, "Select(.*?)Attributes"), titles))
    titles = tlz.map(str.strip, titles)

    main_items = dict(zip(titles, tlz.pluck("id", r["items"])))

    files = {}
    soil = main_items.pop("Soil")
    for i, item in main_items.items():
        r = self.get_children(item)

        titles = tlz.pluck("title", r["items"])
        titles = tlz.map(lambda s: s.split(":")[1].strip() if ":" in s else s, titles)

        child_items = dict(zip(titles, tlz.pluck("id", r["items"])))
        files[i] = {t: self.get_files(c) for t, c in child_items.items()}

    r = self.get_children(soil)

    titles = tlz.pluck("title", r["items"])
    titles = tlz.map(lambda s: s.split(":")[1].strip() if ":" in s else s, titles)

    child_items = dict(zip(titles, tlz.pluck("id", r["items"])))
    stat = child_items.pop("STATSGO Soil Characteristics")
    ssur = child_items.pop("SSURGO Soil Characteristics")
    files["Soil"] = {t: self.get_files(c) for t, c in child_items.items()}

    r = self.get_children(stat)
    titles = tlz.pluck("title", r["items"])
    titles = tlz.map(lambda s: s.split(":")[1].split(",")[1].strip(), titles)
    child_items = dict(zip(titles, tlz.pluck("id", r["items"])))
    files["STATSGO"] = {t: self.get_files(c) for t, c in child_items.items()}

    r = self.get_children(ssur)
    titles = tlz.pluck("title", r["items"])
    titles = tlz.map(lambda s: s.split(":")[1].strip(), titles)
    child_items = dict(zip(titles, tlz.pluck("id", r["items"])))
    files["SSURGO"] = {t: self.get_files(c) for t, c in child_items.items()}

    chars = []
    types = {"CAT": "local", "TOT": "upstream_acc", "ACC": "div_routing"}
    for t, dd in files.items():
        for d, fd in dd.items():
            for f, u in fd.items():
                chars.append({
                    "name": f,
                    "type": types.get(f[-3:], "other"),
                    "theme": t,
                    "description": d,
                    "url": u[0],
                    "meta": u[1],
                })
    char_df = pd.DataFrame(chars, dtype="category")
    char_df.to_feather(self.char_feather)
    return char_df
def setcompare(iter1, iter2):
    cntr1 = Counter(iter1)
    cntr2 = Counter(iter2)

    only1 = cntr1.keys() - cntr2.keys()
    only2 = cntr2.keys() - cntr1.keys()
    both = cntr1.keys() & cntr2.keys()

    cnt1 = sum(cntr1[key] for key in only1)
    cnt2 = sum(cntr2[key] for key in only2)
    cnt12a = sum(cntr1[key] for key in both)
    cnt12b = sum(cntr2[key] for key in both)

    distinct1 = len(only1)
    distinct2 = len(only2)
    distinct12 = len(both)

    cnt_perct = "{} ({:.0%})".format

    if hasattr(iter1, "name"):
        name1 = f"1 {iter1.name}"
    else:
        name1 = "1"
    if hasattr(iter2, "name"):
        name2 = f"2 {iter2.name}"
    else:
        name2 = "2"

    display_data = [
        ["", f"Set {name1} only", "Intersect.", f"Set {name2} only"],
        [
            "Count",
            cnt_perct(cnt1, cnt1 / (cnt1 + cnt12a)),
            "{} | {}".format(cnt12a, cnt12b),
            cnt_perct(cnt2, cnt2 / (cnt2 + cnt12b)),
        ],
        [
            "Distinct count",
            cnt_perct(distinct1, distinct1 / (distinct1 + distinct12)),
            distinct12,
            cnt_perct(distinct2, distinct2 / (distinct2 + distinct12)),
        ],
        [
            "Examples",
            format_tuples(
                pluck(0, Counter({key: cntr1[key] for key in only1}).most_common(5)),
                cntr1,
            ),
            format_tuples(
                pluck(0, Counter({key: cntr1[key] + cntr2[key] for key in both}).most_common(5)),
                cntr1,
                cntr2,
            ),
            format_tuples(
                pluck(0, Counter({key: cntr2[key] for key in only2}).most_common(5)),
                cntr2,
            ),
        ],
    ]

    make_table(display_data)
    table = apply_theme("basic_both")
    for x, y in product([0, 1, 2], [1, 2, 3]):
        set_cell_style(x, y, align="center")
    return table
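# Quick sketch of the "Examples" cells built above: Counter.most_common yields
# (key, count) pairs and pluck(0, ...) keeps only the keys. The data is made up.
from collections import Counter
from toolz import pluck

list(pluck(0, Counter([3, 3, 3, 5, 5, 7]).most_common(2)))  # [3, 5]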
def generate_timeline(time_range, save_to_db=True, tweets=None, skip_fields=None, size=40,
                      sideline_turns=5, time_bucket_size=30, location_depth=1, time_zone=None,
                      twitter_api=None, min_length=50, exclude_replies=True,
                      informational_only=False, exclude_retweets=False, post_update=True,
                      retweet_tweeps=True, skip_sensitive_content=True, retweeted_only=False,
                      post_to_twitter=False, update_users=True, shout_score=0.5,
                      n_candidates=None, target_entropy=0.99):
    if n_candidates is not None and n_candidates <= 0:
        n_candidates = None

    # generate filtered set
    characterizer = Characterizer()

    if not time_zone:
        time_zone = pytz.timezone(settings.TIME_ZONE)

    locations = list(Location.objects.filter(depth=location_depth))
    location_pks = [loc.pk for loc in locations]
    location_populations = dict([(loc.pk, loc.population) for loc in locations])

    if not time_zone:
        time_zone = pytz.timezone(settings.TIME_ZONE)

    if tweets is None:
        tweets = _timeline_queryset(time_range, location_depth=location_depth,
                                    min_length=min_length, exclude_replies=exclude_replies,
                                    informational_only=informational_only,
                                    exclude_retweets=exclude_retweets,
                                    skip_sensitive_content=skip_sensitive_content,
                                    retweeted_only=retweeted_only)

    feature_keys = ['pk', 'text', 'favourite_count', 'retweet_count', 'user__screen_name',
                    'datetime', 'user__name', 'user__profile_image_url', 'internal_id',
                    'user__internal_id', 'user__friends_count', 'user__followers_count',
                    'user__statuses_count', 'characterization__count_rts',
                    'characterization__manual_rt', 'characterization__is_reply']

    if location_depth > 0:
        feature_keys.append('user__characterization__location_depth_{0}'.format(location_depth))

    tweet_features = list(tweets.values(*feature_keys))

    if not tweet_features or len(tweet_features) < size:
        print('ERROR, not enough tweets', len(tweet_features))
        return None

    if location_depth > 0:
        pick_strategy, approve_fn = TimelineFilter.select_tweet_and_sideline(
            TimelineFilter.select_popular_bucketed, location_pks, turns=sideline_turns)
        start_strategy = pick_strategy
    else:
        pick_strategy = TimelineFilter.select_popular_bucketed
        start_strategy = TimelineFilter.starting_tweet
        approve_fn = None

    if skip_fields is None:
        skip_fields = {'geography'}
    else:
        skip_fields.add('geography')

    generator = TimelineFilter(characterizer, skip_fields=skip_fields, min_date=time_range[0],
                               max_entropy_percentile=100.0, time_bucket_size=time_bucket_size,
                               start_strategy=start_strategy, pick_strategy=pick_strategy,
                               approve_tweet_fn=approve_fn, n_candidates=n_candidates,
                               target_entropy=target_entropy)

    for t in tweet_features:
        if location_depth > 0:
            t['geography'] = int(t['user__characterization__location_depth_{0}'.format(location_depth)])
        t['popularity'] = (2.0 * t['characterization__count_rts'] + t['retweet_count']
                           + 0.5 * t['favourite_count'])
        generator.prepare_tweet(t)

    tweet_features = filter(lambda x: x['__shout_score__'] < shout_score, tweet_features)
    tweet_features = sorted(tweet_features, key=lambda x: x['buckets']['popularity'], reverse=True)

    print('N features', len(tweet_features))

    for i in range(0, min([size, len(tweet_features)])):
        generator(tweet_features)

    # estimate data
    tweet_pks = list(pluck('pk', generator))
    tl_tweets = Tweet.objects.in_bulk(tweet_pks)
    to_serialize = [tl_tweets[pk] for pk in tweet_pks]
    popularity = list(pluck('popularity', generator))

    html = []
    media = []
    sources = []
    rted_user_pks = set()

    for t in to_serialize:
        if t.media.exists():
            media.append(json.loads(serializers.serialize("json", t.media.all())))
        else:
            media.append(None)

        if t.rt_instance_tweet.exists():
            rt = t.rt_instance_tweet.all()[0].source_tweet
            rt_serialized = json.loads(serializers.serialize("json", [rt]))[0]
            rted_user_pks.add(rt_serialized['fields']['user'])
            # NOTE: this user is not updated. need to update it at some point.
            #rt_serialized['fields']['user'] = json.loads(serializers.serialize("json", [User.objects.get(pk=pk)]))[0]
            html.append(rt.beautify_html())
            sources.append(rt_serialized)
        else:
            html.append(t.beautify_html())
            sources.append(None)

    if not twitter_api or not update_users:
        print('not updating users')
        users = json.loads(serializers.serialize("json", [t.user for t in to_serialize]))
    else:
        print('updating user profiles')
        user_pks = [t.user_id for t in to_serialize]
        user_pks.extend(rted_user_pks)
        user_ids = User.objects.filter(pk__in=user_pks).values_list('internal_id', flat=True)
        tl_user_data = twitter_api.lookup_users(user_ids=user_ids)

        for tl_user in tl_user_data:
            user, created = User.import_json(valfilter(lambda x: x, tl_user._json))

        user_dict = User.objects.in_bulk([t.user_id for t in to_serialize])
        users = json.loads(serializers.serialize("json",
                                                 [user_dict[t.user_id] for t in to_serialize]))

        for t, source in zip(to_serialize, sources):
            if source is not None:
                source['fields']['user'] = json.loads(serializers.serialize(
                    "json", [User.objects.get(pk=source['fields']['user'])]))[0]

    weight = []

    if location_depth > 0:
        max_population = 1.0 * max([loc.population for loc in locations])
    else:
        max_population = None

    for t in generator:
        if location_depth > 0:
            w = gmean([t['popularity'] + 1.0,
                       max_population / location_populations[t['geography']],
                       t['user__followers_count'] + 1.0,
                       t['user__friends_count'] + 1.0])
        else:
            w = gmean([t['popularity'] + 1.0,
                       t['user__followers_count'] + 1.0,
                       t['user__friends_count'] + 1.0])
        weight.append(w)

    hour_norm = 1.0 / (60.0 * 60.0)

    kwargs = {
        'user': users,
        'html': html,
        'popularity': popularity,
        'weight': weight,
        'recency': [(time_range[1] - t.datetime).total_seconds() * hour_norm for t in to_serialize],
        'media': media,
        'source_user': sources
    }

    if location_depth > 0:
        kwargs['geolocation'] = list(pluck('geography', generator))

    if time_zone:
        kwargs['datetime'] = [t.datetime.astimezone(time_zone).isoformat() for t in to_serialize]

    # metadata
    metadata = {
        'locations': json.loads(serializers.serialize("json", locations)),
    }

    if not save_to_db:
        return {'tweets': to_serialize, 'metadata': kwargs}

    # save filtered set
    tl = Timeline.from_tweets(to_serialize, metadata, **kwargs)

    if twitter_api and to_serialize:
        tl_url = u'http://auroratwittera.cl{0}'.format(reverse('timelines:timeline-home'))

        if len(users) > 3:
            top_users = random.sample([u'@{0}'.format(t['fields']['screen_name']) for t in users], 3)
        else:
            top_users = ['@{0}'.format(t['fields']['screen_name']) for t in users]

        status_1 = 'Nvo. resumen informativo en {0} con tweets de {1}'.format(
            tl_url, u' '.join(top_users))
        print(repr(status_1))

        if post_update and post_to_twitter:
            try:
                twitter_api.update_status(status_1)
                time.sleep(30)
            except tweepy.error.TweepError:
                pass

        if rted_user_pks:
            try:
                top_rted = random.sample(
                    [u'@{0}'.format(t['fields']['user']['fields']['screen_name'])
                     for t in sources if t],
                    min([3, len(rted_user_pks)]))
                status_2 = 'Nvo. resumen informativo en {0} con RTs de {1}'.format(
                    tl_url, u' '.join(top_rted))
                print(repr(status_2))

                if post_update and post_to_twitter:
                    try:
                        twitter_api.update_status(status_2)
                        time.sleep(30)
                    except tweepy.error.TweepError:
                        pass
            except ValueError:
                pass

        if retweet_tweeps:
            for w, t in sorted(zip(kwargs['weight'], to_serialize), reverse=True)[0:size - 1]:
                if not post_to_twitter:
                    print('retweet', t.internal_id)
                else:
                    try:
                        print('retweet', t.internal_id)
                        twitter_api.retweet(t.internal_id)
                        time.sleep(60)
                    except tweepy.error.TweepError as e:
                        print([e])

    return tl