Example #1
    def _batch_import_urls(cls, tweets):
        with_urls = [t for t in tweets if 'entities' in t and 'urls' in t['entities']]
        urls = list(chain(*[t['entities']['urls'] for t in with_urls]))
        logging.debug('URLs: {0} in {1} tweets'.format(len(urls), len(with_urls)))
        cls._save_urls(urls)
        url_map = dict(map(lambda x: (x.url, x), Url.objects.filter(url__in=pluck('expanded_url', urls))))

        with_media_urls = [t for t in tweets if 'entities' in t and 'media' in t['entities'] and t['entities']['media']]
        media_urls = list(chain(*[t['entities']['media'] for t in with_media_urls]))
        logging.debug('Media URLs: {0} in {1} tweets'.format(len(media_urls), len(with_media_urls)))
        cls._save_media_urls(media_urls)
        media_url_map = dict(map(lambda x: (x.internal_id, x), Media.objects.filter(internal_id__in=pluck('id_str', media_urls))))

        tweet_map = Tweet.objects.in_bulk(pluck('__pk__', tweets))

        logging.debug('URLs saved: {0}'.format(len(url_map)))
        logging.debug('Media URLs saved: {0}'.format(len(media_url_map)))

        for tweet in tweets:
            tweet_urls = [url_map[url['expanded_url']] for url in tweet['entities']['urls'] if url['expanded_url'] in url_map]

            if tweet_urls:
                tweet_map[tweet['__pk__']].links = tweet_urls

            if 'media' in tweet['entities'] and tweet['entities']['media']:
                print(tweet['entities']['media'])
                tweet_media_urls = [media_url_map[media_url['id_str']] for media_url in tweet['entities']['media'] if media_url['id_str'] in media_url_map]

                if tweet_media_urls:
                    tweet_map[tweet['__pk__']].media = tweet_media_urls
Example #2
 def test_url(self) -> None:
     """Test the generated url and get the required parameters from the service."""
     try:
         resp = self.session.get(self.base_url, {"f": "json"}).json()
         try:
             self.units = resp["units"].replace("esri", "").lower()
         except KeyError:
             self.units = None
         self._max_nrecords = int(resp["maxRecordCount"])
         self.query_formats = resp["supportedQueryFormats"].replace(
             " ", "").lower().split(",")
         self.valid_fields = list(
             set(
                 utils.traverse_json(resp, ["fields", "name"]) +
                 utils.traverse_json(resp, ["fields", "alias"]) + ["*"]))
         try:
             extent = resp["extent"] if "extent" in resp else resp[
                 "fullExtent"]
             bounds = (extent["xmin"], extent["ymin"], extent["xmax"],
                       extent["ymax"])
             crs = extent["spatialReference"]["latestWkid"]
             self.extent = utils.MatchCRS.bounds(bounds, crs, DEF_CRS)
         except KeyError:
             self.extent = None
         try:
             self.feature_types = dict(
                 zip((tlz.pluck("id", resp["types"])),
                     tlz.pluck("name", resp["types"])))
         except KeyError:
             self.feature_types = None
     except KeyError:
         raise ServerError(self.base_url)
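The feature_types assignment above uses the common dict(zip(pluck(...), pluck(...))) idiom to turn a list of records into a lookup table keyed on one field. A minimal, self-contained sketch of the same pattern (the sample payload is invented for illustration):

import toolz as tlz

types = [{"id": 0, "name": "well"}, {"id": 1, "name": "stream gage"}]
# Pair each record's "id" with its "name"; both pluck calls return lazy iterators.
feature_types = dict(zip(tlz.pluck("id", types), tlz.pluck("name", types)))
print(feature_types)   # {0: 'well', 1: 'stream gage'}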
Example #3
    def _batch_import_keywords(cls, tweets):
        kws = set()
        for kw in pluck('keywords', tweets):
            kws.update(kw)

        keyword_map = dict(map(lambda x: (x.name, x), Keyword.retrieve(kws)))
        tweet_map = Tweet.objects.in_bulk(pluck('__pk__', tweets))
        logging.debug('Keywords: {0} in {1} tweets'.format(len(keyword_map), len(tweet_map)))

        for tweet in tweets:
            tweet_kws = [keyword_map[k] for k in tweet['keywords']]
            tweet_map[tweet['__pk__']].keywords = tweet_kws
Example #4
    def _batch_import_urls(cls, tweets):
        with_urls = [
            t for t in tweets if 'entities' in t and 'urls' in t['entities']
        ]
        urls = list(chain(*[t['entities']['urls'] for t in with_urls]))
        logging.debug('URLs: {0} in {1} tweets'.format(len(urls),
                                                       len(with_urls)))
        cls._save_urls(urls)
        url_map = dict(
            map(lambda x: (x.url, x),
                Url.objects.filter(url__in=pluck('expanded_url', urls))))

        with_media_urls = [
            t for t in tweets if 'entities' in t and 'media' in t['entities']
            and t['entities']['media']
        ]
        media_urls = list(
            chain(*[t['entities']['media'] for t in with_media_urls]))
        logging.debug('Media URLs: {0} in {1} tweets'.format(
            len(media_urls), len(with_media_urls)))
        cls._save_media_urls(media_urls)
        media_url_map = dict(
            map(
                lambda x: (x.internal_id, x),
                Media.objects.filter(
                    internal_id__in=pluck('id_str', media_urls))))

        tweet_map = Tweet.objects.in_bulk(pluck('__pk__', tweets))

        logging.debug('URLs saved: {0}'.format(len(url_map)))
        logging.debug('Media URLs saved: {0}'.format(len(media_url_map)))

        for tweet in tweets:
            tweet_urls = [
                url_map[url['expanded_url']]
                for url in tweet['entities']['urls']
                if url['expanded_url'] in url_map
            ]

            if tweet_urls:
                tweet_map[tweet['__pk__']].links = tweet_urls

            if 'media' in tweet['entities'] and tweet['entities']['media']:
                print(tweet['entities']['media'])
                tweet_media_urls = [
                    media_url_map[media_url['id_str']]
                    for media_url in tweet['entities']['media']
                    if media_url['id_str'] in media_url_map
                ]

                if tweet_media_urls:
                    tweet_map[tweet['__pk__']].media = tweet_media_urls
Example #5
    def _batch_import_keywords(cls, tweets):
        kws = set()
        for kw in pluck('keywords', tweets):
            kws.update(kw)

        keyword_map = dict(map(lambda x: (x.name, x), Keyword.retrieve(kws)))
        tweet_map = Tweet.objects.in_bulk(pluck('__pk__', tweets))
        logging.debug('Keywords: {0} in {1} tweets'.format(
            len(keyword_map), len(tweet_map)))

        for tweet in tweets:
            tweet_kws = [keyword_map[k] for k in tweet['keywords']]
            tweet_map[tweet['__pk__']].keywords = tweet_kws
Example #6
 def get_files(self, item: str) -> Dict[str, Tuple[str, str]]:
     """Get all the available zip files in an item."""
     url = "https://www.sciencebase.gov/catalog/item"
     payload = {"fields": "files,downloadUri", "format": "json"}
     r = self.session.get(f"{url}/{item}", payload=payload).json()
     files_url = zip(tlz.pluck("name", r["files"]),
                     tlz.pluck("url", r["files"]))
     # TODO: Add units
     meta = "".join(tlz.pluck("metadataHtmlViewUri", r["files"],
                              default=""))
     return {
         f.replace("_CONUS.zip", ""): (u, meta)
         for f, u in files_url if ".zip" in f
     }
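Note the default="" passed to pluck for "metadataHtmlViewUri": records that lack the key contribute an empty string instead of raising KeyError. A small illustration with invented file records:

import toolz as tlz

files = [{"name": "a_CONUS.zip", "url": "u1", "metadataHtmlViewUri": "m1"},
         {"name": "b_CONUS.zip", "url": "u2"}]          # no metadata key in this record
meta = "".join(tlz.pluck("metadataHtmlViewUri", files, default=""))
print(meta)   # 'm1' -- the second record falls back to ''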
Example #7
    def _get_py(self, key):
        if isinstance(key, tuple):
            assert len(key) == 2
            rows, cols = key
            usecols = cols
            ds = self.dshape.subshape[rows, cols]
            usecols = None if isinstance(usecols, slice) else listpack(usecols)
        else:
            rows = key
            ds = self.dshape.subshape[rows]
            usecols = None

        if isinstance(ds, DataShape) and isdimension(ds[0]):
            ds = ds.subshape[0]

        seq = self._iter(usecols=usecols)
        if isinstance(key, tuple) and isinstance(cols, _strtypes + _inttypes):
            seq = pluck(0, seq)
        seq = coerce(ds, seq)

        if isinstance(rows, compatibility._inttypes):
            line = nth(rows, seq)
            try:
                return next(line).item()
            except TypeError:
                try:
                    return line.item()
                except AttributeError:
                    return line
        elif isinstance(rows, list):
            return nth_list(rows, seq)
        elif isinstance(rows, slice):
            return it.islice(seq, rows.start, rows.stop, rows.step)
        else:
            raise IndexError("key %r is not valid" % rows)
Example #8
 def get_stats(self, per_worker=False):
     individual_stats = _each(self.ctx, yappi.get_func_stats)
     if per_worker:
         return individual_stats
     else:
         stat, *rest = pluck(1, individual_stats)
         # merging adapted from _add_from_YSTAT
         for other in rest:
             for saved_stat in other:
                 if saved_stat not in stat:
                     stat._idx_max += 1
                     saved_stat.index = stat._idx_max
                     stat.append(saved_stat)
             # fix children's index values
             for saved_stat in other:
                 for saved_child_stat in saved_stat.children:
                     # the child's index is known to point to a valid stat in saved_stats,
                     # and saved_stat was already synced in the loop above, so looking up
                     # the child's full_name in the current stats gives a valid index to point to
                     saved_child_stat.index = stat[
                         saved_child_stat.full_name].index
             # merge stats
             for saved_stat in other:
                 saved_stat_in_curr = stat[saved_stat.full_name]
                 saved_stat_in_curr += saved_stat
         return stat
Example #9
 def _check_contents(self, dset, duplication=1):
     files = sorted(dset.icollect(),
                    key=lambda file: int(
                        basename(file[0]).replace('test_file_', '').replace(
                            '.tmp', '')))
     expected = b''.join(interleave([self.contents] * duplication))
     self.assertEqual(b''.join(pluck(1, files)), expected)
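Because pluck accepts an integer index, it works on any sequence rows, not just dicts; here it selects the second element (the file contents) of every (filename, contents) pair. The same idea in isolation, with made-up pairs:

from toolz import pluck

files = [("test_file_0.tmp", b"ab"), ("test_file_1.tmp", b"cd")]
print(b"".join(pluck(1, files)))   # b'abcd' -- index 1 of every tuple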
Example #10
    def _get_py(self, key):
        if isinstance(key, tuple):
            assert len(key) == 2
            rows, cols = key
            usecols = cols
            ds = self.dshape.subshape[rows, cols]
            usecols = None if isinstance(usecols, slice) else listpack(usecols)
        else:
            rows = key
            ds = self.dshape.subshape[rows]
            usecols = None

        if isinstance(ds, DataShape) and isdimension(ds[0]):
            ds = ds.subshape[0]

        seq = self._iter(usecols=usecols)
        if isinstance(key, tuple) and isinstance(cols, _strtypes + _inttypes):
            seq = pluck(0, seq)
        seq = coerce(ds, seq)

        if isinstance(rows, compatibility._inttypes):
            line = nth(rows, seq)
            try:
                return next(line).item()
            except TypeError:
                try:
                    return line.item()
                except AttributeError:
                    return line
        elif isinstance(rows, list):
            return nth_list(rows, seq)
        elif isinstance(rows, slice):
            return it.islice(seq, rows.start, rows.stop, rows.step)
        else:
            raise IndexError("key %r is not valid" % rows)
Example #11
def call_function(func, func_token, args, kwargs, pure=None, nout=None):
    dask_key_name = kwargs.pop('dask_key_name', None)
    pure = kwargs.pop('pure', pure)

    if dask_key_name is None:
        name = '%s-%s' % (funcname(func),
                          tokenize(func_token, *args, pure=pure, **kwargs))
    else:
        name = dask_key_name

    dsk = sharedict.ShareDict()
    args_dasks = list(map(to_task_dask, args))
    for arg, d in args_dasks:
        if isinstance(d, sharedict.ShareDict):
            dsk.update_with_key(d)
        elif isinstance(arg, (str, tuple)):
            dsk.update_with_key(d, key=arg)
        else:
            dsk.update(d)

    args = tuple(pluck(0, args_dasks))

    if kwargs:
        dask_kwargs, dsk2 = to_task_dask(kwargs)
        dsk.update(dsk2)
        task = (apply, func, list(args), dask_kwargs)
    else:
        task = (func,) + args

    dsk.update_with_key({name: task}, key=name)
    nout = nout if nout is not None else None
    return Delayed(name, dsk, length=nout)
Example #12
    def batch_import(self, tweets, parse_json=False):
        logging.info('Received {0} tweets'.format(len(tweets)))
        tweets = list(filter(self.accept_tweet, map(self.pre_func, tweets)))
        logging.info('After Filtering: {0} tweets'.format(len(tweets)))

        user_fn = lambda x: pluck('user', x)
        tweet_fn = lambda x: x

        retweet_sources = [t['retweeted_status'] for t in tweets if 'retweeted_status' in t]
        quote_sources = [t['quoted_status'] for t in tweets if 'quoted_status' in t]
        imported_users = Importer._batch_import(User, tweets, user_fn)
        imported_users.extend(Importer._batch_import(User, retweet_sources, user_fn))
        imported_users.extend(Importer._batch_import(User, quote_sources, user_fn))

        imported = Importer._batch_import(Tweet, tweets, tweet_fn)
        imported.extend(Importer._batch_import(Tweet, [t['retweeted_status'] for t in tweets if 'retweeted_status' in t], tweet_fn))
        imported.extend(Importer._batch_import(Tweet, [t['quoted_status'] for t in tweets if 'quoted_status' in t], tweet_fn))

        Importer._batch_import_retweets(list(filter(lambda x: 'retweeted_status' in x, tweets)))
        Importer._batch_import_retweets(list(filter(lambda x: 'quoted_status' in x, tweets)), retweet_key='quoted_status')

        Importer._batch_import_keywords(list(filter(lambda x: x['__created__'], tweets)))
        Importer._batch_import_keywords(list(filter(lambda x: x['__created__'], retweet_sources)))
        Importer._batch_import_keywords(list(filter(lambda x: x['__created__'], quote_sources)))

        Importer._batch_import_urls(list(filter(lambda x: x['__created__'], tweets)))
        Importer._batch_import_urls(list(filter(lambda x: x['__created__'], retweet_sources)))
        Importer._batch_import_urls(list(filter(lambda x: x['__created__'], quote_sources)))

        self.post_func(imported_users, imported)

        logging.debug('Imported Tweets: {0}'.format(len(imported)))

        return imported_users, imported
Example #13
def build_graph(estimator, cv, scorer, candidate_params, X, y=None,
                groups=None, fit_params=None, iid=True, refit=True,
                error_score='raise', return_train_score=True, cache_cv=True):

    X, y, groups = to_indexable(X, y, groups)
    cv = check_cv(cv, y, is_classifier(estimator))
    # "pairwise" estimators require a different graph for CV splitting
    is_pairwise = getattr(estimator, '_pairwise', False)

    dsk = {}
    X_name, y_name, groups_name = to_keys(dsk, X, y, groups)
    n_splits = compute_n_splits(cv, X, y, groups)

    if fit_params:
        # A mapping of {name: (name, graph-key)}
        param_values = to_indexable(*fit_params.values(), allow_scalars=True)
        fit_params = {k: (k, v) for (k, v) in
                      zip(fit_params, to_keys(dsk, *param_values))}
    else:
        fit_params = {}

    fields, tokens, params = normalize_params(candidate_params)
    main_token = tokenize(normalize_estimator(estimator), fields, params,
                          X_name, y_name, groups_name, fit_params, cv,
                          error_score == 'raise', return_train_score)

    cv_name = 'cv-split-' + main_token
    dsk[cv_name] = (cv_split, cv, X_name, y_name, groups_name,
                    is_pairwise, cache_cv)

    if iid:
        weights = 'cv-n-samples-' + main_token
        dsk[weights] = (cv_n_samples, cv_name)
    else:
        weights = None

    scores = do_fit_and_score(dsk, main_token, estimator, cv_name, fields,
                              tokens, params, X_name, y_name, fit_params,
                              n_splits, error_score, scorer,
                              return_train_score)

    cv_results = 'cv-results-' + main_token
    candidate_params_name = 'cv-parameters-' + main_token
    dsk[candidate_params_name] = (decompress_params, fields, params)
    dsk[cv_results] = (create_cv_results, scores, candidate_params_name,
                       n_splits, error_score, weights)
    keys = [cv_results]

    if refit:
        best_params = 'best-params-' + main_token
        dsk[best_params] = (get_best_params, candidate_params_name, cv_results)
        best_estimator = 'best-estimator-' + main_token
        if fit_params:
            fit_params = (dict, (zip, list(fit_params.keys()),
                                list(pluck(1, fit_params.values()))))
        dsk[best_estimator] = (fit_best, clone(estimator), best_params,
                               X_name, y_name, fit_params)
        keys.append(best_estimator)

    return dsk, keys, n_splits
Example #14
def call_function(func, func_token, args, kwargs, pure=None, nout=None):
    dask_key_name = kwargs.pop('dask_key_name', None)
    pure = kwargs.pop('pure', pure)

    if dask_key_name is None:
        name = '%s-%s' % (funcname(func),
                          tokenize(func_token, *args, pure=pure, **kwargs))
    else:
        name = dask_key_name

    dsk = sharedict.ShareDict()
    args_dasks = list(map(to_task_dask, args))
    for arg, d in args_dasks:
        if isinstance(d, sharedict.ShareDict):
            dsk.update_with_key(d)
        elif isinstance(arg, (str, tuple)):
            dsk.update_with_key(d, key=arg)
        else:
            dsk.update(d)

    args = tuple(pluck(0, args_dasks))

    if kwargs:
        dask_kwargs, dsk2 = to_task_dask(kwargs)
        dsk.update(dsk2)
        task = (apply, func, list(args), dask_kwargs)
    else:
        task = (func,) + args

    dsk.update_with_key({name: task}, key=name)
    nout = nout if nout is not None else None
    return Delayed(name, dsk, length=nout)
Example #15
 def _remote(self):
     contents = pluck(1, super()._remote())
     if self.dset.encoding is not None:
         contents = map(
             partial(_decode, self.dset.encoding, self.dset.errors),
             contents)
     lines = map(partial(_splitlines, True), contents)
     return chain.from_iterable(lines)
Example #16
 def take_snapshot(self, per_worker=False):
     snapshots = _each(self.ctx, tracemalloc.take_snapshot)
     if per_worker:
         return snapshots
     snapshot_merged, *snapshots = pluck(1, snapshots)
     traces_merged = snapshot_merged.traces._traces
     for s in snapshots:
         traces_merged.extend(s.traces._traces)
     return snapshot_merged
Example #17
def _compress_letter_and_initabbr(words):
    if not words:
        return words
    result = []
    for word_len, words_grp in groupby(
            words, lambda x: len(x[0])):  # compress single letter words
        if word_len == 1:
            result.append(("".join(pluck(0, words_grp)), None))
        else:
            result.extend(words_grp)

    if (result and 2 <= len(result[0]) <= len(result) - 1
            and  # remove if first word is an abbreviation of following words, e.g. "ABC Aa Bb Cc Company"
            all(char == word[0]
                for char, word in zip(result[0][0], pluck(0, result[1:])))):
        result = result[1:]

    return result
Example #18
    def batch_import(self, tweets, parse_json=False):
        logging.info('Received {0} tweets'.format(len(tweets)))
        tweets = list(filter(self.accept_tweet, map(self.pre_func, tweets)))
        logging.info('After Filtering: {0} tweets'.format(len(tweets)))

        user_fn = lambda x: pluck('user', x)
        tweet_fn = lambda x: x

        retweet_sources = [
            t['retweeted_status'] for t in tweets if 'retweeted_status' in t
        ]
        quote_sources = [
            t['quoted_status'] for t in tweets if 'quoted_status' in t
        ]
        imported_users = Importer._batch_import(User, tweets, user_fn)
        imported_users.extend(
            Importer._batch_import(User, retweet_sources, user_fn))
        imported_users.extend(
            Importer._batch_import(User, quote_sources, user_fn))

        imported = Importer._batch_import(Tweet, tweets, tweet_fn)
        imported.extend(
            Importer._batch_import(Tweet, [
                t['retweeted_status']
                for t in tweets if 'retweeted_status' in t
            ], tweet_fn))
        imported.extend(
            Importer._batch_import(
                Tweet,
                [t['quoted_status']
                 for t in tweets if 'quoted_status' in t], tweet_fn))

        Importer._batch_import_retweets(
            list(filter(lambda x: 'retweeted_status' in x, tweets)))
        Importer._batch_import_retweets(list(
            filter(lambda x: 'quoted_status' in x, tweets)),
                                        retweet_key='quoted_status')

        Importer._batch_import_keywords(
            list(filter(lambda x: x['__created__'], tweets)))
        Importer._batch_import_keywords(
            list(filter(lambda x: x['__created__'], retweet_sources)))
        Importer._batch_import_keywords(
            list(filter(lambda x: x['__created__'], quote_sources)))

        Importer._batch_import_urls(
            list(filter(lambda x: x['__created__'], tweets)))
        Importer._batch_import_urls(
            list(filter(lambda x: x['__created__'], retweet_sources)))
        Importer._batch_import_urls(
            list(filter(lambda x: x['__created__'], quote_sources)))

        self.post_func(imported_users, imported)

        logging.debug('Imported Tweets: {0}'.format(len(imported)))

        return imported_users, imported
Example #19
    def toolz_max(self, col, rows):
        """
        Max of values for a given column in a list of dictionaries

        Args:
           col (str): Column to process
           rows (list): Records
        """
        return max(list(toolz.pluck(col, rows)))
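pluck yields a lazy iterator, so max() can consume it directly and the intermediate list() above is not strictly needed. A minimal sketch with invented rows:

import toolz

rows = [{"price": 3}, {"price": 7}, {"price": 5}]
print(max(toolz.pluck("price", rows)))   # 7 -- max() consumes the lazy pluck iterator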
Example #20
    def toolz_count(self, col, rows):
        """
        Count of unique values for a given column in a list of dictionaries

        Args:
           col (str): Column to process
           rows (list): Records

        """
        return len(set(list(toolz.pluck(col, rows))))
Example #21
def _group_ids_by_index(index, tokens):
    id_groups = []

    def new_group():
        o = []
        id_groups.append(o)
        return o.append

    _id_groups = defaultdict(new_group)
    for n, t in enumerate(pluck(index, tokens)):
        _id_groups[t](n)
    return id_groups
Example #22
def _group_ids_by_index(index, tokens):
    id_groups = []

    def new_group():
        o = []
        id_groups.append(o)
        return o.append

    _id_groups = defaultdict(new_group)
    for n, t in enumerate(pluck(index, tokens)):
        _id_groups[t](n)
    return id_groups
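The helper above buckets row positions by the value found at index: the defaultdict factory registers a new group and the stored bound append adds each position to it. A plainer sketch of the same grouping, with made-up tokens:

from collections import defaultdict
from toolz import pluck

tokens = [("a", 1), ("b", 1), ("a", 2)]
groups = defaultdict(list)
for n, t in enumerate(pluck(0, tokens)):   # group row numbers by the value in column 0
    groups[t].append(n)
print(dict(groups))   # {'a': [0, 2], 'b': [1]}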
Example #23
def increment_rt_counts(tweet_pks):
    """
    :param tweet_pks: dictionary {tweet_pk: rt_count}
    :return:
    """
    items = sorted(tweet_pks.items(), key=lambda x: x[1], reverse=True)
    grouped = groupby(lambda x: x[1], items)

    for incr, pairs in grouped.items():
        if incr > 0:
            pks = pluck(0, pairs)
            TweetFeatures.objects.filter(tweet_id__in=pks).update(count_rts=F('count_rts') + incr)
Example #24
def pre_compute(expr, seq):
    try:
        if isinstance(seq, Iterator):
            first = next(seq)
            seq = concat([[first], seq])
        else:
            first = next(iter(seq))
    except StopIteration:
        return []
    if isinstance(first, dict):
        return pluck(expr.fields, seq)
    else:
        return seq
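pre_compute above switches to pluck only when the records are dictionaries; passing expr.fields (a list of names) makes pluck yield one tuple per record. A small illustration with invented records:

from toolz import pluck

seq = [{"x": 1, "y": 2, "z": 3}, {"x": 4, "y": 5, "z": 6}]
print(list(pluck(["x", "z"], seq)))   # a list of keys yields one tuple per record: [(1, 3), (4, 6)]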
Example #25
    def toolz_sum(self, col, rows, dtype=None):
        """
        Sum a given column in a list of dictionaries

        Args:
           col (str): Column to process
           rows (list): Records
           dtype (type, optional): Cast applied to each value before summing
        """
        values = list(toolz.pluck(col, rows))
        if dtype is not None:
            values = [dtype(v) for v in values]

        return sum(values)
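The optional dtype lets this helper coerce plucked values (for example strings read from a CSV) before summing. A minimal sketch of the same idea with invented rows:

import toolz

rows = [{"qty": "2"}, {"qty": "5"}]
values = [int(v) for v in toolz.pluck("qty", rows)]   # cast before summing, as dtype=int would above
print(sum(values))   # 7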
Example #26
    def toolz_avg(self, col, rows):
        """
        Average of values for a given column in a list of dictionaries

        Args:
           col (str): Column to process
           rows (list): Records

        """

        count = len(rows)
        total = sum(list(toolz.pluck(col, rows)))
        return total / count if count > 0 else None
Example #27
    def _batch_import_retweets(cls, tweets, retweet_key='retweeted_status'):
        rts = []

        user_map = User.objects.in_bulk(pluck('__pk__', pluck('user', tweets)))

        tweet_ids = list(pluck('__pk__', tweets))
        tweet_ids.extend(pluck('__pk__', pluck(retweet_key, tweets)))
        tweet_map = Tweet.objects.in_bulk(tweet_ids)

        rt_counts = defaultdict(int)

        for current in tweets:
            rts.append(ReTweet(user=user_map[current['user']['__pk__']],
                                      tweet_instance=tweet_map[current['__pk__']],
                                      source_tweet=tweet_map[current[retweet_key]['__pk__']],
                                      datetime=tweet_map[current['__pk__']].datetime))

            rt_counts[current['__pk__']] += 1

        ReTweet.objects.bulk_create(rts)
        logging.debug('RT Increments: {0}'.format(rt_counts))
        increment_rt_counts(rt_counts)
Example #28
def increment_rt_counts(tweet_pks):
    """
    :param tweet_pks: dictionary {tweet_pk: rt_count}
    :return:
    """
    items = sorted(tweet_pks.items(), key=lambda x: x[1], reverse=True)
    grouped = groupby(lambda x: x[1], items)

    for incr, pairs in grouped.items():
        if incr > 0:
            pks = pluck(0, pairs)
            TweetFeatures.objects.filter(tweet_id__in=pks).update(
                count_rts=F('count_rts') + incr)
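Here toolz.groupby buckets the (pk, count) pairs by their increment, and pluck(0, pairs) recovers the primary keys for each bucket, so a single UPDATE is issued per distinct increment. A minimal sketch of that grouping step with invented data:

from toolz import groupby, pluck

tweet_pks = {10: 2, 11: 1, 12: 2}
items = sorted(tweet_pks.items(), key=lambda x: x[1], reverse=True)
grouped = groupby(lambda x: x[1], items)       # {2: [(10, 2), (12, 2)], 1: [(11, 1)]}
for incr, pairs in grouped.items():
    print(incr, list(pluck(0, pairs)))         # 2 [10, 12] / 1 [11]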
Example #29
def _into_iter_mongodb(l, coll, columns=None, schema=None):
    """ Into helper function

    Return both a lazy sequence of tuples and a list of column names
    """
    seq = coll.find()
    if not columns and schema:
        columns = schema[0].names
    elif not columns:
        item = next(seq)
        seq = concat([[item], seq])
        columns = sorted(item.keys())
        columns.remove('_id')
    return columns, pluck(columns, seq)
Example #30
    def _batch_import_retweets(cls, tweets, retweet_key='retweeted_status'):
        rts = []

        user_map = User.objects.in_bulk(pluck('__pk__', pluck('user', tweets)))

        tweet_ids = list(pluck('__pk__', tweets))
        tweet_ids.extend(pluck('__pk__', pluck(retweet_key, tweets)))
        tweet_map = Tweet.objects.in_bulk(tweet_ids)

        rt_counts = defaultdict(int)

        for current in tweets:
            rts.append(
                ReTweet(user=user_map[current['user']['__pk__']],
                        tweet_instance=tweet_map[current['__pk__']],
                        source_tweet=tweet_map[current[retweet_key]['__pk__']],
                        datetime=tweet_map[current['__pk__']].datetime))

            rt_counts[current['__pk__']] += 1

        ReTweet.objects.bulk_create(rts)
        logging.debug('RT Increments: {0}'.format(rt_counts))
        increment_rt_counts(rt_counts)
Example #31
def pre_compute(expr, seq, scope=None, **kwargs):
    try:
        if isinstance(seq, Iterator):
            first = next(seq)
            seq = concat([[first], seq])
        else:
            first = next(iter(seq))
    except StopIteration:
        return []
    if isinstance(first, dict):
        leaf = expr._leaves()[0]
        return pluck(leaf.fields, seq)
    else:
        return seq
Example #32
def normalize_params(params):
    """Take a list of dictionaries, and tokenize/normalize."""
    # Collect a set of all fields
    fields = set()
    for p in params:
        fields.update(p)
    fields = sorted(fields)

    params2 = list(pluck(fields, params, MISSING))
    # Non-basic types (including MISSING) are unique to their id
    tokens = [tuple(x if isinstance(x, (int, float, str)) else id(x)
                    for x in p) for p in params2]

    return fields, tokens, params2
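The third argument to pluck is a default, so candidate dictionaries that do not set a given field yield the MISSING sentinel instead of raising KeyError. A sketch of that behaviour, with a stand-in sentinel and invented parameter dicts:

from toolz import pluck

MISSING = object()                                   # stand-in for the module's sentinel
params = [{"alpha": 1}, {"alpha": 2, "beta": 3}]
rows = list(pluck(["alpha", "beta"], params, MISSING))
print(rows[0][1] is MISSING, rows[1])                # True (2, 3)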
Example #33
def pre_compute(expr, seq, scope=None, **kwargs):
    try:
        if isinstance(seq, Iterator):
            first = next(seq)
            seq = concat([[first], seq])
        else:
            first = next(iter(seq))
    except StopIteration:
        return []
    if isinstance(first, dict):
        leaf = expr._leaves()[0]
        return pluck(leaf.fields, seq)
    else:
        return seq
Example #34
def _into_iter_mongodb(l, coll, columns=None, schema=None):
    """ Into helper function

    Return both a lazy sequence of tuples and a list of column names
    """
    seq = coll.find()
    if not columns and schema:
        columns = schema[0].names
    elif not columns:
        item = next(seq)
        seq = concat([[item], seq])
        columns = sorted(item.keys())
        columns.remove('_id')
    return columns, pluck(columns, seq)
Example #35
def trans_iter():
    for kdnr, groups in groupby(map(dict, pluck(1, trans_dat.iterrows())),
                                key=itemgetter("kdnr")):
        aggs = [
            FrequencyTracker(varname, varfunc, ["timestamp"], 0)
            for varname, varfunc in [
                ("land", itemgetter("ref_land")),
                ("kanal", lambda x: str(x["UTU5_EINGABE_NAME"])[:5]),
            ]
        ] + [TimestampSameCustomer()]
        for row in groups:
            for agg in aggs:
                agg.add(row)
                row.update(agg.value)
            yield row
Example #36
def normalize_params(params):
    """Take a list of dictionaries, and tokenize/normalize."""
    # Collect a set of all fields
    fields = set()
    for p in params:
        fields.update(p)
    fields = sorted(fields)

    params2 = list(pluck(fields, params, MISSING))
    # Non-basic types (including MISSING) are unique to their id
    tokens = [
        tuple(x if isinstance(x, (int, float, str)) else id(x) for x in p)
        for p in params2
    ]

    return fields, tokens, params2
Example #37
    def _batch_import(base_class, cls, elements, fn):
        logging.debug('Trying to import {1} from {0} elements'.format(
            len(elements), cls))
        internal_ids = set(pluck('id_str', fn(elements)))

        existing_users = cls.objects.filter(internal_id__in=internal_ids)
        existing_ids = set([u.internal_id for u in existing_users])
        user_pks = dict([(u.internal_id, u.pk) for u in existing_users])
        new_ids = internal_ids - existing_ids

        logging.debug('Existing IDs: {0}'.format(len(existing_ids)))
        logging.debug('New IDs: {0}'.format(len(new_ids)))

        added_keys = set()
        new_elements = []
        for element in fn(elements):
            if element['id_str'] in user_pks:
                element['__pk__'] = user_pks[element['id_str']]
                element['__created__'] = False
            else:
                if not element['id_str'] in added_keys:
                    user_model = cls()
                    user_model.copy_json(valfilter(lambda x: x, element))
                    new_elements.append(user_model)
                element['__created__'] = True
                element['__pk__'] = None
                added_keys.add(element['id_str'])

        cls.objects.bulk_create(new_elements)

        new_models = list(cls.objects.filter(internal_id__in=new_ids))
        logging.debug('New IDs created successfully: {0}'.format(
            len(new_models)))
        new_pks = dict([(u.internal_id, u.pk) for u in new_models])
        for element in fn(elements):
            if element['id_str'] in new_pks:
                element['__pk__'] = new_pks[element['id_str']]

        return new_models
Example #38
    def _batch_import(base_class, cls, elements, fn):
        logging.debug('Trying to import {1} from {0} elements'.format(len(elements), cls))
        internal_ids = set(pluck('id_str', fn(elements)))

        existing_users = cls.objects.filter(internal_id__in=internal_ids)
        existing_ids = set([u.internal_id for u in existing_users])
        user_pks = dict([(u.internal_id, u.pk) for u in existing_users])
        new_ids = internal_ids - existing_ids

        logging.debug('Existing IDs: {0}'.format(len(existing_ids)))
        logging.debug('New IDs: {0}'.format(len(new_ids)))

        added_keys = set()
        new_elements = []
        for element in fn(elements):
            if element['id_str'] in user_pks:
                element['__pk__'] = user_pks[element['id_str']]
                element['__created__'] = False
            else:
                if not element['id_str'] in added_keys:
                    user_model = cls()
                    user_model.copy_json(valfilter(lambda x: x, element))
                    new_elements.append(user_model)
                element['__created__'] = True
                element['__pk__'] = None
                added_keys.add(element['id_str'])

        cls.objects.bulk_create(new_elements)

        new_models = list(cls.objects.filter(internal_id__in=new_ids))
        logging.debug('New IDs created successfully: {0}'.format(len(new_models)))
        new_pks = dict([(u.internal_id, u.pk) for u in new_models])
        for element in fn(elements):
            if element['id_str'] in new_pks:
                element['__pk__'] = new_pks[element['id_str']]

        return new_models
Example #39
def build_graph(estimator,
                cv,
                scorer,
                candidate_params,
                X,
                y=None,
                groups=None,
                fit_params=None,
                iid=True,
                refit=True,
                error_score='raise',
                return_train_score=True,
                cache_cv=True):

    X, y, groups = to_indexable(X, y, groups)
    cv = check_cv(cv, y, is_classifier(estimator))
    # "pairwise" estimators require a different graph for CV splitting
    is_pairwise = getattr(estimator, '_pairwise', False)

    dsk = {}
    X_name, y_name, groups_name = to_keys(dsk, X, y, groups)
    n_splits = compute_n_splits(cv, X, y, groups)

    if fit_params:
        # A mapping of {name: (name, graph-key)}
        param_values = to_indexable(*fit_params.values(), allow_scalars=True)
        fit_params = {
            k: (k, v)
            for (k, v) in zip(fit_params, to_keys(dsk, *param_values))
        }
    else:
        fit_params = {}

    fields, tokens, params = normalize_params(candidate_params)
    main_token = tokenize(normalize_estimator(estimator), fields, params,
                          X_name, y_name, groups_name, fit_params, cv,
                          error_score == 'raise', return_train_score)

    cv_name = 'cv-split-' + main_token
    dsk[cv_name] = (cv_split, cv, X_name, y_name, groups_name, is_pairwise,
                    cache_cv)

    if iid:
        weights = 'cv-n-samples-' + main_token
        dsk[weights] = (cv_n_samples, cv_name)
    else:
        weights = None

    scores = do_fit_and_score(dsk, main_token, estimator, cv_name, fields,
                              tokens, params, X_name, y_name, fit_params,
                              n_splits, error_score, scorer,
                              return_train_score)

    cv_results = 'cv-results-' + main_token
    candidate_params_name = 'cv-parameters-' + main_token
    dsk[candidate_params_name] = (decompress_params, fields, params)
    dsk[cv_results] = (create_cv_results, scores, candidate_params_name,
                       n_splits, error_score, weights)
    keys = [cv_results]

    if refit:
        best_params = 'best-params-' + main_token
        dsk[best_params] = (get_best_params, candidate_params_name, cv_results)
        best_estimator = 'best-estimator-' + main_token
        if fit_params:
            fit_params = (dict, (zip, list(fit_params.keys()),
                                 list(pluck(1, fit_params.values()))))
        dsk[best_estimator] = (fit_best, clone(estimator), best_params, X_name,
                               y_name, fit_params)
        keys.append(best_estimator)

    return dsk, keys, n_splits
Example #40
def _do_fit_step(dsk, next_token, step, cv, fields, tokens, params, Xs, ys,
                 fit_params, n_splits, error_score, step_fields_lk,
                 fit_params_lk, field_to_index, step_name, none_passthrough,
                 is_transform):
    sub_fields, sub_inds = map(list, unzip(step_fields_lk[step_name], 2))
    sub_fit_params = fit_params_lk[step_name]

    if step_name in field_to_index:
        # The estimator may change each call
        new_fits = {}
        new_Xs = {}
        est_index = field_to_index[step_name]

        for ids in _group_ids_by_index(est_index, tokens):
            # Get the estimator for this subgroup
            sub_est = params[ids[0]][est_index]
            if sub_est is MISSING:
                sub_est = step

            # If an estimator is `None`, there's nothing to do
            if sub_est is None:
                nones = dict.fromkeys(ids, None)
                new_fits.update(nones)
                if is_transform:
                    if none_passthrough:
                        new_Xs.update(zip(ids, get(ids, Xs)))
                    else:
                        new_Xs.update(nones)
            else:
                # Extract the proper subset of Xs, ys
                sub_Xs = get(ids, Xs)
                sub_ys = get(ids, ys)
                # Only subset the parameters/tokens if necessary
                if sub_fields:
                    sub_tokens = list(pluck(sub_inds, get(ids, tokens)))
                    sub_params = list(pluck(sub_inds, get(ids, params)))
                else:
                    sub_tokens = sub_params = None

                if is_transform:
                    sub_fits, sub_Xs = do_fit_transform(
                        dsk, next_token, sub_est, cv, sub_fields, sub_tokens,
                        sub_params, sub_Xs, sub_ys, sub_fit_params, n_splits,
                        error_score)
                    new_Xs.update(zip(ids, sub_Xs))
                    new_fits.update(zip(ids, sub_fits))
                else:
                    sub_fits = do_fit(dsk, next_token, sub_est, cv, sub_fields,
                                      sub_tokens, sub_params, sub_Xs, sub_ys,
                                      sub_fit_params, n_splits, error_score)
                    new_fits.update(zip(ids, sub_fits))
        # Extract lists of transformed Xs and fit steps
        all_ids = list(range(len(Xs)))
        if is_transform:
            Xs = get(all_ids, new_Xs)
        fits = get(all_ids, new_fits)
    elif step is None:
        # Nothing to do
        fits = [None] * len(Xs)
        if not none_passthrough:
            Xs = fits
    else:
        # Only subset the parameters/tokens if necessary
        if sub_fields:
            sub_tokens = list(pluck(sub_inds, tokens))
            sub_params = list(pluck(sub_inds, params))
        else:
            sub_tokens = sub_params = None

        if is_transform:
            fits, Xs = do_fit_transform(dsk, next_token, step, cv, sub_fields,
                                        sub_tokens, sub_params, Xs, ys,
                                        sub_fit_params, n_splits, error_score)
        else:
            fits = do_fit(dsk, next_token, step, cv, sub_fields, sub_tokens,
                          sub_params, Xs, ys, sub_fit_params, n_splits,
                          error_score)
    return (fits, Xs) if is_transform else (fits, None)
Example #41
def _do_featureunion(dsk, next_token, est, cv, fields, tokens, params, Xs, ys,
                     fit_params, n_splits, error_score):
    if 'transformer_list' in fields:
        raise NotImplementedError("Setting FeatureUnion.transformer_list "
                                  "in a gridsearch")

    (field_to_index,
     step_fields_lk) = _group_subparams(est.transformer_list,
                                        fields,
                                        ignore=('transformer_weights'))
    fit_params_lk = _group_fit_params(est.transformer_list, fit_params)

    token = next_token(est)

    n_samples = _do_n_samples(dsk, token, Xs, n_splits)

    fit_steps = []
    tr_Xs = []
    for (step_name, step) in est.transformer_list:
        fits, out_Xs = _do_fit_step(dsk, next_token, step, cv, fields, tokens,
                                    params, Xs, ys, fit_params, n_splits,
                                    error_score, step_fields_lk, fit_params_lk,
                                    field_to_index, step_name, False, True)
        fit_steps.append(fits)
        tr_Xs.append(out_Xs)

    # Rebuild the FeatureUnions
    step_names = [n for n, _ in est.transformer_list]

    if 'transformer_weights' in field_to_index:
        index = field_to_index['transformer_weights']
        weight_lk = {}
        weight_tokens = list(pluck(index, tokens))
        for i, tok in enumerate(weight_tokens):
            if tok not in weight_lk:
                weights = params[i][index]
                if weights is MISSING:
                    weights = est.transformer_weights
                lk = weights or {}
                weight_list = [lk.get(n) for n in step_names]
                weight_lk[tok] = (weights, weight_list)
        weights = get(weight_tokens, weight_lk)
    else:
        lk = est.transformer_weights or {}
        weight_list = [lk.get(n) for n in step_names]
        weight_tokens = repeat(None)
        weights = repeat((est.transformer_weights, weight_list))

    out = []
    out_append = out.append
    fit_name = 'feature-union-' + token
    tr_name = 'feature-union-concat-' + token
    m = 0
    seen = {}
    for steps, Xs, wt, (w, wl), nsamp in zip(zip(*fit_steps), zip(*tr_Xs),
                                             weight_tokens, weights,
                                             n_samples):
        if (steps, wt) in seen:
            out_append(seen[steps, wt])
        else:
            for n in range(n_splits):
                dsk[(fit_name, m,
                     n)] = (feature_union, step_names,
                            [None if s is None else s + (n, )
                             for s in steps], w)
                dsk[(tr_name, m,
                     n)] = (feature_union_concat,
                            [None if x is None else x + (n, )
                             for x in Xs], nsamp + (n, ), wl)
            seen[steps, wt] = m
            out_append(m)
            m += 1
    return [(fit_name, i) for i in out], [(tr_name, i) for i in out]
Example #42
sns.violinplot(x='diff', y='cond', data=freq_diffs)
#%%

clip = np.percentile(freq_diffs['diff'], [2.5, 97.5])
for cond, data in freq_diffs.groupby('cond'):
    sns.kdeplot(data['diff'], clip=clip, label=cond)
plt.savefig('figures/word_freq_of_suggs.pdf')

#%%
import random

acceptability_frames = []
for group in cytoolz.partition_all(len(conditions), samples):
    context = group[0]['context']
    meta = list(
        cytoolz.pluck(['review_idx', 'sent_idx', 'word_idx', 'true_follows'],
                      group))
    assert len(set(meta)) == 1
    meta = list(meta[0])
    true_follows = meta.pop(-1)
    options = [('true', group[0]['true_follows'])]
    for sample in group:
        for sugg in sample['suggs'].split('\n')[:1]:
            options.append((sample['cond'], sugg))
    random.shuffle(options)
    acceptability_frames.append(
        dict(meta=meta, context=group[0]['context'], options=options))
import json

json.dump(acceptability_frames, open('acceptability_frames.json', 'w'))

Example #43
 def pluck(self, ind):
     if cytoolz.isiterable(ind):
         return self.__class__(itertools.imap(flist, cytoolz.pluck(ind, self)))
     else:
         return self.__class__(cytoolz.pluck(ind, self))
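The isiterable check above mirrors what cytoolz.pluck itself distinguishes: a single index yields scalars, while an iterable of indices yields tuples (the itertools.imap call marks this snippet as Python 2). Both forms, shown in Python 3 with made-up rows:

import cytoolz

rows = [(1, 2, 3), (4, 5, 6)]
print(list(cytoolz.pluck(0, rows)))        # [1, 4]
print(list(cytoolz.pluck([0, 2], rows)))   # [(1, 3), (4, 6)]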
Example #44
    def stage_data(self) -> pd.DataFrame:
        """Stage the NHDPlus Attributes database and save to nhdplus_attrs.feather."""
        r = self.get_children(self.nhd_attr_item)

        titles = tlz.pluck("title", r["items"])
        titles = tlz.concat(
            tlz.map(tlz.partial(re.findall, "Select(.*?)Attributes"), titles))
        titles = tlz.map(str.strip, titles)

        main_items = dict(zip(titles, tlz.pluck("id", r["items"])))

        files = {}
        soil = main_items.pop("Soil")
        for i, item in main_items.items():
            r = self.get_children(item)

            titles = tlz.pluck("title", r["items"])
            titles = tlz.map(
                lambda s: s.split(":")[1].strip() if ":" in s else s, titles)

            child_items = dict(zip(titles, tlz.pluck("id", r["items"])))
            files[i] = {t: self.get_files(c) for t, c in child_items.items()}

        r = self.get_children(soil)
        titles = tlz.pluck("title", r["items"])
        titles = tlz.map(lambda s: s.split(":")[1].strip()
                         if ":" in s else s, titles)

        child_items = dict(zip(titles, tlz.pluck("id", r["items"])))
        stat = child_items.pop("STATSGO Soil Characteristics")
        ssur = child_items.pop("SSURGO Soil Characteristics")
        files["Soil"] = {t: self.get_files(c) for t, c in child_items.items()}

        r = self.get_children(stat)
        titles = tlz.pluck("title", r["items"])
        titles = tlz.map(lambda s: s.split(":")[1].split(",")[1].strip(),
                         titles)
        child_items = dict(zip(titles, tlz.pluck("id", r["items"])))
        files["STATSGO"] = {
            t: self.get_files(c)
            for t, c in child_items.items()
        }

        r = self.get_children(ssur)
        titles = tlz.pluck("title", r["items"])
        titles = tlz.map(lambda s: s.split(":")[1].strip(), titles)
        child_items = dict(zip(titles, tlz.pluck("id", r["items"])))
        files["SSURGO"] = {
            t: self.get_files(c)
            for t, c in child_items.items()
        }

        chars = []
        types = {"CAT": "local", "TOT": "upstream_acc", "ACC": "div_routing"}
        for t, dd in files.items():
            for d, fd in dd.items():
                for f, u in fd.items():
                    chars.append({
                        "name": f,
                        "type": types.get(f[-3:], "other"),
                        "theme": t,
                        "description": d,
                        "url": u[0],
                        "meta": u[1],
                    })
        char_df = pd.DataFrame(chars, dtype="category")
        char_df.to_feather(self.char_feather)
        return char_df
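stage_data above chains pluck with map and concat, so every step stays a lazy iterator until it is materialised into the titles/ids mapping. A compressed sketch of that pipeline with invented catalog items (builtin map and functools.partial stand in for the tlz helpers):

import re
from functools import partial
import toolz as tlz

items = [{"title": "Select Climate Attributes", "id": "abc"},
         {"title": "Select Soil Attributes", "id": "def"}]
titles = tlz.concat(map(partial(re.findall, "Select(.*?)Attributes"),
                        tlz.pluck("title", items)))
main_items = dict(zip((t.strip() for t in titles), tlz.pluck("id", items)))
print(main_items)   # {'Climate': 'abc', 'Soil': 'def'}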
Example #45
def _do_fit_step(dsk, next_token, step, cv, fields, tokens, params, Xs, ys,
                 fit_params, n_splits, error_score, step_fields_lk,
                 fit_params_lk, field_to_index, step_name, none_passthrough,
                 is_transform):
    sub_fields, sub_inds = map(list, unzip(step_fields_lk[step_name], 2))
    sub_fit_params = fit_params_lk[step_name]

    if step_name in field_to_index:
        # The estimator may change each call
        new_fits = {}
        new_Xs = {}
        est_index = field_to_index[step_name]

        for ids in _group_ids_by_index(est_index, tokens):
            # Get the estimator for this subgroup
            sub_est = params[ids[0]][est_index]
            if sub_est is MISSING:
                sub_est = step

            # If an estimator is `None`, there's nothing to do
            if sub_est is None:
                nones = dict.fromkeys(ids, None)
                new_fits.update(nones)
                if is_transform:
                    if none_passthrough:
                        new_Xs.update(zip(ids, get(ids, Xs)))
                    else:
                        new_Xs.update(nones)
            else:
                # Extract the proper subset of Xs, ys
                sub_Xs = get(ids, Xs)
                sub_ys = get(ids, ys)
                # Only subset the parameters/tokens if necessary
                if sub_fields:
                    sub_tokens = list(pluck(sub_inds, get(ids, tokens)))
                    sub_params = list(pluck(sub_inds, get(ids, params)))
                else:
                    sub_tokens = sub_params = None

                if is_transform:
                    sub_fits, sub_Xs = do_fit_transform(dsk, next_token,
                                                        sub_est, cv, sub_fields,
                                                        sub_tokens, sub_params,
                                                        sub_Xs, sub_ys,
                                                        sub_fit_params,
                                                        n_splits, error_score)
                    new_Xs.update(zip(ids, sub_Xs))
                    new_fits.update(zip(ids, sub_fits))
                else:
                    sub_fits = do_fit(dsk, next_token, sub_est, cv,
                                      sub_fields, sub_tokens, sub_params,
                                      sub_Xs, sub_ys, sub_fit_params,
                                      n_splits, error_score)
                    new_fits.update(zip(ids, sub_fits))
        # Extract lists of transformed Xs and fit steps
        all_ids = list(range(len(Xs)))
        if is_transform:
            Xs = get(all_ids, new_Xs)
        fits = get(all_ids, new_fits)
    elif step is None:
        # Nothing to do
        fits = [None] * len(Xs)
        if not none_passthrough:
            Xs = fits
    else:
        # Only subset the parameters/tokens if necessary
        if sub_fields:
            sub_tokens = list(pluck(sub_inds, tokens))
            sub_params = list(pluck(sub_inds, params))
        else:
            sub_tokens = sub_params = None

        if is_transform:
            fits, Xs = do_fit_transform(dsk, next_token, step, cv,
                                        sub_fields, sub_tokens, sub_params,
                                        Xs, ys, sub_fit_params, n_splits,
                                        error_score)
        else:
            fits = do_fit(dsk, next_token, step, cv, sub_fields,
                          sub_tokens, sub_params, Xs, ys, sub_fit_params,
                          n_splits, error_score)
    return (fits, Xs) if is_transform else (fits, None)
Example #46
def _do_featureunion(dsk, next_token, est, cv, fields, tokens, params, Xs, ys,
                     fit_params, n_splits, error_score):
    if 'transformer_list' in fields:
        raise NotImplementedError("Setting FeatureUnion.transformer_list "
                                  "in a gridsearch")

    (field_to_index,
     step_fields_lk) = _group_subparams(est.transformer_list, fields,
                                        ignore=('transformer_weights'))
    fit_params_lk = _group_fit_params(est.transformer_list, fit_params)

    token = next_token(est)

    n_samples = _do_n_samples(dsk, token, Xs, n_splits)

    fit_steps = []
    tr_Xs = []
    for (step_name, step) in est.transformer_list:
        fits, out_Xs = _do_fit_step(dsk, next_token, step, cv, fields, tokens,
                                    params, Xs, ys, fit_params, n_splits,
                                    error_score, step_fields_lk, fit_params_lk,
                                    field_to_index, step_name, False, True)
        fit_steps.append(fits)
        tr_Xs.append(out_Xs)

    # Rebuild the FeatureUnions
    step_names = [n for n, _ in est.transformer_list]

    if 'transformer_weights' in field_to_index:
        index = field_to_index['transformer_weights']
        weight_lk = {}
        weight_tokens = list(pluck(index, tokens))
        for i, tok in enumerate(weight_tokens):
            if tok not in weight_lk:
                weights = params[i][index]
                if weights is MISSING:
                    weights = est.transformer_weights
                lk = weights or {}
                weight_list = [lk.get(n) for n in step_names]
                weight_lk[tok] = (weights, weight_list)
        weights = get(weight_tokens, weight_lk)
    else:
        lk = est.transformer_weights or {}
        weight_list = [lk.get(n) for n in step_names]
        weight_tokens = repeat(None)
        weights = repeat((est.transformer_weights, weight_list))

    out = []
    out_append = out.append
    fit_name = 'feature-union-' + token
    tr_name = 'feature-union-concat-' + token
    m = 0
    seen = {}
    for steps, Xs, wt, (w, wl), nsamp in zip(zip(*fit_steps), zip(*tr_Xs),
                                             weight_tokens, weights, n_samples):
        if (steps, wt) in seen:
            out_append(seen[steps, wt])
        else:
            for n in range(n_splits):
                dsk[(fit_name, m, n)] = (feature_union, step_names,
                                         [None if s is None else s + (n,)
                                          for s in steps], w)
                dsk[(tr_name, m, n)] = (feature_union_concat,
                                        [None if x is None else x + (n,)
                                         for x in Xs], nsamp + (n,), wl)
            seen[steps, wt] = m
            out_append(m)
            m += 1
    return [(fit_name, i) for i in out], [(tr_name, i) for i in out]
Example #47
def setcompare(iter1, iter2):
    cntr1 = Counter(iter1)
    cntr2 = Counter(iter2)
    only1 = cntr1.keys() - cntr2.keys()
    only2 = cntr2.keys() - cntr1.keys()
    both = cntr1.keys() & cntr2.keys()

    cnt1 = sum(cntr1[key] for key in only1)
    cnt2 = sum(cntr2[key] for key in only2)
    cnt12a = sum(cntr1[key] for key in both)
    cnt12b = sum(cntr2[key] for key in both)
    distinct1 = len(only1)
    distinct2 = len(only2)
    distinct12 = len(both)

    cnt_perct = "{} ({:.0%})".format

    if hasattr(iter1, "name"):
        name1 = f"1 {iter1.name}"
    else:
        name1 = "1"

    if hasattr(iter2, "name"):
        name2 = f"2 {iter2.name}"
    else:
        name2 = "2"

    display_data = [
        ["", f"Set {name1} only", "Intersect.", f"Set {name2} only"],
        [
            "Count",
            cnt_perct(cnt1, cnt1 / (cnt1 + cnt12a)),
            "{} | {}".format(cnt12a, cnt12b),
            cnt_perct(cnt2, cnt2 / (cnt2 + cnt12b)),
        ],
        [
            "Distinct count",
            cnt_perct(distinct1, distinct1 / (distinct1 + distinct12)),
            distinct12,
            cnt_perct(distinct2, distinct2 / (distinct2 + distinct12)),
        ],
        [
            "Examples",
            format_tuples(
                pluck(
                    0,
                    Counter({key: cntr1[key]
                             for key in only1}).most_common(5)),
                cntr1,
            ),
            format_tuples(
                pluck(
                    0,
                    Counter({key: cntr1[key] + cntr2[key]
                             for key in both}).most_common(5),
                ),
                cntr1,
                cntr2,
            ),
            format_tuples(
                pluck(
                    0,
                    Counter({key: cntr2[key]
                             for key in only2}).most_common(5)),
                cntr2,
            ),
        ],
    ]

    make_table(display_data)
    table = apply_theme("basic_both")
    for x, y in product([0, 1, 2], [1, 2, 3]):
        set_cell_style(x, y, align="center")
    return table
Example #48
def generate_timeline(time_range, save_to_db=True, tweets=None, skip_fields=None, size=40, sideline_turns=5,
                      time_bucket_size=30, location_depth=1, time_zone=None, twitter_api=None,
                      min_length=50, exclude_replies=True, informational_only=False, exclude_retweets=False,
                      post_update=True, retweet_tweeps=True, skip_sensitive_content=True,
                      retweeted_only=False, post_to_twitter=False, update_users=True, shout_score=0.5,
                      n_candidates=None, target_entropy=0.99):

    if n_candidates is not None and n_candidates <= 0:
        n_candidates = None

    # generate filtered set
    characterizer = Characterizer()

    if not time_zone:
        time_zone = pytz.timezone(settings.TIME_ZONE)

    locations = list(Location.objects.filter(depth=location_depth))
    location_pks = [loc.pk for loc in locations]
    location_populations = dict([(loc.pk, loc.population) for loc in locations])

    if tweets is None:
        tweets = _timeline_queryset(time_range, location_depth=location_depth, min_length=min_length, exclude_replies=exclude_replies,
                                informational_only=informational_only, exclude_retweets=exclude_retweets,
                                skip_sensitive_content=skip_sensitive_content, retweeted_only=retweeted_only)

    # Fields pulled from the ORM for every candidate tweet.
    feature_keys = ['pk', 'text', 'favourite_count', 'retweet_count',
                    'user__screen_name', 'datetime', 'user__name', 'user__profile_image_url', 'internal_id',
                    'user__internal_id', 'user__friends_count', 'user__followers_count',
                    'user__statuses_count', 'characterization__count_rts',
                    'characterization__manual_rt', 'characterization__is_reply']

    if location_depth > 0:
        feature_keys.append('user__characterization__location_depth_{0}'.format(location_depth))

    tweet_features = list(tweets.values(*feature_keys))

    if not tweet_features or len(tweet_features) < size:
        print('ERROR, not enough tweets', len(tweet_features))
        return None

    if location_depth > 0:
        pick_strategy, approve_fn = TimelineFilter.select_tweet_and_sideline(TimelineFilter.select_popular_bucketed, location_pks, turns=sideline_turns)
        start_strategy = pick_strategy
    else:
        pick_strategy = TimelineFilter.select_popular_bucketed
        start_strategy = TimelineFilter.starting_tweet
        approve_fn = None

        if skip_fields is None:
            skip_fields = {'geography'}
        else:
            skip_fields.add('geography')

    generator = TimelineFilter(characterizer,
                               skip_fields=skip_fields,
                               min_date=time_range[0],
                               max_entropy_percentile=100.0,
                               time_bucket_size=time_bucket_size,
                               start_strategy=start_strategy,
                               pick_strategy=pick_strategy,
                               approve_tweet_fn=approve_fn,
                               n_candidates=n_candidates,
                               target_entropy=target_entropy)

    for t in tweet_features:
        if location_depth > 0:
            t['geography'] = int(t['user__characterization__location_depth_{0}'.format(location_depth)])
        # Popularity mixes locally counted RTs, Twitter's RT count and favourites.
        t['popularity'] = 2.0 * t['characterization__count_rts'] + t['retweet_count'] + 0.5 * t['favourite_count']
        generator.prepare_tweet(t)

    # Keep tweets below the shout-score threshold and rank them by bucketed popularity.
    tweet_features = [x for x in tweet_features if x['__shout_score__'] < shout_score]
    tweet_features = sorted(tweet_features, key=lambda x: x['buckets']['popularity'], reverse=True)
    print('N features', len(tweet_features))

    # Each call lets the filter add (at most) one more tweet to the timeline.
    for _ in range(min(size, len(tweet_features))):
        generator(tweet_features)

    # estimate data
    tweet_pks = list(pluck('pk', generator))
    tl_tweets = Tweet.objects.in_bulk(tweet_pks)
    to_serialize = [tl_tweets[pk] for pk in tweet_pks]
    popularity = list(pluck('popularity', generator))

    html = []
    media = []
    sources = []

    rted_user_pks = set()

    for t in to_serialize:
        if t.media.exists():
            media.append(json.loads(serializers.serialize("json", t.media.all())))
        else:
            media.append(None)

        if t.rt_instance_tweet.exists():
            rt = t.rt_instance_tweet.all()[0].source_tweet
            rt_serialized = json.loads(serializers.serialize("json", [rt]))[0]
            rted_user_pks.add(rt_serialized['fields']['user'])
            # NOTE: this user is not updated. need to update it at some point.
            #rt_serialized['fields']['user'] = json.loads(serializers.serialize("json", [User.objects.get(pk=pk)]))[0]
            html.append(rt.beautify_html())
            sources.append(rt_serialized)
        else:
            html.append(t.beautify_html())
            sources.append(None)

    if not twitter_api or not update_users:
        print('not updating users')
        users = json.loads(serializers.serialize("json", [t.user for t in to_serialize]))
    else:
        print('updating user profiles')
        user_pks = [t.user_id for t in to_serialize]
        user_pks.extend(rted_user_pks)
        user_ids = User.objects.filter(pk__in=user_pks).values_list('internal_id', flat=True)
        tl_user_data = twitter_api.lookup_users(user_ids=user_ids)

        for tl_user in tl_user_data:
            user, created = User.import_json(valfilter(lambda x: x, tl_user._json))

        user_dict = User.objects.in_bulk([t.user_id for t in to_serialize])
        users = json.loads(serializers.serialize("json", [user_dict[t.user_id] for t in to_serialize]))

    for t, source in zip(to_serialize, sources):
        if source is not None:
            source['fields']['user'] = json.loads(serializers.serialize("json", [User.objects.get(pk=source['fields']['user'])]))[0]

    weight = []

    if location_depth > 0:
        max_population = 1.0 * max([loc.population for loc in locations])
    else:
        max_population = None

    # Weight each selected tweet by the geometric mean of its popularity, its
    # author's audience and, when geolocation is used, the inverse relative
    # population of its location (tweets from smaller places get a boost).
    for t in generator:
        if location_depth > 0:
            w = gmean([t['popularity'] + 1.0,
                       max_population / location_populations[t['geography']],
                       t['user__followers_count'] + 1.0,
                       t['user__friends_count'] + 1.0])
        else:
            w = gmean([t['popularity'] + 1.0,
                       t['user__followers_count'] + 1.0,
                       t['user__friends_count'] + 1.0])
        weight.append(w)

    # Convert seconds to hours for the recency values below.
    hour_norm = 1.0 / (60.0 * 60.0)
    kwargs = {
        'user': users,
        'html': html,
        'popularity': popularity,
        'weight': weight,
        'recency': [(time_range[1] - t.datetime).total_seconds() * hour_norm for t in to_serialize],
        'media': media,
        'source_user': sources
    }

    if location_depth > 0:
        kwargs['geolocation'] = list(pluck('geography', generator))

    if time_zone:
        kwargs['datetime'] = [t.datetime.astimezone(time_zone).isoformat() for t in to_serialize]

    # metadata
    metadata = {
        'locations': json.loads(serializers.serialize("json", locations)),
    }

    if not save_to_db:
        return {'tweets': to_serialize, 'metadata': kwargs}

    # save filtered set
    tl = Timeline.from_tweets(to_serialize, metadata, **kwargs)

    if twitter_api and to_serialize:
        tl_url = u'http://auroratwittera.cl{0}'.format(reverse('timelines:timeline-home'))

        if len(users) > 3:
            top_users = random.sample([u'@{0}'.format(t['fields']['screen_name']) for t in users], 3)
        else:
            top_users = ['@{0}'.format(t['fields']['screen_name']) for t in users]

        # Status text is in Spanish: "New informative summary at {url} with tweets from {users}".
        status_1 = 'Nvo. resumen informativo en {0} con tweets de {1}'.format(tl_url, u' '.join(top_users))
        print(repr(status_1))
        if post_update and post_to_twitter:
            try:
                twitter_api.update_status(status_1)
                time.sleep(30)
            except tweepy.error.TweepError:
                pass

        if rted_user_pks:
            try:
                top_rted = random.sample([u'@{0}'.format(t['fields']['user']['fields']['screen_name']) for t in sources if t], min(3, len(rted_user_pks)))
                # Spanish: "New informative summary at {url} with RTs from {users}".
                status_2 = 'Nvo. resumen informativo en {0} con RTs de {1}'.format(tl_url, u' '.join(top_rted))
                print(repr(status_2))
                if post_update and post_to_twitter:
                    try:
                        twitter_api.update_status(status_2)
                        time.sleep(30)
                    except tweepy.error.TweepError:
                        pass
            except ValueError:
                pass

        if retweet_tweeps:
            # Sort by weight only: Tweet instances define no ordering, so tied
            # weights would otherwise raise a TypeError when compared.
            for w, t in sorted(zip(kwargs['weight'], to_serialize), key=lambda pair: pair[0], reverse=True)[0:size - 1]:
                if not post_to_twitter:
                    print('retweet', t.internal_id)
                else:
                    try:
                        print('retweet', t.internal_id)
                        twitter_api.retweet(t.internal_id)
                        time.sleep(60)
                    except tweepy.error.TweepError as e:
                        print([e])

    return tl
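generate_timeline() uses the same helper to pull parallel fields ('pk', 'popularity', 'geography') out of the dicts yielded by the filter, then restores the selection order after the in_bulk() lookup. A minimal standalone sketch of that pattern, with invented records and a plain dict standing in for the ORM call:

from toolz import pluck

# Records invented for illustration; in generate_timeline() they come from the
# TimelineFilter generator.
selected = [
    {'pk': 11, 'popularity': 42.0, 'geography': 3},
    {'pk': 7, 'popularity': 17.5, 'geography': 1},
    {'pk': 23, 'popularity': 9.0, 'geography': 3},
]

tweet_pks = list(pluck('pk', selected))           # [11, 7, 23]
popularity = list(pluck('popularity', selected))  # [42.0, 17.5, 9.0]

# A plain dict stands in for Tweet.objects.in_bulk(); the comprehension restores
# the selection order, exactly as the example does when building to_serialize.
tweet_map = {pk: '<Tweet %d>' % pk for pk in tweet_pks}
to_serialize = [tweet_map[pk] for pk in tweet_pks]
print(tweet_pks, popularity, to_serialize)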