Example #1
0
def get_examples_and_features(users_labelled_dir_in: str, fe: FeatureExtractor,
                              main_class: int, class_ratio: float):
    """Read labelled examples for every user and pair them with features.

    For each user, all examples of ``main_class`` are read, plus at most
    ``(1 / class_ratio) * len(main examples)`` examples of every other class
    in ``label_space``. Features are loaded from a per-class cache file when
    one is readable; otherwise they are extracted with ``fe`` and written to
    the cache (text format via ``np.savetxt``, despite the ``.json`` name).

    Args:
        users_labelled_dir_in: Root directory with one sub-directory per user.
        fe: Feature extractor applied to each example's ``point``.
        main_class: Label of the class of interest.
        class_ratio: Desired main-class / other-class ratio, in (0, 1].

    Returns:
        Tuple ``(points, features, labels)`` where ``features`` and
        ``labels`` are numpy arrays aligned with ``points``.
    """
    assert 0 <= class_ratio <= 1, 'pos_neg_ratio should be between 0 and 1'
    logger.info('[get_examples] Reading examples from ' +
                users_labelled_dir_in)

    users_labelled_dir_obj = iopipes.Pipe \
        .recursive_from_path(users_labelled_dir_in, ExampleDeserializer)

    def _features_for(uid, class_label, examples):
        # Load cached features for `examples`, or extract and cache them.
        # NOTE(review): np.loadtxt on a single-row file yields a 1-D array,
        # which would extend all_feat with scalars — same as the original
        # behaviour; confirm cache files always hold >= 2 rows.
        feat_file = os.path.join(users_labelled_dir_in, uid,
                                 str(class_label) + '_features.json')
        if filesystem.readable_file(feat_file):
            return list(np.loadtxt(feat_file))
        logger.info('[get_examples] Generating features for user {} class {}'.
                    format(uid, class_label))
        feats = [fe.extract(ex.point) for ex in examples]
        np.savetxt(feat_file, feats, fmt='%.5e')
        return feats

    all_ex = []
    all_feat = []
    for uid, dir_obj in users_labelled_dir_obj.items():
        main_ex = list(dir_obj.file(str(main_class)))
        all_ex.extend(main_ex)
        all_feat.extend(_features_for(uid, main_class, main_ex))

        # Cap examples drawn from each other class so the main/other ratio
        # matches class_ratio.
        nr_other_ex = int((1 / class_ratio) * len(main_ex))
        for other_class in idiff(pos=label_space, neg=[main_class]):
            other_ex = list(
                take(nr_other_ex, dir_obj.file(str(other_class))))
            all_ex.extend(other_ex)
            all_feat.extend(_features_for(uid, other_class, other_ex))

    points, labels = split(functions.id, all_ex)

    # Tag fixed: was '[gen_examples]' while every other message in this
    # function uses '[get_examples]'.
    logger.info(
        '[get_examples] Done reading examples and generating features. '
        'Class distribution: {}'.format(histogram(labels)))
    return points, np.array(all_feat), np.array(labels)
def get_active_seed_uids(seed_file_in, twitter_auth) -> Iterator[User_Tweet]:
    """Return (active user id, latest tweet id) pairs for the seed users.

    Results are cached next to ``seed_file_in`` in ``*_active.txt``; when the
    cache file is readable it is loaded instead of hitting the Twitter API.

    Args:
        seed_file_in: Path to a file of raw seed tweets (one JSON per line).
        twitter_auth: Credentials forwarded to ``get_active_uts``.

    Returns:
        A list of ``User_Tweet`` pairs for active, English-language users.
    """
    active_uts_file = filesystem.remove_ext(seed_file_in) + '_active.txt'

    if filesystem.readable_file(active_uts_file):
        logger.info('[get_active_seed_uids] Reading (active user id, '
                    'latest tweet id) pairs from {}'.format(active_uts_file))
        return list(
            iopipes.Pipe.from_path(active_uts_file, User_TweetDeserializer))

    logger.info('[get_active_seed_uids] Generating (active user ids, '
                'latest tweets id) pairs into {}'.format(active_uts_file))
    uts = iopipes.Pipe.from_path(
        seed_file_in,
        SchemaJsonDeserializer.build({
            'id': int,
            'user': {
                'id': int
            },
            'lang': str
        }))
    # Keep only English tweets and reduce each one to (user id, tweet id).
    uts = filter(lambda d: d['lang'] == 'en', uts)
    uts = map(lambda d: User_Tweet(d['user']['id'], d['id']), uts)
    # For users with several tweets, keep the pair with the highest tweet id
    # (i.e. the user's latest tweet).
    uts = dedupe(lambda ut: ut.uid, lambda ut1, ut2: ut1
                 if ut1.tid > ut2.tid else ut2, uts)
    # Materialize BEFORE dumping: if get_active_uts returns a lazy iterator,
    # it_to_file would exhaust it and the caller would receive an empty
    # iterable (the cached branch above returns a list for the same reason).
    active_uts = list(get_active_uts(uts, twitter_auth))
    fileio.it_to_file(active_uts, active_uts_file, User_TweetSerializer)
    return active_uts
def collect_user_hist_tweets(active_uts, user_raw_dir_out, nr_tweets_per_user,
                             twitter_auth):
    """Download the historical timeline of every active user to disk.

    Each user's tweets land in ``<user_raw_dir_out>/<uid>.json``; users whose
    output file already exists (is readable) are skipped, so the collection
    can be resumed.
    """
    logger.info(
        '[collect_user_hist_tweets] Got {} users active user ids'.format(
            len(active_uts)))
    total = len(active_uts)
    for idx, (uid, tid) in enumerate(active_uts, start=1):
        user_file_out = os.path.join(user_raw_dir_out, str(uid) + '.json')
        if filesystem.readable_file(user_file_out):
            continue  # already collected for this user; resume-friendly skip
        logger.info('[collect_user_hist_tweets] Retrieving tweets for {} '
                    '[user {} / {}]'.format(uid, idx, total))
        timeline = iopipes.Pipe.twitter_user_timeline(twitter_auth, uid, tid)
        # Deduplicate by tweet id while capping at nr_tweets_per_user.
        timeline = take_unique(nr_tweets_per_user,
                               lambda json_obj: json_obj['id'], timeline)
        fileio.it_to_file(timeline, user_file_out, IdJsonSerializer)
def gen_kfold(k, points, labels, split_spec, kfold_dir):
    """Generate (or load cached) grouped k-fold splits.

    Folds are cached in ``<kfold_dir>/<k>.json``. Splits are grouped by each
    point's ``uid`` so one user's examples never straddle folds.

    Returns:
        The folds: a list when freshly generated, an iterable pipe when read
        back from the cache.
    """
    kfolds_file = os.path.join(kfold_dir, '{}.json'.format(k))

    if not filesystem.readable_file(kfolds_file):
        logger.info('[gen_kfold] Generating {} folds in {}'
                    .format(k, kfolds_file))

        # def instead of a lambda assignment (PEP 8 E731).
        def key_extractor(point):
            return point.uid

        kfolds = list(GroupedKFold(k, key_extractor).split(
            points, labels, split_spec))
        fwrite_utils.it_to_file(kfolds, kfolds_file, KFoldIndicesSerializer)
        return kfolds

    # Tag fixed: was '[get_kfold]' while the function is named gen_kfold.
    logger.info('[gen_kfold] Reading {} folds from {}'.format(k, kfolds_file))
    kfolds_it = Pipe.from_path(kfolds_file, KFoldIndicesDeserializer)
    return kfolds_it
Example #5
0
def gen_examples_and_stats(user_raw_dir_in: str, user_labelled_dir_out: str,
                           stats_dir_out: str, min_ex_per_user: int) -> None:
    """Turn raw per-user tweet files into labelled examples plus corpus stats.

    For every user file under ``user_raw_dir_in`` with at least
    ``min_ex_per_user`` entries, valid tweets are labelled and written to one
    file per label under ``<user_labelled_dir_out>/<uid>/``. Hashtag ("hst"),
    topic and vocabulary statistics are accumulated along the way and dumped
    into ``stats_dir_out``. Users whose per-label files already all exist are
    skipped, so the run is resumable.
    """

    vocab_file = os.path.join(stats_dir_out, 'vocabulary.txt')
    prev_vocab = []
    # Warm-start the vocabulary builder from any previously persisted file.
    if filesystem.readable_file(vocab_file):
        prev_vocab = iopipes.Pipe.from_path(vocab_file)
    vocab = VocabularyStatBuilder(prev_vocab)

    hst_file_out = os.path.join(stats_dir_out, 'hst.json')
    topics_file_out = os.path.join(stats_dir_out, 'topics.json')
    # Both stat dumpers stay open for the whole pass over all users; one
    # record per user is appended to each.
    with fileio.FileDumper(hst_file_out, HstStatSerializer) as hst_dumper, \
         fileio.FileDumper(topics_file_out, TopicsStatSerializer) as topics_dumper:
        for path, u_tweet_it in iopipes.Pipe.named_from_path(
                user_raw_dir_in, RawTweetDeserializer, min_ex_per_user):
            # The user id is the file's base name (see the '<uid>.json'
            # layout produced by collect_user_hist_tweets).
            uid = int(filesystem.name(path))
            u_ex_it = map(add_label, filter(valid_tweet, u_tweet_it))

            # One output file per label in label_space for this user.
            path_root = os.path.join(user_labelled_dir_out, str(uid))
            ex_files_out = list(
                map(
                    lambda label: os.path.join(path_root,
                                               str(label) + '.json'),
                    label_space))
            if not filesystem.readable_files(*ex_files_out):
                logger.info('[generate_examples_and_stats]'
                            'Generating examples for user {}'.format(uid))
                # Dumpers are keyed by label so each example is routed to
                # its label's file; WRITE mode truncates any partial output.
                with fileio.FileDumpers(
                        zip(label_space, ex_files_out),
                        ExampleSerializer,
                        mode=fileio.WriteModes.WRITE) as dumpers:
                    hst, topics = HstStatBuilder(uid), TopicsStatBuilder(uid)
                    labels_hist = defaultdict(int)
                    # Single pass: feed every stat builder and the per-label
                    # dumper from the same example stream.
                    for ex in u_ex_it:
                        hst.add(ex)
                        topics.add(ex)
                        vocab.add(ex)
                        dumpers[ex.label].dump(ex)
                        labels_hist[ex.label] += 1
                    logger.info(
                        '[generate_examples_and_stats] Done generating '
                        'examples for user {}. Label class balance: {}'.format(
                            uid, labels_hist))
                    hst_dumper.dump(hst.get())
                    topics_dumper.dump(topics.get())

    logger.info('[generate_examples_and_stats] Done')
    # Persist the (possibly warm-started) vocabulary, overwriting the old one.
    fileio.it_to_file(vocab.get(), vocab_file, mode=fileio.WriteModes.WRITE)
def dir_to_recursive_named_line_it(
        path: str,
        min_lines: int,
        skip_header: bool = False,
        deserializer: Type[Deserializer[T]] = StrDeserializer
) -> Dict[str, Any]:
    """Recursively map a directory tree to named line iterators.

    Regular files become line iterators (via ``multifile_to_line_it``) and
    sub-directories become nested dicts of the same shape, both keyed by the
    entry's base name.
    """
    assert filesystem.readable_dir(path), path + ' is not a directory'

    result = {}
    for entry in glob.glob(path + '/*'):
        if filesystem.readable_file(entry):
            value = multifile_to_line_it([entry], min_lines, skip_header,
                                         deserializer)
        else:
            # Not a readable file -> treat as a directory and recurse.
            value = dir_to_recursive_named_line_it(entry, min_lines,
                                                   skip_header, deserializer)
        result[filesystem.name(entry)] = value
    return result