def get_examples_and_features(users_labelled_dir_in: str,
                              fe: FeatureExtractor,
                              main_class: int,
                              class_ratio: float):
    # class_ratio must be strictly positive: it is inverted below to compute
    # how many other-class examples to sample.
    assert 0 < class_ratio <= 1, 'class_ratio should be in (0, 1]'
    logger.info('[get_examples] Reading examples from ' + users_labelled_dir_in)
    users_labelled_dir_obj = iopipes.Pipe \
        .recursive_from_path(users_labelled_dir_in, ExampleDeserializer)
    all_ex = []
    all_feat = []
    for uid, dir_obj in users_labelled_dir_obj.items():
        main_class_file_in = str(main_class)
        main_class_feat_file = os.path.join(
            users_labelled_dir_in, uid, main_class_file_in + '_features.json')
        main_ex = list(dir_obj.file(main_class_file_in))
        all_ex.extend(main_ex)
        if not filesystem.readable_file(main_class_feat_file):
            logger.info('[get_examples] Generating features for user {} '
                        'class {}'.format(uid, main_class))
            main_feat = [fe.extract(ex.point) for ex in main_ex]
            all_feat.extend(main_feat)
            np.savetxt(main_class_feat_file, main_feat, fmt='%.5e')
        else:
            all_feat.extend(np.loadtxt(main_class_feat_file))
        # Sample enough other-class examples to hit the requested
        # main-class / other-class ratio.
        nr_other_ex = int((1 / class_ratio) * len(main_ex))
        for other_class in idiff(pos=label_space, neg=[main_class]):
            other_class_file_in = str(other_class)
            other_class_feat_file = os.path.join(
                users_labelled_dir_in, uid,
                other_class_file_in + '_features.json')
            other_ex = list(
                take(nr_other_ex, dir_obj.file(other_class_file_in)))
            all_ex.extend(other_ex)
            if not filesystem.readable_file(other_class_feat_file):
                logger.info('[get_examples] Generating features for user {} '
                            'class {}'.format(uid, other_class))
                other_feat = [fe.extract(ex.point) for ex in other_ex]
                all_feat.extend(other_feat)
                np.savetxt(other_class_feat_file, other_feat, fmt='%.5e')
            else:
                all_feat.extend(np.loadtxt(other_class_feat_file))
    points, labels = split(functions.id, all_ex)
    logger.info('[get_examples] Done reading examples and generating '
                'features. Class distribution: {}'.format(histogram(labels)))
    return points, np.array(all_feat), np.array(labels)
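# Example usage (sketch): building a training set around one class. The
# extractor name below is a placeholder for whatever FeatureExtractor this
# project actually wires in, not a confirmed API.
#
#   fe = SomeFeatureExtractor(vocabulary)          # hypothetical
#   points, features, labels = get_examples_and_features(
#       'data/users_labelled', fe, main_class=0, class_ratio=0.5)
#   # class_ratio=0.5 reads up to two other-class examples per main-class
#   # example, since nr_other_ex = (1 / 0.5) * len(main_ex).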
def get_active_seed_uids(seed_file_in, twitter_auth) -> List[User_Tweet]:
    active_uts_file = filesystem.remove_ext(seed_file_in) + '_active.txt'
    if filesystem.readable_file(active_uts_file):
        logger.info('[get_active_seed_uids] Reading (active user id, '
                    'latest tweet id) pairs from {}'.format(active_uts_file))
        return list(
            iopipes.Pipe.from_path(active_uts_file, User_TweetDeserializer))
    logger.info('[get_active_seed_uids] Generating (active user id, '
                'latest tweet id) pairs into {}'.format(active_uts_file))
    uts = iopipes.Pipe.from_path(
        seed_file_in,
        SchemaJsonDeserializer.build({
            'id': int,
            'user': {
                'id': int
            },
            'lang': str
        }))
    uts = filter(lambda d: d['lang'] == 'en', uts)
    uts = map(lambda d: User_Tweet(d['user']['id'], d['id']), uts)
    # Keep one entry per user: the pair carrying the latest (largest) tweet id.
    uts = dedupe(lambda ut: ut.uid,
                 lambda ut1, ut2: ut1 if ut1.tid > ut2.tid else ut2, uts)
    active_uts = get_active_uts(uts, twitter_auth)
    fileio.it_to_file(active_uts, active_uts_file, User_TweetSerializer)
    return active_uts
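# A minimal standalone sketch of the dedupe step above, assuming
# dedupe(key, combine, it) keys items by `key` and merges collisions with
# `combine`. An equivalent pure-Python version for (uid, tid) pairs:
#
#   latest = {}
#   for ut in uts:
#       prev = latest.get(ut.uid)
#       latest[ut.uid] = ut if prev is None or ut.tid > prev.tid else prev
#   uts = latest.values()   # one User_Tweet per uid, with the largest tid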
def collect_user_hist_tweets(active_uts, user_raw_dir_out, nr_tweets_per_user,
                             twitter_auth):
    logger.info('[collect_user_hist_tweets] Got {} active user ids'.format(
        len(active_uts)))
    for idx, (uid, tid) in enumerate(active_uts, start=1):
        user_file_out = os.path.join(user_raw_dir_out, str(uid) + '.json')
        if not filesystem.readable_file(user_file_out):
            logger.info('[collect_user_hist_tweets] Retrieving tweets for {} '
                        '[user {} / {}]'.format(uid, idx, len(active_uts)))
            it = iopipes.Pipe.twitter_user_timeline(twitter_auth, uid, tid)
            it = take_unique(nr_tweets_per_user,
                             lambda json_obj: json_obj['id'], it)
            fileio.it_to_file(it, user_file_out, IdJsonSerializer)
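# take_unique(n, key, it) is assumed to behave like the sketch below: yield
# at most n items whose key has not been seen before, so duplicated tweet
# ids from the timeline are stored only once. This is an assumption about
# the helper, not its confirmed implementation.
#
#   def take_unique(n, key, it):
#       seen = set()
#       for item in it:
#           k = key(item)
#           if k not in seen:
#               seen.add(k)
#               yield item
#               if len(seen) == n:
#                   return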
def gen_kfold(k, points, labels, split_spec, kfold_dir):
    kfolds_file = os.path.join(kfold_dir, '{}.json'.format(k))
    if not filesystem.readable_file(kfolds_file):
        logger.info('[gen_kfold] Generating {} folds in {}'.format(
            k, kfolds_file))
        # Group by user id so that all of a user's points land in the same
        # fold.
        key_extractor = lambda point: point.uid
        kfolds = list(
            GroupedKFold(k, key_extractor).split(points, labels, split_spec))
        fileio.it_to_file(kfolds, kfolds_file, KFoldIndicesSerializer)
        return kfolds
    logger.info('[gen_kfold] Reading {} folds from {}'.format(k, kfolds_file))
    return iopipes.Pipe.from_path(kfolds_file, KFoldIndicesDeserializer)
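# GroupedKFold is used here so that a single user's tweets never straddle the
# train/test boundary. A minimal sketch of the grouping idea (not the
# project's implementation): folds are assigned per group of indices, never
# per individual point.
#
#   from collections import defaultdict
#   groups = defaultdict(list)
#   for idx, point in enumerate(points):
#       groups[point.uid].append(idx)
#   # each groups[uid] list is then placed into exactly one of the k folds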
def gen_examples_and_stats(user_raw_dir_in: str, user_labelled_dir_out: str,
                           stats_dir_out: str, min_ex_per_user: int):
    vocab_file = os.path.join(stats_dir_out, 'vocabulary.txt')
    prev_vocab = []
    if filesystem.readable_file(vocab_file):
        prev_vocab = iopipes.Pipe.from_path(vocab_file)
    vocab = VocabularyStatBuilder(prev_vocab)
    hst_file_out = os.path.join(stats_dir_out, 'hst.json')
    topics_file_out = os.path.join(stats_dir_out, 'topics.json')
    with fileio.FileDumper(hst_file_out, HstStatSerializer) as hst_dumper, \
            fileio.FileDumper(topics_file_out,
                              TopicsStatSerializer) as topics_dumper:
        for path, u_tweet_it in iopipes.Pipe.named_from_path(
                user_raw_dir_in, RawTweetDeserializer, min_ex_per_user):
            uid = int(filesystem.name(path))
            u_ex_it = map(add_label, filter(valid_tweet, u_tweet_it))
            path_root = os.path.join(user_labelled_dir_out, str(uid))
            ex_files_out = [
                os.path.join(path_root, str(label) + '.json')
                for label in label_space
            ]
            if not filesystem.readable_files(*ex_files_out):
                logger.info('[gen_examples_and_stats] '
                            'Generating examples for user {}'.format(uid))
                with fileio.FileDumpers(
                        zip(label_space, ex_files_out),
                        ExampleSerializer,
                        mode=fileio.WriteModes.WRITE) as dumpers:
                    hst, topics = HstStatBuilder(uid), TopicsStatBuilder(uid)
                    labels_hist = defaultdict(int)
                    for ex in u_ex_it:
                        hst.add(ex)
                        topics.add(ex)
                        vocab.add(ex)
                        dumpers[ex.label].dump(ex)
                        labels_hist[ex.label] += 1
                logger.info('[gen_examples_and_stats] Done generating '
                            'examples for user {}. Label class balance: '
                            '{}'.format(uid, labels_hist))
                hst_dumper.dump(hst.get())
                topics_dumper.dump(topics.get())
    logger.info('[gen_examples_and_stats] Done')
    fileio.it_to_file(vocab.get(), vocab_file, mode=fileio.WriteModes.WRITE)
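# On-disk layout produced above, derived from the paths built in the code:
#
#   <user_labelled_dir_out>/<uid>/<label>.json   one file per label in
#                                                label_space, per user
#   <stats_dir_out>/hst.json                     per-user HST stats
#   <stats_dir_out>/topics.json                  per-user topic stats
#   <stats_dir_out>/vocabulary.txt               global vocabulary, seeded
#                                                from the previous run if
#                                                present, rewritten each run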
def dir_to_recursive_named_line_it(
        path: str,
        min_lines: int,
        skip_header: bool = False,
        deserializer: Type[Deserializer[T]] = StrDeserializer
) -> Dict[str, Any]:
    assert filesystem.readable_dir(path), path + ' is not a directory'
    res = {}
    for member_path in glob.glob(path + '/*'):
        name = filesystem.name(member_path)
        if filesystem.readable_file(member_path):
            res[name] = multifile_to_line_it([member_path], min_lines,
                                             skip_header, deserializer)
        else:
            res[name] = dir_to_recursive_named_line_it(
                member_path, min_lines, skip_header, deserializer)
    return res
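# Example result shape (sketch): for a tree like
#
#   data/
#     123/
#       0.json
#       1.json
#
# dir_to_recursive_named_line_it('data', min_lines=1) returns
#
#   {'123': {'0': <line iterator>, '1': <line iterator>}}
#
# i.e. nested dicts mirroring the directory tree, with leaf values being
# lazy iterators over deserialized lines.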