def make_general_vocab(vocab_size=4096, min_freq=2, overwrite=False):
    """Build a frequency-ranked vocabulary from the cut comments and reviews.

    If the vocab file already exists and overwrite is False, it is loaded
    instead of rebuilt. Returns the vocabulary as a list of words.
    """
    if os.path.exists(CONFIG.vocab_file) and not overwrite:
        return read_lines(CONFIG.vocab_file, lambda x: x.split()[0])
    vocab_counter = defaultdict(int)

    # def must_include_words(words):
    #     for word in words:
    #         vocab_counter[word] = 999999
    # targets = read_lines(CONFIG.target_word_list, lambda x: x.strip())
    # descriptions = read_lines(CONFIG.description_word_list, lambda x: x.strip())
    # must_include_words(targets + descriptions + ['UNK'])
    def update_with_cut_reviews(reviews):
        # Each review is a list of pre-cut (word-segmented) sentences.
        for review in reviews:
            for sentence in review:
                for word in sentence.split():
                    vocab_counter[word] += 1

    comments = load_json_file(CONFIG.single_rate_comment_cut)
    update_with_cut_reviews(comments)
    reviews = load_json_file(CONFIG.single_rate_review_cut)
    update_with_cut_reviews(reviews)
    items = [x for x in vocab_counter.items() if x[1] >= min_freq]
    items.sort(key=lambda x: x[1], reverse=True)
    items = items[:vocab_size]
    write_lines(CONFIG.vocab_file, items, lambda x: '%s %d' % (x[0], x[1]))
    # Return the word list so both branches yield the same type.
    return [word for word, _ in items]
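
# A minimal usage sketch (assumes the cut comment/review JSON files referenced
# by CONFIG already exist; 4096 and 2 are the defaults above, shown explicitly):
#
#   vocab = make_general_vocab(vocab_size=4096, min_freq=2, overwrite=True)
#   print(len(vocab), vocab[:10])  # most frequent words first
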
def make_vocab_lookup(vocab_file, reverse=False, unk_token=None):
    """Build a word2id lookup (or id2word when reverse=True) from a vocab file.

    If unk_token is given and absent from the file, it is prepended so that
    it maps to id 0.
    """
    # Guard against blank lines, which would make x.split()[0] raise.
    words = read_lines(vocab_file,
                       lambda x: x.split()[0] if x.strip() else '')
    if unk_token is not None and unk_token not in words:
        words.insert(0, unk_token)
    words = [x for x in words if x != '']
    if reverse:
        # id -> word
        lookup = {i: x for i, x in enumerate(words)}
    else:
        # word -> id
        lookup = {x: i for i, x in enumerate(words)}
    return lookup
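
# A round-trip sketch (assumes CONFIG.vocab_file exists and 'UNK' is the
# out-of-vocabulary token used elsewhere in this project):
#
#   word2id = make_vocab_lookup(CONFIG.vocab_file, unk_token='UNK')
#   id2word = make_vocab_lookup(CONFIG.vocab_file, reverse=True, unk_token='UNK')
#   ids = [word2id.get(w, word2id['UNK']) for w in sentence.split()]
#   restored = ' '.join(id2word[i] for i in ids)
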
def make_train_test_set(test_set_id=0,
                        model_type=None,
                        random_shuffle=True,
                        random_seed=0):
    """Split the k rating-record files into train/test by held-out file id.

    Returns (traindata, test_input, test_labels); traindata becomes a nested
    {user: {item: rate}} dict for the CF-style models.
    """
    train_files = [
        x for x in os.listdir(CONFIG.training_folder)
        if re.match(r'%s_\d+\.txt$' %
                    CONFIG.rate_record_file_name, x) is not None
    ]
    train_files.sort()
    test_file = '%s_%d.txt' % (CONFIG.rate_record_file_name, test_set_id)
    if test_file not in train_files:
        print('invalid test_set_id! No such file %s' % test_file)
        test_file = train_files[0]
        print('using %s instead' % test_file)
    train_files.remove(test_file)
    train_files = [
        os.path.join(CONFIG.training_folder, x) for x in train_files
    ]
    test_file = os.path.join(CONFIG.training_folder, test_file)
    testdata = read_lines(test_file, lambda x: x.split())
    # Each record line is 'user_id item_id rate'.
    test_input = [(int(x[0]), int(x[1])) for x in testdata]
    test_labels = [int(x[2]) for x in testdata]
    traindata = []
    for train_file in train_files:
        traindata.extend(read_lines(train_file, lambda x: x.split()))
    traindata = [(int(x[0]), int(x[1]), int(x[2])) for x in traindata]
    if random_shuffle:
        random.Random(random_seed).shuffle(traindata)
    if model_type in {'UserCF', 'ItemCF', 'LFM'}:
        # These models consume ratings as a nested {user: {item: rate}} dict.
        dict_train_data = defaultdict(dict)
        for user, item, rate in traindata:
            dict_train_data[user][item] = rate
        traindata = dict_train_data
    return traindata, test_input, test_labels
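
# A usage sketch for the split (assumes split_dataset below has produced the
# k fold files; 'LFM' is one of the dict-consuming model types handled above):
#
#   traindata, test_input, test_labels = make_train_test_set(
#       test_set_id=0, model_type='LFM')
#   some_user = next(iter(traindata))
#   print(some_user, traindata[some_user])  # {item_id: rate, ...}
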
def split_dataset(k=5, overwrite=False):
    """Split the full rating record into k interleaved folds, one file each."""
    if not overwrite:
        files = [
            x for x in os.listdir(CONFIG.training_folder)
            if re.match(r'%s_\d+\.txt$' %
                        CONFIG.rate_record_file_name, x) is not None
        ]
        if len(files) == k:
            return
    all_records = read_lines(CONFIG.rate_record_all)
    for i in range(k):
        out_file = os.path.join(
            CONFIG.training_folder,
            '%s_%d.txt' % (CONFIG.rate_record_file_name, i))
        # Every k-th record, offset by i: an interleaved (round-robin) split.
        subset = all_records[i::k]
        write_lines(out_file, subset, lambda x: x.strip())
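
# An interleaved-split sketch: with k=5, fold i holds records i, i+5, i+10, ...
# (assumes CONFIG.rate_record_all has one 'user item rate' record per line):
#
#   split_dataset(k=5, overwrite=True)
#   # fold files: <rate_record_file_name>_0.txt ... _4.txt in training_folder
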
def make_tags(overwrite=False):
    """Collect genre tags per movie and collect-distribution tags per user."""

    def parse_movie_tags(movie):
        info_file = os.path.join(CONFIG.movie_path, movie, 'info.json')
        if not os.path.exists(info_file):
            return []
        info = load_json_file(info_file)
        return info.get("genres", [])

    def parse_user_tags(user):
        collect_profile_file = os.path.join(CONFIG.user_path, user, 'profile',
                                            'collect_distribution.json')
        if not os.path.exists(collect_profile_file):
            return []
        collect_profile = load_json_file(collect_profile_file)
        tag_distribution = collect_profile.get("type", {})
        # Repeat each tag by its frequency so downstream counts stay weighted.
        tags = [
            tag for tag, freq in tag_distribution.items()
            for _ in range(freq)
        ]
        return tags

    if not overwrite and os.path.exists(CONFIG.user_tags_file):
        user_tags = load_json_file(CONFIG.user_tags_file)
    else:
        users = load_np_array(CONFIG.user_list_file)
        user_tags = list(map(parse_user_tags, users))
        save_json_file(CONFIG.user_tags_file, user_tags)
    if not overwrite and os.path.exists(CONFIG.movie_tags_file):
        movie_tags = load_json_file(CONFIG.movie_tags_file)
    else:
        movies = load_np_array(CONFIG.movie_list_file)
        movie_tags = list(map(parse_movie_tags, movies))
        save_json_file(CONFIG.movie_tags_file, movie_tags)
    if not overwrite and os.path.exists(CONFIG.tag_word_list):
        tag_words = read_lines(CONFIG.tag_word_list, lambda x: x.strip())
    else:
        # Sort for a deterministic file and a list return type in both branches.
        tag_words = sorted(
            set(itertools.chain.from_iterable(user_tags + movie_tags)))
        write_lines(CONFIG.tag_word_list, tag_words)

    return user_tags, movie_tags, tag_words
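
# A minimal end-to-end sketch of this preprocessing module (assumes all CONFIG
# paths are populated; the call order is illustrative, not a prescribed entry
# point for the project):
if __name__ == '__main__':
    make_general_vocab()
    word2id = make_vocab_lookup(CONFIG.vocab_file, unk_token='UNK')
    split_dataset(k=5)
    traindata, test_input, test_labels = make_train_test_set(test_set_id=0)
    user_tags, movie_tags, tag_words = make_tags()
    print('vocab=%d, train=%d, test=%d, tags=%d' %
          (len(word2id), len(traindata), len(test_labels), len(tag_words)))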