def make_general_vocab(vocab_size=4096, min_freq=2, overwrite=False):
    """Build (or load) the general vocabulary from the cut comments/reviews.

    When ``CONFIG.vocab_file`` exists and ``overwrite`` is false, the first
    whitespace-separated token of every line in that file is returned.
    Otherwise word frequencies are counted over the data stored in
    ``CONFIG.single_rate_comment_cut`` and ``CONFIG.single_rate_review_cut``,
    the result is written to ``CONFIG.vocab_file`` as ``"<word> <count>"``
    lines, and the kept words are returned.

    Args:
        vocab_size: Maximum number of words to keep.
        min_freq: Words counted fewer times than this are discarded.
        overwrite: Rebuild the vocabulary even if the file already exists.

    Returns:
        The vocabulary words. A freshly built vocabulary is ordered from the
        most to the least frequent word.
    """
    if os.path.exists(CONFIG.vocab_file) and not overwrite:
        return read_lines(CONFIG.vocab_file, lambda x: x.split()[0])
    vocab_counter = defaultdict(int)

    # NOTE: forcing the target/description word lists and 'UNK' into the
    # vocabulary (by giving them a huge count) is currently disabled.

    def update_with_cut_reviews(reviews):
        # Each review is an iterable of sentences whose words are separated
        # by whitespace.
        for review in reviews:
            for sentence in review:
                for word in sentence.split():
                    vocab_counter[word] += 1

    update_with_cut_reviews(load_json_file(CONFIG.single_rate_comment_cut))
    update_with_cut_reviews(load_json_file(CONFIG.single_rate_review_cut))
    items = [item for item in vocab_counter.items() if item[1] >= min_freq]
    items.sort(key=lambda x: x[1], reverse=True)
    items = items[:vocab_size]
    write_lines(CONFIG.vocab_file, items, lambda x: '%s %d' % (x[0], x[1]))
    # Return the words here too, so callers get the same kind of result
    # whether the vocabulary was loaded from disk or just built.
    return [word for word, _ in items]
def make_train_test_set_for_single_rate_pred(test_set_id=0,
                                             model_type=None,
                                             random_shuflle=False,
                                             random_seed=0):
    """Assemble the train/test data for single-rate prediction from fold files.

    Fold files live in ``CONFIG.single_rate_training_folder`` and are named
    ``<CONFIG.single_rate_file_name>_<number>.json``. The fold numbered
    ``test_set_id`` becomes the test set; all other folds are concatenated
    into the training set. If the requested fold does not exist, a message is
    printed and the first fold (in sorted name order) is used instead.

    Args:
        test_set_id: Number of the fold to hold out as test data.
        model_type: Unused by this function.
        random_shuflle: Shuffle the training records when true.
        random_seed: Seed for the shuffle, so it is reproducible.

    Returns:
        ``(traindata, test_input, test_labels)`` where ``traindata`` holds the
        raw records of the training folds, ``test_input`` holds the first two
        fields of every test record and ``test_labels`` the third field.

    Raises:
        IndexError: If the requested fold is missing and no fold file exists
            to fall back to.
    """
    # Escape the configured prefix and match the whole name so that stray
    # files such as "<name>_0.json.bak" are not mistaken for folds.
    fold_pattern = re.compile(
        r'%s_\d+\.json' % re.escape(CONFIG.single_rate_file_name))
    train_files = sorted(
        x for x in os.listdir(CONFIG.single_rate_training_folder)
        if fold_pattern.fullmatch(x) is not None)
    test_file = '%s_%d.json' % (CONFIG.single_rate_file_name, test_set_id)
    if test_file not in train_files:
        print('error test_set_id! No such file %s' % test_file)
        test_file = train_files[0]
        print('use %s instead' % test_file)
    train_files.remove(test_file)
    train_files = [
        os.path.join(CONFIG.single_rate_training_folder, x)
        for x in train_files
    ]
    test_file = os.path.join(CONFIG.single_rate_training_folder, test_file)
    testdata = load_json_file(test_file)
    test_input = [(x[0], x[1]) for x in testdata]
    test_labels = [x[2] for x in testdata]
    traindata = []
    for train_file in train_files:
        traindata.extend(load_json_file(train_file))
    if random_shuflle:
        # A dedicated Random instance keeps the global RNG state untouched.
        random.Random(random_seed).shuffle(traindata)
    return traindata, test_input, test_labels
def merge_cut_profile_rate(filter_no_profile=True, overwrite=False):
    """Zip cut review text, profiles and rates together and save them.

    Does nothing when ``CONFIG.sing_rate_data_all`` already exists and
    ``overwrite`` is false. Otherwise the review cuts, review profiles and
    review rates are combined position by position into
    ``(cut, profile, rate)`` records and written to
    ``CONFIG.sing_rate_data_all``.

    Args:
        filter_no_profile: Drop records whose profile is empty.
        overwrite: Rebuild the merged file even if it already exists.
    """
    already_built = (not overwrite
                     and os.path.exists(CONFIG.sing_rate_data_all))
    if already_built:
        return

    # The comment data source is currently disabled; only reviews are merged.
    comment_records = []

    review_cuts = load_json_file(CONFIG.single_rate_review_cut)
    review_rates = [
        entry[1] for entry in load_json_file(CONFIG.single_rate_review)
    ]
    review_profiles = load_json_file(CONFIG.sing_rate_review_profile)
    review_records = [
        record
        for record in zip(review_cuts, review_profiles, review_rates)
        if not filter_no_profile or len(record[1]) > 0
    ]
    # Release the (potentially large) source lists before saving.
    del review_cuts, review_rates, review_profiles
    gc.collect()

    merged = comment_records + review_records
    save_json_file(CONFIG.sing_rate_data_all, merged)
    print('got %d cut_profile_rate data' % len(merged))
    del comment_records, review_records, merged
    gc.collect()
def make_profiles(overwrite=False):
    """Load or build the per-user and per-movie profile lists.

    For each kind, the cached JSON file (``CONFIG.user_profile_file`` /
    ``CONFIG.movie_profile_file``) is used when it exists and ``overwrite``
    is false. Otherwise one profile is parsed per id in
    ``CONFIG.user_list_file`` / ``CONFIG.movie_list_file`` and the result is
    saved to the cache file.

    Args:
        overwrite: Rebuild both profile files even if they already exist.

    Returns:
        ``(user_profile, movie_profile)``; each is a list with one entry per
        id, and each entry is a list of
        ``(target, description, sentiment)`` triples.
    """
    def read_profile_triples(entity_id, kind):
        # <data_path>/<kind>/<id>/analysis/profile.json; a missing file
        # means an empty profile.
        path = os.path.join(CONFIG.data_path, kind, entity_id, 'analysis',
                            'profile.json')
        if not os.path.exists(path):
            return []
        nested = load_json_file(path)
        # One triple per sentence recorded under target/sentiment/description.
        return [
            (target, description, sentiment)
            for target, by_sentiment in nested.items()
            for sentiment, by_description in by_sentiment.items()
            for description, sentences in by_description.items()
            for _ in range(len(sentences))
        ]

    if overwrite or not os.path.exists(CONFIG.user_profile_file):
        user_ids = load_np_array(CONFIG.user_list_file)
        user_profile = [read_profile_triples(uid, 'user') for uid in user_ids]
        save_json_file(CONFIG.user_profile_file, user_profile)
    else:
        user_profile = load_json_file(CONFIG.user_profile_file)

    if overwrite or not os.path.exists(CONFIG.movie_profile_file):
        movie_ids = load_np_array(CONFIG.movie_list_file)
        movie_profile = [
            read_profile_triples(mid, 'subject') for mid in movie_ids
        ]
        save_json_file(CONFIG.movie_profile_file, movie_profile)
    else:
        movie_profile = load_json_file(CONFIG.movie_profile_file)

    return user_profile, movie_profile
# Example #5
# 0
 def load_json(self, save_name: str):
     """Load a JSON file stored under ``self.path_name``.

     ``.json`` is appended to ``save_name`` unless the name already contains
     that substring.

     Raises:
         OSError: If the resulting path does not exist.
     """
     file_name = save_name if '.json' in save_name else save_name + '.json'
     full_path = os.path.join(self.path_name, file_name)
     if os.path.exists(full_path):
         return load_json_file(full_path)
     raise OSError('No such json file: %s' % full_path)
def get_rate_matrix(user_th=5, mode='rate', overwrite=False):
    """Load or build the dense user x movie rating matrix.

    When ``CONFIG.rate_record_all`` exists and ``overwrite`` is false, the
    cached user list, movie list and matrix are loaded; if any of them fails
    to load, everything is rebuilt. A rebuild scans each directory in
    ``CONFIG.user_path`` for a ``collect.json`` file, writes all
    ``row col value date`` records to ``CONFIG.rate_record_all`` and saves
    the matrix and both id lists.

    Args:
        user_th: Users with fewer than this many usable entries are skipped.
        mode: ``'rate'`` keeps entries with a non-zero rate; ``'comment'``
            additionally requires a non-empty comment; ``'bool'`` keeps every
            entry and stores 1 instead of the rate. Any other value keeps
            every entry and stores the rate.
        overwrite: Rebuild even if the cached files exist.

    Returns:
        ``(user_list, movie_list, rate_matrix)`` where row ``i`` of the matrix
        belongs to ``user_list[i]`` and column ``j`` to ``movie_list[j]``.

    Raises:
        ValueError: If no record at all was collected (nothing to unpack).
    """
    if not overwrite and os.path.exists(CONFIG.rate_record_all):
        try:
            user_list = load_np_array(CONFIG.user_list_file)
            movie_list = load_np_array(CONFIG.movie_list_file)
            rate_matrix = load_np_array(CONFIG.rate_matrix_file)
            return user_list, movie_list, rate_matrix
        except Exception:
            # Cache is incomplete or unreadable: fall through and rebuild.
            # (Not a bare except, so Ctrl-C / SystemExit still propagate.)
            pass
    user_index_dict = dict()
    user_list = []
    user_index = 0
    movie_index_dict = dict()
    movie_list = []
    movie_index = 0
    users = os.listdir(CONFIG.user_path)
    records = []
    for user in users:
        collect_file = os.path.join(CONFIG.user_path, user, 'collect.json')
        try:
            collects = load_json_file(collect_file)
        except Exception:
            # Best effort: a missing or broken file counts as "no entries".
            collects = []
        if mode == 'comment':
            collects = [
                x for x in collects
                if x['comment'] != '' and x["rate"] != 0
            ]
        elif mode == 'rate':
            collects = [x for x in collects if x["rate"] != 0]
        if len(collects) < user_th:
            continue
        # Rows are assigned in the order qualifying users are encountered.
        user_index_dict[user] = user_index
        user_list.append(user)
        user_index += 1
        for item in collects:
            if mode == 'bool':
                value = 1
            else:
                value = item['rate']
            movie = item['movie_id']
            # Columns are assigned the first time a movie is seen.
            if movie not in movie_index_dict:
                movie_index_dict[movie] = movie_index
                movie_list.append(movie)
                movie_index += 1
            date = item["date"]
            row = user_index_dict[user]
            col = movie_index_dict[movie]
            records.append((row, col, value, date))
    write_lines(CONFIG.rate_record_all, records, lambda x: '%s %s %d %s' %
                (x[0], x[1], x[2], x[3]))
    # Dates are only needed in the record file, not in the matrix.
    rows, cols, values, _ = list(zip(*records))
    rate_matrix = coo_matrix((values, (rows, cols)),
                             shape=(user_index, movie_index)).todense()
    save_np_array(CONFIG.rate_matrix_file, rate_matrix)
    save_np_array(CONFIG.user_list_file, user_list)
    save_np_array(CONFIG.movie_list_file, movie_list)
    return user_list, movie_list, rate_matrix
 def parse_user_tags(user):
     """Return the user's tags, each repeated as often as its frequency.

     Reads the ``"type"`` mapping (tag -> frequency) from
     ``<CONFIG.user_path>/<user>/profile/collect_distribution.json``.
     Returns an empty list when that file does not exist.
     """
     distribution_path = os.path.join(CONFIG.user_path, user, 'profile',
                                      'collect_distribution.json')
     if os.path.exists(distribution_path):
         type_counts = load_json_file(distribution_path).get("type", {})
         return [
             tag
             for tag, freq in type_counts.items()
             for _ in range(freq)
         ]
     return []
def make_tags(overwrite=False):
    """Load or build the per-user tags, per-movie tags and the tag word list.

    Each of the three results is read from its cache file
    (``CONFIG.user_tags_file``, ``CONFIG.movie_tags_file``,
    ``CONFIG.tag_word_list``) when that file exists and ``overwrite`` is
    false; otherwise it is rebuilt and written to that file.

    Args:
        overwrite: Rebuild all three files even if they already exist.

    Returns:
        ``(user_tags, movie_tags, tag_words)``: one tag list per user, one
        tag list per movie, and the list of distinct tags.
    """
    def parse_movie_tags(movie):
        # Movie tags are the "genres" entry of the movie's info.json.
        info_file = os.path.join(CONFIG.movie_path, movie, 'info.json')
        if not os.path.exists(info_file):
            return []
        info = load_json_file(info_file)
        return info.get("genres", [])

    def parse_user_tags(user):
        # User tags come from the "type" mapping (tag -> frequency); each
        # tag is repeated once per count.
        collect_profile_file = os.path.join(CONFIG.user_path, user, 'profile',
                                            'collect_distribution.json')
        if not os.path.exists(collect_profile_file):
            return []
        collect_profile = load_json_file(collect_profile_file)
        tag_distribution = collect_profile.get("type", {})
        return [
            tag
            for tag, freq in tag_distribution.items()
            for _ in range(freq)
        ]

    if not overwrite and os.path.exists(CONFIG.user_tags_file):
        user_tags = load_json_file(CONFIG.user_tags_file)
    else:
        users = load_np_array(CONFIG.user_list_file)
        user_tags = [parse_user_tags(user) for user in users]
        save_json_file(CONFIG.user_tags_file, user_tags)
    if not overwrite and os.path.exists(CONFIG.movie_tags_file):
        movie_tags = load_json_file(CONFIG.movie_tags_file)
    else:
        movies = load_np_array(CONFIG.movie_list_file)
        movie_tags = [parse_movie_tags(movie) for movie in movies]
        save_json_file(CONFIG.movie_tags_file, movie_tags)
    if not overwrite and os.path.exists(CONFIG.tag_word_list):
        tag_words = read_lines(CONFIG.tag_word_list, lambda x: x.strip())
    else:
        # Sorted list rather than a raw set: the written file gets a stable
        # order and the result is ordered like the cached branch's output.
        tag_words = sorted(
            set(itertools.chain.from_iterable(user_tags + movie_tags)))
        write_lines(CONFIG.tag_word_list, tag_words)

    return user_tags, movie_tags, tag_words
 def parse_profile(_id, _type):
     """Flatten one profile.json into (target, description, sentiment) triples.

     The file is looked up at
     ``<CONFIG.data_path>/<_type>/<_id>/analysis/profile.json``. Its content
     is nested as target -> sentiment -> description -> sentences, and one
     triple is produced per sentence. A missing file gives an empty list.
     """
     profile_path = os.path.join(CONFIG.data_path, _type, _id, 'analysis',
                                 'profile.json')
     if not os.path.exists(profile_path):
         return []
     nested = load_json_file(profile_path)
     return [
         (target, description, sentiment)
         for target, by_sentiment in nested.items()
         for sentiment, by_description in by_sentiment.items()
         for description, sentences in by_description.items()
         for _ in range(len(sentences))
     ]
def split_cut_profile_rate(k=5, overwrite=False):
    """Shuffle the merged records and split them into ``k`` fold files.

    The records in ``CONFIG.sing_rate_data_all`` are shuffled with the global
    random generator and fold ``i`` receives every ``k``-th record starting
    at position ``i``. Folds are written to
    ``CONFIG.single_rate_training_folder`` as
    ``<CONFIG.single_rate_file_name>_<i>.json``.

    Args:
        k: Number of folds to produce.
        overwrite: Re-split even if exactly ``k`` fold files already exist.
    """
    if not overwrite:
        # Escape the configured prefix and match the whole name so that
        # stray files such as "<name>_0.json.bak" are not counted as folds.
        fold_pattern = re.compile(
            r'%s_\d+\.json' % re.escape(CONFIG.single_rate_file_name))
        files = [
            x for x in os.listdir(CONFIG.single_rate_training_folder)
            if fold_pattern.fullmatch(x) is not None
        ]
        if len(files) == k:
            return
    all_records = load_json_file(CONFIG.sing_rate_data_all)
    print_with_time('all records loaded')
    random.shuffle(all_records)
    print_with_time('shuffled')
    for i in range(k):
        out_file = os.path.join(
            CONFIG.single_rate_training_folder,
            '%s_%d.json' % (CONFIG.single_rate_file_name, i))
        subset = all_records[i::k]
        save_json_file(out_file, subset)
        # Free each fold right after writing it to keep peak memory down.
        del subset
        gc.collect()
    del all_records
    gc.collect()
def get_all_comments_and_reviews():
    """Collect every rated comment and review and save them as two JSON files.

    User side: for each id in ``CONFIG.user_list_file`` the user's
    ``review/review.json`` and ``collect/collect.json`` are read; a file that
    cannot be loaded is reported with ``print`` and treated as empty.
    Movie side: for each id in ``CONFIG.movie_list_file`` every file ending
    in ``0.json`` inside the movie's ``reviews`` and ``comments`` folders is
    read. Only entries with a positive rating are kept (user comments must
    also be non-blank). The ``(text, rate)`` pairs are written to
    ``CONFIG.comment_rate_file`` and ``CONFIG.review_rate_file``, user
    entries first.
    """
    def load_user_json(user, *tail):
        # Any failure while locating or loading the file means "no data".
        try:
            return load_json_file(os.path.join(CONFIG.user_path, user, *tail))
        except Exception as e:
            print(e)
            return []

    def rated_movie_entries(folder, key):
        # Gather (content, rating value) pairs from all '*0.json' files in
        # ``folder``; ``key`` names the list inside each file.
        entries = []
        if not os.path.exists(folder):
            return entries
        paths = [
            os.path.join(folder, name)
            for name in os.listdir(folder) if name.endswith('0.json')
        ]
        for path in paths:
            payload = load_json_file(path)
            entries.extend([(x["content"], x["rating"]["value"])
                            for x in payload[key]
                            if x["rating"]["value"] > 0])
        return entries

    users = load_np_array(CONFIG.user_list_file)
    movies = load_np_array(CONFIG.movie_list_file)
    user_comment, user_review = [], []
    movie_comment, movie_review = [], []

    for user in users:
        raw_reviews = load_user_json(user, 'review', 'review.json')
        raw_comments = load_user_json(user, 'collect', 'collect.json')
        kept_reviews = [(x["content"], x["rate"]) for x in raw_reviews
                        if len(x) > 0 and x["rate"] > 0]
        kept_comments = [(x["comment"], x["rate"]) for x in raw_comments
                         if x["rate"] > 0 and x["comment"].strip() != ""]
        user_review.extend(kept_reviews)
        user_comment.extend(kept_comments)
    print('user: comment: %d, review: %d' %
          (len(user_comment), len(user_review)))

    for movie in movies:
        movie_review.extend(
            rated_movie_entries(
                os.path.join(CONFIG.movie_path, movie, 'reviews'),
                'reviews'))
        movie_comment.extend(
            rated_movie_entries(
                os.path.join(CONFIG.movie_path, movie, 'comments'),
                'comments'))
    print('movie: comment: %d, review: %d' %
          (len(movie_comment), len(movie_review)))

    save_json_file(CONFIG.comment_rate_file, user_comment + movie_comment)
    save_json_file(CONFIG.review_rate_file, user_review + movie_review)
 def parse_movie_tags(movie):
     """Return the ``"genres"`` entry of the movie's info.json.

     The file is ``<CONFIG.movie_path>/<movie>/info.json``. An empty list is
     returned when the file is missing or has no ``"genres"`` key.
     """
     info_path = os.path.join(CONFIG.movie_path, movie, 'info.json')
     if os.path.exists(info_path):
         return load_json_file(info_path).get("genres", [])
     return []