Code example #1
def update_labeled_reviews_records():

    reviews_label_map = compare_records()
    agreed_review_ids = set(reviews_label_map.keys())
    classifier_records = \
        ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)
    classifier_review_ids = \
        {record[Constants.REVIEW_ID_FIELD] for record in classifier_records}
    non_agreed_review_ids = classifier_review_ids.difference(agreed_review_ids)

    # for record in classifier_records:
    # print(record)

    print('number of records before: %d' % len(classifier_records))

    print(reviews_label_map)
    print(non_agreed_review_ids)
    review_type_map = {'s': 'yes', 'g': 'no'}

    # Remove from the classifier records the ones whose label was not agreed on
    classifier_records = ETLUtils.filter_out_records(classifier_records,
                                                     Constants.REVIEW_ID_FIELD,
                                                     non_agreed_review_ids)

    # Finally, update the labels
    for record in classifier_records:
        review_id = record[Constants.REVIEW_ID_FIELD]
        record[Constants.SPECIFIC] = review_type_map[
            reviews_label_map[review_id]]
        # print(record)

    print('number of records after: %d' % len(classifier_records))
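
For reference, a minimal sketch of the two ETLUtils filtering helpers these examples rely on, inferred from how they are called here rather than taken from the actual library:

def filter_records(records, field, values):
    # Keep only the records whose value for the given field is in values
    return [record for record in records if record[field] in values]

def filter_out_records(records, field, values):
    # Keep only the records whose value for the given field is NOT in values
    return [record for record in records if record[field] not in values]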
Code example #2
def update_labeled_reviews_records():

    reviews_label_map = compare_records()
    agreed_review_ids = set(reviews_label_map.keys())
    classifier_records = \
        ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)
    classifier_review_ids = \
        {record[Constants.REVIEW_ID_FIELD] for record in classifier_records}
    non_agreed_review_ids = classifier_review_ids.difference(agreed_review_ids)

    # for record in classifier_records:
        # print(record)

    print('number of records before: %d' % len(classifier_records))

    print(reviews_label_map)
    print(non_agreed_review_ids)
    review_type_map = {'s': 'yes', 'g': 'no'}

    # Remove from the classifier records the ones whose label was not agreed on
    classifier_records = ETLUtils.filter_out_records(
        classifier_records, Constants.REVIEW_ID_FIELD, non_agreed_review_ids)

    # Finally, update the labels
    for record in classifier_records:
        review_id = record[Constants.REVIEW_ID_FIELD]
        record[Constants.SPECIFIC] = review_type_map[reviews_label_map[review_id]]
        # print(record)

    print('number of records after: %d' % len(classifier_records))
Code example #3
File: reviews_classifier.py, Project: bachlog/yelp
def main():
    # dataset = 'hotel'
    dataset = 'restaurant'
    my_folder = '/Users/fpena/UCC/Thesis/datasets/context/'
    my_training_records_file =\
        my_folder + 'classified_' + dataset + '_reviews.json'
    my_training_reviews_file =\
        my_folder + 'classified_' + dataset + '_reviews.pkl'
    my_training_records = ETLUtils.load_json_file(my_training_records_file)

    with open(my_training_reviews_file, 'rb') as read_file:
        my_training_reviews = pickle.load(read_file)

    classifier = ReviewsClassifier()
    classifier.train(my_training_records, my_training_reviews)

    my_input_records_file =\
        my_folder + 'yelp_training_set_review_' + dataset + 's_shuffled.json'
    my_input_reviews_file =\
        my_folder + 'reviews_' + dataset + '_shuffled.pkl'
    my_output_records_file =\
        my_folder + 'yelp_training_set_review_' + dataset +\
        's_shuffled_tagged.json'

    with open(my_input_reviews_file, 'rb') as read_file:
        my_input_reviews = pickle.load(read_file)

    my_input_records = ETLUtils.load_json_file(my_input_records_file)

    my_output_records =\
        classifier.label_json_reviews(my_input_records, my_input_reviews)

    ETLUtils.save_json_file(my_output_records_file, my_output_records)
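
The JSON helpers used in almost every example are not shown; a plausible sketch, assuming the files are in JSON-lines format (one object per line), as the Yelp dataset dumps are:

import json

def load_json_file(file_path):
    # Assumes one JSON object per line
    with open(file_path) as json_file:
        return [json.loads(line) for line in json_file]

def save_json_file(file_path, records):
    with open(file_path, 'w') as json_file:
        for record in records:
            json_file.write(json.dumps(record) + '\n')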
Code example #4
    def run(self, dataset, output_folder, train_records, test_records, train_reviews=None, test_reviews=None):

        contextual_train_set, contextual_test_set = self.full_cycle(
            train_records, test_records, train_reviews, test_reviews
        )

        print("Prepared data: %s" % time.strftime("%Y/%d/%m-%H:%M:%S"))

        # json_train_file = output_folder + 'yelp_' + dataset + '_context_shuffled_train5.json'
        csv_train_file = output_folder + "yelp_" + dataset + "_context_shuffled_train5.csv"
        # json_test_file = output_folder + 'yelp_' + dataset + '_context_shuffled_test5.json'
        csv_test_file = output_folder + "yelp_" + dataset + "_context_shuffled_test5.csv"

        # ETLUtils.save_json_file(json_train_file, contextual_train_set)
        ETLUtils.save_csv_file(csv_train_file, contextual_train_set, self.headers)

        # ETLUtils.save_json_file(json_test_file, contextual_test_set)
        ETLUtils.save_csv_file(csv_test_file, contextual_test_set, self.headers)

        print("Exported CSV and JSON files: %s" % time.strftime("%Y/%d/%m-%H:%M:%S"))

        csv_files = [csv_train_file, csv_test_file]

        num_cols = len(self.headers)
        context_cols = num_cols
        print("num_cols", num_cols)
        # print('context_cols', context_cols)

        libfm_converter.csv_to_libfm(
            csv_files, 0, [1, 2], range(3, context_cols), ",", has_header=True, suffix=".no_context.libfm"
        )
        libfm_converter.csv_to_libfm(csv_files, 0, [1, 2], [], ",", has_header=True, suffix=".context.libfm")

        print("Exported LibFM files: %s" % time.strftime("%Y/%d/%m-%H:%M:%S"))
Code example #5
 def export_records_to_predict(self, records_file):
     if self.records_to_predict is None:
         self.records_to_predict = self.get_records_to_predict()
     ETLUtils.save_json_file(records_file, self.records_to_predict)
     with open(records_file + '.pkl', 'wb') as write_file:
         pickle.dump(self.items_to_predict, write_file,
                     pickle.HIGHEST_PROTOCOL)
Code example #6
File: review_analysis.py, Project: antoine-tran/yelp
    def multiple_lineal_regression(file_path):
        records = ReviewETL.load_file(file_path)
        ratings = np.array([record['stars'] for record in records])
        ETLUtils.drop_fields(['stars'], records)
        data = np.array([record.values() for record in records])

        # Create linear regression object
        regr = linear_model.LinearRegression()

        # Train the model using the training sets
        regr.fit(data, ratings)

        model = linear_model.LinearRegression(fit_intercept=True)
        model.fit(data, ratings)
        p = np.array([model.predict(xi) for xi in data])
        e = p - ratings

        total_error = np.dot(e, e)
        rmse_train = np.sqrt(total_error / len(p))

        kf = KFold(len(data), n_folds=10)
        err = 0
        for train, test in kf:
            model.fit(data[train], ratings[train])
            p = np.array([model.predict(xi) for xi in data[test]])
            e = p - ratings[test]
            err += np.dot(e, e)

        rmse_10cv = np.sqrt(err / len(data))
        print('RMSE on training: {}'.format(rmse_train))
        print('RMSE on 10-fold CV: {}'.format(rmse_10cv))
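
Note that this snippet targets the pre-0.18 scikit-learn API (KFold(n, n_folds=10)), predicts one row at a time, and fits an unused regr model before the one it actually evaluates. With a current scikit-learn, the same 10-fold RMSE could be computed roughly as follows (a sketch, not the project's code):

import numpy as np
from sklearn import linear_model
from sklearn.model_selection import KFold

def rmse_10cv(data, ratings):
    model = linear_model.LinearRegression(fit_intercept=True)
    squared_error = 0.0
    for train, test in KFold(n_splits=10).split(data):
        model.fit(data[train], ratings[train])
        errors = model.predict(data[test]) - ratings[test]
        squared_error += np.dot(errors, errors)
    return np.sqrt(squared_error / len(data))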
Code example #7
File: top_n_runner.py, Project: bachlog/yelp
def main_evaluate():
    I = my_i

    records = ETLUtils.load_json_file(RECORDS_FILE)
    # print('num_records', len(records))

    test_file = RECORDS_FILE + '_test'
    test_records = ETLUtils.load_json_file(test_file)

    top_n_evaluator = TopNEvaluator(records, test_records, DATASET, 10, I)
    top_n_evaluator.find_important_records()
    # top_n_evaluator.initialize()

    # records_to_predict_file = DATASET_FOLDER + 'generated/records_to_predict_' + DATASET + '.json'
    top_n_evaluator.load_records_to_predict(RECORDS_TO_PREDICT_FILE)

    predictions_file = GENERATED_FOLDER + 'predictions_' + DATASET + '.txt'
    predictions = rmse_calculator.read_targets_from_txt(predictions_file)

    # print('total predictions', len(predictions))
    top_n_evaluator.evaluate(predictions)
    # print('precision', top_n_evaluator.precision)
    print('recall', top_n_evaluator.recall)

    return top_n_evaluator.recall
Code example #8
    def export_without_context(self):
        print('%s: exporting to CARSKit binary ratings format without context' %
              time.strftime("%Y/%m/%d-%H:%M:%S"))

        if os.path.exists(CSV_FILE):
            print('Binary ratings file already exists')
            copy_to_workspace(CSV_FILE)
            return

        new_records = []
        numpy.random.seed(0)

        for record in self.records:

            context_na_value = 1

            new_records.append({
                Constants.USER_ID_FIELD: record[Constants.USER_INTEGER_ID_FIELD],
                Constants.ITEM_ID_FIELD: record[Constants.ITEM_INTEGER_ID_FIELD],
                Constants.RATING_FIELD: record[Constants.RATING_FIELD],
                'context:na': context_na_value,
            })

        headers = [
            Constants.USER_ID_FIELD,
            Constants.ITEM_ID_FIELD,
            Constants.RATING_FIELD,
            'context:na'
        ]

        ETLUtils.save_csv_file(CSV_FILE, new_records, headers)
        copy_to_workspace(CSV_FILE)
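
save_csv_file receives the output path, the records and the column headers (and, elsewhere in these examples, an optional delimiter). A minimal sketch, assuming it wraps csv.DictWriter:

import csv

def save_csv_file(file_path, records, headers, delimiter=','):
    with open(file_path, 'w') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=headers,
                                delimiter=delimiter)
        writer.writeheader()
        writer.writerows(records)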
Code example #9
File: context_transformer.py, Project: swarnamd/yelp
    def export_records(self):
        print('%s: exporting transformed records' %
              time.strftime("%Y/%m/%d-%H:%M:%S"))

        records_to_export = []
        desired_fields = [
            Constants.USER_INTEGER_ID_FIELD,
            Constants.ITEM_INTEGER_ID_FIELD,
            Constants.RATING_FIELD,
            Constants.CONTEXT_FIELD,
        ]

        for record in self.records:
            new_record = {field: record[field] for field in desired_fields}
            records_to_export.append(new_record)

        file_name = Constants.generate_file_name(
            'recsys_formatted_context_records',
            'json',
            Constants.CACHE_FOLDER,
            None,
            None,
            True,
            True,
            uses_carskit=False,
            normalize_topics=True,
            format_context=True)
        ETLUtils.save_json_file(file_name, records_to_export)
Code example #10
 def export_records(self):
     print('%s: get_records_to_predict_topn records' % time.strftime("%Y/%m/%d-%H:%M:%S"))
     self.dictionary.save(Constants.DICTIONARY_FILE)
     ETLUtils.save_json_file(
         Constants.FULL_PROCESSED_RECORDS_FILE, self.records)
     self.drop_unnecessary_fields()
     ETLUtils.save_json_file(Constants.PROCESSED_RECORDS_FILE, self.records)
Code example #11
File: top_n_evaluator.py, Project: melqkiades/yelp
 def export_records_to_predict(self, records_file):
     if self.records_to_predict is None:
         self.records_to_predict = self.get_records_to_predict()
     ETLUtils.save_json_file(records_file, self.records_to_predict)
     with open(records_file + '.pkl', 'wb') as write_file:
         pickle.dump(
             self.items_to_predict, write_file, pickle.HIGHEST_PROTOCOL)
Code example #12
def create_topic_models():

    print(Constants._properties)
    print('%s: Start' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    records = ETLUtils.load_json_file(Constants.RECORDS_FILE)

    plant_seeds()
    num_cycles = Constants.NUM_CYCLES
    num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
    split = 1 - (1 / float(num_folds))

    for i in range(num_cycles):

        print('\n\nCycle: %d/%d' % ((i + 1), num_cycles))

        if Constants.SHUFFLE_DATA:
            random.shuffle(records)

        train_records_list = []

        for j in range(num_folds):

            cv_start = float(j) / num_folds

            train_records, test_records =\
                ETLUtils.split_train_test(records, split=split, start=cv_start)
            train_records_list.append(train_records)

        args = zip(train_records_list,
                   [i] * Constants.CROSS_VALIDATION_NUM_FOLDS,
                   range(Constants.CROSS_VALIDATION_NUM_FOLDS))

        parallel_context_top_n(args)
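
The fold arithmetic above (split = 1 - 1/num_folds, cv_start = j/num_folds) only makes sense if split_train_test carves out a contiguous test block of relative size 1 - split beginning at the relative position start. A sketch of that assumed behaviour (the real ETLUtils may differ in details such as rounding):

def split_train_test(records, split=0.8, start=0.0):
    # Use a contiguous block of (1 - split) of the records, beginning at
    # the relative position 'start', as the test set; the rest is training
    num_records = len(records)
    test_start = int(num_records * start)
    test_end = test_start + int(num_records * (1 - split))
    test = records[test_start:test_end]
    train = records[:test_start] + records[test_end:]
    return train, test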
Code example #13
File: business_etl.py, Project: antoine-tran/yelp
    def drop_unwanted_fields(dictionary_list):
        """
        Drops fields that are not useful for data analysis in the business
        data set

        :rtype : void
        :param dictionary_list: the list of dictionaries containing the data
        """
        unwanted_fields = [
            'attributes',
            'business_id',
            'categories',
            'city',
            'full_address',
            'latitude',
            'longitude',
            'hours',
            'name',
            'neighborhoods',
            'open',
            'review_count',
            'stars',
            'state',
            'type'
        ]

        ETLUtils.drop_fields(unwanted_fields, dictionary_list)
Code example #14
    def multiple_lineal_regression(file_path):
        records = ReviewETL.load_file(file_path)
        ratings = np.array([record['stars'] for record in records])
        ETLUtils.drop_fields(['stars'], records)
        data = np.array([record.values() for record in records])

        # Create linear regression object
        regr = linear_model.LinearRegression()

        # Train the model using the training sets
        regr.fit(data, ratings)

        model = linear_model.LinearRegression(fit_intercept=True)
        model.fit(data, ratings)
        p = np.array([model.predict(xi) for xi in data])
        e = p - ratings

        total_error = np.dot(e, e)
        rmse_train = np.sqrt(total_error / len(p))

        kf = KFold(len(data), n_folds=10)
        err = 0
        for train, test in kf:
            model.fit(data[train], ratings[train])
            p = np.array([model.predict(xi) for xi in data[test]])
            e = p - ratings[test]
            err += np.dot(e, e)

        rmse_10cv = np.sqrt(err / len(data))
        print('RMSE on training: {}'.format(rmse_train))
        print('RMSE on 10-fold CV: {}'.format(rmse_10cv))
Code example #15
def run_top_n_test(records_file,
                   recommenders,
                   binary_reviews_file,
                   reviews_type=None):

    records = load_records(records_file)
    # records = extractor.remove_users_with_low_reviews(records, 2)
    with open(binary_reviews_file, 'rb') as read_file:
        binary_reviews = pickle.load(read_file)

    if len(records) != len(binary_reviews):
        raise ValueError("The records and reviews should have the same length")

    num_folds = 5
    split = 0.986
    min_like_score = 5.0
    top_n = 10

    dataset_info_map = {}
    dataset_info_map['dataset'] = records_file.split('/')[-1]
    dataset_info_map['cache_reviews'] = binary_reviews_file.split('/')[-1]
    dataset_info_map['num_records'] = len(records)
    dataset_info_map['reviews_type'] = reviews_type
    dataset_info_map['cross_validation_folds'] = num_folds
    dataset_info_map['min_like_score'] = min_like_score
    dataset_info_map['top_n'] = top_n

    results_list = []
    results_log_list = []
    count = 0
    print('Total recommenders: %d' % (len(recommenders)))

    for recommender in recommenders:

        print('\n**************\nProgress: %d/%d\n**************' %
              (count, len(recommenders)))
        print(get_knn_recommender_info(recommender))

        results = precision_in_top_n.calculate_recall_in_top_n(
            records, recommender, top_n, num_folds, split, min_like_score,
            binary_reviews, reviews_type)

        results_list.append(results)

        remaining_time = results['Execution time'] * (len(recommenders) -
                                                      count)
        remaining_time /= 3600
        print('Estimated remaining time: %.2f hours' % remaining_time)
        count += 1

    for recommender, results in zip(recommenders, results_list):
        results_log_list.append(
            process_topn_results(recommender, results, dataset_info_map))

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    file_name = 'recommender-topn-results' + timestamp

    ETLUtils.save_csv_file(file_name + '.csv', results_log_list, TOPN_HEADERS,
                           '\t')
Code example #16
File: main.py, Project: swarnamd/yelp
def analyze_context_records():
    records = ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)
    records = ETLUtils.filter_records(records, 'context_type', ['context'])

    print('num records: %d' % len(records))

    for record in records:
        print(record[Constants.TEXT_FIELD])
Code example #17
def get_categories(file_path):
    records = ETLUtils.load_json_file(file_path)

    # Now we obtain the categories for all the businesses
    records = ETLUtils.add_transpose_list_column('categories', records)
    BusinessETL.drop_unwanted_fields(records)

    return records[0].keys()
Code example #18
def parallel_run_topn_test(
        records_file, recommenders, binary_reviews_file, reviews_type=None):

    records = context_recommender_tests.load_records(records_file)
    records = extractor.remove_users_with_low_reviews(records, 20)
    with open(binary_reviews_file, 'rb') as read_file:
        binary_reviews = pickle.load(read_file)

    if len(records) != len(binary_reviews):
        raise ValueError("The records and reviews should have the same length")

    num_folds = 5
    split = 0.986
    top_n = 10
    min_like_score = 5.0

    args = itertools.product(
        [records],
        recommenders,
        [top_n],
        [num_folds],
        [split],
        [min_like_score],
        [binary_reviews],
        [reviews_type]
    )

    print('Total recommenders: %d' % (len(recommenders)))

    pool = Pool()

    print('Total CPUs: %d' % pool._processes)

    results_list = pool.map(run_topn_test_wrapper, args)
    pool.close()
    pool.join()

    # After we have finished executing, we process the results
    dataset_info_map = {}
    dataset_info_map['dataset'] = records_file.split('/')[-1]
    dataset_info_map['cache_reviews'] = binary_reviews_file.split('/')[-1]
    dataset_info_map['num_records'] = len(records)
    dataset_info_map['reviews_type'] = reviews_type
    dataset_info_map['cross_validation_folds'] = num_folds
    dataset_info_map['min_like_score'] = min_like_score
    dataset_info_map['top_n'] = top_n

    results_log_list = []
    for recommender, results in zip(recommenders, results_list):
        results_log_list.append(context_recommender_tests.process_topn_results(
            recommender, results, dataset_info_map))

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    file_name = 'recommender-topn-results-parallel' + timestamp

    ETLUtils.save_csv_file(file_name + '.csv', results_log_list, TOPN_HEADERS, '\t')

    return results_list
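
run_topn_test_wrapper is not shown, but the tuples produced by itertools.product match, in order, the positional arguments that the sequential version passes to precision_in_top_n.calculate_recall_in_top_n, so it presumably just unpacks them (Pool.map only passes a single argument per call):

def run_topn_test_wrapper(args):
    # Unpack the argument tuple built with itertools.product above
    return precision_in_top_n.calculate_recall_in_top_n(*args)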
Code example #19
def main():
    topic_model_creator.plant_seeds()

    my_resamplers = [
        None,
        'random_over_sampler',
        'smote_regular',
        'smote_bl1',
        'smote_bl2',
        'smote_tomek',
        'smoteenn'
    ]

    my_classifiers = [
        DummyClassifier(strategy='most_frequent', random_state=0),
        DummyClassifier(strategy='stratified', random_state=0),
        DummyClassifier(strategy='uniform', random_state=0),
        DummyClassifier(strategy='constant', random_state=0, constant=True),
        LogisticRegression(C=100),
        SVC(C=1.0, kernel='rbf', probability=True),
        SVC(C=1.0, kernel='linear', probability=True),
        KNeighborsClassifier(n_neighbors=10),
        tree.DecisionTreeClassifier(),
        NuSVC(probability=True),
        RandomForestClassifier(n_estimators=100)
    ]

    document_levels = ['review', 'sentence', 1]

    num_cycles = len(my_resamplers) * len(my_classifiers) * len(document_levels)
    index = 1

    results_list = []

    for document_level in document_levels:

        Constants.DOCUMENT_LEVEL = document_level
        my_records = load_records()
        preprocess_records(my_records)
        x_matrix, y_vector = transform(my_records)

        count_specific_generic(my_records)

        for resampler, classifier in itertools.product(my_resamplers, my_classifiers):

            print('Cycle %d/%d' % (index, num_cycles))

            classification_results =\
                test_classifier(x_matrix, y_vector, resampler, classifier)
            results_list.append(classification_results)
            index += 1

    for results in results_list:
        print(results)

    csv_file = Constants.DATASET_FOLDER + Constants.ITEM_TYPE +\
               '_sentence_classifier_results.csv'
    ETLUtils.save_csv_file(csv_file, results_list, results_list[0].keys())
Code example #20
    def test_select_fields(self):

        select_fields = ['user_id', 'offering_id', 'overall_rating']
        result = ETLUtils.select_fields(select_fields, reviews_matrix_5)
        self.assertEqual(result, reviews_matrix_5_short)

        select_fields = ['user_id']
        result = ETLUtils.select_fields(select_fields, reviews_matrix_5_short)
        self.assertEqual(result, reviews_matrix_5_users)
Code example #21
File: test_etl_utils.py, Project: antoine-tran/yelp
    def test_select_fields(self):

        select_fields = ['user_id', 'offering_id', 'overall_rating']
        result = ETLUtils.select_fields(select_fields, reviews_matrix_5)
        self.assertEqual(result, reviews_matrix_5_short)

        select_fields = ['user_id']
        result = ETLUtils.select_fields(select_fields, reviews_matrix_5_short)
        self.assertEqual(result, reviews_matrix_5_users)
Code example #22
def parallel_run_topn_test(records_file,
                           recommenders,
                           binary_reviews_file,
                           reviews_type=None):

    records = context_recommender_tests.load_records(records_file)
    records = extractor.remove_users_with_low_reviews(records, 20)
    with open(binary_reviews_file, 'rb') as read_file:
        binary_reviews = pickle.load(read_file)

    if len(records) != len(binary_reviews):
        raise ValueError("The records and reviews should have the same length")

    num_folds = 5
    split = 0.986
    top_n = 10
    min_like_score = 5.0

    args = itertools.product([records], recommenders, [top_n], [num_folds],
                             [split], [min_like_score], [binary_reviews],
                             [reviews_type])

    print('Total recommenders: %d' % (len(recommenders)))

    pool = Pool()

    print('Total CPUs: %d' % pool._processes)

    results_list = pool.map(run_topn_test_wrapper, args)
    pool.close()
    pool.join()

    # After we have finished executing, we process the results
    dataset_info_map = {}
    dataset_info_map['dataset'] = records_file.split('/')[-1]
    dataset_info_map['cache_reviews'] = binary_reviews_file.split('/')[-1]
    dataset_info_map['num_records'] = len(records)
    dataset_info_map['reviews_type'] = reviews_type
    dataset_info_map['cross_validation_folds'] = num_folds
    dataset_info_map['min_like_score'] = min_like_score
    dataset_info_map['top_n'] = top_n

    results_log_list = []
    for recommender, results in zip(recommenders, results_list):
        results_log_list.append(
            context_recommender_tests.process_topn_results(
                recommender, results, dataset_info_map))

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    file_name = 'recommender-topn-results-parallel' + timestamp

    ETLUtils.save_csv_file(file_name + '.csv', results_log_list, TOPN_HEADERS,
                           '\t')

    return results_list
Code example #23
def main():
    topic_model_creator.plant_seeds()

    my_resamplers = [
        None, 'random_over_sampler', 'smote_regular', 'smote_bl1', 'smote_bl2',
        'smote_tomek', 'smoteenn'
    ]

    my_classifiers = [
        DummyClassifier(strategy='most_frequent', random_state=0),
        DummyClassifier(strategy='stratified', random_state=0),
        DummyClassifier(strategy='uniform', random_state=0),
        DummyClassifier(strategy='constant', random_state=0, constant=True),
        LogisticRegression(C=100),
        SVC(C=1.0, kernel='rbf', probability=True),
        SVC(C=1.0, kernel='linear', probability=True),
        KNeighborsClassifier(n_neighbors=10),
        tree.DecisionTreeClassifier(),
        NuSVC(probability=True),
        RandomForestClassifier(n_estimators=100)
    ]

    max_sentences_list = [None, 1]

    num_cycles = len(my_resamplers) * len(my_classifiers) * len(
        max_sentences_list)
    index = 1

    results_list = []

    for max_sentences in max_sentences_list:

        Constants.MAX_SENTENCES = max_sentences
        my_records = load_records()
        preprocess_records(my_records)
        x_matrix, y_vector = transform(my_records)

        count_specific_generic(my_records)

        for resampler, classifier in itertools.product(my_resamplers,
                                                       my_classifiers):

            print('Cycle %d/%d' % (index, num_cycles))

            classification_results =\
                test_classifier(x_matrix, y_vector, resampler, classifier)
            results_list.append(classification_results)
            index += 1

    for results in results_list:
        print(results)

    csv_file = Constants.DATASET_FOLDER + Constants.ITEM_TYPE +\
               '_sentence_classifier_results.csv'
    ETLUtils.save_csv_file(csv_file, results_list, results_list[0].keys())
Code example #24
    def drop_unnecessary_fields(self):
        print(
            '%s: drop unnecessary fields' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        unnecessary_fields = [
            Constants.TEXT_FIELD,
            Constants.POS_TAGS_FIELD,
            # Constants.BOW_FIELD
        ]

        ETLUtils.drop_fields(unnecessary_fields, self.records)
Code example #25
def run_top_n_test(
        records_file, recommenders, binary_reviews_file, reviews_type=None):

    records = load_records(records_file)
    # records = extractor.remove_users_with_low_reviews(records, 2)
    with open(binary_reviews_file, 'rb') as read_file:
        binary_reviews = pickle.load(read_file)

    if len(records) != len(binary_reviews):
        raise ValueError("The records and reviews should have the same length")

    num_folds = 5
    split = 0.986
    min_like_score = 5.0
    top_n = 10

    dataset_info_map = {}
    dataset_info_map['dataset'] = records_file.split('/')[-1]
    dataset_info_map['cache_reviews'] = binary_reviews_file.split('/')[-1]
    dataset_info_map['num_records'] = len(records)
    dataset_info_map['reviews_type'] = reviews_type
    dataset_info_map['cross_validation_folds'] = num_folds
    dataset_info_map['min_like_score'] = min_like_score
    dataset_info_map['top_n'] = top_n

    results_list = []
    results_log_list = []
    count = 0
    print('Total recommenders: %d' % (len(recommenders)))

    for recommender in recommenders:

        print('\n**************\nProgress: %d/%d\n**************' %
              (count, len(recommenders)))
        print(get_knn_recommender_info(recommender))

        results = precision_in_top_n.calculate_recall_in_top_n(
            records, recommender, top_n, num_folds, split, min_like_score,
            binary_reviews, reviews_type)

        results_list.append(results)

        remaining_time = results['Execution time'] * (len(recommenders) - count)
        remaining_time /= 3600
        print('Estimated remaining time: %.2f hours' % remaining_time)
        count += 1

    for recommender, results in zip(recommenders, results_list):
        results_log_list.append(process_topn_results(recommender, results, dataset_info_map))

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    file_name = 'recommender-topn-results' + timestamp

    ETLUtils.save_csv_file(file_name + '.csv', results_log_list, TOPN_HEADERS, '\t')
Code example #26
    def drop_unnecessary_fields(self):
        print('%s: drop unnecessary fields' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        unnecessary_fields = [
            Constants.TEXT_FIELD,
            Constants.POS_TAGS_FIELD,
            Constants.VOTES_FIELD,
            Constants.BOW_FIELD
        ]

        ETLUtils.drop_fields(unnecessary_fields, self.records)
Code example #27
File: basic_knn.py, Project: antoine-tran/yelp
def load_data(json_file):
    records = ETLUtils.load_json_file(json_file)
    fields = ['user_id', 'business_id', 'stars']
    records = ETLUtils.select_fields(fields, records)

    # We rename the 'stars' field to 'overall_rating' to take advantage of the
    # function extractor.get_user_average_overall_rating
    for record in records:
        record['overall_rating'] = record.pop('stars')
        record['offering_id'] = record.pop('business_id')

    return records
Code example #28
    def load(self):
        print('load: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        self.original_records =\
            ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
        # ETLUtils.drop_fields(['tagged_words'], self.original_records)
        print('num_records: %d' % len(self.original_records))

        if not os.path.exists(Constants.USER_ITEM_MAP_FILE):
            records = ETLUtils.load_json_file(Constants.RECORDS_FILE)
            user_item_map = create_user_item_map(records)
            with open(Constants.USER_ITEM_MAP_FILE, 'wb') as write_file:
                pickle.dump(user_item_map, write_file, pickle.HIGHEST_PROTOCOL)
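
create_user_item_map is not shown; presumably it maps each user ID to the items that user has rated, along these lines (the field names are assumed from the other examples):

def create_user_item_map(records):
    user_item_map = {}
    for record in records:
        user_id = record[Constants.USER_ID_FIELD]
        item_id = record[Constants.ITEM_ID_FIELD]
        user_item_map.setdefault(user_id, []).append(item_id)
    return user_item_map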
Code example #29
def load_data(json_file):
    records = ETLUtils.load_json_file(json_file)
    fields = ['user_id', 'business_id', 'stars', 'text', 'review_id']
    records = ETLUtils.select_fields(fields, records)

    # We rename the 'stars' field to 'overall_rating' to take advantage of the
    # function extractor.get_user_average_overall_rating
    for record in records:
        record['overall_rating'] = record.pop('stars')
        record['offering_id'] = record.pop('business_id')

    return records
Code example #30
    def remove_reviews_from_classifier_training_set(self):
        """
        Removes the records that are part of the training set of the reviews
        classifier
        """
        classifier_records = \
            ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)
        classifier_review_ids = \
            {record[Constants.REVIEW_ID_FIELD] for record in classifier_records}

        self.records = ETLUtils.filter_out_records(
            self.records, Constants.REVIEW_ID_FIELD, classifier_review_ids)
Code example #31
    def export_as_predefined_context(self):
        print('%s: exporting to CARSKit ratings binary format with context as '
              'predefined context' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        if os.path.exists(CSV_FILE):
            print('Binary ratings file already exists')
            copy_to_workspace(CSV_FILE)
            return

        new_records = []

        context_categories = utilities.context_words[Constants.ITEM_TYPE].keys()
        context_headers = [
            'context:%s' % category for category in context_categories]

        index = 0

        for record in self.records:

            new_record = {
                Constants.USER_ID_FIELD: record[Constants.USER_INTEGER_ID_FIELD],
                Constants.ITEM_ID_FIELD: record[Constants.ITEM_INTEGER_ID_FIELD],
                Constants.RATING_FIELD: record[Constants.RATING_FIELD],
            }

            review_categories = \
                find_predefined_context(record[Constants.BOW_FIELD])

            context_found = False
            for category in context_categories:
                category_key = 'context:' + category
                category_value = 0
                if category in review_categories:
                    category_value = 1
                    context_found = True
                new_record[category_key] = category_value

            context_na_value = 0 if context_found else 1
            new_record['context:na'] = context_na_value

            new_records.append(new_record)
            index += 1

        headers = [
            Constants.USER_ID_FIELD,
            Constants.ITEM_ID_FIELD,
            Constants.RATING_FIELD,
            'context:na'
        ]
        headers.extend(context_headers)
        ETLUtils.save_csv_file(CSV_FILE, new_records, headers)
        copy_to_workspace(CSV_FILE)
Code example #32
def run_rmse_test(records_file,
                  recommenders,
                  binary_reviews_file,
                  reviews_type=None):

    records = load_records(records_file)
    # records = extractor.remove_users_with_low_reviews(records, 2)
    with open(binary_reviews_file, 'rb') as read_file:
        binary_reviews = pickle.load(read_file)

    if len(records) != len(binary_reviews):
        raise ValueError("The records and reviews should have the same length")

    num_folds = 5

    dataset_info_map = {}
    dataset_info_map['dataset'] = records_file.split('/')[-1]
    dataset_info_map['cache_reviews'] = binary_reviews_file.split('/')[-1]
    dataset_info_map['num_records'] = len(records)
    dataset_info_map['reviews_type'] = reviews_type
    dataset_info_map['cross_validation_folds'] = num_folds

    results_list = []
    results_log_list = []
    count = 0
    print('Total recommenders: %d' % (len(recommenders)))

    for recommender in recommenders:

        print('\n**************\n%d/%d\n**************' %
              (count, len(recommenders)))
        results = recommender_evaluator.perform_cross_validation(
            records, recommender, num_folds, binary_reviews, reviews_type)

        results_list.append(results)

        remaining_time = results['Execution time'] * (len(recommenders) -
                                                      count)
        remaining_time /= 3600
        print('Estimated remaining time: %.2f hours' % remaining_time)
        count += 1

    for recommender, results in zip(recommenders, results_list):
        results_log_list.append(
            process_rmse_results(recommender, results, dataset_info_map))

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    file_name = 'recommender-rmse-results' + timestamp

    ETLUtils.save_csv_file(file_name + '.csv', results_log_list, RMSE_HEADERS,
                           '\t')
Code example #33
    def test_drop_fields(self):

        drop_fields = [
            'cleanliness_rating', 'location_rating', 'rooms_rating',
            'service_rating', 'value_rating'
        ]

        test_list = list(reviews_matrix_5)

        ETLUtils.drop_fields(drop_fields, test_list)
        self.assertEqual(reviews_matrix_5_short, test_list)

        test_list = list(reviews_matrix_5_short)
        self.assertEqual(reviews_matrix_5_short, test_list)
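
This test only works because drop_fields mutates the dictionaries in place; list(reviews_matrix_5) copies the list but still shares the underlying records. A sketch of the assumed implementation:

def drop_fields(field_list, records):
    # Remove the given keys from every dictionary, in place
    for record in records:
        for field in field_list:
            if field in record:
                del record[field]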
Code example #34
    def lemmatize_records(self):

        if os.path.exists(Constants.LEMMATIZED_RECORDS_FILE):
            print('Records were already lemmatized')
            self.records = \
                ETLUtils.load_json_file(Constants.LEMMATIZED_RECORDS_FILE)
            return

        if Constants.DOCUMENT_LEVEL == 'review':
            self.records = self.lemmatize_reviews(self.records)
        elif Constants.DOCUMENT_LEVEL == 'sentence' or\
                isinstance(Constants.DOCUMENT_LEVEL, (int, long)):
            self.records = self.lemmatize_sentences(self.records)

        ETLUtils.save_json_file(Constants.LEMMATIZED_RECORDS_FILE, self.records)
Code example #35
    def train_topic_model(self, cycle_index, fold_index):

        context_extractor = topic_model_creator.create_topic_model(
            self.train_records, cycle_index, fold_index)
        self.context_rich_topics = context_extractor.context_rich_topics

        topics_file_path = Constants.generate_file_name(
            'context_topics', 'json', Constants.CACHE_FOLDER, cycle_index,
            fold_index, True)
        ETLUtils.save_json_file(topics_file_path,
                                [dict(self.context_rich_topics)])
        print('Trained Context Extractor: %s' %
              time.strftime("%Y/%m/%d-%H:%M:%S"))

        return context_extractor
Code example #36
File: business_etl.py, Project: neostoic/yelp-1
    def drop_unwanted_fields(dictionary_list):
        """
        Drops fields that are not useful for data analysis in the business
        data set

        :rtype : void
        :param dictionary_list: the list of dictionaries containing the data
        """
        unwanted_fields = [
            'attributes', 'business_id', 'categories', 'city', 'full_address',
            'latitude', 'longitude', 'hours', 'name', 'neighborhoods', 'open',
            'review_count', 'stars', 'state', 'type'
        ]

        ETLUtils.drop_fields(unwanted_fields, dictionary_list)
Code example #37
    def export_as_top_word(self):
        print('%s: exporting to CARSKit ratings binary format with context as '
              'top words' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        if os.path.exists(CSV_FILE):
            print('Binary ratings file already exists')
            copy_to_workspace(CSV_FILE)
            return

        new_records = []
        topic_model_string = self.topic_extractor.print_topic_model()
        top_terms = [get_topic_terms(topic) for topic in topic_model_string]
        context_headers = ['context:%s' % term[0] for term in top_terms]

        for record in self.records:

            new_record = {
                Constants.USER_ID_FIELD: record[Constants.USER_INTEGER_ID_FIELD],
                Constants.ITEM_ID_FIELD: record[Constants.ITEM_INTEGER_ID_FIELD],
                Constants.RATING_FIELD: record[Constants.RATING_FIELD],
            }

            topics = record[self.topics_field]
            context_found = False

            for topic in topics:
                topic_index = topic[0]
                topic_weight = topic[1]

                context_key = context_headers[topic_index]
                context_value = 0
                if topic_weight > 0.0:
                    context_value = 1
                    # Flag that at least one context topic applies, so that
                    # 'context:na' is set to 0 for this record
                    context_found = True

                new_record[context_key] = context_value
            # print(new_record)
            context_na_value = 0 if context_found else 1
            new_record['context:na'] = context_na_value

            new_records.append(new_record)

        headers = [
            Constants.USER_ID_FIELD,
            Constants.ITEM_ID_FIELD,
            Constants.RATING_FIELD,
            'context:na'
        ]
        headers.extend(context_headers)
        ETLUtils.save_csv_file(CSV_FILE, new_records, headers)
        copy_to_workspace(CSV_FILE)
Code example #38
    def classify_reviews(self):
        print('%s: classify reviews' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        print(Constants.CLASSIFIED_RECORDS_FILE)
        training_records =\
            ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)

        # The document level can be either 'review', 'sentence' or an int
        document_level = Constants.DOCUMENT_LEVEL
        if document_level != 'review':

            if document_level == 'sentence':
                document_level = float("inf")

            training_records = [
                record for record in training_records
                if record['sentence_index'] < document_level
            ]
            for record in training_records:
                record['specific'] = \
                    'yes' if record['sentence_type'] == 'specific' else 'no'
            print('num training records', len(training_records))

        training_records = self.lemmatize_reviews(training_records)

        classifier = ReviewsClassifier(self.classifier, self.resampler)
        classifier.train(training_records)
        classifier.label_json_reviews(self.records)
Code example #39
File: precision_in_top_n.py, Project: neostoic/yelp-1
def calculate_top_n_precision(reviews, recommender, n, min_score, num_folds):

    start_time = time.time()
    split = 1 - (1 / float(num_folds))
    total_precision = 0.
    num_cycles = 0

    for i in xrange(0, num_folds):
        print('Fold', i)
        start = float(i) / num_folds
        train, test = ETLUtils.split_train_test(reviews,
                                                split=split,
                                                start=start)
        recommender.load(train)
        user_ids = recommender.user_ids

        for user_id in user_ids:
            precision = calculate_recommender_precision(
                test, user_id, recommender, n, min_score)

            if precision is not None:
                total_precision += precision
                num_cycles += 1

    final_precision = total_precision / num_cycles
    execution_time = time.time() - start_time

    print('Final Top N Precision: %f' % final_precision)
    print("--- %s seconds ---" % execution_time)

    result = {'Top N': final_precision, 'Execution time': execution_time}

    return result
Code example #40
def main():
    # my_file = '/Users/fpena/UCC/Thesis/datasets/context/classified_hotel_reviews.json'
    my_file = '/Users/fpena/UCC/Thesis/datasets/context/classified_restaurant_reviews.json'
    my_records = ETLUtils.load_json_file(my_file)
    # my_reviews = []
    # my_index = 0
    #
    # print("records:", len(my_records))
    #
    # for record in my_records:
    #     my_index += 1
    #     my_reviews.append(Review(record['text']))
    #     print('index', my_index)

    # binary_reviews_file = '/Users/fpena/UCC/Thesis/datasets/context/classified_hotel_reviews.pkl'
    binary_reviews_file = '/Users/fpena/UCC/Thesis/datasets/context/classified_restaurant_reviews.pkl'
    # with open(binary_reviews_file, 'wb') as write_file:
    #     pickle.dump(my_reviews, write_file, pickle.HIGHEST_PROTOCOL)

    with open(binary_reviews_file, 'rb') as read_file:
        my_reviews = pickle.load(read_file)

    cluster_labels = cluster_reviews(my_reviews)
    specific_records = split_list_by_labels(my_records, cluster_labels)[0]
    generic_records = split_list_by_labels(my_records, cluster_labels)[1]
Code example #41
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-c', '--cycle', metavar='int', type=int,
        nargs=1, help='The index of the running cycle')
    parser.add_argument(
        '-f', '--fold', metavar='int', type=int,
        nargs=1, help='The index of the cross validation fold')
    parser.add_argument(
        '-t', '--numtopics', metavar='int', type=int,
        nargs=1, help='The number of topics of the topic model')

    args = parser.parse_args()
    fold = args.fold[0] if args.fold is not None else None
    cycle = args.cycle[0] if args.cycle is not None else None
    num_topics = args.numtopics[0] if args.numtopics is not None else None

    if num_topics is not None:
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics})

    if fold is None and cycle is None:
        records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

        if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
            num_records = len(records)
            records = records[:num_records / 2]
        print('num_reviews', len(records))

        create_topic_model(records, None, None)
    else:
        create_single_topic_model(cycle, fold)
Code example #42
File: extractor.py, Project: antoine-tran/yelp
def initialize_cluster_users(reviews, significant_criteria_ranges=None):
    """
    Builds a dictionary containing all the users in the reviews. Each user
    contains information about their average overall rating, the list of
    reviews that user has made, and the cluster the user belongs to

    :param reviews: the list of reviews
    :param significant_criteria_ranges: the ranges used to determine which
    cluster each user belongs to
    :return: a dictionary with the initialized users, where the keys of the
    dictionary are the users' IDs
    """
    user_ids = get_groupby_list(reviews, 'user_id')
    user_dictionary = {}

    for user_id in user_ids:
        user = User(user_id)
        user_reviews = ETLUtils.filter_records(reviews, 'user_id', [user_id])
        user.average_overall_rating = get_user_average_overall_rating(
            user_reviews, user_id, apply_filter=False)
        user.criteria_weights = get_criteria_weights(
            user_reviews, user_id, apply_filter=False)
        _, user.cluster = get_significant_criteria(
            user.criteria_weights, significant_criteria_ranges)
        user.item_ratings = get_user_item_ratings(user_reviews, user_id)
        user.item_multi_ratings = get_user_item_multi_ratings(user_reviews, user_id)
        user_dictionary[user_id] = user

    # print('Total users: %i' % len(user_ids))

    return user_dictionary
Code example #43
    def calculate_sparsity(self):
        """
        Returns the percentage of missing ratings in the list of reviews of this
        ReviewsDatasetAnalyzer

        :return: the rate of missing ratings
        (i.e. number of missing ratings / (number of items * number of users))
        :raise ValueError: in case an empty list is given
        """
        if not self.reviews:
            raise ValueError("Can not determine the sparsity for an empty list")

        user_ids = extractor.get_groupby_list(self.reviews, "user_id")
        item_ids = extractor.get_groupby_list(self.reviews, "offering_id")

        non_missing_reviews = 0.0
        total_expected_reviews = len(user_ids) * len(item_ids)

        for user in user_ids:
            user_reviews = ETLUtils.filter_records(self.reviews, "user_id", [user])
            user_items = extractor.get_groupby_list(user_reviews, "offering_id")

            non_missing_reviews += len(set(item_ids).intersection(set(user_items)))

        return 1 - non_missing_reviews / total_expected_reviews
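
As a quick sanity check of the sparsity formula, consider a hypothetical toy input: 2 users and 3 distinct items give 6 possible ratings, of which 4 user-item pairs are covered, so the sparsity is 1 - 4/6:

reviews = [
    {'user_id': 'u1', 'offering_id': 'i1'},
    {'user_id': 'u1', 'offering_id': 'i2'},
    {'user_id': 'u2', 'offering_id': 'i2'},
    {'user_id': 'u2', 'offering_id': 'i3'},
]
# 2 users * 3 items = 6 possible ratings, 4 are present:
# sparsity = 1 - 4.0 / 6 = 0.333...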
Code example #44
File: extractor.py, Project: antoine-tran/yelp
def get_user_item_ratings(reviews, user_id, apply_filter=False):
    """
    Returns a dictionary that contains the items that the given user has rated,
    where the key of the dictionary is the ID of the item and the value is the
    rating that user_id has given to that item

    :param reviews: a list of reviews
    :param user_id: the ID of the user
    :param apply_filter: a boolean that indicates whether the reviews have to
    be filtered by user_id or not. In other words, this boolean indicates
    whether the list contains reviews from several users. If it does contain
    reviews from other users, those have to be removed
    :return: a dictionary with the items that the given user has rated
    """

    if apply_filter:
        user_reviews = ETLUtils.filter_records(reviews, 'user_id', [user_id])
    else:
        user_reviews = reviews

    if not user_reviews:
        return {}

    data_frame = DataFrame(user_reviews)
    column = 'offering_id'
    counts = data_frame.groupby(column).mean()

    items = counts.index.get_level_values(0).tolist()
    items_ratings = {}

    for item, mean in zip(items, counts['overall_rating']):
        items_ratings[item] = mean

    return items_ratings
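
A small usage example with hypothetical data, showing that repeated ratings for the same item are averaged by the groupby(...).mean() call (this relies on an older pandas, which silently drops the non-numeric columns):

reviews = [
    {'user_id': 'U1', 'offering_id': 'A', 'overall_rating': 4.0},
    {'user_id': 'U1', 'offering_id': 'A', 'overall_rating': 2.0},
    {'user_id': 'U1', 'offering_id': 'B', 'overall_rating': 5.0},
]
print(get_user_item_ratings(reviews, 'U1'))
# {'A': 3.0, 'B': 5.0}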
Code example #45
File: main.py, Project: swarnamd/yelp
def dataset_bucket_analysis_by_field(field):
    # Set the dataset
    hotel_dataset_properties = {Constants.BUSINESS_TYPE_FIELD: 'fourcity_hotel'}
    Constants.update_properties(hotel_dataset_properties)

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

    print('Loaded %d records' % len(records))

    user_frequency_map = {}

    for record in records:

        user_id = record[field]
        if user_id not in user_frequency_map:
            user_frequency_map[user_id] = 0
        user_frequency_map[user_id] += 1

    print('There is a total of %d %ss' % (len(user_frequency_map), field))
    sorted_x = sorted(user_frequency_map.items(), key=operator.itemgetter(1), reverse=True)
    print(sorted_x[0])
    print(sorted_x[1])
    print(sorted_x[2])
    # print(user_frequency_map)

    # Number of reviews per user
    rda = ReviewsDatasetAnalyzer(records)
    users_summary = rda.summarize_reviews_by_field(field)
    print('Average number of reviews per %s: %f' % (field,
          float(rda.num_reviews) / rda.num_users))
    users_summary.plot(kind='line', rot=0)

    pandas.set_option('display.max_rows', len(users_summary))
    print(users_summary)
    pandas.reset_option('display.max_rows')
Code example #46
    def classify_reviews(self):
        print('%s: classify reviews' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        dataset = Constants.ITEM_TYPE
        folder = Constants.DATASET_FOLDER
        file_name_suffix =\
            '' if Constants.MAX_SENTENCES is None else '_sentences'
        training_records_file = folder +\
            'classified_' + dataset + '_reviews' + file_name_suffix + '.json'
        training_records = ETLUtils.load_json_file(training_records_file)

        if Constants.MAX_SENTENCES is not None:
            training_records = [
                record for record in training_records
                if record['sentence_index'] < Constants.MAX_SENTENCES
            ]
            for record in training_records:
                record['specific'] = \
                    'yes' if record['sentence_type'] == 'specific' else 'no'
            print('num training records', len(training_records))

        self.lemmatize_reviews(training_records)

        classifier = ReviewsClassifier(self.classifier, self.resampler)
        classifier.train(training_records)
        classifier.label_json_reviews(self.records)
Code example #47
    def calculate_sparsity(self):
        """
        Returns the percentage of missing ratings in the list of reviews of this
        ReviewsDatasetAnalyzer

        :return: the rate of missing ratings
        (i.e. number of missing ratings / (number of items * number of users))
        :raise ValueError: in case an empty list is given
        """
        if not self.reviews:
            raise ValueError(
                'Can not determine the sparsity for an empty list')

        user_ids = extractor.get_groupby_list(self.reviews,
                                              Constants.USER_ID_FIELD)
        item_ids = extractor.get_groupby_list(self.reviews,
                                              Constants.ITEM_ID_FIELD)

        non_missing_reviews = 0.
        total_expected_reviews = len(user_ids) * len(item_ids)

        for user in user_ids:
            user_reviews = ETLUtils.filter_records(self.reviews,
                                                   Constants.USER_ID_FIELD,
                                                   [user])
            user_items = extractor.get_groupby_list(user_reviews,
                                                    Constants.ITEM_ID_FIELD)

            non_missing_reviews += len(
                set(item_ids).intersection(set(user_items)))

        return 1 - non_missing_reviews / total_expected_reviews
Code example #48
    def load(self):
        print('load: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        self.original_records = ETLUtils.load_json_file(Constants.RECORDS_FILE)
        with open(Constants.REVIEWS_FILE, 'rb') as read_file:
            self.original_reviews = pickle.load(read_file)
        print('num_records: %d' % len(self.original_records))

        for record, review in zip(self.original_records, self.original_reviews):
            review.id = record[Constants.REVIEW_ID_FIELD]
            review.rating = record[Constants.RATING_FIELD]

        if not os.path.exists(Constants.USER_ITEM_MAP_FILE):
            records = ETLUtils.load_json_file(Constants.RECORDS_FILE)
            user_item_map = create_user_item_map(records)
            with open(Constants.USER_ITEM_MAP_FILE, 'wb') as write_file:
                pickle.dump(user_item_map, write_file, pickle.HIGHEST_PROTOCOL)
Code example #49
def get_ml_100K_dataset():
    # records = ETLUtils.load_csv_file('/Users/fpena/tmp/bpmf/ml-1k.csv', '\t')
    records = ETLUtils.load_csv_file('/Users/fpena/tmp/bpmf/ml-100k.csv', '\t')
    # records = ETLUtils.load_csv_file('/Users/fpena/UCC/Thesis/datasets/uncompressed/ml-100k.csv', '\t')
    for record in records:
        record['overall_rating'] = float(record['overall_rating'])
    return records
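
load_csv_file is presumably a thin wrapper over csv.DictReader; a sketch under that assumption. Every value comes back as a string, which is why the example above has to cast overall_rating back to float:

import csv

def load_csv_file(file_path, delimiter=','):
    with open(file_path) as csv_file:
        return list(csv.DictReader(csv_file, delimiter=delimiter))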