Example #1
def main():
    # dataset = 'hotel'
    dataset = 'restaurant'
    my_folder = '/Users/fpena/UCC/Thesis/datasets/context/'
    my_training_records_file =\
        my_folder + 'classified_' + dataset + '_reviews.json'
    my_training_reviews_file =\
        my_folder + 'classified_' + dataset + '_reviews.pkl'
    my_training_records = ETLUtils.load_json_file(my_training_records_file)

    with open(my_training_reviews_file, 'rb') as read_file:
        my_training_reviews = pickle.load(read_file)

    classifier = ReviewsClassifier()
    classifier.train(my_training_records, my_training_reviews)

    my_input_records_file =\
        my_folder + 'yelp_training_set_review_' + dataset + 's_shuffled.json'
    my_input_reviews_file =\
        my_folder + 'reviews_' + dataset + '_shuffled.pkl'
    my_output_records_file =\
        my_folder + 'yelp_training_set_review_' + dataset +\
        's_shuffled_tagged.json'

    with open(my_input_reviews_file, 'rb') as read_file:
        my_input_reviews = pickle.load(read_file)

    my_input_records = ETLUtils.load_json_file(my_input_records_file)

    my_output_records =\
        classifier.label_json_reviews(my_input_records, my_input_reviews)

    ETLUtils.save_json_file(my_output_records_file, my_output_records)
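ETLUtils itself is not shown on this page. Judging by how load_json_file and save_json_file are used across the examples, they read and write one JSON record per line; a minimal sketch under that assumption (the names match the calls above, the bodies are guesses):

import json

def load_json_file(file_path):
    # Assumed behavior: one JSON object per line, returned as a list of dicts.
    with open(file_path) as json_file:
        return [json.loads(line) for line in json_file]

def save_json_file(file_path, records):
    # Assumed mirror of load_json_file.
    with open(file_path, 'w') as json_file:
        for record in records:
            json_file.write(json.dumps(record) + '\n')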
Example #2
def main_evaluate():
    I = my_i

    records = ETLUtils.load_json_file(RECORDS_FILE)
    # print('num_records', len(records))

    test_file = RECORDS_FILE + '_test'
    test_records = ETLUtils.load_json_file(test_file)

    top_n_evaluator = TopNEvaluator(records, test_records, DATASET, 10, I)
    top_n_evaluator.find_important_records()
    # top_n_evaluator.initialize()

    # records_to_predict_file = DATASET_FOLDER + 'generated/records_to_predict_' + DATASET + '.json'
    top_n_evaluator.load_records_to_predict(RECORDS_TO_PREDICT_FILE)

    predictions_file = GENERATED_FOLDER + 'predictions_' + DATASET + '.txt'
    predictions = rmse_calculator.read_targets_from_txt(predictions_file)

    # print('total predictions', len(predictions))
    top_n_evaluator.evaluate(predictions)
    # print('precision', top_n_evaluator.precision)
    print('recall', top_n_evaluator.recall)

    return top_n_evaluator.recall
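rmse_calculator.read_targets_from_txt is project code that is not shown here. Since the predictions file is plain text, it presumably parses one numeric prediction per line; a minimal sketch under that assumption:

def read_targets_from_txt(file_path):
    # Assumed behavior: one floating-point prediction per line.
    with open(file_path) as predictions_file:
        return [float(line.strip()) for line in predictions_file]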
Example #3
    def load(self):
        print('load: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        self.original_records =\
            ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
        # ETLUtils.drop_fields(['tagged_words'], self.original_records)
        print('num_records: %d' % len(self.original_records))

        if not os.path.exists(Constants.USER_ITEM_MAP_FILE):
            records = ETLUtils.load_json_file(Constants.RECORDS_FILE)
            user_item_map = create_user_item_map(records)
            with open(Constants.USER_ITEM_MAP_FILE, 'wb') as write_file:
                pickle.dump(user_item_map, write_file, pickle.HIGHEST_PROTOCOL)
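This load method shows the caching pattern used throughout these examples: derived data (here the user-item map) is computed once and pickled, and later runs simply unpickle it. The matching read side, as it appears in other examples on this page:

with open(Constants.USER_ITEM_MAP_FILE, 'rb') as read_file:
    user_item_map = pickle.load(read_file)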
Example #4
def dataset_bucket_analysis_by_field(field):
    # Set the dataset
    hotel_dataset_properties = {Constants.BUSINESS_TYPE_FIELD: 'fourcity_hotel'}
    Constants.update_properties(hotel_dataset_properties)

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

    print('Loaded %d records' % len(records))

    user_frequency_map = {}

    for record in records:

        user_id = record[field]
        if user_id not in user_frequency_map:
            user_frequency_map[user_id] = 0
        user_frequency_map[user_id] += 1

    print('There is a total of %d %ss' % (len(user_frequency_map), field))
    sorted_x = sorted(user_frequency_map.items(), key=operator.itemgetter(1), reverse=True)
    print(sorted_x[0])
    print(sorted_x[1])
    print(sorted_x[2])
    # print(user_frequency_map)

    # Number of reviews per user
    rda = ReviewsDatasetAnalyzer(records)
    users_summary = rda.summarize_reviews_by_field(field)
    print('Average number of reviews per %s: %f' % (field,
          float(rda.num_reviews) / rda.num_users))
    users_summary.plot(kind='line', rot=0)

    pandas.set_option('display.max_rows', len(users_summary))
    print(users_summary)
    pandas.reset_option('display.max_rows')
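The user_frequency_map loop above can also be written with collections.Counter from the standard library; a brief sketch of the equivalent counting step:

from collections import Counter

# Count how many records share each value of `field`, then take the three
# most frequent (same output as the loop plus the three prints above).
user_frequency_map = Counter(record[field] for record in records)
for user_id, frequency in user_frequency_map.most_common(3):
    print(user_id, frequency)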
Example #5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-c', '--cycle', metavar='int', type=int,
        nargs=1, help='The index of the running cycle')
    parser.add_argument(
        '-f', '--fold', metavar='int', type=int,
        nargs=1, help='The index of the cross validation fold')
    parser.add_argument(
        '-t', '--numtopics', metavar='int', type=int,
        nargs=1, help='The number of topics of the topic model')

    args = parser.parse_args()
    fold = args.fold[0] if args.fold is not None else None
    cycle = args.cycle[0] if args.cycle is not None else None
    num_topics = args.numtopics[0] if args.numtopics is not None else None

    if num_topics is not None:
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics})

    if fold is None and cycle is None:
        records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

        if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
            num_records = len(records)
            # // keeps the slice index an integer (true division would not)
            records = records[:num_records // 2]
        print('num_reviews', len(records))

        create_topic_model(records, None, None)
    else:
        create_single_topic_model(cycle, fold)
Example #6
    def load(self):
        print('load: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        self.original_records = ETLUtils.load_json_file(Constants.RECORDS_FILE)
        with open(Constants.REVIEWS_FILE, 'rb') as read_file:
            self.original_reviews = pickle.load(read_file)
        print('num_records: %d' % len(self.original_records))

        for record, review in zip(self.original_records, self.original_reviews):
            review.id = record[Constants.REVIEW_ID_FIELD]
            review.rating = record[Constants.RATING_FIELD]

        if not os.path.exists(Constants.USER_ITEM_MAP_FILE):
            records = ETLUtils.load_json_file(Constants.RECORDS_FILE)
            user_item_map = create_user_item_map(records)
            with open(Constants.USER_ITEM_MAP_FILE, 'wb') as write_file:
                pickle.dump(user_item_map, write_file, pickle.HIGHEST_PROTOCOL)
Example #7
    def classify_reviews(self):
        print('%s: classify reviews' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        dataset = Constants.ITEM_TYPE
        folder = Constants.DATASET_FOLDER
        file_name_suffix =\
            '' if Constants.MAX_SENTENCES is None else '_sentences'
        training_records_file = folder +\
            'classified_' + dataset + '_reviews' + file_name_suffix + '.json'
        training_records = ETLUtils.load_json_file(training_records_file)

        if Constants.MAX_SENTENCES is not None:
            training_records = [
                record for record in training_records
                if record['sentence_index'] < Constants.MAX_SENTENCES
            ]
            for record in training_records:
                record['specific'] = \
                    'yes' if record['sentence_type'] == 'specific' else 'no'
            print('num training records', len(training_records))

        self.lemmatize_reviews(training_records)

        classifier = ReviewsClassifier(self.classifier, self.resampler)
        classifier.train(training_records)
        classifier.label_json_reviews(self.records)
Example #8
def main():
    # my_file = '/Users/fpena/UCC/Thesis/datasets/context/classified_hotel_reviews.json'
    my_file = '/Users/fpena/UCC/Thesis/datasets/context/classified_restaurant_reviews.json'
    my_records = ETLUtils.load_json_file(my_file)
    # my_reviews = []
    # my_index = 0
    #
    # print("records:", len(my_records))
    #
    # for record in my_records:
    #     my_index += 1
    #     my_reviews.append(Review(record['text']))
    #     print('index', my_index)

    # binary_reviews_file = '/Users/fpena/UCC/Thesis/datasets/context/classified_hotel_reviews.pkl'
    binary_reviews_file = '/Users/fpena/UCC/Thesis/datasets/context/classified_restaurant_reviews.pkl'
    # with open(binary_reviews_file, 'wb') as write_file:
    #     pickle.dump(my_reviews, write_file, pickle.HIGHEST_PROTOCOL)

    with open(binary_reviews_file, 'rb') as read_file:
        my_reviews = pickle.load(read_file)

    cluster_labels = cluster_reviews(my_reviews)
    specific_records = split_list_by_labels(my_records, cluster_labels)[0]
    generic_records = split_list_by_labels(my_records, cluster_labels)[1]
Example #9
def update_labeled_reviews_records():

    reviews_label_map = compare_records()
    agreed_review_ids = set(reviews_label_map.keys())
    classifier_records = \
        ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)
    classifier_review_ids = \
        {record[Constants.REVIEW_ID_FIELD] for record in classifier_records}
    non_agreed_review_ids = classifier_review_ids.difference(agreed_review_ids)

    # for record in classifier_records:
        # print(record)

    print('number of records before: %d' % len(classifier_records))

    print(reviews_label_map)
    print(non_agreed_review_ids)
    review_type_map = {'s': 'yes', 'g': 'no'}

    # We remove from the classifier records the ones that don't have an
    # agreed-on label
    classifier_records = ETLUtils.filter_out_records(
        classifier_records, Constants.REVIEW_ID_FIELD, non_agreed_review_ids)

    # Finally, we update the labels
    for record in classifier_records:
        review_id = record[Constants.REVIEW_ID_FIELD]
        record[Constants.SPECIFIC] = review_type_map[reviews_label_map[review_id]]
        # print(record)

    print('number of records after: %d' % len(classifier_records))
Example #10
    def classify_reviews(self):
        print('%s: classify reviews' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        print(Constants.CLASSIFIED_RECORDS_FILE)
        training_records =\
            ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)

        # The document level can be 'review', 'sentence' or an int; anything
        # other than 'review' is handled here
        document_level = Constants.DOCUMENT_LEVEL
        if document_level != 'review':

            if document_level == 'sentence':
                document_level = float("inf")

            training_records = [
                record for record in training_records
                if record['sentence_index'] < document_level
            ]
            for record in training_records:
                record['specific'] = \
                    'yes' if record['sentence_type'] == 'specific' else 'no'
            print('num training records', len(training_records))

        training_records = self.lemmatize_reviews(training_records)

        classifier = ReviewsClassifier(self.classifier, self.resampler)
        classifier.train(training_records)
        classifier.label_json_reviews(self.records)
Example #11
def get_categories(file_path):
    records = ETLUtils.load_json_file(file_path)

    # Now we obtain the categories for all the businesses
    records = ETLUtils.add_transpose_list_column('categories', records)
    BusinessETL.drop_unwanted_fields(records)

    return records[0].keys()
Example #12
def analyze_context_records():
    records = ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)
    records = ETLUtils.filter_records(records, 'context_type', ['context'])

    print('num records: %d' % len(records))

    for record in records:
        print(record[Constants.TEXT_FIELD])
Example #13
    def get_business_ids(file_path, business_type=None):
        records = ETLUtils.load_json_file(file_path)

        if not business_type:
            return [record['business_id'] for record in records]

        return [record['business_id'] for record in records
                if business_type in record['categories']]
Example #14
def generate_report_yelp_phoenix():
    reviews_file = '/Users/fpena/UCC/Thesis/datasets/yelp_phoenix_academic_dataset/filtered_reviews.json'
    report_file = '/Users/fpena/UCC/Thesis/projects/yelp/notebooks/dataset_analysis_yelp_phoenix.ipynb'
    reviews = ETLUtils.load_json_file(reviews_file)
    load_reviews_code =\
        'file_path = \'' + reviews_file + '\'\n' +\
        'reviews = ETLUtils.load_json_file(file_path)\n'
    ReviewsDatasetAnalyzerReport.generate_report(reviews, 'Yelp Phoenix', report_file, load_reviews_code)
Example #15
def generate_report_ruihai():
    file_path = '/Users/fpena/UCC/Thesis/datasets/TripHotelReviewXml/cleaned_reviews.json'
    file_name = '/Users/fpena/UCC/Thesis/projects/yelp/notebooks/dataset_analysis_report_ruihai.ipynb'
    reviews = ETLUtils.load_json_file(file_path)
    load_reviews_code =\
        'file_path = \'/Users/fpena/UCC/Thesis/datasets/TripHotelReviewXml/cleaned_reviews.json\'\n' +\
        'reviews = ETLUtils.load_json_file(file_path)\n'
    ReviewsDatasetAnalyzerReport.generate_report(reviews, 'Ruihai TripAdvisor', file_name, load_reviews_code)
Example #16
def generate_report_fourcity_filtered():
    file_path = '/Users/fpena/tmp/filtered_reviews_multi_non_sparse_shuffled.json'
    file_name = '/Users/fpena/UCC/Thesis/projects/yelp/notebooks/dataset_analysis_report_fourcity.ipynb'
    reviews = ETLUtils.load_json_file(file_path)
    load_reviews_code =\
        'file_path = \'/Users/fpena/tmp/filtered_reviews_multi_non_sparse_shuffled.json\'\n' +\
        'reviews = ETLUtils.load_json_file(file_path)\n'
    ReviewsDatasetAnalyzerReport.generate_report(reviews, 'Fourcity TripAdvisor', file_name, load_reviews_code)
Example #17
def main():

    records = ETLUtils.load_json_file(
        Constants.RECSYS_CONTEXTUAL_PROCESSED_RECORDS_FILE)
    context_transformer = ContextTransformer(records)
    context_transformer.load_data()
    context_transformer.transform_records()
    context_transformer.export_records()
Example #18
    def load_data(self):
        """
        Loads the records and the topic model from files

        """
        self.records = ETLUtils.load_json_file(
            Constants.RECSYS_TOPICS_PROCESSED_RECORDS_FILE)
        self.topic_extractor = NmfTopicExtractor()
        self.topic_extractor.load_trained_data()
Example #19
def main():
    classifier_records = \
        ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)
    # count_specific_generic_ratio(classifier_records)

    # load_data(file_name)
    # compare_records()
    # update_labeled_reviews_records()
    # foo()
    cohens_kappa()
Example #20
def main():

    # records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
    records = ETLUtils.load_json_file(Constants.RECSYS_TOPICS_PROCESSED_RECORDS_FILE)

    print('num_reviews', len(records))
    # lda_context_utils.discover_topics(my_reviews, 150)
    context_extractor = ContextExtractor(records)
    context_extractor.separate_reviews()
    context_extractor.get_context_rich_topics()
Example #21
def load_data(json_file):
    records = ETLUtils.load_json_file(json_file)
    fields = ['user_id', 'business_id', 'stars']
    records = ETLUtils.select_fields(fields, records)

    # We rename the 'stars' field to 'overall_rating' to take advantage of the
    # function extractor.get_user_average_overall_rating
    for record in records:
        record['overall_rating'] = record.pop('stars')
        record['offering_id'] = record.pop('business_id')

    return records
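The pop-based renaming above mutates each record in place; with a hypothetical input record the transformation looks like this:

# Hypothetical record, for illustration only.
record = {'user_id': 'U1', 'business_id': 'B9', 'stars': 4}
record['overall_rating'] = record.pop('stars')
record['offering_id'] = record.pop('business_id')
# record is now {'user_id': 'U1', 'overall_rating': 4, 'offering_id': 'B9'}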
Example #22
def analyze_topics():

    start_time = time.time()

    utilities.plant_seeds()
    records = \
        ETLUtils.load_json_file(Constants.RECSYS_TOPICS_PROCESSED_RECORDS_FILE)
    print('num_reviews', len(records))
    num_topics = Constants.TOPIC_MODEL_NUM_TOPICS
    num_terms = Constants.TOPIC_MODEL_STABILITY_NUM_TERMS

    topic_model_string = None
    if Constants.TOPIC_MODEL_TYPE == 'ensemble':
        topic_model = NmfTopicExtractor()
        topic_model.load_trained_data()
        topic_model_string = topic_model.print_topic_model('max')
    elif Constants.TOPIC_MODEL_TYPE == 'lda':
        topic_model = topic_model_creator.load_topic_model(None, None)
        topic_model_string = [
            topic_model.print_topic(topic_id, num_terms)
            for topic_id in range(num_topics)
        ]
    else:
        # Guard against an unhandled type; otherwise topic_model_string stays
        # None and the indexing below fails with a less helpful error
        raise ValueError('Unrecognized topic modeling algorithm: \'%s\'' %
                         Constants.TOPIC_MODEL_TYPE)
    context_extractor = ContextExtractor(records)
    context_extractor.separate_reviews()
    context_extractor.get_context_rich_topics()

    topic_data = []

    for topic in range(num_topics):
        result = {}
        result['topic_id'] = topic
        result.update(split_topic(topic_model_string[topic]))
        result['ratio'] = context_extractor.topic_ratio_map[topic]
        result['weighted_frequency'] = \
            context_extractor.topic_weighted_frequency_map[topic]
        topic_data.append(result)

    data_frame = DataFrame.from_dict(topic_data)
    scores = {}
    scores['num_topics'] = Constants.TOPIC_MODEL_NUM_TOPICS
    probability_score = data_frame['probability_score'].mean()
    scores['probability_score'] = probability_score

    print('probability score: %f' % scores['probability_score'])

    end_time = time.time()
    cycle_time = end_time - start_time
    scores['cycle_time'] = cycle_time

    print("Cycle time = %f seconds" % cycle_time)

    return scores
Example #23
def main_export():
    I = my_i

    records = ETLUtils.load_json_file(RECORDS_FILE)
    print('num_records', len(records))

    test_records = ETLUtils.load_json_file(TEST_RECORDS_FILE)
    # test_reviews = review_metrics_extractor.build_reviews(test_records)
    # with open(TEST_REVIEWS_FILE, 'wb') as write_file:
    #     pickle.dump(test_reviews, write_file, pickle.HIGHEST_PROTOCOL)
    # with open(TEST_REVIEWS_FILE, 'rb') as read_file:
    #     test_reviews = pickle.load(read_file)
    # train_file = RECORDS_FILE + '_train'
    # train_records = ETLUtils.load_json_file(train_file)

    with open(USER_ITEM_MAP_FILE, 'rb') as read_file:
        user_item_map = pickle.load(read_file)

    top_n_evaluator = TopNEvaluator(records, test_records, DATASET, 10, I)
    top_n_evaluator.initialize(user_item_map)

    top_n_evaluator.export_records_to_predict(RECORDS_TO_PREDICT_FILE)
Example #24
def main_converter():

    csv_train_file = GENERATED_FOLDER + 'yelp_training_set_review_' + DATASET + 's_shuffled_train.csv'
    csv_test_file = GENERATED_FOLDER + 'records_to_predict_' + DATASET + '.csv'

    # ETLUtils.json_to_csv(TRAIN_RECORDS_FILE, csv_train_file, 'user_id', 'business_id', 'stars', False, True)
    # ETLUtils.json_to_csv(RECORDS_TO_PREDICT_FILE, csv_test_file, 'user_id', 'business_id', 'stars', False, True)

    headers = ['stars', 'user_id', 'business_id']
    train_records = ETLUtils.load_json_file(TRAIN_RECORDS_FILE)
    records_to_predict = ETLUtils.load_json_file(RECORDS_TO_PREDICT_FILE)
    train_records = ETLUtils.select_fields(headers, train_records)
    records_to_predict = ETLUtils.select_fields(headers, records_to_predict)

    ETLUtils.save_csv_file(csv_train_file, train_records, headers)
    ETLUtils.save_csv_file(csv_test_file, records_to_predict, headers)

    csv_files = [
        csv_train_file,
        csv_test_file
    ]

    csv_to_libfm(csv_files, 0, [1, 2], [], ',', has_header=True)
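csv_to_libfm is project code, but judging by its arguments (target column 0, one-hot columns [1, 2]) it emits the standard libFM sparse format, where each line is the target value followed by feature_index:value pairs. A hypothetical sketch of that encoding for a single record:

def record_to_libfm_line(record, user_index, item_index):
    # libFM line format: '<target> <feature_index>:<value> ...'.
    # user_index and item_index are positions in a global one-hot
    # vocabulary (hypothetical helper, not part of the project code).
    return '%s %d:1 %d:1' % (record['stars'], user_index, item_index)

print(record_to_libfm_line({'stars': 4}, 0, 1543))  # -> '4 0:1 1543:1'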
Example #25
def main():

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

    print('num_reviews', len(records))
    # lda_context_utils.discover_topics(my_reviews, 150)
    context_extractor = NmfContextExtractor(records)
    context_extractor.generate_review_bows()
    context_extractor.build_document_term_matrix()
    # context_extractor.build_topic_model()
    context_extractor.build_stable_topic_model()
    context_extractor.print_topic_model()
    context_extractor.update_reviews_with_topics()
    context_extractor.get_context_rich_topics()
Example #26
def plot_overall_rating():

    # reviews = extractor.pre_process_reviews()
    reviews = ETLUtils.load_json_file('/Users/fpena/tmp/filtered_reviews.json')
    data_frame = DataFrame(reviews)

    print(data_frame)

    DataPlotter.plot_data(data_frame, 'overall_rating', plot_type='bar', title='Overall Rating')
    # DataPlotter.plot_data(data_frame, 'cleanliness_rating', plot_type='bar', title='Cleanliness Rating')
    # DataPlotter.plot_data(data_frame, 'location_rating', plot_type='bar', title='Location Rating')
    # DataPlotter.plot_data(data_frame, 'rooms_rating', plot_type='bar', title='Rooms Rating')
    # DataPlotter.plot_data(data_frame, 'service_rating', plot_type='bar', title='Service Rating')
    # DataPlotter.plot_data(data_frame, 'value_rating', plot_type='bar', title='Value Rating')
    plt.show()
Example #27
    def full_cycle(self):
        Constants.print_properties()
        print('%s: full cycle' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        utilities.plant_seeds()

        if self.use_cache and \
                os.path.exists(Constants.PROCESSED_RECORDS_FILE):
            print('Records have already been processed')
            self.records = \
                ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
        else:
            self.preprocess()

        if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
            self.separate_recsys_topic_model_records()
Example #28
    def lemmatize_records(self):

        if os.path.exists(Constants.LEMMATIZED_RECORDS_FILE):
            print('Records were already lemmatized')
            self.records = \
                ETLUtils.load_json_file(Constants.LEMMATIZED_RECORDS_FILE)
            return

        if Constants.DOCUMENT_LEVEL == 'review':
            self.records = self.lemmatize_reviews(self.records)
        elif Constants.DOCUMENT_LEVEL == 'sentence' or\
                isinstance(Constants.DOCUMENT_LEVEL, (int, long)):
            self.records = self.lemmatize_sentences(self.records)

        ETLUtils.save_json_file(Constants.LEMMATIZED_RECORDS_FILE, self.records)
Example #29
    def load_context_reviews(self, cycle_index, fold_index):

        train_records_file_path = Constants.generate_file_name(
            'context_train_records', 'json', Constants.CACHE_FOLDER,
            cycle_index, fold_index, True)
        important_records_file_path = Constants.generate_file_name(
            'context_important_records', 'json', Constants.CACHE_FOLDER,
            cycle_index, fold_index, True)

        self.train_records = ETLUtils.load_json_file(train_records_file_path)
        self.important_records = \
            ETLUtils.load_json_file(important_records_file_path)
        self.load_cache_context_topics(cycle_index, fold_index)

        self.context_topics_map = {}
        for record in self.important_records:
            self.context_topics_map[record[Constants.REVIEW_ID_FIELD]] = \
                record[Constants.CONTEXT_TOPICS_FIELD]

        # self.train_records = self.filter_context_words(self.train_records)
        # self.print_context_topics(self.important_records)

        self.important_records = None
        gc.collect()
Example #30
def main():
    utilities.plant_seeds()

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        num_records = len(records)
        # // keeps the slice index an integer (true division would not)
        records = records[:num_records // 2]
    print('num_reviews', len(records))

    context_extractor = \
        topic_model_creator.create_topic_model(records, None, None)

    topic_latex_generator = TopicLatexGenerator(context_extractor)
    topic_latex_generator.generate_pdf()
Example #31
def export_records_to_text():
    print('%s: Exporting bag-of-words to text files' %
          time.strftime("%Y/%m/%d-%H:%M:%S"))

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
    print('Total records: %d' % len(records))

    folder = '/Users/fpena/tmp/topic-ensemble/data/' + Constants.ITEM_TYPE + '/'

    for record in records:
        file_name = folder + Constants.ITEM_TYPE + '_' + \
                    record[Constants.REVIEW_ID_FIELD] + '.txt'

        with codecs.open(file_name, "w", encoding="utf-8-sig") as text_file:
            text_file.write(" ".join(record[Constants.BOW_FIELD]))
Example #32
def analyze_results():
    json_file = Constants.generate_file_name(
        'carskit_results', 'json', OUTPUT_FOLDER, None, None, False)
    records = ETLUtils.load_json_file(json_file)

    data_frame = pandas.DataFrame(records)
    print(sorted(list(data_frame.columns.values)))
    cols = [
        'ck_rec10', 'ck_pre10', 'ck_algorithm', 'carskit_nominal_format',
        'topic_model_num_topics', 'topic_model_normalize']
    data_frame = data_frame[cols]
    data_frame = data_frame.sort_values(['ck_rec10'])
    print(data_frame)

    data_frame.to_csv('/Users/fpena/tmp/' + Constants.ITEM_TYPE + '_carskit.csv')
Example #33
def main2():

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)[:10]
    # for record in records:
    #     print(record)

    cols = [
        Constants.USER_ID_FIELD,
        Constants.ITEM_ID_FIELD,
        Constants.RATING_FIELD
    ]
    data_frame = pandas.DataFrame(records, columns=cols)
    # print(data_frame)
    # data_frame['a'] = data_frame[Constants.USER_ID_FIELD].astype('category')
    # data_frame['b'] = data_frame[Constants.ITEM_ID_FIELD].astype('category')
    data_frame[Constants.USER_ID_FIELD] = data_frame[Constants.USER_ID_FIELD].astype('category')
    data_frame[Constants.ITEM_ID_FIELD] = data_frame[Constants.ITEM_ID_FIELD].astype('category')
    # category_columns = data_frame.select_dtypes(['category']).columns
    # print(category_columns)
    # data_frame[category_columns] = \
    #     data_frame[category_columns].apply(lambda x: x.cat.codes)
    # print(data_frame)
    # print(data_frame['b'].cat.categories[0])
    print(data_frame[Constants.USER_ID_FIELD].cat.codes)
    print(data_frame[Constants.ITEM_ID_FIELD].cat.codes)

    plays = coo_matrix((data_frame[Constants.RATING_FIELD].astype(float),
                        (data_frame[Constants.USER_ID_FIELD].cat.codes,
                         data_frame[Constants.ITEM_ID_FIELD].cat.codes)))

    print(plays)
    # from sklearn.decomposition import NMF
    model = NMF(n_components=2, init='random', random_state=0)
    W = model.fit_transform(plays)
    H = model.components_
    nR = numpy.dot(W, H)
    # print(nR)
    # print(nR.shape)

    print('NMF reconstruction MSE: ' + str(
        mean_squared_error(nR, plays.toarray())))

    # get SVD components from train matrix. Choose k.
    u, s, vt = svds(plays, k=5)
    s_diag_matrix = numpy.diag(s)
    X_pred = numpy.dot(numpy.dot(u, s_diag_matrix), vt)
    # print(X_pred)
    print('SVD reconstruction MSE: ' + str(mean_squared_error(X_pred, plays.toarray())))
Example #34
    def create_category_sets(file_path):
        """
        Creates an array of arrays in which each sub-array contains the
        categories of each business in the Yelp Phoenix Business data set

        :rtype: numpy array matrix
        :param file_path: the path for the file that contains the businesses
        data
        :return: a numpy array of numpy arrays with the categories that each
        business has, for example [['Restaurant', 'Mexican', 'Bar'],
        ['Bar', 'Disco']]
        """
        records = ETLUtils.load_json_file(file_path)
        sets = numpy.array([set(record['categories']) for record in records])

        return sets
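A quick usage sketch that matches the docstring's example (the input records are hypothetical; only the 'categories' field matters here):

records = [
    {'categories': ['Restaurant', 'Mexican', 'Bar']},
    {'categories': ['Bar', 'Disco']},
]
sets = numpy.array([set(record['categories']) for record in records])
# sets[0] == {'Restaurant', 'Mexican', 'Bar'}; sets[1] == {'Bar', 'Disco'}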
Example #35
def analyze_results():
    json_file = Constants.generate_file_name('carskit_results', 'json',
                                             OUTPUT_FOLDER, None, None, False)
    records = ETLUtils.load_json_file(json_file)

    data_frame = pandas.DataFrame(records)
    print(sorted(list(data_frame.columns.values)))
    cols = [
        'ck_rec10', 'ck_pre10', 'ck_algorithm', 'carskit_nominal_format',
        'topic_model_num_topics', 'topic_model_normalize'
    ]
    data_frame = data_frame[cols]
    data_frame = data_frame.sort_values(['ck_rec10'])
    print(data_frame)

    data_frame.to_csv('/Users/fpena/tmp/' + Constants.ITEM_TYPE +
                      '_carskit.csv')
Example #36
def load_data(file_name):
    records = ETLUtils.load_json_file(file_name)
    data_frame = pandas.DataFrame.from_records(records)

    column = 'review_type'
    # column = 'specific'

    # print(data_frame.describe())
    # print(data_frame.head())
    # data_frame = data_frame['specific']
    # print(data_frame.groupby(column)[column].count())
    # reviews = list(data_frame['text'])
    values = list(data_frame[column])
    values = [value.encode('ascii', 'ignore') for value in values]
    # print(reviews)
    print(values)

    return records
Example #37
def see_topic_analysis_results():
    topic_analysis_file = Constants.DATASET_FOLDER + 'topic_model_analysis_' + \
                          Constants.ITEM_TYPE + '.json'

    results = ETLUtils.load_json_file(topic_analysis_file)

    index = 0
    for result in results:
        score_ratio = result['high_ratio_mean_score'] / result[
            'low_ratio_mean_score']
        count_ratio = result['weighted_high_ratio_count'] / result[
            'weighted_low_ratio_count']
        print(index, score_ratio, count_ratio,
              result['high_ratio_mean_score'],
              result['low_ratio_mean_score'],
              result['lda_epsilon'], result['topic_weighting_method'],
              result['num_context_topics'], result['lda_num_topics'])
        index += 1
Example #38
def plot_overall_rating():

    # reviews = extractor.pre_process_reviews()
    reviews = ETLUtils.load_json_file('/Users/fpena/tmp/filtered_reviews.json')
    data_frame = DataFrame(reviews)

    print(data_frame)

    DataPlotter.plot_data(data_frame,
                          'overall_rating',
                          plot_type='bar',
                          title='Overall Rating')
    # DataPlotter.plot_data(data_frame, 'cleanliness_rating', plot_type='bar', title='Cleanliness Rating')
    # DataPlotter.plot_data(data_frame, 'location_rating', plot_type='bar', title='Location Rating')
    # DataPlotter.plot_data(data_frame, 'rooms_rating', plot_type='bar', title='Rooms Rating')
    # DataPlotter.plot_data(data_frame, 'service_rating', plot_type='bar', title='Service Rating')
    # DataPlotter.plot_data(data_frame, 'value_rating', plot_type='bar', title='Value Rating')
    plt.show()
Example #39
    def load_cache_context_topics(self, cycle_index, fold_index):

        print('load cache context topics: %s' %
              time.strftime("%Y/%m/%d-%H:%M:%S"))

        topics_file_path = Constants.generate_file_name(
            'context_topics', 'json', Constants.CACHE_FOLDER, cycle_index,
            fold_index, True)

        self.context_rich_topics = sorted(
            ETLUtils.load_json_file(topics_file_path)[0].items(),
            key=operator.itemgetter(1),
            reverse=True)

        self.context_topics_map = {}
        for record in self.important_records:
            self.context_topics_map[record[Constants.REVIEW_ID_FIELD]] = \
                record[Constants.CONTEXT_TOPICS_FIELD]
Example #40
def topic_stability_main():

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

    # num_topic_list = range(2, 101)
    num_topic_list = [2, 5]
    results = {}
    for num_topics in num_topic_list:
        new_properties = {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics}
        Constants.update_properties(new_properties)
        results[num_topics] = calculate_topic_stability(records)

    print('Results:')
    for num_topics in num_topic_list:
        scores = results[num_topics]
        print('%d: %.4f [%.4f,%.4f]' %
              (num_topics, numpy.nanmean(scores), numpy.nanmin(scores),
               numpy.nanmax(scores)))
Example #41
def pre_process_reviews():
    """
    Returns a list of preprocessed reviews. The reviews have been filtered to
    keep only the relevant data, fields that are not useful have been dropped,
    and additional fields that are handy for calculations have been added.

    :return: a list of preprocessed reviews
    """
    reviews_file = '/Users/fpena/UCC/Thesis/datasets/yelp_phoenix_academic_dataset/yelp_academic_dataset_review.json'
    reviews = ETLUtils.load_json_file(reviews_file)

    select_fields = ['user_id', 'business_id', 'stars']
    reviews = ETLUtils.select_fields(select_fields, reviews)
    extract_fields(reviews)
    ETLUtils.drop_fields(['business_id', 'stars'], reviews)
    # reviews = load_json_file('/Users/fpena/tmp/filtered_reviews.json')
    reviews = clean_reviews(reviews)

    return reviews
Example #42
def main():
    # reviews = pre_process_reviews()
    # save_dictionary_list_to_file(reviews, '/Users/fpena/tmp/filtered_reviews.json')
    reviews = ETLUtils.load_json_file('/Users/fpena/tmp/filtered_reviews.json')
    data_frame = DataFrame(reviews)
    column = 'offering_id'
    groupby = data_frame.groupby(column)
    counts = groupby.mean()
    # print(counts)

    items = counts.index.get_level_values(0).tolist()

    for item, mean in zip(items, counts['overall_rating']):
        print(item, mean)

    # print(get_item_list(reviews, 2))
    # print(len(reviews))
    # initialize_users(reviews, 10)
    pass
Example #43
    def tag_reviews_language(self):

        print('%s: tag reviews language' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        if os.path.exists(Constants.LANGUAGE_RECORDS_FILE):
            print('Records have already been tagged with language field')
            self.records = \
                ETLUtils.load_json_file(Constants.LANGUAGE_RECORDS_FILE)
            return

        DetectorFactory.seed = 0

        for record in self.records:
            try:
                language = langdetect.detect(record[Constants.TEXT_FIELD])
            except LangDetectException:
                language = 'unknown'
            record[Constants.LANGUAGE_FIELD] = language

        ETLUtils.save_json_file(Constants.LANGUAGE_RECORDS_FILE, self.records)
Example #44
def create_single_topic_model(cycle_index, fold_index):

    print(Constants._properties)
    print('%s: Start' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

    plant_seeds()
    num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
    split = 1 - (1 / float(num_folds))

    for i in range(cycle_index + 1):

        if Constants.SHUFFLE_DATA:
            random.shuffle(records)

    cv_start = float(fold_index) / num_folds
    train_records, test_records = \
        ETLUtils.split_train_test(records, split=split, start=cv_start)
    create_topic_model(train_records, cycle_index, fold_index)
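ETLUtils.split_train_test is not shown on this page. From the way it is called (split = 1 - 1/num_folds as the train proportion and cv_start = fold_index/num_folds as the fold offset), it presumably carves the test fold out of the contiguous window that starts at `start`; a minimal sketch under that assumption:

def split_train_test(records, split, start):
    # Hypothetical reimplementation for illustration only: the test set is
    # the (1 - split) fraction of the data beginning at offset `start`, and
    # everything else is the train set.
    num_records = len(records)
    test_start = int(num_records * start)
    test_end = test_start + int(round(num_records * (1 - split)))
    test_records = records[test_start:test_end]
    train_records = records[:test_start] + records[test_end:]
    return train_records, test_records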
Example #45
def analyze_fourcity():
    records = ETLUtils.load_json_file(Constants.FULL_PROCESSED_RECORDS_FILE)
    # for record in records:
    #     print(record)

    cols = [
        Constants.USER_ID_FIELD,
        Constants.ITEM_ID_FIELD,
        Constants.RATING_FIELD
    ]
    data_frame = pandas.DataFrame(records, columns=cols)
    print(data_frame.describe())

    zero_records = 0
    for record in records:
        if record[Constants.RATING_FIELD] < 1.0:
            print(record)
            zero_records += 1

    for record in records:
        if record[Constants.RATING_FIELD] > 5.0:
            print(record)
            zero_records += 1
    print('zero records: %d' % zero_records)

    # Look for duplicates
    keys_set = set()
    num_duplicates = 0
    print('Looking for duplicates')
    records_map = {}
    for record in records:
        # if record[Constants.USER_ITEM_KEY_FIELD] in keys_set:
        record_key = record[Constants.USER_ITEM_KEY_FIELD]
        if record_key in records_map:
            print('old record', records_map[record_key][Constants.TEXT_FIELD])
            print('new record', record[Constants.TEXT_FIELD])
            num_duplicates += 1
        keys_set.add(record_key)
        records_map[record_key] = record

    print('duplicate records: %d' % num_duplicates)
Example #46
    def create_reviews(self):
        with self.doc.create(Section('Reviews')):
            with self.doc.create(Subsection('A subsection')):

                sg_map = {'yes': 'specific', 'no': 'generic'}

                review_index = 0
                # full_records = ETLUtils.load_json_file(
                #     Constants.FULL_PROCESSED_RECORDS_FILE)
                records_file = Constants.DATASET_FOLDER +\
                    'classified_' + Constants.ITEM_TYPE + '_reviews.json'
                full_records = ETLUtils.load_json_file(records_file)

                for record in full_records:
                    subsection_title = 'Review %d (%s)' % (
                        review_index + 1, sg_map[record['specific']])
                    with self.doc.create(Subsection(subsection_title)):
                        # for doc_part in self.build_text(
                        #         record[Constants.TEXT_FIELD]):
                        for doc_part in self.build_text_manual(record):
                            self.doc.append(doc_part)
                    review_index += 1
Example #47
def pre_process_reviews():
    """
    Returns a list of preprocessed reviews. The reviews have been filtered to
    keep only the relevant data, fields that are not useful have been dropped,
    and additional fields that are handy for calculations have been added.

    :return: a list of preprocessed reviews
    """
    data_folder = '/Users/fpena/UCC/Thesis/datasets/TripAdvisor/Four-City/'
    review_file_path = data_folder + 'review.txt'
    # review_file_path = data_folder + 'review-short.json'
    reviews = ETLUtils.load_json_file(review_file_path)

    select_fields = ['ratings', 'author', 'offering_id']
    reviews = ETLUtils.select_fields(select_fields, reviews)
    extract_fields(reviews)
    ETLUtils.drop_fields(['author', 'ratings'], reviews)
    # reviews = load_json_file('/Users/fpena/tmp/filtered_reviews.json')
    # reviews = preflib_extractor.load_csv_file('/Users/fpena/UCC/Thesis/datasets/TripAdvisor/PrefLib/trip/CD-00001-00000001-copy.dat')
    reviews = clean_reviews(reviews)

    return reviews
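extract_fields is also not shown here. Since 'author' and 'ratings' are dropped immediately after it runs, it presumably hoists the values the rest of the pipeline needs up to the top level; a hypothetical sketch (the nested field names are guesses based on the TripAdvisor record layout):

def extract_fields(reviews):
    # Assumed behavior, for illustration only: copy nested values to
    # top-level fields before the nested ones are dropped.
    for review in reviews:
        review['overall_rating'] = review['ratings']['overall']
        review['user_id'] = review['author']['id']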
Example #48
def build_manual_topic_model():

    new_classified_records_file = Constants.DATASET_FOLDER + 'classified_' + \
          Constants.ITEM_TYPE + '_reviews_first_sentences.json'
    records = ETLUtils.load_json_file(new_classified_records_file)
    # records = ETLUtils.filter_records(records, 'context_type', ['context'])
    # records = ETLUtils.filter_records(records, 'sentence_type', ['specific'])
    # records = ETLUtils.filter_records(records, 'sentence_index', [0])
    print('total records: %d' % len(records))

    # print(records[0])
    count = 0

    for i in range(len(records)):
        record = records[i]
        if record['sentence_index'] == 0.0:
            # if record['context_type'] == 'context' and record['context_summary'] != 'all_context':
            if record['sentence_type'] == 'specific':
                print('%d:\t%s' % (i+1, records[i]['text']))
                count += 1

    print('count: %d' % count)
Example #49
    def create_category_matrix(file_path):
        """
        Creates a matrix with all the categories for businesses that are
        contained in the Yelp Phoenix Business data set. Each column of the
        matrix represents a category, and each row a business. This is a binary
        matrix that contains a 1 at the position i,j if the business i contains
        the category j, and a 0 otherwise.

        :rtype: numpy array matrix
        :param file_path: the path for the file that contains the businesses
        data
        :return: a numpy array binary matrix
        """
        records = ETLUtils.load_json_file(file_path)

        # Now we obtain the categories for all the businesses
        records = ETLUtils.add_transpose_list_column('categories', records)
        BusinessETL.drop_unwanted_fields(records)
        matrix = numpy.array(
            [numpy.array(record.values()) for record in records])

        return matrix
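Following the docstring's example, two businesses with categories ['Restaurant', 'Mexican', 'Bar'] and ['Bar', 'Disco'] would yield a matrix like the one below (column order Restaurant, Mexican, Bar, Disco; the actual ordering depends on add_transpose_list_column):

[[1, 1, 1, 0],
 [0, 0, 1, 1]]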
Example #50
def load_records():
    """
    Loads the reviews that have been manually tagged at the sentence level;
    these are the reviews that we will use to train our classifier. Only the
    first sentence of each review will be used.
    """

    print('%s: load records' % time.strftime("%Y/%m/%d-%H:%M:%S"))
    # file_name = '/Users/fpena/UCC/Thesis/datasets/context/oldClassifiedFiles/classified_yelp_hotel_reviews.json'
    # file_name = '/Users/fpena/UCC/Thesis/datasets/context/oldClassifiedFiles/classified_yelp_restaurant_reviews.json'
    # records = ETLUtils.load_json_file(file_name)
    records = ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)

    # Take only the first sentence
    # document_level = 1
    if isinstance(Constants.DOCUMENT_LEVEL, (int, long)):
        records = [
            record for record in records
            if record['sentence_index'] < Constants.DOCUMENT_LEVEL
        ]

    return records
Example #51
def create_topic_model_with_context_records():

    processed_records_file = Constants.generate_file_name(
        'classified_processed_reviews', 'json', Constants.CACHE_FOLDER, None,
        None, False, True)
    records = ETLUtils.load_json_file(processed_records_file)
    print('records length: %d' % len(records))

    context_records = ETLUtils.filter_records(records, 'context_type', ['context'])
    print('context records length: %d' % len(context_records))
    context_specific_records = ETLUtils.filter_records(context_records, 'predicted_class', ['specific'])
    print('context specific records length: %d' % len(context_specific_records))

    for i in range(len(context_specific_records)):
        # print('%d:\t%s' % (i, context_records[i]['text']))
        print('%d:\t%s' % (i, context_specific_records[i]['bow']))

    for i in range(1, len(context_records)+1):

        Constants.update_properties({Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: i})
        context_extractor = \
            topic_model_creator.create_topic_model(records, None, None)

        topic_data = []

        for topic in range(Constants.TOPIC_MODEL_NUM_TOPICS):
            result = {}
            result['topic_id'] = topic
            result.update(split_topic(context_extractor.print_topic_model(
                num_terms=Constants.TOPIC_MODEL_STABILITY_NUM_TERMS)[topic]))
            result['ratio'] = context_extractor.topic_ratio_map[topic]
            result['weighted_frequency'] = \
                context_extractor.topic_weighted_frequency_map[topic]
            topic_data.append(result)

        file_name = Constants.generate_file_name(
            'manual_topic_model', 'xlsx', Constants.DATASET_FOLDER, None, None, True)
        generate_excel_file(topic_data, file_name)
Example #52
def create_single_topic_model(cycle_index, fold_index, check_exists=True):

    Constants.print_properties()
    print('%s: Start' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        msg = 'This function shouldn\'t be used when the ' \
              'separate_topic_model_recsys_reviews property is set to True'
        raise ValueError(msg)

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

    if Constants.CROSS_VALIDATION_STRATEGY == 'nested_test':
        pass
    elif Constants.CROSS_VALIDATION_STRATEGY == 'nested_validate':
        num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
        cycle = Constants.NESTED_CROSS_VALIDATION_CYCLE
        split = 1 - (1 / float(num_folds))
        cv_start = float(cycle) / num_folds
        print('cv_start', cv_start)
        records, _ = ETLUtils.split_train_test(records, split, cv_start)
    else:
        raise ValueError('Unknown cross-validation strategy')

    utilities.plant_seeds()
    num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
    split = 1 - (1/float(num_folds))

    for i in range(cycle_index+1):

        if Constants.SHUFFLE_DATA:
            random.shuffle(records)

    cv_start = float(fold_index) / num_folds
    train_records, test_records = \
        ETLUtils.split_train_test(records, split=split, start=cv_start)
    return create_topic_model(
        train_records, cycle_index, fold_index, check_exists)
Example #53
def load_records():
    """
    Loads the reviews that have been manually tagged at the sentence level;
    these are the reviews that we will use to train our classifier. Only the
    first sentence of each review will be used.
    """

    print('%s: load records' % time.strftime("%Y/%m/%d-%H:%M:%S"))
    dataset = Constants.ITEM_TYPE
    folder = Constants.DATASET_FOLDER
    records_file = folder + \
                   'classified_' + dataset + '_reviews_sentences.json'
    records = ETLUtils.load_json_file(records_file)

    # Take only the first sentence
    # max_sentences = 1
    if Constants.MAX_SENTENCES is not None:
        records = [
            record for record in records
            if record['sentence_index'] < Constants.MAX_SENTENCES
        ]

    return records
Example #54
def evaluate_topic_model(metric):
    print('%s: evaluating topic model' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    Constants.update_properties({
        Constants.NUMPY_RANDOM_SEED_FIELD:
        Constants.NUMPY_RANDOM_SEED + 10,
        Constants.RANDOM_SEED_FIELD:
        Constants.RANDOM_SEED + 10
    })
    utilities.plant_seeds()
    Constants.print_properties()

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        num_records = len(records)
        # // keeps the slice index an integer (true division would not)
        records = records[:num_records // 2]
    print('num_reviews', len(records))

    all_term_rankings = None
    topic_model_type = Constants.TOPIC_MODEL_TYPE
    if topic_model_type in ['lda', 'nmf']:
        all_term_rankings = create_all_term_rankings(records, metric)
    elif topic_model_type == 'ensemble':
        all_term_rankings = create_all_term_rankings_from_ensemble()
    else:
        raise ValueError('Unrecognized topic modeling algorithm: \'%s\'' %
                         topic_model_type)
    print('Total iterations: %d' % len(all_term_rankings))

    if metric == TERM_STABILITY_REFERENCE:
        return eval_term_stability_reference(all_term_rankings)
    if metric == TERM_STABILITY_PAIRWISE:
        return eval_term_stability_pairwise(all_term_rankings)
    elif metric == TERM_DIFFERENCE:
        return eval_term_difference(all_term_rankings)
    else:
        raise ValueError('Unknown evaluation metric: \'%s\'' % metric)
Example #55
    def find_reviews_topics(self, context_extractor, cycle_index, fold_index):
        print('find topics: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        train_records_file_path = Constants.generate_file_name(
            'context_train_records', 'json', Constants.CACHE_FOLDER,
            cycle_index, fold_index, Constants.USE_CONTEXT)

        if os.path.exists(train_records_file_path):
            self.train_records = \
                ETLUtils.load_json_file(train_records_file_path)
        else:
            context_extractor.find_contextual_topics(self.train_records)
            ETLUtils.save_json_file(train_records_file_path,
                                    self.train_records)
        context_extractor.find_contextual_topics(
            self.important_records, Constants.TEXT_SAMPLING_PROPORTION)

        self.context_topics_map = {}
        for record in self.important_records:
            self.context_topics_map[record[Constants.REVIEW_ID_FIELD]] = \
                record[Constants.CONTEXT_TOPICS_FIELD]

        self.important_records = None
        gc.collect()
Example #56
    def load_records_to_predict(self, records_file):
        self.records_to_predict = ETLUtils.load_json_file(records_file)
        with open(records_file + '.pkl', 'rb') as read_file:
            self.items_to_predict = pickle.load(read_file)
Example #57
def transform_manually_labeled_reviews():

    full_records = ETLUtils.load_json_file(Constants.DATASET_FOLDER + 'yelp_training_set_review_restaurants_shuffled_tagged.json')

    records = ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)
    print('total records: %d' % len(records))

    new_records = []

    for record in records:

        sentence_index = record['sentence_index']

        if sentence_index > 0:
            continue
        record['predicted_class'] = record['sentence_type']
        new_records.append(record)

    # count = 0
    # for new_record in new_records:
    #     internal_count = 0
    #     for full_record in full_records:
    #         if full_record['text'].startswith(new_record['text']):
    #             # print(full_record['text'])
    #             internal_count += 1
    #             count += 1
    #             print('internal count: %d\treview_id: %s' % (internal_count, full_record['review_id']))
    #
    #             if internal_count > 1:
    #                 print('internal count: %d\treview_id: %s' % (internal_count, new_record['text']))

    # print('count: %d' % count)

    index = 0

    for new_record in new_records:

        while True:

            full_record = full_records[index]

            if full_record['text'].startswith(new_record['text']):
                new_record[Constants.USER_ID_FIELD] = full_record['user_id']
                new_record[Constants.ITEM_ID_FIELD] = full_record['business_id']
                new_record[Constants.REVIEW_ID_FIELD] = full_record['review_id']
                new_record[Constants.RATING_FIELD] = full_record['stars']
                break
            index += 1
        index += 1

    print('index: %d' % index)

    for new_record in new_records:

        for full_record in full_records:
            if new_record['review_id'] == full_record['review_id']:
                print('%s ====' % new_record['text'])
                print(full_record['text'])
                print('******************\n******************\n******************\n******************')
                break

    # reviews_preprocessor = ReviewsPreprocessor()
    # new_records = reviews_preprocessor.lemmatize_sentences(new_records)
    # reviews_preprocessor.records = new_records
    # reviews_preprocessor.build_bag_of_words()
    # reviews_preprocessor.drop_unnecessary_fields()

    new_classified_records_file = Constants.DATASET_FOLDER + 'classified_' + \
        Constants.ITEM_TYPE + '_reviews_first_sentences.json'

    print(new_records[0])

    ETLUtils.save_json_file(new_classified_records_file, new_records)
Example #58
    @staticmethod
    def filter_reviews_by_business_slow(reviews, business_ids):
        # Opening reconstructed from the call below; the original snippet
        # started mid-method
        filtered_reviews = []
        for review in reviews:
            if review['business_id'] in business_ids:
                filtered_reviews.append(review)

        return filtered_reviews

    @staticmethod
    def sort_records(records, field, reverse=False):
        return sorted(records, key=itemgetter(field), reverse=reverse)



start = time.time()

review_etl = ReviewETL()
my_business_file = "/Users/fpena/tmp/yelp_training_set/yelp_training_set_business.json"
my_reviews_file = "/Users/fpena/tmp/yelp_training_set/yelp_training_set_review.json"
my_business_ids = BusinessETL.get_business_ids(my_business_file, 'Hotels')
my_reviews = ETLUtils.load_json_file(my_reviews_file)
# print(len(ReviewETL.filter_reviews_by_business(my_reviews, my_business_ids, 'text')))
my_hotel_reviews = ReviewETL.filter_reviews_by_business_slow(my_reviews, my_business_ids)
my_hotels_file = "/Users/fpena/tmp/yelp_training_set/yelp_training_set_review_hotels.json"
ETLUtils.save_json_file(my_hotels_file, my_hotel_reviews)
# my_sorted_reviews = ReviewETL.sort_records(my_reviews, 'business_id')
# print(len(my_sorted_reviews))


# main()
end = time.time()
total_time = end - start
print("Total time = %f seconds" % total_time)
Example #59
def main():
    item_type = 'hotel'
    # item_type = 'restaurant'
    my_folder = '/Users/fpena/UCC/Thesis/datasets/context/'
    my_file = my_folder + 'classified_' + item_type + '_reviews.json'
    binary_reviews_file = my_folder + 'classified_' + item_type + '_reviews.pkl'
    my_records = ETLUtils.load_json_file(my_file)

    with open(binary_reviews_file, 'rb') as read_file:
        my_reviews = pickle.load(read_file)

    num_features = 2

    my_metrics = numpy.zeros((len(my_reviews), num_features))
    for index in range(len(my_reviews)):
        my_metrics[index] =\
            review_metrics_extractor.get_review_metrics(my_reviews[index])

    review_metrics_extractor.normalize_matrix_by_columns(my_metrics)

    count_specific = 0
    count_generic = 0
    for record in my_records:

        if record['specific'] == 'yes':
            count_specific += 1

        if record['specific'] == 'no':
            count_generic += 1

    print('count_specific: %d' % count_specific)
    print('count_generic: %d' % count_generic)
    print('specific percentage: %f%%' %
          (100.0 * count_specific / len(my_records)))
    print('generic percentage: %f%%' %
          (100.0 * count_generic / len(my_records)))

    my_labels = numpy.array([record['specific'] == 'yes' for record in my_records])

    classifiers = [
        DummyClassifier(strategy='most_frequent', random_state=0),
        DummyClassifier(strategy='stratified', random_state=0),
        DummyClassifier(strategy='uniform', random_state=0),
        # DummyClassifier(strategy='constant', random_state=0, constant=True),
        LogisticRegression(C=100),
        SVC(C=1.0, kernel='rbf'),
        SVC(C=1.0, kernel='linear'),
        KNeighborsClassifier(n_neighbors=10),
        tree.DecisionTreeClassifier(),
        NuSVC(),
        LinearSVC()
    ]
    scores = [[] for _ in range(len(classifiers))]

    Xtrans = my_metrics

    # Pre-0.18 scikit-learn API; newer versions use KFold(n_splits=5).split(X)
    cv = KFold(n=len(my_metrics), n_folds=5)

    for i in range(len(classifiers)):
        for train, test in cv:
            x_train, y_train = Xtrans[train], my_labels[train]
            x_test, y_test = Xtrans[test], my_labels[test]

            clf = classifiers[i]
            clf.fit(x_train, y_train)
            scores[i].append(clf.score(x_test, y_test))

    for classifier, score in zip(classifiers, scores):
        print("%s: Mean(scores)=%.5f\tStddev(scores)=%.5f" %
              (type(classifier).__name__, numpy.mean(score), numpy.std(score)))

    plot(my_metrics, my_labels)