Code Example #1
File: reviews_classifier.py Project: bachlog/yelp
def main():
    # dataset = 'hotel'
    dataset = 'restaurant'
    my_folder = '/Users/fpena/UCC/Thesis/datasets/context/'
    my_training_records_file =\
        my_folder + 'classified_' + dataset + '_reviews.json'
    my_training_reviews_file =\
        my_folder + 'classified_' + dataset + '_reviews.pkl'
    my_training_records = ETLUtils.load_json_file(my_training_records_file)

    with open(my_training_reviews_file, 'rb') as read_file:
        my_training_reviews = pickle.load(read_file)

    classifier = ReviewsClassifier()
    classifier.train(my_training_records, my_training_reviews)

    my_input_records_file =\
        my_folder + 'yelp_training_set_review_' + dataset + 's_shuffled.json'
    my_input_reviews_file =\
        my_folder + 'reviews_' + dataset + '_shuffled.pkl'
    my_output_records_file =\
        my_folder + 'yelp_training_set_review_' + dataset +\
        's_shuffled_tagged.json'

    with open(my_input_reviews_file, 'rb') as read_file:
        my_input_reviews = pickle.load(read_file)

    my_input_records = ETLUtils.load_json_file(my_input_records_file)

    my_output_records =\
        classifier.label_json_reviews(my_input_records, my_input_reviews)

    ETLUtils.save_json_file(my_output_records_file, my_output_records)
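The examples on this page all revolve around ETLUtils.load_json_file and ETLUtils.save_json_file, whose implementations are not shown. As a minimal sketch, assuming the Yelp-style format of one JSON object per line, they could look like this:

import json

def load_json_file(file_path):
    # Assumed format: one JSON object per line, as in the Yelp dataset dumps.
    with open(file_path) as json_file:
        return [json.loads(line) for line in json_file]

def save_json_file(file_path, records):
    # Writes the records back in the same one-object-per-line format.
    with open(file_path, 'w') as json_file:
        for record in records:
            json_file.write(json.dumps(record) + '\n')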
Code Example #2
File: top_n_runner.py Project: bachlog/yelp
def main_evaluate():
    I = my_i

    records = ETLUtils.load_json_file(RECORDS_FILE)
    # print('num_records', len(records))

    test_file = RECORDS_FILE + '_test'
    test_records = ETLUtils.load_json_file(test_file)

    top_n_evaluator = TopNEvaluator(records, test_records, DATASET, 10, I)
    top_n_evaluator.find_important_records()
    # top_n_evaluator.initialize()

    # records_to_predict_file = DATASET_FOLDER + 'generated/records_to_predict_' + DATASET + '.json'
    top_n_evaluator.load_records_to_predict(RECORDS_TO_PREDICT_FILE)

    predictions_file = GENERATED_FOLDER + 'predictions_' + DATASET + '.txt'
    predictions = rmse_calculator.read_targets_from_txt(predictions_file)

    # print('total predictions', len(predictions))
    top_n_evaluator.evaluate(predictions)
    # print('precision', top_n_evaluator.precision)
    print('recall', top_n_evaluator.recall)

    return top_n_evaluator.recall
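rmse_calculator.read_targets_from_txt is also not shown. Since the predictions file appears to be the output of an external recommender (one numeric prediction per line; see the libFM conversion in Code Example #24), a plausible sketch is:

def read_targets_from_txt(file_path):
    # Assumed format: one floating-point prediction per line.
    with open(file_path) as predictions_file:
        return [float(line) for line in predictions_file if line.strip()]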
Code Example #3
    def load(self):
        print('load: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        self.original_records =\
            ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
        # ETLUtils.drop_fields(['tagged_words'], self.original_records)
        print('num_records: %d' % len(self.original_records))

        if not os.path.exists(Constants.USER_ITEM_MAP_FILE):
            records = ETLUtils.load_json_file(Constants.RECORDS_FILE)
            user_item_map = create_user_item_map(records)
            with open(Constants.USER_ITEM_MAP_FILE, 'wb') as write_file:
                pickle.dump(user_item_map, write_file, pickle.HIGHEST_PROTOCOL)
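create_user_item_map is not shown either. Given that the map is built once from the raw records and cached with pickle, a plausible sketch (the field names are assumptions) maps each user ID to the list of items that user has reviewed:

def create_user_item_map(records):
    # Assumed structure: user_id -> list of the items the user has reviewed.
    user_item_map = {}
    for record in records:
        user_item_map.setdefault(record['user_id'], []).append(
            record['business_id'])
    return user_item_map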
Code Example #4
File: main.py Project: melqkiades/yelp
def dataset_bucket_analysis_by_field(field):
    # Set the dataset
    hotel_dataset_properties = {Constants.BUSINESS_TYPE_FIELD: 'fourcity_hotel'}
    Constants.update_properties(hotel_dataset_properties)

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

    print('Loaded %d records' % len(records))

    user_frequency_map = {}

    for record in records:

        user_id = record[field]
        if user_id not in user_frequency_map:
            user_frequency_map[user_id] = 0
        user_frequency_map[user_id] += 1

    print('There is a total of %d %ss' % (len(user_frequency_map), field))
    sorted_x = sorted(user_frequency_map.items(), key=operator.itemgetter(1), reverse=True)
    print(sorted_x[0])
    print(sorted_x[1])
    print(sorted_x[2])
    # print(user_frequency_map)

    # Number of reviews per user
    rda = ReviewsDatasetAnalyzer(records)
    users_summary = rda.summarize_reviews_by_field(field)
    print('Average number of reviews per %s: %f' % (field,
          float(rda.num_reviews) / rda.num_users))
    users_summary.plot(kind='line', rot=0)

    pandas.set_option('display.max_rows', len(users_summary))
    print(users_summary)
    pandas.reset_option('display.max_rows')
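The frequency map above can be built more idiomatically with collections.Counter, which also provides the top-3 lookup directly; an equivalent sketch:

from collections import Counter

def count_field_values(records, field):
    # Counter replaces the manual "if key not in map" bookkeeping.
    frequency_map = Counter(record[field] for record in records)
    for value, count in frequency_map.most_common(3):
        print('%s: %d' % (value, count))
    return frequency_map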
Code Example #5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-c', '--cycle', metavar='int', type=int,
        nargs=1, help='The index of the running cycle')
    parser.add_argument(
        '-f', '--fold', metavar='int', type=int,
        nargs=1, help='The index of the cross validation fold')
    parser.add_argument(
        '-t', '--numtopics', metavar='int', type=int,
        nargs=1, help='The number of topics of the topic model')

    args = parser.parse_args()
    fold = args.fold[0] if args.fold is not None else None
    cycle = args.cycle[0] if args.cycle is not None else None
    num_topics = args.numtopics[0] if args.numtopics is not None else None

    if num_topics is not None:
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics})

    if fold is None and cycle is None:
        records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

        if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
            num_records = len(records)
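            # Python 2 integer division; Python 3 would need num_records // 2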
            records = records[:num_records / 2]
        print('num_reviews', len(records))

        create_topic_model(records, None, None)
    else:
        create_single_topic_model(cycle, fold)
Code Example #6
    def load(self):
        print('load: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        self.original_records = ETLUtils.load_json_file(Constants.RECORDS_FILE)
        with open(Constants.REVIEWS_FILE, 'rb') as read_file:
            self.original_reviews = pickle.load(read_file)
        print('num_records: %d' % len(self.original_records))

        for record, review in zip(self.original_records, self.original_reviews):
            review.id = record[Constants.REVIEW_ID_FIELD]
            review.rating = record[Constants.RATING_FIELD]

        if not os.path.exists(Constants.USER_ITEM_MAP_FILE):
            records = ETLUtils.load_json_file(Constants.RECORDS_FILE)
            user_item_map = create_user_item_map(records)
            with open(Constants.USER_ITEM_MAP_FILE, 'wb') as write_file:
                pickle.dump(user_item_map, write_file, pickle.HIGHEST_PROTOCOL)
Code Example #7
    def classify_reviews(self):
        print('%s: classify reviews' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        dataset = Constants.ITEM_TYPE
        folder = Constants.DATASET_FOLDER
        file_name_suffix =\
            '' if Constants.MAX_SENTENCES is None else '_sentences'
        training_records_file = folder +\
            'classified_' + dataset + '_reviews' + file_name_suffix + '.json'
        training_records = ETLUtils.load_json_file(training_records_file)

        if Constants.MAX_SENTENCES is not None:
            training_records = [
                record for record in training_records
                if record['sentence_index'] < Constants.MAX_SENTENCES
            ]
            for record in training_records:
                record['specific'] = \
                    'yes' if record['sentence_type'] == 'specific' else 'no'
            print('num training records', len(training_records))

        self.lemmatize_reviews(training_records)

        classifier = ReviewsClassifier(self.classifier, self.resampler)
        classifier.train(training_records)
        classifier.label_json_reviews(self.records)
Code Example #8
def main():
    # my_file = '/Users/fpena/UCC/Thesis/datasets/context/classified_hotel_reviews.json'
    my_file = '/Users/fpena/UCC/Thesis/datasets/context/classified_restaurant_reviews.json'
    my_records = ETLUtils.load_json_file(my_file)
    # my_reviews = []
    # my_index = 0
    #
    # print("records:", len(my_records))
    #
    # for record in my_records:
    #     my_index += 1
    #     my_reviews.append(Review(record['text']))
    #     print('index', my_index)

    # binary_reviews_file = '/Users/fpena/UCC/Thesis/datasets/context/classified_hotel_reviews.pkl'
    binary_reviews_file = '/Users/fpena/UCC/Thesis/datasets/context/classified_restaurant_reviews.pkl'
    # with open(binary_reviews_file, 'wb') as write_file:
    #     pickle.dump(my_reviews, write_file, pickle.HIGHEST_PROTOCOL)

    with open(binary_reviews_file, 'rb') as read_file:
        my_reviews = pickle.load(read_file)

    cluster_labels = cluster_reviews(my_reviews)
    specific_records = split_list_by_labels(my_records, cluster_labels)[0]
    generic_records = split_list_by_labels(my_records, cluster_labels)[1]
Code Example #9
def update_labeled_reviews_records():

    reviews_label_map = compare_records()
    agreed_review_ids = set(reviews_label_map.keys())
    classifier_records = \
        ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)
    classifier_review_ids = \
        {record[Constants.REVIEW_ID_FIELD] for record in classifier_records}
    non_agreed_review_ids = classifier_review_ids.difference(agreed_review_ids)

    # for record in classifier_records:
        # print(record)

    print('number of records before: %d' % len(classifier_records))

    print(reviews_label_map)
    print(non_agreed_review_ids)
    review_type_map = {'s': 'yes', 'g': 'no'}

    # Remove from the classifier records the ones that do not have an
    # agreed-upon label
    classifier_records = ETLUtils.filter_out_records(
        classifier_records, Constants.REVIEW_ID_FIELD, non_agreed_review_ids)

    # Finally, update the labels
    for record in classifier_records:
        review_id = record[Constants.REVIEW_ID_FIELD]
        record[Constants.SPECIFIC] = review_type_map[reviews_label_map[review_id]]
        # print(record)

    print('number of records after: %d' % len(classifier_records))
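ETLUtils.filter_out_records, used above, and ETLUtils.filter_records, used in later examples, are easy to infer from their call sites: they drop or keep the records whose field value falls within a given collection. A sketch under that assumption:

def filter_records(records, field, values):
    # Keep only the records whose field value is in the given collection.
    return [record for record in records if record[field] in values]

def filter_out_records(records, field, values):
    # Drop the records whose field value is in the given collection.
    return [record for record in records if record[field] not in values]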
Code Example #10
    def classify_reviews(self):
        print('%s: classify reviews' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        print(Constants.CLASSIFIED_RECORDS_FILE)
        training_records =\
            ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)

        # DOCUMENT_LEVEL can be 'review', 'sentence' or an int; anything
        # other than 'review' triggers the sentence-level filtering below
        document_level = Constants.DOCUMENT_LEVEL
        if document_level != 'review':

            if document_level == 'sentence':
                document_level = float("inf")

            training_records = [
                record for record in training_records
                if record['sentence_index'] < document_level
            ]
            for record in training_records:
                record['specific'] = \
                    'yes' if record['sentence_type'] == 'specific' else 'no'
            print('num training records', len(training_records))

        training_records = self.lemmatize_reviews(training_records)

        classifier = ReviewsClassifier(self.classifier, self.resampler)
        classifier.train(training_records)
        classifier.label_json_reviews(self.records)
Code Example #11
def get_categories(file_path):
    records = ETLUtils.load_json_file(file_path)

    # Now we obtain the categories for all the businesses
    records = ETLUtils.add_transpose_list_column('categories', records)
    BusinessETL.drop_unwanted_fields(records)

    return records[0].keys()
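ETLUtils.add_transpose_list_column is not shown; from its use here, it plausibly expands the list-valued 'categories' field into one indicator column per distinct category, so that records[0].keys() afterwards yields the category names. A sketch under that assumption:

def add_transpose_list_column(field, records):
    # Collect every distinct value, then add one 0/1 column per value.
    all_values = set()
    for record in records:
        all_values.update(record[field])
    for record in records:
        for value in all_values:
            record[value] = 1 if value in record[field] else 0
    return records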
Code Example #12
File: main.py Project: melqkiades/yelp
def analyze_context_records():
    records = ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)
    records = ETLUtils.filter_records(records, 'context_type', ['context'])

    print('num records: %d' % len(records))

    for record in records:
        print(record[Constants.TEXT_FIELD])
Code Example #13
File: business_etl.py Project: antoine-tran/yelp
    def get_business_ids(file_path, business_type=None):
        records = ETLUtils.load_json_file(file_path)

        if not business_type:
            return [record['business_id'] for record in records]

        return [record['business_id'] for record in records
                if business_type in record['categories']]
Code Example #14
def generate_report_yelp_phoenix():
    reviews_file = '/Users/fpena/UCC/Thesis/datasets/yelp_phoenix_academic_dataset/filtered_reviews.json'
    report_file = '/Users/fpena/UCC/Thesis/projects/yelp/notebooks/dataset_analysis_yelp_phoenix.ipynb'
    reviews = ETLUtils.load_json_file(reviews_file)
    load_reviews_code =\
        'file_path = \'' + reviews_file + '\'\n' +\
        'reviews = ETLUtils.load_json_file(file_path)\n'
    ReviewsDatasetAnalyzerReport.generate_report(reviews, 'Yelp Phoenix', report_file, load_reviews_code)
Code Example #15
def generate_report_ruihai():
    file_path = '/Users/fpena/UCC/Thesis/datasets/TripHotelReviewXml/cleaned_reviews.json'
    file_name = '/Users/fpena/UCC/Thesis/projects/yelp/notebooks/dataset_analysis_report_ruihai.ipynb'
    reviews = ETLUtils.load_json_file(file_path)
    load_reviews_code =\
        'file_path = \'/Users/fpena/UCC/Thesis/datasets/TripHotelReviewXml/cleaned_reviews.json\'\n' +\
        'reviews = ETLUtils.load_json_file(file_path)\n'
    ReviewsDatasetAnalyzerReport.generate_report(reviews, 'Ruihai TripAdvisor', file_name, load_reviews_code)
Code Example #16
def generate_report_fourcity_filtered():
    file_path = '/Users/fpena/tmp/filtered_reviews_multi_non_sparse_shuffled.json'
    file_name = '/Users/fpena/UCC/Thesis/projects/yelp/notebooks/dataset_analysis_report_fourcity.ipynb'
    reviews = ETLUtils.load_json_file(file_path)
    load_reviews_code =\
        'file_path = \'/Users/fpena/tmp/filtered_reviews_multi_non_sparse_shuffled.json\'\n' +\
        'reviews = ETLUtils.load_json_file(file_path)\n'
    ReviewsDatasetAnalyzerReport.generate_report(reviews, 'Fourcity TripAdvisor', file_name, load_reviews_code)
Code Example #17
def main():

    records = ETLUtils.load_json_file(
        Constants.RECSYS_CONTEXTUAL_PROCESSED_RECORDS_FILE)
    context_transformer = ContextTransformer(records)
    context_transformer.load_data()
    context_transformer.transform_records()
    context_transformer.export_records()
Code Example #18
File: carskit_exporter.py Project: melqkiades/yelp
    def load_data(self):
        """
        Loads the records and the topic model from files

        """
        self.records = ETLUtils.load_json_file(
            Constants.RECSYS_TOPICS_PROCESSED_RECORDS_FILE)
        self.topic_extractor = NmfTopicExtractor()
        self.topic_extractor.load_trained_data()
Code Example #19
def main():
    classifier_records = \
        ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)
    # count_specific_generic_ratio(classifier_records)

    # load_data(file_name)
    # compare_records()
    # update_labeled_reviews_records()
    # foo()
    cohens_kappa()
Code Example #20
File: context_extractor.py Project: melqkiades/yelp
def main():

    # records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
    records = ETLUtils.load_json_file(Constants.RECSYS_TOPICS_PROCESSED_RECORDS_FILE)

    print('num_reviews', len(records))
    # lda_context_utils.discover_topics(my_reviews, 150)
    context_extractor = ContextExtractor(records)
    context_extractor.separate_reviews()
    context_extractor.get_context_rich_topics()
Code Example #21
File: basic_knn.py Project: antoine-tran/yelp
def load_data(json_file):
    records = ETLUtils.load_json_file(json_file)
    fields = ['user_id', 'business_id', 'stars']
    records = ETLUtils.select_fields(fields, records)

    # We rename the 'stars' field to 'overall_rating' to take advantage of the
    # function extractor.get_user_average_overall_rating
    for record in records:
        record['overall_rating'] = record.pop('stars')
        record['offering_id'] = record.pop('business_id')

    return records
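ETLUtils.select_fields, which appears throughout these examples, is presumably a simple projection; a minimal sketch:

def select_fields(fields, records):
    # Keep only the requested fields of each record.
    return [{field: record[field] for field in fields} for record in records]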
Code Example #22
def analyze_topics():

    start_time = time.time()

    utilities.plant_seeds()
    records = \
        ETLUtils.load_json_file(Constants.RECSYS_TOPICS_PROCESSED_RECORDS_FILE)
    print('num_reviews', len(records))
    num_topics = Constants.TOPIC_MODEL_NUM_TOPICS
    num_terms = Constants.TOPIC_MODEL_STABILITY_NUM_TERMS

    topic_model_string = None
    if Constants.TOPIC_MODEL_TYPE == 'ensemble':
        topic_model = NmfTopicExtractor()
        topic_model.load_trained_data()
        topic_model_string = topic_model.print_topic_model('max')
    elif Constants.TOPIC_MODEL_TYPE == 'lda':
        topic_model = topic_model_creator.load_topic_model(None, None)
        topic_model_string = [
            topic_model.print_topic(topic_id, num_terms)
            for topic_id in range(num_topics)
        ]
    context_extractor = ContextExtractor(records)
    context_extractor.separate_reviews()
    context_extractor.get_context_rich_topics()

    topic_data = []

    for topic in range(num_topics):
        result = {}
        result['topic_id'] = topic
        result.update(split_topic(topic_model_string[topic]))
        result['ratio'] = context_extractor.topic_ratio_map[topic]
        result['weighted_frequency'] = \
            context_extractor.topic_weighted_frequency_map[topic]
        topic_data.append(result)

    data_frame = DataFrame.from_dict(topic_data)
    scores = {}
    scores['num_topics'] = Constants.TOPIC_MODEL_NUM_TOPICS
    probability_score = data_frame['probability_score'].mean()
    scores['probability_score'] = probability_score

    print('probability score: %f' % scores['probability_score'])

    end_time = time.time()
    cycle_time = end_time - start_time
    scores['cycle_time'] = cycle_time

    print("Cycle time = %f seconds" % cycle_time)

    return scores
Code Example #23
File: top_n_runner.py Project: bachlog/yelp
def main_export():
    I = my_i

    records = ETLUtils.load_json_file(RECORDS_FILE)
    print('num_records', len(records))

    test_records = ETLUtils.load_json_file(TEST_RECORDS_FILE)
    # test_reviews = review_metrics_extractor.build_reviews(test_records)
    # with open(TEST_REVIEWS_FILE, 'wb') as write_file:
    #     pickle.dump(test_reviews, write_file, pickle.HIGHEST_PROTOCOL)
    # with open(TEST_REVIEWS_FILE, 'rb') as read_file:
    #     test_reviews = pickle.load(read_file)
    # train_file = RECORDS_FILE + '_train'
    # train_records = ETLUtils.load_json_file(train_file)

    with open(USER_ITEM_MAP_FILE, 'rb') as read_file:
        user_item_map = pickle.load(read_file)

    top_n_evaluator = TopNEvaluator(records, test_records, DATASET, 10, I)
    top_n_evaluator.initialize(user_item_map)

    top_n_evaluator.export_records_to_predict(RECORDS_TO_PREDICT_FILE)
Code Example #24
File: top_n_runner.py Project: bachlog/yelp
def main_converter():

    csv_train_file = GENERATED_FOLDER + 'yelp_training_set_review_' + DATASET + 's_shuffled_train.csv'
    csv_test_file = GENERATED_FOLDER + 'records_to_predict_' + DATASET + '.csv'

    # ETLUtils.json_to_csv(TRAIN_RECORDS_FILE, csv_train_file, 'user_id', 'business_id', 'stars', False, True)
    # ETLUtils.json_to_csv(RECORDS_TO_PREDICT_FILE, csv_test_file, 'user_id', 'business_id', 'stars', False, True)

    headers = ['stars', 'user_id', 'business_id']
    train_records = ETLUtils.load_json_file(TRAIN_RECORDS_FILE)
    records_to_predict = ETLUtils.load_json_file(RECORDS_TO_PREDICT_FILE)
    train_records = ETLUtils.select_fields(headers, train_records)
    records_to_predict = ETLUtils.select_fields(headers, records_to_predict)

    ETLUtils.save_csv_file(csv_train_file, train_records, headers)
    ETLUtils.save_csv_file(csv_test_file, records_to_predict, headers)

    csv_files = [
        csv_train_file,
        csv_test_file
    ]

    csv_to_libfm(csv_files, 0, [1, 2], [], ',', has_header=True)
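csv_to_libfm is not shown, but the libFM input format itself is documented: each line is the target value followed by sparse feature_id:value pairs, with categorical columns one-hot encoded. A minimal sketch of such a conversion (the signature is simplified relative to the call above):

import csv

def csv_to_libfm_sketch(csv_file, target_index, categorical_indices,
                        output_file):
    # One-hot encode each categorical column value as "feature_id:1".
    feature_ids = {}
    with open(csv_file) as input_file, open(output_file, 'w') as libfm_file:
        reader = csv.reader(input_file)
        next(reader)  # skip the header row (has_header=True above)
        for row in reader:
            features = []
            for index in categorical_indices:
                key = (index, row[index])
                feature_id = feature_ids.setdefault(key, len(feature_ids))
                features.append('%d:1' % feature_id)
            libfm_file.write(
                row[target_index] + ' ' + ' '.join(features) + '\n')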
Code Example #25
def main():

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

    print('num_reviews', len(records))
    # lda_context_utils.discover_topics(my_reviews, 150)
    context_extractor = NmfContextExtractor(records)
    context_extractor.generate_review_bows()
    context_extractor.build_document_term_matrix()
    # context_extractor.build_topic_model()
    context_extractor.build_stable_topic_model()
    context_extractor.print_topic_model()
    context_extractor.update_reviews_with_topics()
    context_extractor.get_context_rich_topics()
Code Example #26
File: data_analyzer.py Project: antoine-tran/yelp
def plot_overall_rating():

    # reviews = extractor.pre_process_reviews()
    reviews = ETLUtils.load_json_file('/Users/fpena/tmp/filtered_reviews.json')
    data_frame = DataFrame(reviews)

    print(data_frame)

    DataPlotter.plot_data(data_frame, 'overall_rating', plot_type='bar', title='Overall Rating')
    # DataPlotter.plot_data(data_frame, 'cleanliness_rating', plot_type='bar', title='Cleanliness Rating')
    # DataPlotter.plot_data(data_frame, 'location_rating', plot_type='bar', title='Location Rating')
    # DataPlotter.plot_data(data_frame, 'rooms_rating', plot_type='bar', title='Rooms Rating')
    # DataPlotter.plot_data(data_frame, 'service_rating', plot_type='bar', title='Service Rating')
    # DataPlotter.plot_data(data_frame, 'value_rating', plot_type='bar', title='Value Rating')
    plt.show()
Code Example #27
    def full_cycle(self):
        Constants.print_properties()
        print('%s: full cycle' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        utilities.plant_seeds()

        if self.use_cache and \
                os.path.exists(Constants.PROCESSED_RECORDS_FILE):
            print('Records have already been processed')
            self.records = \
                ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
        else:
            self.preprocess()

        if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
            self.separate_recsys_topic_model_records()
Code Example #28
    def lemmatize_records(self):

        if os.path.exists(Constants.LEMMATIZED_RECORDS_FILE):
            print('Records were already lemmatized')
            self.records = \
                ETLUtils.load_json_file(Constants.LEMMATIZED_RECORDS_FILE)
            return

        if Constants.DOCUMENT_LEVEL == 'review':
            self.records = self.lemmatize_reviews(self.records)
        elif Constants.DOCUMENT_LEVEL == 'sentence' or\
                isinstance(Constants.DOCUMENT_LEVEL, (int, long)):
            self.records = self.lemmatize_sentences(self.records)

        ETLUtils.save_json_file(Constants.LEMMATIZED_RECORDS_FILE, self.records)
Code Example #29
    def load_context_reviews(self, cycle_index, fold_index):

        train_records_file_path = Constants.generate_file_name(
            'context_train_records', 'json', Constants.CACHE_FOLDER,
            cycle_index, fold_index, True)
        important_records_file_path = Constants.generate_file_name(
            'context_important_records', 'json', Constants.CACHE_FOLDER,
            cycle_index, fold_index, True)

        self.train_records = ETLUtils.load_json_file(train_records_file_path)
        self.important_records = \
            ETLUtils.load_json_file(important_records_file_path)
        self.load_cache_context_topics(cycle_index, fold_index)

        self.context_topics_map = {}
        for record in self.important_records:
            self.context_topics_map[record[Constants.REVIEW_ID_FIELD]] = \
                record[Constants.CONTEXT_TOPICS_FIELD]

        # self.train_records = self.filter_context_words(self.train_records)
        # self.print_context_topics(self.important_records)

        self.important_records = None
        gc.collect()
Code Example #30
def main():
    utilities.plant_seeds()

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        num_records = len(records)
        records = records[:num_records / 2]
    print('num_reviews', len(records))

    context_extractor = \
        topic_model_creator.create_topic_model(records, None, None)

    topic_latex_generator = TopicLatexGenerator(context_extractor)
    topic_latex_generator.generate_pdf()
Code Example #31
File: main.py Project: melqkiades/yelp
def export_records_to_text():
    print('%s: Exporting bag-of-words to text files' %
          time.strftime("%Y/%m/%d-%H:%M:%S"))

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
    print('Total records: %d' % len(records))

    folder = '/Users/fpena/tmp/topic-ensemble/data/' + Constants.ITEM_TYPE + '/'

    for record in records:
        file_name = folder + Constants.ITEM_TYPE + '_' + \
                    record[Constants.REVIEW_ID_FIELD] + '.txt'

        with codecs.open(file_name, "w", encoding="utf-8-sig") as text_file:
            text_file.write(" ".join(record[Constants.BOW_FIELD]))
Code Example #32
File: carskit_caller.py Project: melqkiades/yelp
def analyze_results():
    json_file = Constants.generate_file_name(
        'carskit_results', 'json', OUTPUT_FOLDER, None, None, False)
    records = ETLUtils.load_json_file(json_file)

    data_frame = pandas.DataFrame(records)
    print(sorted(list(data_frame.columns.values)))
    cols = [
        'ck_rec10', 'ck_pre10', 'ck_algorithm', 'carskit_nominal_format',
        'topic_model_num_topics', 'topic_model_normalize']
    data_frame = data_frame[cols]
    data_frame = data_frame.sort_values(['ck_rec10'])
    print(data_frame)

    data_frame.to_csv('/Users/fpena/tmp/' + Constants.ITEM_TYPE + '_carskit.csv')
Code Example #33
File: main.py Project: swarnamd/yelp
def main2():

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)[:10]
    # for record in records:
    #     print(record)

    cols = [
        Constants.USER_ID_FIELD,
        Constants.ITEM_ID_FIELD,
        Constants.RATING_FIELD
    ]
    data_frame = pandas.DataFrame(records, columns=cols)
    # print(data_frame)
    # data_frame['a'] = data_frame[Constants.USER_ID_FIELD].astype('category')
    # data_frame['b'] = data_frame[Constants.ITEM_ID_FIELD].astype('category')
    data_frame[Constants.USER_ID_FIELD] = data_frame[Constants.USER_ID_FIELD].astype('category')
    data_frame[Constants.ITEM_ID_FIELD] = data_frame[Constants.ITEM_ID_FIELD].astype('category')
    # category_columns = data_frame.select_dtypes(['category']).columns
    # print(category_columns)
    # data_frame[category_columns] = \
    #     data_frame[category_columns].apply(lambda x: x.cat.codes)
    # print(data_frame)
    # print(data_frame['b'].cat.categories[0])
    print(data_frame[Constants.USER_ID_FIELD].cat.codes)
    print(data_frame[Constants.ITEM_ID_FIELD].cat.codes)

    plays = coo_matrix((data_frame[Constants.RATING_FIELD].astype(float),
                        (data_frame[Constants.USER_ID_FIELD].cat.codes,
                         data_frame[Constants.ITEM_ID_FIELD].cat.codes)))

    print(plays)
    # from sklearn.decomposition import NMF
    model = NMF(n_components=2, init='random', random_state=0)
    W = model.fit_transform(plays)
    H = model.components_
    nR = numpy.dot(W, H)
    # print(nR)
    # print(nR.shape)

    print('NMF-based CF MSE: ' + str(
        mean_squared_error(nR, plays.toarray())))

    # get SVD components from train matrix. Choose k.
    u, s, vt = svds(plays, k=5)
    s_diag_matrix = numpy.diag(s)
    X_pred = numpy.dot(numpy.dot(u, s_diag_matrix), vt)
    # print(X_pred)
    print('SVD-based CF MSE: ' + str(mean_squared_error(X_pred, plays.toarray())))
Code Example #34
File: business_etl.py Project: neostoic/yelp-1
    def create_category_sets(file_path):
        """
        Creates a numpy array in which each element is the set of categories
        of a business in the Yelp Phoenix Business data set

        :rtype : numpy array
        :param file_path: the path for the file that contains the businesses
            data
        :return: a numpy array of sets with the categories that each
            business has, for example [{'Restaurant', 'Mexican', 'Bar'},
            {'Bar', 'Disco'}]
        """
        records = ETLUtils.load_json_file(file_path)
        sets = numpy.array([set(record['categories']) for record in records])

        return sets
Code Example #35
File: carskit_caller.py Project: swarnamd/yelp
def analyze_results():
    json_file = Constants.generate_file_name('carskit_results', 'json',
                                             OUTPUT_FOLDER, None, None, False)
    records = ETLUtils.load_json_file(json_file)

    data_frame = pandas.DataFrame(records)
    print(sorted(list(data_frame.columns.values)))
    cols = [
        'ck_rec10', 'ck_pre10', 'ck_algorithm', 'carskit_nominal_format',
        'topic_model_num_topics', 'topic_model_normalize'
    ]
    data_frame = data_frame[cols]
    data_frame = data_frame.sort_values(['ck_rec10'])
    print(data_frame)

    data_frame.to_csv('/Users/fpena/tmp/' + Constants.ITEM_TYPE +
                      '_carskit.csv')
Code Example #36
def load_data(file_name):
    records = ETLUtils.load_json_file(file_name)
    data_frame = pandas.DataFrame.from_records(records)

    column = 'review_type'
    # column = 'specific'

    # print(data_frame.describe())
    # print(data_frame.head())
    # data_frame = data_frame['specific']
    # print(data_frame.groupby(column)[column].count())
    # reviews = list(data_frame['text'])
    values = list(data_frame[column])
    values = [value.encode('ascii', 'ignore') for value in values]
    # print(reviews)
    print(values)

    return records
Code Example #37
def see_topic_analysis_results():
    topic_analysis_file = Constants.DATASET_FOLDER + 'topic_model_analysis_' + \
                          Constants.ITEM_TYPE + '.json'

    results = ETLUtils.load_json_file(topic_analysis_file)

    index = 0
    for result in results:
        score_ratio = result['high_ratio_mean_score'] / result[
            'low_ratio_mean_score']
        count_ratio = result['weighted_high_ratio_count'] / result[
            'weighted_low_ratio_count']
        print(index, score_ratio, count_ratio,
              result['high_ratio_mean_score'],
              result['low_ratio_mean_score'],
              result['lda_epsilon'], result['topic_weighting_method'],
              result['num_context_topics'], result['lda_num_topics'])
        index += 1
Code Example #38
def plot_overall_rating():

    # reviews = extractor.pre_process_reviews()
    reviews = ETLUtils.load_json_file('/Users/fpena/tmp/filtered_reviews.json')
    data_frame = DataFrame(reviews)

    print(data_frame)

    DataPlotter.plot_data(data_frame,
                          'overall_rating',
                          plot_type='bar',
                          title='Overall Rating')
    # DataPlotter.plot_data(data_frame, 'cleanliness_rating', plot_type='bar', title='Cleanliness Rating')
    # DataPlotter.plot_data(data_frame, 'location_rating', plot_type='bar', title='Location Rating')
    # DataPlotter.plot_data(data_frame, 'rooms_rating', plot_type='bar', title='Rooms Rating')
    # DataPlotter.plot_data(data_frame, 'service_rating', plot_type='bar', title='Service Rating')
    # DataPlotter.plot_data(data_frame, 'value_rating', plot_type='bar', title='Value Rating')
    plt.show()
Code Example #39
    def load_cache_context_topics(self, cycle_index, fold_index):

        print('load cache context topics: %s' %
              time.strftime("%Y/%m/%d-%H:%M:%S"))

        topics_file_path = Constants.generate_file_name(
            'context_topics', 'json', Constants.CACHE_FOLDER, cycle_index,
            fold_index, True)

        self.context_rich_topics = sorted(
            ETLUtils.load_json_file(topics_file_path)[0].items(),
            key=operator.itemgetter(1),
            reverse=True)

        self.context_topics_map = {}
        for record in self.important_records:
            self.context_topics_map[record[Constants.REVIEW_ID_FIELD]] = \
                record[Constants.CONTEXT_TOPICS_FIELD]
Code Example #40
File: main.py Project: swarnamd/yelp
def topic_stability_main():

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

    # num_topic_list = range(2, 101)
    num_topic_list = [2, 5]
    results = {}
    for num_topics in num_topic_list:
        new_properties = {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics}
        Constants.update_properties(new_properties)
        results[num_topics] = calculate_topic_stability(records)

    print('Results:')
    for num_topics in num_topic_list:
        scores = results[num_topics]
        print('%d: %.4f [%.4f,%.4f]' %
              (num_topics, numpy.nanmean(scores), numpy.nanmin(scores),
               numpy.nanmax(scores)))
Code Example #41
def pre_process_reviews():
    """
    Returns a list of preprocessed reviews: the reviews have been filtered to
    keep only relevant data, fields that are not useful have been dropped, and
    additional fields that are handy for calculations have been added

    :return: a list of preprocessed reviews
    """
    reviews_file = '/Users/fpena/UCC/Thesis/datasets/yelp_phoenix_academic_dataset/yelp_academic_dataset_review.json'
    reviews = ETLUtils.load_json_file(reviews_file)

    select_fields = ['user_id', 'business_id', 'stars']
    reviews = ETLUtils.select_fields(select_fields, reviews)
    extract_fields(reviews)
    ETLUtils.drop_fields(['business_id', 'stars'], reviews)
    # reviews = load_json_file('/Users/fpena/tmp/filtered_reviews.json')
    reviews = clean_reviews(reviews)

    return reviews
Code Example #42
def main():
    # reviews = pre_process_reviews()
    # save_dictionary_list_to_file(reviews, '/Users/fpena/tmp/filtered_reviews.json')
    reviews = ETLUtils.load_json_file('/Users/fpena/tmp/filtered_reviews.json')
    data_frame = DataFrame(reviews)
    column = 'offering_id'
    groupby = data_frame.groupby(column)
    counts = groupby.mean()
    # print(counts)

    items = counts.index.get_level_values(0).tolist()

    for item, mean in zip(items, counts['overall_rating']):
        print(item, mean)

    # print(get_item_list(reviews, 2))
    # print(len(reviews))
    # initialize_users(reviews, 10)
    pass
Code Example #43
    def tag_reviews_language(self):

        print('%s: tag reviews language' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        if os.path.exists(Constants.LANGUAGE_RECORDS_FILE):
            print('Records have already been tagged with language field')
            self.records = \
                ETLUtils.load_json_file(Constants.LANGUAGE_RECORDS_FILE)
            return

        DetectorFactory.seed = 0

        for record in self.records:
            try:
                language = langdetect.detect(record[Constants.TEXT_FIELD])
            except LangDetectException:
                language = 'unknown'
            record[Constants.LANGUAGE_FIELD] = language

        ETLUtils.save_json_file(Constants.LANGUAGE_RECORDS_FILE, self.records)
Code Example #44
def create_single_topic_model(cycle_index, fold_index):

    print(Constants._properties)
    print('%s: Start' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

    plant_seeds()
    num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
    split = 1 - (1 / float(num_folds))

    for i in range(cycle_index + 1):

        if Constants.SHUFFLE_DATA:
            random.shuffle(records)

    cv_start = float(fold_index) / num_folds
    train_records, test_records = \
        ETLUtils.split_train_test(records, split=split, start=cv_start)
    create_topic_model(train_records, cycle_index, fold_index)
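The arithmetic above (split = 1 - 1/num_folds, cv_start = fold_index/num_folds) implies that ETLUtils.split_train_test carves the test fold out of the records starting at a fractional offset. A sketch under that assumption:

def split_train_test(records, split, start=0.0):
    # Assumed behavior: the test fold is the (1 - split) fraction of the
    # records beginning at fractional offset `start`; the rest is train.
    num_records = len(records)
    test_start = int(start * num_records)
    test_size = int(round((1 - split) * num_records))
    test_records = records[test_start:test_start + test_size]
    train_records = records[:test_start] + records[test_start + test_size:]
    return train_records, test_records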
Code Example #45
File: main.py Project: swarnamd/yelp
def analyze_fourcity():
    records = ETLUtils.load_json_file(Constants.FULL_PROCESSED_RECORDS_FILE)
    # for record in records:
    #     print(record)

    cols = [
        Constants.USER_ID_FIELD,
        Constants.ITEM_ID_FIELD,
        Constants.RATING_FIELD
    ]
    data_frame = pandas.DataFrame(records, columns=cols)
    print(data_frame.describe())

    zero_records = 0
    for record in records:
        if record[Constants.RATING_FIELD] < 1.0:
            print(record)
            zero_records += 1

    for record in records:
        if record[Constants.RATING_FIELD] > 5.0:
            print(record)
            zero_records += 1
    print('zero records: %d' % zero_records)

    # Look for duplicates
    keys_set = set()
    num_duplicates = 0
    print('Looking for duplicates')
    records_map = {}
    for record in records:
        # if record[Constants.USER_ITEM_KEY_FIELD] in keys_set:
        record_key = record[Constants.USER_ITEM_KEY_FIELD]
        if record_key in records_map:
            print('old record', records_map[record_key][Constants.TEXT_FIELD])
            print('new record', record[Constants.TEXT_FIELD])
            num_duplicates += 1
        keys_set.add(record_key)
        records_map[record_key] = record

    print('duplicate records: %d' % num_duplicates)
Code Example #46
    def create_reviews(self):
        with self.doc.create(Section('Reviews')):
            with self.doc.create(Subsection('A subsection')):

                sg_map = {'yes': 'specific', 'no': 'generic'}

                review_index = 0
                # full_records = ETLUtils.load_json_file(
                #     Constants.FULL_PROCESSED_RECORDS_FILE)
                records_file = Constants.DATASET_FOLDER +\
                    'classified_' + Constants.ITEM_TYPE + '_reviews.json'
                full_records = ETLUtils.load_json_file(records_file)

                for record in full_records:
                    with self.doc.create(Subsection('Review %d (%s)' % (
                            review_index + 1, sg_map[record['specific']]))):
                        # for doc_part in self.build_text(
                        #         record[Constants.TEXT_FIELD]):
                        for doc_part in self.build_text_manual(record):
                            self.doc.append(doc_part)
                    review_index += 1
Code Example #47
def pre_process_reviews():
    """
    Returns a list of preprocessed reviews: the reviews have been filtered to
    keep only relevant data, fields that are not useful have been dropped, and
    additional fields that are handy for calculations have been added

    :return: a list of preprocessed reviews
    """
    data_folder = '/Users/fpena/UCC/Thesis/datasets/TripAdvisor/Four-City/'
    review_file_path = data_folder + 'review.txt'
    # review_file_path = data_folder + 'review-short.json'
    reviews = ETLUtils.load_json_file(review_file_path)

    select_fields = ['ratings', 'author', 'offering_id']
    reviews = ETLUtils.select_fields(select_fields, reviews)
    extract_fields(reviews)
    ETLUtils.drop_fields(['author', 'ratings'], reviews)
    # reviews = load_json_file('/Users/fpena/tmp/filtered_reviews.json')
    # reviews = preflib_extractor.load_csv_file('/Users/fpena/UCC/Thesis/datasets/TripAdvisor/PrefLib/trip/CD-00001-00000001-copy.dat')
    reviews = clean_reviews(reviews)

    return reviews
Code Example #48
File: main.py Project: swarnamd/yelp
def build_manual_topic_model():

    new_classified_records_file = Constants.DATASET_FOLDER + 'classified_' + \
          Constants.ITEM_TYPE + '_reviews_first_sentences.json'
    records = ETLUtils.load_json_file(new_classified_records_file)
    # records = ETLUtils.filter_records(records, 'context_type', ['context'])
    # records = ETLUtils.filter_records(records, 'sentence_type', ['specific'])
    # records = ETLUtils.filter_records(records, 'sentence_index', [0])
    print('total records: %d' % len(records))

    # print(records[0])
    count = 0

    for i in range(len(records)):
        record = records[i]
        if record['sentence_index'] == 0.0:
            # if record['context_type'] == 'context' and record['context_summary'] != 'all_context':
            if record['sentence_type'] == 'specific':
                print('%d:\t%s' % (i+1, records[i]['text']))
                count += 1

    print('count: %d' % count)
Code Example #49
File: business_etl.py Project: neostoic/yelp-1
    def create_category_matrix(file_path):
        """
        Creates a matrix with all the categories for businesses that are
        contained in the Yelp Phoenix Business data set. Each column of the
        matrix represents a category, and each row a business. This is a binary
        matrix that contains a 1 at the position i,j if the business i contains
        the category j, and a 0 otherwise.

        :rtype : numpy array matrix
        :param file_path: the path for the file that contains the businesses
        data
        :return: a numpy array binary matrix
        """
        records = ETLUtils.load_json_file(file_path)

        # Now we obtain the categories for all the businesses
        records = ETLUtils.add_transpose_list_column('categories', records)
        BusinessETL.drop_unwanted_fields(records)
        matrix = numpy.array(
            [numpy.array(record.values()) for record in records])

        return matrix
Code Example #50
def load_records():
    """
    Loads the reviews that have been manually tagged at a sentence level;
    these are the reviews that we will use to train our classifier. Only the
    first sentence of each review will be used
    """

    print('%s: load records' % time.strftime("%Y/%m/%d-%H:%M:%S"))
    # file_name = '/Users/fpena/UCC/Thesis/datasets/context/oldClassifiedFiles/classified_yelp_hotel_reviews.json'
    # file_name = '/Users/fpena/UCC/Thesis/datasets/context/oldClassifiedFiles/classified_yelp_restaurant_reviews.json'
    # records = ETLUtils.load_json_file(file_name)
    records = ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)

    # Take only the first sentence
    # document_level = 1
    if isinstance(Constants.DOCUMENT_LEVEL, (int, long)):
        records = [
            record for record in records
            if record['sentence_index'] < Constants.DOCUMENT_LEVEL
        ]

    return records
Code Example #51
File: main.py Project: swarnamd/yelp
def create_topic_model_with_context_records():

    processed_records_file = Constants.generate_file_name(
        'classified_processed_reviews', 'json', Constants.CACHE_FOLDER, None,
        None, False, True)
    records = ETLUtils.load_json_file(processed_records_file)
    print('records length: %d' % len(records))

    context_records = ETLUtils.filter_records(records, 'context_type', ['context'])
    print('context records length: %d' % len(context_records))
    context_specific_records = ETLUtils.filter_records(context_records, 'predicted_class', ['specific'])
    print('context specific records length: %d' % len(context_specific_records))

    for i in range(len(context_specific_records)):
        # print('%d:\t%s' % (i, context_records[i]['text']))
        print('%d:\t%s' % (i, context_specific_records[i]['bow']))

    for i in range(1, len(context_records)+1):

        Constants.update_properties({Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: i})
        context_extractor = \
            topic_model_creator.create_topic_model(records, None, None)

        topic_data = []

        for topic in range(Constants.TOPIC_MODEL_NUM_TOPICS):
            result = {}
            result['topic_id'] = topic
            result.update(split_topic(context_extractor.print_topic_model(
                num_terms=Constants.TOPIC_MODEL_STABILITY_NUM_TERMS)[topic]))
            result['ratio'] = context_extractor.topic_ratio_map[topic]
            result['weighted_frequency'] = \
                context_extractor.topic_weighted_frequency_map[topic]
            topic_data.append(result)

        file_name = Constants.generate_file_name(
            'manual_topic_model', 'xlsx', Constants.DATASET_FOLDER, None, None, True)
        generate_excel_file(topic_data, file_name)
Code Example #52
def create_single_topic_model(cycle_index, fold_index, check_exists=True):

    Constants.print_properties()
    print('%s: Start' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        msg = 'This function shouldn\'t be used when the ' \
              'separate_topic_model_recsys_reviews property is set to True'
        raise ValueError(msg)

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

    if Constants.CROSS_VALIDATION_STRATEGY == 'nested_test':
        pass
    elif Constants.CROSS_VALIDATION_STRATEGY == 'nested_validate':
        num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
        cycle = Constants.NESTED_CROSS_VALIDATION_CYCLE
        split = 1 - (1 / float(num_folds))
        cv_start = float(cycle) / num_folds
        print('cv_start', cv_start)
        records, _ = ETLUtils.split_train_test(records, split, cv_start)
    else:
        raise ValueError('Unknown cross-validation strategy')

    utilities.plant_seeds()
    num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
    split = 1 - (1/float(num_folds))

    for i in range(cycle_index+1):

        if Constants.SHUFFLE_DATA:
            random.shuffle(records)

    cv_start = float(fold_index) / num_folds
    train_records, test_records = \
        ETLUtils.split_train_test(records, split=split, start=cv_start)
    return create_topic_model(
        train_records, cycle_index, fold_index, check_exists)
Code Example #53
def load_records():
    """
    Loads the reviews that have been manually tagged at a sentence level;
    these are the reviews that we will use to train our classifier. Only the
    first sentence of each review will be used
    """

    print('%s: load records' % time.strftime("%Y/%m/%d-%H:%M:%S"))
    dataset = Constants.ITEM_TYPE
    folder = Constants.DATASET_FOLDER
    records_file = folder + \
                   'classified_' + dataset + '_reviews_sentences.json'
    records = ETLUtils.load_json_file(records_file)

    # Take only the first sentence
    # max_sentences = 1
    if Constants.MAX_SENTENCES is not None:
        records = [
            record for record in records
            if record['sentence_index'] < Constants.MAX_SENTENCES
        ]

    return records
Code Example #54
def evaluate_topic_model(metric):
    print('%s: evaluating topic model' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    Constants.update_properties({
        Constants.NUMPY_RANDOM_SEED_FIELD:
        Constants.NUMPY_RANDOM_SEED + 10,
        Constants.RANDOM_SEED_FIELD:
        Constants.RANDOM_SEED + 10
    })
    utilities.plant_seeds()
    Constants.print_properties()

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        num_records = len(records)
        records = records[:num_records / 2]
    print('num_reviews', len(records))

    all_term_rankings = None
    topic_model_type = Constants.TOPIC_MODEL_TYPE
    if topic_model_type in ['lda', 'nmf']:
        all_term_rankings = create_all_term_rankings(records, metric)
    elif topic_model_type == 'ensemble':
        all_term_rankings = create_all_term_rankings_from_ensemble()
    else:
        raise ValueError('Unrecognized topic modeling algorithm: \'%s\'' %
                         topic_model_type)
    print('Total iterations: %d' % len(all_term_rankings))

    if metric == TERM_STABILITY_REFERENCE:
        return eval_term_stability_reference(all_term_rankings)
    if metric == TERM_STABILITY_PAIRWISE:
        return eval_term_stability_pairwise(all_term_rankings)
    elif metric == TERM_DIFFERENCE:
        return eval_term_difference(all_term_rankings)
    else:
        raise ValueError('Unknown evaluation metric: \'%s\'' % metric)
Code Example #55
    def find_reviews_topics(self, context_extractor, cycle_index, fold_index):
        print('find topics: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        train_records_file_path = Constants.generate_file_name(
            'context_train_records', 'json', Constants.CACHE_FOLDER,
            cycle_index, fold_index, Constants.USE_CONTEXT)

        if os.path.exists(train_records_file_path):
            self.train_records = \
                ETLUtils.load_json_file(train_records_file_path)
        else:
            context_extractor.find_contextual_topics(self.train_records)
            ETLUtils.save_json_file(train_records_file_path,
                                    self.train_records)
        context_extractor.find_contextual_topics(
            self.important_records, Constants.TEXT_SAMPLING_PROPORTION)

        self.context_topics_map = {}
        for record in self.important_records:
            self.context_topics_map[record[Constants.REVIEW_ID_FIELD]] = \
                record[Constants.CONTEXT_TOPICS_FIELD]

        self.important_records = None
        gc.collect()
Code Example #56
    def load_records_to_predict(self, records_file):
        self.records_to_predict = ETLUtils.load_json_file(records_file)
        with open(records_file + '.pkl', 'rb') as read_file:
            self.items_to_predict = pickle.load(read_file)
Code Example #57
File: main.py Project: swarnamd/yelp
def transform_manually_labeled_reviews():

    full_records = ETLUtils.load_json_file(Constants.DATASET_FOLDER + 'yelp_training_set_review_restaurants_shuffled_tagged.json')

    records = ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)
    print('total records: %d' % len(records))

    new_records = []

    for record in records:

        sentence_index = record['sentence_index']

        if sentence_index > 0:
            continue
        record['predicted_class'] = record['sentence_type']
        new_records.append(record)

    # count = 0
    # for new_record in new_records:
    #     internal_count = 0
    #     for full_record in full_records:
    #         if full_record['text'].startswith(new_record['text']):
    #             # print(full_record['text'])
    #             internal_count += 1
    #             count += 1
    #             print('internal count: %d\treview_id: %s' % (internal_count, full_record['review_id']))
    #
    #             if internal_count > 1:
    #                 print('internal count: %d\treview_id: %s' % (internal_count, new_record['text']))

    # print('count: %d' % count)

    index = 0

    for new_record in new_records:

        while True:

            full_record = full_records[index]

            if full_record['text'].startswith(new_record['text']):
                new_record[Constants.USER_ID_FIELD] = full_record['user_id']
                new_record[Constants.ITEM_ID_FIELD] = full_record['business_id']
                new_record[Constants.REVIEW_ID_FIELD] = full_record['review_id']
                new_record[Constants.RATING_FIELD] = full_record['stars']
                break
            index += 1
        index += 1

    print('index: %d' % index)

    for new_record in new_records:

        for full_record in full_records:
            if new_record['review_id'] == full_record['review_id']:
                print('%s ====' % new_record['text'])
                print(full_record['text'])
                print('******************\n******************\n******************\n******************')
                break

    # reviews_preprocessor = ReviewsPreprocessor()
    # new_records = reviews_preprocessor.lemmatize_sentences(new_records)
    # reviews_preprocessor.records = new_records
    # reviews_preprocessor.build_bag_of_words()
    # reviews_preprocessor.drop_unnecessary_fields()

    new_classified_records_file = Constants.DATASET_FOLDER + 'classified_' + \
        Constants.ITEM_TYPE + '_reviews_first_sentences.json'

    print(new_records[0])

    ETLUtils.save_json_file(new_classified_records_file, new_records)
Code Example #58
    @staticmethod
    def filter_reviews_by_business_slow(reviews, business_ids):
        # (signature assumed, inferred from the module-level call below)
        filtered_reviews = []
        for review in reviews:
            if review['business_id'] in business_ids:
                filtered_reviews.append(review)

        return filtered_reviews

    @staticmethod
    def sort_records(records, field, reverse=False):
        return sorted(records, key=itemgetter(field), reverse=reverse)



start = time.time()

review_etl = ReviewETL()
my_business_file = "/Users/fpena/tmp/yelp_training_set/yelp_training_set_business.json"
my_reviews_file = "/Users/fpena/tmp/yelp_training_set/yelp_training_set_review.json"
my_business_ids = BusinessETL.get_business_ids(my_business_file, 'Hotels')
my_reviews = ETLUtils.load_json_file(my_reviews_file)
# print(len(ReviewETL.filter_reviews_by_business(my_reviews, my_business_ids, 'text')))
my_restaurant_reviews = ReviewETL.filter_reviews_by_business_slow(my_reviews, my_business_ids)
my_restaurants_file = "/Users/fpena/tmp/yelp_training_set/yelp_training_set_review_hotels.json"
ETLUtils.save_json_file(my_restaurants_file, my_restaurant_reviews)
# my_sorted_reviews = ReviewETL.sort_records(my_reviews, 'business_id')
# print(len(my_sorted_reviews))


# main()
end = time.time()
total_time = end - start
print("Total time = %f seconds" % total_time)
Code Example #59
File: main.py Project: neostoic/yelp-1
def main():
    item_type = 'hotel'
    # item_type = 'restaurant'
    my_folder = '/Users/fpena/UCC/Thesis/datasets/context/'
    my_file = my_folder + 'classified_' + item_type + '_reviews.json'
    binary_reviews_file = my_folder + 'classified_' + item_type + '_reviews.pkl'
    my_records = ETLUtils.load_json_file(my_file)

    with open(binary_reviews_file, 'rb') as read_file:
        my_reviews = pickle.load(read_file)

    num_features = 2

    my_metrics = numpy.zeros((len(my_reviews), num_features))
    for index in range(len(my_reviews)):
        my_metrics[index] =\
            review_metrics_extractor.get_review_metrics(my_reviews[index])

    review_metrics_extractor.normalize_matrix_by_columns(my_metrics)

    count_specific = 0
    count_generic = 0
    for record in my_records:

        if record['specific'] == 'yes':
            count_specific += 1

        if record['specific'] == 'no':
            count_generic += 1

    print('count_specific: %d' % count_specific)
    print('count_generic: %d' % count_generic)
    print('specific percentage: %f%%' % (float(count_specific) / len(my_records) * 100))
    print('generic percentage: %f%%' % (float(count_generic) / len(my_records) * 100))

    my_labels = numpy.array([record['specific'] == 'yes' for record in my_records])

    classifiers = [
        DummyClassifier(strategy='most_frequent', random_state=0),
        DummyClassifier(strategy='stratified', random_state=0),
        DummyClassifier(strategy='uniform', random_state=0),
        # DummyClassifier(strategy='constant', random_state=0, constant=True),
        LogisticRegression(C=100),
        SVC(C=1.0, kernel='rbf'),
        SVC(C=1.0, kernel='linear'),
        KNeighborsClassifier(n_neighbors=10),
        tree.DecisionTreeClassifier(),
        NuSVC(),
        LinearSVC()
    ]
    scores = [[] for _ in range(len(classifiers))]

    Xtrans = my_metrics

    cv = KFold(n=len(my_metrics), n_folds=5)

    for i in range(len(classifiers)):
        for train, test in cv:
            x_train, y_train = Xtrans[train], my_labels[train]
            x_test, y_test = Xtrans[test], my_labels[test]

            clf = classifiers[i]
            clf.fit(x_train, y_train)
            scores[i].append(clf.score(x_test, y_test))

    for classifier, score in zip(classifiers, scores):
        print("%s: Mean(scores)=%.5f\tStddev(scores)=%.5f" %
              (type(classifier).__name__, numpy.mean(score), numpy.std(score)))

    plot(my_metrics, my_labels)
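Note that KFold(n=..., n_folds=...) is the scikit-learn API from before version 0.18; on current scikit-learn the same cross-validation loop would be written with the model_selection module:

import numpy
from sklearn.model_selection import KFold

my_metrics = numpy.random.rand(20, 2)     # placeholder data of the same shape
my_labels = numpy.random.rand(20) > 0.5   # placeholder boolean labels

cv = KFold(n_splits=5)
for train, test in cv.split(my_metrics):
    x_train, y_train = my_metrics[train], my_labels[train]
    x_test, y_test = my_metrics[test], my_labels[test]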