Code Example #1
File: reviews_classifier.py Project: bachlog/yelp
def main():
    # dataset = 'hotel'
    dataset = 'restaurant'
    my_folder = '/Users/fpena/UCC/Thesis/datasets/context/'
    my_training_records_file =\
        my_folder + 'classified_' + dataset + '_reviews.json'
    my_training_reviews_file =\
        my_folder + 'classified_' + dataset + '_reviews.pkl'
    my_training_records = ETLUtils.load_json_file(my_training_records_file)

    with open(my_training_reviews_file, 'rb') as read_file:
        my_training_reviews = pickle.load(read_file)

    classifier = ReviewsClassifier()
    classifier.train(my_training_records, my_training_reviews)

    my_input_records_file =\
        my_folder + 'yelp_training_set_review_' + dataset + 's_shuffled.json'
    my_input_reviews_file =\
        my_folder + 'reviews_' + dataset + '_shuffled.pkl'
    my_output_records_file =\
        my_folder + 'yelp_training_set_review_' + dataset +\
        's_shuffled_tagged.json'

    with open(my_input_reviews_file, 'rb') as read_file:
        my_input_reviews = pickle.load(read_file)

    my_input_records = ETLUtils.load_json_file(my_input_records_file)

    my_output_records =\
        classifier.label_json_reviews(my_input_records, my_input_reviews)

    ETLUtils.save_json_file(my_output_records_file, my_output_records)
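The examples on this page all revolve around ETLUtils.load_json_file and ETLUtils.save_json_file, whose implementations are not shown. As a minimal sketch, assuming the Yelp-style format of one JSON object per line, they could look like this:

import json

def load_json_file(file_path):
    # Assumed format: one JSON object per line, as in the Yelp dataset dumps.
    with open(file_path) as json_file:
        return [json.loads(line) for line in json_file]

def save_json_file(file_path, records):
    # Writes the records back in the same one-object-per-line format.
    with open(file_path, 'w') as json_file:
        for record in records:
            json_file.write(json.dumps(record) + '\n')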
Code Example #2
File: top_n_runner.py Project: bachlog/yelp
def main_evaluate():
    I = my_i

    records = ETLUtils.load_json_file(RECORDS_FILE)
    # print('num_records', len(records))

    test_file = RECORDS_FILE + '_test'
    test_records = ETLUtils.load_json_file(test_file)

    top_n_evaluator = TopNEvaluator(records, test_records, DATASET, 10, I)
    top_n_evaluator.find_important_records()
    # top_n_evaluator.initialize()

    # records_to_predict_file = DATASET_FOLDER + 'generated/records_to_predict_' + DATASET + '.json'
    top_n_evaluator.load_records_to_predict(RECORDS_TO_PREDICT_FILE)

    predictions_file = GENERATED_FOLDER + 'predictions_' + DATASET + '.txt'
    predictions = rmse_calculator.read_targets_from_txt(predictions_file)

    # print('total predictions', len(predictions))
    top_n_evaluator.evaluate(predictions)
    # print('precision', top_n_evaluator.precision)
    print('recall', top_n_evaluator.recall)

    return top_n_evaluator.recall
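rmse_calculator.read_targets_from_txt is also not shown. Since the predictions file appears to be the output of an external recommender (one numeric prediction per line; see the libFM conversion in Code Example #24), a plausible sketch is:

def read_targets_from_txt(file_path):
    # Assumed format: one floating-point prediction per line.
    with open(file_path) as predictions_file:
        return [float(line) for line in predictions_file if line.strip()]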
Code Example #3
    def load(self):
        print('load: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        self.original_records =\
            ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
        # ETLUtils.drop_fields(['tagged_words'], self.original_records)
        print('num_records: %d' % len(self.original_records))

        if not os.path.exists(Constants.USER_ITEM_MAP_FILE):
            records = ETLUtils.load_json_file(Constants.RECORDS_FILE)
            user_item_map = create_user_item_map(records)
            with open(Constants.USER_ITEM_MAP_FILE, 'wb') as write_file:
                pickle.dump(user_item_map, write_file, pickle.HIGHEST_PROTOCOL)
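create_user_item_map is not shown either. Given that the map is built once from the raw records and cached with pickle, a plausible sketch (the field names are assumptions) maps each user ID to the list of items that user has reviewed:

def create_user_item_map(records):
    # Assumed structure: user_id -> list of the items the user has reviewed.
    user_item_map = {}
    for record in records:
        user_item_map.setdefault(record['user_id'], []).append(
            record['business_id'])
    return user_item_map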
Code Example #4
File: main.py Project: melqkiades/yelp
def dataset_bucket_analysis_by_field(field):
    # Set the dataset
    hotel_dataset_properties = {Constants.BUSINESS_TYPE_FIELD: 'fourcity_hotel'}
    Constants.update_properties(hotel_dataset_properties)

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

    print('Loaded %d records' % len(records))

    user_frequency_map = {}

    for record in records:

        user_id = record[field]
        if user_id not in user_frequency_map:
            user_frequency_map[user_id] = 0
        user_frequency_map[user_id] += 1

    print('There is a total of %d %ss' % (len(user_frequency_map), field))
    sorted_x = sorted(user_frequency_map.items(), key=operator.itemgetter(1), reverse=True)
    print(sorted_x[0])
    print(sorted_x[1])
    print(sorted_x[2])
    # print(user_frequency_map)

    # Number of reviews per user
    rda = ReviewsDatasetAnalyzer(records)
    users_summary = rda.summarize_reviews_by_field(field)
    print('Average number of reviews per %s: %f' % (field,
          float(rda.num_reviews) / rda.num_users))
    users_summary.plot(kind='line', rot=0)

    pandas.set_option('display.max_rows', len(users_summary))
    print(users_summary)
    pandas.reset_option('display.max_rows')
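The frequency map above can be built more idiomatically with collections.Counter, which also provides the top-3 lookup directly; an equivalent sketch:

from collections import Counter

def count_field_values(records, field):
    # Counter replaces the manual "if key not in map" bookkeeping.
    frequency_map = Counter(record[field] for record in records)
    for value, count in frequency_map.most_common(3):
        print('%s: %d' % (value, count))
    return frequency_map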
Code Example #5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-c', '--cycle', metavar='int', type=int,
        nargs=1, help='The index of the running cycle')
    parser.add_argument(
        '-f', '--fold', metavar='int', type=int,
        nargs=1, help='The index of the cross validation fold')
    parser.add_argument(
        '-t', '--numtopics', metavar='int', type=int,
        nargs=1, help='The number of topics of the topic model')

    args = parser.parse_args()
    fold = args.fold[0] if args.fold is not None else None
    cycle = args.cycle[0] if args.cycle is not None else None
    num_topics = args.numtopics[0] if args.numtopics is not None else None

    if num_topics is not None:
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics})

    if fold is None and cycle is None:
        records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

        if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
            num_records = len(records)
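            # Python 2 integer division; Python 3 would need num_records // 2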
            records = records[:num_records / 2]
        print('num_reviews', len(records))

        create_topic_model(records, None, None)
    else:
        create_single_topic_model(cycle, fold)
Code Example #6
    def load(self):
        print('load: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        self.original_records = ETLUtils.load_json_file(Constants.RECORDS_FILE)
        with open(Constants.REVIEWS_FILE, 'rb') as read_file:
            self.original_reviews = pickle.load(read_file)
        print('num_records: %d' % len(self.original_records))

        for record, review in zip(self.original_records, self.original_reviews):
            review.id = record[Constants.REVIEW_ID_FIELD]
            review.rating = record[Constants.RATING_FIELD]

        if not os.path.exists(Constants.USER_ITEM_MAP_FILE):
            records = ETLUtils.load_json_file(Constants.RECORDS_FILE)
            user_item_map = create_user_item_map(records)
            with open(Constants.USER_ITEM_MAP_FILE, 'wb') as write_file:
                pickle.dump(user_item_map, write_file, pickle.HIGHEST_PROTOCOL)
Code Example #7
    def classify_reviews(self):
        print('%s: classify reviews' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        dataset = Constants.ITEM_TYPE
        folder = Constants.DATASET_FOLDER
        file_name_suffix =\
            '' if Constants.MAX_SENTENCES is None else '_sentences'
        training_records_file = folder +\
            'classified_' + dataset + '_reviews' + file_name_suffix + '.json'
        training_records = ETLUtils.load_json_file(training_records_file)

        if Constants.MAX_SENTENCES is not None:
            training_records = [
                record for record in training_records
                if record['sentence_index'] < Constants.MAX_SENTENCES
            ]
            for record in training_records:
                record['specific'] = \
                    'yes' if record['sentence_type'] == 'specific' else 'no'
            print('num training records', len(training_records))

        self.lemmatize_reviews(training_records)

        classifier = ReviewsClassifier(self.classifier, self.resampler)
        classifier.train(training_records)
        classifier.label_json_reviews(self.records)
Code Example #8
def main():
    # my_file = '/Users/fpena/UCC/Thesis/datasets/context/classified_hotel_reviews.json'
    my_file = '/Users/fpena/UCC/Thesis/datasets/context/classified_restaurant_reviews.json'
    my_records = ETLUtils.load_json_file(my_file)
    # my_reviews = []
    # my_index = 0
    #
    # print("records:", len(my_records))
    #
    # for record in my_records:
    #     my_index += 1
    #     my_reviews.append(Review(record['text']))
    #     print('index', my_index)

    # binary_reviews_file = '/Users/fpena/UCC/Thesis/datasets/context/classified_hotel_reviews.pkl'
    binary_reviews_file = '/Users/fpena/UCC/Thesis/datasets/context/classified_restaurant_reviews.pkl'
    # with open(binary_reviews_file, 'wb') as write_file:
    #     pickle.dump(my_reviews, write_file, pickle.HIGHEST_PROTOCOL)

    with open(binary_reviews_file, 'rb') as read_file:
        my_reviews = pickle.load(read_file)

    cluster_labels = cluster_reviews(my_reviews)
    specific_records = split_list_by_labels(my_records, cluster_labels)[0]
    generic_records = split_list_by_labels(my_records, cluster_labels)[1]
Code Example #9
def update_labeled_reviews_records():

    reviews_label_map = compare_records()
    agreed_review_ids = set(reviews_label_map.keys())
    classifier_records = \
        ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)
    classifier_review_ids = \
        {record[Constants.REVIEW_ID_FIELD] for record in classifier_records}
    non_agreed_review_ids = classifier_review_ids.difference(agreed_review_ids)

    # for record in classifier_records:
        # print(record)

    print('number of records before: %d' % len(classifier_records))

    print(reviews_label_map)
    print(non_agreed_review_ids)
    review_type_map = {'s': 'yes', 'g': 'no'}

    # Remove from the classifier records the ones that do not have an
    # agreed-upon label
    classifier_records = ETLUtils.filter_out_records(
        classifier_records, Constants.REVIEW_ID_FIELD, non_agreed_review_ids)

    # Finally, update the labels
    for record in classifier_records:
        review_id = record[Constants.REVIEW_ID_FIELD]
        record[Constants.SPECIFIC] = review_type_map[reviews_label_map[review_id]]
        # print(record)

    print('number of records after: %d' % len(classifier_records))
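ETLUtils.filter_out_records, used above, and ETLUtils.filter_records, used in later examples, are easy to infer from their call sites: they drop or keep the records whose field value falls within a given collection. A sketch under that assumption:

def filter_records(records, field, values):
    # Keep only the records whose field value is in the given collection.
    return [record for record in records if record[field] in values]

def filter_out_records(records, field, values):
    # Drop the records whose field value is in the given collection.
    return [record for record in records if record[field] not in values]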
Code Example #10
    def classify_reviews(self):
        print('%s: classify reviews' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        print(Constants.CLASSIFIED_RECORDS_FILE)
        training_records =\
            ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)

        # DOCUMENT_LEVEL can be 'review', 'sentence' or an int; anything
        # other than 'review' triggers the sentence-level filtering below
        document_level = Constants.DOCUMENT_LEVEL
        if document_level != 'review':

            if document_level == 'sentence':
                document_level = float("inf")

            training_records = [
                record for record in training_records
                if record['sentence_index'] < document_level
            ]
            for record in training_records:
                record['specific'] = \
                    'yes' if record['sentence_type'] == 'specific' else 'no'
            print('num training records', len(training_records))

        training_records = self.lemmatize_reviews(training_records)

        classifier = ReviewsClassifier(self.classifier, self.resampler)
        classifier.train(training_records)
        classifier.label_json_reviews(self.records)
Code Example #11
def get_categories(file_path):
    records = ETLUtils.load_json_file(file_path)

    # Now we obtain the categories for all the businesses
    records = ETLUtils.add_transpose_list_column('categories', records)
    BusinessETL.drop_unwanted_fields(records)

    return records[0].keys()
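ETLUtils.add_transpose_list_column is not shown; from its use here, it plausibly expands the list-valued 'categories' field into one indicator column per distinct category, so that records[0].keys() afterwards yields the category names. A sketch under that assumption:

def add_transpose_list_column(field, records):
    # Collect every distinct value, then add one 0/1 column per value.
    all_values = set()
    for record in records:
        all_values.update(record[field])
    for record in records:
        for value in all_values:
            record[value] = 1 if value in record[field] else 0
    return records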
Code Example #12
File: main.py Project: melqkiades/yelp
def analyze_context_records():
    records = ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)
    records = ETLUtils.filter_records(records, 'context_type', ['context'])

    print('num records: %d' % len(records))

    for record in records:
        print(record[Constants.TEXT_FIELD])
Code Example #13
File: business_etl.py Project: antoine-tran/yelp
    def get_business_ids(file_path, business_type=None):
        records = ETLUtils.load_json_file(file_path)

        if not business_type:
            return [record['business_id'] for record in records]

        return [record['business_id'] for record in records
                if business_type in record['categories']]
Code Example #14
def generate_report_yelp_phoenix():
    reviews_file = '/Users/fpena/UCC/Thesis/datasets/yelp_phoenix_academic_dataset/filtered_reviews.json'
    report_file = '/Users/fpena/UCC/Thesis/projects/yelp/notebooks/dataset_analysis_yelp_phoenix.ipynb'
    reviews = ETLUtils.load_json_file(reviews_file)
    load_reviews_code =\
        'file_path = \'' + reviews_file + '\'\n' +\
        'reviews = ETLUtils.load_json_file(file_path)\n'
    ReviewsDatasetAnalyzerReport.generate_report(reviews, 'Yelp Phoenix', report_file, load_reviews_code)
Code Example #15
def generate_report_ruihai():
    file_path = '/Users/fpena/UCC/Thesis/datasets/TripHotelReviewXml/cleaned_reviews.json'
    file_name = '/Users/fpena/UCC/Thesis/projects/yelp/notebooks/dataset_analysis_report_ruihai.ipynb'
    reviews = ETLUtils.load_json_file(file_path)
    load_reviews_code =\
        'file_path = \'/Users/fpena/UCC/Thesis/datasets/TripHotelReviewXml/cleaned_reviews.json\'\n' +\
        'reviews = ETLUtils.load_json_file(file_path)\n'
    ReviewsDatasetAnalyzerReport.generate_report(reviews, 'Ruihai TripAdvisor', file_name, load_reviews_code)
Code Example #16
def generate_report_fourcity_filtered():
    file_path = '/Users/fpena/tmp/filtered_reviews_multi_non_sparse_shuffled.json'
    file_name = '/Users/fpena/UCC/Thesis/projects/yelp/notebooks/dataset_analysis_report_fourcity.ipynb'
    reviews = ETLUtils.load_json_file(file_path)
    load_reviews_code =\
        'file_path = \'/Users/fpena/tmp/filtered_reviews_multi_non_sparse_shuffled.json\'\n' +\
        'reviews = ETLUtils.load_json_file(file_path)\n'
    ReviewsDatasetAnalyzerReport.generate_report(reviews, 'Fourcity TripAdvisor', file_name, load_reviews_code)
Code Example #17
def main():

    records = ETLUtils.load_json_file(
        Constants.RECSYS_CONTEXTUAL_PROCESSED_RECORDS_FILE)
    context_transformer = ContextTransformer(records)
    context_transformer.load_data()
    context_transformer.transform_records()
    context_transformer.export_records()
Code Example #18
File: carskit_exporter.py Project: melqkiades/yelp
    def load_data(self):
        """
        Loads the records and the topic model from files

        """
        self.records = ETLUtils.load_json_file(
            Constants.RECSYS_TOPICS_PROCESSED_RECORDS_FILE)
        self.topic_extractor = NmfTopicExtractor()
        self.topic_extractor.load_trained_data()
Code Example #19
def main():
    classifier_records = \
        ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)
    # count_specific_generic_ratio(classifier_records)

    # load_data(file_name)
    # compare_records()
    # update_labeled_reviews_records()
    # foo()
    cohens_kappa()
Code Example #20
File: context_extractor.py Project: melqkiades/yelp
def main():

    # records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
    records = ETLUtils.load_json_file(Constants.RECSYS_TOPICS_PROCESSED_RECORDS_FILE)

    print('num_reviews', len(records))
    # lda_context_utils.discover_topics(my_reviews, 150)
    context_extractor = ContextExtractor(records)
    context_extractor.separate_reviews()
    context_extractor.get_context_rich_topics()
Code Example #21
File: basic_knn.py Project: antoine-tran/yelp
def load_data(json_file):
    records = ETLUtils.load_json_file(json_file)
    fields = ['user_id', 'business_id', 'stars']
    records = ETLUtils.select_fields(fields, records)

    # We rename the 'stars' field to 'overall_rating' to take advantage of the
    # function extractor.get_user_average_overall_rating
    for record in records:
        record['overall_rating'] = record.pop('stars')
        record['offering_id'] = record.pop('business_id')

    return records
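ETLUtils.select_fields, which appears throughout these examples, is presumably a simple projection; a minimal sketch:

def select_fields(fields, records):
    # Keep only the requested fields of each record.
    return [{field: record[field] for field in fields} for record in records]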
Code Example #22
def analyze_topics():

    start_time = time.time()

    utilities.plant_seeds()
    records = \
        ETLUtils.load_json_file(Constants.RECSYS_TOPICS_PROCESSED_RECORDS_FILE)
    print('num_reviews', len(records))
    num_topics = Constants.TOPIC_MODEL_NUM_TOPICS
    num_terms = Constants.TOPIC_MODEL_STABILITY_NUM_TERMS

    topic_model_string = None
    if Constants.TOPIC_MODEL_TYPE == 'ensemble':
        topic_model = NmfTopicExtractor()
        topic_model.load_trained_data()
        topic_model_string = topic_model.print_topic_model('max')
    elif Constants.TOPIC_MODEL_TYPE == 'lda':
        topic_model = topic_model_creator.load_topic_model(None, None)
        topic_model_string = [
            topic_model.print_topic(topic_id, num_terms)
            for topic_id in range(num_topics)
        ]
    context_extractor = ContextExtractor(records)
    context_extractor.separate_reviews()
    context_extractor.get_context_rich_topics()

    topic_data = []

    for topic in range(num_topics):
        result = {}
        result['topic_id'] = topic
        result.update(split_topic(topic_model_string[topic]))
        result['ratio'] = context_extractor.topic_ratio_map[topic]
        result['weighted_frequency'] = \
            context_extractor.topic_weighted_frequency_map[topic]
        topic_data.append(result)

    data_frame = DataFrame.from_dict(topic_data)
    scores = {}
    scores['num_topics'] = Constants.TOPIC_MODEL_NUM_TOPICS
    probability_score = data_frame['probability_score'].mean()
    scores['probability_score'] = probability_score

    print('probability score: %f' % scores['probability_score'])

    end_time = time.time()
    cycle_time = end_time - start_time
    scores['cycle_time'] = cycle_time

    print("Cycle time = %f seconds" % cycle_time)

    return scores
Code Example #23
File: top_n_runner.py Project: bachlog/yelp
def main_export():
    I = my_i

    records = ETLUtils.load_json_file(RECORDS_FILE)
    print('num_records', len(records))

    test_records = ETLUtils.load_json_file(TEST_RECORDS_FILE)
    # test_reviews = review_metrics_extractor.build_reviews(test_records)
    # with open(TEST_REVIEWS_FILE, 'wb') as write_file:
    #     pickle.dump(test_reviews, write_file, pickle.HIGHEST_PROTOCOL)
    # with open(TEST_REVIEWS_FILE, 'rb') as read_file:
    #     test_reviews = pickle.load(read_file)
    # train_file = RECORDS_FILE + '_train'
    # train_records = ETLUtils.load_json_file(train_file)

    with open(USER_ITEM_MAP_FILE, 'rb') as read_file:
        user_item_map = pickle.load(read_file)

    top_n_evaluator = TopNEvaluator(records, test_records, DATASET, 10, I)
    top_n_evaluator.initialize(user_item_map)

    top_n_evaluator.export_records_to_predict(RECORDS_TO_PREDICT_FILE)
Code Example #24
File: top_n_runner.py Project: bachlog/yelp
def main_converter():

    csv_train_file = GENERATED_FOLDER + 'yelp_training_set_review_' + DATASET + 's_shuffled_train.csv'
    csv_test_file = GENERATED_FOLDER + 'records_to_predict_' + DATASET + '.csv'

    # ETLUtils.json_to_csv(TRAIN_RECORDS_FILE, csv_train_file, 'user_id', 'business_id', 'stars', False, True)
    # ETLUtils.json_to_csv(RECORDS_TO_PREDICT_FILE, csv_test_file, 'user_id', 'business_id', 'stars', False, True)

    headers = ['stars', 'user_id', 'business_id']
    train_records = ETLUtils.load_json_file(TRAIN_RECORDS_FILE)
    records_to_predict = ETLUtils.load_json_file(RECORDS_TO_PREDICT_FILE)
    train_records = ETLUtils.select_fields(headers, train_records)
    records_to_predict = ETLUtils.select_fields(headers, records_to_predict)

    ETLUtils.save_csv_file(csv_train_file, train_records, headers)
    ETLUtils.save_csv_file(csv_test_file, records_to_predict, headers)

    csv_files = [
        csv_train_file,
        csv_test_file
    ]

    csv_to_libfm(csv_files, 0, [1, 2], [], ',', has_header=True)
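csv_to_libfm is not shown, but the libFM input format itself is documented: each line is the target value followed by sparse feature_id:value pairs, with categorical columns one-hot encoded. A minimal sketch of such a conversion (the signature is simplified relative to the call above):

import csv

def csv_to_libfm_sketch(csv_file, target_index, categorical_indices,
                        output_file):
    # One-hot encode each categorical column value as "feature_id:1".
    feature_ids = {}
    with open(csv_file) as input_file, open(output_file, 'w') as libfm_file:
        reader = csv.reader(input_file)
        next(reader)  # skip the header row (has_header=True above)
        for row in reader:
            features = []
            for index in categorical_indices:
                key = (index, row[index])
                feature_id = feature_ids.setdefault(key, len(feature_ids))
                features.append('%d:1' % feature_id)
            libfm_file.write(
                row[target_index] + ' ' + ' '.join(features) + '\n')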
Code Example #25
def main():

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

    print('num_reviews', len(records))
    # lda_context_utils.discover_topics(my_reviews, 150)
    context_extractor = NmfContextExtractor(records)
    context_extractor.generate_review_bows()
    context_extractor.build_document_term_matrix()
    # context_extractor.build_topic_model()
    context_extractor.build_stable_topic_model()
    context_extractor.print_topic_model()
    context_extractor.update_reviews_with_topics()
    context_extractor.get_context_rich_topics()
Code Example #26
File: data_analyzer.py Project: antoine-tran/yelp
def plot_overall_rating():

    # reviews = extractor.pre_process_reviews()
    reviews = ETLUtils.load_json_file('/Users/fpena/tmp/filtered_reviews.json')
    data_frame = DataFrame(reviews)

    print(data_frame)

    DataPlotter.plot_data(data_frame, 'overall_rating', plot_type='bar', title='Overall Rating')
    # DataPlotter.plot_data(data_frame, 'cleanliness_rating', plot_type='bar', title='Cleanliness Rating')
    # DataPlotter.plot_data(data_frame, 'location_rating', plot_type='bar', title='Location Rating')
    # DataPlotter.plot_data(data_frame, 'rooms_rating', plot_type='bar', title='Rooms Rating')
    # DataPlotter.plot_data(data_frame, 'service_rating', plot_type='bar', title='Service Rating')
    # DataPlotter.plot_data(data_frame, 'value_rating', plot_type='bar', title='Value Rating')
    plt.show()
Code Example #27
    def full_cycle(self):
        Constants.print_properties()
        print('%s: full cycle' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        utilities.plant_seeds()

        if self.use_cache and \
                os.path.exists(Constants.PROCESSED_RECORDS_FILE):
            print('Records have already been processed')
            self.records = \
                ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
        else:
            self.preprocess()

        if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
            self.separate_recsys_topic_model_records()
Code Example #28
    def lemmatize_records(self):

        if os.path.exists(Constants.LEMMATIZED_RECORDS_FILE):
            print('Records were already lemmatized')
            self.records = \
                ETLUtils.load_json_file(Constants.LEMMATIZED_RECORDS_FILE)
            return

        if Constants.DOCUMENT_LEVEL == 'review':
            self.records = self.lemmatize_reviews(self.records)
        elif Constants.DOCUMENT_LEVEL == 'sentence' or\
                isinstance(Constants.DOCUMENT_LEVEL, (int, long)):
            self.records = self.lemmatize_sentences(self.records)

        ETLUtils.save_json_file(Constants.LEMMATIZED_RECORDS_FILE, self.records)
Code Example #29
    def load_context_reviews(self, cycle_index, fold_index):

        train_records_file_path = Constants.generate_file_name(
            'context_train_records', 'json', Constants.CACHE_FOLDER,
            cycle_index, fold_index, True)
        important_records_file_path = Constants.generate_file_name(
            'context_important_records', 'json', Constants.CACHE_FOLDER,
            cycle_index, fold_index, True)

        self.train_records = ETLUtils.load_json_file(train_records_file_path)
        self.important_records = \
            ETLUtils.load_json_file(important_records_file_path)
        self.load_cache_context_topics(cycle_index, fold_index)

        self.context_topics_map = {}
        for record in self.important_records:
            self.context_topics_map[record[Constants.REVIEW_ID_FIELD]] = \
                record[Constants.CONTEXT_TOPICS_FIELD]

        # self.train_records = self.filter_context_words(self.train_records)
        # self.print_context_topics(self.important_records)

        self.important_records = None
        gc.collect()
Code Example #30
def main():
    utilities.plant_seeds()

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        num_records = len(records)
        records = records[:num_records / 2]
    print('num_reviews', len(records))

    context_extractor = \
        topic_model_creator.create_topic_model(records, None, None)

    topic_latex_generator = TopicLatexGenerator(context_extractor)
    topic_latex_generator.generate_pdf()
Code Example #31
File: main.py Project: melqkiades/yelp
def export_records_to_text():
    print('%s: Exporting bag-of-words to text files' %
          time.strftime("%Y/%m/%d-%H:%M:%S"))

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
    print('Total records: %d' % len(records))

    folder = '/Users/fpena/tmp/topic-ensemble/data/' + Constants.ITEM_TYPE + '/'

    for record in records:
        file_name = folder + Constants.ITEM_TYPE + '_' + \
                    record[Constants.REVIEW_ID_FIELD] + '.txt'

        with codecs.open(file_name, "w", encoding="utf-8-sig") as text_file:
            text_file.write(" ".join(record[Constants.BOW_FIELD]))
Code Example #32
File: carskit_caller.py Project: melqkiades/yelp
def analyze_results():
    json_file = Constants.generate_file_name(
        'carskit_results', 'json', OUTPUT_FOLDER, None, None, False)
    records = ETLUtils.load_json_file(json_file)

    data_frame = pandas.DataFrame(records)
    print(sorted(list(data_frame.columns.values)))
    cols = [
        'ck_rec10', 'ck_pre10', 'ck_algorithm', 'carskit_nominal_format',
        'topic_model_num_topics', 'topic_model_normalize']
    data_frame = data_frame[cols]
    data_frame = data_frame.sort_values(['ck_rec10'])
    print(data_frame)

    data_frame.to_csv('/Users/fpena/tmp/' + Constants.ITEM_TYPE + '_carskit.csv')
Code Example #33
File: main.py Project: swarnamd/yelp
def main2():

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)[:10]
    # for record in records:
    #     print(record)

    cols = [
        Constants.USER_ID_FIELD,
        Constants.ITEM_ID_FIELD,
        Constants.RATING_FIELD
    ]
    data_frame = pandas.DataFrame(records, columns=cols)
    # print(data_frame)
    # data_frame['a'] = data_frame[Constants.USER_ID_FIELD].astype('category')
    # data_frame['b'] = data_frame[Constants.ITEM_ID_FIELD].astype('category')
    data_frame[Constants.USER_ID_FIELD] = data_frame[Constants.USER_ID_FIELD].astype('category')
    data_frame[Constants.ITEM_ID_FIELD] = data_frame[Constants.ITEM_ID_FIELD].astype('category')
    # category_columns = data_frame.select_dtypes(['category']).columns
    # print(category_columns)
    # data_frame[category_columns] = \
    #     data_frame[category_columns].apply(lambda x: x.cat.codes)
    # print(data_frame)
    # print(data_frame['b'].cat.categories[0])
    print(data_frame[Constants.USER_ID_FIELD].cat.codes)
    print(data_frame[Constants.ITEM_ID_FIELD].cat.codes)

    plays = coo_matrix((data_frame[Constants.RATING_FIELD].astype(float),
                        (data_frame[Constants.USER_ID_FIELD].cat.codes,
                         data_frame[Constants.ITEM_ID_FIELD].cat.codes)))

    print(plays)
    # from sklearn.decomposition import NMF
    model = NMF(n_components=2, init='random', random_state=0)
    W = model.fit_transform(plays)
    H = model.components_
    nR = numpy.dot(W, H)
    # print(nR)
    # print(nR.shape)

    print('NMF-based CF MSE: ' + str(
        mean_squared_error(nR, plays.toarray())))

    # get SVD components from train matrix. Choose k.
    u, s, vt = svds(plays, k=5)
    s_diag_matrix = numpy.diag(s)
    X_pred = numpy.dot(numpy.dot(u, s_diag_matrix), vt)
    # print(X_pred)
    print('SVD-based CF MSE: ' + str(mean_squared_error(X_pred, plays.toarray())))
Code Example #34
File: business_etl.py Project: neostoic/yelp-1
    def create_category_sets(file_path):
        """
        Creates a numpy array in which each element is the set of categories
        of a business in the Yelp Phoenix Business data set

        :rtype : numpy array
        :param file_path: the path for the file that contains the businesses
            data
        :return: a numpy array of sets with the categories that each
            business has, for example [{'Restaurant', 'Mexican', 'Bar'},
            {'Bar', 'Disco'}]
        """
        records = ETLUtils.load_json_file(file_path)
        sets = numpy.array([set(record['categories']) for record in records])

        return sets
Code Example #35
File: carskit_caller.py Project: swarnamd/yelp
def analyze_results():
    json_file = Constants.generate_file_name('carskit_results', 'json',
                                             OUTPUT_FOLDER, None, None, False)
    records = ETLUtils.load_json_file(json_file)

    data_frame = pandas.DataFrame(records)
    print(sorted(list(data_frame.columns.values)))
    cols = [
        'ck_rec10', 'ck_pre10', 'ck_algorithm', 'carskit_nominal_format',
        'topic_model_num_topics', 'topic_model_normalize'
    ]
    data_frame = data_frame[cols]
    data_frame = data_frame.sort_values(['ck_rec10'])
    print(data_frame)

    data_frame.to_csv('/Users/fpena/tmp/' + Constants.ITEM_TYPE +
                      '_carskit.csv')
Code Example #36
def load_data(file_name):
    records = ETLUtils.load_json_file(file_name)
    data_frame = pandas.DataFrame.from_records(records)

    column = 'review_type'
    # column = 'specific'

    # print(data_frame.describe())
    # print(data_frame.head())
    # data_frame = data_frame['specific']
    # print(data_frame.groupby(column)[column].count())
    # reviews = list(data_frame['text'])
    values = list(data_frame[column])
    values = [value.encode('ascii', 'ignore') for value in values]
    # print(reviews)
    print(values)

    return records
Code Example #37
def see_topic_analysis_results():
    topic_analysis_file = Constants.DATASET_FOLDER + 'topic_model_analysis_' + \
                          Constants.ITEM_TYPE + '.json'

    results = ETLUtils.load_json_file(topic_analysis_file)

    index = 0
    for result in results:
        score_ratio = result['high_ratio_mean_score'] / result[
            'low_ratio_mean_score']
        count_ratio = result['weighted_high_ratio_count'] / result[
            'weighted_low_ratio_count']
        print(index, score_ratio, count_ratio,
              result['high_ratio_mean_score'],
              result['low_ratio_mean_score'],
              result['lda_epsilon'], result['topic_weighting_method'],
              result['num_context_topics'], result['lda_num_topics'])
        index += 1
Code Example #38
def plot_overall_rating():

    # reviews = extractor.pre_process_reviews()
    reviews = ETLUtils.load_json_file('/Users/fpena/tmp/filtered_reviews.json')
    data_frame = DataFrame(reviews)

    print(data_frame)

    DataPlotter.plot_data(data_frame,
                          'overall_rating',
                          plot_type='bar',
                          title='Overall Rating')
    # DataPlotter.plot_data(data_frame, 'cleanliness_rating', plot_type='bar', title='Cleanliness Rating')
    # DataPlotter.plot_data(data_frame, 'location_rating', plot_type='bar', title='Location Rating')
    # DataPlotter.plot_data(data_frame, 'rooms_rating', plot_type='bar', title='Rooms Rating')
    # DataPlotter.plot_data(data_frame, 'service_rating', plot_type='bar', title='Service Rating')
    # DataPlotter.plot_data(data_frame, 'value_rating', plot_type='bar', title='Value Rating')
    plt.show()
Code Example #39
    def load_cache_context_topics(self, cycle_index, fold_index):

        print('load cache context topics: %s' %
              time.strftime("%Y/%m/%d-%H:%M:%S"))

        topics_file_path = Constants.generate_file_name(
            'context_topics', 'json', Constants.CACHE_FOLDER, cycle_index,
            fold_index, True)

        self.context_rich_topics = sorted(
            ETLUtils.load_json_file(topics_file_path)[0].items(),
            key=operator.itemgetter(1),
            reverse=True)

        self.context_topics_map = {}
        for record in self.important_records:
            self.context_topics_map[record[Constants.REVIEW_ID_FIELD]] = \
                record[Constants.CONTEXT_TOPICS_FIELD]
Code Example #40
File: main.py Project: swarnamd/yelp
def topic_stability_main():

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

    # num_topic_list = range(2, 101)
    num_topic_list = [2, 5]
    results = {}
    for num_topics in num_topic_list:
        new_properties = {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics}
        Constants.update_properties(new_properties)
        results[num_topics] = calculate_topic_stability(records)

    print('Results:')
    for num_topics in num_topic_list:
        scores = results[num_topics]
        print('%d: %.4f [%.4f,%.4f]' %
              (num_topics, numpy.nanmean(scores), numpy.nanmin(scores),
               numpy.nanmax(scores)))
Code Example #41
def pre_process_reviews():
    """
    Returns a list of preprocessed reviews: the reviews have been filtered to
    keep only relevant data, fields that are not useful have been dropped, and
    additional fields that are handy for calculations have been added

    :return: a list of preprocessed reviews
    """
    reviews_file = '/Users/fpena/UCC/Thesis/datasets/yelp_phoenix_academic_dataset/yelp_academic_dataset_review.json'
    reviews = ETLUtils.load_json_file(reviews_file)

    select_fields = ['user_id', 'business_id', 'stars']
    reviews = ETLUtils.select_fields(select_fields, reviews)
    extract_fields(reviews)
    ETLUtils.drop_fields(['business_id', 'stars'], reviews)
    # reviews = load_json_file('/Users/fpena/tmp/filtered_reviews.json')
    reviews = clean_reviews(reviews)

    return reviews
Code Example #42
def main():
    # reviews = pre_process_reviews()
    # save_dictionary_list_to_file(reviews, '/Users/fpena/tmp/filtered_reviews.json')
    reviews = ETLUtils.load_json_file('/Users/fpena/tmp/filtered_reviews.json')
    data_frame = DataFrame(reviews)
    column = 'offering_id'
    groupby = data_frame.groupby(column)
    counts = groupby.mean()
    # print(counts)

    items = counts.index.get_level_values(0).tolist()

    for item, mean in zip(items, counts['overall_rating']):
        print(item, mean)

    # print(get_item_list(reviews, 2))
    # print(len(reviews))
    # initialize_users(reviews, 10)
    pass
Code Example #43
    def tag_reviews_language(self):

        print('%s: tag reviews language' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        if os.path.exists(Constants.LANGUAGE_RECORDS_FILE):
            print('Records have already been tagged with language field')
            self.records = \
                ETLUtils.load_json_file(Constants.LANGUAGE_RECORDS_FILE)
            return

        DetectorFactory.seed = 0

        for record in self.records:
            try:
                language = langdetect.detect(record[Constants.TEXT_FIELD])
            except LangDetectException:
                language = 'unknown'
            record[Constants.LANGUAGE_FIELD] = language

        ETLUtils.save_json_file(Constants.LANGUAGE_RECORDS_FILE, self.records)
Code Example #44
def create_single_topic_model(cycle_index, fold_index):

    print(Constants._properties)
    print('%s: Start' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

    plant_seeds()
    num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
    split = 1 - (1 / float(num_folds))

    for i in range(cycle_index + 1):

        if Constants.SHUFFLE_DATA:
            random.shuffle(records)

    cv_start = float(fold_index) / num_folds
    train_records, test_records = \
        ETLUtils.split_train_test(records, split=split, start=cv_start)
    create_topic_model(train_records, cycle_index, fold_index)
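The arithmetic above (split = 1 - 1/num_folds, cv_start = fold_index/num_folds) implies that ETLUtils.split_train_test carves the test fold out of the records starting at a fractional offset. A sketch under that assumption:

def split_train_test(records, split, start=0.0):
    # Assumed behavior: the test fold is the (1 - split) fraction of the
    # records beginning at fractional offset `start`; the rest is train.
    num_records = len(records)
    test_start = int(start * num_records)
    test_size = int(round((1 - split) * num_records))
    test_records = records[test_start:test_start + test_size]
    train_records = records[:test_start] + records[test_start + test_size:]
    return train_records, test_records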
Code Example #45
File: main.py Project: swarnamd/yelp
def analyze_fourcity():
    records = ETLUtils.load_json_file(Constants.FULL_PROCESSED_RECORDS_FILE)
    # for record in records:
    #     print(record)

    cols = [
        Constants.USER_ID_FIELD,
        Constants.ITEM_ID_FIELD,
        Constants.RATING_FIELD
    ]
    data_frame = pandas.DataFrame(records, columns=cols)
    print(data_frame.describe())

    zero_records = 0
    for record in records:
        if record[Constants.RATING_FIELD] < 1.0:
            print(record)
            zero_records += 1

    for record in records:
        if record[Constants.RATING_FIELD] > 5.0:
            print(record)
            zero_records += 1
    print('zero records: %d' % zero_records)

    # Look for duplicates
    keys_set = set()
    num_duplicates = 0
    print('Looking for duplicates')
    records_map = {}
    for record in records:
        # if record[Constants.USER_ITEM_KEY_FIELD] in keys_set:
        record_key = record[Constants.USER_ITEM_KEY_FIELD]
        if record_key in records_map:
            print('old record', records_map[record_key][Constants.TEXT_FIELD])
            print('new record', record[Constants.TEXT_FIELD])
            num_duplicates += 1
        keys_set.add(record_key)
        records_map[record_key] = record

    print('duplicate records: %d' % num_duplicates)
Code Example #46
    def create_reviews(self):
        with self.doc.create(Section('Reviews')):
            with self.doc.create(Subsection('A subsection')):

                sg_map = {'yes': 'specific', 'no': 'generic'}

                review_index = 0
                # full_records = ETLUtils.load_json_file(
                #     Constants.FULL_PROCESSED_RECORDS_FILE)
                records_file = Constants.DATASET_FOLDER +\
                    'classified_' + Constants.ITEM_TYPE + '_reviews.json'
                full_records = ETLUtils.load_json_file(records_file)

                for record in full_records:
                    with self.doc.create(Subsection('Review %d (%s)' % (
                            review_index + 1, sg_map[record['specific']]))):
                        # for doc_part in self.build_text(
                        #         record[Constants.TEXT_FIELD]):
                        for doc_part in self.build_text_manual(record):
                            self.doc.append(doc_part)
                    review_index += 1
Code Example #47
def pre_process_reviews():
    """
    Returns a list of preprocessed reviews: the reviews have been filtered to
    keep only relevant data, fields that are not useful have been dropped, and
    additional fields that are handy for calculations have been added

    :return: a list of preprocessed reviews
    """
    data_folder = '/Users/fpena/UCC/Thesis/datasets/TripAdvisor/Four-City/'
    review_file_path = data_folder + 'review.txt'
    # review_file_path = data_folder + 'review-short.json'
    reviews = ETLUtils.load_json_file(review_file_path)

    select_fields = ['ratings', 'author', 'offering_id']
    reviews = ETLUtils.select_fields(select_fields, reviews)
    extract_fields(reviews)
    ETLUtils.drop_fields(['author', 'ratings'], reviews)
    # reviews = load_json_file('/Users/fpena/tmp/filtered_reviews.json')
    # reviews = preflib_extractor.load_csv_file('/Users/fpena/UCC/Thesis/datasets/TripAdvisor/PrefLib/trip/CD-00001-00000001-copy.dat')
    reviews = clean_reviews(reviews)

    return reviews
Code Example #48
File: main.py Project: swarnamd/yelp
def build_manual_topic_model():

    new_classified_records_file = Constants.DATASET_FOLDER + 'classified_' + \
          Constants.ITEM_TYPE + '_reviews_first_sentences.json'
    records = ETLUtils.load_json_file(new_classified_records_file)
    # records = ETLUtils.filter_records(records, 'context_type', ['context'])
    # records = ETLUtils.filter_records(records, 'sentence_type', ['specific'])
    # records = ETLUtils.filter_records(records, 'sentence_index', [0])
    print('total records: %d' % len(records))

    # print(records[0])
    count = 0

    for i in range(len(records)):
        record = records[i]
        if record['sentence_index'] == 0.0:
            # if record['context_type'] == 'context' and record['context_summary'] != 'all_context':
            if record['sentence_type'] == 'specific':
                print('%d:\t%s' % (i+1, records[i]['text']))
                count += 1

    print('count: %d' % count)
Code Example #49
File: business_etl.py Project: neostoic/yelp-1
    def create_category_matrix(file_path):
        """
        Creates a matrix with all the categories for businesses that are
        contained in the Yelp Phoenix Business data set. Each column of the
        matrix represents a category, and each row a business. This is a binary
        matrix that contains a 1 at the position i,j if the business i contains
        the category j, and a 0 otherwise.

        :rtype : numpy array matrix
        :param file_path: the path for the file that contains the businesses
        data
        :return: a numpy array binary matrix
        """
        records = ETLUtils.load_json_file(file_path)

        # Now we obtain the categories for all the businesses
        records = ETLUtils.add_transpose_list_column('categories', records)
        BusinessETL.drop_unwanted_fields(records)
        matrix = numpy.array(
            [numpy.array(record.values()) for record in records])

        return matrix
Code Example #50
def load_records():
    """
    Loads the reviews that have been manually tagged at a sentence level;
    these are the reviews that we will use to train our classifier. Only the
    first sentence of each review will be used
    """

    print('%s: load records' % time.strftime("%Y/%m/%d-%H:%M:%S"))
    # file_name = '/Users/fpena/UCC/Thesis/datasets/context/oldClassifiedFiles/classified_yelp_hotel_reviews.json'
    # file_name = '/Users/fpena/UCC/Thesis/datasets/context/oldClassifiedFiles/classified_yelp_restaurant_reviews.json'
    # records = ETLUtils.load_json_file(file_name)
    records = ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)

    # Take only the first sentence
    # document_level = 1
    if isinstance(Constants.DOCUMENT_LEVEL, (int, long)):
        records = [
            record for record in records
            if record['sentence_index'] < Constants.DOCUMENT_LEVEL
        ]

    return records
Code Example #51
File: main.py Project: swarnamd/yelp
def create_topic_model_with_context_records():

    processed_records_file = Constants.generate_file_name(
        'classified_processed_reviews', 'json', Constants.CACHE_FOLDER, None,
        None, False, True)
    records = ETLUtils.load_json_file(processed_records_file)
    print('records length: %d' % len(records))

    context_records = ETLUtils.filter_records(records, 'context_type', ['context'])
    print('context records length: %d' % len(context_records))
    context_specific_records = ETLUtils.filter_records(context_records, 'predicted_class', ['specific'])
    print('context specific records length: %d' % len(context_specific_records))

    for i in range(len(context_specific_records)):
        # print('%d:\t%s' % (i, context_records[i]['text']))
        print('%d:\t%s' % (i, context_specific_records[i]['bow']))

    for i in range(1, len(context_records)+1):

        Constants.update_properties({Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: i})
        context_extractor = \
            topic_model_creator.create_topic_model(records, None, None)

        topic_data = []

        for topic in range(Constants.TOPIC_MODEL_NUM_TOPICS):
            result = {}
            result['topic_id'] = topic
            result.update(split_topic(context_extractor.print_topic_model(
                num_terms=Constants.TOPIC_MODEL_STABILITY_NUM_TERMS)[topic]))
            result['ratio'] = context_extractor.topic_ratio_map[topic]
            result['weighted_frequency'] = \
                context_extractor.topic_weighted_frequency_map[topic]
            topic_data.append(result)

        file_name = Constants.generate_file_name(
            'manual_topic_model', 'xlsx', Constants.DATASET_FOLDER, None, None, True)
        generate_excel_file(topic_data, file_name)
Code Example #52
def create_single_topic_model(cycle_index, fold_index, check_exists=True):

    Constants.print_properties()
    print('%s: Start' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        msg = 'This function shouldn\'t be used when the ' \
              'separate_topic_model_recsys_reviews property is set to True'
        raise ValueError(msg)

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

    if Constants.CROSS_VALIDATION_STRATEGY == 'nested_test':
        pass
    elif Constants.CROSS_VALIDATION_STRATEGY == 'nested_validate':
        num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
        cycle = Constants.NESTED_CROSS_VALIDATION_CYCLE
        split = 1 - (1 / float(num_folds))
        cv_start = float(cycle) / num_folds
        print('cv_start', cv_start)
        records, _ = ETLUtils.split_train_test(records, split, cv_start)
    else:
        raise ValueError('Unknown cross-validation strategy')

    utilities.plant_seeds()
    num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
    split = 1 - (1/float(num_folds))

    for i in range(cycle_index+1):

        if Constants.SHUFFLE_DATA:
            random.shuffle(records)

    cv_start = float(fold_index) / num_folds
    train_records, test_records = \
        ETLUtils.split_train_test(records, split=split, start=cv_start)
    return create_topic_model(
        train_records, cycle_index, fold_index, check_exists)
Code Example #53
def load_records():
    """
    Loads the reviews that have been manually tagged at a sentence level;
    these are the reviews that we will use to train our classifier. Only the
    first sentence of each review will be used
    """

    print('%s: load records' % time.strftime("%Y/%m/%d-%H:%M:%S"))
    dataset = Constants.ITEM_TYPE
    folder = Constants.DATASET_FOLDER
    records_file = folder + \
                   'classified_' + dataset + '_reviews_sentences.json'
    records = ETLUtils.load_json_file(records_file)

    # Take only the first sentence
    # max_sentences = 1
    if Constants.MAX_SENTENCES is not None:
        records = [
            record for record in records
            if record['sentence_index'] < Constants.MAX_SENTENCES
        ]

    return records
Code Example #54
def evaluate_topic_model(metric):
    print('%s: evaluating topic model' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    Constants.update_properties({
        Constants.NUMPY_RANDOM_SEED_FIELD:
        Constants.NUMPY_RANDOM_SEED + 10,
        Constants.RANDOM_SEED_FIELD:
        Constants.RANDOM_SEED + 10
    })
    utilities.plant_seeds()
    Constants.print_properties()

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        num_records = len(records)
        records = records[:num_records / 2]
    print('num_reviews', len(records))

    all_term_rankings = None
    topic_model_type = Constants.TOPIC_MODEL_TYPE
    if topic_model_type in ['lda', 'nmf']:
        all_term_rankings = create_all_term_rankings(records, metric)
    elif topic_model_type == 'ensemble':
        all_term_rankings = create_all_term_rankings_from_ensemble()
    else:
        raise ValueError('Unrecognized topic modeling algorithm: \'%s\'' %
                         topic_model_type)
    print('Total iterations: %d' % len(all_term_rankings))

    if metric == TERM_STABILITY_REFERENCE:
        return eval_term_stability_reference(all_term_rankings)
    if metric == TERM_STABILITY_PAIRWISE:
        return eval_term_stability_pairwise(all_term_rankings)
    elif metric == TERM_DIFFERENCE:
        return eval_term_difference(all_term_rankings)
    else:
        raise ValueError('Unknown evaluation metric: \'%s\'' % metric)
Code Example #55
    def find_reviews_topics(self, context_extractor, cycle_index, fold_index):
        print('find topics: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        train_records_file_path = Constants.generate_file_name(
            'context_train_records', 'json', Constants.CACHE_FOLDER,
            cycle_index, fold_index, Constants.USE_CONTEXT)

        if os.path.exists(train_records_file_path):
            self.train_records = \
                ETLUtils.load_json_file(train_records_file_path)
        else:
            context_extractor.find_contextual_topics(self.train_records)
            ETLUtils.save_json_file(train_records_file_path,
                                    self.train_records)
        context_extractor.find_contextual_topics(
            self.important_records, Constants.TEXT_SAMPLING_PROPORTION)

        self.context_topics_map = {}
        for record in self.important_records:
            self.context_topics_map[record[Constants.REVIEW_ID_FIELD]] = \
                record[Constants.CONTEXT_TOPICS_FIELD]

        self.important_records = None
        gc.collect()
Code Example #56
    def load_records_to_predict(self, records_file):
        self.records_to_predict = ETLUtils.load_json_file(records_file)
        with open(records_file + '.pkl', 'rb') as read_file:
            self.items_to_predict = pickle.load(read_file)
Code Example #57
File: main.py Project: swarnamd/yelp
def transform_manually_labeled_reviews():

    full_records = ETLUtils.load_json_file(Constants.DATASET_FOLDER + 'yelp_training_set_review_restaurants_shuffled_tagged.json')

    records = ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)
    print('total records: %d' % len(records))

    new_records = []

    for record in records:

        sentence_index = record['sentence_index']

        if sentence_index > 0:
            continue
        record['predicted_class'] = record['sentence_type']
        new_records.append(record)

    # count = 0
    # for new_record in new_records:
    #     internal_count = 0
    #     for full_record in full_records:
    #         if full_record['text'].startswith(new_record['text']):
    #             # print(full_record['text'])
    #             internal_count += 1
    #             count += 1
    #             print('internal count: %d\treview_id: %s' % (internal_count, full_record['review_id']))
    #
    #             if internal_count > 1:
    #                 print('internal count: %d\treview_id: %s' % (internal_count, new_record['text']))

    # print('count: %d' % count)

    index = 0

    for new_record in new_records:

        while True:

            full_record = full_records[index]

            if full_record['text'].startswith(new_record['text']):
                new_record[Constants.USER_ID_FIELD] = full_record['user_id']
                new_record[Constants.ITEM_ID_FIELD] = full_record['business_id']
                new_record[Constants.REVIEW_ID_FIELD] = full_record['review_id']
                new_record[Constants.RATING_FIELD] = full_record['stars']
                break
            index += 1
        index += 1

    print('index: %d' % index)

    for new_record in new_records:

        for full_record in full_records:
            if new_record['review_id'] == full_record['review_id']:
                print('%s ====' % new_record['text'])
                print(full_record['text'])
                print('******************\n******************\n******************\n******************')
                break

    # reviews_preprocessor = ReviewsPreprocessor()
    # new_records = reviews_preprocessor.lemmatize_sentences(new_records)
    # reviews_preprocessor.records = new_records
    # reviews_preprocessor.build_bag_of_words()
    # reviews_preprocessor.drop_unnecessary_fields()

    new_classified_records_file = Constants.DATASET_FOLDER + 'classified_' + \
        Constants.ITEM_TYPE + '_reviews_first_sentences.json'

    print(new_records[0])

    ETLUtils.save_json_file(new_classified_records_file, new_records)
Code Example #58
    @staticmethod
    def filter_reviews_by_business_slow(reviews, business_ids):
        # (signature assumed, inferred from the module-level call below)
        filtered_reviews = []
        for review in reviews:
            if review['business_id'] in business_ids:
                filtered_reviews.append(review)

        return filtered_reviews

    @staticmethod
    def sort_records(records, field, reverse=False):
        return sorted(records, key=itemgetter(field), reverse=reverse)



start = time.time()

review_etl = ReviewETL()
my_business_file = "/Users/fpena/tmp/yelp_training_set/yelp_training_set_business.json"
my_reviews_file = "/Users/fpena/tmp/yelp_training_set/yelp_training_set_review.json"
my_business_ids = BusinessETL.get_business_ids(my_business_file, 'Hotels')
my_reviews = ETLUtils.load_json_file(my_reviews_file)
# print(len(ReviewETL.filter_reviews_by_business(my_reviews, my_business_ids, 'text')))
my_restaurant_reviews = ReviewETL.filter_reviews_by_business_slow(my_reviews, my_business_ids)
my_restaurants_file = "/Users/fpena/tmp/yelp_training_set/yelp_training_set_review_hotels.json"
ETLUtils.save_json_file(my_restaurants_file, my_restaurant_reviews)
# my_sorted_reviews = ReviewETL.sort_records(my_reviews, 'business_id')
# print(len(my_sorted_reviews))


# main()
end = time.time()
total_time = end - start
print("Total time = %f seconds" % total_time)
Code Example #59
File: main.py Project: neostoic/yelp-1
def main():
    item_type = 'hotel'
    # item_type = 'restaurant'
    my_folder = '/Users/fpena/UCC/Thesis/datasets/context/'
    my_file = my_folder + 'classified_' + item_type + '_reviews.json'
    binary_reviews_file = my_folder + 'classified_' + item_type + '_reviews.pkl'
    my_records = ETLUtils.load_json_file(my_file)

    with open(binary_reviews_file, 'rb') as read_file:
        my_reviews = pickle.load(read_file)

    num_features = 2

    my_metrics = numpy.zeros((len(my_reviews), num_features))
    for index in range(len(my_reviews)):
        my_metrics[index] =\
            review_metrics_extractor.get_review_metrics(my_reviews[index])

    review_metrics_extractor.normalize_matrix_by_columns(my_metrics)

    count_specific = 0
    count_generic = 0
    for record in my_records:

        if record['specific'] == 'yes':
            count_specific += 1

        if record['specific'] == 'no':
            count_generic += 1

    print('count_specific: %d' % count_specific)
    print('count_generic: %d' % count_generic)
    print('specific percentage: %f%%' % (float(count_specific) / len(my_records) * 100))
    print('generic percentage: %f%%' % (float(count_generic) / len(my_records) * 100))

    my_labels = numpy.array([record['specific'] == 'yes' for record in my_records])

    classifiers = [
        DummyClassifier(strategy='most_frequent', random_state=0),
        DummyClassifier(strategy='stratified', random_state=0),
        DummyClassifier(strategy='uniform', random_state=0),
        # DummyClassifier(strategy='constant', random_state=0, constant=True),
        LogisticRegression(C=100),
        SVC(C=1.0, kernel='rbf'),
        SVC(C=1.0, kernel='linear'),
        KNeighborsClassifier(n_neighbors=10),
        tree.DecisionTreeClassifier(),
        NuSVC(),
        LinearSVC()
    ]
    scores = [[] for _ in range(len(classifiers))]

    Xtrans = my_metrics

    cv = KFold(n=len(my_metrics), n_folds=5)

    for i in range(len(classifiers)):
        for train, test in cv:
            x_train, y_train = Xtrans[train], my_labels[train]
            x_test, y_test = Xtrans[test], my_labels[test]

            clf = classifiers[i]
            clf.fit(x_train, y_train)
            scores[i].append(clf.score(x_test, y_test))

    for classifier, score in zip(classifiers, scores):
        print("%s: Mean(scores)=%.5f\tStddev(scores)=%.5f" %
              (type(classifier).__name__, numpy.mean(score), numpy.std(score)))

    plot(my_metrics, my_labels)
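Note that KFold(n=..., n_folds=...) is the scikit-learn API from before version 0.18; on current scikit-learn the same cross-validation loop would be written with the model_selection module:

import numpy
from sklearn.model_selection import KFold

my_metrics = numpy.random.rand(20, 2)     # placeholder data of the same shape
my_labels = numpy.random.rand(20) > 0.5   # placeholder boolean labels

cv = KFold(n_splits=5)
for train, test in cv.split(my_metrics):
    x_train, y_train = my_metrics[train], my_labels[train]
    x_test, y_test = my_metrics[test], my_labels[test]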