Example #1
    def load(self, records):
        self.records = records
        self.ratings_matrix = basic_knn.create_ratings_matrix(records)
        self.reviews_matrix = create_reviews_matrix(records)
        self.user_dictionary = extractor.initialize_users(self.records, False)
        self.user_ids = extractor.get_groupby_list(self.records, 'user_id')

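        # Build the LDA-based context model from the records and their reviews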
        lda_based_context = LdaBasedContext(self.records, self.reviews)

        # self.lda_model =\
        #     lda_context_utils.discover_topics(text_reviews, self.num_topics)
        # if self.reviews:
        #     lda_based_context = LdaBasedContext()
        #     lda_based_context.reviews = self.reviews
        #     lda_based_context.init_reviews()
        # else:
        #     text_reviews = []
        #     for record in self.records:
        #         text_reviews.append(record['text'])
        #     lda_based_context = LdaBasedContext(text_reviews)
        #     lda_based_context.init_reviews()
        self.context_rich_topics = lda_based_context.get_context_rich_topics()

        self.lda_model = lda_based_context.topic_model
        print('building similarity matrix', time.strftime("%H:%M:%S"))
        self.context_matrix = self.create_context_matrix(records)
        self.similarity_matrix = self.create_similarity_matrix()
        print('finished building similarity matrix', time.strftime("%H:%M:%S"))
Example #3
    def train_topic_model(self):
        print('train topic model: %s' % time.strftime("%Y/%d/%m-%H:%M:%S"))
        # self.train_records = ETLUtils.load_json_file(TRAIN_RECORDS_FILE)
        lda_based_context = LdaBasedContext(self.train_records)
        lda_based_context.get_context_rich_topics()
        self.context_rich_topics = lda_based_context.context_rich_topics

        print('Trained LDA Model: %s' % time.strftime("%Y/%d/%m-%H:%M:%S"))

        # with open(TOPIC_MODEL_FILE, 'wb') as write_file:
        #     pickle.dump(lda_based_context, write_file, pickle.HIGHEST_PROTOCOL)

        # with open(TOPIC_MODEL_FILE, 'rb') as read_file:
        #     lda_based_context = pickle.load(read_file)

        self.context_rich_topics = lda_based_context.context_rich_topics

        return lda_based_context
Example #4
def train_context_topics_model(records):
    print('%s: train context topics model' %
          time.strftime("%Y/%m/%d-%H:%M:%S"))
    lda_based_context = LdaBasedContext(records)
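    # Build the review corpus, train the LDA topic model, then tag each review with its topics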
    lda_based_context.generate_review_corpus()
    lda_based_context.build_topic_model()
    lda_based_context.update_reviews_with_topics()

    print('%s: Trained LDA Model' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    return lda_based_context
Example #5
    def train_topic_model(self, cycle_index, fold_index):

        if Constants.CACHE_TOPIC_MODEL:
            print('loading topic model')
            lda_based_context = topic_model_creator.load_topic_model(
                cycle_index, fold_index)
        else:
            print('train topic model: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

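            # No cached model available: train a fresh LDA-based context model on the training records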
            lda_based_context = LdaBasedContext(self.train_records)
            lda_based_context.generate_review_corpus()
            lda_based_context.build_topic_model()
            lda_based_context.update_reviews_with_topics()

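        # Whether loaded from cache or freshly trained, extract the context-rich topics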
        lda_based_context.get_context_rich_topics()
        self.context_rich_topics = lda_based_context.context_rich_topics
        print('Trained LDA Model: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        return lda_based_context
Example #6
    def load_context(self, records):
        if self.reviews:
            lda_based_context = LdaBasedContext()
            lda_based_context.reviews = self.reviews
            lda_based_context.init_reviews()
        else:
            text_reviews = []
            for record in records:
                text_reviews.append(record['text'])
            lda_based_context = LdaBasedContext(text_reviews)
            lda_based_context.init_reviews()
        self.context_rich_topics = lda_based_context.get_context_rich_topics()

        self.lda_model = lda_based_context.topic_model

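        # Attach each user's per-item contexts derived from the trained LDA model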
        for user in self.user_dictionary.values():
            user.item_contexts = lda_context_utils.get_user_item_contexts(
                records, self.lda_model, user.user_id, True
            )
Example #10
def train_context_extractor(records, stable=True):
    print('%s: train context topics model' % time.strftime("%Y/%m/%d-%H:%M:%S"))
    if Constants.TOPIC_MODEL_TYPE == 'lda':
        context_extractor = LdaBasedContext(records)
        context_extractor.generate_review_corpus()
        context_extractor.build_topic_model()
        context_extractor.update_reviews_with_topics()
        context_extractor.get_context_rich_topics()
        context_extractor.clear_reviews()
    elif Constants.TOPIC_MODEL_TYPE == 'nmf':
        context_extractor = NmfContextExtractor(records)
        context_extractor.generate_review_bows()
        context_extractor.build_document_term_matrix()
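        # The 'stable' flag selects build_stable_topic_model over the plain build_topic_model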
        if stable:
            context_extractor.build_stable_topic_model()
        else:
            context_extractor.build_topic_model()
        context_extractor.update_reviews_with_topics()
        context_extractor.get_context_rich_topics()
        context_extractor.clear_reviews()
    else:
        raise ValueError('Unrecognized topic model type: \'%s\'' %
                         Constants.TOPIC_MODEL_TYPE)

    print('%s: Trained Topic Model' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    return context_extractor
class ContextDataConverter:
    def __init__(self, reviews_classifier):
        self.reviews_classifier = reviews_classifier
        self.shuffle_seed = 0
        self.headers = None
        self.lda_based_context = None

    def full_cycle(self, train_records, test_records, train_reviews, test_reviews):

        self.lda_based_context = LdaBasedContext(train_records, train_reviews)
        self.lda_based_context.get_context_rich_topics()

        print("Trained LDA Model: %s" % time.strftime("%Y/%d/%m-%H:%M:%S"))

        contextual_train_set = self.lda_based_context.find_contextual_topics(train_records)
        contextual_test_set = self.lda_based_context.find_contextual_topics(test_records)

        print("contextual test set size: %d" % len(contextual_test_set))

        self.build_headers()
        contextual_train_set = ETLUtils.select_fields(self.headers, contextual_train_set)
        contextual_test_set = ETLUtils.select_fields(self.headers, contextual_test_set)

        print("Exported contextual topics: %s" % time.strftime("%Y/%d/%m-%H:%M:%S"))

        return contextual_train_set, contextual_test_set

    def build_headers(self):
        self.headers = ["stars", "user_id", "business_id"]
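        # Add one column per context-rich topic, named after the topic id (first element of each entry)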
        for i in self.lda_based_context.context_rich_topics:
            topic_id = "topic" + str(i[0])
            self.headers.append(topic_id)

    def run(self, dataset, output_folder, train_records, test_records, train_reviews=None, test_reviews=None):

        contextual_train_set, contextual_test_set = self.full_cycle(
            train_records, test_records, train_reviews, test_reviews
        )

        print("Prepared data: %s" % time.strftime("%Y/%d/%m-%H:%M:%S"))

        # json_train_file = output_folder + 'yelp_' + dataset + '_context_shuffled_train5.json'
        csv_train_file = output_folder + "yelp_" + dataset + "_context_shuffled_train5.csv"
        # json_test_file = output_folder + 'yelp_' + dataset + '_context_shuffled_test5.json'
        csv_test_file = output_folder + "yelp_" + dataset + "_context_shuffled_test5.csv"

        # ETLUtils.save_json_file(json_train_file, contextual_train_set)
        ETLUtils.save_csv_file(csv_train_file, contextual_train_set, self.headers)

        # ETLUtils.save_json_file(json_test_file, contextual_test_set)
        ETLUtils.save_csv_file(csv_test_file, contextual_test_set, self.headers)

        print("Exported CSV and JSON files: %s" % time.strftime("%Y/%d/%m-%H:%M:%S"))

        csv_files = [csv_train_file, csv_test_file]

        num_cols = len(self.headers)
        context_cols = num_cols
        print("num_cols", num_cols)
        # print('context_cols', context_cols)

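        # Column 0 holds the rating, columns 1-2 the user/business ids, and columns 3+ the topic values;
        # the topic columns appear to be dropped for the '.no_context' output and kept for '.context'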
        libfm_converter.csv_to_libfm(
            csv_files, 0, [1, 2], range(3, context_cols), ",", has_header=True, suffix=".no_context.libfm"
        )
        libfm_converter.csv_to_libfm(csv_files, 0, [1, 2], [], ",", has_header=True, suffix=".context.libfm")

        print("Exported LibFM files: %s" % time.strftime("%Y/%d/%m-%H:%M:%S"))