def load(self, records):
    """Load training records and build every derived structure.

    Builds the ratings and reviews matrices, the per-user dictionary,
    trains the LDA-based context model, and finally assembles the
    context and similarity matrices used by the recommender.

    :param records: list of review/rating record dicts
    """
    self.records = records
    self.ratings_matrix = basic_knn.create_ratings_matrix(records)
    self.reviews_matrix = create_reviews_matrix(records)
    self.user_dictionary = extractor.initialize_users(self.records, False)
    self.user_ids = extractor.get_groupby_list(self.records, 'user_id')

    # NOTE(review): self.reviews is read here but never assigned in this
    # method — presumably populated by the caller before load(); confirm.
    lda_based_context = LdaBasedContext(self.records, self.reviews)
    self.context_rich_topics = lda_based_context.get_context_rich_topics()
    self.lda_model = lda_based_context.topic_model

    print('building similarity matrix', time.strftime("%H:%M:%S"))
    self.context_matrix = self.create_context_matrix(records)
    self.similarity_matrix = self.create_similarity_matrix()
    print('finished building similarity matrix', time.strftime("%H:%M:%S"))
def train_topic_model(self):
    """Train an LDA-based context model on ``self.train_records``.

    Stores the context-rich topics on the instance and returns the
    trained model.

    :return: the trained ``LdaBasedContext`` instance
    """
    # Timestamp format fixed from "%Y/%d/%m" (day/month swapped) to the
    # "%Y/%m/%d" used by the rest of this module.
    print('train topic model: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
    lda_based_context = LdaBasedContext(self.train_records)
    lda_based_context.get_context_rich_topics()
    # Original assigned this attribute twice with the same value; once is
    # enough.
    self.context_rich_topics = lda_based_context.context_rich_topics
    print('Trained LDA Model: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    return lda_based_context
def train_context_topics_model(records):
    """Build and return a trained ``LdaBasedContext`` for *records*.

    Generates the review corpus, fits the topic model and annotates the
    reviews with their topic distributions.

    :param records: list of review record dicts
    :return: the trained ``LdaBasedContext`` instance
    """
    print('%s: train context topics model' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    context_model = LdaBasedContext(records)
    # Run the training pipeline stages in order.
    for stage in (context_model.generate_review_corpus,
                  context_model.build_topic_model,
                  context_model.update_reviews_with_topics):
        stage()

    print('%s: Trained LDA Model' % time.strftime("%Y/%m/%d-%H:%M:%S"))
    return context_model
def train_topic_model(self, cycle_index, fold_index):
    """Return an ``LdaBasedContext``, from cache or trained from scratch.

    When ``Constants.CACHE_TOPIC_MODEL`` is set the model is loaded for
    the given cycle/fold; otherwise a new model is trained on
    ``self.train_records`` and its context-rich topics are stored on the
    instance.

    :param cycle_index: index of the evaluation cycle (cache key)
    :param fold_index: index of the cross-validation fold (cache key)
    :return: the loaded or freshly trained ``LdaBasedContext``
    """
    # Cached path: nothing to train, hand back the stored model.
    if Constants.CACHE_TOPIC_MODEL:
        print('loading topic model')
        return topic_model_creator.load_topic_model(cycle_index, fold_index)

    print('train topic model: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
    context_model = LdaBasedContext(self.train_records)
    context_model.generate_review_corpus()
    context_model.build_topic_model()
    context_model.update_reviews_with_topics()
    context_model.get_context_rich_topics()
    self.context_rich_topics = context_model.context_rich_topics
    print('Trained LDA Model: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    return context_model
def load_context(self, records):
    """Discover context-rich topics and attach item contexts per user.

    Builds an ``LdaBasedContext`` either from ``self.reviews`` (when
    available) or from the raw review texts in *records*, then stores
    the resulting topics/model and fills ``item_contexts`` for every
    user in ``self.user_dictionary``.

    :param records: list of review record dicts with a ``'text'`` field
    """
    if self.reviews:
        lda_based_context = LdaBasedContext()
        lda_based_context.reviews = self.reviews
    else:
        # No pre-parsed reviews: fall back to the raw record texts.
        lda_based_context = LdaBasedContext(
            [record['text'] for record in records])
    lda_based_context.init_reviews()

    self.context_rich_topics = lda_based_context.get_context_rich_topics()
    self.lda_model = lda_based_context.topic_model

    for user in self.user_dictionary.values():
        user.item_contexts = lda_context_utils.get_user_item_contexts(
            records, self.lda_model, user.user_id, True)
def full_cycle(self, train_records, test_records, train_reviews,
               test_reviews):
    """Train the LDA context model and project both splits onto topics.

    Trains on the train split, extracts contextual topics for both
    splits, then restricts each record to ``self.headers``.

    :param train_records: training record dicts
    :param test_records: test record dicts
    :param train_reviews: parsed reviews for the training records
    :param test_reviews: parsed reviews for the test records (unused
        here — presumably kept for interface symmetry; confirm)
    :return: tuple ``(contextual_train_set, contextual_test_set)``
    """
    self.lda_based_context = LdaBasedContext(train_records, train_reviews)
    self.lda_based_context.get_context_rich_topics()
    # Timestamp format fixed from "%Y/%d/%m" (day/month swapped) to the
    # "%Y/%m/%d" used by the rest of this module.
    print('Trained LDA Model: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    contextual_train_set = \
        self.lda_based_context.find_contextual_topics(train_records)
    contextual_test_set = \
        self.lda_based_context.find_contextual_topics(test_records)
    print('contextual test set size: %d' % len(contextual_test_set))

    self.build_headers()
    contextual_train_set = \
        ETLUtils.select_fields(self.headers, contextual_train_set)
    contextual_test_set = \
        ETLUtils.select_fields(self.headers, contextual_test_set)
    print('Exported contextual topics: %s' %
          time.strftime("%Y/%m/%d-%H:%M:%S"))

    return contextual_train_set, contextual_test_set
def train_context_extractor(records, stable=True):
    """Train and return a context extractor of the configured type.

    The extractor class is chosen by ``Constants.TOPIC_MODEL_TYPE``
    ('lda' or 'nmf'); any other value raises ``ValueError``.

    :param records: list of review record dicts
    :param stable: for the NMF extractor, build the stable topic model
        instead of the plain one
    :return: the trained extractor
    :raises ValueError: on an unrecognized topic model type
    """
    print('%s: train context topics model' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    model_type = Constants.TOPIC_MODEL_TYPE
    if model_type == 'lda':
        context_extractor = LdaBasedContext(records)
        context_extractor.generate_review_corpus()
        context_extractor.build_topic_model()
    elif model_type == 'nmf':
        context_extractor = NmfContextExtractor(records)
        context_extractor.generate_review_bows()
        context_extractor.build_document_term_matrix()
        if stable:
            context_extractor.build_stable_topic_model()
        else:
            context_extractor.build_topic_model()
    else:
        raise ValueError('Unrecognized topic model type: \'%s\'' %
                         Constants.TOPIC_MODEL_TYPE)

    # Post-processing shared by both extractor types.
    context_extractor.update_reviews_with_topics()
    context_extractor.get_context_rich_topics()
    context_extractor.clear_reviews()

    print('%s: Trained Topic Model' % time.strftime("%Y/%m/%d-%H:%M:%S"))
    return context_extractor
class ContextDataConverter:
    """Converts raw train/test records into contextual CSV/LibFM files.

    Trains an LDA context model on the train split, projects both
    splits onto the context-rich topics, and exports CSV plus LibFM
    files (with and without context columns).
    """

    def __init__(self, reviews_classifier):
        # Classifier used elsewhere in the pipeline; stored as-is.
        self.reviews_classifier = reviews_classifier
        self.shuffle_seed = 0
        # Set by build_headers() / full_cycle() respectively.
        self.headers = None
        self.lda_based_context = None

    def full_cycle(self, train_records, test_records, train_reviews,
                   test_reviews):
        """Train the context model and return contextual train/test sets.

        :return: tuple ``(contextual_train_set, contextual_test_set)``
            restricted to ``self.headers``
        """
        self.lda_based_context = LdaBasedContext(train_records, train_reviews)
        self.lda_based_context.get_context_rich_topics()
        # Timestamp format fixed from "%Y/%d/%m" (day/month swapped) to
        # the "%Y/%m/%d" used elsewhere in this module.
        print('Trained LDA Model: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        contextual_train_set = \
            self.lda_based_context.find_contextual_topics(train_records)
        contextual_test_set = \
            self.lda_based_context.find_contextual_topics(test_records)
        print('contextual test set size: %d' % len(contextual_test_set))

        self.build_headers()
        contextual_train_set = \
            ETLUtils.select_fields(self.headers, contextual_train_set)
        contextual_test_set = \
            ETLUtils.select_fields(self.headers, contextual_test_set)
        print('Exported contextual topics: %s' %
              time.strftime("%Y/%m/%d-%H:%M:%S"))

        return contextual_train_set, contextual_test_set

    def build_headers(self):
        """Build the output column list: fixed fields plus one column per
        context-rich topic (named ``topicN`` after the topic id)."""
        self.headers = ['stars', 'user_id', 'business_id']
        for topic in self.lda_based_context.context_rich_topics:
            self.headers.append('topic' + str(topic[0]))

    def run(self, dataset, output_folder, train_records, test_records,
            train_reviews=None, test_reviews=None):
        """Run the full export: contextual sets -> CSV -> LibFM files.

        :param dataset: dataset name used in the output file names
        :param output_folder: directory prefix for all output files
        """
        contextual_train_set, contextual_test_set = self.full_cycle(
            train_records, test_records, train_reviews, test_reviews)
        print('Prepared data: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        csv_train_file = output_folder + \
            'yelp_' + dataset + '_context_shuffled_train5.csv'
        csv_test_file = output_folder + \
            'yelp_' + dataset + '_context_shuffled_test5.csv'

        ETLUtils.save_csv_file(
            csv_train_file, contextual_train_set, self.headers)
        ETLUtils.save_csv_file(
            csv_test_file, contextual_test_set, self.headers)
        # Message fixed: the JSON export was removed, only CSV is written.
        print('Exported CSV files: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        csv_files = [csv_train_file, csv_test_file]
        num_cols = len(self.headers)
        print('num_cols', num_cols)

        # Without context: drop every topic column (index 3 onwards).
        libfm_converter.csv_to_libfm(
            csv_files, 0, [1, 2], range(3, num_cols), ',',
            has_header=True, suffix='.no_context.libfm')
        # With context: keep all columns.
        libfm_converter.csv_to_libfm(
            csv_files, 0, [1, 2], [], ',',
            has_header=True, suffix='.context.libfm')
        print('Exported LibFM files: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))