def prediction(self, test_input, test_labels=None, only_prediction=True,
               from_best_model=False):
    # Optionally restore the best checkpoint before predicting.
    if from_best_model:
        if self.best_ckpt is None:
            print_with_time('No checkpoint found! Use current weights!')
        else:
            self.saver.restore(self.sess, self.best_ckpt)
    predictions = []
    losses = 0
    # Run the (stochastic) prediction several times and majority-vote.
    for _ in range(self.prediction_times):
        res = self._prediction(test_input, test_labels, only_prediction)
        if only_prediction:
            predictions.append(res)
        else:
            predictions.append(res[0])
            losses += res[1]
    # For each sample, take the most frequent predicted label across runs.
    averaged_prediction = np.apply_along_axis(
        lambda x: np.argmax(np.bincount(x, minlength=2)),
        axis=0, arr=predictions)
    if only_prediction:
        return averaged_prediction
    averaged_loss = losses / self.prediction_times
    return averaged_prediction, averaged_loss
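# --- Example (not from the original source): a standalone sketch of the
# majority-vote reduction used in prediction() above, assuming binary
# labels {0, 1}. Each row is one stochastic prediction run; each column is
# one test sample.
import numpy as np

runs = np.array([
    [1, 0, 1],  # run 1
    [1, 1, 0],  # run 2
    [0, 1, 1],  # run 3
])
voted = np.apply_along_axis(
    lambda x: np.argmax(np.bincount(x, minlength=2)), axis=0, arr=runs)
print(voted)  # -> [1 1 1]: per-sample label with the most votes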
def initialize(self, model_manager):
    print_with_time('initialize tags')
    user_tags, item_tags, _ = make_tags(False)

    def map_tags(tags):
        # Map tag tokens to vocabulary ids; unknown tags fall back to 0.
        return list(map(lambda x: self.tag_word2id.get(x, 0), tags))

    self.all_user_tags = list(map(map_tags, user_tags))
    self.all_item_tags = list(map(map_tags, item_tags))
    print_with_time('tags initialized')
def make_rate_prediction_dataset(k=5, filter_no_profile=True, overwrite=False):
    print_with_time('merge data...')
    merge_cut_profile_rate(filter_no_profile, overwrite)
    print_with_time('make vocab...')
    make_general_vocab(overwrite=overwrite)
    print_with_time('split dataset...')
    split_cut_profile_rate(k, overwrite)
    print_with_time('dataset done')
def create_or_load_model(self, overwrite=False):
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    variables = tf.global_variables()
    saver = tf.train.Saver(variables, max_to_keep=1)
    ckpt = tf.train.latest_checkpoint(self.ckpt_path)
    if ckpt is None or overwrite:
        print_with_time('fresh training... checkpoint path: %s' %
                        self.ckpt_path)
        self.model_manager.remove_dir(self.ckpt_path)
        os.makedirs(self.ckpt_path, exist_ok=True)
        sess.run(tf.global_variables_initializer())
    else:
        print_with_time('load pre-training checkpoint at %s' % ckpt)
        saver.restore(sess, ckpt)
    self.saver = saver
    self.sess = sess
def create_or_load_model(self, overwrite=False):
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.InteractiveSession(config=config)
    variables = tf.global_variables()
    saver = tf.train.Saver(variables, max_to_keep=1)
    ckpt = tf.train.latest_checkpoint(self.ckpt_path)
    if ckpt is None or overwrite:
        print_with_time('fresh training... checkpoint path: %s' %
                        self.ckpt_path)
        self.model_manager.remove_dir(self.ckpt_path)
        os.makedirs(self.ckpt_path, exist_ok=True)
        sess.run(tf.global_variables_initializer())
        # Load pre-trained weights for each sub-model from its own
        # directory, matching variables by the sub-model's class-name scope.
        for m, pretrain_dir in zip(self.models, self.pretrain_dirs):
            if pretrain_dir is None:
                continue
            pretrain_ckpt = tf.train.latest_checkpoint(pretrain_dir)
            if pretrain_ckpt is None:
                print_with_time('no checkpoint found in %s' % pretrain_dir)
            else:
                namespace = m.__class__.__name__ + '/'
                sub_variables = [
                    x for x in variables if x.name.startswith(namespace)
                ]
                sub_saver = tf.train.Saver(sub_variables)
                sub_saver.restore(sess, pretrain_ckpt)
                print_with_time('load %d variables from %s successfully' %
                                (len(sub_variables), pretrain_dir))
    else:
        print_with_time('load pre-training checkpoint at %s' % ckpt)
        saver.restore(sess, ckpt)
    self.saver = saver
    self.sess = sess
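# --- Example (not from the original source): minimal TF 1.x sketch of the
# scoped-restore trick used above. A tf.train.Saver built from only the
# variables under one name scope restores just that sub-model from its
# pre-training checkpoint; the scope and directory names are hypothetical.
import tensorflow as tf

tf.reset_default_graph()
with tf.variable_scope('SubModelA'):
    tf.get_variable('w', shape=[4, 4])
scoped_vars = [v for v in tf.global_variables()
               if v.name.startswith('SubModelA/')]
sub_saver = tf.train.Saver(scoped_vars)
# with tf.Session() as sess:
#     sub_saver.restore(sess, tf.train.latest_checkpoint('pretrain_dir'))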
def parse_profile(self, _id, _type='subject'):
    # Read the mined (target, description, sentiment) triples for one
    # user/subject and map tokens to vocabulary ids. Takes self because it
    # needs the word2id lookup tables built by the owning class.
    file = os.path.join(CONFIG.data_path, _type, str(_id), 'analysis',
                        'profile.json')
    targets = []
    descriptions = []
    sentiments = []
    freqs = []
    if not os.path.exists(file):
        print_with_time('file does not exist: %s' % file)
        return {
            'target': targets,
            'description': descriptions,
            'sentiment': sentiments,
            'freq': freqs,
            'length': 0
        }
    profile = load_json_file(file)
    # profile.json nests target -> sentiment -> description -> samples.
    for target, sentiment_description_sample in profile.items():
        for sentiment, description_sample in \
                sentiment_description_sample.items():
            for description, samples in description_sample.items():
                targets.append(target)
                descriptions.append(description)
                sentiments.append(sentiment)
                freqs.append(len(samples))
    targets = list(map(lambda x: self.target_word2id[x], targets))
    descriptions = list(
        map(lambda x: self.description_word2id[x], descriptions))
    sentiments = list(map(lambda x: self.sentiment_word2id[x], sentiments))
    length = len(freqs)
    return {
        'target': targets,
        'description': descriptions,
        'sentiment': sentiments,
        'freq': freqs,
        'length': length
    }
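# --- Example (not from the original source): the triple-nested loop in
# parse_profile walks a profile.json shaped like this (contents are
# hypothetical; only the nesting is inferred from the code):
profile_example = {
    'acting': {                          # target
        'positive': {                    # sentiment
            'brilliant': ['s1', 's2'],   # description -> sample sentences
        },
    },
}
# Flattened into parallel lists it yields:
#   targets=['acting'], sentiments=['positive'],
#   descriptions=['brilliant'], freqs=[2]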
def split_cut_profile_rate(k=5, overwrite=False):
    if not overwrite:
        # Skip the split if all k output files already exist.
        files = [
            x for x in os.listdir(CONFIG.single_rate_training_folder)
            if re.match(r'%s_\d+\.json' % CONFIG.single_rate_file_name, x)
            is not None
        ]
        if len(files) == k:
            return
    all_records = load_json_file(CONFIG.sing_rate_data_all)
    print_with_time('all records loaded')
    random.shuffle(all_records)
    print_with_time('shuffled')
    # Round-robin split: fold i takes every k-th record starting at i.
    for i in range(k):
        out_file = os.path.join(
            CONFIG.single_rate_training_folder,
            '%s_%d.json' % (CONFIG.single_rate_file_name, i))
        subset = all_records[i::k]
        save_json_file(out_file, subset)
        del subset
        gc.collect()
    del all_records
    gc.collect()
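# --- Example (not from the original source): how the i::k stride in
# split_cut_profile_rate partitions a shuffled list into k disjoint folds.
records = list(range(10))
k = 5
folds = [records[i::k] for i in range(k)]
# folds == [[0, 5], [1, 6], [2, 7], [3, 8], [4, 9]] -- every record lands
# in exactly one fold, so the k output files partition the dataset.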
def initialize(self, model_manager):
    print_with_time('initialize profiles')
    user_profiles, item_profiles = make_profiles(False)

    print_with_time('initialize user profiles')
    try:
        # Reuse the cached id-mapped profiles if they exist.
        self.all_user_profiles = model_manager.load_json('user_profiles_v2')
    except OSError:
        self.all_user_profiles = list(
            map(self.triples_to_ids, user_profiles))
        model_manager.save_json(self.all_user_profiles, 'user_profiles_v2')

    print_with_time('initialize item profiles')
    try:
        self.all_item_profiles = model_manager.load_json('item_profiles_v2')
    except OSError:
        self.all_item_profiles = list(
            map(self.triples_to_ids, item_profiles))
        model_manager.save_json(self.all_item_profiles, 'item_profiles_v2')

    print_with_time('profiles initialized')
def initialize(self, model_manager):

    def parse_profile(_id, _type='subject'):
        # Read the mined (target, description, sentiment) triples for one
        # user/subject and map tokens to vocabulary ids.
        file = os.path.join(CONFIG.data_path, _type, str(_id), 'analysis',
                            'profile.json')
        targets = []
        descriptions = []
        sentiments = []
        freqs = []
        if not os.path.exists(file):
            print_with_time('file does not exist: %s' % file)
            return {
                'target': targets,
                'description': descriptions,
                'sentiment': sentiments,
                'freq': freqs,
                'length': 0
            }
        profile = load_json_file(file)
        # profile.json nests target -> sentiment -> description -> samples.
        for target, sentiment_description_sample in profile.items():
            for sentiment, description_sample in \
                    sentiment_description_sample.items():
                for description, samples in description_sample.items():
                    targets.append(target)
                    descriptions.append(description)
                    sentiments.append(sentiment)
                    freqs.append(len(samples))
        targets = list(map(lambda x: self.target_word2id[x], targets))
        descriptions = list(
            map(lambda x: self.description_word2id[x], descriptions))
        sentiments = list(
            map(lambda x: self.sentiment_word2id[x], sentiments))
        length = len(freqs)
        return {
            'target': targets,
            'description': descriptions,
            'sentiment': sentiments,
            'freq': freqs,
            'length': length
        }

    print_with_time('initialize user profiles')
    try:
        # Reuse the cached profiles if they exist.
        self.user_profiles = model_manager.load_json('user_profiles')
    except OSError:
        self.user_profiles = list(
            map(lambda x: parse_profile(x, 'user'), self.user_list))
        model_manager.save_json(self.user_profiles, 'user_profiles')

    print_with_time('initialize item profiles')
    try:
        self.item_profiles = model_manager.load_json('item_profiles')
    except OSError:
        self.item_profiles = list(
            map(lambda x: parse_profile(x, 'subject'), self.item_list))
        model_manager.save_json(self.item_profiles, 'item_profiles')

    print_with_time('profiles initialized')
def fit(self, trainset, valid_rate=0.1):
    # Hold out the last valid_rate fraction of the training set.
    valid_num = round(len(trainset) * valid_rate)
    train_num = len(trainset) - valid_num
    print_with_time('start training on %d samples, valid on %d samples' %
                    (train_num, valid_num))
    users_train, items_train, rates_train = zip(*trainset[:train_num])
    users_valid, items_valid, rates_valid = zip(*trainset[train_num:])
    valid_input = list(zip(users_valid, items_valid))

    # Training loop
    records = []
    best_valid_loss = None
    last_save_epoch = None
    epoch = 0
    for batch_users, batch_items, batch_rates, last_batch \
            in self.batch_iter(users_train, items_train, rates_train, True,
                               self.epochs):
        feed_dict = self.make_feed_dict(batch_users, batch_items,
                                        batch_rates, 'train')
        _, step, loss = self.sess.run(
            [self.train_op, self.global_step, self.loss],
            feed_dict=feed_dict)
        if step % 100 == 0:
            print_with_time('step {0}: loss = {1}'.format(step, loss))
        if last_batch:
            epoch += 1
            # Evaluate on the held-out split at the end of each epoch.
            valid_prediction, valid_loss = self.prediction(
                valid_input, rates_valid, False)
            valid_acc = accuracy_score(rates_valid, valid_prediction)
            print_with_time('epoch %d: valid_loss %f, valid_acc %f' %
                            (epoch, valid_loss, valid_acc))
            if best_valid_loss is None or valid_loss < best_valid_loss:
                print_with_time('got a better one!')
                best_valid_loss = valid_loss
                self.saver.save(self.sess,
                                os.path.join(self.ckpt_path, 'model.ckpt'),
                                global_step=step)
                self.best_ckpt = os.path.join(self.ckpt_path,
                                              'model.ckpt-%d' % step)
                last_save_epoch = epoch
            records.append({
                'step': int(step),
                'train_loss': float(loss),
                'valid_loss': float(valid_loss),
                'valid_acc': float(valid_acc)
            })
            # Early stop when validation loss has stopped improving.
            if (last_save_epoch is not None and self.early_stop is not None
                    and epoch - last_save_epoch > self.early_stop):
                print_with_time(
                    'No loss decrease on valid set for %d epochs, '
                    'stop training' % self.early_stop)
                break

    # Persist the per-epoch training record.
    self.model_manager.save_json(records, 'train_loss')
    print_with_time('training done')
def __init__(self, model_dir, batch_size=8192, epochs=100, lr=1e-3,
             dropout=0.5, early_stop=10, max_length=50, overwrite=True,
             **kwargs):
    self.model_manager = utils.ModelManager(model_dir)
    self.ckpt_path = os.path.join(self.model_manager.path_name, 'ckpt')
    self.batch_size = batch_size
    self.epochs = epochs
    self.lr = lr
    self.dropout = dropout
    self.early_stop = early_stop
    self.max_length = max_length
    self.user_list = load_np_array(CONFIG.user_list_file)
    self.item_list = load_np_array(CONFIG.movie_list_file)
    self.target_word2id = make_vocab_lookup(CONFIG.target_word_list,
                                            unk_token='UNK')
    self.description_word2id = make_vocab_lookup(
        CONFIG.description_word_list, unk_token='UNK')
    self.sentiment_word2id = make_vocab_lookup(
        CONFIG.sentiment_category_list)
    self.target_num = len(self.target_word2id)
    self.description_num = len(self.description_word2id)

    # Build the graph, then the clipped-gradient Adam training op.
    tf.reset_default_graph()
    self.model = TripleSentimentRating(self.target_num,
                                       self.description_num, **kwargs)
    self.sess = None
    self.saver = None
    self.global_step = tf.Variable(0, trainable=False)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.variable_scope('Optimizer'):
        params = tf.trainable_variables()
        gradients = tf.gradients(self.model.loss, params)
        clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5)
        optimizer = tf.train.AdamOptimizer(self.lr)
        # clipped_gradients, _ = tf.clip_by_global_norm(gradients, 0.5)
        # optimizer = tf.train.GradientDescentOptimizer(self.lr)
        with tf.control_dependencies(update_ops):
            self.train_op = optimizer.apply_gradients(
                zip(clipped_gradients, params),
                global_step=self.global_step)
    self.create_or_load_model(overwrite)

    def parse_profile(_id, _type='subject'):
        # Read the mined (target, description, sentiment) triples for one
        # user/subject and map tokens to vocabulary ids.
        file = os.path.join(CONFIG.data_path, _type, str(_id), 'analysis',
                            'profile.json')
        targets = []
        descriptions = []
        sentiments = []
        freqs = []
        if not os.path.exists(file):
            print_with_time('file does not exist: %s' % file)
            return {
                'target': targets,
                'description': descriptions,
                'sentiment': sentiments,
                'freq': freqs,
                'length': 0
            }
        profile = load_json_file(file)
        # profile.json nests target -> sentiment -> description -> samples.
        for target, sentiment_description_sample in profile.items():
            for sentiment, description_sample in \
                    sentiment_description_sample.items():
                for description, samples in description_sample.items():
                    targets.append(target)
                    descriptions.append(description)
                    sentiments.append(sentiment)
                    freqs.append(len(samples))
        targets = list(map(lambda x: self.target_word2id[x], targets))
        descriptions = list(
            map(lambda x: self.description_word2id[x], descriptions))
        sentiments = list(
            map(lambda x: self.sentiment_word2id[x], sentiments))
        length = len(freqs)
        return {
            'target': targets,
            'description': descriptions,
            'sentiment': sentiments,
            'freq': freqs,
            'length': length
        }

    print_with_time('initialize user profiles')
    try:
        # Reuse the cached profiles if they exist.
        self.user_profiles = self.model_manager.load_json('user_profiles')
    except OSError:
        self.user_profiles = list(
            map(lambda x: parse_profile(x, 'user'), self.user_list))
        self.model_manager.save_json(self.user_profiles, 'user_profiles')

    print_with_time('initialize movie profiles')
    try:
        self.movie_profiles = self.model_manager.load_json('movie_profiles')
    except OSError:
        self.movie_profiles = list(
            map(lambda x: parse_profile(x, 'subject'), self.item_list))
        self.model_manager.save_json(self.movie_profiles, 'movie_profiles')

    print_with_time('profiles initialized')
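# --- Example (not from the original source): a hypothetical driver for the
# trainer class above. The class name RatingTrainer and the split file name
# are assumptions; fit() and prediction() are the methods defined earlier.
trainer = RatingTrainer(model_dir='triple_rating_v1', epochs=50,
                        early_stop=5, overwrite=False)
trainset = load_json_file('single_rate_0.json')  # [(user, item, rate), ...]
trainer.fit(trainset, valid_rate=0.1)
preds = trainer.prediction([(u, i) for u, i, _ in trainset[:100]],
                           from_best_model=True)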