def prediction(self,
               test_input,
               test_labels=None,
               only_prediction=True,
               from_best_model=False):
    if from_best_model:
        if self.best_ckpt is None:
            print_with_time('No checkpoint found! Using current weights.')
        else:
            self.saver.restore(self.sess, self.best_ckpt)
    predictions = []
    losses = 0
    # Run the forward pass several times and aggregate the results.
    for _ in range(self.prediction_times):
        res = self._prediction(test_input, test_labels, only_prediction)
        if only_prediction:
            predictions.append(res)
        else:
            predictions.append(res[0])
            losses += res[1]
    # Majority vote per sample across the repeated predictions.
    averaged_prediction = np.apply_along_axis(
        lambda x: np.argmax(np.bincount(x, minlength=2)),
        axis=0,
        arr=predictions)
    if only_prediction:
        return averaged_prediction
    averaged_loss = losses / self.prediction_times
    return averaged_prediction, averaged_loss
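
The loop above aggregates self.prediction_times forward passes (presumably stochastic, e.g. with dropout active) by a per-sample majority vote. A minimal, self-contained sketch of just the voting step, using made-up binary predictions in place of real model output:

import numpy as np

# Three hypothetical prediction runs over four samples (binary labels).
predictions = np.array([[1, 0, 1, 1],
                        [1, 1, 0, 1],
                        [0, 0, 1, 1]])

# Each column is one sample; keep the label that wins the vote.
majority = np.apply_along_axis(
    lambda x: np.argmax(np.bincount(x, minlength=2)),
    axis=0,
    arr=predictions)
print(majority)  # [1 0 1 1]
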
    def initialize(self, model_manager):
        print_with_time('initialize tags')
        user_tags, item_tags, _ = make_tags(False)

        def map_tags(tags):
            # Unknown tags fall back to id 0 (the UNK token).
            return list(map(lambda x: self.tag_word2id.get(x, 0), tags))

        self.all_user_tags = list(map(map_tags, user_tags))
        self.all_item_tags = list(map(map_tags, item_tags))
        print_with_time('tags initialized')
def make_rate_prediction_dataset(k=5, filter_no_profile=True, overwrite=False):
    print_with_time('merge data...')
    merge_cut_profile_rate(filter_no_profile, overwrite)
    print_with_time('make vocab...')
    make_general_vocab(overwrite=overwrite)
    print_with_time('split dataset...')
    split_cut_profile_rate(k, overwrite)
    print_with_time('dataset done')
Example #4
def create_or_load_model(self, overwrite=False):
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    variables = tf.global_variables()
    saver = tf.train.Saver(variables, max_to_keep=1)
    ckpt = tf.train.latest_checkpoint(self.ckpt_path)
    if ckpt is None or overwrite:
        print_with_time('fresh training... checkpoint path: %s' %
                        self.ckpt_path)
        # Start from scratch: wipe the checkpoint dir and initialize weights.
        self.model_manager.remove_dir(self.ckpt_path)
        os.makedirs(self.ckpt_path, exist_ok=True)
        sess.run(tf.global_variables_initializer())
    else:
        print_with_time('load pre-trained checkpoint at %s' % ckpt)
        saver.restore(sess, ckpt)
    self.saver = saver
    self.sess = sess
Example #5
    def create_or_load_model(self, overwrite=False):
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True

        sess = tf.InteractiveSession(config=config)
        variables = tf.global_variables()
        saver = tf.train.Saver(variables, max_to_keep=1)
        ckpt = tf.train.latest_checkpoint(self.ckpt_path)
        if ckpt is None or overwrite:
            print_with_time('fresh training... checkpoint path: %s' %
                            self.ckpt_path)
            self.model_manager.remove_dir(self.ckpt_path)
            os.makedirs(self.ckpt_path, exist_ok=True)
            sess.run(tf.global_variables_initializer())
            for m, pretrain_dir in zip(self.models, self.pretrain_dirs):
                if pretrain_dir is None:
                    continue
                pretrain_ckpt = tf.train.latest_checkpoint(pretrain_dir)
                if pretrain_ckpt is None:
                    print_with_time('no checkpoint found in %s' % pretrain_dir)
                else:
                    namespace = m.__class__.__name__ + '/'
                    sub_variables = [
                        x for x in variables if x.name.startswith(namespace)
                    ]
                    sub_saver = tf.train.Saver(sub_variables)
                    sub_saver.restore(sess, pretrain_ckpt)
                    print_with_time('load %d variables from %s successfully' %
                                    (len(sub_variables), pretrain_dir))
        else:
            print_with_time('load pre-trained checkpoint at %s' % ckpt)
            saver.restore(sess, ckpt)
        self.saver = saver
        self.sess = sess
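
Loading each sub-model's pre-trained weights works by building a Saver over only the variables whose names start with that model's scope. A stripped-down sketch of the same pattern under TF 1.x, with a hypothetical scope name and checkpoint directory:

import tensorflow as tf

# A toy graph with one scoped sub-model (hypothetical scope name).
with tf.variable_scope('SubModel'):
    w = tf.get_variable('w', shape=[4], initializer=tf.zeros_initializer())

sess = tf.Session()
sess.run(tf.global_variables_initializer())

# A Saver restricted to the scope's variables restores only those weights.
sub_variables = [v for v in tf.global_variables()
                 if v.name.startswith('SubModel/')]
sub_saver = tf.train.Saver(sub_variables)
ckpt = tf.train.latest_checkpoint('/path/to/pretrain_dir')  # hypothetical path
if ckpt is not None:
    sub_saver.restore(sess, ckpt)
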
Example #6
def parse_profile(self, _id, _type='subject'):
    file = os.path.join(CONFIG.data_path, _type, str(_id), 'analysis',
                        'profile.json')
    targets = []
    descriptions = []
    sentiments = []
    freqs = []
    if not os.path.exists(file):
        print_with_time('file does not exist: %s' % file)
        # Missing profile: return empty parallel lists with zero length.
        return {
            'target': targets,
            'description': descriptions,
            'sentiment': sentiments,
            'freq': freqs,
            'length': 0
        }
    profile = load_json_file(file)
    # profile.json nests target -> sentiment -> description -> [samples];
    # flatten it into parallel lists, with the sample count as frequency.
    for target, sentiment_description_sample in profile.items():
        for sentiment, description_sample in sentiment_description_sample.items():
            for description, samples in description_sample.items():
                targets.append(target)
                descriptions.append(description)
                sentiments.append(sentiment)
                freqs.append(len(samples))
    # Map surface strings to vocabulary ids.
    targets = list(map(lambda x: self.target_word2id[x], targets))
    descriptions = list(
        map(lambda x: self.description_word2id[x], descriptions))
    sentiments = list(
        map(lambda x: self.sentiment_word2id[x], sentiments))
    length = len(freqs)
    return {
        'target': targets,
        'description': descriptions,
        'sentiment': sentiments,
        'freq': freqs,
        'length': length
    }
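
parse_profile flattens a profile.json nested as target -> sentiment -> description -> list of samples into four parallel lists, recording each sample count as a frequency. A tiny sketch of the flattening, with invented data:

# Hypothetical profile.json contents.
profile = {
    'acting': {
        'positive': {'brilliant': ['s1', 's2'], 'moving': ['s3']},
        'negative': {'stiff': ['s4']},
    },
}

targets, sentiments, descriptions, freqs = [], [], [], []
for target, by_sentiment in profile.items():
    for sentiment, by_description in by_sentiment.items():
        for description, samples in by_description.items():
            targets.append(target)
            sentiments.append(sentiment)
            descriptions.append(description)
            freqs.append(len(samples))

print(list(zip(targets, sentiments, descriptions, freqs)))
# [('acting', 'positive', 'brilliant', 2),
#  ('acting', 'positive', 'moving', 1),
#  ('acting', 'negative', 'stiff', 1)]
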
Example #7
def split_cut_profile_rate(k=5, overwrite=False):
    if not overwrite:
        files = [
            x for x in os.listdir(CONFIG.single_rate_training_folder)
            if re.match(r'%s_\d+\.json' %
                        CONFIG.single_rate_file_name, x) is not None
        ]
        if len(files) == k:
            return
    all_records = load_json_file(CONFIG.sing_rate_data_all)
    print_with_time('all records loaded')
    random.shuffle(all_records)
    print_with_time('shuffled')
    for i in range(k):
        out_file = os.path.join(
            CONFIG.single_rate_training_folder,
            '%s_%d.json' % (CONFIG.single_rate_file_name, i))
        # Fold i takes every k-th record starting at offset i (round-robin).
        subset = all_records[i::k]
        save_json_file(out_file, subset)
        del subset
        gc.collect()
    del all_records
    gc.collect()
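
The `all_records[i::k]` stride puts every k-th record (starting at offset i) into fold i, so the shuffled records are partitioned into k disjoint, near-equal folds. A toy check of that property:

import random

records = list(range(10))
random.shuffle(records)

k = 5
folds = [records[i::k] for i in range(k)]
# Every record lands in exactly one fold; fold sizes differ by at most one.
assert sorted(sum(folds, [])) == list(range(10))
print([len(f) for f in folds])  # [2, 2, 2, 2, 2]
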
Example #8
    def initialize(self, model_manager):
        print_with_time('initialize profiles')
        user_profiles, item_profiles = make_profiles(False)

        print_with_time('initializing user profiles')
        try:
            self.all_user_profiles = model_manager.load_json('user_profiles_v2')
        except OSError:
            self.all_user_profiles = list(map(self.triples_to_ids, user_profiles))
            model_manager.save_json(self.all_user_profiles, 'user_profiles_v2')
        print_with_time('initializing item profiles')
        try:
            self.all_item_profiles = model_manager.load_json('item_profiles_v2')
        except OSError:
            self.all_item_profiles = list(map(self.triples_to_ids, item_profiles))
            model_manager.save_json(self.all_item_profiles, 'item_profiles_v2')
        print_with_time('profiles initialized')
Example #9
    def initialize(self, model_manager):
        def parse_profile(_id, _type='subject'):
            file = os.path.join(CONFIG.data_path, _type, str(_id), 'analysis',
                                'profile.json')
            targets = []
            descriptions = []
            sentiments = []
            freqs = []
            if not os.path.exists(file):
                print_with_time('file does not exist: %s' % file)
                return {
                    'target': targets,
                    'description': descriptions,
                    'sentiment': sentiments,
                    'freq': freqs,
                    'length': 0
                }
            profile = load_json_file(file)
            for target, sentiment_description_sample in profile.items():
                for sentiment, description_sample in sentiment_description_sample.items():
                    for description, samples in description_sample.items():
                        targets.append(target)
                        descriptions.append(description)
                        sentiments.append(sentiment)
                        freqs.append(len(samples))
            targets = list(map(lambda x: self.target_word2id[x], targets))
            descriptions = list(
                map(lambda x: self.description_word2id[x], descriptions))
            sentiments = list(
                map(lambda x: self.sentiment_word2id[x], sentiments))
            length = len(freqs)
            return {
                'target': targets,
                'description': descriptions,
                'sentiment': sentiments,
                'freq': freqs,
                'length': length
            }

        print_with_time('initializing user profiles')
        try:
            self.user_profiles = model_manager.load_json('user_profiles')
        except OSError:
            self.user_profiles = list(
                map(lambda x: parse_profile(x, 'user'), self.user_list))
            model_manager.save_json(self.user_profiles, 'user_profiles')
        print_with_time('initializing item profiles')
        try:
            self.item_profiles = model_manager.load_json('item_profiles')
        except OSError:
            self.item_profiles = list(
                map(lambda x: parse_profile(x, 'subject'), self.item_list))
            model_manager.save_json(self.item_profiles, 'item_profiles')
        print_with_time('profiles initialized')
Example #10
def fit(self, trainset, valid_rate=0.1):
    valid_num = round(len(trainset) * valid_rate)
    train_num = len(trainset) - valid_num
    print_with_time("start training on %d samples, valid on %d samples" %
                    (train_num, valid_num))
    users_train, items_train, rates_train = zip(*trainset[:train_num])
    users_valid, items_valid, rates_valid = zip(*trainset[train_num:])
    valid_input = list(zip(users_valid, items_valid))
    # Training loop
    records = []
    best_valid_loss = None
    last_save_epoch = None
    epoch = 0
    for batch_users, batch_items, batch_rates, last_batch \
            in self.batch_iter(users_train, items_train, rates_train, True, self.epochs):
        feed_dict = self.make_feed_dict(batch_users, batch_items,
                                        batch_rates, 'train')
        _, step, loss = self.sess.run(
            [self.train_op, self.global_step, self.loss],
            feed_dict=feed_dict)
        if step % 100 == 0:
            print_with_time("step {0} : loss = {1}".format(step, loss))
        if last_batch:
            epoch += 1
            # Evaluate on the held-out split at the end of each epoch.
            valid_prediction, valid_loss = self.prediction(
                valid_input, rates_valid, False)
            valid_acc = accuracy_score(rates_valid, valid_prediction)
            print_with_time('epoch %d: valid_loss %f, valid_acc %f' %
                            (epoch, valid_loss, valid_acc))
            if best_valid_loss is None or valid_loss < best_valid_loss:
                print_with_time('got a better model!')
                best_valid_loss = valid_loss
                self.saver.save(self.sess,
                                os.path.join(self.ckpt_path, 'model.ckpt'),
                                global_step=step)
                self.best_ckpt = os.path.join(self.ckpt_path,
                                              'model.ckpt-%d' % step)
                last_save_epoch = epoch
            records.append({
                'step': int(step),
                'train_loss': float(loss),
                'valid_loss': float(valid_loss),
                'valid_acc': float(valid_acc)
            })
            # Early stop: quit once the valid loss has not improved for
            # more than `early_stop` consecutive epochs.
            if (last_save_epoch is not None and self.early_stop is not None
                    and epoch - last_save_epoch > self.early_stop):
                print_with_time(
                    'No loss decrease on valid set for %d epochs, stop training'
                    % self.early_stop)
                break
    # Persist the per-epoch training record.
    self.model_manager.save_json(records, 'train_loss')
    print_with_time('training done')
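
The early-stopping rule in `fit` needs only two pieces of state: the epoch of the last checkpoint save and a patience threshold (`early_stop`). The same bookkeeping in isolation, driven by a fabricated validation-loss curve:

# Fabricated per-epoch validation losses.
losses = [0.9, 0.7, 0.65, 0.66, 0.67, 0.68, 0.69]
patience = 3

best_loss, last_save_epoch = None, None
for epoch, loss in enumerate(losses, start=1):
    if best_loss is None or loss < best_loss:
        best_loss, last_save_epoch = loss, epoch  # checkpoint would be saved here
    if epoch - last_save_epoch > patience:
        print('stopping at epoch %d; best loss %.2f' % (epoch, best_loss))
        break
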
Example #11
    def __init__(self,
                 model_dir,
                 batch_size=8192,
                 epochs=100,
                 lr=1e-3,
                 dropout=0.5,
                 early_stop=10,
                 max_length=50,
                 overwrite=True,
                 **kwargs):
        self.model_manager = utils.ModelManager(model_dir)
        self.ckpt_path = os.path.join(self.model_manager.path_name, 'ckpt')
        self.batch_size = batch_size
        self.epochs = epochs
        self.lr = lr
        self.dropout = dropout
        self.early_stop = early_stop
        self.max_length = max_length
        self.user_list = load_np_array(CONFIG.user_list_file)
        self.item_list = load_np_array(CONFIG.movie_list_file)
        self.target_word2id = make_vocab_lookup(CONFIG.target_word_list,
                                                unk_token='UNK')
        self.description_word2id = make_vocab_lookup(
            CONFIG.description_word_list, unk_token='UNK')
        self.sentiment_word2id = make_vocab_lookup(
            CONFIG.sentiment_category_list)
        self.target_num = len(self.target_word2id)
        self.description_num = len(self.description_word2id)

        tf.reset_default_graph()
        self.model = TripleSentimentRating(self.target_num,
                                           self.description_num, **kwargs)
        self.sess = None
        self.saver = None
        self.global_step = tf.Variable(0, trainable=False)
        # Collect UPDATE_OPS (e.g. batch-norm moving-average updates) so they
        # run with every training step via the control dependency below.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.variable_scope("Optimizer"):
            params = tf.trainable_variables()
            gradients = tf.gradients(self.model.loss, params)
            clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5)
            optimizer = tf.train.AdamOptimizer(self.lr)
            # clipped_gradients, _ = tf.clip_by_global_norm(gradients, 0.5)
            # optimizer = tf.train.GradientDescentOptimizer(self.lr)
            with tf.control_dependencies(update_ops):
                self.train_op = optimizer.apply_gradients(
                    zip(clipped_gradients, params),
                    global_step=self.global_step)
        self.create_or_load_model(overwrite)

        def parse_profile(_id, _type='subject'):
            file = os.path.join(CONFIG.data_path, _type, str(_id), 'analysis',
                                'profile.json')
            targets = []
            descriptions = []
            sentiments = []
            freqs = []
            if not os.path.exists(file):
                print_with_time('file does not exist: %s' % file)
                return {
                    'target': targets,
                    'description': descriptions,
                    'sentiment': sentiments,
                    'freq': freqs,
                    'length': 0
                }
            profile = load_json_file(file)
            for target, sentiment_description_sample in profile.items():
                for sentiment, description_sample in sentiment_description_sample.items():
                    for description, samples in description_sample.items():
                        targets.append(target)
                        descriptions.append(description)
                        sentiments.append(sentiment)
                        freqs.append(len(samples))
            targets = list(map(lambda x: self.target_word2id[x], targets))
            descriptions = list(
                map(lambda x: self.description_word2id[x], descriptions))
            sentiments = list(
                map(lambda x: self.sentiment_word2id[x], sentiments))
            length = len(freqs)
            return {
                'target': targets,
                'description': descriptions,
                'sentiment': sentiments,
                'freq': freqs,
                'length': length
            }

        print_with_time('initializing user profiles')
        try:
            self.user_profiles = self.model_manager.load_json('user_profiles')
        except OSError:
            self.user_profiles = list(
                map(lambda x: parse_profile(x, 'user'), self.user_list))
            self.model_manager.save_json(self.user_profiles, 'user_profiles')
        print_with_time('initializing movie profiles')
        try:
            self.movie_profiles = self.model_manager.load_json(
                'movie_profiles')
        except OSError:
            self.movie_profiles = list(
                map(lambda x: parse_profile(x, 'subject'), self.item_list))
            self.model_manager.save_json(self.movie_profiles, 'movie_profiles')
        print_with_time('profiles initialized')
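
The optimizer wiring in `__init__` is the stock TF 1.x recipe: take gradients, clip their global norm at 5, and chain `apply_gradients` behind the graph's UPDATE_OPS so ops like batch-norm moving-average updates run on each step. A minimal runnable sketch of the same recipe around a toy quadratic loss:

import tensorflow as tf

x = tf.get_variable('x', shape=[3], initializer=tf.ones_initializer())
loss = tf.reduce_sum(tf.square(x))  # toy loss standing in for the model's

global_step = tf.Variable(0, trainable=False)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)  # empty in this toy graph
params = tf.trainable_variables()
gradients = tf.gradients(loss, params)
clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5)
optimizer = tf.train.AdamOptimizer(1e-3)
with tf.control_dependencies(update_ops):
    train_op = optimizer.apply_gradients(
        zip(clipped_gradients, params), global_step=global_step)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(train_op)  # one clipped Adam step
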