Example #1
def output_embedding():
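	# Export the user embeddings, product embeddings, and the Wu matrix returned by a
	# single test-mode model.step() call to text files under FLAGS.train_dir.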
	# Prepare data.
	print("Reading data in %s" % FLAGS.data_dir)
	
	data_set = data_util.Tensorflow_data(FLAGS.data_dir, FLAGS.input_train_dir, 'test')
	data_set.read_train_product_ids(FLAGS.input_train_dir)

	config = tf.ConfigProto()
	config.gpu_options.allow_growth = True
	with tf.Session(config=config) as sess:
		# Create model.
		print("Read model")
		model = create_model(sess, True, data_set, data_set.train_review_size)
		user_ranklist_map = {}
		print('Start Testing')
		words_to_train = float(FLAGS.max_train_epoch * data_set.word_count) + 1
		test_seq = [i for i in range(data_set.review_size)]
		model.setup_data_set(data_set, words_to_train)
		model.intialize_epoch(test_seq)
		model.prepare_test_epoch()
		has_next = True
		user_idxs, product_idxs, query_word_idxs, review_idxs, word_idxs, context_word_idxs, learning_rate, has_next, uqr_pairs = model.get_test_batch()

		if len(user_idxs) > 0:
			part_1, part_2 = model.step(sess, learning_rate, user_idxs, product_idxs, query_word_idxs, 
						review_idxs, word_idxs, context_word_idxs, True, FLAGS.test_mode)

			# record the results
			user_emb = part_1[0]
			product_emb = part_1[1]
			Wu = part_1[2]
			data_set.output_embedding(user_emb, FLAGS.train_dir + 'user_emb.txt')
			data_set.output_embedding(product_emb, FLAGS.train_dir + 'product_emb.txt')
			data_set.output_embedding(Wu, FLAGS.train_dir + 'Wu.txt')
	return
Example #2
def output_embedding():
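	# Export every embedding matrix returned by a test-mode model.step() call; each
	# matrix is written to FLAGS.train_dir + '<key>.txt', named after its reported key.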
	# Prepare data.
	print("Reading data in %s" % FLAGS.data_dir)
	
	data_set = data_util.Tensorflow_data(FLAGS.data_dir, FLAGS.input_train_dir, 'test')
	data_set.read_train_product_ids(FLAGS.input_train_dir)

	config = tf.ConfigProto()
	config.gpu_options.allow_growth = True
	with tf.Session(config=config) as sess:
		# Create model.
		print("Read model")
		model = create_model(sess, True, data_set, data_set.train_review_size)
		user_ranklist_map = {}
		print('Start Testing')
		words_to_train = float(FLAGS.max_train_epoch * data_set.word_count) + 1
		test_seq = [i for i in xrange(data_set.review_size)]
		model.setup_data_set(data_set, words_to_train)
		model.intialize_epoch(test_seq)
		model.prepare_test_epoch()
		has_next = True
		input_feed, has_next, uqr_pairs = model.get_test_batch()

		if len(uqr_pairs) > 0:
			embeddings, keys = model.step(sess, input_feed, True, FLAGS.test_mode)

			# record the results
			for i in xrange(len(keys)):
				data_set.output_embedding(embeddings[i], FLAGS.train_dir + '%s.txt' % keys[i])
			
	return
Example #3
def get_product_scores():
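    # Score all products for each test user with the trained model, keep the top
    # FLAGS.rank_cutoff products per user, and write the ranked lists with their
    # scores to FLAGS.train_dir.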
    # Prepare data.
    print("Reading data in %s" % FLAGS.data_dir)

    data_set = data_util.Tensorflow_data(FLAGS.data_dir, FLAGS.input_train_dir,
                                         'test')
    data_set.read_train_product_ids(FLAGS.input_train_dir)
    # add image features
    data_set.read_image_features(FLAGS.data_dir)
    # add rating features
    data_set.read_latent_factor(FLAGS.data_dir)

    current_step = 0
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Create model.
        print("Read model")
        model = create_model(sess, True, data_set, data_set.train_review_size)
        user_ranklist_map = {}
        user_ranklist_score_map = {}
        print('Start Testing')
        words_to_train = float(FLAGS.max_train_epoch * data_set.word_count) + 1
        test_seq = [i for i in xrange(data_set.review_size)]
        model.setup_data_set(data_set, words_to_train)
        model.intialize_epoch(test_seq)
        has_next = True
        while has_next:
            user_idxs, product_idxs, review_idxs, word_idxs, context_word_idxs, learning_rate, has_next = model.get_test_batch()

            if len(user_idxs) > 0:
                user_product_scores, _ = model.step(sess, learning_rate,
                                                    user_idxs, product_idxs,
                                                    review_idxs, word_idxs,
                                                    context_word_idxs, True)
                current_step += 1

            # record the results
            for i in xrange(len(user_idxs)):
                u_idx = user_idxs[i]
                sorted_product_idxs = sorted(
                    range(len(user_product_scores[i])),
                    key=lambda k: user_product_scores[i][k],
                    reverse=True)
                user_ranklist_map[u_idx], user_ranklist_score_map[u_idx] = (
                    data_set.compute_test_product_ranklist(
                        u_idx, user_product_scores[i], sorted_product_idxs,
                        FLAGS.rank_cutoff))  # (product name, rank)
            if current_step % FLAGS.steps_per_checkpoint == 0:
                print("Finish test review %d/%d\r" %
                      (model.cur_review_i, model.review_size),
                      end="")

    data_set.output_ranklist(user_ranklist_map, user_ranklist_score_map,
                             FLAGS.train_dir, FLAGS.similarity_func)
    return
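Example #4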
def get_doc_softmax_norm():
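    # Compute the softmax denominator of every training document in test mode and
    # write the (doc name, denominator) pairs to 'test_doc.softmax_denominators'
    # under FLAGS.train_dir.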
    # Prepare data.
    print("Reading data in %s" % FLAGS.data_dir)

    #if 'pv' in FLAGS.net_struct:
    data_set = data_util.Tensorflow_data(FLAGS.data_dir, FLAGS.input_train_dir,
                                         'train', FLAGS.DF_sampling)
    #else:
    #	data_set = data_util.Tensorflow_data(FLAGS.data_dir, FLAGS.input_train_dir, 'test', FLAGS.DF_sampling)
    current_step = 0
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Create model.
        print("Read model")
        model = create_model(sess, True, data_set, data_set.doc_num)
        print('Start softmax denominator computing')
        words_to_train = float(FLAGS.max_train_epoch * data_set.word_count) + 1
        test_seq = [i for i in xrange(data_set.doc_num)]
        model.setup_data_set(data_set, words_to_train)
        model.prepare_test_epoch(test_seq)
        softmax_denominators = []
        has_next = True
        while has_next:
            word_idxs, context_word_idxs, doc_idxs, doc_word_idxs, doc_lengths, learning_rate, has_next = model.get_test_batch()

            if len(word_idxs) > 0:
                doc_softmax_denominator, _ = model.step(
                    sess, learning_rate, word_idxs, context_word_idxs,
                    doc_idxs, doc_word_idxs, doc_lengths, True,
                    FLAGS.test_mode)
                current_step += 1

            # record the results
            for i in xrange(len(doc_idxs)):
                doc_idx = doc_idxs[i]
                softmax_denominators.append((data_set.doc_info[doc_idx][0],
                                             doc_softmax_denominator[i]))

            if current_step % FLAGS.steps_per_checkpoint == 0:
                print("Finish test doc %d/%d\r" %
                      (model.cur_doc_i, len(model.test_seq)),
                      end="")

    with open(FLAGS.train_dir + 'test_doc.softmax_denominators',
              'w') as softmax_denominator_fout:
        for i in xrange(len(softmax_denominators)):
            #softmax_denominator_fout.write(softmax_denominators[i][0] + '\t%.3f\n'%softmax_denominators[i][1])
            softmax_denominator_fout.write(softmax_denominators[i][0] + '\t' +
                                           str(softmax_denominators[i][1]) +
                                           '\n')

    return
Example #5
def interactive_explain_mode():
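	# Interactive explanation loop: read a rank cut and a mode from stdin, then show the
	# entities most related either to a chosen product ('explain_product') or to a chosen
	# user ('explain_user_query'), excluding products from the user's training set.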
	# Prepare data.
	print("Reading data in %s" % FLAGS.data_dir)
	FLAGS.batch_size = 1
	
	data_set = data_util.Tensorflow_data(FLAGS.data_dir, FLAGS.input_train_dir, 'test')
	data_set.read_train_product_ids(FLAGS.input_train_dir)
	#data_set.read_image_features(FLAGS.data_dir)
	current_step = 0
	config = tf.ConfigProto()
	config.gpu_options.allow_growth = True
	with tf.Session(config=config) as sess:
		# Create model.
		print("Read model")
		model = create_model(sess, True, data_set, data_set.train_review_size)
		user_ranklist_map = {}
		user_ranklist_score_map = {}
		print('Start Interactive Process')
		words_to_train = float(FLAGS.max_train_epoch * data_set.word_count) + 1
		test_seq = [i for i in xrange(data_set.review_size)]
		model.setup_data_set(data_set, words_to_train)
		model.intialize_epoch(test_seq)
		has_next = True
		input_feed, has_next = model.get_test_batch()
		while True:
			# read information from stdin
			mode, user_idx, product_idx = None, None, None
			test_feed = copy.deepcopy(input_feed)
			print('Enter rank cut:')
			rank_cut = int(sys.stdin.readline().strip())
			print('Enter mode, "product" for gathering product information and "user" for gathering user information:')
			mode = sys.stdin.readline().strip()
			# Output user+query or product?
			if mode == 'product': # product
				print('Enter product idx (line number start from 0) or name ("asin"):')
				product_idx = data_set.get_idx(sys.stdin.readline().strip(), 'product')
				test_feed[model.relation_dict['product']['idxs'].name] = [product_idx]
				p_entity_list, _ = model.step(sess, test_feed, True, 'explain_product')
				# output results
				print('Product %d %s' % (product_idx, data_set.product_ids[product_idx]))
				for relation_name, entity_name, entity_scores in p_entity_list:
					data_set.print_entity_list(relation_name, entity_name, entity_scores[0], rank_cut, {})
			else: # user + query
				print('Enter user idx (line number start from 0) or name (user id):')
				user_idx = data_set.get_idx(sys.stdin.readline().strip(), 'user')
				test_feed[model.user_idxs.name] = [user_idx]
				up_entity_list, _ = model.step(sess, test_feed, True, 'explain_user_query')
				remove_map = {
					'product' : data_set.user_train_product_set_list[user_idx]
				}
				print('User %d %s' % (user_idx, data_set.user_ids[user_idx]))
				# output results
				for relation_name, entity_name, entity_scores in up_entity_list:
					data_set.print_entity_list(relation_name, entity_name, entity_scores[0], rank_cut, remove_map)
Example #6
def train():
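	# Main training loop: subsample the reviews, shuffle them every epoch, feed batches to
	# model.step(), print throughput statistics every FLAGS.steps_per_checkpoint steps, and
	# save a checkpoint after FLAGS.max_train_epoch epochs.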
	# Prepare data.
	print("Reading data in %s" % FLAGS.data_dir)
	
	data_set = data_util.Tensorflow_data(FLAGS.data_dir, FLAGS.input_train_dir, 'train')
	data_set.sub_sampling(FLAGS.subsampling_rate)

	config = tf.ConfigProto()
	config.gpu_options.allow_growth = True
	# config.log_device_placement=True

	with tf.Session(config=config) as sess:
		# Create model.
		print("Creating model")
		model = create_model(sess, False, data_set, data_set.review_size)

		print('Start training')
		words_to_train = float(FLAGS.max_train_epoch * data_set.word_count) + 1
		current_words = 0.0
		previous_words = 0.0
		start_time = time.time()
		last_check_point_time = time.time()
		step_time, loss = 0.0, 0.0
		current_epoch = 0
		current_step = 0
		get_batch_time = 0.0
		training_seq = [i for i in xrange(data_set.review_size)]
		model.setup_data_set(data_set, words_to_train)
		while True:
			random.shuffle(training_seq)
			model.intialize_epoch(training_seq)
			has_next = True
			while has_next:
				time_flag = time.time()
				input_feed, has_next = model.get_train_batch()
				get_batch_time += time.time() - time_flag

				if len(input_feed[model.relation_dict['word']['idxs'].name]) > 0:
					time_flag = time.time()
					step_loss, _ = model.step(sess, input_feed, False)
					#step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
					loss += step_loss / FLAGS.steps_per_checkpoint
					current_step += 1
					step_time += time.time() - time_flag

				# Once in a while, we print statistics.
				if current_step % FLAGS.steps_per_checkpoint == 0:
					print("Epoch %d Words %d/%d: lr = %5.3f loss = %6.2f words/sec = %5.2f prepare_time %.2f step_time %.2f\r" %
            				(current_epoch, model.finished_word_num, model.words_to_train, input_feed[model.learning_rate.name], loss, 
            					(model.finished_word_num- previous_words)/(time.time() - start_time), get_batch_time, step_time), end="")
					step_time, loss = 0.0, 0.0
					current_step = 1
					get_batch_time = 0.0
					sys.stdout.flush()
					previous_words = model.finished_word_num
					start_time = time.time()
					#print('time: ' + str(time.time() - last_check_point_time))
					#if time.time() - last_check_point_time > FLAGS.seconds_per_checkpoint:
					#	checkpoint_path_best = os.path.join(FLAGS.train_dir, "ProductSearchEmbedding.ckpt")
					#	model.saver.save(sess, checkpoint_path_best, global_step=model.global_step)

			current_epoch += 1
			#checkpoint_path_best = os.path.join(FLAGS.train_dir, "ProductSearchEmbedding.ckpt")
			#model.saver.save(sess, checkpoint_path_best, global_step=model.global_step)
			if current_epoch >= FLAGS.max_train_epoch:	
				break
		checkpoint_path_best = os.path.join(FLAGS.train_dir, "ProductSearchEmbedding.ckpt")
		model.saver.save(sess, checkpoint_path_best, global_step=model.global_step)
Example #7
def find_explanation_path():
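	# For each (user, product, query, review) test tuple, run the model in 'explanation_path'
	# mode, turn the three largest master attention weights into textual explanations, and
	# write them, together with up to five of the user's earlier reviews, to a CSV file.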
	print("Reading data in %s" % FLAGS.data_dir)

	data_set = data_util.Tensorflow_data(FLAGS.data_dir, FLAGS.input_train_dir, 'test')
	data_set.read_train_product_ids(FLAGS.input_train_dir)
	config = tf.ConfigProto()
	config.gpu_options.allow_growth = True
	with tf.Session(config=config) as sess:
		# Create model.
		print("Read model")
		model = create_model(sess, True, data_set, data_set.train_review_size)
		words_to_train = float(FLAGS.max_train_epoch * data_set.word_count) + 1
		test_seq = [i for i in xrange(data_set.review_size)]
		model.setup_data_set(data_set, words_to_train)
		model.intialize_epoch(test_seq)
		model.prepare_test_epoch()
		input_feed, has_next, uqr_pairs = model.get_test_batch()

		test_feed = copy.deepcopy(input_feed)

		print('Generating explanations')

		with open(FLAGS.explanation_output_dir + 'explanation-output.csv', mode='w') as write_csv_file:
			csv_writer = csv.writer(write_csv_file, delimiter=',')
			csv_writer.writerow(['sample_id', 'user', 'query', 'product', 'explanation', 'attention_weight', 'previous_reviews'])
			count = 0
			for (user_idx, product_idx, query_idx, review_idx) in uqr_pairs:
				sample_id = '-'.join([str(user_idx), str(product_idx), str(query_idx), str(review_idx)])
				user_history_idx_dict, user_hist_len_dict =  model.get_history_and_length_dicts(review_idx)
				for key in user_history_idx_dict:
					test_feed[model.user_history_dict[key]['idxs'].name] = [user_history_idx_dict[key]]
					test_feed[model.user_history_dict[key]['length'].name] = [user_hist_len_dict[key]]

				test_feed[model.product_idxs.name] = [product_idx]
				query_word_idx = model.data_set.query_words[query_idx]
				test_feed[model.query_word_idxs.name] = [query_word_idx]

				attn_distribution_dict, _ = model.step(sess, test_feed, True, 'explanation_path')
				user = data_set.user_ids[user_idx]
				product = data_set.product_ids[product_idx]
				query = ' '.join([data_set.words[x] for x in query_word_idx if x < len(data_set.words)])
				review_idxs = [idx for idx, review in enumerate(data_set.review_info) if review[0] == user_idx]
				review_word_idxs = [data_set.review_text[idx] for idx in review_idxs]
				reviews = []
				for idx, review_word_idx in enumerate(review_word_idxs):
					if idx >= 5:
						break
					# use a distinct name so the outer loop index 'idx' is not shadowed
					review_txt = ' '.join([data_set.words[word_idx] for word_idx in review_word_idx if word_idx < len(data_set.words)])
					reviews.append(str(idx+1) + ') ' + review_txt)

				#get max attn from master to find which slave attn is more important
				indexed_attn_values = list(enumerate(attn_distribution_dict['master'][0]))
				top_values = sorted(indexed_attn_values, key=operator.itemgetter(1), reverse=True)[:3]
				explanation = ''
				expln_index = 1
				max_attn = top_values[0][1]

				# get explanation for top 3 attn scores from attention list
				for index, attn_score in top_values:
					curr_explanation = data_set.get_expln_with_max_attn(index, model.user_history_dict, user_history_idx_dict, attn_distribution_dict)
					if curr_explanation:
						explanation += str(expln_index) + '. ' + curr_explanation + '\n'
						expln_index += 1

				csv_writer.writerow([sample_id, user, query, product, explanation, max_attn, '\n'.join(reviews)])
				count+=1

			print("Generated " + str(count) + " explanations")
Example #8
def interactive_explain_mode():
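	# Interactive explanation loop: depending on the mode read from stdin, feed either a
	# product index or a user's history plus a query into the model and print the top
	# related entities up to the requested rank cut, excluding products from the user's
	# training set.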
	# Prepare data.
	print("Reading data in %s" % FLAGS.data_dir)
	FLAGS.batch_size = 1
	
	data_set = data_util.Tensorflow_data(FLAGS.data_dir, FLAGS.input_train_dir, 'test')
	data_set.read_train_product_ids(FLAGS.input_train_dir)
	current_step = 0
	config = tf.ConfigProto()
	config.gpu_options.allow_growth = True
	with tf.Session(config=config) as sess:
		# Create model.
		print("Read model")
		model = create_model(sess, True, data_set, data_set.train_review_size)
		user_ranklist_map = {}
		user_ranklist_score_map = {}
		print('Start Interactive Process')
		words_to_train = float(FLAGS.max_train_epoch * data_set.word_count) + 1
		test_seq = [i for i in xrange(data_set.review_size)]
		model.setup_data_set(data_set, words_to_train)
		model.intialize_epoch(test_seq)
		model.prepare_test_epoch()
		has_next = True
		input_feed, has_next, uqr_pairs = model.get_test_batch()
		while True:
			# read information from stdin
			mode, user_idx, query_idx, product_idx = None, None, None, None
			test_feed = copy.deepcopy(input_feed)
			print('Enter rank cut:')
			rank_cut = int(sys.stdin.readline().strip())
			print('Enter mode:')
			mode = sys.stdin.readline().strip()
			# Output user+query or product?
			if mode == 'product': # product
				print('Enter product idx or name:')
				product_idx = data_set.get_idx(sys.stdin.readline().strip(), 'product')
				test_feed[model.product_idxs.name] = [product_idx]
				p_entity_list, _ = model.step(sess, test_feed, True, 'explain_product')
				# output results
				print('Product %d %s' % (product_idx, data_set.product_ids[product_idx]))
				for relation_name, entity_name, entity_scores in p_entity_list:
					data_set.print_entity_list(relation_name, entity_name, entity_scores[0], rank_cut, {})
			else: # user + query
				print('Enter user idx or name:')
				user_idx = data_set.get_idx(sys.stdin.readline().strip(), 'user')
				user_history_idx_dict =  data_set.get_user_history_idx(user_idx, model.max_history_length)
				print('Enter query idx:')
				query_idx = int(sys.stdin.readline().strip())
				query_word_idx = model.data_set.query_words[query_idx]

				for key in user_history_idx_dict:
					test_feed[model.user_history_dict[key]['idxs'].name] = user_history_idx_dict[key]

				test_feed[model.query_word_idxs.name] = [query_word_idx]
				uq_entity_list, _ = model.step(sess, test_feed, True, 'explain_user_query')
				remove_map = {
					'product' : data_set.user_train_product_set_list[user_idx]
				}
				print('User %d %s' % (user_idx, data_set.user_ids[user_idx]))
				print('Query %d %s' % (query_idx, '_'.join([data_set.words[x] for x in query_word_idx if x < len(data_set.words)])))
				# output results
				for relation_name, entity_name, entity_scores in uq_entity_list:
					data_set.print_entity_list(relation_name, entity_name, entity_scores[0], rank_cut, remove_map)

	return
Example #9
def self_test():
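    # Sanity check: run the regular training loop on a small sample dataset (with image
    # features) for FLAGS.max_train_epoch epochs without saving any checkpoint.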
    print("Self_test")
    FLAGS.data_dir = '/mnt/scratch/aiqy/MultiViewEmbedding/working/Amazon/small_sample/min_count1/'
    # Prepare data.
    print("Reading data in %s" % FLAGS.data_dir)

    data_set = data_util.Tensorflow_data(FLAGS.data_dir, FLAGS.input_train_dir,
                                         'train')
    data_set.sub_sampling(FLAGS.subsampling_rate)

    # add image features
    data_set.read_image_features(FLAGS.data_dir)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Create model.
        print("Creating model")
        model = create_model(sess, False, data_set, data_set.review_size)

        # This is the training loop.

        print('Start training')
        words_to_train = float(FLAGS.max_train_epoch * data_set.word_count) + 1
        current_words = 0.0
        previous_words = 0.0
        start_time = time.time()
        step_time, loss = 0.0, 0.0
        current_epoch = 0
        current_step = 0
        get_batch_time = 0.0
        training_seq = [i for i in xrange(data_set.review_size)]
        model.setup_data_set(data_set, words_to_train)
        while True:
            random.shuffle(training_seq)
            model.intialize_epoch(training_seq)
            has_next = True
            while has_next:
                time_flag = time.time()
                user_idxs, product_idxs, review_idxs, word_idxs, context_word_idxs, learning_rate, has_next = model.get_train_batch()
                get_batch_time += time.time() - time_flag

                if len(word_idxs) > 0:
                    time_flag = time.time()
                    step_loss, _ = model.step(sess, learning_rate, user_idxs,
                                              product_idxs, review_idxs,
                                              word_idxs, context_word_idxs,
                                              False)
                    #step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
                    loss += step_loss / FLAGS.steps_per_checkpoint
                    current_step += 1
                    step_time += time.time() - time_flag

                # Once in a while, we print statistics.
                if current_step % FLAGS.steps_per_checkpoint == 0:
                    print(
                        "Epoch %d Words %d/%d: lr = %5.3f loss = %6.2f words/sec = %5.2f prepare_time %.2f step_time %.2f\r"
                        % (current_epoch, model.finished_word_num,
                           model.words_to_train, learning_rate, loss,
                           (model.finished_word_num - previous_words) /
                           (time.time() - start_time), get_batch_time,
                           step_time),
                        end="")
                    step_time, loss = 0.0, 0.0
                    current_step = 1
                    get_batch_time = 0.0
                    sys.stdout.flush()
                    previous_words = model.finished_word_num
                    start_time = time.time()

            current_epoch += 1
            if current_epoch >= FLAGS.max_train_epoch:
                break
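Example #10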
def output_embedding():
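    # Export the learned word, context, and document embeddings to text files under
    # FLAGS.train_dir; when 'pv' is not in FLAGS.net_struct, the document embeddings
    # are recomputed batch by batch in 'output_doc_embedding' mode.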
    # Prepare data.
    print("Reading data in %s" % FLAGS.data_dir)

    #if 'pv' in FLAGS.net_struct:
    data_set = data_util.Tensorflow_data(FLAGS.data_dir, FLAGS.input_train_dir,
                                         'train', FLAGS.DF_sampling)
    #else:
    #	data_set = data_util.Tensorflow_data(FLAGS.data_dir, FLAGS.input_train_dir, 'test', FLAGS.DF_sampling)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Create model.
        print("Read model")
        model = create_model(sess, True, data_set, data_set.doc_num)
        print('Start saving embeddings')
        words_to_train = float(FLAGS.max_train_epoch * data_set.word_count) + 1
        test_seq = [i for i in xrange(data_set.doc_num)]
        model.setup_data_set(data_set, words_to_train)
        model.prepare_test_epoch(test_seq)
        has_next = True
        word_idxs, context_word_idxs, doc_idxs, doc_word_idxs, doc_lengths, learning_rate, has_next = model.get_test_batch()

        part_1, part_2 = model.step(sess, learning_rate, word_idxs,
                                    context_word_idxs, doc_idxs, doc_word_idxs,
                                    doc_lengths, True, FLAGS.test_mode)

        # record the results
        word_emb = part_1[0]
        data_set.output_embedding(word_emb, data_set.words,
                                  FLAGS.train_dir + 'word_emb.txt')
        if 'pv' in FLAGS.net_struct:
            doc_emb = part_1[1]
            doc_names = [x[0] for x in data_set.doc_info]
            data_set.output_embedding(doc_emb, doc_names,
                                      FLAGS.train_dir + 'doc_emb.txt')
            if len(part_2) > 0:
                context_emb = part_2[0]
                data_set.output_embedding(context_emb, data_set.words,
                                          FLAGS.train_dir + 'context_emb.txt')
        else:
            context_emb = part_1[1]
            data_set.output_embedding(context_emb, data_set.words,
                                      FLAGS.train_dir + 'context_emb.txt')
            if FLAGS.use_local_context:
                local_context_emb = part_2[0]
                data_set.output_embedding(
                    local_context_emb, data_set.words,
                    FLAGS.train_dir + 'local_context_emb.txt')

            #need to compute doc embedding one by one
            words_to_train = float(
                FLAGS.max_train_epoch * data_set.word_count) + 1
            test_seq = [i for i in xrange(data_set.doc_num)]
            model.setup_data_set(data_set, words_to_train)
            model.prepare_test_epoch(test_seq)
            has_next = True
            current_step = 0
            doc_emb = [None for x in xrange(len(data_set.doc_info))]
            while has_next:
                word_idxs, context_word_idxs, doc_idxs, doc_word_idxs, doc_lengths, learning_rate, has_next = model.get_test_batch()

                if len(doc_idxs) > 0:
                    doc_emb_output, _ = model.step(sess, learning_rate,
                                                   word_idxs,
                                                   context_word_idxs, doc_idxs,
                                                   doc_word_idxs, doc_lengths,
                                                   True,
                                                   'output_doc_embedding')
                    current_step += 1

                # record the results
                for i in xrange(len(doc_idxs)):
                    doc_idx = doc_idxs[i]
                    doc_emb[doc_idx] = doc_emb_output[i]

                if current_step % FLAGS.steps_per_checkpoint == 0:
                    print("Finish test doc %d/%d\r" %
                          (model.cur_doc_i, len(model.test_seq)),
                          end="")

            doc_names = [x[0] for x in data_set.doc_info]
            data_set.output_embedding(doc_emb, doc_names,
                                      FLAGS.train_dir + 'doc_emb.txt')

    return
Example #11
def train():
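	# Training loop for the product-search embedding model: shuffle the review indices every
	# epoch, feed batches through model.step(), report progress, and save a checkpoint once
	# FLAGS.max_train_epoch epochs are done.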
	# Prepare data.
	print("Reading data in %s" % FLAGS.data_dir)
	# Load and preprocess the dataset
	"""
	Contents returned:
	['product_ids', 'product_size', 'user_ids', 
	'user_size', 'words', 'vocab_size', 'query_words', 
	'query_max_length', 'word_count', 'vocab_distribute', 
	'review_info', 'review_text', 'review_size', 'sub_sampling_rate', 
	'review_distribute', 'product_distribute', 'product_query_idx']
	"""
	data_set = data_util.Tensorflow_data(FLAGS.data_dir, FLAGS.input_train_dir, 'train')
	# Compute the subsampling probability to reduce the impact of high-frequency words, as used in skip-gram
	data_set.sub_sampling(FLAGS.subsampling_rate)

	config = tf.ConfigProto()
	config.gpu_options.allow_growth = True
	#config.log_device_placement=True
	with tf.Session(config=config) as sess:
		# Create model.
		print("Creating model")
		# Build the model
		model = create_model(sess, False, data_set, data_set.review_size)

		print('Start training')
		# Total number of words to be trained on
		words_to_train = float(FLAGS.max_train_epoch * data_set.word_count) + 1
		# Number of words already trained on
		current_words = 0.0
		previous_words = 0.0
		# Start time
		start_time = time.time()
		# Time of the last checkpoint
		last_check_point_time = time.time()
		
		step_time, loss = 0.0, 0.0
		
		current_epoch = 0
		current_step = 0
		
		get_batch_time = 0.0
		
		# Indices of all reviews
		training_seq = [i for i in range(data_set.review_size)]

		# Attach the dataset and training statistics to the model
		'''
		self.data_set = data_set
		self.words_to_train = words_to_train
		self.finished_word_num = 0
		'''
		model.setup_data_set(data_set, words_to_train)
		
		while True:
			# Shuffle the order of the review indices
			random.shuffle(training_seq)
			'''
			self.train_seq = training_seq
			self.review_size = len(self.train_seq)
			self.cur_review_i = 0
			self.cur_word_i = 0
			'''
			model.intialize_epoch(training_seq)


			has_next = True
			while has_next:
				time_flag = time.time()

				user_idxs, product_idxs, query_word_idxs, review_idxs, word_idxs, context_word_idxs, learning_rate, has_next = model.get_train_batch()
				
				get_batch_time += time.time() - time_flag

				if len(word_idxs) > 0:

					time_flag = time.time()
					
					step_loss, _ = model.step(sess, learning_rate, user_idxs, product_idxs, query_word_idxs,
								review_idxs, word_idxs, context_word_idxs, False)
					#step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
					loss += step_loss / FLAGS.steps_per_checkpoint
					
					current_step += 1
					step_time += time.time() - time_flag

				# Once in a while, we print statistics.
				if current_step % FLAGS.steps_per_checkpoint == 0:
					print("Epoch %d Words %d/%d: lr = %5.3f loss = %6.2f words/sec = %5.2f prepare_time %.2f step_time %.2f\r" %
            				(current_epoch, model.finished_word_num, model.words_to_train, learning_rate, loss, 
            					(model.finished_word_num- previous_words)/(time.time() - start_time), get_batch_time, step_time), end="")
					step_time, loss = 0.0, 0.0
					current_step = 1
					get_batch_time = 0.0
					sys.stdout.flush()
					previous_words = model.finished_word_num
					start_time = time.time()
					#print('time: ' + str(time.time() - last_check_point_time))
					#if time.time() - last_check_point_time > FLAGS.seconds_per_checkpoint:
					#	checkpoint_path_best = os.path.join(FLAGS.train_dir, "ProductSearchEmbedding.ckpt")
					#	model.saver.save(sess, checkpoint_path_best, global_step=model.global_step)

			current_epoch += 1
			#checkpoint_path_best = os.path.join(FLAGS.train_dir, "ProductSearchEmbedding.ckpt")
			#model.saver.save(sess, checkpoint_path_best, global_step=model.global_step)
			if current_epoch >= FLAGS.max_train_epoch:	
				break
		checkpoint_path_best = os.path.join(FLAGS.train_dir, "ProductSearchEmbedding.ckpt")
		model.saver.save(sess, checkpoint_path_best, global_step=model.global_step)
Example #12
def find_explanation_path():
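    # For each (user, product, query, review) test tuple, score related entities in
    # 'explain_user_product' mode, take the three highest-scoring (relation, entity)
    # pairs, fill in the matching explanation template, and write the result, together
    # with up to five of the user's earlier reviews, to a CSV file.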
    print("Reading data in %s" % FLAGS.data_dir)

    data_set = data_util.Tensorflow_data(FLAGS.data_dir, FLAGS.input_train_dir,
                                         'test')
    data_set.read_train_product_ids(FLAGS.input_train_dir)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Create model.
        print("Read model")
        model = create_model(sess, True, data_set, data_set.train_review_size)
        words_to_train = float(FLAGS.max_train_epoch * data_set.word_count) + 1
        test_seq = [i for i in xrange(data_set.review_size)]
        model.setup_data_set(data_set, words_to_train)
        model.intialize_epoch(test_seq)
        model.prepare_test_epoch()
        input_feed, has_next, uqr_pairs = model.get_test_batch()

        test_feed = copy.deepcopy(input_feed)
        print('Generating explanations')

        with open(FLAGS.explanation_output_dir + 'explanation-output.csv',
                  mode='w') as write_csv_file:
            csv_writer = csv.writer(write_csv_file, delimiter=',')
            csv_writer.writerow([
                'sample_id', 'user', 'query', 'product', 'explanation',
                'previous_reviews'
            ])
            count = 0
            for (user_idx, product_idx, query_idx, review_idx) in uqr_pairs:
                sample_id = '-'.join([
                    str(user_idx),
                    str(product_idx),
                    str(query_idx),
                    str(review_idx)
                ])
                test_feed[model.user_idxs.name] = [user_idx]
                test_feed[model.product_idxs.name] = [product_idx]
                query_word_idx = model.data_set.query_words[query_idx]
                test_feed[model.query_word_idxs.name] = [query_word_idx]

                up_entity_list, _ = model.step(sess, test_feed, True,
                                               'explain_user_product')
                user = data_set.user_ids[user_idx]
                product = data_set.product_ids[product_idx]
                query = ' '.join([
                    data_set.words[x] for x in query_word_idx
                    if x < len(data_set.words)
                ])
                review_idxs = [
                    idx for idx, review in enumerate(data_set.review_info)
                    if review[0] == user_idx
                ]
                review_word_idxs = [
                    data_set.review_text[idx] for idx in review_idxs
                ]
                reviews = []
                for idx, review_word_idx in enumerate(review_word_idxs):
                    if idx >= 5:
                        break
                    # use a distinct name so the outer loop index 'idx' is not shadowed
                    review_txt = ' '.join([
                        data_set.words[word_idx] for word_idx in review_word_idx
                        if word_idx < len(data_set.words)
                    ])
                    reviews.append(str(idx + 1) + ') ' + review_txt)

                #merge all entity scores into one list to get max 3 values
                overall_tuple_list = []
                for relation_name, entity_name, entity_scores in up_entity_list:
                    entity_scores = entity_scores[0]
                    indexed_scores = list(enumerate(entity_scores))
                    curr_tuple_list = [(relation_name, entity_name, index,
                                        value)
                                       for index, value in indexed_scores]
                    overall_tuple_list.extend(curr_tuple_list)

                #get top 3 values and generate explanation for them
                top_valued_tuples = sorted(overall_tuple_list,
                                           key=operator.itemgetter(3),
                                           reverse=True)[:3]
                explanation = ''

                for index, top_tuple in enumerate(top_valued_tuples):
                    relation_name, entity_name, max_index, _ = top_tuple
                    word = data_set.entity_vocab[entity_name][max_index]
                    if relation_name == 'write':
                        curr_explanation = EXPLANATION_TMPL_WRITE.format(
                            user=user, product=product, word=word)
                    elif relation_name == 'brand':
                        curr_explanation = EXPLANATION_TMPL_BRAND.format(
                            user=user, product=product, word=word)
                    elif relation_name == 'categories':
                        curr_explanation = EXPLANATION_TMPL_CATEGORY.format(
                            user=user, product=product, word=word)
                    else:
                        curr_explanation = EXPLANATION_TMPL_RELATED.format(
                            user=user, product=product, word=word)

                    explanation += str(index +
                                       1) + '. ' + curr_explanation + '\n'

                csv_writer.writerow([
                    sample_id, user, query, product, explanation,
                    '\n'.join(reviews)
                ])
                count += 1

            print("Generated " + str(count) + " explanations")