def predict_words(self, word1, word2):
    inpH = InputHelper()
    w1, w2, dim = inpH.getWords(word1, word2, vocab_filepath, 30)
    graph = tf.get_default_graph()
    with graph.as_default():
        # Get the placeholders and the similarity tensor from the graph by name.
        input_x1 = graph.get_operation_by_name("input_x1").outputs[0]
        input_x2 = graph.get_operation_by_name("input_x2").outputs[0]
        dropout_keep_prob = graph.get_operation_by_name(
            "dropout_keep_prob").outputs[0]
        sim = graph.get_operation_by_name("accuracy/temp_sim").outputs[0]
        # Score the pair in both input orders and keep the larger similarity.
        r1 = self.session.run([sim], {
            input_x1: w1,
            input_x2: w2,
            dropout_keep_prob: 1.0
        })
        r2 = self.session.run([sim], {
            input_x1: w2,
            input_x2: w1,
            dropout_keep_prob: 1.0
        })
        r = max(r1, r2)
        return r[0][0]
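# Why score the pair in both orders? If the restored network is not perfectly
# symmetric, sim(a, b) and sim(b, a) can differ slightly; taking the max makes
# the returned score order-invariant. A runnable toy illustration (the numbers
# are made up, not model output):
sim_ab, sim_ba = 0.83, 0.79
symmetric_score = max(sim_ab, sim_ba)
print(symmetric_score)  # 0.83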
# Maximum sentence length (in words)
# MAX_DOCUMENT_LENGTH = 12
# MAX_DOCUMENT_LENGTH = 8
# MAX_DOCUMENT_LENGTH = 20 (7th-June)
MAX_DOCUMENT_LENGTH = 40
# Dev-set percentage
DEV_PERCENT = 10

# Misc Parameters
ALLOW_SOFT_PLACEMENT = True
LOG_DEVICE_PLACEMENT = False

print('Training started......................')
start_time = datetime.datetime.now()

inpH = InputHelper()
# Convert the raw training files into tokenized training files
# inpH.train_file_preprocess(TRAINING_FILES_RAW, TRAINING_FILES_FORMAT)
# sys.exit(0)
train_set, dev_set, vocab_processor, sum_no_of_batches = inpH.getDataSets(
    TRAINING_FILES_RAW, MAX_DOCUMENT_LENGTH, DEV_PERCENT, BATCH_SIZE)
# dev_batches = inpH.batch_iter(list(zip(dev_set[0], dev_set[1], dev_set[2])), BATCH_SIZE, 1)
# for index, dev_batch in enumerate(dev_batches):
#     print(index, dev_batch)
# sys.exit(0)
# for index, value in enumerate(dev_set[2]):
#     print(index, dev_set[0][index], dev_set[1][index], dev_set[2][index])
# sys.exit(0)
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") FLAGS = tf.flags.FLAGS FLAGS._parse_flags() print("\nParameters:") for attr, value in sorted(FLAGS.__flags.items()): print("{}={}".format(attr.upper(), value)) print("") if FLAGS.training_files==None: print "Input Files List is empty. use --training_files argument." exit() inpH = InputHelper() train_set, dev_set, vocab_processor,sum_no_of_batches = inpH.getDataSets(FLAGS.training_files, FLAGS.training_labeled_files, FLAGS.dev_files, FLAGS.dev_labeled_files, max_document_length, 10, FLAGS.batch_size) embedding_matrix = inpH.getEmbeddings(FLAGS.embedding_file,FLAGS.embedding_dim) entity_embedding_matrix = inpH.getEntityEmbeddings(FLAGS.entity_embedding_file,FLAGS.entity_embedding_dim) entity_vocab_size = len(entity_embedding_matrix) if mode == 'random': entity_embedding_matrix = np.asarray(None) # Training # ================================================== print("starting graph def") with tf.Graph().as_default(): session_conf = tf.ConfigProto( allow_soft_placement=FLAGS.allow_soft_placement, log_device_placement=FLAGS.log_device_placement)
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") FLAGS = tf.flags.FLAGS FLAGS._parse_flags() print("\nParameters:") for attr, value in sorted(FLAGS.__flags.items()): print("{}={}".format(attr.upper(), value)) print("") if FLAGS.training_files == None: print("Input Files List is empty. use --training_files argument.") exit() training_files = './data/questions' # Your train data and label label_path = './data/labels' inpH = InputHelper() train_set, dev_set, sum_no_of_batches = inpH.getDataSets(training_files, label_path, 10, FLAGS.batch_size) # 从scores中取出前五 get label using probs def get_label_using_probs(scores, top_number=5): index_list = np.argsort(scores)[-top_number:] index_list = index_list[::-1] return index_list # 计算f1的值 def f1_eval(predict_label_and_marked_label_list): """ :param predict_label_and_marked_label_list: 一个元组列表。例如 [ ([1, 2, 3, 4, 5], [4, 5, 6, 7]), ([3, 2, 1, 4, 7], [5, 7, 3]) ]
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

if FLAGS.training_files is None:
    print("Input Files List is empty. Use the --training_files argument.")
    exit()

training_paths = FLAGS.training_files.split(",")
multi_train_size = len(training_paths)
max_document_length = FLAGS.max_document_words

inpH = InputHelper()
train_set, dev_set, vocab_processor, sum_no_of_batches = inpH.getDataSets(
    training_paths, max_document_length, FLAGS.filter_h_pad, 10,
    FLAGS.batch_size)
inpH.loadW2V(FLAGS.word2vec, FLAGS.word2vec_format)

# Training
# ==================================================
print("starting graph def")
with tf.Graph().as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    print("started session")
    with sess.as_default():
        cnn = TextCNN(sequence_length=max_document_length,
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") FLAGS = tf.flags.FLAGS FLAGS.flag_values_dict() print("\nParameters:") for attr, value in sorted(FLAGS.__flags.items()): print("{}={}".format(attr.upper(), value)) print("") if FLAGS.training_files==None: print("Input Files List is empty. use --training_files argument.") exit() max_document_length=400 inpH = InputHelper() train_set, dev_set, vocab_processor,sum_no_of_batches = inpH.getDataSets(FLAGS.training_files,max_document_length, 10, FLAGS.batch_size, FLAGS.is_char_based) trainableEmbeddings=False if FLAGS.is_char_based==True: FLAGS.word2vec_model = False else: if FLAGS.word2vec_model==None: trainableEmbeddings=True print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n" "You are using word embedding based semantic similarity but " "word2vec model path is empty. It is Recommended to use --word2vec_model argument. " "Otherwise now the code is automatically trying to learn embedding values (may not help in accuracy)" "\n!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n") else: inpH.loadW2V(FLAGS.word2vec_model, FLAGS.word2vec_format)
# Training parameters
batch_size = 64
num_epochs = 300
evaluate_every = 1000
checkpoint_every = 1000

# Misc Parameters
allow_soft_placement = True
log_device_placement = False

trainableEmbeddings = False
training_files = "train_snli.txt"
max_document_length = 15

inpH = InputHelper()
train_set, dev_set, vocab_processor, sum_no_of_batches = inpH.getDataSets(
    training_files, 10, max_document_length, batch_size, is_char_based)

trainableEmbeddings = False
if is_char_based:
    word2vec_model = False

inpH.loadW2V(word2vec_model, word2vec_format)
vocab_size = len(vocab_processor.vocabulary_)
vocab_size


def _calculate_fan_in_and_fan_out(tensor):
    if tensor.ndimension() < 2:
        raise ValueError(
            "fan in and fan out can not be computed for tensor of size ",
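# The excerpt above cuts off inside _calculate_fan_in_and_fan_out. A sketch of
# the standard computation (this helper mirrors the one in torch.nn.init); the
# completion is an assumption, not the repository's verbatim code.
import torch

def _calculate_fan_in_and_fan_out_sketch(tensor):
    if tensor.ndimension() < 2:
        raise ValueError("fan in and fan out can not be computed "
                         "for tensor of size {}".format(tensor.size()))
    if tensor.ndimension() == 2:
        # Linear weight: (out_features, in_features)
        fan_in, fan_out = tensor.size(1), tensor.size(0)
    else:
        # Conv weight: multiply channel counts by the receptive field size
        receptive_field_size = tensor[0][0].numel()
        fan_in = tensor.size(1) * receptive_field_size
        fan_out = tensor.size(0) * receptive_field_size
    return fan_in, fan_out

print(_calculate_fan_in_and_fan_out_sketch(torch.zeros(64, 32, 3, 3)))  # (288, 576)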
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") FLAGS = tf.flags.FLAGS FLAGS._parse_flags() print("\nParameters:") for attr, value in sorted(FLAGS.__flags.items()): print("{}={}".format(attr.upper(), value)) print("") if FLAGS.eval_filepath == None or FLAGS.vocab_filepath == None or FLAGS.model == None: print("Eval or Vocab filepaths are empty.") exit() # load data and map id-transform based on training time vocabulary inpH = InputHelper() x1_test, x2_test, ent_x1_test, ent_x2_test, y_test, x1_temp, x2_temp, add_fea_test = inpH.getTestDataSet( FLAGS.eval_filepath, FLAGS.eval_labeled_filepath, FLAGS.vocab_filepath, max_document_length) #embedding_matrix = inpH.getEmbeddings(FLAGS.embedding_file,FLAGS.embedding_dim) #entity_embedding_matrix = inpH.getEntityEmbeddings(FLAGS.entity_embedding_file,FLAGS.hidden_units) print("\nEvaluating...\n") # Evaluation # ================================================== checkpoint_file = FLAGS.model print checkpoint_file graph = tf.Graph() with graph.as_default(): session_conf = tf.ConfigProto(
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") FLAGS = tf.flags.FLAGS FLAGS._parse_flags() print("\nParameters:") for attr, value in sorted(FLAGS.__flags.items()): print("{}={}".format(attr.upper(), value)) print("") if FLAGS.eval_filepath==None or FLAGS.vocab_filepath==None or FLAGS.model==None : print("Eval or Vocab filepaths are empty.") exit() # load data and map id-transform based on training time vocabulary inpH = InputHelper() x1_test,x2_test,y_test = inpH.getTestDataSet(FLAGS.eval_filepath, FLAGS.vocab_filepath, 30) print("\nEvaluating...\n") # Evaluation # ================================================== checkpoint_file = FLAGS.model print checkpoint_file graph = tf.Graph() with graph.as_default(): session_conf = tf.ConfigProto( allow_soft_placement=FLAGS.allow_soft_placement, log_device_placement=FLAGS.log_device_placement) sess = tf.Session(config=session_conf) with sess.as_default():
print(EVAL_FILE)
print(OUTPUT_FILE)

# Eval Parameters
BATCH_SIZE = 64                # Batch size
VOCAB_FILE = './vocab/vocab'   # Vocabulary used during training
MODEL = './models/model-4000'  # Trained model to load
ALLOW_SOFT_PLACEMENT = True
LOG_DEVICE_PLACEMENT = False
# Maximum sentence length (in words)
MAX_DOCUMENT_LENGTH = 40

# Load data and map id-transform based on the training-time vocabulary
inpH = InputHelper()
x1_test, x2_test = inpH.getTestDataSet(EVAL_FILE, VOCAB_FILE,
                                       MAX_DOCUMENT_LENGTH)
# for index, _ in enumerate(x1_test):
#     print(index, x1_test[index], x2_test[index])

print("\nEvaluating...\n")

# Evaluation
# ==================================================
checkpoint_file = MODEL
print(checkpoint_file)
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=ALLOW_SOFT_PLACEMENT,
def getSentence_Embedding(self, x1, x2, max_document_length):
    checkpoint_dir = os.path.abspath(
        os.path.join(self.bilstm_dir, "checkpoints"))
    print(checkpoint_dir)
    ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
    # print('ckpt:', ckpt)
    checkpoint_file = ckpt.model_checkpoint_path
    vocab_file = os.path.join(checkpoint_dir, "vocab")

    inpH = InputHelper()
    vocab_processor = MyVocabularyProcessor(max_document_length,
                                            min_frequency=0)
    vocab_processor = vocab_processor.restore(vocab_file)
    tmp = []
    (x1_index, x2_index, mask_x1, mask_x2, tmp) = inpH.get_data(
        vocab_processor, x1, x2, tmp, max_document_length)
    idfModel = loadIDFModel(self.idfModel_file)

    # Extract the word:id mapping from the vocabulary object.
    vocab_dict = vocab_processor.vocabulary_._mapping
    vocab_id_w = dict((y, x) for x, y in vocab_dict.items())

    print("\nGenerating Sentence Embedding Result...\n")
    graph = tf.Graph()
    with graph.as_default():
        sess = tf.Session()
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph(
                "{}.meta".format(checkpoint_file))
            sess.run(tf.initialize_all_variables())
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name;
            # each output is a list with only one element
            input_x1 = graph.get_operation_by_name("input_x1").outputs[0]
            input_x2 = graph.get_operation_by_name("input_x2").outputs[0]
            sentence_representation1 = graph.get_operation_by_name(
                "sentence_embedding/Representation1").outputs[0]
            sentence_representation2 = graph.get_operation_by_name(
                "sentence_embedding/Representation2").outputs[0]

            print("Sentence vector shape after sentence modeling")
            r1, r2 = sess.run(
                [sentence_representation1, sentence_representation2], {
                    input_x1: x1_index,
                    input_x2: x2_index
                })

            # Apply the attention mechanism
            representation1 = self.getAttention_M(r1, mask_x1, x1, x1_index,
                                                  vocab_id_w, idfModel)
            representation2 = self.getAttention_M(r2, mask_x2, x2, x2_index,
                                                  vocab_id_w, idfModel)
            return representation1, representation2
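# The reverse vocabulary built above (vocab_id_w) is a plain dict inversion of
# the word-to-id mapping. A runnable toy version of the same trick, with a
# made-up mapping:
vocab_dict = {"<UNK>": 0, "pay": 1, "bill": 2}
vocab_id_w = dict((y, x) for x, y in vocab_dict.items())
print(vocab_id_w[1])  # 'pay'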
batch_size = FLAGS.batch_size
num_epochs = FLAGS.num_epochs

print("\nParameters:")
for attr, value in sorted(FLAGS.flag_values_dict().items()):
    print("{}={}".format(attr.upper(), value))
print("")

if FLAGS.database is None:
    print("Input Files List is empty. Use the -database argument.")
    exit()

max_document_length = 15
# max_document_length = sys.maxint  # attempt to read all words in a document

inpH = InputHelper()
# train_set, dev_set, vocab_processor, sum_no_of_batches = inpH.getDataSets(
#     FLAGS.database, max_document_length, 10, FLAGS.batch_size, FLAGS.is_char_based)
num_docs = inpH.get_num_docs(FLAGS.training_folder)
db = lite.connect(FLAGS.database)
cursor = db.cursor()
emb_map, vocab_processor = inpH.getEmbeddingsMap(cursor, max_document_length,
                                                 num_docs)
train_count, dev_count = inpH.get_counts(FLAGS.training_folder)[0:2]
total_count = train_count + dev_count
sum_no_of_batches = int(math.ceil(float(train_count) / batch_size))
dev_no_of_batches = int(math.ceil(float(dev_count) / batch_size))
train_set = inpH.my_train_batch(emb_map, train_count, FLAGS.batch_size,
                                num_epochs)
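# The batch counts above are ceil(count / batch_size), so a final partial
# batch still counts. A runnable check with made-up counts:
import math
print(int(math.ceil(float(1000) / 64)))  # 16 batches per epoch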
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") FLAGS = tf.flags.FLAGS FLAGS._parse_flags() print("\nParameters:") for attr, value in sorted(FLAGS.__flags.items()): print("{}={}".format(attr.upper(), value)) print("") if FLAGS.training_files==None: print("Input Files List is empty. use --training_files argument.") exit() max_document_length=15 inpH = InputHelper() train_set, dev_set, vocab_processor,sum_no_of_batches = inpH.getDataSets(FLAGS.training_files,max_document_length, 10, FLAGS.batch_size, FLAGS.is_char_based) trainableEmbeddings=False if FLAGS.is_char_based==True: FLAGS.word2vec_model = False else: if FLAGS.word2vec_model==None: trainableEmbeddings=True print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n" "You are using word embedding based semantic similarity but " "word2vec model path is empty. It is Recommended to use --word2vec_model argument. " "Otherwise now the code is automatically trying to learn embedding values (may not help in accuracy)" "\n!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n") else: inpH.loadW2V(FLAGS.word2vec_model, FLAGS.word2vec_format)
tf.flags.DEFINE_string("database", '../plag.db', "Database path (default: ../plag.db)") tf.flags.DEFINE_boolean("auto_chunk", True, "Automatically set chunk_size (default: True") tf.flags.DEFINE_string("folder", "ds", "Folder in which datasets will be created. (default: ds") tf.flags.DEFINE_boolean("intra_only", True, "If true, combine sentences of same document only. If false, combines sentences between all documents of a same author. (default: True") FLAGS = tf.flags.FLAGS batch_size = FLAGS.batch_size percent_dev = FLAGS.percent_dev percent_test = FLAGS.percent_test database = FLAGS.database num_docs = FLAGS.num_docs print("\nParameters:") for attr, value in sorted(FLAGS.flag_values_dict().iteritems()): print("{}={}".format(attr.upper(), value)) print("") inpH = InputHelper() start_time = time.time() db = lite.connect(database) cursor = db.cursor() total_count = inpH.my_get_counts(cursor, FLAGS.intra_only, num_docs) train_count, dev_count, test_count = inpH.build_datasets(cursor, total_count, batch_size, percent_dev, percent_test, FLAGS.auto_chunk, FLAGS.folder, FLAGS.intra_only, num_docs) end_time = time.time() print('Time elapsed on dataset creation for {} documents: {} seconds.'.format('all' if num_docs < 0 else num_docs, round(end_time - start_time, 2)))
# Self-trained word2vec model
WORD2VEC_MODEL_SELF = './word2vec_model.bin'
# word2vec model (a pre-trained Chinese model can be used instead)
# WORD2VEC_MODEL = '../word2vecmodel/news_12g_baidubaike_20g_novel_90g_embedding_64.bin'
WORD2VEC_MODEL = WORD2VEC_MODEL_SELF
# The model format is bin
WORD2VEC_FORMAT = 'bin'
# Convolution filter sizes
filter_size = [1, 2, SENTENCE_LENGTH]
# Fully-connected layer dropout keep probability
FULL_CONNECT_LAYER_DROPOUT = 0.8

inpH = InputHelper()
# Train your own word2vec model
# inpH.gen_word2vec(TRAINING_FILES_RAW, WORD2VEC_MODEL_SELF, EMBEDDING_DIM)
# exit(0)
train_set, dev_set, vocab_processor, sum_no_of_batches = inpH.getDataSets(
    TRAINING_FILES_RAW, SENTENCE_LENGTH, DEV_PERCENT, BATCH_SIZE)
# print(type(vocab_processor.vocabulary_._mapping))
# for index, k in enumerate(vocab_processor.vocabulary_._mapping):
#     print('vocab-{}, {}:{}'.format(index, k, vocab_processor.vocabulary_._mapping[k]))
#     print('======:{}'.format(vocab_processor.vocabulary_.reverse(vocab_processor.vocabulary_._mapping[k])))
# origin_sentence = '为啥我花呗叫话费都交不了'
# print(origin_sentence)
# sentence_list = list(vocab_processor.transform(np.asarray([origin_sentence])))
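# The commented-out inpH.gen_word2vec call above trains the self-owned model.
# A minimal sketch of what that step might look like with gensim 3.x (the
# size= keyword became vector_size= in gensim 4.0); the tokenized-corpus path
# is an assumption, not a file name from this repository.
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

sentences = LineSentence('./data/train_tokenized.txt')  # assumed corpus path
model = Word2Vec(sentences, size=EMBEDDING_DIM, window=5, min_count=1,
                 workers=4)
model.wv.save_word2vec_format(WORD2VEC_MODEL_SELF, binary=True)  # 'bin' format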
FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

if FLAGS.eval_filepath is None or FLAGS.model is None:
    print("Eval or Vocab filepaths are empty.")
    exit()

w2v, model_dict, index_to_word = load_word_2vec_model.get_model_embeddings()

# Load data and map id-transform based on the training-time vocabulary
inpH = InputHelper()
x1_test, x2_test, ids = inpH.getTestDataSet(FLAGS.eval_filepath, 30,
                                            model_dict)
wr = open('submissions_train.csv', 'w')

print("\nEvaluating...\n")

# Evaluation
# ==================================================
checkpoint_file = FLAGS.model
print(checkpoint_file)
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
print("\nParameters:") for attr, value in sorted(FLAGS.flag_values_dict().iteritems()): print("{}={}".format(attr.upper(), value)) print("") # returns a list of tuples, each containing the id for each document found in the database. def get_document_ids(cursor): sql = 'select id from article' cursor.execute(sql) return cursor.fetchall() ########## main ########## inpH = InputHelper() db = lite.connect(FLAGS.database) cursor = db.cursor() doc_ids = get_document_ids(cursor) doc_count = len(doc_ids) if os.path.exists(FLAGS.output_dir): shutil.rmtree(FLAGS.output_dir, ignore_errors=True) os.mkdir(FLAGS.output_dir) i = 0 for doc_id in doc_ids: sql = 'select count(*) from sentence where fk_article_id = ?' cursor.execute(sql, doc_id) tuple_count = cursor.fetchall()[0][0]
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") FLAGS = tf.flags.FLAGS FLAGS._parse_flags() print("\nParameters:") for attr, value in sorted(FLAGS.__flags.items()): print("{}={}".format(attr.upper(), value)) print("") if FLAGS.eval_filepath == None or FLAGS.vocab_filepath == None or FLAGS.model == None: print("Eval or Vocab filepaths are empty.") exit() # load data and map id-transform based on training time vocabulary inpH = InputHelper() x_test, y_test = inpH.getTestDataSet(FLAGS.eval_filepath, FLAGS.vocab_filepath, 600, 5) print("\nEvaluating...\n") # Evaluation # ================================================== checkpoint_file = FLAGS.model print checkpoint_file graph = tf.Graph() with graph.as_default(): session_conf = tf.ConfigProto( allow_soft_placement=FLAGS.allow_soft_placement, log_device_placement=FLAGS.log_device_placement) sess = tf.Session(config=session_conf)
BATCH_SIZE = 64
# Validation-set file
EVAL_FILEPATH = 'validation.txt0'
# Vocabulary (generated during training)
VOCAB_FILEPATH = 'runs/1528462228/checkpoints/vocab'
# Model file
MODEL = 'runs/1528462228/checkpoints/model-10000'
# Maximum sentence length (in words)
MAX_DOCUMENT_LENGTH = 30
# Misc Parameters
ALLOW_SOFT_PLACEMENT = True
LOG_DEVICE_PLACEMENT = False

inpH = InputHelper()
x1_test, x2_test, y_test = inpH.getTestDataSet(EVAL_FILEPATH, VOCAB_FILEPATH,
                                               MAX_DOCUMENT_LENGTH)
# for index, value in enumerate(x1_test):
#     print(index, x1_test[index], x2_test[index], y_test[index])
# sys.exit(0)

print("\nEvaluating...\n")

# Evaluation
# ==================================================
checkpoint_file = MODEL
print(checkpoint_file)
graph = tf.Graph()
with graph.as_default():
class Trainer(NLPApp):
    def __init__(self, FLAGs):
        self.FLAGS = FLAGs
        self.inpH = InputHelper()
        self.session_conf = tf.ConfigProto(
            allow_soft_placement=self.FLAGS.allow_soft_placement,
            log_device_placement=self.FLAGS.log_device_placement)

    def __load_word2vec(self):
        trainableEmbeddings = False
        if self.FLAGS.is_char_based:
            self.FLAGS.word2vec_model = False
        else:
            if self.FLAGS.word2vec_model is None:
                trainableEmbeddings = True
                print(
                    "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"
                    "You are using word-embedding-based semantic similarity "
                    "but the word2vec model path is empty. It is recommended "
                    "to pass the --word2vec_model argument; otherwise the "
                    "code will try to learn embedding values from scratch "
                    "(which may hurt accuracy)."
                    "\n!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n")
            else:
                self.inpH.loadW2V(self.FLAGS.word2vec_model,
                                  self.FLAGS.word2vec_format)
        return trainableEmbeddings

    def __build_storage_path(self):
        '''
        Ex. out_dir: runs/1548973755
            checkpoint_dir_abs: runs/1548973755/checkpoints
            checkpoint_model_abs: runs/1548973755/checkpoints/model
            checkpoint_saved_model_abs: runs/1548973755/checkpoints/model-XXX
            vocab_path: runs/1548973755/checkpoints/vocab
        :return:
        '''
        # e.g. run/1412312455/checkpoints
        checkpoint_dir_abs = os.path.abspath(self.FLAGS.checkpoint_dir)
        if self.FLAGS.checkpoint_dir and os.path.exists(checkpoint_dir_abs):
            # e.g. run/1412312455/
            print("Checkpoint dir:{} exists, loading vocab and weights from it"
                  .format(self.FLAGS.checkpoint_dir))
            out_dir = os.path.join(checkpoint_dir_abs, os.pardir)
        else:
            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(
                os.path.join(os.path.curdir, "runs", timestamp))
            checkpoint_dir_abs = os.path.abspath(
                os.path.join(out_dir, "checkpoints"))
            os.makedirs(checkpoint_dir_abs)
        checkpoint_model_abs = os.path.join(checkpoint_dir_abs, "model")
        print("Writing to {}\n".format(out_dir))
        checkpoint_saved_model_abs = os.path.join(checkpoint_dir_abs,
                                                  self.FLAGS.model)
        vocab_path = os.path.join(checkpoint_dir_abs, "vocab")
        return (out_dir, checkpoint_dir_abs, checkpoint_model_abs,
                checkpoint_saved_model_abs, vocab_path)

    def run(self):
        '''
        Main logic of the app
        :return:
        '''
        # Define all the paths
        (out_dir, checkpoint_dir_abs, checkpoint_model_abs,
         checkpoint_saved_model_abs, vocab_path) = self.__build_storage_path()
        # Split out training and validation data
        train_set, dev_set, vocab_processor, sum_no_of_batches = \
            self.inpH.getDataSets(self.FLAGS.training_files,
                                  self.FLAGS.max_document_length, 10,
                                  self.FLAGS.batch_size,
                                  self.FLAGS.is_char_based, vocab_path)
        # Structure the model, either by building it or reloading a checkpoint
        if self.FLAGS.model and os.path.exists(
                "{}.meta".format(checkpoint_saved_model_abs)):
            print("loading trained model from checkpoint:{}".format(
                checkpoint_saved_model_abs))
            saver, sess, input_tensors, result_tensors, metric_ops = \
                self.__launch_from_load(checkpoint_saved_model_abs, out_dir)
        else:
            trainableEmbeddings = self.__load_word2vec()
            initW = self.__init_embedding_matrix(vocab_processor)
            saver, sess, input_tensors, result_tensors, metric_ops = \
                self.__launch_from_build(vocab_processor, trainableEmbeddings,
                                         out_dir, checkpoint_dir_abs, initW)
        # Train batches
        self.__run_batches(sess, sum_no_of_batches, train_set, dev_set, saver,
                           input_tensors, result_tensors, metric_ops,
                           checkpoint_model_abs)
        # Don't forget to close the session
        sess.close()

    def __launch_from_load(self, model_path, out_dir):
        graph = tf.Graph()
        # This `with` block is necessary even though the graph is also passed
        # to the Session constructor below.
        with graph.as_default():
            saver = tf.train.import_meta_graph(
tf.train.import_meta_graph("{}.meta".format(model_path)) sess = tf.Session(graph=graph, config=self.session_conf) with sess.as_default(): saver.restore(sess, model_path) # Get the placeholders from the graph by name input_x1 = graph.get_operation_by_name("input_x1").outputs[0] input_x2 = graph.get_operation_by_name("input_x2").outputs[0] input_y = graph.get_operation_by_name("input_y").outputs[0] dropout_keep_prob = graph.get_operation_by_name( "dropout_keep_prob").outputs[0] global_step = graph.get_operation_by_name("global_step").outputs[0] loss = graph.get_operation_by_name("loss/loss_fun").outputs[0] accuracy = graph.get_operation_by_name( "accuracy/accuracy").outputs[0] distance = graph.get_operation_by_name( "output/distance").outputs[0] temp_sim = graph.get_operation_by_name( "accuracy/temp_sim").outputs[0] # Tensors we want to evaluate tr_op_set = graph.get_operation_by_name("tr_op_set").outputs[0] train_summary_op = graph.get_operation_by_name( "train_summary_op").outputs[0] dev_summary_op = graph.get_operation_by_name( "dev_summary_op").outputs[0] train_summary_dir = os.path.join(out_dir, "summaries", "train") train_summary_writer = tf.summary.FileWriter( train_summary_dir, graph) dev_summary_dir = os.path.join(out_dir, "summaries", "dev") dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, graph) input_tensors = InputTensors(input_x1, input_x2, input_y, dropout_keep_prob) result_tensors = ResultTensors(global_step, loss, accuracy, distance, temp_sim) metric_ops = MetricOps(tr_op_set, train_summary_op, dev_summary_op, train_summary_writer, dev_summary_writer) return saver, sess, input_tensors, result_tensors, metric_ops def __launch_from_build(self, vocab_processor, trainableEmbeddings, out_dir, checkpoint_dir_abs, initW): # ================================================== print("starting graph def") graph = tf.Graph() with graph.as_default(): # will use default_graph as input para, and current default_graph is the `graph` sess = tf.Session(graph=graph, config=self.session_conf) print("started session") with sess.as_default(): if self.FLAGS.is_char_based: siameseModel = SiameseLSTM( sequence_length=self.FLAGS.max_document_length, vocab_size=len(vocab_processor.vocabulary_), embedding_size=self.FLAGS.embedding_dim, hidden_units=self.FLAGS.hidden_units, l2_reg_lambda=self.FLAGS.l2_reg_lambda, batch_size=self.FLAGS.batch_size) else: siameseModel = SiameseLSTMw2v( sequence_length=self.FLAGS.max_document_length, vocab_size=len(vocab_processor.vocabulary_), embedding_size=self.FLAGS.embedding_dim, hidden_units=self.FLAGS.hidden_units, l2_reg_lambda=self.FLAGS.l2_reg_lambda, batch_size=self.FLAGS.batch_size, trainableEmbeddings=trainableEmbeddings) # Define Training procedure global_step = tf.Variable(0, name="global_step", trainable=False) optimizer = tf.train.AdamOptimizer(1e-3) print("initialized siameseModel object") grads_and_vars = optimizer.compute_gradients(siameseModel.loss) tr_op_set = optimizer.apply_gradients(grads_and_vars, global_step=global_step, name='tr_op_set') print("defined training_ops") # Keep track of gradient values and sparsity (optional) grad_summaries = [] for g, v in grads_and_vars: if g is not None: grad_hist_summary = tf.summary.histogram( "{}/grad/hist".format(v.name), g) sparsity_summary = tf.summary.scalar( "{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g)) grad_summaries.append(grad_hist_summary) grad_summaries.append(sparsity_summary) grad_summaries_merged = tf.summary.merge(grad_summaries) print("defined gradient summaries") # 
                # Summaries for loss and accuracy
                loss_summary = tf.summary.scalar("loss", siameseModel.loss)
                acc_summary = tf.summary.scalar("accuracy",
                                                siameseModel.accuracy)
                # Train summaries
                train_summary_op = tf.summary.merge(
                    [loss_summary, acc_summary, grad_summaries_merged])
                train_summary_op = tf.identity(train_summary_op,
                                               'train_summary_op')
                train_summary_dir = os.path.join(out_dir, "summaries",
                                                 "train")
                train_summary_writer = tf.summary.FileWriter(
                    train_summary_dir, sess.graph)
                # Dev summaries
                dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
                dev_summary_op = tf.identity(dev_summary_op, 'dev_summary_op')
                dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
                dev_summary_writer = tf.summary.FileWriter(
                    dev_summary_dir, sess.graph)
                saver = tf.train.Saver(tf.global_variables(), max_to_keep=100)
                sess.run(tf.global_variables_initializer())
                if initW is not None:
                    sess.run(siameseModel.W.assign(initW))
                graphpb_txt = str(graph.as_graph_def())
                with open(os.path.join(checkpoint_dir_abs, "graphpb.txt"),
                          'w') as f:
                    f.write(graphpb_txt)
        input_tensors = InputTensors(siameseModel.input_x1,
                                     siameseModel.input_x2,
                                     siameseModel.input_y,
                                     siameseModel.dropout_keep_prob)
        result_tensors = ResultTensors(global_step, siameseModel.loss,
                                       siameseModel.accuracy,
                                       siameseModel.distance,
                                       siameseModel.temp_sim)
        metric_ops = MetricOps(tr_op_set, train_summary_op, dev_summary_op,
                               train_summary_writer, dev_summary_writer)
        return saver, sess, input_tensors, result_tensors, metric_ops

    def __run_batches(self, sess, sum_no_of_batches, train_set, dev_set,
                      saver, input_tensors, result_tensors, metric_ops,
                      checkpoint_prefix):
        # Generate batches: a sequence of
        # [question1_tokenized, question2_tokenized, label]
        batches = self.inpH.batch_iter(
            list(zip(train_set[0], train_set[1], train_set[2])),
            self.FLAGS.batch_size, self.FLAGS.num_epochs)
        max_validation_acc = 0.0
        for nn in range(sum_no_of_batches * self.FLAGS.num_epochs):
            batch = next(batches)
            if len(batch) < 1:
                continue
            x1_batch, x2_batch, y_batch = zip(*batch)
            if len(y_batch) < 1:
                continue
            self.__train_step(sess, input_tensors, result_tensors, metric_ops,
                              x1_batch, x2_batch, y_batch)
            current_step = tf.train.global_step(sess,
                                                result_tensors.global_step)
            sum_acc = 0.0
            if current_step % self.FLAGS.evaluate_every == 0:
                print("\nEvaluation:")
                dev_batches = self.inpH.batch_iter(
                    list(zip(dev_set[0], dev_set[1], dev_set[2])),
                    self.FLAGS.batch_size, 1)
                for db in dev_batches:
                    if len(db) < 1:
                        continue
                    x1_dev_b, x2_dev_b, y_dev_b = zip(*db)
                    if len(y_dev_b) < 1:
                        continue
                    acc = self.__dev_step(sess, input_tensors, result_tensors,
                                          metric_ops, x1_dev_b, x2_dev_b,
                                          y_dev_b)
                    sum_acc = sum_acc + acc
                print("")
            # If the model's accuracy on the validation data improved,
            # print the metrics and save the model
            if current_step % self.FLAGS.checkpoint_every == 0:
                if sum_acc >= max_validation_acc:
                    max_validation_acc = sum_acc
                    saver.save(sess, checkpoint_prefix,
                               global_step=current_step)
                    tf.train.write_graph(sess.graph.as_graph_def(),
                                         checkpoint_prefix,
                                         "graph" + str(nn) + ".pb",
                                         as_text=True)
                    print("Saved model {} with sum_accuracy={} checkpoint "
                          "to {}\n".format(nn, max_validation_acc,
                                           checkpoint_prefix))

    def __train_step(self, sess, input_tensors, result_tensors, metric_ops,
                     x1_batch, x2_batch, y_batch):
        """
        A single training step
        """
        # Why flip the order of the input sentence pairs from time to time?
        # Shouldn't the loss be independent of the input order? (For a
        # weight-sharing Siamese network with a symmetric distance it is;
        # the flip mainly guards against any residual asymmetry.)
        if random() > 0.5:
            x1_batch, x2_batch = x2_batch, x1_batch
        feed_dict = {
            input_tensors.input_x1: x1_batch,
            input_tensors.input_x2: x2_batch,
            input_tensors.input_y: y_batch,
            input_tensors.dropout_keep_prob: 1.0,
        }
        _, step, loss, accuracy, dist, sim, summaries = sess.run([
            metric_ops.tr_op_set, result_tensors.global_step,
            result_tensors.loss, result_tensors.accuracy,
            result_tensors.distance, result_tensors.temp_sim,
            metric_ops.train_summary_op
        ], feed_dict)
        time_str = datetime.datetime.now().isoformat()
        print("TRAIN {}: step {}, loss {:g}, acc {:g}".format(
            time_str, step, loss, accuracy))
        metric_ops.train_summary_writer.add_summary(summaries, step)

    def __dev_step(self, sess, input_tensors, result_tensors, metric_ops,
                   x1_batch, x2_batch, y_batch):
        """
        A single evaluation step
        """
        if random() > 0.5:
            x1_batch, x2_batch = x2_batch, x1_batch
        feed_dict = {
            input_tensors.input_x1: x1_batch,
            input_tensors.input_x2: x2_batch,
            input_tensors.input_y: y_batch,
            input_tensors.dropout_keep_prob: 1.0,
        }
        step, loss, accuracy, sim, summaries = sess.run([
            result_tensors.global_step, result_tensors.loss,
            result_tensors.accuracy, result_tensors.temp_sim,
            metric_ops.dev_summary_op
        ], feed_dict)
        time_str = datetime.datetime.now().isoformat()
        print("DEV {}: step {}, loss {:g}, acc {:g}".format(
            time_str, step, loss, accuracy))
        metric_ops.dev_summary_writer.add_summary(summaries, step)
        # print(y_batch, sim)
        return accuracy

    def __init_embedding_matrix(self, vocab_processor):
        if self.FLAGS.word2vec_model:
            # Initialize the embedding matrix with random uniform values
            initW = np.random.uniform(
                -0.25, 0.25,
                (len(vocab_processor.vocabulary_), self.FLAGS.embedding_dim))
            # initW = np.zeros(shape=(len(vocab_processor.vocabulary_), FLAGS.embedding_dim))
            # Load any vectors from the word2vec model
            print("initializing initW with pre-trained word2vec embeddings")
            for w in vocab_processor.vocabulary_._mapping:
                arr = []
                # Strip all non-alphanumeric characters from the word
                s = re.sub('[^0-9a-zA-Z]+', '', w)
                if w in self.inpH.pre_emb:
                    arr = self.inpH.pre_emb[w]
                elif w.lower() in self.inpH.pre_emb:
                    arr = self.inpH.pre_emb[w.lower()]
                elif s in self.inpH.pre_emb:
                    arr = self.inpH.pre_emb[s]
                elif s.isdigit():
                    arr = self.inpH.pre_emb["zero"]
                if len(arr) > 0:
                    # Sometimes the stored vector starts with an offset;
                    # using the last embedding_dim numbers solves the problem.
                    if len(arr) > self.FLAGS.embedding_dim:
                        arr = arr[-self.FLAGS.embedding_dim:]
                    idx = vocab_processor.vocabulary_.get(w)
                    initW[idx] = np.asarray(arr).astype(np.float32)
                # If arr is [], the word does not exist in the trained
                # word2vec model, so the initial random weights are kept.
            print("Done assigning initW. len=" + str(len(initW)))
            # initW now serves as the embedding matrix in memory; delete the
            # pre-embedding hash held by inpH to free memory!
            self.inpH.deletePreEmb()
            gc.collect()
            return initW
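# A hedged sketch of driving the Trainer class end to end. It assumes the
# usual tf.flags definitions from the training scripts above have been made;
# the __main__ guard itself is an assumption, not code from the repository.
if __name__ == "__main__":
    FLAGS = tf.flags.FLAGS  # assumes flags such as --training_files are DEFINEd
    trainer = Trainer(FLAGS)
    trainer.run()           # builds or reloads the model, then trains batches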
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") FLAGS = tf.flags.FLAGS FLAGS._parse_flags() print("\nParameters:") for attr, value in sorted(FLAGS.__flags.items()): print("{}={}".format(attr.upper(), value)) print("") if FLAGS.eval_filepath==None or FLAGS.vocab_filepath==None or FLAGS.model==None : print("Eval or Vocab filepaths are empty.") exit() # load data and map id-transform based on training time vocabulary inpH = InputHelper() x1_test,x2_test,y_test = inpH.getTestDataSet1(FLAGS.eval_filepath, FLAGS.vocab_filepath, 30) print("\nEvaluating...\n") # Evaluation # ================================================== checkpoint_file = FLAGS.model print checkpoint_file graph = tf.Graph() with graph.as_default(): session_conf = tf.ConfigProto( allow_soft_placement=FLAGS.allow_soft_placement, log_device_placement=FLAGS.log_device_placement) sess = tf.Session(config=session_conf) with sess.as_default():
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement") tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") FLAGS = tf.flags.FLAGS FLAGS._parse_flags() print("\nParameters:") for attr, value in sorted(FLAGS.__flags.items()): print("{}={}".format(attr.upper(), value)) print("") if FLAGS.training_files==None: print "Input Files List is empty. use --training_files argument." exit() max_document_length=30 inpH = InputHelper() train_set, dev_set, vocab_processor,sum_no_of_batches = inpH.getDataSets(FLAGS.training_files,max_document_length, 10, FLAGS.batch_size) # Training # ================================================== print("starting graph def") with tf.Graph().as_default(): session_conf = tf.ConfigProto( allow_soft_placement=FLAGS.allow_soft_placement, log_device_placement=FLAGS.log_device_placement) sess = tf.Session(config=session_conf) print("started session") with sess.as_default(): siameseModel = SiameseLSTM( sequence_length=max_document_length, vocab_size=len(vocab_processor.vocabulary_),
def infer(batch_size_infer, x1_infer, x2_infer):
    # Eval Parameters
    # Note: tf.flags definitions are process-global, so defining them inside
    # infer() means it can only be called once per process.
    tf.flags.DEFINE_integer("batch_size", batch_size_infer,
                            "Batch Size (default: 64)")
    tf.flags.DEFINE_string("checkpoint_dir", "",
                           "Checkpoint directory from training run")
    tf.flags.DEFINE_string("eval_filepath", "validation_short.txt0",
                           "Evaluate on this data (Default: None)")
    # Set the vocab filepath here
    tf.flags.DEFINE_string("vocab_filepath",
                           "runs/1543141697/checkpoints/vocab",
                           "Load training time vocabulary (Default: None)")
    # Set the model filepath here
    tf.flags.DEFINE_string("model",
                           "runs/1543141697/checkpoints/model-2000",
                           "Load trained model checkpoint (Default: None)")
    # Misc Parameters
    tf.flags.DEFINE_boolean("allow_soft_placement", True,
                            "Allow device soft device placement")
    tf.flags.DEFINE_boolean("log_device_placement", False,
                            "Log placement of ops on devices")

    FLAGS = tf.flags.FLAGS
    FLAGS._parse_flags()
    print("\nParameters:")
    for attr, value in sorted(FLAGS.__flags.items()):
        print("{}={}".format(attr.upper(), value))
    print("")

    if FLAGS.eval_filepath is None or FLAGS.vocab_filepath is None \
            or FLAGS.model is None:
        print("Eval or Vocab filepaths are empty.")
        exit()

    all_predictions = []
    for x1, x2 in zip(x1_infer, x2_infer):
        # Load data and map id-transform based on the training-time vocabulary
        inpH = InputHelper()
        x1_test, x2_test = inpH.getTestDataSet_infer(x1, x2,
                                                     FLAGS.vocab_filepath, 30)

        print("\nEvaluating...\n")

        # Evaluation
        # ==================================================
        checkpoint_file = FLAGS.model
        print(checkpoint_file)
        graph = tf.Graph()
        with graph.as_default():
            session_conf = tf.ConfigProto(
                allow_soft_placement=FLAGS.allow_soft_placement,
                log_device_placement=FLAGS.log_device_placement)
            sess = tf.Session(config=session_conf)
            with sess.as_default():
                # Load the saved meta graph and restore variables
                saver = tf.train.import_meta_graph(
                    "{}.meta".format(checkpoint_file))
                sess.run(tf.initialize_all_variables())
                saver.restore(sess, checkpoint_file)

                # Get the placeholders from the graph by name
                input_x1 = graph.get_operation_by_name("input_x1").outputs[0]
                input_x2 = graph.get_operation_by_name("input_x2").outputs[0]
                input_y = graph.get_operation_by_name("input_y").outputs[0]
                dropout_keep_prob = graph.get_operation_by_name(
                    "dropout_keep_prob").outputs[0]

                # Tensors we want to evaluate
                predictions = graph.get_operation_by_name(
                    "output/distance").outputs[0]
                accuracy = graph.get_operation_by_name(
                    "accuracy/accuracy").outputs[0]
                sim = graph.get_operation_by_name(
                    "accuracy/temp_sim").outputs[0]
                # emb = graph.get_operation_by_name("embedding/W").outputs[0]
                # embedded_chars = tf.nn.embedding_lookup(emb, input_x)

                # Generate batches for one epoch
                batches = inpH.batch_iter(list(zip(x1_test, x2_test)),
                                          2 * FLAGS.batch_size, 1,
                                          shuffle=False)
                # Collect the predictions here
                all_d = []
                for db in batches:
                    x1_dev_b, x2_dev_b = zip(*db)
                    # print('db ', db)
                    # print('********************')
                    # print('x1_dev_b')
                    # print(x1_dev_b)
                    # print('********************')
                    batch_predictions = sess.run(
                        [predictions], {
                            input_x1: x1_dev_b,
                            input_x2: x2_dev_b,
                            dropout_keep_prob: 1.0
                        })
                    all_predictions.append(list(batch_predictions))
    return all_predictions
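# A usage sketch for infer(). The sentence pairs are illustrative only; the
# vocab and model paths baked into the flags above must exist for this to run.
x1 = ["how do i reset my password", "what is the refund policy"]
x2 = ["password reset steps", "can i get my money back"]
all_predictions = infer(batch_size_infer=64, x1_infer=x1, x2_infer=x2)
print(all_predictions)  # one list of distance predictions per sentence pair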