import argparse

import util  # project module providing load_grammar and load_sentence


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("mode", type=str,
                        help="parser mode: RECOGNIZER or BEST-PARSE")
    parser.add_argument("filename", type=str,
                        help="path to the grammar file")
    parser.add_argument("sentence", type=str,
                        help="path to a file containing sentences")
    args = parser.parse_args()

    gr = util.load_grammar(args.filename)
    if args.mode == 'RECOGNIZER':
        sent = util.load_sentence(args.sentence)
        for s in sent:
            length = len(s.split())
            rslt = CKY(s, gr)
            # the sentence is grammatical iff ROOT appears in the chart cell
            # covering the whole sentence
            top_cell = rslt[0][length - 1]
            if any(r[0] == 'ROOT' for r in top_cell):
                print('True')
            else:
                print('False')
        exit()
    if args.mode == 'BEST-PARSE':
        sent = util.load_sentence(args.sentence)
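# For reference: the recognizer above assumes CKY(s, gr) returns a triangular
# chart in which cell [i][j] lists (nonterminal, ...) tuples for the span of
# j + 1 words starting at word i, so rslt[0][length - 1] covers the whole
# sentence. Below is a minimal recognizer-only sketch of that contract; the
# grammar format here (a dict mapping right-hand-side tuples to lists of
# left-hand-side nonterminals) is a hypothetical stand-in for whatever
# util.load_grammar actually returns.
def CKY(sentence, grammar):
    words = sentence.split()
    n = len(words)
    chart = [[[] for _ in range(n)] for _ in range(n)]
    # preterminal rules A -> word fill the diagonal
    for i, w in enumerate(words):
        for lhs in grammar.get((w,), []):
            chart[i][0].append((lhs,))
    # combine adjacent spans bottom-up with binary rules A -> B C
    for span in range(1, n):              # span covers span + 1 words
        for i in range(n - span):         # span starts at word i
            for split in range(span):     # left part covers split + 1 words
                for b in chart[i][split]:
                    for c in chart[i + split + 1][span - split - 1]:
                        for lhs in grammar.get((b[0], c[0]), []):
                            if (lhs,) not in chart[i][span]:
                                chart[i][span].append((lhs,))
    return chart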
import os
import sys
import traceback
from datetime import datetime

import chainer
import numpy as np
from chainer import Variable, optimizers, serializers
from gensim import corpora

# load_sentence and tokens2ids are project helpers defined elsewhere


def train_encoder(model, dictionary: corpora.Dictionary, sentence_file: str,
                  model_dir: str, epoch_size: int = 100, batch_size: int = 30,
                  dropout: bool = True, gpu: int = -1) -> None:
    # `gpu` is a device id; negative values keep the model on the CPU
    if gpu >= 0:
        model.to_gpu()
    print(model.xp)

    # set up the SGD optimizer
    opt = optimizers.SGD()
    opt.setup(model)

    # optimizer hooks
    clip_threshold = 5.0
    print("set optimizer clip threshold: {}".format(clip_threshold))
    opt.add_hook(chainer.optimizer.GradientClipping(clip_threshold))

    # load conversation sentences
    sentences = load_sentence(sentence_file)
    data_size = len(sentences)
    print("data size: {}".format(data_size))

    for epoch in range(epoch_size):
        print("epoch {}".format(epoch))
        indexes = np.random.permutation(data_size)
        epoch_loss = 0
        for bat_i in range(0, data_size, batch_size):
            forward_start_time = datetime.now()
            batch_loss = Variable(model.xp.zeros((), dtype=model.xp.float32))
            for index in indexes[bat_i:bat_i + batch_size]:
                input_words = sentences[index]
                # convert the tokens to a list of ids
                input_words_with_s = tokens2ids(input_words, dictionary,
                                                verbose=False)
                # forward pass
                try:
                    new_loss = model(input_words_with_s, dropout=dropout,
                                     state=None, train=True)
                    if model.xp.isnan(new_loss.data):
                        sys.exit(1)
                    batch_loss += new_loss
                except Exception:
                    print(index, input_words_with_s)
                    traceback.print_exc()
            # average the loss over the batch
            batch_size_array = model.xp.array(batch_size,
                                              dtype=model.xp.float32)
            batch_loss = batch_loss / Variable(batch_size_array)
            epoch_loss += batch_loss.data
            # timing
            forward_end_time = datetime.now()
            # optimization step
            opt_start_time = datetime.now()
            model.zerograds()
            batch_loss.backward()
            opt.update()
            opt_end_time = datetime.now()
            forward_delta = forward_end_time - forward_start_time
            opt_delta = opt_end_time - opt_start_time
            print_fmt = ("epoch {} batch {}: "
                         "loss {}, grad L2 norm: {}, forward {}, optimizer {}")
            print(print_fmt.format(
                epoch, bat_i // batch_size, batch_loss.data,
                opt.compute_grads_norm(), forward_delta, opt_delta))
            # periodic checkpoints
            if (bat_i // batch_size + 1) % 100 == 0:
                serializers.save_npz(os.path.join(model_dir, "model.npz"),
                                     model)
            if (bat_i // batch_size + 1) % 1000 == 0:
                serializers.save_npz(
                    os.path.join(
                        model_dir,
                        "model_{}_{}_{}.npz".format(
                            epoch, bat_i // batch_size + 1,
                            datetime.now().strftime("%Y%m%d-%H%M%S"))),
                    model)
        # report the mean per-batch loss for this epoch
        n_batches = max(1, -(-data_size // batch_size))
        print("finish epoch {}, loss {}".format(epoch, epoch_loss / n_batches))

    # save the final model
    serializers.save_npz(os.path.join(model_dir, "model.npz"), model)
    serializers.save_npz(
        os.path.join(
            model_dir,
            "model_{}_{}_{}.npz".format(
                epoch, bat_i // batch_size + 1,
                datetime.now().strftime("%Y%m%d-%H%M%S"))),
        model)
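# A hypothetical usage sketch for train_encoder, assuming load_sentence
# yields a list of token lists. `Encoder` stands in for whatever
# chainer.Chain language-model class this project defines; its real
# constructor signature may differ, and the file paths are placeholders.
if __name__ == '__main__':
    sentence_file = "data/conversations.txt"                # assumed path
    dictionary = corpora.Dictionary(load_sentence(sentence_file))
    model = Encoder(n_vocab=len(dictionary), n_units=200)   # hypothetical class
    train_encoder(model, dictionary, sentence_file, model_dir="models",
                  epoch_size=10, batch_size=30, dropout=True,
                  gpu=-1)  # gpu=-1 keeps training on the CPU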
# load model
init_model_name = os.path.join(model_dir, "model.npz")
if os.path.exists(init_model_name):
    serializers.load_npz(init_model_name, model)
    print("load model {}".format(init_model_name))
elif word2vec_init:
    # initialize the embedding layer with word2vec
    import numpy as np
    if os.path.exists(word2vec_model_file):
        print("load word2vec model")
        word2vec_model = word2vec.Word2Vec.load(word2vec_model_file)
    else:
        print("start learning word2vec model")
        word2vec_model = word2vec.Word2Vec(
            load_sentence(sent_file),
            size=n_units, window=5, min_count=1, workers=4)
        print("save word2vec model")
        word2vec_model.save(word2vec_model_file)
    # build the initial embedding matrix: one word2vec vector per dictionary
    # id, with a random fallback for words missing from the word2vec vocabulary
    initial_W = np.array([
        word2vec_model[dictionary[wid]]
        if dictionary[wid] in word2vec_model
        else np.array([np.random.random() for _ in range(n_units)],
                      dtype=np.float32)
        for wid in range(dim)
    ], dtype=np.float32)
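    # initial_W has shape (dim, n_units). A sketch of copying it into the
    # model's embedding table, assuming the embedding layer is a
    # chainer.links.EmbedID exposed as `model.embed` (hypothetical attribute
    # name; the real layer name in this project may differ). Do this before
    # any model.to_gpu() call, while W.data is still a NumPy array.
    model.embed.W.data[...] = initial_W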