def read_data(self, source, train_data=True):
    """Load an ABSA TSV file and build (sentences, categories, label) arrays.

    source: path to a tab-separated file with no header whose columns are
            polarity / aspect_category / target_term / character_offset / sentence.
    train_data: when True, (re)fit the one-hot category encoder on this data;
            when False, reuse the encoder fitted by a previous training call
            (NOTE(review): calling with train_data=False before any training
            call would fail on a missing self.enc — verify caller order).
    Returns a tuple (sentences, categories, label).
    """
    df = pd.read_csv(source, sep='\t', header=None)
    df.columns = [
        "polarity", "aspect_category", "target_term", "character_offset",
        "sentence"
    ]
    # Map polarity strings to {-1, 0, 1} (negative / neutral / positive).
    df["label"] = df["polarity"].apply(lambda x: 1 if x == "positive" else (0 if x == "neutral" else -1))
    ### Formating output
    # Shift labels to {0, 1, 2} so they are valid one-hot class indices.
    label = to_categorical(df['label'] + 1)
    # remove target: cut the span given by "start:end" in character_offset
    # out of each sentence.
    sentence_red = [0] * len(df)
    for i in range(len(df)):
        sentence_red[i] = df["sentence"][i][:int(df["character_offset"][i].split(":")[0])] + \
            df["sentence"][i][int(
                df["character_offset"][i].split(":")[1]):]
    df["sentence_red"] = sentence_red
    # remove stopwords
    # df["sentence_red"] = df["sentence_red"].apply(lambda x: self.remove_stopwords(x))
    # word2vec embeddings: download the fastText vectors on first use.
    PATH_TO_DATA = Path('../data')
    en_embeddings_path = PATH_TO_DATA / 'cc.en.300.vec.gz'
    print(en_embeddings_path)
    if not en_embeddings_path.exists():
        urlretrieve(
            'https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz',
            en_embeddings_path)
    w2vec = word2vec.Word2Vec(en_embeddings_path, vocab_size=50000)
    sentence2vec = word2vec.BagOfWords(w2vec)
    # Each word in a sentence will be encoded with its corresponding vector (size 300) in the embedding layer,
    # Each sentence contains N word -> N vector of size 300. In order to have the same input size for each sentence
    # we will pad the sequence of words with zeros vector until 100 words (for sentences with more than 100 words:
    # we will truncate the sentence at 100 words.
    # (meaning we could possibly loss information)
    # If needed, this parameter could be set as more than 100
    # NOTE(review): encoding uses the ORIGINAL df["sentence"], not the
    # target-stripped df["sentence_red"] built above — confirm this is intended.
    sentences = [
        sentence2vec.encode(df["sentence"][i], ag_sentence=False, padding=100)
        for i in range(len(df["sentence"]))
    ]
    #### Transform as array
    sentences = np.stack(sentences)
    #### Encoding categorie (oneHotEncoding):
    if train_data == True:
        self.enc = OneHotEncoder(handle_unknown='ignore')
        self.enc.fit(df['aspect_category'].values.reshape(-1, 1))
    categories = self.enc.transform(df['aspect_category'].values.reshape(
        -1, 1))
    return (sentences, categories, label)
def read_data(self, source, train_data=True):
    """Load an ABSA TSV file and build (sentences, categories, label) arrays.

    Variant that also strips stopwords from the target-reduced sentences and
    reads the embedding file from '../resources'.

    source: path to a tab-separated file with columns
            polarity / aspect_category / target_term / character_offset / sentence.
    train_data: when True, (re)fit the one-hot category encoder; when False,
            reuse self.enc from a previous training call.
    Returns a tuple (sentences, categories, label).
    """
    df = pd.read_csv(source, sep='\t', header=None)
    df.columns = [
        "polarity", "aspect_category", "target_term", "character_offset",
        "sentence"
    ]
    # Map polarity strings to {-1, 0, 1} (negative / neutral / positive).
    df["label"] = df["polarity"].apply(lambda x: 1 if x == "positive" else (0 if x == "neutral" else -1))
    # Formating output: shift labels to {0, 1, 2} for one-hot encoding.
    label = to_categorical(df['label'] + 1)
    # Remove target term from sentences (span "start:end" in character_offset).
    sentence_red = [0] * len(df)
    for i in range(len(df)):
        sentence_red[i] = df["sentence"][i][:int(df["character_offset"][i].split(":")[0])] + \
            df["sentence"][i][int(
                df["character_offset"][i].split(":")[1]):]
    df["sentence_red"] = sentence_red
    # Remove stopwords from sentences
    df["sentence_red"] = df["sentence_red"].apply(
        lambda x: self.remove_stopwords(x))
    # word2vec embeddings: download the fastText vectors on first use.
    PATH_TO_RESOURCES = Path('../resources')
    en_embeddings_path = PATH_TO_RESOURCES / 'cc.en.300.vec.gz'
    print(en_embeddings_path)
    if not en_embeddings_path.exists():
        urlretrieve(
            'https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz',
            en_embeddings_path)
    w2vec = word2vec.Word2Vec(en_embeddings_path, vocab_size=50000)
    sentence2vec = word2vec.BagOfWords(w2vec)
    # Pad/truncate every sentence to 100 word vectors.
    # NOTE(review): encoding uses the ORIGINAL df["sentence"], not the
    # stopword-stripped df["sentence_red"] built above — confirm intended.
    sentences = [
        sentence2vec.encode(df["sentence"][i], ag_sentence=False, padding=100)
        for i in range(len(df["sentence"]))
    ]
    # Transform as array
    sentences = np.stack(sentences)
    # Encoding categories (oneHotEncoding):
    if train_data == True:
        self.enc = OneHotEncoder(handle_unknown='ignore')
        self.enc.fit(df['aspect_category'].values.reshape(-1, 1))
    categories = self.enc.transform(df['aspect_category'].values.reshape(
        -1, 1))
    return (sentences, categories, label)
def train_word2vec(pat_corpus, corpus='patents', seed=1, embed_dim=300): """Train the word2vec model""" # train the skipgram model; default window=5 model = word2vec.Word2Vec(pat_corpus, mtype='sg', hs=1, neg=13, embed_dim=embed_dim, seed=seed) # delete the huge stupid table again model.table = None # pickle the entire model to disk, so we can load&resume training later saven = "%s_sg_%i_hs0_neg13_seed%i.model" % (corpus, embed_dim, seed) print "saving model" pkl.dump(model, open("human_eval/models/%s" % saven, 'wb'), -1) return model
def train_model_word2vec():
    """Train a word2vec model on the segmented classical-Chinese wiki dump
    and save it under ./model/word2vec."""
    file_name = './data/zh_classicalwiki_extracted_word_seg_result.txt'
    with open(file_name, 'r') as fp:
        words = utils.read_file(fp)
    w2v = word2vec.Word2Vec(
        batch_size=128,         # mini-batch size
        skip_window=3,          # one-sided context window length
        num_skips=2,            # samples drawn per center word
        embedding_size=200,     # word-vector dimensionality
        vocabulary_size=50000,  # vocabulary size
        num_sampled=64,         # negative samples per batch
        learning_rate=1e-2,
        n_steps=100001,         # number of training steps
        logdir='./model/word2vec/tmp_word2vec')
    w2v.train_model(words)
    w2v.save_model("./model/word2vec")
def main():
    """Python 2 driver: load word embeddings, then build mention/pair arrays
    and gold chains for the train/dev/test document splits."""
    embedding_dir = args.embedding+args.language
    print >> sys.stderr,"Read Embedding from %s ..."%embedding_dir
    # Embedding dimension is 64 for Chinese, 50 otherwise.
    embedding_dimention = 50
    if args.language == "cn": embedding_dimention = 64
    w2v = word2vec.Word2Vec(embedding_dir,embedding_dimention)
    #train_docs,dev_docs,test_docs = get_doc_data()
    train_docs = doc_data_generater("train")
    dev_docs = doc_data_generater("dev")
    test_docs = doc_data_generater("test")
    # Build (mention arrays, pair arrays, gold chains) for each split.
    train_doc_mention_arrays,train_doc_pair_arrays,train_doc_gold_chains = array_generater(train_docs,"train",w2v)
    test_doc_mention_arrays,test_doc_pair_arrays,test_doc_gold_chains = array_generater(test_docs,"test",w2v)
    dev_doc_mention_arrays,dev_doc_pair_arrays,dev_doc_gold_chains = array_generater(dev_docs,"dev",w2v)
def callback():
    """Tk button handler: parse WSDL service names, embed and cluster them,
    and report cluster statistics into the lb0/lb1/lb2 list boxes.

    Reads the WSDL path from entry e1 and the cluster count k from entry e2
    (both module-level widgets). No return value; output goes to the UI.
    """
    wsdl_path = e1.get()
    k = e2.get()  # cluster count, as a string from the entry widget
    start = time.time()
    print(start)
    # Create output directories next to the WSDL folder.
    wsdl_father_path = wsdl_path[0:wsdl_path.rfind('/')]
    service_name_path = wsdl_father_path + '/serviceName'
    service_name_stemmed_path = wsdl_father_path + '/serviceNameStemmed'
    if not os.path.exists(service_name_path):
        os.mkdir(service_name_path)
    if not os.path.exists(service_name_stemmed_path):
        os.mkdir(service_name_stemmed_path)
    # External Java tool extracts service names from the WSDL files.
    subprocess.call(['java', '-jar', 'ServiceNameParsing.jar', wsdl_path])
    service_num = WordProc.WordProc(service_name_path)
    word2vec.Word2Vec(service_name_stemmed_path)
    service_name_sim.service_name_sim(service_name_stemmed_path, service_num)
    clustering_result = spectral_clustering.spectral_clustering(k, service_num)
    print(clustering_result)
    end = time.time()
    lb0.insert(END, "服务数量: " + str(service_num))
    lb0.insert(END, "聚类数量: " + str(k))
    lb0.insert(END, "消耗时间: " + str(end - start) + "秒")
    for j in range(0, int(k)):
        lb0.insert(
            END,
            "第" + str(j + 1) + "个聚类中服务的数量: " + str(clustering_result.count(j)))
    lb1.insert(END, "服务 所在聚类")
    for i in range(0, service_num):
        # NOTE(review): the two branches currently insert identically-spaced
        # text; the split at i == 9 was presumably for column alignment of
        # one- vs two-digit indices — verify the intended padding.
        if i < 9:
            lb1.insert(
                END,
                str(i + 1) + " " + str(clustering_result[i] + 1))
        if i >= 9:
            lb1.insert(
                END,
                str(i + 1) + " " + str(clustering_result[i] + 1))
    for i in range(0, int(k)):
        lb2.insert(END, "聚类" + str(i + 1) + "中的服务")
        # FIX: the original used `re` (shadows the re module name) and reused
        # `k` as the inner loop index, clobbering the cluster-count variable.
        members = [j for j, a in enumerate(clustering_result) if a == i]
        for m in range(0, len(members)):
            lb2.insert(END, " " + str(members[m] + 1))
def main():
    """Python 2 debug driver: load embeddings, then print the shapes of the
    generated coreference training cases and their gold chains."""
    embedding_dir = args.embedding + args.language
    print >> sys.stderr, "Read Embedding from %s ..." % embedding_dir
    # Embedding dimension is 64 for Chinese, 50 otherwise.
    embedding_dimention = 50
    if args.language == "cn": embedding_dimention = 64
    w2v = word2vec.Word2Vec(embedding_dir, embedding_dimention)
    #train_docs,dev_docs,test_docs = get_doc_data()
    train_docs = doc_data_generater("train")
    dev_docs = doc_data_generater("dev")
    test_docs = doc_data_generater("test")
    for cases, gold_chain in case_generater(train_docs, "train", w2v):
        print len(cases)
        # Skip the first case; inspect the remaining mention/pair arrays.
        for m_self, index_self, m_array, m_index in cases[1:]:
            print len(m_self[0]), len(index_self[0])
            print len(m_array[0])
        print gold_chain
def run():
    """Runs evaluation in a loop, and logs summaries to TensorBoard.

    Builds the eval-mode Word2Vec graph once, then re-runs evaluation every
    FLAGS.eval_interval_secs, appending results to evaluation.json in
    FLAGS.eval_dir. Ctrl-C exits the loop cleanly.
    """
    # Create the evaluation directory if it doesn't exist.
    eval_dir = FLAGS.eval_dir
    if not tf.gfile.IsDirectory(eval_dir):
        tf.logging.info("Creating eval directory: %s", eval_dir)
        tf.gfile.MakeDirs(eval_dir)
    # generate eval dump file
    dump_file = open(os.path.join(eval_dir, 'evaluation.json'), 'a')
    g = tf.Graph()
    with g.as_default():
        # Build the model for evaluation.
        model_config = configuration.ModelConfig()
        model = word2vec.Word2Vec(model_config, mode="eval")
        model.build()
        # Create the Saver to restore model Variables.
        saver = tf.train.Saver()
        # Create the summary operation and the summary writer.
        summary_op = tf.merge_all_summaries()
        summary_writer = tf.train.SummaryWriter(eval_dir)
        g.finalize()
        # Run a new evaluation run every eval_interval_secs.
        try:
            while True:
                start = time.time()
                tf.logging.info(
                    "Starting evaluation at " +
                    time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime()))
                run_once(model, saver, summary_writer, summary_op, dump_file)
                time_to_next_eval = start + FLAGS.eval_interval_secs - time.time()
                if time_to_next_eval > 0:
                    time.sleep(time_to_next_eval)
        except KeyboardInterrupt:
            # Ctrl-C is the normal way to stop the loop; swallow it as before.
            pass
        finally:
            # FIX: previously the file was closed only on KeyboardInterrupt,
            # so any other exception leaked the handle.
            dump_file.close()
def tocsv(trainword_file, testword_file, sg, hs, window, size, model_name,
          traincsv, testcsv, b, flag, iter, spmodel):
    """Obtain a word2vec model (loaded or freshly trained) and dump the
    train — and optionally test — word files to CSV.

    spmodel: truthy -> load pre-trained vectors from model_name;
             falsy  -> train a new model on trainword_file and save it.
    flag: when truthy, also convert the test word file.
    """
    if spmodel:
        print("loading model")
        w2v_model = gensim.models.KeyedVectors.load_word2vec_format(
            model_name, binary=False)
    else:
        corpus = word2vec.LineSentence(trainword_file)
        w2v_model = word2vec.Word2Vec(corpus,
                                      sg=sg,
                                      hs=hs,
                                      min_count=1,
                                      window=window,
                                      size=size,
                                      iter=iter)
        w2v_model.wv.save_word2vec_format(model_name, binary=False)
    save_csv(trainword_file, w2v_model, traincsv, b)
    if flag:
        save_csv(testword_file, w2v_model, testcsv, b)
def __init__(self, mes, trainable=True, truncated=False):
    """Build a data generator around the `mes` settings object.

    mes: settings holder; config values are read from mes.config and the
         training collection name from mes.train_col.
    trainable: when True (and a collection is configured) the fold split for
         train/valid/test is materialized from the document store; when False
         only a text parser is prepared for inference.
    truncated: stored flag; its use is outside this method — see callers.
    """
    self.mes = mes
    self.col_name = mes.train_col
    self.trainable = trainable
    self.truncated = truncated
    self.lang = self.mes.config['LANG']
    self.fids = self.mes.config['DG_FIDS']
    self.sentence_sz = self.mes.config['DG_SENTENCE_SZ']
    self.label_num = self.mes.config['LABEL_NUM']
    self.batch_sz = self.mes.config['DG_BATCH_SZ']
    self.test_batch_sz = self.mes.config['DG_TEST_BATCH_SZ']
    self.rnum = self.mes.config['DG_RNUM']
    # Embeddings are loaded read-only regardless of this generator's mode.
    self.w2v = word2vec.Word2Vec(self.mes, trainable=False)
    print("Train mode:", trainable)
    if trainable and self.col_name is not None:
        # Cross-validation fold layout: one test fold, one validation fold,
        # the remaining folds form the training set.
        self.fold_num = self.mes.config['DG_FOLD_NUM']
        self.fold_test_id = self.mes.config['DG_FOLD_TEST_ID']
        self.fold_valid_id = self.mes.config['DG_FOLD_VALID_ID']
        self.docs = utils.get_docs(self.col_name)
        records = self.docs.find()
        # Materialize the cursor so it can be scanned once per split.
        records = [record for record in records]
        self.test_data, self.test_labels = DataGenerator.get_data_by_fold_ids(
            records, [self.fold_test_id])
        self.valid_data, self.valid_labels = DataGenerator.get_data_by_fold_ids(
            records, [self.fold_valid_id])
        self.train_data, self.train_labels = \
            DataGenerator.get_data_by_fold_ids(
                records,
                [i for i in range(self.fold_num)
                 if i != self.fold_test_id and i != self.fold_valid_id])
        self.test_sz = len(self.test_data)
        self.valid_sz = len(self.valid_data)
        self.train_sz = len(self.train_data)
        # Per-split cursors; presumably [doc index, sentence index, countdown]
        # — TODO confirm against the batch-generation methods.
        self.test_inds = [0, 0, 0]
        self.valid_inds = [0, 0, 0]
        self.train_inds = [0, 0, self.rnum]
    elif not trainable:
        # Inference mode: only a tokenizer/parser is needed.
        self.cutter = text_extractor.parser_holder.get_parser()
def __init__(self):
    """Create the parser and word2vec helpers this object delegates to."""
    self.parser = parser.Parser()
    self.w2v = word2vec.Word2Vec()
def main():
    """End-to-end Chainer training driver.

    Reads hyper-parameters from a config file, prepares vocabularies and
    iterators, optionally pre-trains (model_type == 'pretrain'), then trains
    for n_epoch epochs, grid-searching dev/test scores each epoch, and keeps
    the checkpoint of the best dev epoch as best_model.npz.
    """
    args = parse_args()
    config = configparser.ConfigParser()
    """ARGS DETAIL"""
    config_file = args.config_file
    batch_size = args.batch
    n_epoch = args.epoch
    pretrain_epoch = args.pretrain_epoch
    gpu_id = args.gpu
    model_type = args.model
    vocab_type = args.vocab
    pretrain_w2v = args.pretrain_w2v
    data_path = args.data_path
    load_model = args.load_model
    """DIR PREPARE"""
    config.read(config_file)
    vocab_size = int(config['Parameter']['vocab_size'])
    coefficient = float(config['Parameter']['coefficient'])
    vocab_name = vocab_type
    if pretrain_w2v:
        vocab_name = 'p' + vocab_name
    if model_type == 'multi':
        base_dir = './{}_{}{}_{}_c{}/'.format(model_type, vocab_name,
                                              vocab_size, data_path[0],
                                              coefficient)
    else:
        base_dir = './{}_{}{}_{}/'.format(model_type, vocab_name, vocab_size,
                                          data_path[0])
    model_save_dir = base_dir
    if not os.path.exists(base_dir):
        os.mkdir(base_dir)
        # Snapshot the config into the run directory on first creation.
        shutil.copyfile(config_file, base_dir + config_file)
    # Re-read the snapshot so the run is reproducible from its own directory.
    config_file = base_dir + config_file
    config.read(config_file)
    if load_model is not None:
        model_save_dir = base_dir + load_model.replace('.npz', '') + '/'
        if not os.path.exists(model_save_dir):
            os.mkdir(model_save_dir)
    """PARAMATER"""
    embed_size = int(config['Parameter']['embed_size'])
    hidden_size = int(config['Parameter']['hidden_size'])
    class_size = int(config['Parameter']['class_size'])
    dropout_ratio = float(config['Parameter']['dropout'])
    weight_decay = float(config['Parameter']['weight_decay'])
    gradclip = float(config['Parameter']['gradclip'])
    vocab_size = int(config['Parameter']['vocab_size'])
    coefficient = float(config['Parameter']['coefficient'])
    valid_num = int(config['Parameter']['valid_num'])
    """LOGGER"""
    log_file = model_save_dir + 'log.txt'
    logger = dataset.prepare_logger(log_file)
    logger.info(args)  # record the command-line arguments
    logger.info('[Training start] logging to {}'.format(log_file))
    """DATASET"""
    train_src_file = config[data_path]['train_src_file']
    train_trg_file = config[data_path]['train_trg_file']
    valid_src_file = config[data_path]['valid_src_file']
    valid_trg_file = config[data_path]['valid_trg_file']
    test_src_file = config[data_path]['single_src_file']
    test_trg_file = config[data_path]['single_trg_file']
    src_w2v_file = config[data_path]['src_w2v_file']
    trg_w2v_file = config[data_path]['trg_w2v_file']
    train_data_size = dataset.data_size(train_src_file)
    valid_data_size = dataset.data_size(valid_src_file)
    logger.info('train size: {}, valid size: {}'.format(
        train_data_size, valid_data_size))
    """VOCABULARY"""
    src_vocab, trg_vocab, sos, eos = dataset.prepare_vocab(
        base_dir, vocab_type, train_src_file, train_trg_file, vocab_size,
        gpu_id)
    src_vocab_size = len(src_vocab.vocab)
    trg_vocab_size = len(trg_vocab.vocab)
    src_initialW = None
    trg_initialW = None
    if pretrain_w2v:
        # Initialize embedding matrices from pre-trained word2vec vectors.
        w2v = word2vec.Word2Vec()
        src_initialW, vector_size, src_match_word_count = w2v.make_initialW(
            src_vocab.vocab, src_w2v_file)
        trg_initialW, vector_size, trg_match_word_count = w2v.make_initialW(
            trg_vocab.vocab, trg_w2v_file)
        logger.info(
            'Initialize w2v embedding. Match: src {}/{}, trg {}/{}'.format(
                src_match_word_count, src_vocab_size, trg_match_word_count,
                trg_vocab_size))
    logger.info('src_vocab size: {}, trg_vocab size: {}'.format(
        src_vocab_size, trg_vocab_size))
    """ITERATOR"""
    _, src_label, src_text, _ = dataset.load_binary_score_file(train_src_file)
    trg_text = dataset.load(train_trg_file)
    train_iter = dataset.Iterator(src_text, src_label, trg_text, src_vocab,
                                  trg_vocab, batch_size, gpu_id, sort=True,
                                  shuffle=True)
    # train_iter = dataset.Iterator(src_text, src_label, trg_text, src_vocab, trg_vocab, batch_size, gpu_id, sort=False, shuffle=False)
    _, src_label, src_text, _ = dataset.load_binary_score_file(valid_src_file)
    trg_text = dataset.load(valid_trg_file)
    valid_iter = dataset.Iterator(src_text, src_label, trg_text, src_vocab,
                                  trg_vocab, batch_size, gpu_id, sort=False,
                                  shuffle=False)
    correct_label, correct_binary_label, correct_text, correct_index = dataset.load_binary_score_file(
        test_src_file)
    trg_text = dataset.load(test_trg_file)
    test_iter = dataset.Iterator(correct_text, correct_binary_label, trg_text,
                                 src_vocab, trg_vocab, batch_size, gpu_id,
                                 sort=False, shuffle=False)
    """MODEL"""
    # NOTE: `model` the module is shadowed by `model` the instance from here on.
    if model_type == 'multi':
        model = model.Multi(src_vocab_size, trg_vocab_size, embed_size,
                            hidden_size, class_size, dropout_ratio,
                            coefficient, src_initialW, trg_initialW)
    elif model_type in ['label', 'pretrain']:
        model = model.Label(src_vocab_size, trg_vocab_size, embed_size,
                            hidden_size, class_size, dropout_ratio,
                            src_initialW, trg_initialW)
    else:
        model = model.EncoderDecoder(src_vocab_size, trg_vocab_size,
                                     embed_size, hidden_size, dropout_ratio,
                                     src_initialW, trg_initialW)
    gridsearcher = gridsearch.GridSearch(valid_num)
    """OPTIMIZER"""
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(gradclip))
    optimizer.add_hook(chainer.optimizer.WeightDecay(weight_decay))
    """GPU"""
    if gpu_id >= 0:
        logger.info('Use GPU')
        chainer.cuda.get_device_from_id(gpu_id).use()
        model.to_gpu()
    """PRETRAIN"""
    if model_type == 'pretrain' and load_model is None:
        logger.info('Pre-train start')
        pretrain_loss_dic = {}
        for epoch in range(1, pretrain_epoch + 1):
            train_loss = 0
            for i, batch in enumerate(train_iter.generate(), start=1):
                try:
                    loss = model.pretrain(*batch)
                    train_loss += loss.data
                    optimizer.target.cleargrads()
                    loss.backward()
                    optimizer.update()
                except Exception as e:
                    # Best-effort: log the failing batch and keep training.
                    logger.info('P{} ## train iter: {}, {}'.format(
                        epoch, i, e))
            chainer.serializers.save_npz(
                model_save_dir + 'p_model_epoch_{}.npz'.format(epoch), model)
            """EVALUATE"""
            valid_loss = 0
            for batch in valid_iter.generate():
                with chainer.no_backprop_mode(), chainer.using_config(
                        'train', False):
                    valid_loss += model.pretrain(*batch).data
            logger.info('P{} ## train loss: {}, val loss:{}'.format(
                epoch, train_loss, valid_loss))
            pretrain_loss_dic[epoch] = valid_loss
        """MODEL SAVE & LOAD"""
        best_epoch = min(pretrain_loss_dic,
                         key=(lambda x: pretrain_loss_dic[x]))
        logger.info('best_epoch:{}, val loss: {}'.format(
            best_epoch, pretrain_loss_dic[best_epoch]))
        shutil.copyfile(
            model_save_dir + 'p_model_epoch_{}.npz'.format(best_epoch),
            model_save_dir + 'p_best_model.npz')
        logger.info('Pre-train finish')
    if load_model:
        logger.info('load model: {}'.format(load_model))
        chainer.serializers.load_npz(model_save_dir + load_model, model)
    """TRAIN"""
    accuracy_dic = {}
    for epoch in range(1, n_epoch + 1):
        train_loss = 0
        for i, batch in enumerate(train_iter.generate(), start=1):
            try:
                loss = optimizer.target(*batch)
                train_loss += loss.data
                optimizer.target.cleargrads()
                loss.backward()
                optimizer.update()
            except Exception as e:
                logger.info('E{} ## train iter: {}, {}'.format(epoch, i, e))
        chainer.serializers.save_npz(
            model_save_dir + 'model_epoch_{}.npz'.format(epoch), model)
        """DEV & TEST"""
        outputs = []
        labels = []
        alignments = []
        for i, batch in enumerate(test_iter.generate(), start=1):
            try:
                with chainer.no_backprop_mode(), chainer.using_config(
                        'train', False):
                    output, label, align = model.predict(batch[0], sos, eos)
            except Exception as e:
                logger.info('E{} ## test iter: {}, {}'.format(epoch, i, e))
            if model_type == 'multi':
                for o, l, a in zip(output, label, align):
                    outputs.append(trg_vocab.id2word(chainer.cuda.to_cpu(o)))
                    labels.append(chainer.cuda.to_cpu(l))
                    alignments.append(chainer.cuda.to_cpu(a))
            elif model_type in ['label', 'pretrain']:
                for l in label:
                    labels.append(chainer.cuda.to_cpu(l))
            else:
                for o, a in zip(output, align):
                    outputs.append(trg_vocab.id2word(chainer.cuda.to_cpu(o)))
                    alignments.append(chainer.cuda.to_cpu(a))
        if model_type in ['multi', 'label', 'pretrain']:
            dev_score, test_score, param_list, test_score_list, s_result_list = gridsearcher.gridsearch(
                correct_label, correct_index, labels, alignments)
        else:
            dev_score, test_score, param_list, test_score_list, s_result_list = gridsearcher.gridsearch(
                correct_label, correct_index, alignments, [])
        accuracy_dic[epoch] = [dev_score, test_score]
        # Record scores for this epoch.
        logger.info('E{} ## loss:{}, dev: {}, test: {}'.format(
            epoch, train_loss, dev_score, test_score))
        logger.info('E{} ## {}'.format(
            epoch, ' '.join(dataset.float_to_str(test_score_list[-1]))))
        for i, (l, p) in enumerate(zip(test_score_list[:-1], param_list),
                                   start=1):
            logger.info('E{} ## {}: {}\t{}'.format(
                epoch, i, p, ' '.join(dataset.float_to_str(l))))
        # Persist per-epoch predictions.
        dataset.save_output(model_save_dir, epoch, labels, alignments,
                            outputs, s_result_list)
    """MODEL SAVE"""
    best_epoch = max(accuracy_dic, key=(lambda x: accuracy_dic[x][0]))
    # FIX: the original referenced undefined `model_dir` here (NameError at
    # the very end of training); the intended variable is model_save_dir.
    logger.info('best_epoch:{}, dev: {}, test: {}, {}'.format(
        best_epoch, accuracy_dic[best_epoch][0], accuracy_dic[best_epoch][1],
        model_save_dir))
    shutil.copyfile(model_save_dir + 'model_epoch_{}.npz'.format(best_epoch),
                    model_save_dir + 'best_model.npz')
def main():
    """Cross-validated Chainer training driver with pseudo/QA data mixing.

    Reads hyper-parameters from a config file, splits the data into
    valid_num folds, and for each fold trains a model (optionally after a
    pre-training phase), selects the best epoch by dev score, and finally
    aggregates and saves cross-validation results under base_dir.
    """
    args = parse_args()
    config = configparser.ConfigParser()
    """ARGS DETAIL"""
    config_file = args.config_file
    batch_size = args.batch
    n_epoch = args.epoch
    pretrain_epoch = args.pretrain_epoch
    gpu_id = args.gpu
    model_type = args.model
    pretrain_w2v = args.pretrain_w2v
    data_path = args.data_path
    load_model = args.load_model
    """DIR PREPARE"""
    config.read(config_file)
    vocab_size = int(config['Parameter']['vocab_size'])
    coefficient = float(config['Parameter']['coefficient'])
    # FIX: bool(<non-empty str>) is always True, so a config value of
    # "False" was treated as True; getboolean parses it properly.
    shuffle_data = config['Parameter'].getboolean('shuffle')
    if pretrain_w2v:
        vocab_size = 'p' + str(vocab_size)
    if model_type == 'multi':
        if shuffle_data:
            base_dir = './pseudo_{}_{}_{}_c{}_shuffle/'.format(
                model_type, vocab_size, data_path[0], coefficient)
        else:
            base_dir = './pseudo_{}_{}_{}_c{}/'.format(model_type, vocab_size,
                                                       data_path[0],
                                                       coefficient)
    else:
        if shuffle_data:
            base_dir = './pseudo_{}_{}_{}_shuffle/'.format(
                model_type, vocab_size, data_path[0])
        else:
            base_dir = './pseudo_{}_{}_{}/'.format(model_type, vocab_size,
                                                   data_path[0])
    model_save_dir = base_dir
    if not os.path.exists(base_dir):
        os.mkdir(base_dir)
        # Snapshot the config into the run directory on first creation.
        shutil.copyfile(config_file, base_dir + config_file)
    config_file = base_dir + config_file
    config.read(config_file)
    """PARAMATER"""
    embed_size = int(config['Parameter']['embed_size'])
    hidden_size = int(config['Parameter']['hidden_size'])
    class_size = int(config['Parameter']['class_size'])
    dropout_ratio = float(config['Parameter']['dropout'])
    weight_decay = float(config['Parameter']['weight_decay'])
    gradclip = float(config['Parameter']['gradclip'])
    vocab_size = int(config['Parameter']['vocab_size'])
    valid_num = int(config['Parameter']['valid_num'])
    # FIX: same bool-of-string bug as above.
    shuffle_data = config['Parameter'].getboolean('shuffle')
    """LOGGER"""
    log_file = model_save_dir + 'log.txt'
    logger = dataset.prepare_logger(log_file)
    logger.info(args)  # record the command-line arguments
    logger.info('[Training start] logging to {}'.format(log_file))
    """DATASET"""
    train_src_file = config[data_path]['train_src_file']
    train_trg_file = config[data_path]['train_trg_file']
    valid_src_file = config[data_path]['valid_src_file']
    valid_trg_file = config[data_path]['valid_trg_file']
    test_src_file = config[data_path]['single_src_file']
    test_trg_file = config[data_path]['single_trg_file']
    src_w2v_file = config[data_path]['src_w2v_file']
    trg_w2v_file = config[data_path]['trg_w2v_file']
    train_data = dataset.load_label_corpus_file(train_src_file,
                                                train_trg_file)
    qa_data_sub_lit = dataset.split_valid_data(train_data, valid_num)
    valid_data = dataset.load_label_corpus_file(valid_src_file,
                                                valid_trg_file)
    test_data = dataset.load_label_corpus_file(test_src_file, test_trg_file)
    test_data_sub_lit = dataset.split_valid_data(test_data, valid_num)
    """VOCABULARY"""
    src_vocab, trg_vocab, sos, eos = dataset.prepare_vocab(
        base_dir, train_data, vocab_size, gpu_id)
    src_vocab_size = len(src_vocab.vocab)
    trg_vocab_size = len(trg_vocab.vocab)
    src_initialW, trg_initialW = None, None
    if pretrain_w2v:
        # Initialize embedding matrices from pre-trained word2vec vectors.
        w2v = word2vec.Word2Vec()
        src_initialW, vector_size, src_match_word_count = w2v.make_initialW(
            src_vocab.vocab, src_w2v_file)
        trg_initialW, vector_size, trg_match_word_count = w2v.make_initialW(
            trg_vocab.vocab, trg_w2v_file)
        logger.info(
            'Initialize w2v embedding. Match: src {}/{}, trg {}/{}'.format(
                src_match_word_count, src_vocab_size, trg_match_word_count,
                trg_vocab_size))
    logger.info('src_vocab size: {}, trg_vocab size: {}'.format(
        src_vocab_size, trg_vocab_size))
    evaluater = evaluate.Evaluate()
    """GPU"""
    if gpu_id >= 0:
        logger.info('Use GPU')
        chainer.cuda.get_device_from_id(gpu_id).use()
    cross_valid_result = []
    for ite in range(1, valid_num + 1):
        model_valid_dir = base_dir + 'valid{}/'.format(ite)
        if not os.path.exists(model_valid_dir):
            os.mkdir(model_valid_dir)
        qa_train_data, qa_dev_data, qa_test_data = dataset.separate_train_dev_test(
            qa_data_sub_lit, ite)
        train_data, dev_data, test_data = dataset.separate_train_dev_test(
            test_data_sub_lit, ite)
        test_data_id = [t['id'] for t in test_data]
        qa_iter = dataset.Iterator(qa_train_data, src_vocab, trg_vocab,
                                   batch_size, gpu_id, sort=True,
                                   shuffle=True)
        valid_iter = dataset.Iterator(valid_data, src_vocab, trg_vocab,
                                      batch_size, gpu_id, sort=False,
                                      shuffle=False)
        train_iter = dataset.Iterator(train_data, src_vocab, trg_vocab,
                                      batch_size, gpu_id, sort=True,
                                      shuffle=True)
        dev_iter = dataset.Iterator(dev_data, src_vocab, trg_vocab,
                                    batch_size, gpu_id, sort=False,
                                    shuffle=False)
        test_iter = dataset.Iterator(test_data, src_vocab, trg_vocab,
                                     batch_size, gpu_id, sort=False,
                                     shuffle=False)
        qa_size = len(qa_train_data)
        train_size = len(train_data)
        logger.info('V{} ## QA:{}, train:{}, dev:{} ,test:{}'.format(
            ite, qa_size, train_size, len(dev_data), len(test_data)))
        """MODEL"""
        # NOTE: `model` the module is shadowed by `model` the instance below.
        if model_type == 'multi':
            model = model.Multi(src_vocab_size, trg_vocab_size, embed_size,
                                hidden_size, class_size, dropout_ratio,
                                coefficient, src_initialW, trg_initialW)
        elif model_type in ['label', 'pretrain']:
            model = model.Label(src_vocab_size, trg_vocab_size, embed_size,
                                hidden_size, class_size, dropout_ratio,
                                src_initialW, trg_initialW)
        else:
            model = model.EncoderDecoder(src_vocab_size, trg_vocab_size,
                                         embed_size, hidden_size,
                                         dropout_ratio, src_initialW,
                                         trg_initialW)
        if gpu_id >= 0:
            model.to_gpu()
        """OPTIMIZER"""
        optimizer = chainer.optimizers.Adam()
        optimizer.setup(model)
        optimizer.add_hook(chainer.optimizer.GradientClipping(gradclip))
        optimizer.add_hook(chainer.optimizer.WeightDecay(weight_decay))
        """PRETRAIN"""
        if model_type == 'pretrain' and load_model is None:
            logger.info('Pre-train start')
            pretrain_loss_dic = {}
            for epoch in range(1, pretrain_epoch + 1):
                train_loss = 0
                for i, batch in enumerate(train_iter.generate(), start=1):
                    try:
                        loss = model.pretrain(*batch)
                        train_loss += loss.data
                        optimizer.target.cleargrads()
                        loss.backward()
                        optimizer.update()
                    except Exception as e:
                        # Best-effort: log the failing batch, keep training.
                        logger.info('P{} ## train iter: {}, {}'.format(
                            epoch, i, e))
                chainer.serializers.save_npz(
                    model_save_dir + 'p_model_epoch_{}.npz'.format(epoch),
                    model)
                """EVALUATE"""
                valid_loss = 0
                for batch in valid_iter.generate():
                    with chainer.no_backprop_mode(), chainer.using_config(
                            'train', False):
                        valid_loss += model.pretrain(*batch).data
                logger.info('P{} ## train loss: {}, val loss:{}'.format(
                    epoch, train_loss, valid_loss))
                pretrain_loss_dic[epoch] = valid_loss
            """MODEL SAVE & LOAD"""
            best_epoch = min(pretrain_loss_dic,
                             key=(lambda x: pretrain_loss_dic[x]))
            logger.info('best_epoch:{}, val loss: {}'.format(
                best_epoch, pretrain_loss_dic[best_epoch]))
            shutil.copyfile(
                model_save_dir + 'p_model_epoch_{}.npz'.format(best_epoch),
                model_save_dir + 'p_best_model.npz')
            logger.info('Pre-train finish')
        if load_model:
            logger.info('load model: {}'.format(load_model))
            chainer.serializers.load_npz(base_dir + load_model, model)
        """TRAIN"""
        epoch_info = {}
        for epoch in range(1, n_epoch + 1):
            train_loss = 0
            # Mix QA and pseudo training batches each epoch.
            mix_train_iter = dataset.MixIterator(qa_iter, train_iter, seed=0,
                                                 shuffle=shuffle_data)
            for i, batch in enumerate(mix_train_iter.generate(), start=1):
                try:
                    loss = optimizer.target(*batch[0])
                    train_loss += loss.data
                    optimizer.target.cleargrads()
                    loss.backward()
                    optimizer.update()
                except Exception as e:
                    logger.info('V{} ## E{} ## train iter: {}, {}'.format(
                        ite, epoch, i, e))
            chainer.serializers.save_npz(
                model_valid_dir + 'model_epoch_{}.npz'.format(epoch), model)
            """DEV"""
            labels, alignments = [], []
            for i, batch in enumerate(dev_iter.generate(), start=1):
                try:
                    with chainer.no_backprop_mode(), chainer.using_config(
                            'train', False):
                        _, label, align = model.predict(batch[0], sos, eos)
                except Exception as e:
                    logger.info('V{} ## E{} ## dev iter: {}, {}'.format(
                        ite, epoch, i, e))
                if model_type == 'multi':
                    for l, a in zip(label, align):
                        labels.append(chainer.cuda.to_cpu(l))
                        alignments.append(chainer.cuda.to_cpu(a))
                elif model_type in ['label', 'pretrain']:
                    for l in label:
                        labels.append(chainer.cuda.to_cpu(l))
                else:
                    for a in align:
                        alignments.append(chainer.cuda.to_cpu(a))
            best_param_dic = evaluater.param_search(labels, alignments,
                                                    dev_data)
            param = max(best_param_dic,
                        key=lambda x: best_param_dic[x]['macro'])
            init, mix = evaluate.key_to_param(param)
            dev_score = round(best_param_dic[param]['macro'], 3)
            """TEST"""
            outputs, labels, alignments = [], [], []
            for i, batch in enumerate(test_iter.generate(), start=1):
                try:
                    with chainer.no_backprop_mode(), chainer.using_config(
                            'train', False):
                        output, label, align = model.predict(
                            batch[0], sos, eos)
                except Exception as e:
                    logger.info('V{} ## E{} ## test iter: {}, {}'.format(
                        ite, epoch, i, e))
                if model_type == 'multi':
                    for l, a in zip(label, align):
                        labels.append(chainer.cuda.to_cpu(l))
                        alignments.append(chainer.cuda.to_cpu(a))
                elif model_type in ['label', 'pretrain']:
                    for l in label:
                        labels.append(chainer.cuda.to_cpu(l))
                else:
                    for a in align:
                        alignments.append(chainer.cuda.to_cpu(a))
            rate, count, tf_lit, macro, micro = evaluater.eval_param(
                labels, alignments, test_data, init, mix)
            test_macro_score = round(macro, 3)
            test_micro_score = round(micro, 3)
            logger.info(
                'V{} ## E{} ## loss: {}, dev: {}, param: {}, micro: {}, macro: {}'
                .format(ite, epoch, train_loss, dev_score, param,
                        test_micro_score, test_macro_score))
            epoch_info[epoch] = {
                'id': test_data_id,
                'label': labels,
                'align': alignments,
                'hypo': outputs,
                'epoch': epoch,
                'dev_score': dev_score,
                'param': param,
                'rate': rate,
                'count': count,
                'tf': tf_lit,
                'macro': test_macro_score,
                'micro': test_micro_score
            }
            dataset.save_output(model_valid_dir, epoch_info[epoch])
        """MODEL SAVE"""
        best_epoch = max(epoch_info,
                         key=(lambda x: epoch_info[x]['dev_score']))
        cross_valid_result.append(epoch_info[best_epoch])
        logger.info(
            'V{} ## best_epoch: {}, dev: {}, micro: {}, macro: {}'.format(
                ite, best_epoch, epoch_info[best_epoch]['dev_score'],
                epoch_info[best_epoch]['micro'],
                epoch_info[best_epoch]['macro']))
        shutil.copyfile(
            model_valid_dir + 'model_epoch_{}.npz'.format(best_epoch),
            model_valid_dir + 'best_model.npz')
        logger.info('')
    # Aggregate cross-validation results over all folds.
    ave_dev_score, ave_macro_score, ave_micro_score = 0, 0, 0
    ave_test_score = [0 for _ in range(len(cross_valid_result[0]['rate']))]
    id_total, label_total, align_total, tf_total = [], [], [], []
    for v, r in enumerate(cross_valid_result, start=1):
        ave_dev_score += r['dev_score']
        ave_macro_score += r['macro']
        ave_micro_score += r['micro']
        for i, rate in enumerate(r['rate']):
            ave_test_score[i] += rate
        logger.info(' {}: e{}, {}\tdev: {}, micro: {}, macro: {} {}'.format(
            v, r['epoch'], r['param'], r['dev_score'], r['micro'],
            dataset.float_to_str(r['rate']), r['macro']))
        id_total.extend(r['id'])
        label_total.extend(r['label'])
        align_total.extend(r['align'])
        tf_total.extend(r['tf'])
    ave_dev_score = round(ave_dev_score / valid_num, 3)
    ave_macro_score = round(ave_macro_score / valid_num, 3)
    ave_micro_score = round(ave_micro_score / valid_num, 3)
    ave_test_score = [
        ave_test_score[i] / valid_num for i in range(len(ave_test_score))
    ]
    logger.info('dev: {}, micro: {}, macro: {} {}'.format(
        ave_dev_score, ave_micro_score, dataset.float_to_str(ave_test_score),
        ave_macro_score))
    # Re-order the concatenated fold outputs by example id and persist them.
    label, align, tf = dataset.sort_multi_list(id_total, label_total,
                                               align_total, tf_total)
    dataset.save_list(base_dir + 'label.txt', label)
    dataset.save_list(base_dir + 'align.txt', align)
    dataset.save_list(base_dir + 'tf.txt', tf)
def main(): embedding_dir = args.embedding+args.language print >> sys.stderr,"Read Embedding from %s ..."%embedding_dir embedding_dimention = 50 if args.language == "cn": embedding_dimention = 64 w2v = word2vec.Word2Vec(embedding_dir,embedding_dimention) #network_model net_dir = "./model/pretrain_ana/network_model_pretrain.cn.3" #net_dir = "./model/pretrain/network_model_pretrain.cn.10" #net_dir = "./model/nets/network_model.cn.1" #net_dir = './model/network_model.cn' #read_f = file('./model/network_model_pretrain.'+args.language, 'rb') print >> sys.stderr,"Read model from ./model/network_model."+args.language read_f = file(net_dir, 'rb') network_model = cPickle.load(read_f) #network_model = network.NetWork(1738,855,1000) train_docs = DataGenerate.doc_data_generater("train") dev_docs = DataGenerate.doc_data_generater("dev") test_docs = DataGenerate.doc_data_generater("test") MAX=5 train4test = [] # add 5 items for testing the training performance ## test performance after pretraining dev_docs_for_test = [] num = 0 for cases,gold_chain in DataGenerate.case_generater(train_docs,"train",w2v): #for cases,gold_chain in DataGenerate.case_generater(dev_docs,"dev",w2v): ev_doc = policy_network.generate_policy_test(cases,gold_chain,network_model) dev_docs_for_test.append(ev_doc) train4test.append((cases,gold_chain)) num += 1 if num >= MAX: break print "Performance on DATA after PreTRAINING" mp,mr,mf = evaluation.evaluate_documents(dev_docs_for_test,evaluation.muc) print "MUC: recall: %f precision: %f f1: %f"%(mr,mp,mf) bp,br,bf = evaluation.evaluate_documents(dev_docs_for_test,evaluation.b_cubed) print "BCUBED: recall: %f precision: %f f1: %f"%(br,bp,bf) cp,cr,cf = evaluation.evaluate_documents(dev_docs_for_test,evaluation.ceafe) print "CEAF: recall: %f precision: %f f1: %f"%(cr,cp,cf) print "#################################################" sys.stdout.flush() print >> sys.stderr,"Pre Train done" ##train add2train = True ran_p = 0.0 l2_lambda = 0.0000003 #l2_lambda = 
0.0001 lr = 0.0002 #lr = 0.0 #lr = 0.0001 #ce_lmbda = 0.1 ce_lmbda = 0.0 for echo in range(50): start_time = timeit.default_timer() cost_this_turn = 0.0 average_reward = 0.0 done_case_num = 0 #for cases,gold_chain in DataGenerate.case_generater_trick(train_docs,"train",w2v): for cases,gold_chain in DataGenerate.case_generater(train_docs,"train",w2v): #for single_mention_array,train_list,lable_list in pretrain.generate_pretrain_case(cases,gold_chain,network_model): # print lable_list this_reward = 0.0 reward_baseline = [] zero_num = 0 for single, train, action, reward in policy_network.generate_policy_case(cases,gold_chain,network_model,ran_p): #for single, train, action, reward , acp in policy_network.generate_policy_case_trick(cases,gold_chain,network_model,ran_p): reward_b = 0 if len(reward_baseline) < 1 else float(sum(reward_baseline))/float(len(reward_baseline)) norm_reward = reward - reward_b if reward > reward_b else 0.00001 this_reward = reward this_cost = network_model.train_step(single,train,action,reward,lr,l2_lambda,ce_lmbda,0.0)[0] #this_cost = network_model.train_step(single,train,action,norm_reward,lr,l2_lambda,ce_lmbda)[0] #print reward,this_cost cost_this_turn += this_cost #print this_cost,acp,reward #print this_cost reward_baseline.append(this_reward) if len(reward_baseline) >= 32: reward_baselin = reward_baseline[1:] average_reward += this_reward done_case_num += 1 if done_case_num >= MAX: break print network_model.get_weight_sum() end_time = timeit.default_timer() print >> sys.stderr, "Total cost:",cost_this_turn print >> sys.stderr, "Average Reward:",average_reward/float(done_case_num) print >> sys.stderr, "TRAINING Use %.3f seconds"%(end_time-start_time) ran_p = ran_p*0.5 ## test training performance train_docs_for_test = [] start_time = timeit.default_timer() for train_cases,train_doc_gold_chain in train4test: ev_doc = policy_network.generate_policy_test(train_cases,train_doc_gold_chain,network_model) train_docs_for_test.append(ev_doc) print 
"** Echo: %d **"%echo print "TRAIN" mp,mr,mf = evaluation.evaluate_documents(train_docs_for_test,evaluation.muc) print "MUC: recall: %f precision: %f f1: %f"%(mr,mp,mf) bp,br,bf = evaluation.evaluate_documents(train_docs_for_test,evaluation.b_cubed) print "BCUBED: recall: %f precision: %f f1: %f"%(br,bp,bf) cp,cr,cf = evaluation.evaluate_documents(train_docs_for_test,evaluation.ceafe) print "CEAF: recall: %f precision: %f f1: %f"%(cr,cp,cf) print sys.stdout.flush() '''
def main(): embedding_dir = args.embedding + args.language print >> sys.stderr, "Read Embedding from %s ..." % embedding_dir embedding_dimention = 50 if args.language == "cn": embedding_dimention = 64 w2v = word2vec.Word2Vec(embedding_dir, embedding_dimention) #network_model #net_dir = "./model/pretrain/network_model_pretrain.cn.19" #net_dir = "./model/pretrain_manu_dropout/network_model_pretrain.cn.10" if os.path.isfile("./model/network_model." + args.language): read_f = file('./model/network_model.' + args.language, 'rb') #read_f = file('./model/network_model_pretrain.'+args.language, 'rb') #read_f = file('./model/network_model_pretrain.cn.best', 'rb') #read_f = file(net_dir, 'rb') network_model = cPickle.load(read_f) print >> sys.stderr, "Read model from ./model/network_model." + args.language else: inpt_dimention = 1738 single_dimention = 855 if args.language == "en": inpt_dimention = 1374 single_dimention = 673 network_model = network.NetWork(inpt_dimention, single_dimention, 1000) print >> sys.stderr, "save model ..." save_f = file('./model/network_model.' 
+ args.language, 'wb') cPickle.dump(network_model, save_f, protocol=cPickle.HIGHEST_PROTOCOL) save_f.close() train_docs = DataGenerate.doc_data_generater("train") dev_docs = DataGenerate.doc_data_generater("dev") test_docs = DataGenerate.doc_data_generater("test") #pretrain l2_lambda = 0.0000001 lr = 0.03 ce_lambda = 0.0001 dropout_rate = 0.2 print "Weight Sum", network_model.get_weight_sum() times = 0 #for echo in range(11,40): for echo in range(10): start_time = timeit.default_timer() print "Pretrain ECHO:", echo cost_this_turn = 0.0 #print >> sys.stderr, network_model.get_weight_sum() done_num = 0 pos_num = 0 neg_num = 0 for cases, gold_chain in DataGenerate.case_generater( train_docs, "train", w2v): if len(cases) >= 700: continue for single_mention_array, train_list, lable_list in pretrain.generate_pretrain_case( cases, gold_chain, network_model): #cost_this_turn += network_model.pre_train_step(single_mention_array,train_list,lable_list,lr,l2_lambda,dropout_rate)[0] if lable_list[0] == 1: neg_num += 1 ana_cost, ana_result = network_model.ana_train_step( single_mention_array, 1, lr, l2_lambda, dropout_rate) else: pos_num += 1 ana_cost, ana_result = network_model.ana_train_step( single_mention_array, 0, lr, l2_lambda, dropout_rate) for intance, lable in zip(train_list, lable_list): mention_cost, mention_result = network_model.mention_train_step( intance, lable, lr, l2_lambda, dropout_rate) done_num += 1 if done_num == 10: break lr = lr * 0.99 save_f = file( './model/pretrain_manu_new/network_model_pretrain_pair.%s.%d' % (args.language, echo), 'wb') cPickle.dump(network_model, save_f, protocol=cPickle.HIGHEST_PROTOCOL) save_f.close() end_time = timeit.default_timer() print >> sys.stderr, "PreTrain", echo, "Total cost:", cost_this_turn print >> sys.stderr, "POS:NEG", pos_num, neg_num print >> sys.stderr, "lr", lr print >> sys.stderr, "PreTRAINING Use %.3f seconds" % (end_time - start_time) print "Weight Sum", network_model.get_weight_sum() ## test performance after 
pretraining dev_docs_for_test = [] num = 0 for cases, gold_chain in DataGenerate.case_generater( dev_docs, "dev", w2v): ev_doc = policy_network.generate_policy_test( cases, gold_chain, network_model) dev_docs_for_test.append(ev_doc) num += 1 if num == 10: break print "Performance on DEV after PreTRAINING" mp, mr, mf = evaluation.evaluate_documents(dev_docs_for_test, evaluation.muc) print "MUC: recall: %f precision: %f f1: %f" % (mr, mp, mf) bp, br, bf = evaluation.evaluate_documents(dev_docs_for_test, evaluation.b_cubed) print "BCUBED: recall: %f precision: %f f1: %f" % (br, bp, bf) cp, cr, cf = evaluation.evaluate_documents(dev_docs_for_test, evaluation.ceafe) print "CEAF: recall: %f precision: %f f1: %f" % (cr, cp, cf) print "#################################################" sys.stdout.flush() print >> sys.stderr, "Begin Normal Training" for echo in range(30): start_time = timeit.default_timer() print "Pretrain ECHO:", echo cost_this_turn = 0.0 #print >> sys.stderr, network_model.get_weight_sum() done_num = 0 pos_num = 0 neg_num = 0 for cases, gold_chain in DataGenerate.case_generater( train_docs, "train", w2v): if len(cases) >= 700: continue for single_mention_array, train_list, lable_list in pretrain.generate_pretrain_case( cases, gold_chain, network_model): cost_this_turn += network_model.pre_train_step( single_mention_array, train_list, lable_list, lr, l2_lambda, dropout_rate)[0] #cost_this_turn += network_model.pre_top_train_step(single_mention_array,train_list,lable_list,lr,l2_lambda)[0] if lable_list[0] == 1: neg_num += 1 else: pos_num += 1 done_num += 1 #if done_num == 10: # break lr = lr * 0.99 save_f = file( './model/pretrain_manu_new/network_model_pretrain.%s.%d' % (args.language, echo), 'wb') cPickle.dump(network_model, save_f, protocol=cPickle.HIGHEST_PROTOCOL) save_f.close() end_time = timeit.default_timer() print >> sys.stderr, "PreTrain", echo, "Total cost:", cost_this_turn print >> sys.stderr, "POS:NEG", pos_num, neg_num print >> sys.stderr, 
"lr", lr print >> sys.stderr, "PreTRAINING Use %.3f seconds" % (end_time - start_time) print "Weight Sum", network_model.get_weight_sum() ## test performance after pretraining dev_docs_for_test = [] num = 0 for cases, gold_chain in DataGenerate.case_generater( dev_docs, "dev", w2v): ev_doc = policy_network.generate_policy_test( cases, gold_chain, network_model) dev_docs_for_test.append(ev_doc) num += 1 if num == 10: break print "Performance on DEV after PreTRAINING" mp, mr, mf = evaluation.evaluate_documents(dev_docs_for_test, evaluation.muc) print "MUC: recall: %f precision: %f f1: %f" % (mr, mp, mf) bp, br, bf = evaluation.evaluate_documents(dev_docs_for_test, evaluation.b_cubed) print "BCUBED: recall: %f precision: %f f1: %f" % (br, bp, bf) cp, cr, cf = evaluation.evaluate_documents(dev_docs_for_test, evaluation.ceafe) print "CEAF: recall: %f precision: %f f1: %f" % (cr, cp, cf) print "#################################################" sys.stdout.flush() return for echo in range(30, 50): start_time = timeit.default_timer() cost_this_turn = 0.0 for cases, gold_chain in DataGenerate.case_generater( train_docs, "train", w2v): if len(cases) >= 700: continue for single_mention_array, train_list, lable_list in pretrain.generate_pretrain_case( cases, gold_chain, network_model): cost_this_turn += network_model.pre_ce_train_step( single_mention_array, train_list, lable_list, lr, l2_lambda, ce_lambda)[0] end_time = timeit.default_timer() print >> sys.stderr, "PreTrain", echo, "Total cost:", cost_this_turn print >> sys.stderr, "PreTRAINING Use %.3f seconds" % (end_time - start_time) print "Weight Sum", network_model.get_weight_sum() ## test performance after pretraining dev_docs_for_test = [] num = 0 for cases, gold_chain in DataGenerate.case_generater( dev_docs, "dev", w2v): ev_doc = policy_network.generate_policy_test( cases, gold_chain, network_model) dev_docs_for_test.append(ev_doc) print "Performance on DEV after PreTRAINING" mp, mr, mf = 
evaluation.evaluate_documents(dev_docs_for_test, evaluation.muc) print "MUC: recall: %f precision: %f f1: %f" % (mr, mp, mf) bp, br, bf = evaluation.evaluate_documents(dev_docs_for_test, evaluation.b_cubed) print "BCUBED: recall: %f precision: %f f1: %f" % (br, bp, bf) cp, cr, cf = evaluation.evaluate_documents(dev_docs_for_test, evaluation.ceafe) print "CEAF: recall: %f precision: %f f1: %f" % (cr, cp, cf) print "#################################################" sys.stdout.flush() save_f = file( './model/pretrain_manu_new/network_model_pretrain.%s.%d' % (args.language, echo), 'wb') cPickle.dump(network_model, save_f, protocol=cPickle.HIGHEST_PROTOCOL) save_f.close() ## test performance after pretraining print >> sys.stderr, "Begin test on DEV after pertraining" dev_docs_for_test = [] num = 0 #for dev_doc_mention_array,dev_doc_pair_array,dev_doc_gold_chain in DataGenerate.array_generater(dev_docs,"dev",w2v): #ev_doc = policy_network.generate_policy_test(dev_doc_mention_array,dev_doc_pair_array,dev_doc_gold_chain,network_model) for cases, gold_chain in DataGenerate.case_generater(dev_docs, "dev", w2v): ev_doc = policy_network.generate_policy_test(cases, gold_chain, network_model) dev_docs_for_test.append(ev_doc) print "Performance on DEV after PreTRAINING" mp, mr, mf = evaluation.evaluate_documents(dev_docs_for_test, evaluation.muc) print "MUC: recall: %f precision: %f f1: %f" % (mr, mp, mf) bp, br, bf = evaluation.evaluate_documents(dev_docs_for_test, evaluation.b_cubed) print "BCUBED: recall: %f precision: %f f1: %f" % (br, bp, bf) cp, cr, cf = evaluation.evaluate_documents(dev_docs_for_test, evaluation.ceafe) print "CEAF: recall: %f precision: %f f1: %f" % (cr, cp, cf) print "#################################################" sys.stdout.flush() print >> sys.stderr, "Pre Train done"
def main(): embedding_dir = args.embedding+args.language print >> sys.stderr,"Read Embedding from %s ..."%embedding_dir embedding_dimention = 50 if args.language == "cn": embedding_dimention = 64 w2v = word2vec.Word2Vec(embedding_dir,embedding_dimention) #network_model net_dir = "./model/pretrain_batch/network_model_pretrain.cn.9" if os.path.isfile("./model/network_model_batch."+args.language): #read_f = file('./model/network_model_batch.'+args.language, 'rb') #read_f = file('./model/network_model_pretrain.'+args.language, 'rb') #read_f = file('./model/network_model_pretrain.cn.best', 'rb') read_f = file(net_dir, 'rb') network_model = cPickle.load(read_f) print >> sys.stderr,"Read model from ./model/network_model_batch."+args.language else: inpt_dimention = 1738 single_dimention = 855 if args.language == "en": inpt_dimention = 1374 single_dimention = 673 network_model = network.NetWork(inpt_dimention,single_dimention,1000) print >> sys.stderr,"save model ..." save_f = file('./model/network_model_batch.'+args.language, 'wb') cPickle.dump(network_model, save_f, protocol=cPickle.HIGHEST_PROTOCOL) save_f.close() train_docs = DataGenerate.doc_data_generater("train") dev_docs = DataGenerate.doc_data_generater("dev") test_docs = DataGenerate.doc_data_generater("test") #pretrain l2_lambda = 0.0000003 lr = 0.00002 ce_lambda = 0.005 times = 0 for echo in range(0): start_time = timeit.default_timer() print "Pretrain ECHO:",echo cost_this_turn = 0.0 #print >> sys.stderr, network_model.get_weight_sum() for cases,gold_chain in DataGenerate.case_generater(train_docs,"train",w2v): if len(cases) >= 700: continue for train_list,single_mention_array,mask_list,lable_list in pretrain.generate_pretrain_case_batch(cases,gold_chain,network_model): cost_this_turn += network_model.pre_train_step(single_mention_array,train_list,mask_list,lable_list,lr,l2_lambda)[0] end_time = timeit.default_timer() print >> sys.stderr, "PreTrain",echo,"Total cost:",cost_this_turn print >> sys.stderr, 
"PreTRAINING Use %.3f seconds"%(end_time-start_time) save_f = file('./model/pretrain_batch/network_model_pretrain_noNorm.%s.%d'%(args.language,echo), 'wb') cPickle.dump(network_model, save_f, protocol=cPickle.HIGHEST_PROTOCOL) save_f.close() for echo in range(0): start_time = timeit.default_timer() cost_this_turn = 0.0 for cases,gold_chain in DataGenerate.case_generater(train_docs,"train",w2v): if len(cases) >= 700: continue for train_list,single_mention_array,mask_list,lable_list in pretrain.generate_pretrain_case_batch(cases,gold_chain,network_model): cost_this_turn += network_model.pre_ce_train_step(single_mention_array,train_list,mask_list,lable_list,lr,l2_lambda,ce_lambda)[0] save_f = file('./model/pretrain_batch/network_model_pretrain.%s.%d'%(args.language,echo), 'wb') cPickle.dump(network_model, save_f, protocol=cPickle.HIGHEST_PROTOCOL) save_f.close() end_time = timeit.default_timer() print >> sys.stderr, "PreTrain",echo,"Total cost:",cost_this_turn print >> sys.stderr, "PreTRAINING Use %.3f seconds"%(end_time-start_time) print >> sys.stderr,"Begin test on DEV after pertraining" ## test performance after pretraining dev_docs_for_test = [] num = 0 #for dev_doc_mention_array,dev_doc_pair_array,dev_doc_gold_chain in DataGenerate.array_generater(dev_docs,"dev",w2v): #ev_doc = policy_network.generate_policy_test(dev_doc_mention_array,dev_doc_pair_array,dev_doc_gold_chain,network_model) for cases,gold_chain in DataGenerate.case_generater(dev_docs,"dev",w2v): ev_doc = policy_network.generate_policy_test(cases,gold_chain,network_model) dev_docs_for_test.append(ev_doc) print "Performance on DEV after PreTRAINING" mp,mr,mf = evaluation.evaluate_documents(dev_docs_for_test,evaluation.muc) print "MUC: recall: %f precision: %f f1: %f"%(mr,mp,mf) bp,br,bf = evaluation.evaluate_documents(dev_docs_for_test,evaluation.b_cubed) print "BCUBED: recall: %f precision: %f f1: %f"%(br,bp,bf) cp,cr,cf = evaluation.evaluate_documents(dev_docs_for_test,evaluation.ceafe) print "CEAF: 
recall: %f precision: %f f1: %f"%(cr,cp,cf) print "#################################################" sys.stdout.flush() print >> sys.stderr,"Pre Train done" ##train train4test = [] # add 5 items for testing the training performance add2train = True for echo in range(10): start_time = timeit.default_timer() reward_baseline = [] cost_this_turn = 0.0 average_reward = 0.0 done_case_num = 0 l2_lambda = 0.000003 lr = 0.000002 ce_lambda = 0.0 for cases,gold_chain in DataGenerate.case_generater(train_docs,"train",w2v): if len(cases) >= 700: continue if add2train: if random.randint(1,200) == 10: #if not random.randint(1,200) == 10: #train4test.append((train_doc_mention_array,train_doc_pair_array,train_doc_gold_chain)) train4test.append((cases,gold_chain)) if len(train4test) == 20: add2train = False this_reward = 0.0 reward_b = 0 if len(reward_baseline) < 1 else float(sum(reward_baseline))/float(len(reward_baseline)) for train, single, mask, action, reward in policy_network.generate_policy_case(cases,gold_chain,network_model): if len(train) <= 1: continue #for single, train, action, reward , acp in policy_network.generate_policy_case_trick(cases,gold_chain,network_model): norm_reward = reward - reward_b this_reward = reward this_cost = network_model.train_step(single,train,mask,action,reward*100,lr,l2_lambda,ce_lambda)[0] #print this_cost,acp,reward cost_this_turn += this_cost average_reward += this_reward done_case_num += 1 #if done_case_num >= 1: # break print network_model.get_weight_sum() end_time = timeit.default_timer() print >> sys.stderr, "Total cost:",cost_this_turn print >> sys.stderr, "Average Reward:",average_reward/float(done_case_num) print >> sys.stderr, "TRAINING Use %.3f seconds"%(end_time-start_time) reward_baseline.append(this_reward) if len(reward_baseline) >= 64: reward_baselin = reward_baseline[1:] ## test training performance train_docs_for_test = [] start_time = timeit.default_timer() for train_cases,train_doc_gold_chain in train4test: ev_doc = 
policy_network.generate_policy_test(train_cases,train_doc_gold_chain,network_model) train_docs_for_test.append(ev_doc) print "** Echo: %d **"%echo print "TRAIN" mp,mr,mf = evaluation.evaluate_documents(train_docs_for_test,evaluation.muc) print "MUC: recall: %f precision: %f f1: %f"%(mr,mp,mf) bp,br,bf = evaluation.evaluate_documents(train_docs_for_test,evaluation.b_cubed) print "BCUBED: recall: %f precision: %f f1: %f"%(br,bp,bf) cp,cr,cf = evaluation.evaluate_documents(train_docs_for_test,evaluation.ceafe) print "CEAF: recall: %f precision: %f f1: %f"%(cr,cp,cf) print ## dev dev_docs_for_test = [] start_time = timeit.default_timer() for dev_cases,dev_doc_gold_chain in DataGenerate.case_generater(dev_docs,"dev",w2v): ev_doc = policy_network.generate_policy_test(dev_cases,dev_doc_gold_chain,network_model) dev_docs_for_test.append(ev_doc) print "DEV" mp,mr,mf = evaluation.evaluate_documents(dev_docs_for_test,evaluation.muc) print "MUC: recall: %f precision: %f f1: %f"%(mr,mp,mf) bp,br,bf = evaluation.evaluate_documents(dev_docs_for_test,evaluation.b_cubed) print "BCUBED: recall: %f precision: %f f1: %f"%(br,bp,bf) cp,cr,cf = evaluation.evaluate_documents(dev_docs_for_test,evaluation.ceafe) print "CEAF: recall: %f precision: %f f1: %f"%(cr,cp,cf) print end_time = timeit.default_timer() print >> sys.stderr, "DEV Use %.3f seconds"%(end_time-start_time) sys.stdout.flush() ## test test_docs_for_test = [] start_time = timeit.default_timer() #for test_doc_mention_array,test_doc_pair_array,test_doc_gold_chain in DataGenerate.array_generater(test_docs,"test",w2v): for test_cases,test_doc_gold_chain in DataGenerate.case_generater(test_docs,"test",w2v): ev_doc = policy_network.generate_policy_test(test_cases,test_doc_gold_chain,network_model) test_docs_for_test.append(ev_doc) print "TEST" mp,mr,mf = evaluation.evaluate_documents(test_docs_for_test,evaluation.muc) print "MUC: recall: %f precision: %f f1: %f"%(mr,mp,mf) bp,br,bf = 
evaluation.evaluate_documents(test_docs_for_test,evaluation.b_cubed) print "BCUBED: recall: %f precision: %f f1: %f"%(br,bp,bf) cp,cr,cf = evaluation.evaluate_documents(test_docs_for_test,evaluation.ceafe) print "CEAF: recall: %f precision: %f f1: %f"%(cr,cp,cf) print end_time = timeit.default_timer() print >> sys.stderr, "TEST Use %.3f seconds"%(end_time-start_time) sys.stdout.flush() save_f = file('./model/nets/network_model_batch.%s.%d'%(args.language,echo), 'wb') cPickle.dump(network_model, save_f, protocol=cPickle.HIGHEST_PROTOCOL) save_f.close()
def main(): embedding_dir = args.embedding + args.language print >> sys.stderr, "Read Embedding from %s ..." % embedding_dir embedding_dimention = 50 if args.language == "cn": embedding_dimention = 64 w2v = word2vec.Word2Vec(embedding_dir + ".filtered", embedding_dimention) #network_model if os.path.isfile("./model/network_model_index." + args.language): read_f = file('./model/network_model_index.' + args.language, 'rb') #read_f = file('./model/network_model_pretrain.'+args.language, 'rb') network_model = cPickle.load(read_f) print >> sys.stderr, "Read model from ./model/network_model_index." + args.language else: inpt_dimention = 1738 single_dimention = 855 if args.language == "en": inpt_dimention = 1374 single_dimention = 673 network_model = network.NetWork(inpt_dimention, single_dimention, 1000, embedding_dir + ".filtered", embedding_dimention) print >> sys.stderr, "save model ..." save_f = file('./model/network_model_index.' + args.language, 'wb') cPickle.dump(network_model, save_f, protocol=cPickle.HIGHEST_PROTOCOL) save_f.close() train_docs = DataGenerate.doc_data_generater("train") dev_docs = DataGenerate.doc_data_generater("dev") test_docs = DataGenerate.doc_data_generater("test") #pretrain times = 0 for echo in range(20): start_time = timeit.default_timer() print "Pretrain ECHO:", echo cost_this_turn = 0.0 for cases, gold_chain in DataGenerate.case_generater( train_docs, "train", w2v): if len(cases) >= 700: continue for single_mention_array, single_index, train_list, train_index, lable_list in pretrain.generate_pretrain_case( cases, gold_chain, network_model): print network_model.fff(train_list, train_index) cost_this_turn += network_model.pre_train_step( single_mention_array, single_index, train_list, train_index, lable_list, 0.0001)[0] end_time = timeit.default_timer() print >> sys.stderr, "PreTrain", echo, "Total cost:", cost_this_turn print >> sys.stderr, "PreTRAINING Use %.3f seconds" % (end_time - start_time) if echo % 4 == 0: save_f = file( 
'./model/network_model_pretrain_index.' + args.language, 'wb') cPickle.dump(network_model, save_f, protocol=cPickle.HIGHEST_PROTOCOL) save_f.close() save_f = file('./model/network_model_pretrain_index.' + args.language, 'wb') cPickle.dump(network_model, save_f, protocol=cPickle.HIGHEST_PROTOCOL) save_f.close() print >> sys.stderr, "Begin test on DEV after pertraining" ## test performance after pretraining dev_docs_for_test = [] num = 0 for cases, gold_chain in DataGenerate.case_generater(dev_docs, "dev", w2v): ev_doc = policy_network.generate_policy_test(cases, gold_chain, network_model) dev_docs_for_test.append(ev_doc) print "Performance on DEV after PreTRAINING" mp, mr, mf = evaluation.evaluate_documents(dev_docs_for_test, evaluation.muc) print "MUC: recall: %f precision: %f f1: %f" % (mr, mp, mf) bp, br, bf = evaluation.evaluate_documents(dev_docs_for_test, evaluation.b_cubed) print "BCUBED: recall: %f precision: %f f1: %f" % (br, bp, bf) cp, cr, cf = evaluation.evaluate_documents(dev_docs_for_test, evaluation.ceafe) print "CEAF: recall: %f precision: %f f1: %f" % (cr, cp, cf) print "##################################################" sys.stdout.flush() print >> sys.stderr, "Pre Train done" ##train train4test = [] # add 5 items for testing the training performance add2train = True for echo in range(20): start_time = timeit.default_timer() reward_baseline = [] cost_this_turn = 0.0 for cases, gold_chain in DataGenerate.case_generater( train_docs, "train", w2v): if add2train: if random.randint(1, 200) == 10: #train4test.append((train_doc_mention_array,train_doc_pair_array,train_doc_gold_chain)) train4test.append((cases, gold_chain)) if len(train4test) == 5: add2train = False this_reward = 0.0 for single, sindex, train, tindex, action, reward in policy_network.generate_policy_case( cases, gold_chain, network_model): cost_this_turn += network_model.train_step( single, sindex, train, tindex, action, reward, 0.0001)[0] end_time = timeit.default_timer() print >> 
sys.stderr, "Total cost:", cost_this_turn print >> sys.stderr, "TRAINING Use %.3f seconds" % (end_time - start_time) ## test training performance train_docs_for_test = [] start_time = timeit.default_timer() for train_cases, train_doc_gold_chain in train4test: ev_doc = policy_network.generate_policy_test( train_cases, train_doc_gold_chain, network_model) train_docs_for_test.append(ev_doc) print "** Echo: %d **" % echo print "TRAIN" mp, mr, mf = evaluation.evaluate_documents(train_docs_for_test, evaluation.muc) print "MUC: recall: %f precision: %f f1: %f" % (mr, mp, mf) bp, br, bf = evaluation.evaluate_documents(train_docs_for_test, evaluation.b_cubed) print "BCUBED: recall: %f precision: %f f1: %f" % (br, bp, bf) cp, cr, cf = evaluation.evaluate_documents(train_docs_for_test, evaluation.ceafe) print "CEAF: recall: %f precision: %f f1: %f" % (cr, cp, cf) print ## dev dev_docs_for_test = [] start_time = timeit.default_timer() #for dev_doc_mention_array,dev_doc_pair_array,dev_doc_gold_chain in DataGenerate.array_generater(dev_docs,"dev",w2v): #ev_doc = policy_network.generate_policy_test(dev_doc_mention_array,dev_doc_pair_array,dev_doc_gold_chain,network_model) for dev_cases, dev_doc_gold_chain in DataGenerate.case_generater( dev_docs, "dev", w2v): ev_doc = policy_network.generate_policy_test( dev_cases, dev_doc_gold_chain, network_model) dev_docs_for_test.append(ev_doc) print "DEV" mp, mr, mf = evaluation.evaluate_documents(dev_docs_for_test, evaluation.muc) print "MUC: recall: %f precision: %f f1: %f" % (mr, mp, mf) bp, br, bf = evaluation.evaluate_documents(dev_docs_for_test, evaluation.b_cubed) print "BCUBED: recall: %f precision: %f f1: %f" % (br, bp, bf) cp, cr, cf = evaluation.evaluate_documents(dev_docs_for_test, evaluation.ceafe) print "CEAF: recall: %f precision: %f f1: %f" % (cr, cp, cf) print end_time = timeit.default_timer() print >> sys.stderr, "DEV Use %.3f seconds" % (end_time - start_time) sys.stdout.flush() ## test test_docs_for_test = [] start_time = 
timeit.default_timer() #for test_doc_mention_array,test_doc_pair_array,test_doc_gold_chain in DataGenerate.array_generater(test_docs,"test",w2v): for test_cases, test_doc_gold_chain in DataGenerate.case_generater( test_docs, "test", w2v): ev_doc = policy_network.generate_policy_test( test_cases, test_doc_gold_chain, network_model) test_docs_for_test.append(ev_doc) print "TEST" mp, mr, mf = evaluation.evaluate_documents(test_docs_for_test, evaluation.muc) print "MUC: recall: %f precision: %f f1: %f" % (mr, mp, mf) bp, br, bf = evaluation.evaluate_documents(test_docs_for_test, evaluation.b_cubed) print "BCUBED: recall: %f precision: %f f1: %f" % (br, bp, bf) cp, cr, cf = evaluation.evaluate_documents(test_docs_for_test, evaluation.ceafe) print "CEAF: recall: %f precision: %f f1: %f" % (cr, cp, cf) print end_time = timeit.default_timer() print >> sys.stderr, "TEST Use %.3f seconds" % (end_time - start_time) sys.stdout.flush() '''
# Smoke test for the Cython word2vec build: compile the extension in place via
# pyximport, train a tiny model on a slice of the text8 corpus, and dump the
# resulting weight matrix.
import numpy
import pyximport
# Compile .pyx modules on import; numpy headers are required by the extension.
pyximport.install(inplace=True, setup_args={"include_dirs": numpy.get_include()})
# import test_sdot
import word2vec
import itertools

# Take only the first 100 sentences of the local text8 corpus.
# NOTE(review): hard-coded developer path -- adjust for your machine.
sentences = list(
    itertools.islice(
        word2vec.Text8Corpus('/Users/kofola/workspace/word2vec/text8'), 100))

# Train on a single sentence with 10-dim vectors; min_count=0 keeps every word.
model = word2vec.Word2Vec(sentences[:1], size=10, min_count=0)
# syn0 holds the input-embedding matrix (Python 2 print statement).
print model.syn0
def main():
    """Train and evaluate an RL-based coreference policy network (Python 2).

    Pipeline: load word embeddings -> load or build the pickled network
    model -> supervised pretraining -> (currently) return early; the
    policy-gradient training loop below the `return` is dead code kept
    for reference.  Relies on module-level `args`, `network`, `w2v`,
    `DataGenerate`, `pretrain`, `policy_network`, `evaluation` globals.
    """
    # Embedding file is selected by language; Chinese vectors are 64-d, default 50-d.
    embedding_dir = args.embedding+args.language
    print >> sys.stderr,"Read Embedding from %s ..."%embedding_dir
    embedding_dimention = 50
    if args.language == "cn": embedding_dimention = 64
    w2v = word2vec.Word2Vec(embedding_dir,embedding_dimention)

    #network_model
    # Resume from a pickled model if one exists, otherwise build a fresh one
    # and immediately persist it.
    if os.path.isfile("./model/network_model."+args.language):
        read_f = file('./model/network_model.'+args.language, 'rb')
        #read_f = file('./model/network_model_pretrain.'+args.language, 'rb')
        network_model = cPickle.load(read_f)
        print >> sys.stderr,"Read model from ./model/network_model."+args.language
    else:
        # Input/feature dimensions differ per language (en vs cn).
        inpt_dimention = 1738
        single_dimention = 855
        if args.language == "en":
            inpt_dimention = 1374
            single_dimention = 673
        network_model = network.NetWork(inpt_dimention,single_dimention,1000)
        print >> sys.stderr,"save model ..."
        save_f = file('./model/network_model.'+args.language, 'wb')
        cPickle.dump(network_model, save_f, protocol=cPickle.HIGHEST_PROTOCOL)
        save_f.close()

    train_docs = DataGenerate.doc_data_generater("train")
    dev_docs = DataGenerate.doc_data_generater("dev")
    test_docs = DataGenerate.doc_data_generater("test")

    # Caps on how many documents are consumed per pretrain epoch / per eval pass.
    most_time = 100
    most_time_test = 50

    #pretrain
    # Supervised pretraining: 10 epochs over (at most `most_time`) training docs.
    for echo in range(10):
        start_time = timeit.default_timer()
        print "Pretrain ECHO:",echo
        cost_this_turn = 0.0
        num = most_time
        #print >> sys.stderr, network_model.get_weight_sum()
        for train_doc_mention_array,train_doc_pair_array,train_doc_gold_chain in DataGenerate.array_generater(train_docs,"train",w2v):
            num -= 1
            if num <= 0:
                break
            for single_mention_array,train_list,lable_list in pretrain.generate_pretrain_case(train_doc_mention_array,train_doc_pair_array,train_doc_gold_chain,network_model):
                #print single_mention_array
                # Accumulate the supervised pretraining loss (lr=0.0003).
                cost_this_turn += network_model.pre_train_step(single_mention_array,train_list,lable_list,0.0003)[0]

        # Sanity pass over one training document per epoch (note the `break`).
        for dev_doc_mention_array,dev_doc_pair_array,dev_doc_gold_chain in DataGenerate.array_generater(train_docs,"train",w2v):
            ev_doc = policy_network.generate_policy_test(dev_doc_mention_array,dev_doc_pair_array,dev_doc_gold_chain,network_model)
            break
        print network_model.get_weight_sum()
        end_time = timeit.default_timer()
        print >> sys.stderr, "PreTrain",echo,"Total cost:",cost_this_turn
        print >> sys.stderr, "PreTRAINING Use %.3f seconds"%(end_time-start_time)

    # Persist the pretrained model.
    # NOTE(review): original line breaks were lost; this save may have been inside
    # the epoch loop (per-epoch checkpointing) — confirm against upstream history.
    save_f = file('./model/network_model_pretrain.'+args.language, 'wb')
    cPickle.dump(network_model, save_f, protocol=cPickle.HIGHEST_PROTOCOL)
    save_f.close()

    print >> sys.stderr,"Begin test on DEV after pertraining"
    ## test performance on dev after pretraining
    # First eval pass actually iterates TRAIN docs (see the commented-out dev
    # line below) — intentional: measures fit on training data.
    dev_docs_for_test = []
    num = most_time_test
    #for dev_doc_mention_array,dev_doc_pair_array,dev_doc_gold_chain in DataGenerate.array_generater(dev_docs,"dev",w2v):
    for dev_doc_mention_array,dev_doc_pair_array,dev_doc_gold_chain in DataGenerate.array_generater(train_docs,"train",w2v):
        num -= 1
        if num <= 0:
            break
        ev_doc = policy_network.generate_policy_test(dev_doc_mention_array,dev_doc_pair_array,dev_doc_gold_chain,network_model)
        dev_docs_for_test.append(ev_doc)
    print "Performance on TRAIN after PreTRAINING"
    # Standard coreference metrics: MUC, B-cubed, CEAF.
    mp,mr,mf = evaluation.evaluate_documents(dev_docs_for_test,evaluation.muc)
    print "MUC: recall: %f precision: %f f1: %f"%(mr,mp,mf)
    bp,br,bf = evaluation.evaluate_documents(dev_docs_for_test,evaluation.b_cubed)
    print "BCUBED: recall: %f precision: %f f1: %f"%(br,bp,bf)
    cp,cr,cf = evaluation.evaluate_documents(dev_docs_for_test,evaluation.ceafe)
    print "CEAF: recall: %f precision: %f f1: %f"%(cr,cp,cf)
    print "##################################################"
    sys.stdout.flush()
    print >> sys.stderr,"Pre Train done"

    ## test performance on dev after pretraining
    # Second eval pass: genuine DEV documents.
    dev_docs_for_test = []
    num = most_time_test
    for dev_doc_mention_array,dev_doc_pair_array,dev_doc_gold_chain in DataGenerate.array_generater(dev_docs,"dev",w2v):
        num -= 1
        if num <= 0:
            break
        ev_doc = policy_network.generate_policy_test(dev_doc_mention_array,dev_doc_pair_array,dev_doc_gold_chain,network_model)
        dev_docs_for_test.append(ev_doc)
    print "Performance on DEV after PreTRAINING"
    mp,mr,mf = evaluation.evaluate_documents(dev_docs_for_test,evaluation.muc)
    print "MUC: recall: %f precision: %f f1: %f"%(mr,mp,mf)
    bp,br,bf = evaluation.evaluate_documents(dev_docs_for_test,evaluation.b_cubed)
    print "BCUBED: recall: %f precision: %f f1: %f"%(br,bp,bf)
    cp,cr,cf = evaluation.evaluate_documents(dev_docs_for_test,evaluation.ceafe)
    print "CEAF: recall: %f precision: %f f1: %f"%(cr,cp,cf)
    print "##################################################"
    sys.stdout.flush()
    print >> sys.stderr,"Pre Train done"

    # NOTE(review): everything below this `return` is unreachable — the RL
    # fine-tuning stage is currently disabled.
    return

    ##train
    train4test = []
    # add 5 items for testing the training performance
    add2train = True
    for echo in range(20):
        start_time = timeit.default_timer()
        reward_baseline = []
        cost_this_turn = 0.0
        trick_num = 0
        for train_doc_mention_array,train_doc_pair_array,train_doc_gold_chain in DataGenerate.array_generater(train_docs,"train",w2v):
            #trick_num += 1
            #if trick_num < 80:
            #    continue
            # Randomly reserve 5 training documents to monitor training fit.
            if add2train:
                if random.randint(1,200) == 100:
                    train4test.append((train_doc_mention_array,train_doc_pair_array,train_doc_gold_chain))
                    if len(train4test) == 5:
                        add2train = False
            this_reward = 0.0
            #for train_batch, mask_batch, action_batch, reward_batch in policy_network.generate_policy_case(train_doc_mention_array,train_doc_pair_array,train_doc_gold_chain,network_model):
            for single, train, action, reward in policy_network.generate_policy_case(train_doc_mention_array,train_doc_pair_array,train_doc_gold_chain,network_model):
                #this_reward = reward_batch
                #reward_b = 0 if len(reward_baseline) < 1 else float(sum(reward_baseline))/float(len(reward_baseline))
                #norm_reward = numpy.array(reward_batch) - reward_b
                #cost_this_turn += network_model.train_step(train_batch,mask_batch,action_batch,norm_reward,0.0001)[0]
                # Policy-gradient update (lr=0.0001); reward baseline is disabled.
                cost_this_turn += network_model.train_step(single,train,action,reward,0.0001)[0]
        end_time = timeit.default_timer()
        print >> sys.stderr, "Total cost:",cost_this_turn
        print >> sys.stderr, "TRAINING Use %.3f seconds"%(end_time-start_time)
        #reward_baseline.append(this_reward)
        #if len(reward_baseline) >= 32:
        #    reward_baselin = reward_baseline[1:]

        ## test training performance
        train_docs_for_test = []
        start_time = timeit.default_timer()
        for train_doc_mention_array,train_doc_pair_array,train_doc_gold_chain in train4test:
            ev_doc = policy_network.generate_policy_test(train_doc_mention_array,train_doc_pair_array,train_doc_gold_chain,network_model)
            train_docs_for_test.append(ev_doc)
        print "** Echo: %d **"%echo
        print "TRAIN"
        mp,mr,mf = evaluation.evaluate_documents(train_docs_for_test,evaluation.muc)
        print "MUC: recall: %f precision: %f f1: %f"%(mr,mp,mf)
        bp,br,bf = evaluation.evaluate_documents(train_docs_for_test,evaluation.b_cubed)
        print "BCUBED: recall: %f precision: %f f1: %f"%(br,bp,bf)
        cp,cr,cf = evaluation.evaluate_documents(train_docs_for_test,evaluation.ceafe)
        print "CEAF: recall: %f precision: %f f1: %f"%(cr,cp,cf)
        print

        ## dev
        dev_docs_for_test = []
        start_time = timeit.default_timer()
        for dev_doc_mention_array,dev_doc_pair_array,dev_doc_gold_chain in DataGenerate.array_generater(dev_docs,"dev",w2v):
            ev_doc = policy_network.generate_policy_test(dev_doc_mention_array,dev_doc_pair_array,dev_doc_gold_chain,network_model)
            dev_docs_for_test.append(ev_doc)
        print "DEV"
        mp,mr,mf = evaluation.evaluate_documents(dev_docs_for_test,evaluation.muc)
        print "MUC: recall: %f precision: %f f1: %f"%(mr,mp,mf)
        bp,br,bf = evaluation.evaluate_documents(dev_docs_for_test,evaluation.b_cubed)
        print "BCUBED: recall: %f precision: %f f1: %f"%(br,bp,bf)
        cp,cr,cf = evaluation.evaluate_documents(dev_docs_for_test,evaluation.ceafe)
        print "CEAF: recall: %f precision: %f f1: %f"%(cr,cp,cf)
        print
        end_time = timeit.default_timer()
        print >> sys.stderr, "DEV Use %.3f seconds"%(end_time-start_time)
        sys.stdout.flush()

        ## test
        test_docs_for_test = []
        start_time = timeit.default_timer()
        for test_doc_mention_array,test_doc_pair_array,test_doc_gold_chain in DataGenerate.array_generater(test_docs,"test",w2v):
            ev_doc = policy_network.generate_policy_test(test_doc_mention_array,test_doc_pair_array,test_doc_gold_chain,network_model)
            test_docs_for_test.append(ev_doc)
        print "TEST"
        mp,mr,mf = evaluation.evaluate_documents(test_docs_for_test,evaluation.muc)
        print "MUC: recall: %f precision: %f f1: %f"%(mr,mp,mf)
        bp,br,bf = evaluation.evaluate_documents(test_docs_for_test,evaluation.b_cubed)
        print "BCUBED: recall: %f precision: %f f1: %f"%(br,bp,bf)
        cp,cr,cf = evaluation.evaluate_documents(test_docs_for_test,evaluation.ceafe)
        print "CEAF: recall: %f precision: %f f1: %f"%(cr,cp,cf)
        print
        end_time = timeit.default_timer()
        print >> sys.stderr, "TEST Use %.3f seconds"%(end_time-start_time)
        sys.stdout.flush()

        # Checkpoint the model once per RL epoch.
        save_f = file('./model/nets/network_model.%s.%d'%(args.language,echo), 'wb')
        cPickle.dump(network_model, save_f, protocol=cPickle.HIGHEST_PROTOCOL)
        save_f.close()
def main():
    """Cross-validated training driver for a chainer seq2seq/labeling model.

    Reads hyper-parameters from an INI config, runs `valid_num`-fold cross
    validation (optional pretraining for model_type == 'pretrain'), grid
    searches thresholds on DEV, evaluates on TEST, and logs/saves the best
    model per fold plus an aggregate summary.
    """
    args = parse_args()
    config = configparser.ConfigParser()
    """ARGS DETAIL"""
    config_file = args.config_file
    batch_size = args.batch
    n_epoch = args.epoch
    pretrain_epoch = args.pretrain_epoch
    gpu_id = args.gpu
    model_type = args.model
    vocab_type = args.vocab
    pretrain_w2v = args.pretrain_w2v
    data_path = args.data_path

    """DIR PREPARE"""
    config.read(config_file)
    vocab_size = int(config['Parameter']['vocab_size'])
    coefficient = float(config['Parameter']['coefficient'])
    # Vocab tag gets a 'p' prefix when pretrained word2vec embeddings are used.
    vocab_name = vocab_type
    if pretrain_w2v:
        vocab_name = 'p' + vocab_name
    # Output directory name encodes the experiment configuration.
    if model_type == 'multi':
        model_dir = './super_{}_{}{}_{}_c{}/'.format(model_type, vocab_name, vocab_size, data_path[0], coefficient)
    else:
        model_dir = './super_{}_{}{}_{}/'.format(model_type, vocab_name, vocab_size, data_path[0])
    if not os.path.exists(model_dir):
        os.mkdir(model_dir)
        shutil.copyfile(config_file, model_dir + config_file)
    # From here on, read the copy of the config stored inside the model dir,
    # so reruns see the snapshot taken at experiment creation.
    config_file = model_dir + config_file
    config.read(config_file)

    """PARAMATER"""
    embed_size = int(config['Parameter']['embed_size'])
    hidden_size = int(config['Parameter']['hidden_size'])
    class_size = int(config['Parameter']['class_size'])
    dropout_ratio = float(config['Parameter']['dropout'])
    weight_decay = float(config['Parameter']['weight_decay'])
    gradclip = float(config['Parameter']['gradclip'])
    vocab_size = int(config['Parameter']['vocab_size'])
    coefficient = float(config['Parameter']['coefficient'])
    valid_num = int(config['Parameter']['valid_num'])

    """LOGGER"""
    log_file = model_dir + 'log.txt'
    logger = dataset.prepare_logger(log_file)
    logger.info(args)  # record the command-line arguments
    logger.info('[Training start] logging to {}'.format(log_file))

    """DATASET"""
    train_src_file = config[data_path]['train_src_file']
    train_trg_file = config[data_path]['train_trg_file']
    valid_src_file = config[data_path]['valid_src_file']
    valid_trg_file = config[data_path]['valid_trg_file']
    test_src_file = config[data_path]['single_src_file']
    test_trg_file = config[data_path]['single_trg_file']
    src_w2v_file = config[data_path]['src_w2v_file']
    trg_w2v_file = config[data_path]['trg_w2v_file']

    # Only the "single" files are loaded: the same pool is shuffled and
    # sliced into valid_num folds for cross validation below.
    correct_label, src_label, src_text, correct_index = dataset.load_binary_score_file(test_src_file)
    trg_text = dataset.load(test_trg_file)
    slice_size = len(correct_label) // valid_num
    correct_label, src_label, src_text, trg_text, correct_index = gridsearch.shuffle_list(correct_label, src_label, src_text, trg_text, correct_index)
    correct_label = gridsearch.slice_list(correct_label, slice_size)
    src_label = gridsearch.slice_list(src_label, slice_size)
    src_text = gridsearch.slice_list(src_text, slice_size)
    trg_text = gridsearch.slice_list(trg_text, slice_size)
    correct_index = gridsearch.slice_list(correct_index, slice_size)

    evaluater = evaluate.Evaluate()
    cross_valid_result = []
    # One iteration per cross-validation fold.
    for ite in range(1, valid_num + 1):
        model_valid_dir = model_dir + 'valid{}/'.format(ite)
        if not os.path.exists(model_valid_dir):
            os.mkdir(model_valid_dir)
        index = ite - 1
        c_label_train, c_label_dev, c_label_test = gridsearch.split_train_dev_test(correct_label, index)
        label_train, label_dev, label_test = gridsearch.split_train_dev_test(src_label, index)
        src_train, src_dev, src_test = gridsearch.split_train_dev_test(src_text, index)
        trg_train, trg_dev, trg_test = gridsearch.split_train_dev_test(trg_text, index)
        c_index_train, c_index_dev, c_index_test = gridsearch.split_train_dev_test(correct_index, index)

        """VOCABULARY"""
        src_vocab, trg_vocab, sos, eos = dataset.prepare_vocab(model_valid_dir, vocab_type, src_train, trg_train, vocab_size, gpu_id)
        src_vocab_size = len(src_vocab.vocab)
        trg_vocab_size = len(trg_vocab.vocab)
        src_initialW = None
        trg_initialW = None
        # Optionally initialize the embedding layers from pretrained word2vec.
        if pretrain_w2v:
            w2v = word2vec.Word2Vec()
            src_initialW, vector_size, src_match_word_count = w2v.make_initialW(src_vocab.vocab, src_w2v_file)
            trg_initialW, vector_size, trg_match_word_count = w2v.make_initialW(trg_vocab.vocab, trg_w2v_file)
            logger.info('Initialize w2v embedding. Match: src {}/{}, trg {}/{}'.format(src_match_word_count, src_vocab_size, trg_match_word_count, trg_vocab_size))

        """ITERATOR"""
        train_iter = dataset.Iterator(src_train, label_train, trg_train, src_vocab, trg_vocab, batch_size, gpu_id, sort=True, shuffle=True)
        # train_iter = dataset.Iterator(src_train, label_train, trg_train, src_vocab, trg_vocab, batch_size, gpu_id, sort=False, shuffle=False)
        dev_iter = dataset.Iterator(src_dev, label_dev, trg_dev, src_vocab, trg_vocab, batch_size, gpu_id, sort=False, shuffle=False)
        test_iter = dataset.Iterator(src_test, label_test, trg_test, src_vocab, trg_vocab, batch_size, gpu_id, sort=False, shuffle=False)
        logger.info('V{} ## train:{}, dev:{}, test:{}, src_vocab:{}, trg_vocab:{}'.format(ite, len(label_train), len(label_dev), len(label_test), src_vocab_size, trg_vocab_size))

        """MODEL"""
        # NOTE(review): these assignments rebind the `model` MODULE name to a
        # model instance; on the second cross-validation fold `model.Multi`
        # would be an attribute lookup on the instance, not the module —
        # likely AttributeError. Confirm and rename the local (e.g. `net`).
        if model_type == 'multi':
            model = model.Multi(src_vocab_size, trg_vocab_size, embed_size, hidden_size, class_size, dropout_ratio, coefficient, src_initialW, trg_initialW)
        elif model_type in ['label', 'pretrain']:
            model = model.Label(src_vocab_size, trg_vocab_size, embed_size, hidden_size, class_size, dropout_ratio, src_initialW, trg_initialW)
        else:
            model = model.EncoderDecoder(src_vocab_size, trg_vocab_size, embed_size, hidden_size, dropout_ratio, src_initialW, trg_initialW)

        """OPTIMIZER"""
        optimizer = chainer.optimizers.Adam()
        optimizer.setup(model)
        optimizer.add_hook(chainer.optimizer.GradientClipping(gradclip))
        optimizer.add_hook(chainer.optimizer.WeightDecay(weight_decay))

        """GPU"""
        if gpu_id >= 0:
            chainer.cuda.get_device_from_id(gpu_id).use()
            model.to_gpu()

        """PRETRAIN"""
        # Optional pretraining phase; selects the epoch with the lowest
        # validation loss and copies it to p_best_model.npz.
        if model_type == 'pretrain':
            logger.info('Pre-train start')
            logger.info('train size: {}, valid size: {}'.format(len(label_train), len(label_dev)))
            pretrain_loss_dic = {}
            for epoch in range(1, pretrain_epoch + 1):
                train_loss = 0
                for i, batch in enumerate(train_iter.generate(), start=1):
                    try:
                        loss = model.pretrain(*batch)
                        train_loss += loss.data
                        optimizer.target.cleargrads()
                        loss.backward()
                        optimizer.update()
                    except Exception as e:
                        # Best-effort: log the failing batch and continue training.
                        logger.info('V{} ## P{} ## train iter: {}, {}'.format(ite, epoch, i, e))
                chainer.serializers.save_npz(model_valid_dir + 'p_model_epoch_{}.npz'.format(epoch), model)

                """EVALUATE"""
                valid_loss = 0
                for batch in dev_iter.generate():
                    with chainer.no_backprop_mode(), chainer.using_config('train', False):
                        valid_loss += model.pretrain(*batch).data
                logger.info('V{} ## P{} ## train loss: {}, val loss:{}'.format(ite, epoch, train_loss, valid_loss))
                pretrain_loss_dic[epoch] = valid_loss

            """MODEL SAVE"""
            best_epoch = min(pretrain_loss_dic, key=(lambda x: pretrain_loss_dic[x]))
            logger.info('best_epoch:{}, val loss: {}'.format(best_epoch, pretrain_loss_dic[best_epoch]))
            shutil.copyfile(model_valid_dir + 'p_model_epoch_{}.npz'.format(best_epoch), model_valid_dir + 'p_best_model.npz')
            logger.info('Pre-train finish')

        """TRAIN"""
        accuracy_dic = {}
        for epoch in range(1, n_epoch + 1):
            train_loss = 0
            for i, batch in enumerate(train_iter.generate(), start=1):
                try:
                    loss = optimizer.target(*batch)
                    train_loss += loss.data
                    optimizer.target.cleargrads()
                    loss.backward()
                    optimizer.update()
                except Exception as e:
                    logger.info('V{} ## E{} ## train iter: {}, {}'.format(ite, epoch, i, e))
            chainer.serializers.save_npz(model_valid_dir + 'model_epoch_{}.npz'.format(epoch), model)

            """DEV"""
            outputs = []
            labels = []
            alignments = []
            for i, batch in enumerate(dev_iter.generate(), start=1):
                try:
                    with chainer.no_backprop_mode(), chainer.using_config('train', False):
                        output, label, align = model.predict(batch[0], sos, eos)
                except Exception as e:
                    # NOTE(review): if predict raises, output/label/align keep the
                    # values from the PREVIOUS batch (or are unbound on the first
                    # batch) and are still consumed below — confirm intent.
                    logger.info('V{} ## E{} ## dev iter: {}, {}'.format(ite, epoch, i, e))
                if model_type == 'multi':
                    for o, l, a in zip(output, label, align):
                        outputs.append(trg_vocab.id2word(chainer.cuda.to_cpu(o)))
                        labels.append(chainer.cuda.to_cpu(l))
                        alignments.append(chainer.cuda.to_cpu(a))
                elif model_type in ['label', 'pretrain']:
                    for l in label:
                        labels.append(chainer.cuda.to_cpu(l))
                else:
                    for o, a in zip(output, align):
                        outputs.append(trg_vocab.id2word(chainer.cuda.to_cpu(o)))
                        alignments.append(chainer.cuda.to_cpu(a))
            # Grid-search decision thresholds on DEV.
            if model_type == 'encdec':
                best_param_dic = evaluater.param_search(alignments, [], c_label_dev)
            else:
                best_param_dic = evaluater.param_search(labels, alignments, c_label_dev)
            param = max(best_param_dic, key=lambda x: best_param_dic[x])
            init, mix = evaluate.key_to_param(param)
            dev_score = round(best_param_dic[param], 3)

            """TEST"""
            outputs = []
            labels = []
            alignments = []
            for i, batch in enumerate(test_iter.generate(), start=1):
                try:
                    with chainer.no_backprop_mode(), chainer.using_config('train', False):
                        output, label, align = model.predict(batch[0], sos, eos)
                except Exception as e:
                    logger.info('V{} ## E{} ## test iter: {}, {}'.format(ite, epoch, i, e))
                if model_type == 'multi':
                    for o, l, a in zip(output, label, align):
                        outputs.append(trg_vocab.id2word(chainer.cuda.to_cpu(o)))
                        labels.append(chainer.cuda.to_cpu(l))
                        alignments.append(chainer.cuda.to_cpu(a))
                elif model_type in ['label', 'pretrain']:
                    for l in label:
                        labels.append(chainer.cuda.to_cpu(l))
                else:
                    for o, a in zip(output, align):
                        outputs.append(trg_vocab.id2word(chainer.cuda.to_cpu(o)))
                        alignments.append(chainer.cuda.to_cpu(a))
            # Apply the DEV-selected parameters to TEST.
            if model_type in ['multi', 'label', 'pretrain']:
                s_rate, s_count, _, _, s_result = evaluater.eval_param(labels, alignments, c_label_test, c_index_test, init, mix)
            else:
                s_rate, s_count, _, _, s_result = evaluater.eval_param(alignments, [], c_label_test, c_index_test, init, mix)
            test_score = round(s_rate[-1], 3)
            logger.info('V{} ## E{} ## loss:{}, dev: {}, test: {}'.format(ite, epoch, train_loss, dev_score, test_score))
            dataset.save_output(model_valid_dir, epoch, labels, alignments, outputs, s_result)
            accuracy_dic[epoch] = [epoch, dev_score, test_score, param, s_rate, s_result]

        """MODEL SAVE"""
        # Keep the epoch with the best DEV score for this fold.
        best_epoch = max(accuracy_dic, key=(lambda x: accuracy_dic[x][1]))
        cross_valid_result.append(accuracy_dic[best_epoch])
        logger.info('V{} ## best_epoch:{}, dev:{}, test:{}'.format(ite, best_epoch, accuracy_dic[best_epoch][1], accuracy_dic[best_epoch][2]))
        shutil.copyfile(model_valid_dir + 'model_epoch_{}.npz'.format(best_epoch), model_valid_dir + 'best_model.npz')
        logger.info('')

    # Aggregate results over all folds.
    average_dev_score = 0
    average_test_score = [0 for _ in range(len(cross_valid_result[0][4]))]
    s_result_total = []
    for i, r in enumerate(cross_valid_result, start=1):
        epoch = r[0]
        dev_score = r[1]
        param = r[3]
        test_score_list = [round(rr, 3) for rr in r[4]]
        s_result = r[5]
        average_dev_score += dev_score
        average_test_score = [average_test_score[i] + test_score_list[i] for i in range(len(average_test_score))]
        logger.info(' {}: epoch{}, {}\t{}'.format(i, epoch, param, ' '.join(dataset.float_to_str(test_score_list))))
        s_result_total.extend(s_result)
    average_dev_score = round(average_dev_score / len(cross_valid_result), 3)
    average_test_score = [round(average_test_score[i] / len(cross_valid_result), 3) for i in range(len(average_test_score))]
    logger.info('dev: {}, test: {}'.format(average_dev_score, ' '.join(dataset.float_to_str(average_test_score))))
    # Dump per-sample results sorted by original index.
    with open(model_dir + 's_res.txt', 'w') as f:
        [f.write('{}\n'.format(l[1])) for l in sorted(s_result_total, key=lambda x: x[0])]
def parseFile(filepath):
    """Parse a social-media CSV export into a feature matrix and targets.

    Expected columns (after a header row that is discarded):
    Engagements, Followers at Posting, Created, Type, Description.

    Returns a two-element list ``[data, expected]`` where ``data`` is a list
    of feature rows (followers + date features + post-type features + caption
    word2vec vector) and ``expected`` is the list of engagement counts.
    """
    # `with` guarantees the file is closed even if parsing raises
    # (the original leaked the handle on any exception before fp.close()).
    with open(filepath, 'r') as fp:
        lines = fp.readlines()
    # discard headers
    lines = lines[1:]
    # Engagements/Followers at Posting/Created/Type/Description
    data = []
    expected = []
    captions = []
    # Re-join physical lines that belong to a single logical record: a valid
    # record splits into 5 fields on the first 4 commas; a line that doesn't
    # is a continuation of the previous one (caption contained a newline).
    joined = True
    while joined:
        joined = False
        for li in range(0, len(lines)):
            splitline = lines[li].split(',', 4)
            if len(splitline) < 5:
                lines[li - 1:li + 1] = [''.join(lines[li - 1:li + 1])]
                joined = True
                break
    # Collect every caption word to build the word2vec vocabulary.
    for line in lines:
        splitline = line.split(',', 4)
        caption = splitline[4]
        captions = captions + caption.split()
    w2v = word2vec.Word2Vec(captions, True)
    for line in lines:
        splitline = line.split(',', 4)
        if len(splitline) == 5:
            parsedline = []
            expected.append(int(splitline[0]))  # engagements - target value
            parsedline.append(int(splitline[1]))  # followers
            parsedline = parsedline + dateparser.DateParser.getDateTime(
                splitline[2])  # created time
            parsedline = parsedline + posttypeparser.PostTypeParser.getPostType(
                splitline[3])  # post type
            parsedline = parsedline + w2v.getVector(splitline[4])  # description
            data.append(parsedline)  # add parsed line to data set
    return [data, expected]


#of = open("output.txt", 'w')
#o = FileParser.parseFile("data/business/training_set.csv")
#writestring = ""
#for entry in o[0]:
#    writestring += str(entry) + '\n'
#of.write(writestring)
else: return numpy.array([0.5] * (n - 4) + p_list) if args.type == "nn_train": if os.path.isfile("./model/save_data"): print >> sys.stderr, "Read from file ./model/save_data" read_f = file('./model/save_data', 'rb') training_instances = cPickle.load(read_f) anaphorics_result = cPickle.load(read_f) test_instances = cPickle.load(read_f) read_f.close() else: print >> sys.stderr, "Read W2V" w2v = word2vec.Word2Vec(args.embedding) ### Training #### path = args.data training_instances = generate_instance.generate_training_instances( path, w2v) #### Test process #### path = args.test_data test_instances, anaphorics_result = generate_instance.generate_test_instances( path, w2v) w2v = None # 释放空间 print >> sys.stderr, "Save file ./model/save_data"
def main():
    """Train the hierarchical (Manager/Worker) RL coreference model (Python 2).

    Loads embeddings, loads or builds the pickled Manager and Worker
    networks, pretrains the Manager, evaluates on DEV, then returns —
    the Worker pretraining and the RL loop below the `return` are dead
    code. Uses module-level `args`, `network`, `DataGenerate`, `pretrain`,
    `policy_network`, `evaluation` globals.
    """
    embedding_dir = args.embedding + args.language
    print >> sys.stderr, "Read Embedding from %s ..." % embedding_dir
    # Chinese embeddings are 64-d, default 50-d.
    embedding_dimention = 50
    if args.language == "cn": embedding_dimention = 64
    w2v = word2vec.Word2Vec(embedding_dir, embedding_dimention)

    #network_model_manager
    # Resume the Manager from its pretrained pickle if the base model exists,
    # otherwise build and persist a fresh one.
    if os.path.isfile("./model/network_model_manager." + args.language):
        #read_f = file('./model/network_model_manager.'+args.language, 'rb')
        read_f = file('./model/network_model_pretrain_manager.' + args.language, 'rb')
        network_manager = cPickle.load(read_f)
        print >> sys.stderr, "Read model from ./model/network_model_manager." + args.language
    else:
        # Feature dimensions differ per language (en vs cn).
        inpt_dimention = 1738
        single_dimention = 855
        cluster_dimention = 855
        if args.language == "en":
            inpt_dimention = 1374
            single_dimention = 673
            cluster_dimention = 855
        network_manager = network.Manager(inpt_dimention, single_dimention, 1000)
        print >> sys.stderr, "save model network_manager..."
        save_f = file('./model/network_model_manager.' + args.language, 'wb')
        cPickle.dump(network_manager, save_f, protocol=cPickle.HIGHEST_PROTOCOL)
        save_f.close()

    #network_model_worker
    if os.path.isfile("./model/network_model_worker." + args.language):
        read_f = file('./model/network_model_worker.' + args.language, 'rb')
        #read_f = file('./model/network_model_pretrain_worker.'+args.language, 'rb')
        network_worker = cPickle.load(read_f)
        print >> sys.stderr, "Read model from ./model/network_model_worker." + args.language
    else:
        inpt_dimention = 1738
        single_dimention = 855
        cluster_dimention = 855
        if args.language == "en":
            inpt_dimention = 1374
            single_dimention = 673
            cluster_dimention = 855
        network_worker = network.Worker(inpt_dimention, single_dimention, cluster_dimention, 1000)
        print >> sys.stderr, "save model network_worker..."
        save_f = file('./model/network_model_worker.' + args.language, 'wb')
        cPickle.dump(network_worker, save_f, protocol=cPickle.HIGHEST_PROTOCOL)
        save_f.close()

    train_docs = DataGenerate.doc_data_generater("train")
    dev_docs = DataGenerate.doc_data_generater("dev")
    test_docs = DataGenerate.doc_data_generater("test")

    #pretrain_manager
    times = 0
    best_cost = 99999999
    step = 0
    # Learning rate decays 1% every 128 steps.
    lr = 0.00009
    for echo in range(10):
        start_time = timeit.default_timer()
        print "Pretrain ECHO:", echo
        cost_this_turn = 0.0
        #print >> sys.stderr, network_model.get_weight_sum()
        for cases, gold_chain in DataGenerate.case_generater(train_docs, "train", w2v):
            # Skip very large documents (memory guard).
            if len(cases) >= 700:
                continue
            for single_mention_array, train_list, lable_list in pretrain.generate_pretrain_case(cases, gold_chain):
                #cost_this_turn += network_manager.pre_train_step(single_mention_array,train_list,lable_list,0.0001)[0]
                cost_this_turn += network_manager.pre_train_step(single_mention_array, train_list, lable_list, lr)[0]
                step += 1
                if step % 128 == 0:
                    lr = lr * 0.99
        end_time = timeit.default_timer()
        print >> sys.stderr, "PreTrain for Manager", echo, "Total cost:", cost_this_turn
        print >> sys.stderr, "PreTraining for Manager Use %.3f seconds" % (end_time - start_time)
        # Track and checkpoint the lowest-cost epoch, plus a rolling "latest" save.
        if cost_this_turn <= best_cost:
            save_f = file('./model/network_model_pretrain_manager_best.' + args.language, 'wb')
            cPickle.dump(network_manager, save_f, protocol=cPickle.HIGHEST_PROTOCOL)
            save_f.close()
            best_cost = cost_this_turn
        save_f = file('./model/network_model_pretrain_manager.' + args.language, 'wb')
        cPickle.dump(network_manager, save_f, protocol=cPickle.HIGHEST_PROTOCOL)
        save_f.close()

    ## test performance after pretraining
    print >> sys.stderr, "Begin test on DEV after Manager pertraining"
    dev_docs_for_test = []
    num = 0
    for cases, gold_chain in DataGenerate.case_generater(dev_docs, "dev", w2v):
        ev_doc = pretrain.generate_pretrain_test(cases, gold_chain, network_manager)
        dev_docs_for_test.append(ev_doc)
    print "Performance on DEV after Manager PreTRAINING"
    # Standard coreference metrics: MUC, B-cubed, CEAF.
    mp, mr, mf = evaluation.evaluate_documents(dev_docs_for_test, evaluation.muc)
    print "MUC: recall: %f precision: %f f1: %f" % (mr, mp, mf)
    bp, br, bf = evaluation.evaluate_documents(dev_docs_for_test, evaluation.b_cubed)
    print "BCUBED: recall: %f precision: %f f1: %f" % (br, bp, bf)
    cp, cr, cf = evaluation.evaluate_documents(dev_docs_for_test, evaluation.ceafe)
    print "CEAF: recall: %f precision: %f f1: %f" % (cr, cp, cf)
    print "##################################################"
    sys.stdout.flush()
    print >> sys.stderr, "Manager Pre Train done"

    # NOTE(review): everything below this `return` is unreachable. The dead
    # code also references `network_model`, which is never defined in this
    # function (only network_manager / network_worker are) — it would raise
    # NameError if re-enabled. Confirm intended model before resurrecting.
    return

    #pretrain_worker
    times = 0
    best_cost = 99999999
    for echo in range(20):
        start_time = timeit.default_timer()
        print "Pretrain ECHO:", echo
        cost_this_turn = 0.0
        #print >> sys.stderr, network_model.get_weight_sum()
        for cases, gold_chain in DataGenerate.case_generater(train_docs, "train", w2v):
            if len(cases) >= 700:
                continue
            # NOTE(review): unlike the live loop above, this passes a third
            # argument (the undefined `network_model`) to generate_pretrain_case.
            for single_mention_array, train_list, lable_list in pretrain.generate_pretrain_case(cases, gold_chain, network_model):
                cost_this_turn += network_manager.pre_train_step(single_mention_array, train_list, lable_list, 0.0001)[0]
        end_time = timeit.default_timer()
        print >> sys.stderr, "PreTrain4Manager", echo, "Total cost:", cost_this_turn
        print >> sys.stderr, "PreTraining4Manager Use %.3f seconds" % (end_time - start_time)
        if cost_this_turn <= best_cost:
            save_f = file('./model/network_model_pretrain_manager_best.' + args.language, 'wb')
            cPickle.dump(network_manager, save_f, protocol=cPickle.HIGHEST_PROTOCOL)
            save_f.close()
            best_cost = cost_this_turn
        save_f = file('./model/network_model_pretrain_manager.' + args.language, 'wb')
        cPickle.dump(network_manager, save_f, protocol=cPickle.HIGHEST_PROTOCOL)
        save_f.close()

    ## test performance after pretraining
    print >> sys.stderr, "Begin test on DEV after Manager pertraining"
    dev_docs_for_test = []
    num = 0
    for cases, gold_chain in DataGenerate.case_generater(dev_docs, "dev", w2v):
        ev_doc = policy_network.generate_policy_test(cases, gold_chain, network_manager)
        dev_docs_for_test.append(ev_doc)
    print "Performance on DEV after Manager PreTRAINING"
    mp, mr, mf = evaluation.evaluate_documents(dev_docs_for_test, evaluation.muc)
    print "MUC: recall: %f precision: %f f1: %f" % (mr, mp, mf)
    bp, br, bf = evaluation.evaluate_documents(dev_docs_for_test, evaluation.b_cubed)
    print "BCUBED: recall: %f precision: %f f1: %f" % (br, bp, bf)
    cp, cr, cf = evaluation.evaluate_documents(dev_docs_for_test, evaluation.ceafe)
    print "CEAF: recall: %f precision: %f f1: %f" % (cr, cp, cf)
    print "##################################################"
    sys.stdout.flush()
    print >> sys.stderr, "Manager Pre Train done"

    ##train
    train4test = []
    # add 5 items for testing the training performance
    add2train = True
    for echo in range(20):
        start_time = timeit.default_timer()
        reward_baseline = []
        cost_this_turn = 0.0
        #for train_doc_mention_array,train_doc_pair_array,train_doc_gold_chain in DataGenerate.array_generater(train_docs,"train",w2v):
        for cases, gold_chain in DataGenerate.case_generater(train_docs, "train", w2v):
            # Randomly reserve 5 training documents to monitor training fit.
            if add2train:
                if random.randint(1, 200) == 10:
                    #train4test.append((train_doc_mention_array,train_doc_pair_array,train_doc_gold_chain))
                    train4test.append((cases, gold_chain))
                    if len(train4test) == 5:
                        add2train = False
            this_reward = 0.0
            #for single, train, action, reward in policy_network.generate_policy_case(train_doc_mention_array,train_doc_pair_array,train_doc_gold_chain,network_model):
            for single, train, action, reward in policy_network.generate_policy_case(cases, gold_chain, network_model):
                #reward_b = 0 if len(reward_baseline) < 1 else float(sum(reward_baseline))/float(len(reward_baseline))
                #norm_reward = numpy.array(reward_batch) - reward_b
                # Policy-gradient update (lr=0.0001); reward baseline disabled.
                cost_this_turn += network_model.train_step(single, train, action, reward, 0.0001)[0]
        end_time = timeit.default_timer()
        print >> sys.stderr, "Total cost:", cost_this_turn
        print >> sys.stderr, "TRAINING Use %.3f seconds" % (end_time - start_time)
        #reward_baseline.append(this_reward)
        #if len(reward_baseline) >= 32:
        #    reward_baselin = reward_baseline[1:]

        ## test training performance
        train_docs_for_test = []
        start_time = timeit.default_timer()
        for train_cases, train_doc_gold_chain in train4test:
            ev_doc = policy_network.generate_policy_test(train_cases, train_doc_gold_chain, network_model)
            train_docs_for_test.append(ev_doc)
        print "** Echo: %d **" % echo
        print "TRAIN"
        mp, mr, mf = evaluation.evaluate_documents(train_docs_for_test, evaluation.muc)
        print "MUC: recall: %f precision: %f f1: %f" % (mr, mp, mf)
        bp, br, bf = evaluation.evaluate_documents(train_docs_for_test, evaluation.b_cubed)
        print "BCUBED: recall: %f precision: %f f1: %f" % (br, bp, bf)
        cp, cr, cf = evaluation.evaluate_documents(train_docs_for_test, evaluation.ceafe)
        print "CEAF: recall: %f precision: %f f1: %f" % (cr, cp, cf)
        print

        ## dev
        dev_docs_for_test = []
        start_time = timeit.default_timer()
        #for dev_doc_mention_array,dev_doc_pair_array,dev_doc_gold_chain in DataGenerate.array_generater(dev_docs,"dev",w2v):
        #ev_doc = policy_network.generate_policy_test(dev_doc_mention_array,dev_doc_pair_array,dev_doc_gold_chain,network_model)
        for dev_cases, dev_doc_gold_chain in DataGenerate.case_generater(dev_docs, "dev", w2v):
            ev_doc = policy_network.generate_policy_test(dev_cases, dev_doc_gold_chain, network_model)
            dev_docs_for_test.append(ev_doc)
        print "DEV"
        mp, mr, mf = evaluation.evaluate_documents(dev_docs_for_test, evaluation.muc)
        print "MUC: recall: %f precision: %f f1: %f" % (mr, mp, mf)
        bp, br, bf = evaluation.evaluate_documents(dev_docs_for_test, evaluation.b_cubed)
        print "BCUBED: recall: %f precision: %f f1: %f" % (br, bp, bf)
        cp, cr, cf = evaluation.evaluate_documents(dev_docs_for_test, evaluation.ceafe)
        print "CEAF: recall: %f precision: %f f1: %f" % (cr, cp, cf)
        print
        end_time = timeit.default_timer()
        print >> sys.stderr, "DEV Use %.3f seconds" % (end_time - start_time)
        sys.stdout.flush()

        ## test
        test_docs_for_test = []
        start_time = timeit.default_timer()
        #for test_doc_mention_array,test_doc_pair_array,test_doc_gold_chain in DataGenerate.array_generater(test_docs,"test",w2v):
        for test_cases, test_doc_gold_chain in DataGenerate.case_generater(test_docs, "test", w2v):
            ev_doc = policy_network.generate_policy_test(test_cases, test_doc_gold_chain, network_model)
            test_docs_for_test.append(ev_doc)
        print "TEST"
        mp, mr, mf = evaluation.evaluate_documents(test_docs_for_test, evaluation.muc)
        print "MUC: recall: %f precision: %f f1: %f" % (mr, mp, mf)
        bp, br, bf = evaluation.evaluate_documents(test_docs_for_test, evaluation.b_cubed)
        print "BCUBED: recall: %f precision: %f f1: %f" % (br, bp, bf)
        cp, cr, cf = evaluation.evaluate_documents(test_docs_for_test, evaluation.ceafe)
        print "CEAF: recall: %f precision: %f f1: %f" % (cr, cp, cf)
        print
        end_time = timeit.default_timer()
        print >> sys.stderr, "TEST Use %.3f seconds" % (end_time - start_time)
        sys.stdout.flush()

        # Checkpoint the model once per RL epoch.
        save_f = file('./model/nets/network_model.%s.%d' % (args.language, echo), 'wb')
        cPickle.dump(network_model, save_f, protocol=cPickle.HIGHEST_PROTOCOL)
        save_f.close()