# NOTE(review): the statements down to fp.close() are the tail of a loop whose
# header lies above this excerpt. Line breaks and nesting were reconstructed
# from a whitespace-collapsed source -- confirm against the original layout.
num_pass += 1
#print num_pass
for token in sentence:
    fp.write(token[0])
    # NOTE(review): for a 2-element token, token[1] and token[-1] are the same
    # element, so both branches write the same value.
    if len(token) == 2:
        fp.write('\t' + token[1])
    else:
        fp.write('\t' + token[-1])
    fp.write('\n')
fp.write('\n')
fp.close()

# Inspect where the relation errors caused by the labels occur.
from GenarateXY import GetXY
from utils import label2answer, loadtokens, computeFr

# The same labelled test file feeds both the token loader and GetXY, so the
# path is formatted once.
_test_label_path = u'../data/DDI_test_%s_Y.txt' % (label_mode)
testtokens = loadtokens(_test_label_path)
xtest, y_test, xctest = GetXY(_test_label_path, mask=True)
predict_e, predict_r = label2answer(y_test, testtokens)

# Print every sentence whose relations derived from the labels differ from the
# gold relations. `!=` replaces the Python 2-only `cmp(a, b) != 0`, and the
# single-argument print(...) calls behave identically on Python 2 while also
# being valid Python 3.
for i in range(len(gold_relation)):
    if gold_relation[i] != predict_r[i]:
        print(senindex2file[i])
        print(' '.join(testtokens[i]))
        print(gold_relation[i])
        print(predict_r[i])
        print('\n')

# computeFr returns four values; the first three are reported in the message
# below as P, R and F.
pr, rr, fr, frlabel = computeFr(gold_relation, predict_r)
print(u'由goldy得到的关系的PRF为%f %f %f' % (pr, rr, fr))
## Count the number of overlapping relations.
from keras.layers import TimeDistributed
from keras.layers import LSTM
from keras.utils import np_utils
from keras.optimizers import RMSprop, SGD, Adam, Adadelta, Adagrad
from GenarateXY import GetXY
from utils import save_model, load_model, LoadGoldEntity, loadtokens, SaveGoldEntity, GetModel
from utils import ypre2label, label2answer, computeFe
from constants import num_class, bils, ls, label_mode, len_sentence
import pickle


def _one_hot_sentences(labels, count):
    """One-hot encode *labels* and reshape to (count, len_sentence, num_class)."""
    encoded = np_utils.to_categorical(labels, num_class)
    return np.reshape(encoded, (count, len_sentence, num_class))


# Statistics.
maxf_e = 0.0
mask = 1

# Load the word-vector model, the sentences, the annotated entity positions
# and the training / test sets.
wordvecmodel = GetModel(
    u'../../token2vec/medline_chemdner_pubmed_biov5_drug.token4_d50', mask)

_test_path = u'../data/DDI_test_%s.txt' % (label_mode)
_train_path = u'../data/DDI_train_%s.txt' % (label_mode)

testtokens = loadtokens(_test_path)
traintokens_all = loadtokens(_train_path)

# Annotated entity positions for the test and training sets.
gold_e = LoadGoldEntity('../data/goldEntityAnswer.txt')
gold_e_train_all = LoadGoldEntity('../data/goldEntityAnswer_train.txt')

xtrain_all, y_train_all = GetXY(_train_path, mask)
xtest, y_test = GetXY(_test_path, mask)

numtest = len(y_test)
numtrain = len(y_train_all)

# Turn the numeric labels into one-hot arrays, one row of class indicators per
# sentence position.
ytest = _one_hot_sentences(y_test, numtest)
ytrain_all = _one_hot_sentences(y_train_all, numtrain)

xvaild, yvaild = [], []
# -*- coding: utf-8 -*-
# Dump gold and predicted entities / relations side by side, sentence by
# sentence, into ../data/CompareAns.txt.
# NOTE(review): this excerpt stops right after the PRE_R header is written;
# layout was reconstructed from a whitespace-collapsed source.
import codecs as cs
from utils import LoadGoldEntity, LoadGoldRelation, loadtokens


def _span(text, first, last):
    """Return the characters of *text* from index *first* through *last*, inclusive."""
    return text[first:last + 1]


testtokens = loadtokens(u'../data/CPR_test_Y.txt')
# Annotated entity positions.
gold_e = LoadGoldEntity('../data/goldEntityAnswer_test.txt')
pre_e = LoadGoldEntity('../data/predictE_test.txt')
# Entity positions and class of the annotated relations.
gold_r = LoadGoldRelation(
    '../data/goldRelationAnswer_test.txt')
pre_r = LoadGoldRelation('../data/predictR_test.txt')

fp = cs.open('../data/CompareAns.txt', 'w', 'utf-8')
for i, tokens in enumerate(testtokens):
    fp.write(' '.join(tokens) + '\n')
    # Spans index into the sentence with the tokens joined WITHOUT separators.
    text = ''.join(tokens)

    # Entity sections: gold first, then predicted. Each table is indexed only
    # after its header has been written, as in the original statement order.
    for header, table in ((u'【GOLD_E】\n', gold_e), (u'【PRE_E】\n', pre_e)):
        fp.write(header)
        for e in table[i]:
            fp.write(_span(text, e[0], e[1]) + '\t' + e[2] + '\n')

    # Gold relations: both argument spans followed by element 4 of the record.
    fp.write(u'【GOLD_R】\n')
    for r in gold_r[i]:
        head = _span(text, r[0], r[1])
        tail = _span(text, r[2], r[3])
        fp.write(head + '\t' + tail + '\t' + r[4] + '\n')

    fp.write(u'【PRE_R】\n')
from keras.optimizers import RMSprop  #, SGD, Adam, Adadelta, Adagrad
from ChainCRF import ChainCRF
from utils import ypre2label, label2answer, computeFe
from GenerateXY import GetXY
from utils import save_model, load_model, LoadGoldEntity
from utils import loadtokens, SaveGoldRelation, SaveGoldEntity, GetModel
from constants import num_class, len_sentence, len_word
from constants import bils, ls, wv
import pickle

# Choose the word-vector model according to the hyper-parameters.
mask = 0
wordvec = u'../../token2vec/chemdner_pubmed_biov5_drug.token4_d100_CPR'
#wordvec = u'../../token2vec/medline_chemdner_pubmed_biov5_drug.token4_d50'
# Word vectors of all tokens, ordered by index, as a Python list (per the
# original note).
wordvecmodel = GetModel(wordvec, mask=mask)

_test_path = u'../data/CPR_test_Y.txt'
_train_path = u'../data/CPR_train_Y.txt'
_vaild_path = u'../data/CPR_vaild_Y.txt'

# Tokens of each split (2-D Python lists), used to recover entity positions
# from the predicted labels.
testtokens = loadtokens(_test_path)
traintokens = loadtokens(_train_path)
vaildtokens = loadtokens(_vaild_path)

# Annotated entity positions (2-D Python lists) for each split.
gold_e = LoadGoldEntity('../data/goldEntityAnswer_test.txt')
gold_e_train = LoadGoldEntity('../data/goldEntityAnswer_train.txt')
gold_e_vaild = LoadGoldEntity('../data/goldEntityAnswer_vaild.txt')

# Load the X / Y arrays for each split, then convert Y to one-hot form.
xtrain, y_train, xctrain = GetXY(_train_path, mask)
xtest, y_test, xctest = GetXY(_test_path, mask)
xvaild, y_vaild, xcvaild = GetXY(_vaild_path, mask)

num_train, num_test, num_vaild = len(y_train), len(y_test), len(y_vaild)

# Numeric labels -> one-hot array, reshaped to
# (num_test, len_sentence, num_class).
ytest = np.reshape(
    np_utils.to_categorical(y_test, num_class),
    (num_test, len_sentence, num_class))
# -*- coding: utf-8 -*-
# For every test sentence with at least two entries in eindex2tindex, write
# one line per unordered entry pair to ../data/testsample.txt in the form
#   "<a[0]> <a[-1]>|<b[0]> <b[-1]>|<space-joined sentence tokens>"
import pickle
import codecs as cs
from itertools import combinations

# `with` closes the pickle file even when loading raises.
# NOTE(review): pickle.load can execute code embedded in the file; only load
# prediction dumps produced by this project.
with open('../data/predict/en2t_crf+bils250+ls100.pkl', 'rb') as f:
    eindex2tindex = pickle.load(f)

from utils import loadtokens
testtokens = loadtokens(u'../data/DDI_test_BIOES.txt')

# `with` flushes and closes the output even if a write fails midway.
with cs.open('../data/testsample.txt', 'w', 'utf-8') as fp:
    for i, tokens in enumerate(testtokens):
        entities = eindex2tindex[i]
        enum = len(entities)
        if enum < 2:
            # No pair can be formed from fewer than two entries.
            continue
        # The sentence text is identical for every pair, so build it once.
        sentence = ' '.join(tokens)
        # Pairs are enumerated over indices (j < k) in the same order as two
        # nested range loops; indexing keeps this working whether the
        # per-sentence container is a list or an int-keyed mapping.
        for j, k in combinations(range(enum), 2):
            first, second = entities[j], entities[k]
            fp.write('%s %s|%s %s|%s\n' % (
                first[0], first[-1], second[0], second[-1], sentence))
# NOTE(review): layout reconstructed from a whitespace-collapsed source; names
# such as if_lemma / if_pos are defined above this excerpt.
if_chunk = 0
if_ner = 0
dimdic = dict(chardim=15, posdim=20, chunkdim=10, nerdim=10, convdim=25)

mask = 0
# Word vectors of all tokens, ordered by index, as a Python list (per the
# original note).
wordvecmodel = GetModel(
    u'../../token2vec/chemdner_pubmed_biov5_drug.token4_d100', mask=mask)

# Index at which the full training data is cut: items before it are used for
# training, the rest for validation.
_N_TRAIN = 6066

# Tokens of the train and test sets (2-D Python lists), used to recover entity
# positions from the predicted labels.
testtokens = loadtokens(u'../data/DDI_test_BIOES.genia_fea')
traintokens_all = loadtokens(u'../data/DDI_train_BIOES.genia_fea')
traintokens, vaildtokens = traintokens_all[:_N_TRAIN], traintokens_all[_N_TRAIN:]

# Annotated entities and relations (2-D Python lists).
gold_e = LoadGoldEntity('../data/goldEntityAnswer.txt')  # annotated entity positions
gold_r = LoadGoldRelation('../data/goldRelationAnswer.txt')  # entity positions and class of annotated relations
gold_e_train_all = LoadGoldEntity('../data/goldEntityAnswer_train.txt')
gold_r_train_all = LoadGoldRelation('../data/goldRelationAnswer_train.txt')
gold_e_train, gold_e_vaild = gold_e_train_all[:_N_TRAIN], gold_e_train_all[_N_TRAIN:]
gold_r_train, gold_r_vaild = gold_r_train_all[:_N_TRAIN], gold_r_train_all[_N_TRAIN:]

# Load the X / Y data used for training. The original note also mentions
# converting Y to one-hot form; that step is not visible in this excerpt.
Data_train_all = GetXY(
    u'../data/DDI_train_BIOES.genia_fea', mask, if_lemma, if_pos, if_chunk,
    if_ner)