Ejemplo n.º 1
0
        num_pass += 1
        #print num_pass
        for token in sentence:
            fp.write(token[0])
            if len(token) == 2:
                fp.write('\t' + token[1])
            else:
                fp.write('\t' + token[-1])
            fp.write('\n')
        fp.write('\n')
fp.close()

# Inspect the sentences where the relations derived from the label sequence
# differ from the gold relations, then report precision / recall / F-score.
from GenarateXY import GetXY
from utils import label2answer, loadtokens, computeFr
# The same test file feeds both the token loader and GetXY; build the path once.
test_path = u'../data/DDI_test_%s_Y.txt' % (label_mode)
testtokens = loadtokens(test_path)
xtest, y_test, xctest = GetXY(test_path, mask=True)
predict_e, predict_r = label2answer(y_test, testtokens)

for i, gold in enumerate(gold_relation):
    # Plain inequality replaces `cmp(...) != 0`; `cmp` was removed in
    # Python 3 and `!=` states the intent directly.
    if gold != predict_r[i]:
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print(senindex2file[i])
        print(' '.join(testtokens[i]))
        print(gold)
        print(predict_r[i])
        print('\n')
pr, rr, fr, frlabel = computeFr(gold_relation, predict_r)
print(u'由goldy得到的关系的PRF为%f %f %f' % (pr, rr, fr))

## Count the number of overlapping relations
Ejemplo n.º 2
0
from keras.layers import TimeDistributed
from keras.layers import LSTM
from keras.utils import np_utils
from keras.optimizers import RMSprop, SGD, Adam, Adadelta, Adagrad
from GenarateXY import GetXY
from utils import save_model, load_model, LoadGoldEntity, loadtokens, SaveGoldEntity, GetModel
from utils import ypre2label, label2answer, computeFe
from constants import num_class, bils, ls, label_mode, len_sentence
import pickle
# Statistics (maxf_e presumably tracks the best entity F-score so far --
# confirm against the training loop).
maxf_e, mask = 0.0, 1
# Load the word vectors, all sentences, the annotated entity positions, and
# the training and test sets.
test_path = u'../data/DDI_test_%s.txt' % (label_mode)
train_path = u'../data/DDI_train_%s.txt' % (label_mode)
wordvecmodel = GetModel(
    u'../../token2vec/medline_chemdner_pubmed_biov5_drug.token4_d50', mask)
testtokens = loadtokens(test_path)
traintokens_all = loadtokens(train_path)
# Annotated entity positions for the test and training sentences.
gold_e = LoadGoldEntity('../data/goldEntityAnswer.txt')
gold_e_train_all = LoadGoldEntity('../data/goldEntityAnswer_train.txt')
xtrain_all, y_train_all = GetXY(train_path, mask)
xtest, y_test = GetXY(test_path, mask)
numtest, numtrain = len(y_test), len(y_train_all)
# Convert the numeric labels to one-hot arrays and reshape them to
# (number of sentences, len_sentence, num_class).
ytest = np.reshape(
    np_utils.to_categorical(y_test, num_class),
    (numtest, len_sentence, num_class))
ytrain_all = np.reshape(
    np_utils.to_categorical(y_train_all, num_class),
    (numtrain, len_sentence, num_class))

xvaild, yvaild = [], []
Ejemplo n.º 3
0
# -*- coding: utf-8 -*-

import codecs as cs
from utils import LoadGoldEntity, LoadGoldRelation, loadtokens

# Write a side-by-side comparison file: for every test sentence, the sentence
# itself followed by the gold entities, predicted entities, gold relations and
# predicted relations.
testtokens = loadtokens(u'../data/CPR_test_Y.txt')
gold_e = LoadGoldEntity('../data/goldEntityAnswer_test.txt')  # annotated entity positions
pre_e = LoadGoldEntity('../data/predictE_test.txt')
gold_r = LoadGoldRelation(
    '../data/goldRelationAnswer_test.txt')  # annotated relations: entity positions and relation type
pre_r = LoadGoldRelation('../data/predictR_test.txt')

fp = cs.open('../data/CompareAns.txt', 'w', 'utf-8')
for i in range(len(testtokens)):
    # First the readable sentence (tokens separated by spaces) ...
    fp.write(' '.join(testtokens[i]) + '\n')
    # ... then the tokens concatenated without separators; the offsets used
    # below index into this string.
    text = ''.join(testtokens[i])

    fp.write(u'【GOLD_E】\n')
    for e in gold_e[i]:
        # e[0]..e[1] is an inclusive span (hence the +1); e[2] is written
        # after a tab (presumably the entity type -- confirm in utils).
        fp.write(text[e[0]:e[1] + 1] + '\t' + e[2] + '\n')

    fp.write(u'【PRE_E】\n')
    for e in pre_e[i]:
        fp.write(text[e[0]:e[1] + 1] + '\t' + e[2] + '\n')

    fp.write(u'【GOLD_R】\n')
    for r in gold_r[i]:
        # r[0]..r[1] and r[2]..r[3] are the inclusive spans of the two
        # entities; r[4] is written last (presumably the relation type).
        fp.write(text[r[0]:r[1] + 1] + '\t' + text[r[2]:r[3] + 1] + '\t' +
                 r[4] + '\n')

    fp.write(u'【PRE_R】\n')
Ejemplo n.º 4
0
from keras.optimizers import RMSprop  #, SGD, Adam, Adadelta, Adagrad
from ChainCRF import ChainCRF
from utils import ypre2label, label2answer, computeFe
from GenerateXY import GetXY
from utils import save_model, load_model, LoadGoldEntity
from utils import loadtokens, SaveGoldRelation, SaveGoldEntity, GetModel
from constants import num_class, len_sentence, len_word
from constants import bils, ls, wv
import pickle
# Choose the word-vector model according to the hyper-parameters.
mask = 0
wordvec = u'../../token2vec/chemdner_pubmed_biov5_drug.token4_d100_CPR'
#wordvec = u'../../token2vec/medline_chemdner_pubmed_biov5_drug.token4_d50'
# Word vectors of all tokens, ordered by index (Python list).
wordvecmodel = GetModel(wordvec, mask=mask)
# Each data file is read twice below: once for its tokens, once for X / Y.
test_path = u'../data/CPR_test_Y.txt'
train_path = u'../data/CPR_train_Y.txt'
vaild_path = u'../data/CPR_vaild_Y.txt'
# Tokens of the test, training and validation sets (2-D Python lists), used
# to turn predicted labels back into entity positions.
testtokens = loadtokens(test_path)
traintokens = loadtokens(train_path)
vaildtokens = loadtokens(vaild_path)
# Annotated entity positions (2-D Python lists).
gold_e = LoadGoldEntity('../data/goldEntityAnswer_test.txt')
gold_e_train = LoadGoldEntity('../data/goldEntityAnswer_train.txt')
gold_e_vaild = LoadGoldEntity('../data/goldEntityAnswer_vaild.txt')
# X and Y used for training; Y is converted to one-hot form below.
xtrain, y_train, xctrain = GetXY(train_path, mask)
xtest, y_test, xctest = GetXY(test_path, mask)
xvaild, y_vaild, xcvaild = GetXY(vaild_path, mask)
num_train, num_test, num_vaild = len(y_train), len(y_test), len(y_vaild)
# Convert the numeric labels to one-hot arrays and reshape them to
# (num_test, len_sentence, num_class).
ytest = np.reshape(
    np_utils.to_categorical(y_test, num_class),
    (num_test, len_sentence, num_class))
Ejemplo n.º 5
0
# -*- coding: utf-8 -*-

import pickle
import codecs as cs
from itertools import combinations

from utils import loadtokens

# For every test sentence, write one line per unordered pair of entities:
#   "<first0> <first-1>|<second0> <second-1>|<tokens joined by spaces>"
# where x0 / x-1 are the first and last items of each entity's entry in
# eindex2tindex (presumably its first and last token index -- confirm).

# NOTE(review): pickle.load can execute arbitrary code; only load prediction
# files produced by this project.
# `with` closes the file even if unpickling fails.
with open('../data/predict/en2t_crf+bils250+ls100.pkl', 'rb') as f:
    eindex2tindex = pickle.load(f)
testtokens = loadtokens(u'../data/DDI_test_BIOES.txt')
with cs.open('../data/testsample.txt', 'w', 'utf-8') as fp:
    for i, tokens in enumerate(testtokens):
        entities = eindex2tindex[i]
        sentence = ' '.join(tokens)
        # combinations() over the indices yields (j, k) with j < k in the
        # same order as the nested index loops, and yields nothing when the
        # sentence has fewer than two entities.
        for j, k in combinations(range(len(entities)), 2):
            first, second = entities[j], entities[k]
            fp.write(
                str(first[0]) + ' ' + str(first[-1]) + '|' +
                str(second[0]) + ' ' + str(second[-1]) + '|' +
                sentence + '\n')
Ejemplo n.º 6
0
# Feature switches, embedding dimensions and data loading for the DDI run.
if_chunk = 0
if_ner = 0
dimdic = {
    'chardim': 15,
    'posdim': 20,
    'chunkdim': 10,
    'nerdim': 10,
    'convdim': 25
}
mask = 0
# Sentences before this index form the training set and the rest the
# validation set. Tokens, gold entities and gold relations must all be cut at
# the same index, so it is defined once.
train_split = 6066

# Word vectors of all tokens, ordered by index (Python list).
wordvecmodel = GetModel(
    u'../../token2vec/chemdner_pubmed_biov5_drug.token4_d100',
    mask=mask)
# Tokens of the training and test sets (2-D Python lists), used to turn
# predicted labels back into entity positions.
testtokens = loadtokens(u'../data/DDI_test_BIOES.genia_fea')
traintokens_all = loadtokens(u'../data/DDI_train_BIOES.genia_fea')
traintokens = traintokens_all[:train_split]
vaildtokens = traintokens_all[train_split:]
# Annotated entities and relations (2-D Python lists).
gold_e = LoadGoldEntity('../data/goldEntityAnswer.txt')  # annotated entity positions
gold_r = LoadGoldRelation('../data/goldRelationAnswer.txt')  # annotated relations: entity positions and type
gold_e_train_all = LoadGoldEntity('../data/goldEntityAnswer_train.txt')
gold_r_train_all = LoadGoldRelation('../data/goldRelationAnswer_train.txt')
gold_e_train = gold_e_train_all[:train_split]
gold_e_vaild = gold_e_train_all[train_split:]
gold_r_train = gold_r_train_all[:train_split]
gold_r_vaild = gold_r_train_all[train_split:]
# X and Y used for training; Y is converted to one-hot form afterwards.
Data_train_all = GetXY(u'../data/DDI_train_BIOES.genia_fea', mask, if_lemma,
                       if_pos, if_chunk, if_ner)