Exemple #1
0
#############
# load data #
#############

# Every location below is derived from this root directory.
data_path = 'datapath/'

cdr_path = data_path + 'cdr/'
bc_path = data_path + 'bc/'

embeddings_file = data_path + 'embeddings/glove.6B/glove.6B.100d.txt'

# The embeddings file is handed to the processor as its vocabulary source.
dp = DataProcessor(vocab=embeddings_file, window_size=window_size)

# One row per dp.read_file call, in the order the calls must happen:
#   (file path, 2nd positional argument, 3rd positional argument, kwargs)
# Only the training files carry update=True; an empty dict means the call
# is made without `update`, so read_file's own default applies.
# NOTE(review): the 2nd/3rd arguments look like a dataset name and a
# label-set name -- confirm against DataProcessor.read_file.
for _path, _dataset, _label_set, _extra in (
    (cdr_path + 'ner_CID_Training_mine_PubTator.txt',
     'cdr_train_weak', 'weak', {'update': True}),
    (cdr_path + 'ner_CDR_TrainingSet.PubTator.txt',
     'cdr_train_gold', 'cdr', {'update': True}),
    (cdr_path + 'ner_CDR_DevelopmentSet.PubTator.txt',
     'cdr_dev', 'cdr', {}),
    (cdr_path + 'ner_CDR_TestSet.PubTator.txt',
     'cdr_test', 'cdr', {}),
    (bc_path + 'ner_CDR_train.txt', 'bc_train', 'bc', {'update': True}),
    (bc_path + 'ner_CDR_dev.txt', 'bc_dev', 'bc', {}),
    (bc_path + 'ner_CDR_test.txt', 'bc_test', 'bc', {}),
):
    dp.read_file(_path, _dataset, _label_set, **_extra)

###############
#############
# load data #
#############

data_path = 'datapath/'

# The CDR files live at an absolute location; the BC files share a common
# file-name prefix under data_path, so bc_path is a prefix, not a directory.
cdr_path = '/iesl/data/meta/pubtator/ner_paper/processed/train_peng_10000/'
bc_path = data_path + 'BC_VI_Task5/ner_CDR_BC_VI_'

embeddings_file = data_path + 'embeddings/glove.6B/glove.6B.100d.txt'

# The embeddings file is handed to the processor as its vocabulary source.
dp = DataProcessor(vocab=embeddings_file, window_size=window_size)

# One row per dp.read_file call, in the order the calls must happen:
#   (file path, 2nd positional argument, 3rd positional argument, kwargs)
# Only the training files carry update=True; an empty dict means the call
# is made without `update`, so read_file's own default applies.
for _path, _dataset, _label_set, _extra in (
    (cdr_path + 'ner_CDR_TrainingSet.PubTator.txt',
     'cdr_train', 'cdr', {'update': True}),
    (cdr_path + 'ner_CDR_DevelopmentSet.PubTator.txt',
     'cdr_dev', 'cdr', {}),
    (cdr_path + 'ner_CDR_TestSet.PubTator.txt',
     'cdr_test', 'cdr', {}),
    (bc_path + 'train.txt', 'bc_train', 'bc', {'update': True}),
    (bc_path + 'dev.txt', 'bc_dev', 'bc', {}),
    (bc_path + 'test.txt', 'bc_test', 'bc', {}),
):
    dp.read_file(_path, _dataset, _label_set, **_extra)

####################
# model dimensions #
####################

# Sizes are taken from the maps the processor holds after all reads.
vocab_size = len(dp.token_map)
labels_cdr_size = len(dp.label_maps['cdr'])
Exemple #3
0
#############
# load data #
#############

data_path = 'datapath/'

# Both values are file-name prefixes (not directories): the split name
# ('dev.txt', 'train.txt', ...) is appended directly to them.
cdr_path = data_path + 'cdr/ner_CDR_'
bc_path = data_path + 'BC_VI_Task5/ner_CDR_BC_VI_'

embeddings_file = data_path + 'embeddings/glove.6B/glove.6B.100d.txt'

# The embeddings file is handed to the processor as its vocabulary source.
dp = DataProcessor(vocab=embeddings_file, window_size=window_size)

# The CDR training split is read from this explicit file instead of
# cdr_path + 'train.txt'.
cdr_train_path = '/iesl/data/meta/pubtator/ner_paper/processed/train_peng_10000/ner_CID_Training_mine_PubTator.txt'

# One row per dp.read_file call, in the order the calls must happen:
#   (file path, 2nd positional argument, 3rd positional argument, kwargs)
# Only the training files carry update=True; an empty dict means the call
# is made without `update`, so read_file's own default applies.
for _path, _dataset, _label_set, _extra in (
    (cdr_train_path, 'cdr_train', 'cdr', {'update': True}),
    (cdr_path + 'dev.txt', 'cdr_dev', 'cdr', {}),
    (cdr_path + 'test.txt', 'cdr_test', 'cdr', {}),
    (bc_path + 'train.txt', 'bc_train', 'bc', {'update': True}),
    (bc_path + 'dev.txt', 'bc_dev', 'bc', {}),
    (bc_path + 'test.txt', 'bc_test', 'bc', {}),
):
    dp.read_file(_path, _dataset, _label_set, **_extra)

####################
# model dimensions #
####################

# Sizes are taken from the maps the processor holds after all reads.
vocab_size = len(dp.token_map)
labels_cdr_size = len(dp.label_maps['cdr'])
labels_bc_size = len(dp.label_maps['bc'])
shape_domain_size = len(dp.shape_map)
Exemple #4
0
batch_size = 32

#############
# load data #
#############

data_path = 'datapath/'

path = data_path + 'pubmed/'

embeddings_file = data_path + 'embeddings/glove.6B/glove.6B.100d.txt'

# The embeddings file is handed to the processor as its vocabulary source.
dp = DataProcessor(vocab=embeddings_file, window_size=window_size)

# One row per dp.read_file call, in the order the calls must happen:
#   (file path, 2nd positional argument, 3rd positional argument, kwargs)
# The two training splits use their own label sets ('A' and 'B') and carry
# update=True; dev and test use the 'full' label set and are read without
# `update`, so read_file's own default applies.
for _path, _dataset, _label_set, _extra in (
    (path + 'train_split_A_modified', 'A_train', 'A', {'update': True}),
    (path + 'train_split_B_modified', 'B_train', 'B', {'update': True}),
    (path + 'ner_dev', 'dev', 'full', {}),
    (path + 'ner_test', 'test', 'full', {}),
):
    dp.read_file(_path, _dataset, _label_set, **_extra)

####################
# model dimensions #
####################

# Sizes are taken from the maps the processor holds after all reads.
vocab_size = len(dp.token_map)
labels_A_size = len(dp.label_maps['A'])
labels_B_size = len(dp.label_maps['B'])
labels_full_size = len(dp.label_maps['full'])
shape_domain_size = len(dp.shape_map)
char_domain_size = len(dp.char_map)