# Training hyperparameters.
clip_norm = 5   # gradient clipping threshold
batch_size = 32

################
# process data #
################

# Pre-trained GloVe vectors (100d) used to seed the token vocabulary.
embeddings_file = '/home/nathan/Programming/research/data/embeddings/glove.6B/glove.6B.100d.txt'

# NOTE(review): DataProcessor is defined elsewhere in the project; presumably
# vocab= initializes the token map from the embeddings file — confirm against
# its definition.
dp = DataProcessor(vocab=embeddings_file)

# Serialize the CDR training annotations to a proto file. update=True lets
# this corpus extend the vocabulary/label maps while it is read.
# (Removed: commented-out read_file calls for cdr_test/cdr_dev/bc_train —
# dead code; re-add deliberately if those corpora are needed.)
dp.read_file(
    '/home/nathan/Programming/research/data/cdr/ner_CDR_train.txt',
    '/home/nathan/Programming/research/sandbox/protos/cdr_train.proto',
    'cdr',
    update=True)
# Entity label inventories per corpus family: CDR (chemical/disease),
# BioCreative (chemical/gene), and the weakly-labeled PubTator set.
set_labels = {
    'cdr': ['Chemical', 'Disease'],
    'bc': ['Chemical', 'Gene'],
    'weak': [
        'Disease', 'Chemical', 'Species', 'Gene', 'ProteinMutation',
        'DNAMutation', 'SNP'
    ]
}

dp = DataProcessor(set_labels=set_labels, vocab=embeddings_file,
                   window_size=window_size)

# (source file, output name, label set, extra kwargs). Only training corpora
# pass update=True so they alone may grow the vocab/label maps; dev/test
# corpora deliberately omit the kwarg to keep read_file's default.
corpora = [
    (cdr_path + 'ner_CID_Training_mine_PubTator.txt', 'cdr_train_weak', 'weak', {'update': True}),
    (cdr_path + 'ner_CDR_TrainingSet.PubTator.txt', 'cdr_train', 'cdr', {'update': True}),
    (cdr_path + 'ner_CDR_DevelopmentSet.PubTator.txt', 'cdr_dev', 'cdr', {}),
    (cdr_path + 'ner_CDR_TestSet.PubTator.txt', 'cdr_test', 'cdr', {}),
    (bc_path + 'ner_CDR_train.txt', 'bc_train', 'bc', {'update': True}),
    (bc_path + 'ner_CDR_dev.txt', 'bc_dev', 'bc', {}),
    (bc_path + 'ner_CDR_test.txt', 'bc_test', 'bc', {}),
]
for src_file, out_name, label_set, extra in corpora:
    dp.read_file(src_file, out_name, label_set, **extra)

###############
# NOTE(review): this chunk opens mid-dict — the enclosing set_labels literal
# (and its earlier key, presumably 'A') begins above the visible region;
# 'T170' here closes that earlier list.
        'T170'
    ],
    # UMLS semantic-type IDs assigned to the 'B' training split.
    'B': ['T017', 'T031', 'T062', 'T082', 'T091', 'T097', 'T103', 'T201',
          'T204'],
    # 'full' lists every type ID from both splits — dev/test use all types.
    'full': [
        'T005', 'T007', 'T037', 'T038', 'T058', 'T074', 'T092', 'T098',
        'T168', 'T170', 'T017', 'T031', 'T062', 'T082', 'T091', 'T097',
        'T103', 'T201', 'T204'
    ]
}

# NOTE(review): DataProcessor, path, embeddings_file, window_size are defined
# elsewhere in the project/file — confirm before running this section alone.
dp = DataProcessor(set_labels=set_labels, vocab=embeddings_file,
                   window_size=window_size)

# update=True: only the two training splits may extend the vocab/label maps;
# dev/test are read with read_file's default update behavior.
dp.read_file(path + 'train_split_A_modified', 'A_train', 'A', update=True)
dp.read_file(path + 'train_split_B_modified', 'B_train', 'B', update=True)
dp.read_file(path + 'ner_dev', 'dev', 'full')
dp.read_file(path + 'ner_test', 'test', 'full')

###############
# build model #
###############

# Domain sizes derived from the maps DataProcessor populated while reading.
vocab_size = len(dp.token_map)
labels_size = len(dp.label_map)
shape_domain_size = len(dp.shape_map)
char_domain_size = len(dp.char_map)

print('Loading embeddings from ' + embeddings_file)