################
# process data #
################

data_path = 'datapath/'
cdr_path = data_path + 'cdr/'
bc_path = data_path + 'bc/'
embeddings_file = data_path + 'embeddings/glove.6B/glove.6B.100d.txt'

# NOTE(review): window_size is assumed to be defined earlier in the file —
# it is not visible in this chunk.
dp = DataProcessor(vocab=embeddings_file, window_size=window_size)

# Corpora to load, in the original order: (path, dataset key, label-map key,
# pass update=True?). The training splits are read with update=True, the
# dev/test splits without — presumably so maps are frozen after training
# data is seen; confirm against DataProcessor.read_file.
_corpora = (
    (cdr_path + 'ner_CID_Training_mine_PubTator.txt', 'cdr_train_weak', 'weak', True),
    (cdr_path + 'ner_CDR_TrainingSet.PubTator.txt', 'cdr_train_gold', 'cdr', True),
    (cdr_path + 'ner_CDR_DevelopmentSet.PubTator.txt', 'cdr_dev', 'cdr', False),
    (cdr_path + 'ner_CDR_TestSet.PubTator.txt', 'cdr_test', 'cdr', False),
    (bc_path + 'ner_CDR_train.txt', 'bc_train', 'bc', True),
    (bc_path + 'ner_CDR_dev.txt', 'bc_dev', 'bc', False),
    (bc_path + 'ner_CDR_test.txt', 'bc_test', 'bc', False),
)
for _fname, _key, _labels, _update in _corpora:
    if _update:
        dp.read_file(_fname, _key, _labels, update=True)
    else:
        # update is omitted so read_file's own default applies, as before.
        dp.read_file(_fname, _key, _labels)
################
# process data #
################

data_path = 'datapath/'
# NOTE(review): cdr_path is an absolute cluster path here, unlike bc_path
# which hangs off data_path — left exactly as the original had it.
cdr_path = '/iesl/data/meta/pubtator/ner_paper/processed/train_peng_10000/'
bc_path = data_path + 'BC_VI_Task5/ner_CDR_BC_VI_'
embeddings_file = data_path + 'embeddings/glove.6B/glove.6B.100d.txt'

# window_size is assumed to be defined earlier in the file (not visible here).
dp = DataProcessor(vocab=embeddings_file, window_size=window_size)

# (path, dataset key, label-map key, pass update=True?) — original call order.
_corpora = (
    (cdr_path + 'ner_CDR_TrainingSet.PubTator.txt', 'cdr_train', 'cdr', True),
    (cdr_path + 'ner_CDR_DevelopmentSet.PubTator.txt', 'cdr_dev', 'cdr', False),
    (cdr_path + 'ner_CDR_TestSet.PubTator.txt', 'cdr_test', 'cdr', False),
    (bc_path + 'train.txt', 'bc_train', 'bc', True),
    (bc_path + 'dev.txt', 'bc_dev', 'bc', False),
    (bc_path + 'test.txt', 'bc_test', 'bc', False),
)
for _fname, _key, _labels, _update in _corpora:
    if _update:
        dp.read_file(_fname, _key, _labels, update=True)
    else:
        # update is omitted so read_file's own default applies, as before.
        dp.read_file(_fname, _key, _labels)

###############
# build model #
###############

# Sizes derived from the maps the DataProcessor built while reading.
vocab_size = len(dp.token_map)
labels_cdr_size = len(dp.label_maps['cdr'])
################
# process data #
################

data_path = 'datapath/'
cdr_path = data_path + 'cdr/ner_CDR_'
bc_path = data_path + 'BC_VI_Task5/ner_CDR_BC_VI_'
embeddings_file = data_path + 'embeddings/glove.6B/glove.6B.100d.txt'

# window_size is assumed to be defined earlier in the file (not visible here).
dp = DataProcessor(vocab=embeddings_file, window_size=window_size)

# The CDR training split is overridden with a mined weak-labelled corpus on
# an absolute cluster path instead of cdr_path + 'train.txt'.
cdr_train_path = '/iesl/data/meta/pubtator/ner_paper/processed/train_peng_10000/ner_CID_Training_mine_PubTator.txt'
dp.read_file(cdr_train_path, 'cdr_train', 'cdr', update=True)

# Remaining corpora in the original order: (path, dataset key, label-map key,
# pass update=True?).
_corpora = (
    (cdr_path + 'dev.txt', 'cdr_dev', 'cdr', False),
    (cdr_path + 'test.txt', 'cdr_test', 'cdr', False),
    (bc_path + 'train.txt', 'bc_train', 'bc', True),
    (bc_path + 'dev.txt', 'bc_dev', 'bc', False),
    (bc_path + 'test.txt', 'bc_test', 'bc', False),
)
for _fname, _key, _labels, _update in _corpora:
    if _update:
        dp.read_file(_fname, _key, _labels, update=True)
    else:
        # update is omitted so read_file's own default applies, as before.
        dp.read_file(_fname, _key, _labels)

###############
# build model #
###############

# Sizes derived from the maps the DataProcessor built while reading.
vocab_size = len(dp.token_map)
labels_cdr_size = len(dp.label_maps['cdr'])
labels_bc_size = len(dp.label_maps['bc'])
shape_domain_size = len(dp.shape_map)
batch_size = 32

################
# process data #
################

data_path = 'datapath/'
path = data_path + 'pubmed/'
embeddings_file = data_path + 'embeddings/glove.6B/glove.6B.100d.txt'

# window_size is assumed to be defined earlier in the file (not visible here).
dp = DataProcessor(vocab=embeddings_file, window_size=window_size)

# Two disjoint training splits (A/B) with their own label maps, plus shared
# dev/test splits read against the 'full' label map. Original call order:
# (path, dataset key, label-map key, pass update=True?).
_corpora = (
    (path + 'train_split_A_modified', 'A_train', 'A', True),
    (path + 'train_split_B_modified', 'B_train', 'B', True),
    (path + 'ner_dev', 'dev', 'full', False),
    (path + 'ner_test', 'test', 'full', False),
)
for _fname, _key, _labels, _update in _corpora:
    if _update:
        dp.read_file(_fname, _key, _labels, update=True)
    else:
        # update is omitted so read_file's own default applies, as before.
        dp.read_file(_fname, _key, _labels)

###############
# build model #
###############

# Sizes derived from the maps the DataProcessor built while reading.
vocab_size = len(dp.token_map)
labels_A_size = len(dp.label_maps['A'])
labels_B_size = len(dp.label_maps['B'])
labels_full_size = len(dp.label_maps['full'])
shape_domain_size = len(dp.shape_map)
char_domain_size = len(dp.char_map)