コード例 #1
0
ファイル: process_test.py プロジェクト: ngreenberg/em-crf
# Training hyperparameters. Their consumers are outside this excerpt;
# presumably a gradient-clipping norm and a minibatch size -- TODO confirm.
clip_norm = 5
batch_size = 32

# --------------------------------------------------------------------------
# Process data
# --------------------------------------------------------------------------

# Path handed to DataProcessor as its vocabulary source (a GloVe text file,
# judging by the file name).
embeddings_file = ('/home/nathan/Programming/research/data/embeddings/'
                   'glove.6B/glove.6B.100d.txt')

dp = DataProcessor(vocab=embeddings_file)

# CDR training file. The second argument is a ".proto" path and the third is
# the string 'cdr'.
# NOTE(review): the meaning of each positional argument and of update=True is
# defined by DataProcessor.read_file, which is not visible here -- confirm.
dp.read_file('/home/nathan/Programming/research/data/cdr/ner_CDR_train.txt',
             '/home/nathan/Programming/research/sandbox/protos/'
             'cdr_train.proto',
             'cdr',
             update=True)

# The remaining loads are currently disabled and kept for reference only.
# Note that only the BC_VI_Task5 call passes update=True.
#
# dp.read_file('/home/nathan/Programming/research/data/cdr/ner_CDR_test.txt',
#              '/home/nathan/Programming/research/sandbox/protos/cdr_test.proto',
#              'cdr')
#
# dp.read_file('/home/nathan/Programming/research/data/cdr/ner_CDR_dev.txt',
#              '/home/nathan/Programming/research/sandbox/protos/cdr_dev.proto',
#              'cdr')
#
# dp.read_file('/home/nathan/Programming/research/data/BC_VI_Task5/ner_CDR_BC_VI_train.txt',
#              '/home/nathan/Programming/research/sandbox/protos/bc_train.proto',
#              'BC_VI_Task5', update=True)
コード例 #2
0
# Label names for each label set. The keys ('cdr', 'bc', 'weak') are the same
# strings passed as the third argument of dp.read_file() below.
set_labels = {
    'cdr': ['Chemical', 'Disease'],
    'bc': ['Chemical', 'Gene'],
    'weak': ['Disease', 'Chemical', 'Species', 'Gene',
             'ProteinMutation', 'DNAMutation', 'SNP'],
}

dp = DataProcessor(
    set_labels=set_labels,
    vocab=embeddings_file,
    window_size=window_size,
)

# One row per input file, in load order:
#   (base directory, file name, dataset name, label-set key, pass update=True?)
# NOTE(review): the effect of update=True is defined by
# DataProcessor.read_file, which is not visible here. Rows flagged False make
# the call WITHOUT the keyword so the method's own default still applies.
_corpora = (
    (cdr_path, 'ner_CID_Training_mine_PubTator.txt',
     'cdr_train_weak', 'weak', True),
    (cdr_path, 'ner_CDR_TrainingSet.PubTator.txt', 'cdr_train', 'cdr', True),
    (cdr_path, 'ner_CDR_DevelopmentSet.PubTator.txt',
     'cdr_dev', 'cdr', False),
    (cdr_path, 'ner_CDR_TestSet.PubTator.txt', 'cdr_test', 'cdr', False),
    (bc_path, 'ner_CDR_train.txt', 'bc_train', 'bc', True),
    (bc_path, 'ner_CDR_dev.txt', 'bc_dev', 'bc', False),
    (bc_path, 'ner_CDR_test.txt', 'bc_test', 'bc', False),
)

for _base, _filename, _dataset, _label_key, _with_update in _corpora:
    if _with_update:
        dp.read_file(_base + _filename, _dataset, _label_key, update=True)
    else:
        dp.read_file(_base + _filename, _dataset, _label_key)

###############
コード例 #3
0
ファイル: exp_split.py プロジェクト: ngreenberg/em-crf
        'T170'
    ],
    'B':
    ['T017', 'T031', 'T062', 'T082', 'T091', 'T097', 'T103', 'T201', 'T204'],
    'full': [
        'T005', 'T007', 'T037', 'T038', 'T058', 'T074', 'T092', 'T098', 'T168',
        'T170', 'T017', 'T031', 'T062', 'T082', 'T091', 'T097', 'T103', 'T201',
        'T204'
    ]
}

dp = DataProcessor(
    set_labels=set_labels,
    vocab=embeddings_file,
    window_size=window_size,
)

# One row per input file, in load order:
#   (file name under `path`, dataset name, label-set key, pass update=True?)
# The two split-specific training files pass update=True; dev and test are
# read against the 'full' label set without the keyword, so read_file's own
# default applies (its definition is not visible here).
_splits = (
    ('train_split_A_modified', 'A_train', 'A', True),
    ('train_split_B_modified', 'B_train', 'B', True),
    ('ner_dev', 'dev', 'full', False),
    ('ner_test', 'test', 'full', False),
)

for _filename, _dataset, _label_key, _with_update in _splits:
    if _with_update:
        dp.read_file(path + _filename, _dataset, _label_key, update=True)
    else:
        dp.read_file(path + _filename, _dataset, _label_key)

# --------------------------------------------------------------------------
# Build model
# --------------------------------------------------------------------------

# Number of entries in each of the processor's maps after all files are read.
vocab_size, labels_size = len(dp.token_map), len(dp.label_map)
shape_domain_size, char_domain_size = len(dp.shape_map), len(dp.char_map)

print('Loading embeddings from ' + embeddings_file)