Example #1
import os
import sys

import numpy as np

sys.path.append('../')
sys.path.append('../tools')

from tools import conf
from tools import load_data

# fix the random seed for reproducibility
np.random.seed(0)

# train hyperparameters
feature_length = conf.ner_feature_length_2

batch_size = conf.word_batch_size
nb_epoch = conf.word_nb_epoch

# model name = script file name without the '.py' suffix
model_name = os.path.basename(__file__)[:-3]

folder_path = 'model/%s' % model_name
if not os.path.isdir(folder_path):
    os.makedirs(folder_path)

# the data, shuffled and split between train and dev sets
train_data = load_data.load_ner(dataset='eng.train')
dev_data = load_data.load_ner(dataset='eng.testa')

train_samples = len(train_data)
dev_samples = len(dev_data)
print('train shape:', train_samples)
print('dev shape:', dev_samples)
print()

word_train_data = []
word_dev_data = []
# flatten each sentence into a word-level list, separately for train and dev
for each in train_data:
    word_train_data.extend(list(each[0]))
for each in dev_data:
    word_dev_data.extend(list(each[0]))

word_train_samples = len(word_train_data)
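# A minimal sketch of the flattening above, assuming each sample's first
# element is its token sequence (hypothetical data):
#   sample = (['EU', 'rejects', 'German', 'call'], ...)
#   list(sample[0]) -> ['EU', 'rejects', 'German', 'call']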
Example #2
import os
import sys

# make the shared tools package importable
sys.path.append('../')
sys.path.append('../tools')

from tools import conf
from tools import load_data
from tools import prepare

# input sentence dimensions
step_length = conf.ner_step_length
pos_length = conf.ner_pos_length
chunk_length = conf.ner_chunk_length

IOB = conf.ner_BIOES_decode
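# BIOES refines plain BIO by marking entity-final tokens (E-) and
# single-token entities (S-); e.g. BIO's B-LOC I-LOC I-LOC becomes
# B-LOC I-LOC E-LOC, and a lone B-PER becomes S-PER.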

test_data = load_data.load_ner(dataset='eng.testb')

# epoch checkpoint to load, taken from the command line
best_epoch = sys.argv[1]

# strip a fixed 9-character prefix and the '.py' suffix from the file name
model_name = os.path.basename(__file__)[9:-3]
folder_path = './model/%s' % model_name

model_path = '%s/model_epoch_%s.h5' % (folder_path, best_epoch)
result = open('%s/predict.txt' % folder_path, 'w')


def convert(chunktags):
    # convert BIOES tags back to BIO: E- becomes I-, S- becomes B-
    for p, q in enumerate(chunktags):
        if q.startswith("E-"):
            chunktags[p] = "I-" + q[2:]
        elif q.startswith("S-"):
            chunktags[p] = "B-" + q[2:]
    return chunktags
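# quick check on a hypothetical tag sequence:
#   convert(['B-LOC', 'E-LOC', 'O', 'S-PER'])
#   -> ['B-LOC', 'I-LOC', 'O', 'B-PER']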
Example #3
import os
import sys

sys.path.append('../')
sys.path.append('../tools')

from tools import conf
from tools import load_data
from tools import prepare

# input sentence dimensions
step_length = conf.ner_step_length
pos_length = conf.ner_pos_length
chunk_length = conf.ner_chunk_length
gazetteer_length = conf.gazetteer_length

IOB = conf.ner_BIOES_decode

data = sys.argv[1]

best_epoch = sys.argv[2]
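# usage sketch (script name hypothetical): python test_model.py dev 7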

if data == "dev":
    test_data = load_data.load_ner(dataset='eng.testa', form='BIOES')
elif data == "test":
    test_data = load_data.load_ner(dataset='eng.testb', form='BIOES')
else:
    sys.exit('expected "dev" or "test", got %r' % data)
tokens = [len(x[0]) for x in test_data]
print(sum(tokens))
print('%s shape:' % data, len(test_data))

model_name = os.path.basename(__file__)[9:-3]
folder_path = './model/%s' % model_name

model_path = '%s/model_epoch_%s.h5' % (folder_path, best_epoch)
result = open('%s/predict.txt' % folder_path, 'w')

def convert(chunktags):
    # convert BIOES tags back to BIO: E- becomes I-, S- becomes B-
    # (same helper as in Example #2)
    for p, q in enumerate(chunktags):
        if q.startswith("E-"):
            chunktags[p] = "I-" + q[2:]
        elif q.startswith("S-"):
            chunktags[p] = "B-" + q[2:]
    return chunktags
Example #4
import os
import sys

import numpy as np
import pandas as pd

sys.path.append('../')
sys.path.append('../tools')

from tools import conf
from tools import load_data

hash_vocab = conf.ner_hash_vocab
hash_length = conf.ner_hash_length

output_length = conf.ner_BIOES_length

batch_size = conf.batch_size
nb_epoch = 50  # overrides conf.nb_epoch

model_name = os.path.basename(__file__)[:-3]

folder_path = 'model/%s' % model_name
if not os.path.isdir(folder_path):
    os.makedirs(folder_path)

# the data, shuffled and split between train and dev sets
train_data = load_data.load_ner(dataset='eng.train', form='BIOES')
dev_data = load_data.load_ner(dataset='eng.testa', form='BIOES')

train_samples = len(train_data)
dev_samples = len(dev_data)
print('train shape:', train_samples)
print('dev shape:', dev_samples)
print()

# SENNA word embeddings, one vector per row
word_embedding = pd.read_csv('../preprocessing/senna/embeddings.txt',
                             delimiter=' ',
                             header=None)
word_embedding = word_embedding.values
# emb_length is presumably set from conf earlier in the full script;
# derive it from the loaded matrix so the excerpt runs on its own
emb_length = word_embedding.shape[1]
# prepend an all-zero row and append a random row (see note below)
word_embedding = np.concatenate([
    np.zeros((1, emb_length)), word_embedding,
    np.random.uniform(-1, 1, (1, emb_length))
])
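# Index convention implied by the two extra rows (an assumption, not
# stated in the excerpt): index 0 is the all-zero padding vector and the
# final random row stands in for out-of-vocabulary words.
padding_id = 0                            # hypothetical name
unknown_id = word_embedding.shape[0] - 1  # hypothetical name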