Example #1
    # start of an epoch: record the start time for console and log output
    start = datetime.now()

    print('-' * 60)
    print('epoch %d start at %s' % (epoch, str(start)))

    log.write('-' * 60 + '\n')
    log.write('epoch %d start at %s\n' % (epoch, str(start)))
    train_loss = 0
    dev_loss = 0

    np.random.shuffle(word_train_data)

    # train the autoencoder batch by batch; input and target are the same bigram-hashed matrix
    for i in range(number_of_train_batches):
        train_batch = word_train_data[i * batch_size:(i + 1) * batch_size]
        X_train_batch = prepare.prepare_auto_encoder(batch=train_batch,
                                                     task='ner',
                                                     gram='bi')
        X_train_batch = X_train_batch.toarray()
        train_metrics = model.train_on_batch(X_train_batch, X_train_batch)
        train_loss += train_metrics[0]
    all_train_loss.append(train_loss)

    # score the dev batches with test_on_batch, which does not update the weights
    for j in range(number_of_dev_batches):
        dev_batch = word_dev_data[j * batch_size:(j + 1) * batch_size]
        X_dev_batch = prepare.prepare_auto_encoder(batch=dev_batch,
                                                   task='ner',
                                                   gram='bi')
        X_dev_batch = X_dev_batch.toarray()
        dev_metrics = model.test_on_batch(X_dev_batch, X_dev_batch)
        dev_loss += dev_metrics[0]
    all_dev_loss.append(dev_loss)
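
# ---------------------------------------------------------------------------
# The loops above call prepare.prepare_auto_encoder(batch, task='ner', gram='bi'),
# whose implementation is not shown in this snippet. Below is a minimal,
# self-contained sketch of what such a character-bigram "word hashing" step
# could look like; the bigram vocabulary, the '#' padding and the helper name
# hash_words_bigram are assumptions, not the project's actual code.
from itertools import product
from string import ascii_lowercase

import numpy as np
from scipy.sparse import lil_matrix

_CHARS = '#' + ascii_lowercase
_BIGRAMS = {''.join(p): i for i, p in enumerate(product(_CHARS, repeat=2))}


def hash_words_bigram(batch):
    """Return a sparse (len(batch), n_bigrams) count matrix for a list of words."""
    X = lil_matrix((len(batch), len(_BIGRAMS)), dtype=np.float32)
    for row, word in enumerate(batch):
        padded = '#' + word.lower() + '#'
        for a, b in zip(padded, padded[1:]):
            col = _BIGRAMS.get(a + b)
            if col is not None:
                X[row, col] += 1
    return X.tocsr()

# usage mirroring the training loop above:
#     X_train_batch = hash_words_bigram(train_batch).toarray()
# ---------------------------------------------------------------------------
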
Example #2
# count how many times each word occurs in the test set
# (the loop head is truncated in this snippet)
        test_word_dict[each] += 1
    else:
        test_word_dict[each] = 1

# materialize the dict keys as lists so they can be sliced below (required on Python 3)
train_word = list(train_word_dict.keys())
dev_word = list(dev_word_dict.keys())
test_word = list(test_word_dict.keys())

if test == 'dev':
    word = dev_word[:20]
elif test == 'test':
    word = test_word[:20]
else:
    word = train_word[:20]

word_hashing = prepare.prepare_auto_encoder(batch=word, task='ner')
word_hashing = word_hashing.toarray()
output = model.predict_on_batch(word_hashing)
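# 'output' is presumably the autoencoder's reconstruction of word_hashing; a
# quick sanity check (an addition, not part of the original snippet) is the
# per-word reconstruction error:
reconstruction_error = np.mean((output - word_hashing) ** 2, axis=1)
for wd, err in zip(word, reconstruction_error):
    print('%-20s reconstruction MSE: %.6f' % (wd, err))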

while True:
    number = int(input('please input word index: '))
    exist = word[number]
    print('word is: ' + exist)
    if exist in train_word_dict:
        print('    in train: ' + str(train_word_dict[exist]) + ' times.')
    if exist in dev_word_dict:
        print('    in dev: ' + str(dev_word_dict[exist]) + ' times.')
    if exist in test_word_dict:
        print('    in test: ' + str(test_word_dict[exist]) + ' times.')
    print('-' * 60)
    ind = []
    start = datetime.now()

    print('-' * 60)
    print('epoch %d start at %s' % (epoch, str(start)))

    log.write('-' * 60 + '\n')
    log.write('epoch %d start at %s\n' % (epoch, str(start)))
    train_loss = 0
    dev_loss = 0

    np.random.shuffle(word_train_data)

    for i in range(number_of_train_batches):
        train_batch = word_train_data[i * batch_size:(i + 1) * batch_size]
        X_train_batch = prepare.prepare_auto_encoder(batch=train_batch,
                                                     task='chunk')
        X_train_batch = X_train_batch.toarray()
        train_metrics = model.train_on_batch(X_train_batch, X_train_batch)
        train_loss += train_metrics[0]
    all_train_loss.append(train_loss)

    for j in range(number_of_dev_batches):
        dev_batch = word_dev_data[j * batch_size:(j + 1) * batch_size]
        X_dev_batch = prepare.prepare_auto_encoder(batch=dev_batch,
                                                   task='chunk')
        X_dev_batch = X_dev_batch.toarray()
        dev_metrics = model.test_on_batch(X_dev_batch, X_dev_batch)
        dev_loss += dev_metrics[0]
    all_dev_loss.append(dev_loss)

    if dev_loss < min_loss:
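        # (the body of this check is truncated in the snippet; a typical
        #  continuation, assumed here rather than taken from the original,
        #  would record the new best dev loss and checkpoint the model)
        min_loss = dev_loss
        model.save_weights('auto-encoder-chunk-best.h5')  # hypothetical filename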
test_data = load_data.load_chunk(dataset='test.txt')

all_word = []

# collect every word from the train, dev and test sets
for each in train_data:
    all_word.extend(list(each[0]))
for each in dev_data:
    all_word.extend(list(each[0]))
for each in test_data:
    all_word.extend(list(each[0]))

all_word = [each.strip().lower() for each in all_word]
all_word = list(set(all_word))
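# note: set() does not give a stable order across runs; if the row order of the
# embeddings file matters, sorting here (e.g. all_word = sorted(set(all_word)))
# keeps it reproducible. This is a suggestion, not part of the original script.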

# w is a text file handle opened earlier in the (truncated) script; it records
# the words in the same order as the embedding rows written below
for i, word in enumerate(all_word):
    w.write(word + '\n')
    word_hashing = prepare.prepare_auto_encoder(batch=[word],
                                                task='chunk',
                                                gram='bi')
    word_hashing = word_hashing.toarray()
    representation = encoder.predict_on_batch(word_hashing)
    # min-max scale the representation to [0, 1]
    # (z-score alternative: (representation - np.mean(representation)) / np.std(representation))
    normalization = (representation - np.min(representation)) / (
        np.max(representation) - np.min(representation))
    embeddings.loc[i] = normalization[0]

embeddings.to_csv(
    '../preprocessing/chunk-auto-encoder-2/auto-encoder-embeddings.txt',
    sep=' ',
    header=False,
    index=False,
    float_format='%.6f')
w.close()
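
# One way (an assumption, not part of the original script) to load the exported
# embeddings back for inspection; each row of the file is one word's vector,
# in the same order as the words written through the 'w' file handle above.
import pandas as pd

emb = pd.read_csv(
    '../preprocessing/chunk-auto-encoder-2/auto-encoder-embeddings.txt',
    sep=' ',
    header=None)
print('loaded %d embeddings of dimension %d' % (emb.shape[0], emb.shape[1]))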