start = datetime.now()
print('-' * 60)
print('epoch %d start at %s' % (epoch, str(start)))
log.write('-' * 60 + '\n')
log.write('epoch %d start at %s\n' % (epoch, str(start)))
train_loss = 0
dev_loss = 0

# shuffle the training words before slicing them into mini-batches
np.random.shuffle(word_train_data)
for i in range(number_of_train_batches):
    train_batch = word_train_data[i * batch_size:(i + 1) * batch_size]
    # bigram word-hashing features for the NER vocabulary; the autoencoder
    # is trained to reconstruct its own input
    X_train_batch = prepare.prepare_auto_encoder(batch=train_batch, task='ner', gram='bi')
    X_train_batch = X_train_batch.toarray()
    train_metrics = model.train_on_batch(X_train_batch, X_train_batch)
    train_loss += train_metrics[0]
all_train_loss.append(train_loss)

for j in range(number_of_dev_batches):
    dev_batch = word_dev_data[j * batch_size:(j + 1) * batch_size]
    X_dev_batch = prepare.prepare_auto_encoder(batch=dev_batch, task='ner', gram='bi')
    X_dev_batch = X_dev_batch.toarray()
    dev_metrics = model.test_on_batch(X_dev_batch, X_dev_batch)
    dev_loss += dev_metrics[0]
all_dev_loss.append(dev_loss)
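# The epoch loop above assumes an autoencoder `model` built and compiled
# earlier in the script. The exact architecture is not shown here; a minimal
# hypothetical sketch in Keras (layer sizes, activations, and loss are
# assumptions, not the authors' configuration) would be:
#
#   from keras.models import Model
#   from keras.layers import Input, Dense
#
#   inputs = Input(shape=(input_dim,))        # input_dim = word-hashing vector size
#   encoded = Dense(hidden_dim, activation='relu')(inputs)
#   decoded = Dense(input_dim, activation='sigmoid')(encoded)
#   model = Model(inputs, decoded)
#   encoder = Model(inputs, encoded)
#   # compiling with a metric makes train_on_batch/test_on_batch return a list,
#   # which is why the loop indexes train_metrics[0]
#   model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['mse'])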
        test_word_dict[each] += 1
    else:
        test_word_dict[each] = 1

# dict.keys() is not subscriptable in Python 3, so materialise the lists first
train_word = list(train_word_dict.keys())
dev_word = list(dev_word_dict.keys())
test_word = list(test_word_dict.keys())

# pick the first 20 words of the split selected by `test`
if test == 'dev':
    word = dev_word[:20]
elif test == 'test':
    word = test_word[:20]
else:
    word = train_word[:20]

word_hashing = prepare.prepare_auto_encoder(batch=word, task='ner')
word_hashing = word_hashing.toarray()
output = model.predict_on_batch(word_hashing)

# interactively inspect how often a chosen word occurs in each split
while True:
    number = int(input('please input word index: '))
    exist = word[number]
    print('word is: ' + exist)
    if exist in train_word_dict:
        print(' in train: ' + str(train_word_dict[exist]) + ' times.')
    if exist in dev_word_dict:
        print(' in dev: ' + str(dev_word_dict[exist]) + ' times.')
    if exist in test_word_dict:
        print(' in test: ' + str(test_word_dict[exist]) + ' times.')
    print('-' * 60)
    ind = []
start = datetime.now()
print('-' * 60)
print('epoch %d start at %s' % (epoch, str(start)))
log.write('-' * 60 + '\n')
log.write('epoch %d start at %s\n' % (epoch, str(start)))
train_loss = 0
dev_loss = 0

# chunk-task counterpart of the NER epoch loop above
np.random.shuffle(word_train_data)
for i in range(number_of_train_batches):
    train_batch = word_train_data[i * batch_size:(i + 1) * batch_size]
    X_train_batch = prepare.prepare_auto_encoder(batch=train_batch, task='chunk')
    X_train_batch = X_train_batch.toarray()
    train_metrics = model.train_on_batch(X_train_batch, X_train_batch)
    train_loss += train_metrics[0]
all_train_loss.append(train_loss)

for j in range(number_of_dev_batches):
    dev_batch = word_dev_data[j * batch_size:(j + 1) * batch_size]
    X_dev_batch = prepare.prepare_auto_encoder(batch=dev_batch, task='chunk')
    X_dev_batch = X_dev_batch.toarray()
    dev_metrics = model.test_on_batch(X_dev_batch, X_dev_batch)
    dev_loss += dev_metrics[0]
all_dev_loss.append(dev_loss)

# dev loss improved over the best seen so far
if dev_loss < min_loss:
test_data = load_data.load_chunk(dataset='test.txt')

# collect every distinct word that appears in the train/dev/test splits
all_word = []
for each in train_data:
    all_word.extend(list(each[0]))
for each in dev_data:
    all_word.extend(list(each[0]))
for each in test_data:
    all_word.extend(list(each[0]))
all_word = [each.strip().lower() for each in all_word]
all_word = list(set(all_word))

for i, word in enumerate(all_word):
    w.write(word + '\n')
    word_hashing = prepare.prepare_auto_encoder(batch=[word], task='chunk', gram='bi')
    word_hashing = word_hashing.toarray()
    # the trained encoder maps the word-hashing vector to its embedding
    representation = encoder.predict_on_batch(word_hashing)
    # min-max normalisation; the z-score variant is kept for reference
    # normalization = (representation - np.mean(representation)) / np.std(representation)
    normalization = (representation - np.min(representation)) / (
        np.max(representation) - np.min(representation))
    embeddings.loc[i] = normalization[0]

embeddings.to_csv(
    '../preprocessing/chunk-auto-encoder-2/auto-encoder-embeddings.txt',
    sep=' ', header=False, index=False, float_format='%.6f')
w.close()
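# auto-encoder-embeddings.txt holds one space-separated embedding per line,
# in the same order as the words written through `w`. A hypothetical sketch
# of reading the two files back into a lookup table (the word-list filename
# is an assumption; only the embeddings path above comes from the code):
#
#   import pandas as pd
#   words = [line.strip() for line in
#            open('../preprocessing/chunk-auto-encoder-2/words.txt')]
#   vectors = pd.read_csv('../preprocessing/chunk-auto-encoder-2/auto-encoder-embeddings.txt',
#                         sep=' ', header=None)
#   word2vec = {word: vectors.iloc[i].values for i, word in enumerate(words)}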