def main(args):
    """Build a vocabulary file from the corpus files named on the command line.

    The input paths come from ``args.filelist`` (read through
    ``utils.read_lines``) when it is given, otherwise from ``args.files``.
    The vocabulary is written to ``args.vocab``.
    """
    # An explicit file list takes precedence over the paths passed directly.
    input_paths = (
        args.files
        if args.filelist is None
        else utils.read_lines(args.filelist)
    )
    utils.build_vocabulary(
        paths_file=input_paths,
        path_vocab=args.vocab,
        prune_at=50000,
        min_count=-1,
        special_words=["<root>"],
    )
def main():
    """Build the fine- and coarse-grained relation vocabularies for SciDTB.

    Every ``*.edus.tokens`` file under the preprocessed train/dev/test
    directories is paired with its ``.arcs`` sibling. The relation labels of
    each arcs file are written as one line to a temporary "fine" file and,
    after mapping through ``RelationMapper.f2c``, to a temporary "coarse"
    file. ``utils.build_vocabulary`` is then run once on each temporary file.
    """
    config = utils.Config()
    data_root = config.getpath("data")
    vocab_dir = os.path.join(data_root, "scidtb-vocab")
    utils.mkdir(vocab_dir)

    relation_mapper = treetk.rstdt.RelationMapper(corpus_name="scidtb")

    # Directories scanned for token files, in the original order.
    preprocessed_dir = os.path.join(data_root, "scidtb", "preprocessed")
    split_dirs = [
        os.path.join(preprocessed_dir, "train"),
        os.path.join(preprocessed_dir, "dev", "gold"),
        os.path.join(preprocessed_dir, "dev", "second_annotate"),
        os.path.join(preprocessed_dir, "test", "gold"),
        os.path.join(preprocessed_dir, "test", "second_annotate"),
    ]
    filenames = sorted(
        os.path.join(split_dir, entry)
        for split_dir in split_dirs
        for entry in os.listdir(split_dir)
        if entry.endswith(".edus.tokens")
    )

    tmp_f_path = os.path.join(vocab_dir, "tmp_f.txt")
    tmp_c_path = os.path.join(vocab_dir, "tmp_c.txt")
    with open(tmp_f_path, "w") as fine_file, open(tmp_c_path, "w") as coarse_file:
        for tokens_path in filenames:
            arcs_path = tokens_path.replace(".edus.tokens", ".arcs")
            lines = utils.read_lines(arcs_path,
                                     process=lambda line: line.split())
            # Each arcs file is expected to hold exactly one line.
            assert len(lines) == 1
            arcs = treetk.hyphens2arcs(lines[0])
            fine_labels = [label for _head, _dep, label in arcs]
            coarse_labels = [relation_mapper.f2c(label) for label in fine_labels]
            fine_file.write("%s\n" % " ".join(fine_labels))
            coarse_file.write("%s\n" % " ".join(coarse_labels))

    # Both temporary files are closed (and flushed) before being read back.
    for tmp_path, vocab_name in (
            (tmp_f_path, "relations.fine.vocab.txt"),
            (tmp_c_path, "relations.coarse.vocab.txt")):
        utils.build_vocabulary(
            paths_file=[tmp_path],
            path_vocab=os.path.join(vocab_dir, vocab_name),
            prune_at=50000,
            min_count=-1,
            special_words=["<root>"],
            with_unk=False)
def make_partitions_quora(self):
    """Shuffle the data and split it into indexed train/dev/test partitions.

    The vocabulary is built from the first 231027 non-similar and the first
    133263 similar examples; every partition is then converted with
    ``self.to_index_data`` using that vocabulary.

    :return: ``(train_non_sim, train_sim, dev_non_sim, dev_sim,
        test_non_sim, test_sim, vocab_processor, sequence_length)``
    """
    self.shuffle()

    vocab_non_sim = self._non_sim_data[:231027]
    # Fix: this used to slice ``self._non_sim_data``, so the similar examples
    # never contributed to the vocabulary.
    vocab_sim = self._sim_data[:133263]
    vocab_processor, sequence_length = build_vocabulary(
        vocab_sim, vocab_non_sim)

    def to_index(examples):
        # Convert a slice of raw examples with the shared vocabulary.
        return [self.to_index_data(data, vocab_processor)
                for data in examples]

    # NOTE(review): the train slices stop at 207026 / 117262 while the dev
    # slices start at 207027 / 117263, so one example per class is used in
    # no partition. The boundaries are kept to leave partition sizes
    # unchanged; confirm whether the gap is intended.
    train_non_sim = to_index(self._non_sim_data[:207026])
    train_sim = to_index(self._sim_data[:117262])
    dev_non_sim = to_index(self._non_sim_data[207027:231027])
    dev_sim = to_index(self._sim_data[117263:133263])
    test_non_sim = to_index(self._non_sim_data[231027:])
    test_sim = to_index(self._sim_data[133263:])

    return train_non_sim, train_sim, dev_non_sim, dev_sim, \
        test_non_sim, test_sim, vocab_processor, sequence_length
def __init__(self, batch_size, sequence_length, data_path='./ptb_data/', seed=123):
    """Load the PTB text files and slice them into fixed-length sequences.

    The word/id mappings are built from the training file only and reused
    for the validation and test files. For every split, ``*_x`` holds
    chunks of ``sequence_length`` ids and ``*_y`` holds the same chunks
    shifted one position ahead.

    :param batch_size: number of sequences per batch.
    :param sequence_length: number of ids in each sequence.
    :param data_path: prefix prepended to the file names as-is, so it must
        end with a path separator.
    :param seed: seed passed to ``np.random.seed``.
    """
    np.random.seed(seed)
    self.batch_size = batch_size
    self.seq_len = sequence_length

    train_file = data_path + 'ptb.train_small.txt'
    valid_file = data_path + 'ptb.valid.txt'
    test_file = data_path + 'ptb.test.txt'

    # Original notes: ids follow training-set word frequency, e.g.
    # 'the' -> 0, <unk> -> 1, ..., 'wachter' -> 9999; the whole dictionary
    # is saved in ptb_data/ptb_word_to_id.txt.
    self.word_to_id, self.id_to_word = build_vocabulary(train_file)
    train_data = file_to_word_ids(train_file, self.word_to_id)
    valid_data = file_to_word_ids(valid_file, self.word_to_id)
    test_data = file_to_word_ids(test_file, self.word_to_id)

    def split_xy(token_ids):
        # One id is held back so the shifted targets have the same length
        # as the inputs.
        n_chunk = int((len(token_ids) - 1) // self.seq_len)
        span = n_chunk * self.seq_len
        shape = [n_chunk, self.seq_len]
        inputs = np.reshape(token_ids[:span], shape)
        targets = np.reshape(token_ids[1:span + 1], shape)
        return inputs, targets

    self.train_x, self.train_y = split_xy(train_data)
    self.valid_x, self.valid_y = split_xy(valid_data)
    self.test_x, self.test_y = split_xy(test_data)

    self.mode = ''  # train, valid, or test
    self.counter = 0
    self.n_data = self.train_x.shape[0]
    self.n_batch = int(self.n_data // self.batch_size)
    self.data_idx_perm = np.random.permutation(self.n_data)
def originalVocabulary(filename: str) -> dict:
    '''
    Create the original vocabulary from the data stored in ``filename``.

    Each record's second field is lower-cased and split on whitespace; the
    tokens and the record's third field (the sentiment) are passed to
    ``utils.build_vocabulary``, which fills the dictionary in place.
    Entries are meant to follow the format:
    term -> [frequency, # of yes's, # of no's]
    '''
    vocabulary = {}
    for record in utils.load_data(filename):
        words = record[1].lower().split()
        utils.build_vocabulary(vocabulary, words, record[2])
    return vocabulary
def load_ibm():
    """
    Load the IBM train, dev and test corpora and convert them to indices.

    The vocabulary is built from the training corpus only and then applied
    to all three corpora.

    :return: ``(train_non_sim, train_sim, dev_non_sim, dev_sim, test_sim,
        test_non_sim, vocab_processor, seq_len)``
    """
    ibm_dir = '/home/mgimenez/Dev/corpora/Quora/IBM'
    train, dev, test = [
        Corpus('ibm', join(ibm_dir, split_file))
        for split_file in ('train.tsv', 'dev.tsv', 'test.tsv')
    ]

    vocab_processor, seq_len = build_vocabulary(train.sim_data,
                                                train.non_sim_data)
    for corpus in (train, dev, test):
        corpus.to_index(vocab_processor)

    # NOTE(review): the test pair is returned as (sim, non_sim) while the
    # train and dev pairs are (non_sim, sim). Kept as-is because callers may
    # unpack in this order; confirm it is intended.
    return (
        train.non_sim_data, train.sim_data,
        dev.non_sim_data, dev.sim_data,
        test.sim_data, test.non_sim_data,
        vocab_processor, seq_len,
    )
def create_vocabularies(self, num_sim_sentences, num_nonsim_sentences,
                        partitions_path=None):
    """
    Create and, optionally, save the vocabularies.

    :param num_sim_sentences: number of leading similar examples used to
        build the vocabulary.
    :param num_nonsim_sentences: number of leading non-similar examples
        used to build the vocabulary.
    :param partitions_path: directory where the pickled vocabulary
        processor ("vocab.train") and sequence length ("sequence.len") are
        saved. If this is not present the vocabularies won't be saved.
    :return: the vocabulary processor.
    """
    vocab_non_sim = self._non_sim_data[:num_nonsim_sentences]
    vocab_sim = self._sim_data[:num_sim_sentences]
    vocab_processor, sequence_length = build_vocabulary(
        vocab_sim, vocab_non_sim)

    if partitions_path:
        if not isdir(partitions_path):
            makedirs(partitions_path)
        # Context managers close (and flush) the files; the handles used to
        # be opened inline and never closed.
        with open(join(partitions_path, "vocab.train"), "wb") as vocab_file:
            pickle.dump(vocab_processor, vocab_file)
        with open(join(partitions_path, "sequence.len"), "wb") as length_file:
            pickle.dump(sequence_length, length_file)

    return vocab_processor
random_seed=0) num_train, num_dev, fr_train, fr_dev = train_test_split(numbers, french_numbers, test_size=0.5, random_state=0) num_val, num_test, fr_val, fr_test = train_test_split(num_dev, fr_dev, test_size=0.5, random_state=0) tokenized_fr_train = [tokenize(s, word_level=True) for s in fr_train] tokenized_num_train = [tokenize(s, word_level=False) for s in num_train] fr_vocab, rev_fr_vocab = build_vocabulary(fr_train) num_vocab, rev_num_vocab = build_vocabulary(num_train, word_level=False) shared_vocab, rev_shared_vocab = build_vocabulary_token(tokenized_fr_train + tokenized_num_train) np.save('data_npy/fr_train', fr_train) np.save('data_npy/num_train', num_train) np.save('data_npy/fr_val', fr_val) np.save('data_npy/num_val', num_val) np.save('data_npy/fr_test', fr_test) np.save('data_npy/num_test', num_test) # np.save('data_npy/fr_vocab',fr_vocab) np.save('data_npy/rev_fr_vocab', rev_fr_vocab) # np.save('data_npy/num_vocab',num_vocab) np.save('data_npy/rev_num_vocab', rev_num_vocab) # np.save('data_npy/shared_vocab',shared_vocab)
def main():
    """
    Entry point for training and evaluation.

    Creates the summary writers and a time-stamped model directory, pickles
    the flag configuration, builds the vocabulary and compressed word
    embeddings, then trains the BiLSTM for ``FLAGS.epochs`` steps. Every
    101st step also runs a validation batch. A checkpoint is saved when
    training ends.
    """
    args = parse_arguments()

    # Summaries
    summaries_dir = '{0}/{1}'.format(
        FLAGS.summaries_dir,
        datetime.datetime.now().strftime('%d_%b_%Y-%H_%M_%S'))
    train_writer = tf.summary.FileWriter(summaries_dir + '/train')
    validation_writer = tf.summary.FileWriter(summaries_dir + '/validation')

    # Model directory
    model_name = str(int(time.time()))
    model_dir = '{0}/{1}'.format(FLAGS.checkpoints_dir, model_name)
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    # Save configuration
    FLAGS(sys.argv)
    config = FLAGS
    with open('{}/config.pkl'.format(model_dir), 'wb') as f:
        pickle.dump(config, f)

    # Generate vocabulary and load compressed word embeddings model
    vocabulary = utils.build_vocabulary()
    ft_model = embeddings.get_fastText_embedding()
    word_embeddings = utils.compress_word_embedding(vocabulary, ft_model)

    try:
        with tf.Session() as sess:
            model = BiLSTM(hidden_size=[FLAGS.hidden_size],
                           word_embeddings=word_embeddings,
                           embedding_size=300,
                           vocabulary_size=len(vocabulary),
                           max_seq_length=FLAGS.max_seq_length,
                           learning_rate=FLAGS.learning_rate)

            # Saver object
            saver = tf.train.Saver()

            # Fix: initialise BEFORE restoring. The initializer used to run
            # after saver.restore(), overwriting the restored weights.
            sess.run(tf.global_variables_initializer())

            # Restore checkpoint
            if args.checkpoint:
                # NOTE(review): the checkpoint suffix '155' is hard-coded.
                saver.restore(sess, FLAGS.checkpoints_dir + '155')

            # Train model
            global_step = 0

            # TODO implement tf.Dataset.
            for epoch in range(FLAGS.epochs):
                X_train, y_train, seq_lengths = utils.generate_data_batch(
                    batch_size=FLAGS.batch_size,
                    max_seq_length=FLAGS.max_seq_length,
                    vocabulary=vocabulary,
                    embeddings=word_embeddings)

                feeds_train = [
                    model.loss,
                    model.train_step,
                    model.merged
                ]
                feed_dict_train = {
                    model.input: X_train,
                    model.target: y_train,
                    model.seq_len: seq_lengths,
                    model.keep_prob: FLAGS.keep_prob
                }

                # The old try/except only re-raised, so it was dropped.
                train_loss, _, summary = sess.run(feeds_train,
                                                  feed_dict_train)

                train_writer.add_summary(summary, global_step)
                # NOTE(review): the loop runs FLAGS.epochs steps but the
                # message reports FLAGS.train_steps; confirm which is meant.
                print('{0}/{1} train loss: {2:.4f}'.format(
                    global_step + 1, FLAGS.train_steps, train_loss))

                # Check validation performance
                if (global_step + 1) % 101 == 0:
                    # TODO implement tf.Dataset.
                    X_val, y_val, val_seq_len = utils.generate_data_batch(
                        max_seq_length=FLAGS.max_seq_length,
                        vocabulary=vocabulary,
                        train=False)

                    feed_val = [
                        model.loss,
                        model.accuracy,
                        model.merged
                    ]
                    feed_dict_val = {
                        model.input: X_val,
                        model.target: y_val,
                        model.seq_len: val_seq_len,
                        # Dropout is disabled for validation.
                        model.keep_prob: 1
                    }

                    val_loss, accuracy, summary = sess.run(
                        feed_val, feed_dict_val)

                    validation_writer.add_summary(summary, global_step)
                    print(' validation loss: {0:.4f} (accuracy {1:.4f})'.format(
                        val_loss, accuracy))

                global_step += 1

            # End train batch
            saver.save(sess, '{}/model.ckpt'.format(model_dir),
                       global_step=global_step)
    finally:
        # Close the writers so buffered summaries reach disk even when
        # training aborts.
        train_writer.close()
        validation_writer.close()
def write_partitions_mixed(self, partitions_path, one_hot=False):
    """
    Create the partitions and write them as TFRecord files.

    The vocabulary processor and sequence length are pickled into
    ``partitions_path`` ("vocab.train", "sequence.len"). Three files are
    then written: "train.tfrecords", "dev.tfrecords" and "test.tfrecords".
    Each starts with alternating non-similar / similar examples and ends
    with a run of non-similar examples only.

    The dataset is not shuffled here because the pipeline handles the
    shuffle.

    :param partitions_path: output directory, created if missing.
    :param one_hot: forwarded unchanged to ``write_tfrecord``.
    """
    # Create and save the vocabularies
    vocab_non_sim = self._non_sim_data[:231027]
    # Fix: this used to slice ``self._non_sim_data``, so the similar
    # examples never contributed to the vocabulary.
    vocab_sim = self._sim_data[:133263]
    vocab_processor, sequence_length = build_vocabulary(
        vocab_sim, vocab_non_sim)

    if not isdir(partitions_path):
        makedirs(partitions_path)
    # Context managers close the pickle files, which used to be left open.
    with open(join(partitions_path, "vocab.train"), "wb") as vocab_file:
        pickle.dump(vocab_processor, vocab_file)
    with open(join(partitions_path, "sequence.len"), "wb") as length_file:
        pickle.dump(sequence_length, length_file)

    def write_example(writer, data):
        # Index one raw example and append it to the open TFRecord file.
        data_idx = self.to_index_data(data, vocab_processor)
        write_tfrecord(writer, data_idx, one_hot)

    def write_partition(filename, mixed_non_sim, mixed_sim, tail_non_sim):
        # Write one partition file and return the number of examples. The
        # writer is closed on exit so buffered records are flushed.
        lines = 0
        with tf.python_io.TFRecordWriter(
                join(partitions_path, filename)) as writer:
            # Mixed part: similar and non similar sentences
            for i, j in zip(mixed_non_sim, mixed_sim):
                write_example(writer, self._non_sim_data[i])
                write_example(writer, self._sim_data[j])
                lines += 2
            # Remaining non-similar sentences
            for i in tail_non_sim:
                write_example(writer, self._non_sim_data[i])
                lines += 1
        return lines

    # Create and save the TRAIN FILE
    lines = write_partition("train.tfrecords",
                            range(133263), range(133263),
                            range(133263, 231027))
    print("Saved {} data examples for training".format(lines))

    # Create and save the DEV FILE
    lines = write_partition("dev.tfrecords",
                            range(231027, 239027), range(133263, 141263),
                            range(239027, 243027))
    print("Saved {} data examples for development".format(lines))

    # Create and save the TEST FILE
    lines = write_partition("test.tfrecords",
                            range(243027, 251027), range(141263, 149263),
                            range(251027, 255027))
    print("Saved {} data examples for testing".format(lines))
import datetime import os import sys import zipfile import numpy as np import tensorflow as tf from utils import build_vocabulary MODELS_FOLDER = os.path.join(os.path.dirname(__file__), "models") TIMESTAMP = datetime.datetime.now().strftime("%Y-%m-%d-%H%M%S") input_filenames = sys.argv[1:] _, indices, count, dictionary, reverse_dictionary = build_vocabulary( input_filenames) vocab_size = len(dictionary) print(indices[:7]) window_size = 3 vector_dim = 300 epochs = 200000 valid_size = 16 # Random set of words to evaluate similarity on. valid_window = 100 # Only pick dev samples in the head of the distribution. valid_examples = np.random.choice(valid_window, valid_size, replace=False) sampling_table = sequence.make_sampling_table(vocab_size) couples, labels = skipgrams(indices, vocab_size, window_size=window_size,
# Folder holding the serialized model files, next to this script.
MODELS_FOLDER = os.path.join(os.path.dirname(__file__), "models")
# Architecture (JSON) and weights (HDF5) of one specific saved LSTM run.
LSTM_MODEL_PATH = MODELS_FOLDER + "/lstm_model_2018-12-12-005552.json"
LSTM_WEIGHTS_PATH = MODELS_FOLDER + "/lstm_model_2018-12-12-005552.h5"
# Maximum number of words kept from the user's input sequence.
SEQUENCE_LEN = 20
# NOTE(review): not used in the visible part of the script; presumably a
# stopping criterion for text generation. Confirm against the rest of the file.
NUM_PERIODS_UNTIL_STOP = 50

if __name__ == "__main__":
    # Load the LSTM model.
    with open(LSTM_MODEL_PATH, "r") as json_file:
        lstm = model_from_json(json_file.read())
    lstm.load_weights(LSTM_WEIGHTS_PATH)

    # Build out our input vocabulary from the set of 'Data' files.
    # Only the last two of the five returned values are kept.
    input_filenames = glob.glob(DATA_FOLDER + "/*")
    _, _, _, dictionary, reverse_dictionary = build_vocabulary(input_filenames)

    # Capture an initial input sequence of 20 words or less.
    print("\n\n\n\n")
    print("How would you like to start your Sherlock Holmes story?")
    print(
        "Please input up to 20 'words' (note that all punctuation will also be considered a 'word')."
    )
    print("Type your words here and press <ENTER> when you are done:")
    # Flush so the prompt is shown before blocking on stdin.
    print(">>> ", end="", flush=True)
    input_seq = _preprocess(sys.stdin.readline())
    print("\nProcessing...")

    # Copy the input sequence as the initial output (capping at SEQUENCE_LEN).
    output = input_seq[:SEQUENCE_LEN]
def main():
    """Train and evaluate a character-level CNN POS tagger on Sakha data.

    Loads the CoNLL-U corpus, uses the first 1700 sentences for training
    and the rest for testing, builds the character and tag vocabularies,
    trains a ``SentenceLevelPOSTagger`` with ``train_eval_loop``, saves the
    best model to ``./models/cnn_pos`` and prints loss and classification
    reports for both splits.
    """
    # Load the data and split it into training and test sets.
    full = pyconll.load_from_file('./data/postag_sakha.conllu')
    full_train = full[:1700]
    full_test = full[1700:]
    print('Количество тренировочных предложений = ', len(full_train))
    print('Количество тестовых предложений = ', len(full_test))

    # Compute the maximum sentence and token lengths on the training set.
    MAX_SENT_LEN = max(len(sent) for sent in full_train)
    MAX_ORIG_TOKEN_LEN = max(
        len(token.form) for sent in full_train for token in sent)
    print('Наибольшая длина предложения', MAX_SENT_LEN)
    print('Наибольшая длина токена', MAX_ORIG_TOKEN_LEN)

    all_train_texts = [
        ' '.join(token.form for token in sent) for sent in full_train
    ]

    # Build the character vocabulary.
    train_char_tokenized = tokenize_corpus(all_train_texts,
                                           tokenizer=character_tokenize)
    char_vocab, word_doc_freq = build_vocabulary(train_char_tokenized,
                                                 max_doc_freq=1.0,
                                                 min_count=5,
                                                 pad_word='<PAD>')
    print("Количество уникальных символов", len(char_vocab))
    print(list(char_vocab.items())[:10])

    # Build the tag vocabulary; id 0 is reserved for '<NOTAG>'.
    UNIQUE_TAGS = ['<NOTAG>'] + sorted(
        {token.upos for sent in full_train for token in sent if token.upos})
    label2id = {label: i for i, label in enumerate(UNIQUE_TAGS)}
    print(label2id)

    # Convert the corpora to tensors.
    train_inputs, train_labels = pos_corpus_to_tensor(full_train, char_vocab,
                                                      label2id, MAX_SENT_LEN,
                                                      MAX_ORIG_TOKEN_LEN)
    train_dataset = TensorDataset(train_inputs, train_labels)

    test_inputs, test_labels = pos_corpus_to_tensor(full_test, char_vocab,
                                                    label2id, MAX_SENT_LEN,
                                                    MAX_ORIG_TOKEN_LEN)
    test_dataset = TensorDataset(test_inputs, test_labels)

    sentence_level_model = SentenceLevelPOSTagger(
        len(char_vocab),
        len(label2id),
        embedding_size=64,
        single_backbone_kwargs=dict(layers_n=4, kernel_size=5, dropout=0.3),
        context_backbone_kwargs=dict(layers_n=4, kernel_size=3, dropout=0.3))
    # Fix: np.product was removed in NumPy 2.0; np.prod is the supported
    # spelling and returns the same value.
    print('Количество параметров',
          sum(np.prod(t.shape) for t in sentence_level_model.parameters()))

    # The test split doubles as the validation set for early stopping.
    (best_val_loss, best_sentence_level_model) = train_eval_loop(
        sentence_level_model,
        train_dataset,
        test_dataset,
        F.cross_entropy,
        lr=5e-3,
        epoch_n=30,
        batch_size=64,
        device='cuda',
        early_stopping_patience=5,
        max_batches_per_epoch_train=500,
        max_batches_per_epoch_val=100,
        lr_scheduler_ctor=lambda optim: torch.optim.lr_scheduler.
        ReduceLROnPlateau(optim, patience=2, factor=0.5, verbose=True))

    torch.save(best_sentence_level_model, './models/cnn_pos')

    # Tag names reported below; they pair positionally with the `labels`
    # ids passed to classification_report.
    # NOTE(review): the ids are hard-coded and assume a specific label2id
    # ordering; verify them against the printed label2id.
    UNIQUE_TAGS1 = [
        'ADJ', 'ADV', 'AUX', 'CONJ', 'INTJ', 'NOUN', 'NUM', 'PART', 'PR',
        'PRON', 'VERB'
    ]
    from sklearn.metrics import confusion_matrix

    # NOTE(review): predictions use `sentence_level_model`, not the
    # `best_sentence_level_model` that was saved; confirm this is intended.
    train_pred = predict_with_model(sentence_level_model, train_dataset)
    train_loss = F.cross_entropy(torch.tensor(train_pred),
                                 torch.tensor(train_labels))
    print('Среднее значение функции потерь на обучении', float(train_loss))
    print(
        classification_report(train_labels.view(-1),
                              train_pred.argmax(1).reshape(-1),
                              labels=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 13],
                              target_names=UNIQUE_TAGS1))
    print()

    test_pred = predict_with_model(sentence_level_model, test_dataset)
    test_loss = F.cross_entropy(torch.tensor(test_pred),
                                torch.tensor(test_labels))
    print('Среднее значение функции потерь на валидации', float(test_loss))
    print(
        classification_report(test_labels.view(-1),
                              test_pred.argmax(1).reshape(-1),
                              labels=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 13],
                              target_names=UNIQUE_TAGS1))
path_loss = os.path.join(PLOTS_FOLDER, "lstm_plot_loss_" + TIMESTAMP) plt.savefig(path_loss, bbox_inches='tight') if __name__ == "__main__": # Create folders for any generated files. os.makedirs(GENTEXT_FOLDER, exist_ok=True) os.makedirs(PLOTS_FOLDER, exist_ok=True) os.makedirs(MODELS_FOLDER, exist_ok=True) # PREPROCESS THE DATA # pass in the text file name as the first argument # e.g. `$ python lstm_rnn.py sample1.txt` input_filenames = sys.argv[1:] words, indices, count, word_indices, indices_word = build_vocabulary(input_filenames) # SEQUENCE THE TEXT sequences = [] next_words = [] for i in range(0, len(words) - SEQUENCE_LEN, STEP): sequences.append(words[i:i+SEQUENCE_LEN]) next_words.append(words[i+SEQUENCE_LEN]) # SPLIT DATA INTO TRAIN AND TEST DATA (sequences_train, next_words_train), (sequences_test, next_words_test) = shuffle_and_split_training_set(sequences, next_words) # BUILD AND COMPILE THE MODEL model = get_model() model.compile(loss='sparse_categorical_crossentropy', optimizer="adam", metrics=['accuracy'])