def build_cooccur(self, vocab, corpus, window=10):
    helper._print_subheader("Building cooccurrence matrix")
    vocab_size = len(vocab)
    cooccurrences = np.zeros((vocab_size, vocab_size), dtype=np.float64)
    helper._print('Enumerating through the corpus...')
    for i, sent in enumerate(corpus):
        if i % 10000 == 0 and i != 0:
            helper._print(f"{i}/{len(corpus)} sentences processed")
        if i == 500000:
            break
        token_ids = [vocab[word] for word in sent if word in vocab]

        for center_i, center_id in enumerate(token_ids):
            # Collect all word IDs in left window of center word
            context_ids = token_ids[max(0, center_i - window):center_i]
            contexts_len = len(context_ids)

            for left_i, left_id in enumerate(context_ids):
                # Distance from center word
                distance = contexts_len - left_i
                # Weight by inverse of distance between words
                increment = 1.0 / float(distance)
                # Build co-occurrence matrix symmetrically (pretend we
                # are calculating right contexts as well)
                cooccurrences[center_id, left_id] += increment
                cooccurrences[left_id, center_id] += increment
    return cooccurrences
def build_trained_embeddings(self):
    helper._print_header('Getting word2vec trained on Enron corpus...')
    if not os.path.isdir(directories.WORD2VEC_DIR):
        os.makedirs(directories.WORD2VEC_DIR)
    # Materialise the sentence generator so it can be iterated more than once and len() works
    sentences = list(self.get_enron_sentences())
    model_logger = Word2VecLogger()
    path = directories.WORD2VEC_DIR + 'trained_word2vec.model'
    if os.path.isfile(path):
        helper._print('Loading previously trained model...')
        word2vec_model = KeyedVectors.load(path)
    else:
        helper._print_subheader('Building model...')
        word2vec_model = gensim.models.Word2Vec(
            sentences,
            size=FLAGS.word_embedding_size,
            sg=1,  # Use Skip-Gram (0 for CBOW)
            hs=0,  # Use Negative sampling. (1 for Hierarchical Softmax)
            window=FLAGS.word2vec_window,
            min_count=FLAGS.word2vec_min_count,
            workers=10,
            iter=1
        )
        word2vec_model.train(sentences, total_examples=len(sentences),
                             epochs=FLAGS.word2vec_epochs, callbacks=[model_logger])
        helper._print(f'Saving model to {path}')
        word2vec_model.save(path)
    vocab = self.build_vocab(sentences)
    return self.word2vec_index_keyed_vector(keyed_vector=word2vec_model.wv, vocab=vocab)
def write_and_reset(self, data_set, _print=False):
    avg_loss = self.loss[data_set] / self.rounds[data_set]
    avg_acc = self.acc[data_set] / self.rounds[data_set]
    self.rounds[data_set] = 0
    self.acc[data_set] = 0
    self.loss[data_set] = 0
    self.history[data_set].append((self.speed["epoch"], avg_acc, avg_loss))
    if math.isnan(avg_loss):
        return False
    self.write_to_summary(data_set, avg_acc, avg_loss, self.speed["epoch"])
    if avg_acc >= self.best_acc[data_set]:
        self.best_acc[data_set] = avg_acc
        self._new_best_acc[data_set] = True
        helper.save_dict(self.best_acc, placement=directories.BEST_ACC_FILE(self.model_name))
    else:
        self._new_best_acc[data_set] = False
    if avg_loss <= self.best_loss[data_set]:
        self.best_loss[data_set] = avg_loss
        self._new_best_loss[data_set] = True
        helper.save_dict(self.best_loss, placement=directories.BEST_LOSS_FILE(self.model_name))
    else:
        self._new_best_loss[data_set] = False
    if _print:
        helper._print(data_set.capitalize(), "-", "acc:", avg_acc, "loss:", avg_loss)
    return True
def word2vec_index_keyed_vector(self, keyed_vector, vocab):
    helper._print_subheader('Creating index files!')
    vocab_keys = keyed_vector.vocab.keys()
    ZERO_TOKEN = 0
    word2idx = {'ZERO': ZERO_TOKEN}
    idx2word = ['ZERO']
    weights = [np.zeros(self.dimensions)]
    pbar = tqdm(
        bar_format='Indexing keyed_vector |{bar}| Elapsed: {elapsed} | ({n_fmt}/{total_fmt})',
        total=len(vocab_keys))
    i = 0
    for word in vocab_keys:
        if word in vocab.keys():
            i += 1
            word2idx[word] = i
            idx2word.append(word)
            weights.append(keyed_vector[word])
        pbar.update(1)
    pbar.close()
    print()
    UNKNOWN_TOKEN = len(weights)
    word2idx['UNK'] = UNKNOWN_TOKEN
    np.random.seed(240993)
    weights.append(np.random.randn(self.dimensions))
    helper._print('Index files ready!')
    # self.get_TSNE_plot(weights, [key for key in word2idx.keys()])
    return np.array(weights, dtype=np.float32), word2idx, idx2word
def glove_generate_indexes(self):
    helper._print_subheader('Generating indexes for embeddings')
    ZERO_TOKEN = 0
    word2idx = {'ZERO': ZERO_TOKEN}
    idx2word = {ZERO_TOKEN: 'ZERO'}
    weights = [np.zeros(self.dimensions)]

    with open(self.word_embed_file_path, 'r', encoding="utf8") as file:
        for index, line in enumerate(file):
            values = line.split()  # Word and weights separated by space
            word = values[0]  # Word is first symbol on each line
            word_weights = np.asarray(values[1:], dtype=np.float32)  # Remainder of line is weights for word
            word2idx[word] = index + 1  # ZERO is our zeroth index so shift by one
            idx2word[index + 1] = word
            weights.append(word_weights)
            if index % FLAGS.word_embed_subset_size == 0 and index != 0:
                helper._print(f'{index} words indexed')
                if FLAGS.word_embed_subset:
                    break
    UNKNOWN_TOKEN = len(weights)
    word2idx['UNK'] = UNKNOWN_TOKEN
    np.random.seed(240993)
    weights.append(np.random.randn(self.dimensions))
    helper._print_subheader('Indexes done!')
    return np.array(weights, dtype=np.float32), word2idx, idx2word
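# Illustration only (not part of the repo): glove_generate_indexes above assumes the
# standard GloVe text format, one token per line followed by its vector components,
# separated by spaces. The two fake 3-dimensional lines below show how each line is
# split into a word and its weight vector.
import numpy as np

fake_glove_lines = ['the 0.1 0.2 0.3', 'email 0.4 0.5 0.6']
for line in fake_glove_lines:
    values = line.split()
    word = values[0]                                          # token comes first
    word_weights = np.asarray(values[1:], dtype=np.float32)   # rest of the line is the vector
    print(word, word_weights)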
def converging_tick(self):
    if self.best_acc[self.VAL] - self.speed["converging_acc"] > FLAGS.acc_min_delta_conv:
        self.speed["converging_acc"] = self.best_acc[self.VAL]
        self.speed["converging_count"] = 0
    else:
        self.speed["converging_count"] += 1
    helper._print(
        f"Converging in {self.speed['converging_count']}/{FLAGS.conv_cond} epochs. "
        f"Prev best val acc: {self.speed['converging_acc']}")
def cluster(self, inputs):
    t = time()
    helper._print('Training clusters (KMeans)...')
    kmeans = KM(n_clusters=self.num_clusters, init=self.cluster_init, max_iter=1000, tol=0.000001)
    cluster_pred = kmeans.fit_predict(inputs)
    helper._print(f'Done training clusters. Finished in {int((time() - t)/60)} minutes '
                  f'and {int((time() - t) % 60)} seconds!')
    return cluster_pred
def cluster(self, inputs):
    t = time()
    helper._print('Training clusters (Agglomerative clustering)...')
    agglo = AgglomerativeClustering(n_clusters=self.num_clusters)
    cluster_pred = agglo.fit_predict(inputs)
    helper._print(f'Done training clusters. Finished in {int((time() - t)/60)} minutes '
                  f'and {int((time() - t) % 60)} seconds!')
    return cluster_pred
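# Usage sketch (not from the original repo): both cluster() variants above expose the
# same fit_predict-style call on a 2-D array of sentence representations. The array
# shape and cluster count below are made-up assumptions for illustration.
import numpy as np
from sklearn.cluster import KMeans, AgglomerativeClustering

representations = np.random.rand(200, 50)   # 200 sentence vectors of size 50
num_clusters = 5

kmeans_pred = KMeans(n_clusters=num_clusters, max_iter=1000, tol=1e-6).fit_predict(representations)
agglo_pred = AgglomerativeClustering(n_clusters=num_clusters).fit_predict(representations)

# Each call returns one cluster id per input row
print(np.bincount(kmeans_pred), np.bincount(agglo_pred))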
def handle_val_test(self, history, sess, test_writer, total_step, validation_writer):
    val_acc, val_loss, val_time = self.compute_acc_loss(self.data.val_trees, sess, validation_writer, total_step)
    helper._print("Validation - acc:", val_acc, "loss:", val_loss, "time:", val_time)
    history["val"].append((total_step, val_acc, val_loss))

    test_acc, test_loss, test_time = self.compute_acc_loss(self.data.test_trees, sess, test_writer, total_step,
                                                           data_set="test")
    helper._print("Test - acc:", test_acc, "loss:", test_loss, "time:", test_time)
    history["test"].append((total_step, test_acc, test_loss))
    return val_acc
def write_history_to_summary(self, history, train_writer, validation_writer, test_writer):
    helper._print("Restoring summary...")

    def write_history(point_list, writer):
        for point in point_list:
            steps, acc, loss = point
            self.write_to_summary(acc, loss, steps, writer)

    write_history(history["train"], train_writer)
    write_history(history["val"], validation_writer)
    write_history(history["test"], test_writer)
    helper._print("Summary restored!")
def make_tree_text_file(self):
    if not os.path.isfile(directories.ENRON_TRAIN_SENTENCES_TXT_PATH):
        helper._print(f'Creating .txt file for sentences in {directories.ENRON_TRAIN_SENTENCES_TXT_PATH}')
        if FLAGS.dataset == 'all':
            all_train_trees = self.train_trees
        else:
            all_train_trees = tree_util.parse_trees(dataset='all', type='train')
        tree_util.trees_to_textfile(list(all_train_trees), directories.ENRON_TRAIN_SENTENCES_TXT_PATH)
def make_needed_dir(self):
    helper._print("Constructing directories...")
    directory = FLAGS.logs_dir + FLAGS.model_name
    if os.path.exists(directory):
        shutil.rmtree(directory)
    os.mkdir(directory)
    os.mkdir(directory + 'train')
    os.mkdir(directory + 'validation')
    os.mkdir(directory + 'test')
    if not os.path.exists(FLAGS.histories_dir + FLAGS.model_name):
        os.mkdir(FLAGS.histories_dir + FLAGS.model_name)
    helper._print("Directories constructed!")
def construct_dir(self):
    model_name = self.model_name
    helper._print("Constructing directories...")
    if not os.path.exists(directories.TRAINED_MODELS_DIR):
        os.mkdir(directories.TRAINED_MODELS_DIR)
    if FLAGS.load_model:
        if not os.path.exists(directories.TMP_MODEL_DIR(model_name)):
            self.make_model_dirs(model_name)
    else:
        if os.path.exists(directories.MODEL_DIR(model_name)):
            shutil.rmtree(directories.MODEL_DIR(model_name))
        self.make_model_dirs(model_name)
    helper._print("Directories constructed!")
def get_enron_sentences(self):
    helper._print_subheader('Reading ' + FLAGS.enron_emails_txt_path + '...')
    if not os.path.isfile(FLAGS.enron_emails_txt_path):
        self.load_enron_txt_data()
    with open(FLAGS.enron_emails_txt_path, 'r', encoding='utf-8') as txt_file:
        for index, line in enumerate(txt_file):
            if index % 1000000 == 0 and index != 0:
                helper._print(f'{index} sentences read')
                break  # stop reading after the first 1,000,000 sentences
            preprocessed_line = simple_preprocess(line)
            if preprocessed_line:
                yield preprocessed_line
    helper._print(f'{index} sentences read')
    helper._print_subheader('Done reading Enron email data!')
def build_finetuned_embeddings(self):
    helper._print_header('Getting fine-tuned word2vec embeddings')
    path = directories.WORD2VEC_DIR + 'finetuned_word2vec.model'
    pretrained_path = directories.WORD2VEC_EMBEDDINGS_FILE_PATH
    # Materialise the sentence generator so it can be iterated more than once and len() works
    sentences = list(self.get_enron_sentences())
    if not os.path.isdir(directories.WORD2VEC_DIR):
        os.makedirs(directories.WORD2VEC_DIR)
    if os.path.isfile(path):
        helper._print_subheader('Loading previously fine-tuned model...')
        finetuned_model = gensim.models.Word2Vec()  # empty shell; only the vectors are needed below
        finetuned_model.wv = KeyedVectors.load(path)
    else:
        if self.dimensions != 300:
            helper._print('Only support word2vec with vectors of size 300')
            sys.exit()
        if not os.path.isfile(pretrained_path):
            helper._print(
                'Binary file not there. Download from: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM')
            sys.exit()
        helper._print_subheader('Unpacking ' + pretrained_path)
        model = KeyedVectors.load_word2vec_format(pretrained_path, binary=True)
        helper._print_subheader('Done unpacking!')
        finetuned_model = gensim.models.Word2Vec(
            size=FLAGS.word_embedding_size,
            sg=1,  # Use Skip-Gram (0 for CBOW)
            hs=0,  # Use Negative sampling. (1 for Hierarchical Softmax)
            window=FLAGS.word2vec_window,
            min_count=FLAGS.word2vec_min_count,
            workers=10,
            iter=1
        )
        helper._print_subheader('Building fine-tuned model vocab...')
        finetuned_model.build_vocab(sentences)
        helper._print_subheader('Updating with pretrained model vocab...')
        finetuned_model.build_vocab([list(model.vocab.keys())], update=True)
        helper._print_subheader('Intersection with pretrained vectors...')
        finetuned_model.intersect_word2vec_format(pretrained_path, binary=True, lockf=1.0)
        model_logger = Word2VecLogger()
        finetuned_model.train(sentences, total_examples=len(sentences),
                              epochs=FLAGS.word2vec_epochs, callbacks=[model_logger])
        helper._print_subheader('Saving model...')
        finetuned_model.save(path)
    vocab = self.build_vocab(sentences)
    return self.word2vec_index_keyed_vector(keyed_vector=finetuned_model.wv, vocab=vocab)
def get_enron_sentences(self): """ Generator for getting the enron data as individual sentences. """ helper._print_subheader('Reading ' + directories.ENRON_TRAIN_SENTENCES_TXT_PATH + '...') with open(directories.ENRON_TRAIN_SENTENCES_TXT_PATH, 'r', encoding='utf-8') as txt_file: for index, line in enumerate(txt_file): if index % 1000000 == 0 and index != 0: helper._print(f'{index} sentences read') break preproccesed_line = simple_preprocess(line) if preproccesed_line != []: yield preproccesed_line helper._print(f'{index} sentences read') helper._print_subheader('Done reading Enron email data!')
def load_enron_txt_data(self):
    helper._print_header("Loading Enron emails")
    try:
        if os.name == 'nt':
            # Using sys.maxsize throws an OverflowError on 64-bit Windows because the
            # internal representation of 'int'/'long' on Win64 is only 32 bits wide.
            # The limit should therefore not exceed (2**31) - 1 on that platform.
            csv.field_size_limit((2 ** 31) - 1)
        else:
            csv.field_size_limit(sys.maxsize)
    except OverflowError:
        # Skip setting the limit for now
        pass
    if not os.path.isfile(directories.ENRON_EMAILS_CSV_PATH):
        data = 'wcukierski/enron-email-dataset'
        helper._print_subheader('Downloading enron emails from Kaggle')
        helper.download_from_kaggle(data, directories.ENRON_DIR)
        helper._print_subheader('Download finished! Unzipping...')
        with zipfile.ZipFile(directories.ENRON_EMAILS_ZIP_PATH, 'r') as zip_file:
            zip_file.extractall(path=directories.ENRON_DIR)
    if not os.path.isfile(directories.ENRON_EMAILS_TXT_PATH):
        helper._print_subheader('Processing emails into .txt file!')
        with open(directories.ENRON_EMAILS_CSV_PATH, 'r', encoding='utf-8') as emails_csv:
            with open(directories.ENRON_EMAILS_TXT_PATH, 'w', encoding='utf-8') as text_file:
                email_reader = csv.reader(emails_csv, delimiter=",")
                for index, row in enumerate(email_reader):
                    if index == 0:
                        continue  # skip the CSV header row
                    sentences = nltk.sent_tokenize(self.format_email_body(row))
                    for sent in sentences:
                        if len(sent.split(' ')) > 2:
                            text_file.write(sent + '\n')
                    if index % 100000 == 0 and index != 0:
                        helper._print(f'{index} emails processed')
    helper._print_subheader('Enron email data loaded!')
def build_pretrained_embeddings(self):
    helper._print_header('Getting pretrained word2vec embeddings')
    path = directories.WORD2VEC_EMBEDDINGS_FILE_PATH
    sentences = self.get_enron_sentences()
    if not os.path.isdir(directories.WORD2VEC_DIR):
        os.makedirs(directories.WORD2VEC_DIR)
    if self.dimensions != 300:
        helper._print('Only support word2vec with vectors of size 300')
    if not os.path.isfile(path):
        helper._print(
            'Binary file not there. Download from: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM')
        sys.exit()
    else:
        helper._print_subheader('Unpacking ' + path)
        model = KeyedVectors.load_word2vec_format(path, binary=True)
        helper._print_subheader('Done unpacking!')
    vocab = self.build_vocab(sentences)
    return self.word2vec_index_keyed_vector(keyed_vector=model, vocab=vocab)
def word2vec_finetuned_embeddings(self):
    helper._print_header('Getting fine-tuned word2vec embeddings')
    if not os.path.isdir(FLAGS.word2vec_dir):
        os.makedirs(FLAGS.word2vec_dir)
    if os.path.isfile(FLAGS.word2vec_dir + 'finetuned_word2vec.model'):
        helper._print_subheader('Loading previously fine-tuned model...')
        finetuned_model = Word2Vec()  # empty shell; only the vectors are needed below
        finetuned_model.wv = KeyedVectors.load(FLAGS.word2vec_dir + 'finetuned_word2vec.model')
    else:
        if self.dimensions != 300:
            helper._print('Only support word2vec with vectors of size 300')
            sys.exit()
        binary_file_path = FLAGS.word2vec_dir + self.embedding_file + '.bin'
        if not os.path.isfile(binary_file_path):
            helper._print(
                'Binary file not there. Download from: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM')
            sys.exit()
        helper._print_subheader('Unpacking ' + binary_file_path)
        model = KeyedVectors.load_word2vec_format(binary_file_path, binary=True)
        helper._print_subheader('Done unpacking!')
        sentences = list(self.get_enron_sentences())
        finetuned_model = Word2Vec(size=300, min_count=3)
        helper._print_subheader('Building fine-tuned model vocab...')
        finetuned_model.build_vocab(sentences)
        helper._print_subheader('Updating with pretrained model vocab...')
        finetuned_model.build_vocab([list(model.vocab.keys())], update=True)
        helper._print_subheader('Intersection with pretrained vectors...')
        finetuned_model.intersect_word2vec_format(binary_file_path, binary=True, lockf=1.0)
        model_logger = Word2VecLogger()
        finetuned_model.train(sentences, total_examples=len(sentences),
                              epochs=FLAGS.word2vec_finetuned_mode_epochs, callbacks=[model_logger])
        helper._print_subheader('Saving model...')
        # Save the fine-tuned model under the same path that is checked for above
        finetuned_model.save(FLAGS.word2vec_dir + 'finetuned_word2vec.model')
    return self.word2vec_index_keyed_vector(finetuned_model.wv)
def get_enron_sentences(self, kaggle=True, all=True):
    if kaggle:
        path = directories.ENRON_EMAILS_TXT_PATH
        if not os.path.isfile(path):
            self.load_enron_txt_data()
    else:
        if all:
            path = directories.TREE_ALL_SENTENCES_TXT_PATH
        else:
            path = directories.TREE_SENTENCES_TXT_PATH
    helper._print_subheader('Reading ' + path + '...')
    with open(path, 'r', encoding='utf-8') as txt_file:
        for index, line in enumerate(txt_file):
            if index % 1000000 == 0 and index != 0:
                helper._print(f'{index} sentences read')
                break  # stop reading after the first 1,000,000 sentences
            preprocessed_line = simple_preprocess(line)
            if preprocessed_line:
                yield preprocessed_line
    helper._print(f'{index} sentences read')
    helper._print_subheader('Done reading Enron email data!')
def word2vec_pretrained_embeddings(self):
    helper._print_header('Getting pretrained word2vec embeddings')
    if not os.path.isdir(FLAGS.word2vec_dir):
        os.makedirs(FLAGS.word2vec_dir)
    self.word_embed_file_path = FLAGS.word2vec_dir + self.embedding_file + '.txt'
    if self.dimensions != 300:
        helper._print('Only support word2vec with vectors of size 300')
    if not os.path.isfile(self.word_embed_file_path):
        binary_file_path = FLAGS.word2vec_dir + self.embedding_file + '.bin'
        if not os.path.isfile(binary_file_path):
            helper._print(
                'Binary file not there. Download from: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM')
            sys.exit()
        else:
            helper._print_subheader('Unpacking ' + binary_file_path)
            model = KeyedVectors.load_word2vec_format(binary_file_path, binary=True)
            helper._print_subheader('Done unpacking!')
            return self.word2vec_index_keyed_vector(model)
def train_and_save_finetuned_embeddings(self):
    sentences = self.get_enron_sentences()
    vocab = self.build_vocab(sentences)
    if not os.path.isfile(directories.FINETUNED_GLOVE_EMBEDDING_FILE_PATH):
        cooccur = self.build_cooccur(vocab, sentences)
        pretrained_embeddings = self.glove2dict(directories.GLOVE_EMBEDDING_FILE_PATH)
        helper._print(
            f'{len([v for v in vocab.keys() if v in pretrained_embeddings.keys()])} words in common with the pretrained set')
        helper._print_subheader('Building model...')
        mittens_dir = directories.GLOVE_DIR + 'mittens/'
        if not os.path.isdir(mittens_dir):
            os.makedirs(mittens_dir)
        mittens_model = Mittens(
            n=self.dimensions,
            xmax=100,
            max_iter=10000,
            display_progress=10,
            learning_rate=0.05,
            alpha=0.75,
            tol=1e-4,
            log_dir=mittens_dir,
            mittens=0.1)
        helper._print_subheader('Training Mittens model...')
        finetuned_embeddings = mittens_model.fit(
            cooccur,
            vocab=vocab,
            initial_embedding_dict=pretrained_embeddings)
        print()
        helper._print_subheader('Done training finetuned embeddings! Merging with pre-trained embeddings...')
        # Note: this aliases pretrained_embeddings and overwrites its entries in place
        resulting_embeddings = pretrained_embeddings
        for word, weights in zip(vocab.keys(), finetuned_embeddings):
            resulting_embeddings[word] = weights
        self.dict2glove(resulting_embeddings, directories.FINETUNED_GLOVE_EMBEDDING_FILE_PATH)
        return vocab, cooccur, resulting_embeddings
    return vocab, None, None
def build_vocab(self, corpus, min_count=FLAGS.glove_min_count):
    """
    Credit to https://github.com/hans/glove.py/blob/master/glove.py

    Builds a `word -> id` vocabulary from the corpus, keeping only words
    that occur at least `min_count` times.
    """
    helper._print_subheader('Building vocabulary from corpus')
    vocab = Counter()
    for i, doc in enumerate(corpus):
        if i % 100000 == 0 and i != 0:
            helper._print(f"{i}/{len(corpus)} sentences processed")
            break
        vocab.update(doc)
    helper._print_subheader('Done building vocabulary')
    i = 0
    word2index = {}
    for word, freq in vocab.items():
        if freq >= min_count:
            word2index[word] = i
            i += 1
    return word2index
def word2vec_index_keyed_vector(self, keyed_vector):
    helper._print_subheader('Creating index files!')
    vocab_keys = keyed_vector.vocab.keys()
    ZERO_TOKEN = 0
    word2idx = {'ZERO': ZERO_TOKEN}
    idx2word = {ZERO_TOKEN: 'ZERO'}
    weights = [np.zeros(self.dimensions)]
    for index, word in enumerate(vocab_keys):
        word2idx[word] = index + 1
        idx2word[index + 1] = word
        weights.append(keyed_vector[word])
        if index % FLAGS.word_embed_subset_size == 0 and index != 0:
            helper._print(f'{index} words indexed')
            if FLAGS.word_embed_subset:
                break
    UNKNOWN_TOKEN = len(weights)
    word2idx['UNK'] = UNKNOWN_TOKEN
    np.random.seed(240993)
    weights.append(np.random.randn(self.dimensions))
    helper._print_subheader('Index files ready!')
    return np.array(weights, dtype=np.float32), word2idx, idx2word
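# Usage sketch (not from the original repo): how the (weights, word2idx, idx2word)
# triple returned by word2vec_index_keyed_vector can be used to embed a tokenised
# sentence, falling back to the UNK row for out-of-vocabulary words. The toy
# vocabulary and 4-dimensional vectors below are made up for illustration.
import numpy as np

weights = np.vstack([np.zeros((1, 4)),        # row 0: ZERO padding token
                     np.random.randn(2, 4),   # rows 1-2: "enron", "email"
                     np.random.randn(1, 4)])  # last row: UNK
word2idx = {'ZERO': 0, 'enron': 1, 'email': 2, 'UNK': 3}

sentence = ['enron', 'email', 'lawsuit']
ids = [word2idx.get(word, word2idx['UNK']) for word in sentence]
embedded = weights[ids]                       # shape: (3, 4)
print(ids, embedded.shape)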
def build_vocab(self, corpus, min_count=FLAGS.word_min_count):
    helper._print_subheader('Building vocabulary from corpus')
    vocab = Counter()
    pbar = tqdm(
        bar_format='{percentage:.0f}%|{bar}| Elapsed: {elapsed}, Remaining: {remaining} ({n_fmt}/{total_fmt}) ',
        total=len(corpus))
    for i, doc in enumerate(corpus):
        if (i + 1) % 1000 == 0 and i != 0:
            pbar.update(1000)
        vocab.update(doc)
    pbar.update(len(corpus) % 1000)
    pbar.close()
    print()
    i = 0
    word2index = {}
    for word, freq in vocab.items():
        if freq >= min_count:
            word2index[word] = i
            i += 1
    helper._print(f'Done building vocabulary. Length: {len(word2index)}')
    return word2index
def build_cooccur(self, vocab, corpus, window=10): helper._print_subheader("Building cooccurrence matrix") vocab_size = len(vocab) cooccurrences = np.zeros((vocab_size, vocab_size), dtype=np.float64) pbar = tqdm( bar_format= '{percentage:.0f}%|{bar}| Elapsed: {elapsed}, Remaining: {remaining} ({n_fmt}/{total_fmt}) ', total=len(corpus)) for i, sent in enumerate(corpus): if (i + 1) % 10000 == 0 and i != 0: pbar.update(10000) token_ids = [vocab[word] for word in sent if word in vocab.keys()] for center_i, center_id in enumerate(token_ids): # Collect all word IDs in left window of center word context_ids = token_ids[max(0, center_i - window):center_i] contexts_len = len(context_ids) for left_i, left_id in enumerate(context_ids): # Distance from center word distance = contexts_len - left_i # Weight by inverse of distance between words increment = 1.0 / float(distance) # Build co-occurrence matrix symmetrically (pretend we # are calculating right contexts as well) cooccurrences[center_id, left_id] += increment cooccurrences[left_id, center_id] += increment pbar.update(len(corpus) % 10000) pbar.close() print() helper._print( f'Done building cooccurrence matrix. Shape: {np.shape(cooccurrences)}' ) return cooccurrences
def train(self, train_data):
    helper._print("Learning rate:", self.sess.run(self.model.lr))
    done = False
    run_time = 0
    while not done:
        batches = helper.batches(train_data, self.batch_size, perm=True)
        pbar = tqdm(
            bar_format="(Training) {percentage:.0f}%|{bar}| Elapsed: {elapsed}, Remaining: {remaining} ({n_fmt}/{total_fmt})",
            total=len(batches))
        for step, batch in enumerate(batches):
            self.summary.batch_inc()
            feed_dict, _ = self.model.build_feed_dict(batch, train=True)
            start_run_time = time.time()
            _, acc, loss = self.sess.run(
                [self.model.train_op, self.model.acc, self.model.loss],
                feed_dict=feed_dict)
            self.summary.add(self.summary.TRAIN, acc, loss)
            end_run_time = time.time()
            run_time += end_run_time - start_run_time
            pbar.update(1)
        pbar.close()
        print()

        # Loading and saving tmp model - just in case something goes wrong
        if not self.summary.write_and_reset(self.summary.TRAIN, _print=True):
            helper._print("Nan loss encountered, trying again...")
            self.model.load_tmp(self.sess, self.saver)
        else:
            done = True
            self.model.save_tmp(self.sess, self.saver)

    helper._print("Training time:",
                  str(int(run_time / 60)) + "m " + str(int(run_time % 60)) + "s")
    return run_time
if FLAGS.word_embed_model == constants.WORD2VEC:
    word_embeddings = Word2Vec(mode=FLAGS.word_embed_mode, dimensions=FLAGS.word_embedding_size)
else:  # FLAGS.word_embed_model == constants.GLOVE
    word_embeddings = GloVe(mode=FLAGS.word_embed_mode, dimensions=FLAGS.word_embedding_size)

model_placement = directories.TRAINED_MODELS_DIR + FLAGS.model_name + "model.ckpt"

if FLAGS.model == constants.DEEP_RNN:
    model = deepRNN(data, word_embeddings, model_name)
elif FLAGS.model == constants.BATCH_TREE_RNN:
    model = treeRNN_batch(data, word_embeddings, model_name)
elif FLAGS.model == constants.NEERBEK_TREE_RNN:
    model = treeRNN_neerbek(data, word_embeddings, model_name)
elif FLAGS.model == constants.TREE_LSTM:
    model = treeLSTM(data, word_embeddings, model_name)
elif FLAGS.model == constants.TRACKER_TREE_RNN:
    model = treeRNN_tracker(data, word_embeddings, model_name)
elif FLAGS.model == constants.TRACKER_TREE_LSTM:
    model = treeLSTM_tracker(data, word_embeddings, model_name)
elif FLAGS.model == constants.LSTM:
    model = LSTM(data, word_embeddings, model_name)

with tf.Session() as sess:
    saver = tf.train.Saver()
    model.load(sess, saver)
    helper._print("Acc:", model.accuracy(data.test_trees, sess))
    p = Performance(data.test_trees, model, sess)
    p.plot_ROC()
def on_epoch_begin(self, model):
    helper._print(f"Epoch {self.epoch} / {model.iter}")
    self.epoch += 1
def select_data(self, data, cut_off, cluster_predictions=None):
    roots_size = [tree_util.size_of_tree(root) for root in data]
    data = np.array(helper.sort_by(data, roots_size))
    t = time()
    if cluster_predictions is None:
        # Get representations
        representations, predictions, labels, permutations = [], [], [], []
        batch_size = 500
        batches = helper.batches(data, batch_size, perm=False)
        pbar = tqdm(
            bar_format='{percentage:.0f}%|{bar}| Elapsed: {elapsed}, Remaining: {remaining} (batches: {n_fmt}/{total_fmt}) ',
            total=len(batches))
        for i, batch in enumerate(batches):
            feed_dict, permuts = self.model.build_feed_dict(batch, sort=True)
            reps, labs = self.session.run(
                [self.model.sentence_representations, self.model.labels],
                feed_dict=feed_dict)
            representations.extend(reps)
            labels.extend(labs)
            permutations.extend(list(i * batch_size + np.array(permuts)))
            pbar.update(1)
        pbar.close()
        print()
        self.representations = np.array(representations)[permutations]
        self.labels = np.array(performance.get_prediction(np.array(labels)))[permutations]

        # Get clusters
        try_cluster = True
        tries = 10
        while try_cluster:
            tries -= 1
            self.cluster_predictions = self.cluster_model.cluster(self.representations)
            if np.bincount(self.cluster_predictions).max() <= 0.8 * len(self.representations) or tries >= 0:
                try_cluster = False
    else:
        self.cluster_predictions = cluster_predictions
        self.labels = tree_util.get_labels(data)

    # Get acc of clusters
    cluster_mfo = []
    cluster_mfo_labels = []
    for i in range(self.num_clusters):
        mfo, l = self.mfo(i)
        cluster_mfo.append((i, mfo))
        cluster_mfo_labels.append((i, l))

    # Return data
    cluster_mfo.sort(key=lambda el: el[1], reverse=True)
    helper._print('Cluster MFO scores:')
    for (k, mfo), (_, l) in zip(cluster_mfo, cluster_mfo_labels):
        helper._print(
            f'\tCluster {k}: {mfo}, highest label: {l}, size: {len(self.labels[self.cluster_predictions == k])}/{len(data)}')

    removed_percent = 0
    data_to_use = []
    for cluster, acc in cluster_mfo:
        new_percent = removed_percent + len(data[self.cluster_predictions == cluster]) / len(data)
        removed_percent = new_percent
        if acc < cut_off:
            data_to_use.extend(data[self.cluster_predictions == cluster])

    helper._print(
        f'Done selecting data for training. Overall time used for selection is {int((time() - t)/60)} minutes and {int((time() - t) % 60)} seconds')
    return data_to_use, self.cluster_predictions