Example 1
    def build_cooccur(self, vocab, corpus, window=10):
        helper._print_subheader("Building cooccurrence matrix")

        vocab_size = len(vocab)
        idx2word = {i: word for word, i in vocab.items()}

        cooccurrences = np.zeros((vocab_size, vocab_size), dtype=np.float64)
        helper._print('Enumerating through the corpus...')
        for i, sent in enumerate(corpus):
            if i % 10000 == 0 and i != 0:
                helper._print(f"{i}/{len(corpus)} sentences processed")
                if i == 500000:
                    break
            token_ids = [vocab[word] for word in sent if word in vocab.keys()]

            for center_i, center_id in enumerate(token_ids):
                # Collect all word IDs in left window of center word
                context_ids = token_ids[max(0, center_i - window):center_i]
                contexts_len = len(context_ids)

                for left_i, left_id in enumerate(context_ids):
                    # Distance from center word
                    distance = contexts_len - left_i

                    # Weight by inverse of distance between words
                    increment = 1.0 / float(distance)

                    # Build co-occurrence matrix symmetrically (pretend we
                    # are calculating right contexts as well)
                    cooccurrences[center_id, left_id] += increment
                    cooccurrences[left_id, center_id] += increment
        return cooccurrences
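For intuition, here is a minimal standalone sketch (toy vocabulary and sentence, not part of the pipeline) of the weighting used above: each word in the left window contributes 1/distance to the pair, and the matrix is updated symmetrically.

    # Toy illustration of the inverse-distance weighting (hypothetical data).
    import numpy as np

    vocab = {'the': 0, 'cat': 1, 'sat': 2}
    sentence = ['the', 'cat', 'sat']
    cooc = np.zeros((len(vocab), len(vocab)))
    token_ids = [vocab[w] for w in sentence]
    for center_i, center_id in enumerate(token_ids):
        # window larger than the sentence, so the whole left context is used
        for left_i, left_id in enumerate(token_ids[:center_i]):
            increment = 1.0 / (center_i - left_i)  # 1 / distance to the center word
            cooc[center_id, left_id] += increment
            cooc[left_id, center_id] += increment
    # cooc[0, 1] == 1.0  ('the'/'cat', distance 1)
    # cooc[0, 2] == 0.5  ('the'/'sat', distance 2)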
Example 2
 def build_trained_embeddings(self):
     helper._print_header('Getting word2vec trained on Enron corpus...')
     if not os.path.isdir(directories.WORD2VEC_DIR):
         os.makedirs(directories.WORD2VEC_DIR)
     sentences = self.get_enron_sentences()
     model_logger = Word2VecLogger()
     path = directories.WORD2VEC_DIR + 'trained_word2vec.model'
     if os.path.isfile(path):
         helper._print('Loading previously trained model...')
         word2vec_model = gensim.models.Word2Vec.load(path)
     else:
         helper._print_subheader('Building model...')
         word2vec_model = gensim.models.Word2Vec(
             sentences,
             size=FLAGS.word_embedding_size,
             sg=1,  # Use Skip-Gram (0 for CBOW)
             hs=0,  # Use Negative sampling. (1 for Hierarchical Softmax)
             window=FLAGS.word2vec_window,
             min_count=FLAGS.word2vec_min_count,
             workers=10,
             iter=1
         )
         word2vec_model.train(sentences, total_examples=word2vec_model.corpus_count,
                              epochs=FLAGS.word2vec_epochs, callbacks=[model_logger])
         # word2vec_model.train(sentences, total_examples=len(sentences), epochs=FLAGS.word2vec_epochs)
         helper._print(f'Saving model to {path}')
         word2vec_model.save(path)
     vocab = self.build_vocab(sentences)
     return self.word2vec_index_keyed_vector(keyed_vector=word2vec_model.wv, vocab=vocab)
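Note that get_enron_sentences is a generator, while gensim iterates the corpus more than once (a vocabulary pass plus one pass per epoch), so a plain generator is exhausted after the first pass. A minimal sketch of a restartable wrapper; the class name is hypothetical and not part of the repository:

    # Hypothetical restartable corpus wrapper (not in the repository).
    # gensim can iterate this object repeatedly because __iter__ starts a
    # fresh generator on every pass.
    class RestartableSentences:
        def __init__(self, embeddings):
            self.embeddings = embeddings

        def __iter__(self):
            return self.embeddings.get_enron_sentences()

    # usage sketch: sentences = RestartableSentences(self)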
Example 3
    def write_and_reset(self, data_set, _print=False):
        avg_loss = self.loss[data_set] / self.rounds[data_set]
        avg_acc = self.acc[data_set] / self.rounds[data_set]
        self.rounds[data_set] = 0
        self.acc[data_set] = 0
        self.loss[data_set] = 0
        self.history[data_set].append((self.speed["epoch"],
                                       avg_acc,
                                       avg_loss))
        if math.isnan(avg_loss):
            return False
        self.write_to_summary(data_set, avg_acc, avg_loss, self.speed["epoch"])

        if avg_acc >= self.best_acc[data_set]:
            self.best_acc[data_set] = avg_acc
            self._new_best_acc[data_set] = True
            helper.save_dict(self.best_acc, placement=directories.BEST_ACC_FILE(self.model_name))
        else:
            self._new_best_acc[data_set] = False

        if avg_loss <= self.best_loss[data_set]:
            self.best_loss[data_set] = avg_loss
            self._new_best_loss[data_set] = True
            helper.save_dict(self.best_loss, placement=directories.BEST_LOSS_FILE(self.model_name))
        else:
            self._new_best_loss[data_set] = False

        if _print:
            helper._print(data_set.capitalize(), "-", "acc:", avg_acc, "loss:", avg_loss)
        return True
Example 4
    def word2vec_index_keyed_vector(self, keyed_vector, vocab):
        helper._print_subheader('Creating index files!')
        vocab_keys = keyed_vector.vocab.keys()
        ZERO_TOKEN = 0
        word2idx = {'ZERO': ZERO_TOKEN}
        idx2word = ['ZERO']
        weights = [np.zeros(self.dimensions)]
        pbar = tqdm(
            bar_format='Indexing keyed_vector |{bar}| Elapsed: {elapsed} | ({n_fmt}/{total_fmt})', total=len(vocab_keys))
        i = 0
        for word in vocab_keys:
            if word in vocab.keys():
                i += 1
                word2idx[word] = i
                idx2word.append(word)
                weights.append(keyed_vector[word])
            pbar.update(1)

        pbar.close()
        print()

        UNKNOWN_TOKEN = len(weights)
        word2idx['UNK'] = UNKNOWN_TOKEN
        idx2word.append('UNK')
        np.random.seed(240993)
        weights.append(np.random.randn(self.dimensions))

        helper._print('Index files ready!')

        # self.get_TSNE_plot(weights, [key for key in word2idx.keys()])
        return np.array(weights, dtype=np.float32), word2idx, idx2word
Example 5
    def glove_generate_indexes(self):
        helper._print_subheader('Generating indexes for embeddings')
        ZERO_TOKEN = 0
        word2idx = {'ZERO': ZERO_TOKEN}
        idx2word = {ZERO_TOKEN: 'ZERO'}
        weights = [np.zeros(self.dimensions)]

        with open(self.word_embed_file_path, 'r', encoding="utf8") as file:
            for index, line in enumerate(file):
                values = line.split()  # Word and weights separated by space
                word = values[0]  # Word is first symbol on each line
                word_weights = np.asarray(
                    values[1:],
                    dtype=np.float32)  # Remainder of line is weights for word
                word2idx[word] = index + 1  # ZERO is our zeroth index, so shift by one
                idx2word[index + 1] = word
                weights.append(word_weights)
                if index % FLAGS.word_embed_subset_size == 0 and index != 0:
                    helper._print(f'{index} words indexed')
                    if FLAGS.word_embed_subset:
                        break
            UNKNOWN_TOKEN = len(weights)
            word2idx['UNK'] = UNKNOWN_TOKEN
            idx2word[UNKNOWN_TOKEN] = 'UNK'
            np.random.seed(240993)
            weights.append(np.random.randn(self.dimensions))

            helper._print_subheader('Indexes done!')
        return np.array(weights, dtype=np.float32), word2idx, idx2word
Example 6
 def converging_tick(self):
     if self.best_acc[self.VAL] - self.speed["converging_acc"] > FLAGS.acc_min_delta_conv:
         self.speed["converging_acc"] = self.best_acc[self.VAL]
         self.speed["converging_count"] = 0
     else:
         self.speed["converging_count"] += 1
     helper._print(
         f"Converging in {self.speed['converging_count']}/{FLAGS.conv_cond} epochs. Prev best val acc: {self.speed['converging_acc']}")
Example 7
    def cluster(self, inputs):
        t = time()
        helper._print('Training clusters (KMeans)...')
        kmeans = KM(n_clusters=self.num_clusters, init=self.cluster_init, max_iter=1000, tol=0.000001)
        cluster_pred = kmeans.fit_predict(inputs)
        helper._print(f'Done training clusters. Finished in {int((time() - t)/60)} minutes and {int((time() - t) % 60)} seconds!')

        return cluster_pred
Example 8
    def cluster(self, inputs):
        t = time()
        helper._print('Training clusters (Agglomerative clustering)...')
        agglo = AgglomerativeClustering(n_clusters=self.num_clusters)
        cluster_pred = agglo.fit_predict(inputs)
        helper._print(f'Done training clusters. Finished in {int((time() - t)/60)} minutes and {int((time() - t) % 60)} seconds!')

        return cluster_pred
Example 9
 def handle_val_test(self, history, sess, test_writer, total_step, validation_writer):
     val_acc, val_loss, val_time = self.compute_acc_loss(self.data.val_trees, sess,
                                                         validation_writer,
                                                         total_step)
     helper._print("Validation -  acc:", val_acc, "loss:", val_loss, "time:", val_time)
     history["val"].append((total_step, val_acc, val_loss))
     test_acc, test_loss, test_time = self.compute_acc_loss(self.data.test_trees, sess,
                                                            test_writer,
                                                            total_step, data_set="test")
     helper._print("Test -  acc:", test_acc, "loss:", test_loss, "time:", test_time)
     history["test"].append((total_step, test_acc, test_loss))
     return val_acc
Example 10
    def write_history_to_summary(self, history, train_writer, validation_writer, test_writer):
        helper._print("Restoring summary...")

        def write_history(point_list, writer):
            for point in point_list:
                steps, acc, loss = point
                self.write_to_summary(acc, loss, steps, writer)

        write_history(history["train"], train_writer)
        write_history(history["val"], validation_writer)
        write_history(history["test"], test_writer)

        helper._print("Summary restored!")
Example 11
 def make_tree_text_file(self):
     if not os.path.isfile(directories.ENRON_TRAIN_SENTENCES_TXT_PATH):
         helper._print(
             f'Create .txt file for sentences in {directories.ENRON_TRAIN_SENTENCES_TXT_PATH}'
         )
         if FLAGS.dataset == 'all':
             all_train_trees = self.train_trees
         else:
             all_train_trees = tree_util.parse_trees(dataset='all',
                                                     type='train')
         tree_util.trees_to_textfile(
             list(all_train_trees),
             directories.ENRON_TRAIN_SENTENCES_TXT_PATH)
Example 12
    def make_needed_dir(self):
        helper._print("Constructing directories...")

        directory = FLAGS.logs_dir + FLAGS.model_name
        if os.path.exists(directory):
            shutil.rmtree(directory)
        os.mkdir(directory)
        os.mkdir(directory + 'train')
        os.mkdir(directory + 'validation')
        os.mkdir(directory + 'test')
        if not os.path.exists(FLAGS.histories_dir + FLAGS.model_name):
            os.mkdir(FLAGS.histories_dir + FLAGS.model_name)

        helper._print("Directories constructed!")
Example 13
    def construct_dir(self):
        model_name = self.model_name
        helper._print("Constructing directories...")

        if not os.path.exists(directories.TRAINED_MODELS_DIR):
            os.mkdir(directories.TRAINED_MODELS_DIR)

        if FLAGS.load_model:
            if not os.path.exists(directories.TMP_MODEL_DIR(model_name)):
                self.make_model_dirs(model_name)
        else:
            if os.path.exists(directories.MODEL_DIR(model_name)):
                shutil.rmtree(directories.MODEL_DIR(model_name))
            self.make_model_dirs(model_name)

        helper._print("Directories constructed!")
Example 14
 def get_enron_sentences(self):
     helper._print_subheader('Reading ' + FLAGS.enron_emails_txt_path +
                             '...')
     if not os.path.isfile(FLAGS.enron_emails_txt_path):
         self.load_enron_txt_data()
     with open(FLAGS.enron_emails_txt_path, 'r',
               encoding='utf-8') as txt_file:
         for index, line in enumerate(txt_file):
             if index % 1000000 == 0 and index != 0:
                 helper._print(f'{index} sentences read')
                 break
             preproccesed_line = simple_preprocess(line)
             if preproccesed_line != []:
                 yield preproccesed_line
     helper._print(f'{index} sentences read')
     helper._print_subheader('Done reading Enron email data!')
Example 15
 def build_finetuned_embeddings(self):
     helper._print_header('Getting fine-tuned word2vec embeddings')
     path = directories.WORD2VEC_DIR + 'finetuned_word2vec.model'
     pretrained_path = directories.WORD2VEC_EMBEDDINGS_FILE_PATH
     sentences = self.get_enron_sentences()
     if not os.path.isdir(directories.WORD2VEC_DIR):
         os.makedirs(directories.WORD2VEC_DIR)
     if os.path.isfile(path):
         helper._print_subheader('Loading previously fine-tuned model...')
         finetuned_model = gensim.models.Word2Vec.load(path)
     else:
         if not self.dimensions == 300:
             helper._print('Only support word2vec with vectors of size 300')
             sys.exit()
         if not os.path.isfile(pretrained_path):
             helper._print(
                 'Binary file not there. Download from: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM')
             sys.exit()
         helper._print_subheader('Unpacking ' + pretrained_path)
         model = KeyedVectors.load_word2vec_format(pretrained_path, binary=True)
         helper._print_subheader('Done unpacking!')
         finetuned_model = gensim.models.Word2Vec(
             size=FLAGS.word_embedding_size,
             sg=1,  # Use Skip-Gram (0 for CBOW)
             hs=0,  # Use Negative sampling. (1 for Hierarchical Softmax)
             window=FLAGS.word2vec_window,
             min_count=FLAGS.word2vec_min_count,
             workers=10,
             iter=1
         )
         helper._print_subheader('Building fine-tuned model vocab...')
         finetuned_model.build_vocab(sentences)
         enron_corpus_count = finetuned_model.corpus_count  # captured before the vocab update below overwrites it
         helper._print_subheader('Updating with pretrained model vocab...')
         finetuned_model.build_vocab([list(model.vocab.keys())], update=True)
         helper._print_subheader('Intersection with pretrained vectors...')
         finetuned_model.intersect_word2vec_format(pretrained_path, binary=True, lockf=1.0)
         model_logger = Word2VecLogger()
         finetuned_model.train(sentences, total_examples=enron_corpus_count, epochs=FLAGS.word2vec_epochs,
                               callbacks=[model_logger])
         helper._print_subheader('Saving model...')
         finetuned_model.save(path)
     vocab = self.build_vocab(sentences)
     return self.word2vec_index_keyed_vector(keyed_vector=finetuned_model.wv, vocab=vocab)
Example 16
 def get_enron_sentences(self):
     """
         Generator for getting the enron data as individual sentences.
     """
     helper._print_subheader('Reading ' +
                             directories.ENRON_TRAIN_SENTENCES_TXT_PATH +
                             '...')
     with open(directories.ENRON_TRAIN_SENTENCES_TXT_PATH,
               'r',
               encoding='utf-8') as txt_file:
         for index, line in enumerate(txt_file):
             if index % 1000000 == 0 and index != 0:
                 helper._print(f'{index} sentences read')
                 break
             preproccesed_line = simple_preprocess(line)
             if preproccesed_line != []:
                 yield preproccesed_line
     helper._print(f'{index} sentences read')
     helper._print_subheader('Done reading Enron email data!')
Example 17
    def load_enron_txt_data(self):
        helper._print_header("Loading Enron emails")
        try:
            if os.name == 'nt':
                """
                Using sys.maxsize throws an Overflow error on Windows 64-bit platforms since internal
                representation of 'int'/'long' on Win64 is only 32-bit wide. Ideally limit on Win64
                should not exceed ((2**31)-1) as long as internal representation uses 'int' and/or 'long'
                """
                csv.field_size_limit((2**31) - 1)
            else:
                csv.field_size_limit(sys.maxsize)
        except OverflowError as e:
            # skip setting the limit for now
            pass
        if not os.path.isfile(directories.ENRON_EMAILS_CSV_PATH):
            data = 'wcukierski/enron-email-dataset'
            helper._print_subheader(f'Downloading enron emails from Kaggle')
            helper.download_from_kaggle(data, directories.ENRON_DIR)
            helper._print_subheader('Download finished! Unzipping...')
            with zipfile.ZipFile(directories.ENRON_EMAILS_ZIP_PATH,
                                 'r') as zip_file:
                zip_file.extractall(path=directories.ENRON_DIR)
        if not os.path.isfile(directories.ENRON_EMAILS_TXT_PATH):
            helper._print_subheader('Processing emails into .txt file!')
            with open(directories.ENRON_EMAILS_CSV_PATH, 'r',
                      encoding='utf-8') as emails_csv:
                with open(directories.ENRON_EMAILS_TXT_PATH,
                          'w',
                          encoding='utf-8') as text_file:
                    email_reader = csv.reader(emails_csv, delimiter=",")
                    for index, row in enumerate(email_reader):
                        if index == 0:
                            continue
                        sentences = nltk.sent_tokenize(
                            self.format_email_body(row))
                        for sent in sentences:
                            if len(sent.split(' ')) > 2:
                                text_file.write(sent + '\n')
                        if index % 100000 == 0 and index != 0:
                            helper._print(f'{index} emails processed')

        helper._print_subheader('Enron email data loaded!')
Example 18
    def build_pretrained_embeddings(self):
        helper._print_header('Getting pretrained word2vec embeddings')
        path = directories.WORD2VEC_EMBEDDINGS_FILE_PATH
        sentences = self.get_enron_sentences()
        if not os.path.isdir(directories.WORD2VEC_DIR):
            os.makedirs(directories.WORD2VEC_DIR)
        if not self.dimensions == 300:
            helper._print('Only support word2vec with vectors of size 300')
            sys.exit()

        if not os.path.isfile(path):
            helper._print(
                'Binary file not there. Download from: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM')
            sys.exit()
        else:
            helper._print_subheader('Unpacking ' + path)
            model = KeyedVectors.load_word2vec_format(path, binary=True)
            helper._print_subheader('Done unpacking!')
            vocab = self.build_vocab(sentences)
            return self.word2vec_index_keyed_vector(keyed_vector=model, vocab=vocab)
Example 19
 def word2vec_finetuned_embeddings(self):
     helper._print_header('Getting fine-tuned word2vec embeddings')
     if not os.path.isdir(FLAGS.word2vec_dir):
         os.makedirs(FLAGS.word2vec_dir)
     if os.path.isfile(FLAGS.word2vec_dir + 'finetuned_word2vec.model'):
         helper._print_subheader('Loading previously fine-tuned model...')
         finetuned_model = Word2Vec.load(FLAGS.word2vec_dir +
                                         'finetuned_word2vec.model')
     else:
         if not self.dimensions == 300:
             helper._print('Only support word2vec with vectors of size 300')
             sys.exit()
         binary_file_path = FLAGS.word2vec_dir + self.embedding_file + '.bin'
         if not os.path.isfile(binary_file_path):
             helper._print(
                 'Binary file not there. Download from: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM'
             )
             sys.exit()
         helper._print_subheader('Unpacking ' + binary_file_path)
         model = KeyedVectors.load_word2vec_format(binary_file_path,
                                                   binary=True)
         helper._print_subheader('Done unpacking!')
         sentences = self.get_enron_sentences()
         finetuned_model = Word2Vec(size=300, min_count=3)
         helper._print_subheader('Building fine-tuned model vocab...')
         finetuned_model.build_vocab(sentences)
         enron_corpus_count = finetuned_model.corpus_count  # captured before the vocab update below overwrites it
         helper._print_subheader('Updating with pretrained model vocab...')
         finetuned_model.build_vocab([list(model.vocab.keys())],
                                     update=True)
         helper._print_subheader('Intersection with pretrained vectors...')
         finetuned_model.intersect_word2vec_format(binary_file_path,
                                                   binary=True,
                                                   lockf=1.0)
         model_logger = Word2VecLogger()
         finetuned_model.train(sentences,
                               total_examples=enron_corpus_count,
                               epochs=FLAGS.word2vec_finetuned_mode_epochs,
                               callbacks=[model_logger])
         helper._print_subheader('Saving model...')
         finetuned_model.save(FLAGS.word2vec_dir + 'finetuned_word2vec.model')
     return self.word2vec_index_keyed_vector(finetuned_model.wv)
Example 20
 def get_enron_sentences(self, kaggle=True, all=True):
     if kaggle:
         path = directories.ENRON_EMAILS_TXT_PATH
         if not os.path.isfile(path):
             self.load_enron_txt_data()
     else:
         if all:
             path = directories.TREE_ALL_SENTENCES_TXT_PATH
         else:
             path = directories.TREE_SENTENCES_TXT_PATH
     helper._print_subheader('Reading ' + path + '...')
     with open(path, 'r', encoding='utf-8') as txt_file:
         for index, line in enumerate(txt_file):
             if index % 1000000 == 0 and index != 0:
                 helper._print(f'{index} sentences read')
                 break
             preproccesed_line = simple_preprocess(line)
             if preproccesed_line != []:
                 yield preproccesed_line
     helper._print(f'{index} sentences read')
     helper._print_subheader('Done reading Enron email data!')
Example 21
    def word2vec_pretrained_embeddings(self):
        helper._print_header('Getting pretrained word2vec embeddings')
        if not os.path.isdir(FLAGS.word2vec_dir):
            os.makedirs(FLAGS.word2vec_dir)
        self.word_embed_file_path = FLAGS.word2vec_dir + self.embedding_file + '.txt'
        if not self.dimensions == 300:
            helper._print('Only support word2vec with vectors of size 300')
            sys.exit()

        if not os.path.isfile(self.word_embed_file_path):
            binary_file_path = FLAGS.word2vec_dir + self.embedding_file + '.bin'
            if not os.path.isfile(binary_file_path):
                helper._print(
                    'Binary file not there. Download from: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM'
                )
                sys.exit()
            else:
                helper._print_subheader('Unpacking ' + binary_file_path)
                model = KeyedVectors.load_word2vec_format(binary_file_path,
                                                          binary=True)
                helper._print_subheader('Done unpacking!')
                return self.word2vec_index_keyed_vector(model)
Example 22
 def train_and_save_finetuned_embeddings(self):
     sentences = self.get_enron_sentences()
     vocab = self.build_vocab(sentences)
     if not os.path.isfile(directories.FINETUNED_GLOVE_EMBEDDING_FILE_PATH):
         # idx2word = {i: word for word, i in word2idx.items()}
         cooccur = self.build_cooccur(vocab, sentences)
         pretrained_embeddings = self.glove2dict(
             directories.GLOVE_EMBEDDING_FILE_PATH)
         helper._print(
             f'{len([v for v in vocab.keys() if v in pretrained_embeddings.keys()])} words in common with the pretrained set'
         )
         helper._print_subheader('Building model...')
         mittens_dir = directories.GLOVE_DIR + 'mittens/'
         if not os.path.isdir(mittens_dir):
             os.makedirs(mittens_dir)
         mittens_model = Mittens(n=self.dimensions,
                                 xmax=100,
                                 max_iter=10000,
                                 display_progress=10,
                                 learning_rate=0.05,
                                 alpha=0.75,
                                 tol=1e-4,
                                 log_dir=mittens_dir,
                                 mittens=0.1)
         helper._print_subheader('Training Mittens model...')
         finetuned_embeddings = mittens_model.fit(
             cooccur,
             vocab=vocab,
             initial_embedding_dict=pretrained_embeddings)
         print()
         helper._print_subheader(
             'Done training finetuned embeddings! Merging with pre-trained embeddings...'
         )
         resulting_embeddings = pretrained_embeddings
         for word, weights in zip(vocab.keys(), finetuned_embeddings):
             resulting_embeddings[word] = weights
         self.dict2glove(resulting_embeddings,
                         directories.FINETUNED_GLOVE_EMBEDDING_FILE_PATH)
         return vocab, cooccur, resulting_embeddings
     return vocab, None, None
Example 23
    def build_vocab(self, corpus, min_count=FLAGS.glove_min_count):
        """
        Credit to https://github.com/hans/glove.py/blob/master/glove.py

        Returns a dictionary `w -> (i, f)`, mapping word strings to pairs of
        word ID and word corpus frequency.
        """
        helper._print_subheader('Building vocabulary from corpus')
        vocab = Counter()
        for i, doc in enumerate(corpus):
            if i % 100000 == 0 and i != 0:
                helper._print(f"{i}/{len(corpus)} sentences processed")
                break
            vocab.update(doc)
        helper._print_subheader('Done building vocabulary')
        i = 0
        word2index = {}
        for word, freq in vocab.items():
            if freq >= min_count:
                word2index[word] = i
                i += 1
        return word2index
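A toy run of the min_count filtering above, on a hypothetical three-sentence corpus:

    # Hypothetical mini-corpus: only words seen at least min_count times get an index.
    from collections import Counter

    corpus = [['price', 'cap'], ['price', 'swap'], ['price']]
    vocab = Counter()
    for doc in corpus:
        vocab.update(doc)

    min_count = 2
    word2index = {}
    for word, freq in vocab.items():
        if freq >= min_count:
            word2index[word] = len(word2index)
    # word2index == {'price': 0}; 'cap' and 'swap' occur only once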
Example 24
    def word2vec_index_keyed_vector(self, keyed_vector):
        helper._print_subheader('Creating index files!')
        vocab_keys = keyed_vector.vocab.keys()
        ZERO_TOKEN = 0
        word2idx = {'ZERO': ZERO_TOKEN}
        idx2word = {ZERO_TOKEN: 'ZERO'}
        weights = [np.zeros(self.dimensions)]
        for index, word in enumerate(vocab_keys):
            word2idx[word] = index + 1
            idx2word[index + 1] = word
            weights.append(keyed_vector[word])
            if index % FLAGS.word_embed_subset_size == 0 and index != 0:
                helper._print(f'{index} words indexed')
                if FLAGS.word_embed_subset:
                    break

        UNKNOWN_TOKEN = len(weights)
        word2idx['UNK'] = UNKNOWN_TOKEN
        idx2word[UNKNOWN_TOKEN] = 'UNK'
        np.random.seed(240993)
        weights.append(np.random.randn(self.dimensions))
        helper._print_subheader('Index files ready!')
        return np.array(weights, dtype=np.float32), word2idx, idx2word
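A short usage sketch of the three returned index structures. The values are toy stand-ins with the same layout as the return value: row 0 is the ZERO padding vector and the last row is the random UNK vector.

    import numpy as np

    # Toy stand-ins for (weights, word2idx, idx2word) as returned above.
    weights = np.array([[0.0, 0.0],    # ZERO padding vector
                        [0.1, 0.2],    # an in-vocabulary word
                        [0.3, 0.4]],   # UNK vector
                       dtype=np.float32)
    word2idx = {'ZERO': 0, 'prepay': 1, 'UNK': 2}
    idx2word = {0: 'ZERO', 1: 'prepay', 2: 'UNK'}

    tokens = ['prepay', 'swap']  # 'swap' is out of vocabulary
    ids = [word2idx.get(t, word2idx['UNK']) for t in tokens]
    embedded = weights[ids]      # shape (2, 2): one embedding row per token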
Example 25
    def build_vocab(self, corpus, min_count=FLAGS.word_min_count):
        helper._print_subheader('Building vocabulary from corpus')
        vocab = Counter()
        pbar = tqdm(
            bar_format=
            '{percentage:.0f}%|{bar}| Elapsed: {elapsed}, Remaining: {remaining} ({n_fmt}/{total_fmt}) ',
            total=len(corpus))
        for i, doc in enumerate(corpus):
            if (i + 1) % 1000 == 0 and i != 0:
                pbar.update(1000)
            vocab.update(doc)
        pbar.update(len(corpus) % 1000)
        pbar.close()
        print()
        i = 0
        word2index = {}
        for word, freq in vocab.items():
            if freq >= min_count:
                word2index[word] = i
                i += 1

        helper._print(f'Done building vocabulary. Length: {len(word2index)}')
        return word2index
Example 26
    def build_cooccur(self, vocab, corpus, window=10):
        helper._print_subheader("Building cooccurrence matrix")
        vocab_size = len(vocab)
        cooccurrences = np.zeros((vocab_size, vocab_size), dtype=np.float64)
        pbar = tqdm(
            bar_format=
            '{percentage:.0f}%|{bar}| Elapsed: {elapsed}, Remaining: {remaining} ({n_fmt}/{total_fmt}) ',
            total=len(corpus))
        for i, sent in enumerate(corpus):
            if (i + 1) % 10000 == 0 and i != 0:
                pbar.update(10000)
            token_ids = [vocab[word] for word in sent if word in vocab.keys()]

            for center_i, center_id in enumerate(token_ids):
                # Collect all word IDs in left window of center word
                context_ids = token_ids[max(0, center_i - window):center_i]
                contexts_len = len(context_ids)

                for left_i, left_id in enumerate(context_ids):
                    # Distance from center word
                    distance = contexts_len - left_i

                    # Weight by inverse of distance between words
                    increment = 1.0 / float(distance)

                    # Build co-occurrence matrix symmetrically (pretend we
                    # are calculating right contexts as well)
                    cooccurrences[center_id, left_id] += increment
                    cooccurrences[left_id, center_id] += increment

        pbar.update(len(corpus) % 10000)
        pbar.close()
        print()
        helper._print(
            f'Done building cooccurrence matrix. Shape: {np.shape(cooccurrences)}'
        )
        return cooccurrences
Example 27
    def train(self, train_data):
        helper._print("Learning rate:", self.sess.run(self.model.lr))
        done = False
        run_time = 0
        while not done:
            batches = helper.batches(train_data, self.batch_size, perm=True)
            pbar = tqdm(
                bar_format=
                "(Training) {percentage:.0f}%|{bar}| Elapsed: {elapsed}, Remaining: {remaining} ({n_fmt}/{total_fmt})",
                total=len(batches))
            for step, batch in enumerate(batches):
                self.summary.batch_inc()
                feed_dict, _ = self.model.build_feed_dict(batch, train=True)
                start_run_time = time.time()

                _, acc, loss = self.sess.run(
                    [self.model.train_op, self.model.acc, self.model.loss],
                    feed_dict=feed_dict)
                self.summary.add(self.summary.TRAIN, acc, loss)

                end_run_time = time.time()
                run_time += end_run_time - start_run_time

                pbar.update(1)
            pbar.close()
            print()

            # pbar = tqdm(
            #     bar_format="(Accuracy) {percentage:.0f}%|{bar}| Elapsed: {elapsed}, Remaining: {remaining} ({n_fmt}/{total_fmt})",
            #     total=len(batches))
            # for step, batch in enumerate(batches):
            #     acc_feed_dict, _ = self.model.build_feed_dict(batch)
            #     acc, loss = self.sess.run([self.model.acc, self.model.loss],
            #                               feed_dict=acc_feed_dict)
            #     self.summary.add(self.summary.TRAIN, acc, loss)
            #     pbar.update(1)
            # pbar.close()
            # print()
            # loading and saving tmp model - just in case something goes wrong
            if not self.summary.write_and_reset(self.summary.TRAIN,
                                                _print=True):
                helper._print("Nan loss encountered, trying again...")
                self.model.load_tmp(self.sess, self.saver)
            else:
                done = True
                self.model.save_tmp(self.sess, self.saver)

            helper._print(
                "Training time:",
                str(int(run_time / 60)) + "m " + str(int(run_time % 60)) + "s")
        return run_time
Example 28
if FLAGS.word_embed_model == constants.WORD2VEC:
    word_embeddings = Word2Vec(mode=FLAGS.word_embed_mode,
                               dimensions=FLAGS.word_embedding_size)
else:  # FLAGS.word_embed_model == constants.GLOVE
    word_embeddings = GloVe(mode=FLAGS.word_embed_mode,
                            dimensions=FLAGS.word_embedding_size)

model_placement = directories.TRAINED_MODELS_DIR + FLAGS.model_name + "model.ckpt"

if FLAGS.model == constants.DEEP_RNN:
    model = deepRNN(data, word_embeddings, model_name)
elif FLAGS.model == constants.BATCH_TREE_RNN:
    model = treeRNN_batch(data, word_embeddings, model_name)
elif FLAGS.model == constants.NEERBEK_TREE_RNN:
    model = treeRNN_neerbek(data, word_embeddings, model_name)
elif FLAGS.model == constants.TREE_LSTM:
    model = treeLSTM(data, word_embeddings, model_name)
elif FLAGS.model == constants.TRACKER_TREE_RNN:
    model = treeRNN_tracker(data, word_embeddings, model_name)
elif FLAGS.model == constants.TRACKER_TREE_LSTM:
    model = treeLSTM_tracker(data, word_embeddings, model_name)
elif FLAGS.model == constants.LSTM:
    model = LSTM(data, word_embeddings, model_name)

with tf.Session() as sess:
    saver = tf.train.Saver()
    model.load(sess, saver)
    helper._print("Acc:", model.accuracy(data.test_trees, sess))
    p = Performance(data.test_trees, model, sess)
    p.plot_ROC()
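As a side note, the chain of elif branches above can be collapsed into a lookup table. This is only a sketch, assuming all constructors share the (data, word_embeddings, model_name) signature used here; it also turns an unrecognised FLAGS.model into an explicit KeyError instead of leaving model undefined.

# Sketch only: same constructors and flags as above.
MODEL_CLASSES = {
    constants.DEEP_RNN: deepRNN,
    constants.BATCH_TREE_RNN: treeRNN_batch,
    constants.NEERBEK_TREE_RNN: treeRNN_neerbek,
    constants.TREE_LSTM: treeLSTM,
    constants.TRACKER_TREE_RNN: treeRNN_tracker,
    constants.TRACKER_TREE_LSTM: treeLSTM_tracker,
    constants.LSTM: LSTM,
}
model = MODEL_CLASSES[FLAGS.model](data, word_embeddings, model_name)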
Example 29
 def on_epoch_begin(self, model):
     helper._print(f"Epoch {self.epoch} / {model.iter}")
     self.epoch += 1
Example 30
    def select_data(self, data, cut_off, cluster_predictions=None):
        roots_size = [tree_util.size_of_tree(root) for root in data]
        data = np.array(helper.sort_by(data, roots_size))

        t = time()
        if cluster_predictions is None:

            # Get representations
            representations, predictions, labels, permutations = [], [], [], []
            batch_size = 500
            batches = helper.batches(data, batch_size, perm=False)
            pbar = tqdm(
                bar_format=
                '{percentage:.0f}%|{bar}| Elapsed: {elapsed}, Remaining: {remaining} (batches: {n_fmt}/{total_fmt}) ',
                total=len(batches))

            for i, batch in enumerate(batches):
                feed_dict, permuts = self.model.build_feed_dict(batch,
                                                                sort=True)
                reps, labs = self.session.run(
                    [self.model.sentence_representations, self.model.labels],
                    feed_dict=feed_dict)
                representations.extend(reps)
                labels.extend(labs)
                permutations.extend(list(i * batch_size + np.array(permuts)))
                pbar.update(1)
            pbar.close()
            print()

            self.representations = np.array(representations)[permutations]
            self.labels = np.array(performance.get_prediction(
                np.array(labels)))[permutations]

            # Get clusters

            try_cluster = True
            tries = 10
            while try_cluster:
                tries -= 1
                self.cluster_predictions = self.cluster_model.cluster(
                    self.representations)
                if np.bincount(self.cluster_predictions).max() <= 0.8 * len(
                        self.representations) or tries <= 0:
                    try_cluster = False

        else:
            self.cluster_predictions = cluster_predictions
            self.labels = tree_util.get_labels(data)

        # Get acc of clusters
        cluster_mfo = []
        cluster_mfo_labels = []
        for i in range(self.num_clusters):
            mfo, l = self.mfo(i)
            cluster_mfo.append((i, mfo))
            cluster_mfo_labels.append((i, l))

        # Return data
        cluster_mfo.sort(key=lambda el: el[1], reverse=True)
        helper._print(f'Cluster MFO scores:')
        for (k, mfo), (_, l) in zip(cluster_mfo, cluster_mfo_labels):
            helper._print(
                f'\tCluster {k}: {mfo}, highest label: {l}, size: {len(self.labels[self.cluster_predictions == k])}/{len(data)}'
            )

        removed_percent = 0
        data_to_use = []
        for cluster, acc in cluster_mfo:
            new_percent = removed_percent + len(
                data[self.cluster_predictions == cluster]) / len(data)
            removed_percent = new_percent
            if acc < cut_off:
                data_to_use.extend(data[self.cluster_predictions == cluster])

        helper._print(
            f'Done selecting data for training. Overall time used for selection is {int((time() - t)/60)} minutes and {int((time() - t) % 60)} seconds'
        )
        return data_to_use, self.cluster_predictions
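The per-cluster MFO score used above comes from self.mfo, which is not shown in this example. A hypothetical sketch of such a helper: the fraction of points in a cluster that carry the cluster's majority label, together with that label.

    # Hypothetical sketch only; not the actual self.mfo implementation.
    import numpy as np

    def mfo_sketch(labels, cluster_predictions, cluster_id):
        cluster_labels = np.asarray(labels)[np.asarray(cluster_predictions) == cluster_id]
        counts = np.bincount(cluster_labels)          # assumes integer class labels
        return counts.max() / len(cluster_labels), counts.argmax()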