Example #1
 def build_trained_embeddings(self):
     helper._print_header('Getting word2vec trained on Enron corpus...')
     if not os.path.isdir(directories.WORD2VEC_DIR):
         os.makedirs(directories.WORD2VEC_DIR)
     sentences = self.get_enron_sentences()
     model_logger = Word2VecLogger()
     path = directories.WORD2VEC_DIR + 'trained_word2vec.model'
     if os.path.isfile(path):
         helper._print('Loading previously trained model...')
         word2vec_model = gensim.models.Word2Vec.load(path)
     else:
         helper._print_subheader('Building model...')
         word2vec_model = gensim.models.Word2Vec(
             sentences,
             size=FLAGS.word_embedding_size,
             sg=1,  # Use Skip-Gram (0 for CBOW)
             hs=0,  # Use Negative sampling. (1 for Hierarchical Softmax)
             window=FLAGS.word2vec_window,
             min_count=FLAGS.word2vec_min_count,
             workers=10,
             iter=1
         )
         word2vec_model.train(sentences, total_examples=len(sentences), epochs=FLAGS.word2vec_epochs, callbacks=[model_logger])
         helper._print(f'Saving model to {path}')
         word2vec_model.save(path)
     vocab = self.build_vocab(sentences)
     return self.word2vec_index_keyed_vector(keyed_vector=word2vec_model.wv, vocab=vocab)
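For reference, a minimal standalone sketch of the same training pattern on a toy corpus, assuming the gensim 3.x API used above (`size`/`iter`; gensim 4.x renames these to `vector_size`/`epochs`); the corpus and hyperparameters are illustrative only:

import gensim

toy_sentences = [
    ['the', 'meeting', 'is', 'tomorrow'],
    ['please', 'review', 'the', 'attached', 'report'],
    ['the', 'report', 'covers', 'the', 'quarterly', 'numbers'],
]

model = gensim.models.Word2Vec(
    toy_sentences,
    size=50,      # embedding dimensionality
    sg=1,         # Skip-Gram (0 for CBOW)
    hs=0,         # negative sampling (1 for hierarchical softmax)
    window=5,
    min_count=1,
    workers=2,
    iter=1,
)
model.train(toy_sentences, total_examples=len(toy_sentences), epochs=5)
model.save('trained_word2vec.model')
print(model.wv['report'][:5])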
Example #2
    def glove_finetuned_embeddings(self):
        helper._print_header('Getting fine-tuned GloVe embeddings')
        self.glove_download_pretrained_model()
        sentences = self.get_enron_sentences()
        vocab = helper.get_or_build(FLAGS.enron_emails_vocab_path,
                                    self.build_vocab, sentences)
        print(len(vocab))
        cooccur = helper.get_or_build(FLAGS.enron_emails_cooccur_path,
                                      self.build_cooccur,
                                      vocab,
                                      sentences,
                                      type='numpy')
        print(np.shape(cooccur))
        pretrained_embeddings = self.glove2dict(self.word_embed_file_path)
        helper._print_subheader('Starting Mittens model...')
        mittens_model = Mittens(n=self.dimensions,
                                max_iter=1000,
                                display_progress=1,
                                log_dir=FLAGS.glove_dir + 'mittens/')
        finetuned_embeddings = mittens_model.fit(
            cooccur, vocab=vocab, initial_embedding_dict=pretrained_embeddings)
        print(finetuned_embeddings)

        return 'test', 'test', 'test'
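The method above still returns placeholder values; for orientation, here is a minimal sketch of the Mittens fine-tuning call it wraps, assuming the `mittens` package API (`Mittens(n=...).fit(cooccurrence, vocab=..., initial_embedding_dict=...)`) and toy inputs:

import numpy as np
from mittens import Mittens

# Toy vocabulary and a symmetric co-occurrence matrix (rows/cols follow vocab order).
vocab = ['email', 'meeting', 'report']
cooccurrence = np.array([[0.0, 2.0, 1.0],
                         [2.0, 0.0, 3.0],
                         [1.0, 3.0, 0.0]])

# Pretrained vectors to fine-tune; words missing from this dict get random initialisations.
pretrained = {word: np.random.randn(5) for word in vocab}

mittens_model = Mittens(n=5, max_iter=100)
finetuned = mittens_model.fit(cooccurrence,
                              vocab=vocab,
                              initial_embedding_dict=pretrained)
print(finetuned.shape)  # (3, 5): one row per vocabulary word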
Example #3
 def get_TSNE_plot(self, embeddings, vocab, words=None):
     helper._print_subheader('Plotting embeddings')
     # project the embeddings to 2D with PCA-initialized t-SNE
     tsne = TSNE(perplexity=30,
                 n_components=2,
                 verbose=2,
                 init='pca',
                 n_iter=5000,
                 method='exact')
     result = tsne.fit_transform(embeddings)
     # create a scatter plot of the projection
     if words is not None:
         result = np.array(
             [[x, y, i]
              for i, (x, y) in enumerate(result) if vocab[i] in words],
             dtype=np.float64)
         pyplot.scatter(result[:, 0], result[:, 1])
         for r in result:
             pyplot.annotate(vocab[int(r[2])], xy=(r[0], r[1]))
     else:
         pyplot.scatter(result[:, 0], result[:, 1])
         for i, word in enumerate(vocab):
             pyplot.annotate(word, xy=(result[i, 0], result[i, 1]))
     pyplot.show()
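A self-contained version of the same kind of plot on random vectors, assuming scikit-learn's `TSNE` and matplotlib; the word list and dimensionality are illustrative:

import numpy as np
from sklearn.manifold import TSNE
from matplotlib import pyplot

words = ['email', 'meeting', 'report', 'budget', 'invoice', 'schedule']
embeddings = np.random.randn(len(words), 50)  # stand-in for learned vectors

tsne = TSNE(n_components=2, perplexity=3, init='pca', method='exact')
points = tsne.fit_transform(embeddings)

pyplot.scatter(points[:, 0], points[:, 1])
for word, (x, y) in zip(words, points):
    pyplot.annotate(word, xy=(x, y))
pyplot.show()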
Example #4
    def word2vec_index_keyed_vector(self, keyed_vector, vocab):
        helper._print_subheader('Creating index files!')
        vocab_keys = keyed_vector.vocab.keys()
        ZERO_TOKEN = 0
        word2idx = {'ZERO': ZERO_TOKEN}
        idx2word = ['ZERO']
        weights = [np.zeros(self.dimensions)]
        pbar = tqdm(
            bar_format='Indexing keyed_vector |{bar}| Elapsed: {elapsed} | ({n_fmt}/{total_fmt})', total=len(vocab_keys))
        i = 0
        for word in vocab_keys:
            if word in vocab.keys():
                i += 1
                word2idx[word] = i
                idx2word.append(word)
                weights.append(keyed_vector[word])
            pbar.update(1)

        pbar.close()
        print()

        UNKNOWN_TOKEN = len(weights)
        word2idx['UNK'] = UNKNOWN_TOKEN
        idx2word.append('UNK')
        np.random.seed(240993)
        weights.append(np.random.randn(self.dimensions))

        helper._print('Index files ready!')

        # self.get_TSNE_plot(weights, [key for key in word2idx.keys()])
        return np.array(weights, dtype=np.float32), word2idx, idx2word
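One way the returned triple could be consumed; a small sketch with made-up values showing the UNK fallback (all names and numbers here are hypothetical):

import numpy as np

# Hypothetical output of word2vec_index_keyed_vector for a tiny vocabulary.
dimensions = 4
word2idx = {'ZERO': 0, 'email': 1, 'report': 2, 'UNK': 3}
weights = np.vstack([np.zeros(dimensions),
                     np.random.randn(dimensions),    # 'email'
                     np.random.randn(dimensions),    # 'report'
                     np.random.randn(dimensions)])   # 'UNK'

def lookup(word):
    # Out-of-vocabulary words fall back to the UNK row.
    return weights[word2idx.get(word, word2idx['UNK'])]

print(lookup('email'))
print(lookup('never-seen-before'))  # returns the UNK vector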
Example #5
    def build_cooccur(self, vocab, corpus, window=10):
        helper._print_subheader("Building cooccurrence matrix")

        vocab_size = len(vocab)

        cooccurrences = np.zeros((vocab_size, vocab_size), dtype=np.float64)
        helper._print('Enumerating through the corpus...')
        for i, sent in enumerate(corpus):
            if i % 10000 == 0 and i != 0:
                helper._print(f"{i}/{len(corpus)} sentences processed")
                if i == 500000:
                    break
            token_ids = [vocab[word] for word in sent if word in vocab.keys()]

            for center_i, center_id in enumerate(token_ids):
                # Collect all word IDs in left window of center word
                context_ids = token_ids[max(0, center_i - window):center_i]
                contexts_len = len(context_ids)

                for left_i, left_id in enumerate(context_ids):
                    # Distance from center word
                    distance = contexts_len - left_i

                    # Weight by inverse of distance between words
                    increment = 1.0 / float(distance)

                    # Build co-occurrence matrix symmetrically (pretend we
                    # are calculating right contexts as well)
                    cooccurrences[center_id, left_id] += increment
                    cooccurrences[left_id, center_id] += increment
        return cooccurrences
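To make the inverse-distance weighting concrete, the same counting scheme as a standalone toy function on in-memory data (names and the toy corpus are illustrative):

import numpy as np

def toy_cooccur(vocab, corpus, window=10):
    """Same windowed, inverse-distance weighting as build_cooccur, on in-memory data."""
    counts = np.zeros((len(vocab), len(vocab)))
    for sent in corpus:
        token_ids = [vocab[word] for word in sent if word in vocab]
        for center_i, center_id in enumerate(token_ids):
            context_ids = token_ids[max(0, center_i - window):center_i]
            for left_i, left_id in enumerate(context_ids):
                distance = len(context_ids) - left_i
                counts[center_id, left_id] += 1.0 / distance
                counts[left_id, center_id] += 1.0 / distance
    return counts

vocab = {'the': 0, 'cat': 1, 'sat': 2}
corpus = [['the', 'cat', 'sat'], ['the', 'cat']]
print(toy_cooccur(vocab, corpus, window=2))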
Example #6
 def train_and_save_embeddings(self):
     sentences = self.get_enron_sentences()
     vocab = self.build_vocab(sentences)
     if not os.path.isfile(directories.TRAINED_GLOVE_EMBEDDING_FILE_PATH):
         cooccur = self.build_cooccur(vocab, sentences)
         helper._print_subheader('Building model...')
         glove_model = mittens_glove(n=300,
                                     xmax=100,
                                     max_iter=20000,
                                     learning_rate=0.01,
                                     alpha=0.75,
                                     tol=1e-4,
                                     display_progress=10,
                                     log_dir=directories.GLOVE_DIR +
                                     'mittens/')
         helper._print_subheader('Training GloVE model...')
         trained_embeddings = glove_model.fit(cooccur)
         resulting_embeddings = {}
         for word, weights in zip(vocab.keys(), trained_embeddings):
             resulting_embeddings[word] = weights
         self.dict2glove(resulting_embeddings,
                         directories.TRAINED_GLOVE_EMBEDDING_FILE_PATH)
         return vocab, cooccur, resulting_embeddings
     return vocab, None, None
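`mittens_glove` above presumably refers to the `GloVe` estimator from the `mittens` package; a minimal, hedged sketch of fitting it on a toy co-occurrence matrix:

import numpy as np
from mittens import GloVe

# Toy symmetric co-occurrence matrix for a three-word vocabulary.
cooccurrence = np.array([[0.0, 2.0, 1.0],
                         [2.0, 0.0, 3.0],
                         [1.0, 3.0, 0.0]])

glove_model = GloVe(n=10, max_iter=200)      # n = embedding dimension
embeddings = glove_model.fit(cooccurrence)   # shape: (3, 10)

vocab = {'email': 0, 'meeting': 1, 'report': 2}
embedding_dict = {word: embeddings[i] for word, i in vocab.items()}
print(embedding_dict['email'].shape)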
Example #7
    def glove_generate_indexes(self):
        helper._print_subheader('Generating indexes for embeddings')
        ZERO_TOKEN = 0
        word2idx = {'ZERO': ZERO_TOKEN}
        idx2word = {ZERO_TOKEN: 'ZERO'}
        weights = [np.zeros(self.dimensions)]

        with open(self.word_embed_file_path, 'r', encoding="utf8") as file:
            for index, line in enumerate(file):
                values = line.split()  # Word and weights separated by space
                word = values[0]  # Word is first symbol on each line
                word_weights = np.asarray(
                    values[1:],
                    dtype=np.float32)  # Remainder of line is weights for word
                word2idx[word] = index + 1  # ZERO is our zeroth index so shift by one
                idx2word[index + 1] = word
                weights.append(word_weights)
                if index % FLAGS.word_embed_subset_size == 0 and index != 0:
                    helper._print(f'{index} words indexed')
                    if FLAGS.word_embed_subset:
                        break
            UNKNOWN_TOKEN = len(weights)
            word2idx['UNK'] = UNKNOWN_TOKEN
            idx2word[UNKNOWN_TOKEN] = 'UNK'
            np.random.seed(240993)
            weights.append(np.random.randn(self.dimensions))

            helper._print_subheader('Indexes done!')
        return np.array(weights, dtype=np.float32), word2idx, idx2word
Example #8
 def glove2dict(self, glove_filename):
     helper._print_subheader(
         'Generating dict from pretrained GloVe embeddings')
     with open(glove_filename, 'r', encoding="utf8") as file:
         embed = {}
         for index, line in enumerate(file):
             values = line.split()
             embed[values[0]] = np.asarray(values[1:], dtype=np.float32)
     return embed
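The file format assumed here is GloVe's plain-text layout: one word per line followed by its space-separated weights. A tiny round-trip with a temporary file (paths and values are illustrative):

import os
import tempfile
import numpy as np

# Two vectors in GloVe's plain-text layout: the word, then its space-separated weights.
path = os.path.join(tempfile.mkdtemp(), 'toy_glove.txt')
with open(path, 'w', encoding='utf8') as file:
    file.writelines(['email 0.1 0.2 0.3\n', 'report 0.4 0.5 0.6\n'])

# Parse it back the same way glove2dict does.
embed = {}
with open(path, 'r', encoding='utf8') as file:
    for line in file:
        values = line.split()
        embed[values[0]] = np.asarray(values[1:], dtype=np.float32)

print(embed['report'])  # [0.4 0.5 0.6]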
Example #9
 def predict_and_label(self, data, sess):
     helper._print_subheader("Predicting")
     prob, labels = [], []
     batches = helper.batches(data,
                              batch_size=500 if FLAGS.use_gpu else 2,
                              use_tail=True,
                              perm=False)
     for batch in batches:
         feed_dict, _ = self.build_feed_dict(batch)
         p, l = sess.run([self.p, self.labels], feed_dict=feed_dict)
         prob.extend(p)
         labels.extend(l)
     return prob, labels
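`helper.batches` is project-specific; this stand-in only sketches the behaviour the call above appears to assume (fixed-size chunks, keeping the short tail when `use_tail=True`, no permutation when `perm=False`):

import random

def batches(data, batch_size=500, use_tail=True, perm=False):
    """Illustrative stand-in for helper.batches: yield fixed-size chunks of `data`."""
    if perm:
        data = random.sample(data, len(data))
    for start in range(0, len(data), batch_size):
        chunk = data[start:start + batch_size]
        if len(chunk) == batch_size or use_tail:
            yield chunk

print([list(b) for b in batches(list(range(7)), batch_size=3)])
# [[0, 1, 2], [3, 4, 5], [6]]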
Example #10
    def word2vec_trained_embeddings(self):
        helper._print_header('Getting word2vec trained on Enron corpus...')
        if not os.path.isdir(FLAGS.word2vec_dir):
            os.makedirs(FLAGS.word2vec_dir)
        documents = self.get_enron_sentences()
        model_logger = Word2VecLogger()
        if os.path.isfile(FLAGS.word2vec_dir + 'word2vec.model'):
            helper._print_subheader('Loading previously trained model...')
            model = Word2Vec.load(FLAGS.word2vec_dir + 'word2vec.model')
        else:
            helper._print_subheader('Building model...')
            model = Word2Vec(
                documents,
                size=300,
                sg=1,  # Use Skip-Gram (0 for CBOW)
                hs=0,  # Use Negative sampling. (1 for Hierarchical Softmax)
                window=10,
                min_count=3,
                workers=10,
                iter=1)
            helper._print_subheader('Saving untrained model...')
            model.save(FLAGS.word2vec_dir + 'word2vec.model')
        model.train(documents,
                    total_examples=len(documents),
                    epochs=FLAGS.word2vec_trained_mode_epochs,
                    callbacks=[model_logger])
        helper._print_subheader('Saving model...')
        model.save(FLAGS.word2vec_dir + 'trained_word2vec.model')

        return self.word2vec_index_keyed_vector(model.wv)
Example #11
 def get_enron_sentences(self):
     helper._print_subheader('Reading ' + FLAGS.enron_emails_txt_path +
                             '...')
     if not os.path.isfile(FLAGS.enron_emails_txt_path):
         self.load_enron_txt_data()
     with open(FLAGS.enron_emails_txt_path, 'r',
               encoding='utf-8') as txt_file:
         for index, line in enumerate(txt_file):
             if index % 1000000 == 0 and index != 0:
                 helper._print(f'{index} sentences read')
                 break
             preprocessed_line = simple_preprocess(line)
             if preprocessed_line:
                 yield preprocessed_line
     helper._print(f'{index} sentences read')
     helper._print_subheader('Done reading Enron email data!')
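gensim's `simple_preprocess` lowercases, tokenises, and drops punctuation, numbers, and very short or very long tokens; a quick illustration (the example sentence is made up):

from gensim.utils import simple_preprocess

line = 'Please see the attached Q3 report -- thanks, John!'
print(simple_preprocess(line))
# e.g. ['please', 'see', 'the', 'attached', 'report', 'thanks', 'john']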
Example #12
    def dict2glove(self, embeddings_dict, path):
        helper._print_subheader('Saving to glove format...')
        with open(path, 'w', encoding="utf8") as file:

            pbar = tqdm(
                bar_format=
                '{percentage:.0f}%|{bar}| Elapsed: {elapsed}, Remaining: {remaining} ({n_fmt}/{total_fmt}) ',
                total=len(embeddings_dict))
            for index, (word, weights) in enumerate(embeddings_dict.items()):
                if index % 1000 == 0 and index != 0:
                    pbar.update(1000)
                embeddings_string = word
                for weight in weights:
                    embeddings_string += ' ' + str(weight)
                file.write(embeddings_string + '\n')
            pbar.update(len(embeddings_dict) % 1000)
            pbar.close()
            print()
Example #13
    def build_pretrained_embeddings(self):
        helper._print_header('Getting pretrained word2vec embeddings')
        path = directories.WORD2VEC_EMBEDDINGS_FILE_PATH
        sentences = self.get_enron_sentences()
        if not os.path.isdir(directories.WORD2VEC_DIR):
            os.makedirs(directories.WORD2VEC_DIR)
        if self.dimensions != 300:
            helper._print('Only support word2vec with vectors of size 300')

        if not os.path.isfile(path):
            helper._print(
                'Binary file not there. Download from: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM')
            sys.exit()
        else:
            helper._print_subheader('Unpacking ' + path)
            model = KeyedVectors.load_word2vec_format(path, binary=True)
            helper._print_subheader('Done unpacking!')
            vocab = self.build_vocab(sentences)
            return self.word2vec_index_keyed_vector(keyed_vector=model, vocab=vocab)
Example #14
 def glove2dict(self, glove_filename):
     helper._print_subheader(
         'Generating dict from pretrained GloVe embeddings')
     with open(glove_filename, 'r', encoding="utf8") as file:
         embed = {}
         lines = file.readlines()
         pbar = tqdm(
             bar_format=
             '{percentage:.0f}%|{bar}| Elapsed: {elapsed}, Remaining: {remaining} ({n_fmt}/{total_fmt}) ',
             total=len(lines))
         for index, line in enumerate(lines):
             if index % 10000 == 0 and index != 0:
                 pbar.update(10000)
             values = line.split()
             embed[values[0]] = np.asarray(values[1:], dtype=np.float32)
         pbar.update(len(lines) % 10000)
         pbar.close()
         print()
     return embed
Example #15
 def get_enron_sentences(self):
     """
         Generator for getting the enron data as individual sentences.
     """
     helper._print_subheader('Reading ' +
                             directories.ENRON_TRAIN_SENTENCES_TXT_PATH +
                             '...')
     with open(directories.ENRON_TRAIN_SENTENCES_TXT_PATH,
               'r',
               encoding='utf-8') as txt_file:
         for index, line in enumerate(txt_file):
             if index % 1000000 == 0 and index != 0:
                 helper._print(f'{index} sentences read')
                 break
             preprocessed_line = simple_preprocess(line)
             if preprocessed_line:
                 yield preprocessed_line
     helper._print(f'{index} sentences read')
     helper._print_subheader('Done reading Enron email data!')
Example #16
    def generate_indexes(self, vocab, file):
        helper._print_subheader('Generating indexes for embeddings')
        weights = [np.zeros(self.dimensions)]
        ZERO_TOKEN = 0
        word2idx = {'ZERO': ZERO_TOKEN}
        idx2word = ['ZERO']

        i = 0
        with open(file, 'r', encoding='utf-8', newline='\n',
                  errors='ignore') as embed_file:
            if FLAGS.word_embed_model == 'fasttext':
                n, d = map(int, embed_file.readline().split())
            lines = embed_file.readlines()
            pbar = tqdm(
                bar_format=
                '{percentage:.0f}%|{bar}| Elapsed: {elapsed}, Remaining: {remaining} ({n_fmt}/{total_fmt}) ',
                total=len(lines))
            for index, line in enumerate(lines):
                values = line.split()  # Word and weights separated by space
                word = values[0]  # Word is first symbol on each line
                if word in vocab.keys() and helper.is_float(values[1]):
                    i += 1
                    word_weights = np.asarray(
                        values[1:], dtype=np.float32
                    )  # Remainder of line is weights for word
                    word2idx[word] = i
                    idx2word.append(word)
                    weights.append(word_weights)
                pbar.update(1)
            pbar.close()
            UNKNOWN_TOKEN = len(weights)
            word2idx['UNK'] = UNKNOWN_TOKEN
            idx2word.append('UNK')
            np.random.seed(240993)
            weights.append(np.random.randn(self.dimensions))

            # self.get_TSNE_plot(weights, [key for key in word2idx.keys()])

            helper._print_subheader(
                f'Indexes done! {len(weights) - 2} word embeddings!')
        return np.array(weights, dtype=np.float32), word2idx, idx2word
Example #17
 def get_enron_sentences(self, kaggle=True, all=True):
     if kaggle:
         path = directories.ENRON_EMAILS_TXT_PATH
         if not os.path.isfile(path):
             self.load_enron_txt_data()
     else:
         if all:
             path = directories.TREE_ALL_SENTENCES_TXT_PATH
         else:
             path = directories.TREE_SENTENCES_TXT_PATH
     helper._print_subheader('Reading ' + path + '...')
     with open(path, 'r', encoding='utf-8') as txt_file:
         for index, line in enumerate(txt_file):
             if index % 1000000 == 0 and index != 0:
                 helper._print(f'{index} sentences read')
                 break
             preprocessed_line = simple_preprocess(line)
             if preprocessed_line:
                 yield preprocessed_line
     helper._print(f'{index} sentences read')
     helper._print_subheader('Done reading Enron email data!')
Example #18
    def word2vec_pretrained_embeddings(self):
        helper._print_header('Getting pretrained word2vec embeddings')
        if not os.path.isdir(FLAGS.word2vec_dir):
            os.makedirs(FLAGS.word2vec_dir)
        self.word_embed_file_path = FLAGS.word2vec_dir + self.embedding_file + '.txt'
        if self.dimensions != 300:
            helper._print('Only support word2vec with vectors of size 300')

        if not os.path.isfile(self.word_embed_file_path):
            binary_file_path = FLAGS.word2vec_dir + self.embedding_file + '.bin'
            if not os.path.isfile(binary_file_path):
                helper._print(
                    'Binary file not there. Download from: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM'
                )
                sys.exit()
            else:
                helper._print_subheader('Unpacking ' + binary_file_path)
                model = KeyedVectors.load_word2vec_format(binary_file_path,
                                                          binary=True)
                helper._print_subheader('Done unpacking!')
                return self.word2vec_index_keyed_vector(model)
Example #19
    def build_vocab(self, corpus, min_count=FLAGS.glove_min_count):
        """
        Credit to https://github.com/hans/glove.py/blob/master/glove.py

        Returns a dictionary `w -> i`, mapping word strings to word IDs for
        words with corpus frequency of at least `min_count`.
        """
        helper._print_subheader('Building vocabulary from corpus')
        vocab = Counter()
        for i, doc in enumerate(corpus):
            if i % 100000 == 0 and i != 0:
                helper._print(f"{i}/{len(corpus)} sentences processed")
                break
            vocab.update(doc)
        helper._print_subheader('Done building vocabulary')
        i = 0
        word2index = {}
        for word, freq in vocab.items():
            if freq >= min_count:
                word2index[word] = i
                i += 1
        return word2index
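The `min_count` filtering in miniature, on a toy corpus (all values are illustrative):

from collections import Counter

corpus = [['the', 'cat', 'sat'], ['the', 'dog', 'sat'], ['the', 'cat']]
vocab = Counter()
for doc in corpus:
    vocab.update(doc)

min_count = 2
word2index = {}
for word, freq in vocab.items():
    if freq >= min_count:
        word2index[word] = len(word2index)

print(word2index)  # {'the': 0, 'cat': 1, 'sat': 2} ('dog' is filtered out)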
Example #20
    def word2vec_index_keyed_vector(self, keyed_vector):
        helper._print_subheader('Creating index files!')
        vocab_keys = keyed_vector.vocab.keys()
        ZERO_TOKEN = 0
        word2idx = {'ZERO': ZERO_TOKEN}
        idx2word = {ZERO_TOKEN: 'ZERO'}
        weights = [np.zeros(self.dimensions)]
        for index, word in enumerate(vocab_keys):
            word2idx[word] = index + 1
            idx2word[index + 1] = word
            weights.append(keyed_vector[word])
            if index % FLAGS.word_embed_subset_size == 0 and index != 0:
                helper._print(f'{index} words indexed')
                if FLAGS.word_embed_subset:
                    break

        UNKNOWN_TOKEN = len(weights)
        word2idx['UNK'] = UNKNOWN_TOKEN
        idx2word[UNKNOWN_TOKEN] = 'UNK'
        np.random.seed(240993)
        weights.append(np.random.randn(self.dimensions))
        helper._print_subheader('Index files ready!')
        return np.array(weights, dtype=np.float32), word2idx, idx2word
Example #21
    def build_vocab(self, corpus, min_count=FLAGS.word_min_count):
        helper._print_subheader('Building vocabulary from corpus')
        vocab = Counter()
        pbar = tqdm(
            bar_format=
            '{percentage:.0f}%|{bar}| Elapsed: {elapsed}, Remaining: {remaining} ({n_fmt}/{total_fmt}) ',
            total=len(corpus))
        for i, doc in enumerate(corpus):
            if (i + 1) % 1000 == 0 and i != 0:
                pbar.update(1000)
            vocab.update(doc)
        pbar.update(len(corpus) % 1000)
        pbar.close()
        print()
        i = 0
        word2index = {}
        for word, freq in vocab.items():
            if freq >= min_count:
                word2index[word] = i
                i += 1

        helper._print(f'Done building vocabulary. Length: {len(word2index)}')
        return word2index
Example #22
    def build_cooccur(self, vocab, corpus, window=10):
        helper._print_subheader("Building cooccurrence matrix")
        vocab_size = len(vocab)
        cooccurrences = np.zeros((vocab_size, vocab_size), dtype=np.float64)
        pbar = tqdm(
            bar_format=
            '{percentage:.0f}%|{bar}| Elapsed: {elapsed}, Remaining: {remaining} ({n_fmt}/{total_fmt}) ',
            total=len(corpus))
        for i, sent in enumerate(corpus):
            if (i + 1) % 10000 == 0 and i != 0:
                pbar.update(10000)
            token_ids = [vocab[word] for word in sent if word in vocab.keys()]

            for center_i, center_id in enumerate(token_ids):
                # Collect all word IDs in left window of center word
                context_ids = token_ids[max(0, center_i - window):center_i]
                contexts_len = len(context_ids)

                for left_i, left_id in enumerate(context_ids):
                    # Distance from center word
                    distance = contexts_len - left_i

                    # Weight by inverse of distance between words
                    increment = 1.0 / float(distance)

                    # Build co-occurrence matrix symmetrically (pretend we
                    # are calculating right contexts as well)
                    cooccurrences[center_id, left_id] += increment
                    cooccurrences[left_id, center_id] += increment

        pbar.update(len(corpus) % 10000)
        pbar.close()
        print()
        helper._print(
            f'Done building cooccurrence matrix. Shape: {np.shape(cooccurrences)}'
        )
        return cooccurrences
Example #23
    def load_enron_txt_data(self):
        helper._print_header("Loading Enron emails")
        try:
            if os.name == 'nt':
                """
                Using sys.maxsize throws an Overflow error on Windows 64-bit platforms since internal
                representation of 'int'/'long' on Win64 is only 32-bit wide. Ideally limit on Win64
                should not exceed ((2**31)-1) as long as internal representation uses 'int' and/or 'long'
                """
                csv.field_size_limit((2**31) - 1)
            else:
                csv.field_size_limit(sys.maxsize)
        except OverflowError as e:
            # skip setting the limit for now
            pass
        if not os.path.isfile(directories.ENRON_EMAILS_CSV_PATH):
            data = 'wcukierski/enron-email-dataset'
            helper._print_subheader(f'Downloading enron emails from Kaggle')
            helper.download_from_kaggle(data, directories.ENRON_DIR)
            helper._print_subheader('Download finished! Unzipping...')
            with zipfile.ZipFile(directories.ENRON_EMAILS_ZIP_PATH,
                                 'r') as zip_file:
                zip_file.extractall(path=directories.ENRON_DIR)
        if not os.path.isfile(directories.ENRON_EMAILS_TXT_PATH):
            helper._print_subheader('Processing emails into .txt file!')
            with open(directories.ENRON_EMAILS_CSV_PATH, 'r',
                      encoding='utf-8') as emails_csv:
                with open(directories.ENRON_EMAILS_TXT_PATH,
                          'w',
                          encoding='utf-8') as text_file:
                    email_reader = csv.reader(emails_csv, delimiter=",")
                    for index, row in enumerate(email_reader):
                        if index == 0:
                            continue
                        sentences = nltk.sent_tokenize(
                            self.format_email_body(row))
                        for sent in sentences:
                            if len(sent.split(' ')) > 2:
                                text_file.write(sent + '\n')
                        if index % 100000 == 0 and index != 0:
                            helper._print(f'{index} emails processed')

        helper._print_subheader('Enron email data loaded!')
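Email bodies are split into sentences with `nltk.sent_tokenize`, which needs the punkt tokenizer data to be downloaded once; a quick illustration with the same minimum-length filter (the example body is made up):

import nltk

nltk.download('punkt', quiet=True)  # one-time download of the sentence tokenizer
body = 'Please review the attached file. Let me know if anything is missing. Thanks!'
for sent in nltk.sent_tokenize(body):
    if len(sent.split(' ')) > 2:    # same length filter as above
        print(sent)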
Example #24
 def train_and_save_finetuned_embeddings(self):
     sentences = self.get_enron_sentences()
     vocab = self.build_vocab(sentences)
     if not os.path.isfile(directories.FINETUNED_GLOVE_EMBEDDING_FILE_PATH):
         cooccur = self.build_cooccur(vocab, sentences)
         pretrained_embeddings = self.glove2dict(
             directories.GLOVE_EMBEDDING_FILE_PATH)
         helper._print(
             f'{len([v for v in vocab.keys() if v in pretrained_embeddings.keys()])} words in common with the pretrained set'
         )
         helper._print_subheader('Building model...')
         mittens_dir = directories.GLOVE_DIR + 'mittens/'
         if not os.path.isdir(mittens_dir):
             os.makedirs(mittens_dir)
         mittens_model = Mittens(n=self.dimensions,
                                 xmax=100,
                                 max_iter=10000,
                                 display_progress=10,
                                 learning_rate=0.05,
                                 alpha=0.75,
                                 tol=1e-4,
                                 log_dir=mittens_dir,
                                 mittens=0.1)
         helper._print_subheader('Training Mittens model...')
         finetuned_embeddings = mittens_model.fit(
             cooccur,
             vocab=vocab,
             initial_embedding_dict=pretrained_embeddings)
         print()
         helper._print_subheader(
             'Done training finetuned embeddings! Merging with pre-trained embeddings...'
         )
         resulting_embeddings = pretrained_embeddings
         for word, weights in zip(vocab.keys(), finetuned_embeddings):
             resulting_embeddings[word] = weights
         self.dict2glove(resulting_embeddings,
                         directories.FINETUNED_GLOVE_EMBEDDING_FILE_PATH)
         return vocab, cooccur, resulting_embeddings
     return vocab, None, None
Example #25
 def on_train_end(self, model):
     helper._print_subheader('Training ended!')
Example #26
 def on_train_begin(self, model):
     helper._print_subheader(
         f'Training started! Going through {model.iter} epochs...')
Example #27
 def on_train_begin(self, model):
     helper._print_subheader(f'Training Model ({model.iter} epochs)...')
Example #28
 def on_train_end(self, model):
     # self.pbar.close()
     helper._print_subheader('Training ended!')
Example #29
 def word2vec_finetuned_embeddings(self):
     helper._print_header('Getting fine-tuned word2vec embeddings')
     if not os.path.isdir(FLAGS.word2vec_dir):
         os.makedirs(FLAGS.word2vec_dir)
     if os.path.isfile(FLAGS.word2vec_dir + 'finetuned_word2vec.model'):
         helper._print_subheader('Loading previously fine-tuned model...')
         finetuned_model = Word2Vec.load(FLAGS.word2vec_dir + 'finetuned_word2vec.model')
     else:
         if self.dimensions != 300:
             helper._print('Only support word2vec with vectors of size 300')
             sys.exit()
         binary_file_path = FLAGS.word2vec_dir + self.embedding_file + '.bin'
         if not os.path.isfile(binary_file_path):
             helper._print(
                 'Binary file not there. Download from: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM'
             )
             sys.exit()
         helper._print_subheader('Unpacking ' + binary_file_path)
         model = KeyedVectors.load_word2vec_format(binary_file_path,
                                                   binary=True)
         helper._print_subheader('Done unpacking!')
         sentences = self.get_enron_sentences()
         finetuned_model = Word2Vec(size=300, min_count=3)
         helper._print_subheader('Building fine-tuned model vocab...')
         finetuned_model.build_vocab(sentences)
         helper._print_subheader('Updating with pretrained model vocab...')
         finetuned_model.build_vocab([list(model.vocab.keys())],
                                     update=True)
         helper._print_subheader('Intersection with pretrained vectors...')
         finetuned_model.intersect_word2vec_format(binary_file_path,
                                                   binary=True,
                                                   lockf=1.0)
         model_logger = Word2VecLogger()
         finetuned_model.train(sentences,
                               total_examples=len(sentences),
                               epochs=FLAGS.word2vec_finetuned_mode_epochs,
                               callbacks=[model_logger])
         helper._print_subheader('Saving model...')
         finetuned_model.save(FLAGS.word2vec_dir + 'finetuned_word2vec.model')
     return self.word2vec_index_keyed_vector(finetuned_model.wv)
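The fine-tuning pattern above (build the new vocab, merge in the pretrained vocab, intersect vectors, then train) as a self-contained sketch, assuming the gensim 3.x API used throughout these examples and using a locally saved word2vec-format file as a stand-in for the pretrained binary; corpora and file names are illustrative:

import gensim

base_corpus = [['budget', 'review', 'meeting'], ['quarterly', 'budget', 'report']]
new_corpus = [['enron', 'budget', 'meeting'], ['enron', 'quarterly', 'report']]

# Stand-in for the pretrained binary: train a tiny model and save it in word2vec format.
base_model = gensim.models.Word2Vec(base_corpus, size=20, min_count=1, iter=5)
base_model.wv.save_word2vec_format('pretrained.bin', binary=True)

# Fine-tune: vocab from the new corpus, merged with the pretrained vocab, then
# pretrained vectors copied in (lockf=1.0 keeps them trainable) before training.
finetuned = gensim.models.Word2Vec(size=20, sg=1, hs=0, window=5, min_count=1, iter=1)
finetuned.build_vocab(new_corpus)
finetuned.build_vocab([list(base_model.wv.vocab.keys())], update=True)
finetuned.intersect_word2vec_format('pretrained.bin', binary=True, lockf=1.0)
finetuned.train(new_corpus, total_examples=len(new_corpus), epochs=5)
finetuned.save('finetuned_word2vec.model')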
Example #30
 def build_finetuned_embeddings(self):
     helper._print_header('Getting fine-tuned word2vec embeddings')
     path = directories.WORD2VEC_DIR + 'finetuned_word2vec.model'
     pretrained_path = directories.WORD2VEC_EMBEDDINGS_FILE_PATH
     sentences = self.get_enron_sentences()
     if not os.path.isdir(directories.WORD2VEC_DIR):
         os.makedirs(directories.WORD2VEC_DIR)
     if os.path.isfile(path):
         helper._print_subheader('Loading previously fine-tuned model...')
         finetuned_model = gensim.models.Word2Vec.load(path)
     else:
         if self.dimensions != 300:
             helper._print('Only support word2vec with vectors of size 300')
             sys.exit()
         if not os.path.isfile(pretrained_path):
             helper._print(
                 'Binary file not there. Download from: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM')
             sys.exit()
         helper._print_subheader('Unpacking ' + pretrained_path)
         model = KeyedVectors.load_word2vec_format(pretrained_path, binary=True)
         helper._print_subheader('Done unpacking!')
         finetuned_model = gensim.models.Word2Vec(
             size=FLAGS.word_embedding_size,
             sg=1,  # Use Skip-Gram (0 for CBOW)
             hs=0,  # Use Negative sampling. (1 for Hierarchical Softmax)
             window=FLAGS.word2vec_window,
             min_count=FLAGS.word2vec_min_count,
             workers=10,
             iter=1
         )
         helper._print_subheader('Building fine-tuned model vocab...')
         finetuned_model.build_vocab(sentences)
         helper._print_subheader('Updating with pretrained model vocab...')
         finetuned_model.build_vocab([list(model.vocab.keys())], update=True)
         helper._print_subheader('Intersection with pretrained vectors...')
         finetuned_model.intersect_word2vec_format(pretrained_path, binary=True, lockf=1.0)
         model_logger = Word2VecLogger()
         finetuned_model.train(sentences, total_examples=len(sentences), epochs=FLAGS.word2vec_epochs,
                               callbacks=[model_logger])
         helper._print_subheader('Saving model...')
         finetuned_model.save(path)
     vocab = self.build_vocab(sentences)
     return self.word2vec_index_keyed_vector(keyed_vector=finetuned_model.wv, vocab=vocab)