def __getitem__(self, bow, iterations=100):
    """Get vector for document(s).

    Parameters
    ----------
    bow : {list of (int, int), iterable of list of (int, int)}
        Document (or corpus) in BoW format.
    iterations : int, optional
        Number of iterations that will be used for inferring.

    Returns
    -------
    list of (int, float)
        LDA vector for document as sequence of (topic_id, topic_probability) **OR**
    list of list of (int, float)
        LDA vectors for corpus in same format.

    """
    is_corpus, corpus = utils.is_corpus(bow)
    if not is_corpus:
        # query is a single document => make a corpus out of it
        bow = [bow]

    self.convert_input(bow, infer=True)
    cmd = \
        self.mallet_path + ' infer-topics --input %s --inferencer %s ' \
        '--output-doc-topics %s --num-iterations %s --doc-topics-threshold %s'
    cmd = cmd % (
        self.fcorpusmallet() + '.infer', self.finferencer(),
        self.fdoctopics() + '.infer', iterations, self.topic_threshold
    )
    logger.info("inferring topics with MALLET LDA '%s'", cmd)
    check_output(args=cmd, shell=True)
    result = list(self.read_doctopics(self.fdoctopics() + '.infer'))
    return result if is_corpus else result[0]

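For reference, a minimal usage sketch of this inference path, assuming `lda` is an already-trained `LdaMallet` wrapper instance and `dictionary` is the `Dictionary` it was built from (both names are illustrative; see the training sketch further below):

# Hedged usage sketch: `lda` and `dictionary` are assumed to already exist.
new_doc = dictionary.doc2bow(["user", "computer", "system"])
print(lda[new_doc])     # one document -> list of (topic_id, topic_probability)

new_corpus = [dictionary.doc2bow(["graph", "trees"]), dictionary.doc2bow(["human", "interface"])]
print(lda[new_corpus])  # corpus -> one such list per document
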
def convert_input(self, corpus, infer=False, serialize_corpus=True):
    """Convert corpus to Mallet format and save it to a temporary text file.

    Parameters
    ----------
    corpus : iterable of iterable of (int, int)
        Collection of texts in BoW format.
    infer : bool, optional
        ...
    serialize_corpus : bool, optional
        ...

    """
    if serialize_corpus:
        logger.info("serializing temporary corpus to %s", self.fcorpustxt())
        with smart_open(self.fcorpustxt(), 'wb') as fout:
            self.corpus2mallet(corpus, fout)

    # convert the text file above into MALLET's internal format
    cmd = \
        self.mallet_path + \
        " import-file --preserve-case --keep-sequence " \
        "--remove-stopwords --token-regex \"\\S+\" --input %s --output %s"
    if infer:
        cmd += ' --use-pipe-from ' + self.fcorpusmallet()
        cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet() + '.infer')
    else:
        cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet())
    logger.info("converting temporary corpus to MALLET format with %s", cmd)
    check_output(args=cmd, shell=True)

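The `corpus2mallet` helper called above is not included in this excerpt; a sketch consistent with the line format written out by the older `convert_input` further below (document id, a dummy label, then the utf8-encoded tokens) could look like this:

def corpus2mallet(self, corpus, file_like):
    """Hedged sketch, not necessarily the library's verbatim implementation:
    write `corpus` to `file_like` as one document per line, MALLET-style:
    document id[SPACE]label (not used)[SPACE]whitespace-delimited utf8-encoded tokens.
    """
    for docno, doc in enumerate(corpus):
        if self.id2word:
            tokens = sum(([self.id2word[tokenid]] * int(cnt) for tokenid, cnt in doc), [])
        else:
            tokens = sum(([str(tokenid)] * int(cnt) for tokenid, cnt in doc), [])
        file_like.write(utils.to_utf8("%s 0 %s\n" % (docno, ' '.join(tokens))))
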
def train(self, corpus):
    """Train Mallet LDA.

    Parameters
    ----------
    corpus : iterable of iterable of (int, int)
        Corpus in BoW format.

    """
    self.convert_input(corpus, infer=False)
    cmd = self.mallet_path + ' train-topics --input %s --num-topics %s --alpha %s --optimize-interval %s ' \
        '--num-threads %s --output-state %s --output-doc-topics %s --output-topic-keys %s ' \
        '--num-iterations %s --inferencer-filename %s --doc-topics-threshold %s'
    cmd = cmd % (
        self.fcorpusmallet(), self.num_topics, self.alpha, self.optimize_interval,
        self.workers, self.fstate(), self.fdoctopics(), self.ftopickeys(), self.iterations,
        self.finferencer(), self.topic_threshold
    )
    # NOTE "--keep-sequence-bigrams" / "--use-ngrams true" poorer results + runs out of memory
    logger.info("training MALLET LDA with %s", cmd)
    check_output(args=cmd, shell=True)
    self.word_topics = self.load_word_topics()
    # NOTE - we are still keeping the wordtopics variable to not break backward compatibility.
    # word_topics has replaced wordtopics throughout the code;
    # wordtopics just stores the values of word_topics when train is called.
    self.wordtopics = self.word_topics

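An end-to-end training sketch for this wrapper, assuming a local MALLET installation; the binary path and the toy corpus are illustrative assumptions:

# Hedged usage sketch: the MALLET path and the toy texts are illustrative assumptions.
from gensim.corpora import Dictionary
from gensim.models.wrappers import LdaMallet

texts = [
    ["human", "interface", "computer"],
    ["survey", "user", "computer", "system", "response", "time"],
    ["graph", "trees", "minors"],
]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

mallet_path = '/path/to/mallet/bin/mallet'  # assumed location of the MALLET binary
lda = LdaMallet(mallet_path, corpus=corpus, num_topics=2, id2word=dictionary, iterations=100)
print(lda.show_topics(num_topics=2, num_words=5))
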
def convert_input(self, corpus, infer=False):
    """
    Serialize documents (lists of unicode tokens) to a temporary text file,
    then convert that text file to MALLET format `outfile`.

    """
    logger.info("serializing temporary corpus to %s" % self.fcorpustxt())
    # write out the corpus in a file format that MALLET understands: one document per line:
    # document id[SPACE]label (not used)[SPACE]whitespace delimited utf8-encoded tokens
    with utils.smart_open(self.fcorpustxt(), 'wb') as fout:
        for docno, doc in enumerate(corpus):
            if self.id2word:
                tokens = sum(([self.id2word[tokenid]] * int(cnt) for tokenid, cnt in doc), [])
            else:
                tokens = sum(([str(tokenid)] * int(cnt) for tokenid, cnt in doc), [])
            fout.write(utils.to_utf8("%s 0 %s\n" % (docno, ' '.join(tokens))))

    # convert the text file above into MALLET's internal format
    cmd = \
        self.mallet_path + \
        " import-file --preserve-case --keep-sequence --remove-stopwords " \
        "--token-regex '\\S+' --input %s --output %s"
    if infer:
        cmd += ' --use-pipe-from ' + self.fcorpusmallet()
        cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet() + '.infer')
    else:
        cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet())
    logger.info("converting temporary corpus to MALLET format with %s" % cmd)
    check_output(cmd, shell=True)

def train(self, corpus, time_slices, mode, model):
    """
    Train DTM model using the specified corpus and time slices.

    """
    self.convert_input(corpus, time_slices)

    arguments = \
        "--ntopics={p0} --model={mofrl} --mode={p1} --initialize_lda={p2} --corpus_prefix={p3} " \
        "--outname={p4} --alpha={p5}".format(
            p0=self.num_topics, mofrl=model, p1=mode, p2=self.initialize_lda,
            p3=self.fcorpus(), p4=self.foutname(), p5=self.alpha)

    params = \
        "--lda_max_em_iter={p0} --lda_sequence_min_iter={p1} --lda_sequence_max_iter={p2} " \
        "--top_chain_var={p3} --rng_seed={p4} ".format(
            p0=self.lda_max_em_iter, p1=self.lda_sequence_min_iter, p2=self.lda_sequence_max_iter,
            p3=self.top_chain_var, p4=self.rng_seed)

    arguments = arguments + " " + params
    logger.info("training DTM with args %s" % arguments)

    cmd = [self.dtm_path] + arguments.split()
    logger.info("Running command %s" % cmd)
    check_output(cmd, stderr=PIPE)

    self.em_steps = np.loadtxt(self.fem_steps())
    self.init_ss = np.loadtxt(self.flda_ss())

    if self.initialize_lda:
        self.init_alpha = np.loadtxt(self.finit_alpha())
        self.init_beta = np.loadtxt(self.finit_beta())

    self.lhood_ = np.loadtxt(self.fout_liklihoods())

    # document-topic proportions
    self.gamma_ = np.loadtxt(self.fout_gamma())
    # cast to correct shape: gamma[5, 10] is the proportion of the 10th topic in doc 5
    self.gamma_.shape = (self.lencorpus, self.num_topics)
    # normalize proportions
    self.gamma_ /= self.gamma_.sum(axis=1)[:, np.newaxis]

    self.lambda_ = np.zeros((self.num_topics, self.num_terms * len(self.time_slices)))
    self.obs_ = np.zeros((self.num_topics, self.num_terms * len(self.time_slices)))

    for t in range(self.num_topics):
        topic = "%03d" % t
        self.lambda_[t, :] = np.loadtxt(self.fout_prob().format(i=topic))
        self.obs_[t, :] = np.loadtxt(self.fout_observations().format(i=topic))
    # cast to correct shape: lambda[5, 10, 0] is the weight of the 10th term in topic 5 at time 0
    self.lambda_.shape = (self.num_topics, self.num_terms, len(self.time_slices))
    self.obs_.shape = (self.num_topics, self.num_terms, len(self.time_slices))

    # extract document influence on topics for each time slice
    # influences_time[0] holds the influences at time 0
    if model == 'fixed':
        for k, t in enumerate(self.time_slices):
            stamp = "%03d" % k
            influence = np.loadtxt(self.fout_influence().format(i=stamp))
            # influence[2, 5]: influence of document 2 on topic 5
            influence.shape = (t, self.num_topics)
            self.influences_time.append(influence)

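A brief usage sketch for this wrapper, assuming a locally compiled DTM binary; the binary path, the toy corpus, and the time slices are illustrative assumptions:

# Hedged usage sketch: dtm_path and the toy data are illustrative assumptions.
from gensim.corpora import Dictionary
from gensim.models.wrappers import DtmModel

texts = [
    ["bank", "river", "shore"],
    ["bank", "money", "loan"],
    ["money", "loan", "interest"],
    ["river", "water", "flow"],
]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
time_slices = [2, 2]  # first two documents in slice 0, last two in slice 1

dtm_path = '/path/to/dtm/dtm-linux64'  # assumed location of the compiled DTM binary
model = DtmModel(dtm_path, corpus, time_slices, num_topics=2, id2word=dictionary)
print(model.gamma_[0])  # topic proportions of document 0, filled in by train()
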
def train(self, corpus):
    self.convert_input(corpus, infer=False)
    cmd = self.mallet_path + " train-topics --input %s --num-topics %s --optimize-interval %s " \
        "--num-threads %s --output-state %s --output-doc-topics %s --output-topic-keys %s " \
        "--num-iterations %s --inferencer-filename %s"
    cmd = cmd % (
        self.fcorpusmallet(), self.num_topics, self.optimize_interval, self.workers,
        self.fstate(), self.fdoctopics(), self.ftopickeys(), self.iterations, self.finferencer()
    )
    # NOTE "--keep-sequence-bigrams" / "--use-ngrams true" poorer results + runs out of memory
    logger.info("training MALLET LDA with %s" % cmd)
    check_output(cmd, shell=True)
    self.word_topics = self.load_word_topics()

def __getitem__(self, bow, iterations=100):
    is_corpus, corpus = utils.is_corpus(bow)
    if not is_corpus:
        # query is a single document => make a corpus out of it
        bow = [bow]

    self.convert_input(bow, infer=True)
    cmd = self.mallet_path + ' infer-topics --input %s --inferencer %s ' \
        '--output-doc-topics %s --num-iterations %s --doc-topics-threshold %s'
    cmd = cmd % (
        self.fcorpusmallet() + '.infer', self.finferencer(),
        self.fdoctopics() + '.infer', iterations, self.topic_threshold
    )
    logger.info("inferring topics with MALLET LDA '%s'", cmd)
    check_output(args=cmd, shell=True)
    result = list(self.read_doctopics(self.fdoctopics() + '.infer'))
    return result if is_corpus else result[0]

def testConversion(self):
    check_output(args=[
        sys.executable, '-m', 'gensim.scripts.glove2word2vec',
        '--input', self.datapath, '--output', self.output_file
    ])
    # test that the converted model loads successfully
    try:
        self.test_model = gensim.models.KeyedVectors.load_word2vec_format(self.output_file)
        self.assertTrue(numpy.allclose(self.test_model.n_similarity(['the', 'and'], ['and', 'the']), 1.0))
    except Exception:
        if os.path.isfile(os.path.join(self.output_file)):
            self.fail('model file %s was created but could not be loaded.' % self.output_file)
        else:
            self.fail(
                'model file %s creation failed, check the parameters and input file format.' % self.output_file
            )

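The same conversion the test drives through the command line can also be sketched programmatically; the file names here are illustrative assumptions:

# Hedged sketch: input/output paths are illustrative assumptions.
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

glove_file = 'glove.6B.50d.txt'          # assumed GloVe vectors file
word2vec_file = 'glove.6B.50d.w2v.txt'   # output in word2vec text format

glove2word2vec(glove_file, word2vec_file)
vectors = KeyedVectors.load_word2vec_format(word2vec_file)
print(vectors.n_similarity(['the', 'and'], ['and', 'the']))  # expected to be ~1.0
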
def convert_input(self, corpus, infer=False, serialize_corpus=True):
    """
    Serialize documents (lists of unicode tokens) to a temporary text file,
    then convert that text file to MALLET format `outfile`.

    """
    if serialize_corpus:
        logger.info("serializing temporary corpus to %s", self.fcorpustxt())
        with smart_open(self.fcorpustxt(), 'wb') as fout:
            self.corpus2mallet(corpus, fout)

    # convert the text file above into MALLET's internal format
    cmd = \
        self.mallet_path + \
        ' import-file --preserve-case --keep-sequence --remove-stopwords ' \
        '--token-regex "\\S+" --input %s --output %s'
    if infer:
        cmd += ' --use-pipe-from ' + self.fcorpusmallet()
        cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet() + '.infer')
    else:
        cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet())
    logger.info("converting temporary corpus to MALLET format with %s", cmd)
    check_output(args=cmd, shell=True)

def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1, min_count=5, max_vocab_size=0,
          sgd_num=100, lrate=0.001, period=10, iter=90, epsilon=0.75, dump_period=10, reg=0, alpha=100,
          beta=99, loss='hinge', memory=4.0, np=1, cleanup_files=False, sorted_vocab=1, ensemble=0):
    """
    The word and context embedding files are generated by the wordrank binary and saved in the
    `out_name` directory, which is created inside the wordrank directory. The vocab and cooccurrence
    files are generated using the glove code available inside the wordrank directory; these files are
    used by the wordrank binary for training.

    `wr_path` is the absolute path to the Wordrank directory.
    `corpus_file` is the filename of the text file to be used for training the Wordrank model.
    Expects the file to contain space-separated tokens in a single line.
    `out_name` is the name of the directory which will be created (in the wordrank folder) to save
    embeddings and training data. It will contain the following contents:

        Word embeddings saved after every dump_period, stored in a file model_word_current_<iter>.txt
        Context embeddings saved after every dump_period, stored in a file model_context_current_<iter>.txt
        A meta directory which contains: 'vocab.txt' - vocab words, 'wiki.toy' - word-word cooccurrence
        values, 'meta' - vocab and cooccurrence lengths

    `size` is the dimensionality of the feature vectors.
    `window` is the number of context words to the left (and to the right, if symmetric = 1).
    `symmetric` if 0, only use left context words, else use both left and right.
    `min_count` = ignore all words with total frequency lower than this.
    `max_vocab_size` upper bound on vocabulary size, i.e. keep the <int> most frequent words. Default is 0 for no limit.
    `sgd_num` number of SGD steps taken for each data point.
    `lrate` is the learning rate (too high diverges, gives NaN).
    `period` is the period of xi variable updates.
    `iter` = number of iterations (epochs) over the corpus.
    `epsilon` is the power scaling value for the weighting function.
    `dump_period` is the period after which embeddings should be dumped.
    `reg` is the value of the regularization parameter.
    `alpha` is the alpha parameter of the gamma distribution.
    `beta` is the beta parameter of the gamma distribution.
    `loss` = name of the loss (logistic, hinge).
    `memory` = soft limit for memory consumption, in GB.
    `np` = number of processes to launch (mpirun option).
    `cleanup_files` if True, delete the directory and files used by this wrapper; setting to False can be useful for debugging.
    `sorted_vocab` = if 1 (default), sort the vocabulary by descending frequency before assigning word indexes.
    `ensemble` = if 1, use an ensemble of word and context vectors; default is 0.
    """
    # prepare training data (cooccurrence matrix and vocab)
    model_dir = os.path.join(wr_path, out_name)
    meta_dir = os.path.join(model_dir, 'meta')
    os.makedirs(meta_dir)
    logger.info("Dumped data will be stored in '%s'", model_dir)
    copyfile(corpus_file, os.path.join(meta_dir, corpus_file.split('/')[-1]))

    vocab_file = os.path.join(meta_dir, 'vocab.txt')
    temp_vocab_file = os.path.join(meta_dir, 'tempvocab.txt')
    cooccurrence_file = os.path.join(meta_dir, 'cooccurrence')
    cooccurrence_shuf_file = os.path.join(meta_dir, 'wiki.toy')
    meta_file = os.path.join(meta_dir, 'meta')

    cmd_vocab_count = [
        os.path.join(wr_path, 'glove', 'vocab_count'),
        '-min-count', str(min_count), '-max-vocab', str(max_vocab_size)
    ]
    cmd_cooccurence_count = [
        os.path.join(wr_path, 'glove', 'cooccur'), '-memory', str(memory),
        '-vocab-file', temp_vocab_file, '-window-size', str(window), '-symmetric', str(symmetric)
    ]
    cmd_shuffle_cooccurences = [os.path.join(wr_path, 'glove', 'shuffle'), '-memory', str(memory)]
    cmd_del_vocab_freq = ['cut', '-d', " ", '-f', '1', temp_vocab_file]

    commands = [cmd_vocab_count, cmd_cooccurence_count, cmd_shuffle_cooccurences]
    input_fnames = [
        os.path.join(meta_dir, os.path.split(corpus_file)[-1]),
        os.path.join(meta_dir, os.path.split(corpus_file)[-1]),
        cooccurrence_file
    ]
    output_fnames = [temp_vocab_file, cooccurrence_file, cooccurrence_shuf_file]

    logger.info("Prepare training data (%s) using glove code", ", ".join(input_fnames))
    for command, input_fname, output_fname in zip(commands, input_fnames, output_fnames):
        with smart_open(input_fname, 'rb') as r:
            with smart_open(output_fname, 'wb') as w:
                utils.check_output(w, args=command, stdin=r)

    logger.info("Deleting frequencies from vocab file")
    with smart_open(vocab_file, 'wb') as w:
        utils.check_output(w, args=cmd_del_vocab_freq)

    with smart_open(vocab_file, 'rb') as f:
        numwords = sum(1 for _ in f)
    with smart_open(cooccurrence_shuf_file, 'rb') as f:
        numlines = sum(1 for _ in f)
    with smart_open(meta_file, 'wb') as f:
        meta_info = "{0} {1}\n{2} {3}\n{4} {5}".format(
            numwords, numwords, numlines, cooccurrence_shuf_file.split('/')[-1],
            numwords, vocab_file.split('/')[-1]
        )
        f.write(meta_info.encode('utf-8'))

    if iter % dump_period == 0:
        iter += 1
    else:
        logger.warning(
            "Resultant embedding will be from %d iterations rather than the input %d iterations, "
            "as wordrank dumps the embedding only at dump_period intervals. "
            "Input an appropriate combination of parameters (iter, dump_period) "
            "such that \"iter mod dump_period\" is zero.",
            iter - (iter % dump_period), iter
        )

    wr_args = {
        'path': meta_dir,
        'nthread': multiprocessing.cpu_count(),
        'sgd_num': sgd_num,
        'lrate': lrate,
        'period': period,
        'iter': iter,
        'epsilon': epsilon,
        'dump_prefix': 'model',
        'dump_period': dump_period,
        'dim': size,
        'reg': reg,
        'alpha': alpha,
        'beta': beta,
        'loss': loss
    }

    # run wordrank executable with wr_args
    cmd = ['mpirun', '-np', str(np), os.path.join(wr_path, 'wordrank')]
    for option, value in wr_args.items():
        cmd.append('--%s' % option)
        cmd.append(str(value))
    logger.info("Running wordrank binary")
    utils.check_output(args=cmd)

    # use embeddings from max. iteration's dump
    max_iter_dump = iter - (iter % dump_period)
    os.rename('model_word_%d.txt' % max_iter_dump, os.path.join(model_dir, 'wordrank.words'))
    os.rename('model_context_%d.txt' % max_iter_dump, os.path.join(model_dir, 'wordrank.contexts'))
    model = cls.load_wordrank_model(
        os.path.join(model_dir, 'wordrank.words'), vocab_file,
        os.path.join(model_dir, 'wordrank.contexts'), sorted_vocab, ensemble
    )

    if cleanup_files:
        rmtree(model_dir)
    return model

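A usage sketch for this wrapper, assuming a compiled WordRank installation with MPI available; the paths and parameter values are illustrative assumptions:

# Hedged usage sketch: wr_path and corpus_file are assumed local paths.
from gensim.models.wrappers import Wordrank

wr_path = '/path/to/wordrank'        # assumed WordRank installation directory
corpus_file = '/path/to/corpus.txt'  # space-separated tokens

model = Wordrank.train(wr_path, corpus_file, out_name='wr_model', iter=20, dump_period=10, np=4)
print(model.most_similar('king', topn=5))  # assumes the returned model exposes a KeyedVectors-style API
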
def train(cls, wr_path, corpus_file, out_path, size=100, window=15, symmetric=1, min_count=5, max_vocab_size=0,
          sgd_num=100, lrate=0.001, period=10, iter=91, epsilon=0.75, dump_period=10, reg=0, alpha=100,
          beta=99, loss='hinge', memory=4.0, cleanup_files=True, sorted_vocab=1, ensemble=0):
    """
    `wr_path` is the path to the Wordrank directory.
    `corpus_file` is the filename of the text file to be used for training the Wordrank model.
    Expects the file to contain space-separated tokens in a single line.
    `out_path` is the path to the directory which will be created to save embeddings and training data.
    `size` is the dimensionality of the feature vectors.
    `window` is the number of context words to the left (and to the right, if symmetric = 1).
    `symmetric` if 0, only use left context words, else use both left and right.
    `min_count` = ignore all words with total frequency lower than this.
    `max_vocab_size` upper bound on vocabulary size, i.e. keep the <int> most frequent words. Default is 0 for no limit.
    `sgd_num` number of SGD steps taken for each data point.
    `lrate` is the learning rate (too high diverges, gives NaN).
    `period` is the period of xi variable updates.
    `iter` = number of iterations (epochs) over the corpus.
    `epsilon` is the power scaling value for the weighting function.
    `dump_period` is the period after which embeddings should be dumped.
    `reg` is the value of the regularization parameter.
    `alpha` is the alpha parameter of the gamma distribution.
    `beta` is the beta parameter of the gamma distribution.
    `loss` = name of the loss (logistic, hinge).
    `memory` = soft limit for memory consumption, in GB.
    `cleanup_files` if True, delete the directory and files used by this wrapper; setting to False can be useful for debugging.
    `sorted_vocab` = if 1 (default), sort the vocabulary by descending frequency before assigning word indexes.
    `ensemble` = if 1, use an ensemble of word and context vectors; default is 0.
    """
    meta_data_path = 'matrix.meta'
    vocab_file = 'vocab.txt'
    temp_vocab_file = 'tempvocab.txt'
    cooccurrence_file = 'cooccurrence'
    cooccurrence_shuf_file = 'wiki.toy'
    meta_file = 'meta'

    # prepare training data (cooccurrence matrix and vocab)
    model_dir = os.path.join(wr_path, out_path)
    meta_dir = os.path.join(model_dir, 'meta')
    os.makedirs(meta_dir)
    logger.info("Dumped data will be stored in '%s'", model_dir)
    copyfile(corpus_file, os.path.join(meta_dir, corpus_file.split('/')[-1]))
    os.chdir(meta_dir)

    cmd_vocab_count = ['../../glove/vocab_count', '-min-count', str(min_count), '-max-vocab', str(max_vocab_size)]
    cmd_cooccurence_count = [
        '../../glove/cooccur', '-memory', str(memory), '-vocab-file', temp_vocab_file,
        '-window-size', str(window), '-symmetric', str(symmetric)
    ]
    cmd_shuffle_cooccurences = ['../../glove/shuffle', '-memory', str(memory)]
    cmd_del_vocab_freq = ['cut', '-d', " ", '-f', '1', temp_vocab_file]

    commands = [cmd_vocab_count, cmd_cooccurence_count, cmd_shuffle_cooccurences]
    logger.info("Prepare training data using glove code '%s'", commands)
    input_fnames = [corpus_file.split('/')[-1], corpus_file.split('/')[-1], cooccurrence_file]
    output_fnames = [temp_vocab_file, cooccurrence_file, cooccurrence_shuf_file]

    for command, input_fname, output_fname in zip(commands, input_fnames, output_fnames):
        with smart_open(input_fname, 'rb') as r:
            with smart_open(output_fname, 'wb') as w:
                utils.check_output(w, args=command, stdin=r)
    with smart_open(vocab_file, 'wb') as w:
        utils.check_output(w, args=cmd_del_vocab_freq)

    with smart_open(vocab_file, 'rb') as f:
        numwords = sum(1 for line in f)
    with smart_open(cooccurrence_shuf_file, 'rb') as f:
        numlines = sum(1 for line in f)
    with smart_open(meta_file, 'wb') as f:
        meta_info = "{0} {1}\n{2} {3}\n{4} {5}".format(
            numwords, numwords, numlines, cooccurrence_shuf_file, numwords, vocab_file
        )
        f.write(meta_info.encode('utf-8'))

    wr_args = {
        'path': 'meta',
        'nthread': multiprocessing.cpu_count(),
        'sgd_num': sgd_num,
        'lrate': lrate,
        'period': period,
        'iter': iter,
        'epsilon': epsilon,
        'dump_prefix': 'model',
        'dump_period': dump_period,
        'dim': size,
        'reg': reg,
        'alpha': alpha,
        'beta': beta,
        'loss': loss
    }

    os.chdir('..')
    # run wordrank executable with wr_args
    cmd = ['mpirun', '-np', '1', '../wordrank']
    for option, value in wr_args.items():
        cmd.append("--%s" % option)
        cmd.append(str(value))
    logger.info("Running wordrank binary '%s'", cmd)
    output = utils.check_output(args=cmd)

    # use embeddings from max. iteration's dump
    max_iter_dump = iter // dump_period * dump_period - 1
    copyfile('model_word_%d.txt' % max_iter_dump, 'wordrank.words')
    copyfile('model_context_%d.txt' % max_iter_dump, 'wordrank.contexts')
    model = cls.load_wordrank_model('wordrank.words', os.path.join('meta', vocab_file), 'wordrank.contexts', sorted_vocab, ensemble)
    os.chdir('../..')

    if cleanup_files:
        rmtree(model_dir)
    return model

def train(cls, ft_path, corpus_file, output_file=None, model='cbow', size=100, alpha=0.025, window=5,
          min_count=5, word_ngrams=1, loss='ns', sample=1e-3, negative=5, iter=5, min_n=3, max_n=6,
          sorted_vocab=1, threads=12):
    """
    `ft_path` is the path to the FastText executable, e.g. `/home/kofola/fastText/fasttext`.
    `corpus_file` is the filename of the text file to be used for training the FastText model.
    Expects the file to contain utf-8 encoded text.
    `model` defines the training algorithm. By default, cbow is used. Accepted values are 'cbow', 'skipgram'.
    `size` is the dimensionality of the feature vectors.
    `window` is the maximum distance between the current and predicted word within a sentence.
    `alpha` is the initial learning rate.
    `min_count` = ignore all words with total occurrences lower than this.
    `word_ngrams` = max length of word ngram.
    `loss` = defines the training objective. Allowed values are `hs` (hierarchical softmax),
    `ns` (negative sampling) and `softmax`. Defaults to `ns`.
    `sample` = threshold for configuring which higher-frequency words are randomly downsampled;
    default is 1e-3, useful range is (0, 1e-5).
    `negative` = specifies how many "noise words" should be drawn (usually between 5-20).
    Default is 5. If set to 0, no negative sampling is used. Only relevant when `loss` is set to `ns`.
    `iter` = number of iterations (epochs) over the corpus. Default is 5.
    `min_n` = min length of char ngrams to be used for training word representations. Default is 3.
    `max_n` = max length of char ngrams to be used for training word representations.
    Set `max_n` to be less than `min_n` to avoid char ngrams being used. Default is 6.
    `sorted_vocab` = if 1 (default), sort the vocabulary by descending frequency before assigning word indexes.
    `threads` = number of threads to use. Default is 12.
    """
    output_file = output_file or os.path.join(tempfile.gettempdir(), 'ft_model')
    ft_args = {
        'input': corpus_file,
        'output': output_file,
        'lr': alpha,
        'dim': size,
        'ws': window,
        'epoch': iter,
        'minCount': min_count,
        'wordNgrams': word_ngrams,
        'neg': negative,
        'loss': loss,
        'minn': min_n,
        'maxn': max_n,
        'thread': threads,
        't': sample
    }
    cmd = [ft_path, model]
    for option, value in ft_args.items():
        cmd.append("-%s" % option)
        cmd.append(str(value))

    output = utils.check_output(args=cmd)
    model = cls.load_fasttext_format(output_file)
    cls.delete_training_files(output_file)
    return model

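A usage sketch for this wrapper, assuming a compiled fastText binary; the paths are illustrative assumptions:

# Hedged usage sketch: ft_path and corpus_file are illustrative assumptions.
from gensim.models.wrappers import FastText

ft_path = '/home/user/fastText/fasttext'  # assumed path to the compiled fastText binary
corpus_file = '/path/to/corpus.txt'       # plain utf-8 text

model = FastText.train(ft_path, corpus_file, model='skipgram', size=100, iter=5)
print(model.most_similar('computer', topn=3))  # similarity uses the subword information learned by fastText
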
def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1, min_count=5, max_vocab_size=0,
          sgd_num=100, lrate=0.001, period=10, iter=90, epsilon=0.75, dump_period=10, reg=0, alpha=100,
          beta=99, loss='hinge', memory=4.0, np=1, cleanup_files=False, sorted_vocab=1, ensemble=0):
    """Train model.

    Parameters
    ----------
    wr_path : str
        Absolute path to the Wordrank directory.
    corpus_file : str
        Path to corpus file, expected to contain space-separated tokens in each line.
    out_name : str
        Name of the directory which will be created (in the wordrank folder) to save embeddings and training data:

            * ``model_word_current_<iter>.txt`` - Word Embeddings saved after every dump_period.
            * ``model_context_current_<iter>.txt`` - Context Embeddings saved after every dump_period.
            * ``meta/vocab.txt`` - vocab file.
            * ``meta/wiki.toy`` - word-word cooccurrence values.

    size : int, optional
        Dimensionality of the feature vectors.
    window : int, optional
        Number of context words to the left (and to the right, if `symmetric = 1`).
    symmetric : {0, 1}, optional
        If 1 - use symmetric windows, if 0 - use only left context words.
    min_count : int, optional
        Ignore all words with total frequency lower than `min_count`.
    max_vocab_size : int, optional
        Upper bound on vocabulary size, i.e. keep the <int> most frequent words. If 0 - no limit.
    sgd_num : int, optional
        Number of SGD steps taken for each data point.
    lrate : float, optional
        Learning rate (attention: too high diverges, gives NaN).
    period : int, optional
        Period of xi variable updates.
    iter : int, optional
        Number of iterations (epochs) over the corpus.
    epsilon : float, optional
        Power scaling value for the weighting function.
    dump_period : int, optional
        Period after which embeddings should be dumped.
    reg : int, optional
        Value of the regularization parameter.
    alpha : int, optional
        Alpha parameter of the gamma distribution.
    beta : int, optional
        Beta parameter of the gamma distribution.
    loss : {"logistic", "hinge"}, optional
        Name of the loss function.
    memory : float, optional
        Soft limit for memory consumption, in GB.
    np : int, optional
        Number of processes to execute (mpirun option).
    cleanup_files : bool, optional
        If True, delete the directory and files used by this wrapper.
    sorted_vocab : {0, 1}, optional
        If 1 - sort the vocabulary by descending frequency before assigning word indexes, otherwise - do nothing.
    ensemble : {0, 1}, optional
        If 1 - use ensemble of word and context vectors.

    """
    # prepare training data (cooccurrence matrix and vocab)
    model_dir = os.path.join(wr_path, out_name)
    meta_dir = os.path.join(model_dir, 'meta')
    os.makedirs(meta_dir)
    logger.info("Dumped data will be stored in '%s'", model_dir)
    copyfile(corpus_file, os.path.join(meta_dir, corpus_file.split('/')[-1]))

    vocab_file = os.path.join(meta_dir, 'vocab.txt')
    temp_vocab_file = os.path.join(meta_dir, 'tempvocab.txt')
    cooccurrence_file = os.path.join(meta_dir, 'cooccurrence')
    cooccurrence_shuf_file = os.path.join(meta_dir, 'wiki.toy')
    meta_file = os.path.join(meta_dir, 'meta')

    cmd_vocab_count = [
        os.path.join(wr_path, 'glove', 'vocab_count'),
        '-min-count', str(min_count), '-max-vocab', str(max_vocab_size)
    ]
    cmd_cooccurence_count = [
        os.path.join(wr_path, 'glove', 'cooccur'), '-memory', str(memory),
        '-vocab-file', temp_vocab_file, '-window-size', str(window), '-symmetric', str(symmetric)
    ]
    cmd_shuffle_cooccurences = [os.path.join(wr_path, 'glove', 'shuffle'), '-memory', str(memory)]
    cmd_del_vocab_freq = ['cut', '-d', " ", '-f', '1', temp_vocab_file]

    commands = [cmd_vocab_count, cmd_cooccurence_count, cmd_shuffle_cooccurences]
    input_fnames = [
        os.path.join(meta_dir, os.path.split(corpus_file)[-1]),
        os.path.join(meta_dir, os.path.split(corpus_file)[-1]),
        cooccurrence_file
    ]
    output_fnames = [temp_vocab_file, cooccurrence_file, cooccurrence_shuf_file]

    logger.info("Prepare training data (%s) using glove code", ", ".join(input_fnames))
    for command, input_fname, output_fname in zip(commands, input_fnames, output_fnames):
        with smart_open(input_fname, 'rb') as r:
            with smart_open(output_fname, 'wb') as w:
                utils.check_output(w, args=command, stdin=r)

    logger.info("Deleting frequencies from vocab file")
    with smart_open(vocab_file, 'wb') as w:
        utils.check_output(w, args=cmd_del_vocab_freq)

    with smart_open(vocab_file, 'rb') as f:
        numwords = sum(1 for _ in f)
    with smart_open(cooccurrence_shuf_file, 'rb') as f:
        numlines = sum(1 for _ in f)
    with smart_open(meta_file, 'wb') as f:
        meta_info = "{0} {1}\n{2} {3}\n{4} {5}".format(
            numwords, numwords, numlines, cooccurrence_shuf_file.split('/')[-1],
            numwords, vocab_file.split('/')[-1]
        )
        f.write(meta_info.encode('utf-8'))

    if iter % dump_period == 0:
        iter += 1
    else:
        logger.warning(
            "Resultant embedding will be from %d iterations rather than the input %d iterations, "
            "as wordrank dumps the embedding only at dump_period intervals. "
            "Input an appropriate combination of parameters (iter, dump_period) "
            "such that \"iter mod dump_period\" is zero.",
            iter - (iter % dump_period), iter
        )

    wr_args = {
        'path': meta_dir,
        'nthread': multiprocessing.cpu_count(),
        'sgd_num': sgd_num,
        'lrate': lrate,
        'period': period,
        'iter': iter,
        'epsilon': epsilon,
        'dump_prefix': 'model',
        'dump_period': dump_period,
        'dim': size,
        'reg': reg,
        'alpha': alpha,
        'beta': beta,
        'loss': loss
    }

    # run wordrank executable with wr_args
    cmd = ['mpirun', '-np', str(np), os.path.join(wr_path, 'wordrank')]
    for option, value in wr_args.items():
        cmd.append('--%s' % option)
        cmd.append(str(value))
    logger.info("Running wordrank binary")
    utils.check_output(args=cmd)

    # use embeddings from max. iteration's dump
    max_iter_dump = iter - (iter % dump_period)
    os.rename('model_word_%d.txt' % max_iter_dump, os.path.join(model_dir, 'wordrank.words'))
    os.rename('model_context_%d.txt' % max_iter_dump, os.path.join(model_dir, 'wordrank.contexts'))
    model = cls.load_wordrank_model(
        os.path.join(model_dir, 'wordrank.words'), vocab_file,
        os.path.join(model_dir, 'wordrank.contexts'), sorted_vocab, ensemble
    )

    if cleanup_files:
        rmtree(model_dir)
    return model

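The iteration bookkeeping above (bumping `iter` when it is a multiple of `dump_period`, then picking the latest dump) is easy to misread; a small standalone sketch of the same rule, with an illustrative helper name, shows which dump ends up being loaded:

def effective_dump_iteration(iterations, dump_period):
    # Mirrors the logic above: if `iterations` is a multiple of `dump_period`,
    # the wrapper bumps it by one so a dump exists at the final step; otherwise
    # the last dump written before `iterations` is the one that gets loaded.
    if iterations % dump_period == 0:
        iterations += 1
    return iterations - (iterations % dump_period)

print(effective_dump_iteration(90, 10))  # 90 (iter bumped to 91, dump taken at 90)
print(effective_dump_iteration(93, 10))  # 90 (embeddings come from iteration 90, not 93)
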
def train(self, corpus, time_slices, mode, model):
    """Train DTM model.

    Parameters
    ----------
    corpus : iterable of iterable of (int, int)
        Collection of texts in BoW format.
    time_slices : list of int
        Sequence of timestamps.
    mode : {'fit', 'time'}
        Controls the mode of the model: 'fit' is for training, 'time' for analyzing documents
        through time according to a DTM, basically a held-out set.
    model : {'fixed', 'dtm'}
        Controls the model that will be run: 'fixed' is for DIM and 'dtm' for DTM.

    """
    self.convert_input(corpus, time_slices)

    arguments = \
        "--ntopics={p0} --model={mofrl} --mode={p1} --initialize_lda={p2} --corpus_prefix={p3} " \
        "--outname={p4} --alpha={p5}".format(
            p0=self.num_topics, mofrl=model, p1=mode, p2=self.initialize_lda,
            p3=self.fcorpus(), p4=self.foutname(), p5=self.alpha
        )

    params = \
        "--lda_max_em_iter={p0} --lda_sequence_min_iter={p1} --lda_sequence_max_iter={p2} " \
        "--top_chain_var={p3} --rng_seed={p4} ".format(
            p0=self.lda_max_em_iter, p1=self.lda_sequence_min_iter, p2=self.lda_sequence_max_iter,
            p3=self.top_chain_var, p4=self.rng_seed
        )

    arguments = arguments + " " + params
    logger.info("training DTM with args %s", arguments)

    cmd = [self.dtm_path] + arguments.split()
    logger.info("Running command %s", cmd)
    check_output(args=cmd, stderr=PIPE)

    self.em_steps = np.loadtxt(self.fem_steps())
    self.init_ss = np.loadtxt(self.flda_ss())

    if self.initialize_lda:
        self.init_alpha = np.loadtxt(self.finit_alpha())
        self.init_beta = np.loadtxt(self.finit_beta())

    self.lhood_ = np.loadtxt(self.fout_liklihoods())

    # document-topic proportions
    self.gamma_ = np.loadtxt(self.fout_gamma())
    # cast to correct shape: gamma[5, 10] is the proportion of the 10th topic in doc 5
    self.gamma_.shape = (self.lencorpus, self.num_topics)
    # normalize proportions
    self.gamma_ /= self.gamma_.sum(axis=1)[:, np.newaxis]

    self.lambda_ = np.zeros((self.num_topics, self.num_terms * len(self.time_slices)))
    self.obs_ = np.zeros((self.num_topics, self.num_terms * len(self.time_slices)))

    for t in range(self.num_topics):
        topic = "%03d" % t
        self.lambda_[t, :] = np.loadtxt(self.fout_prob().format(i=topic))
        self.obs_[t, :] = np.loadtxt(self.fout_observations().format(i=topic))
    # cast to correct shape: lambda[5, 10, 0] is the weight of the 10th term in topic 5 at time 0
    self.lambda_.shape = (self.num_topics, self.num_terms, len(self.time_slices))
    self.obs_.shape = (self.num_topics, self.num_terms, len(self.time_slices))

    # extract document influence on topics for each time slice
    # influences_time[0] holds the influences at time 0
    if model == 'fixed':
        for k, t in enumerate(self.time_slices):
            stamp = "%03d" % k
            influence = np.loadtxt(self.fout_influence().format(i=stamp))
            # influence[2, 5]: influence of document 2 on topic 5
            influence.shape = (t, self.num_topics)
            self.influences_time.append(influence)

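As a hedged sketch of how the arrays filled in above can be read back, assuming `model` is a trained wrapper instance with an `id2word` mapping (the variable names are illustrative):

import numpy as np

# gamma_[d, k]: proportion of topic k in document d (normalized above)
print(model.gamma_[0])

# lambda_[k, w, t]: weight of term w in topic k at time slice t
topic, time = 0, 0
top_terms = np.argsort(model.lambda_[topic, :, time])[::-1][:10]
print([model.id2word[term_id] for term_id in top_terms])
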