Example #1
    def train(self, corpus):
        """Train Mallet LDA.

        Parameters
        ----------
        corpus : iterable of iterable of (int, int)
            Corpus in BoW format

        """
        self.convert_input(corpus, infer=False)
        cmd = self.mallet_path + ' train-topics --input %s --num-topics %s  --alpha %s --optimize-interval %s '\
            '--num-threads %s --output-state %s --output-doc-topics %s --output-topic-keys %s '\
            '--num-iterations %s --inferencer-filename %s --doc-topics-threshold %s'
        cmd = cmd % (
            self.fcorpusmallet(), self.num_topics, self.alpha, self.optimize_interval,
            self.workers, self.fstate(), self.fdoctopics(), self.ftopickeys(), self.iterations,
            self.finferencer(), self.topic_threshold
        )
        # NOTE "--keep-sequence-bigrams" / "--use-ngrams true" poorer results + runs out of memory
        logger.info("training MALLET LDA with %s", cmd)
        check_output(args=cmd, shell=True)
        self.word_topics = self.load_word_topics()
        # NOTE - we are still keeping the wordtopics variable to not break backward compatibility.
        # word_topics has replaced wordtopics throughout the code;
        # wordtopics just stores the values of word_topics when train is called.
        self.wordtopics = self.word_topics
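A minimal end-to-end usage sketch for the wrapper this method belongs to, assuming a pre-4.0 gensim release (where the class ships as gensim.models.wrappers.LdaMallet) and a hypothetical MALLET install path; passing a corpus to the constructor triggers train():

from gensim.corpora import Dictionary
from gensim.models.wrappers import LdaMallet  # available in gensim 3.x, removed in 4.0

texts = [["human", "machine", "interface"], ["graph", "trees", "survey"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

mallet_path = "/path/to/mallet-2.0.8/bin/mallet"  # hypothetical location of the MALLET launcher script
lda = LdaMallet(mallet_path, corpus=corpus, num_topics=2, id2word=dictionary, iterations=50)
print(lda.show_topics(num_topics=2, num_words=3))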
Example #2
    def convert_input(self, corpus, infer=False):
        """
        Serialize documents (lists of unicode tokens) to a temporary text file,
        then convert that text file to MALLET format `outfile`.

        """
        logger.info("serializing temporary corpus to %s" % self.fcorpustxt())
        # write out the corpus in a file format that MALLET understands: one document per line:
        # document id[SPACE]label (not used)[SPACE]whitespace delimited utf8-encoded tokens
        with utils.smart_open(self.fcorpustxt(), 'wb') as fout:
            for docno, doc in enumerate(corpus):
                if self.id2word:
                    tokens = sum(([self.id2word[tokenid]] * int(cnt)
                                  for tokenid, cnt in doc), [])
                else:
                    tokens = sum(([str(tokenid)] * int(cnt)
                                  for tokenid, cnt in doc), [])
                fout.write(
                    utils.to_utf8("%s 0 %s\n" % (docno, ' '.join(tokens))))

        # convert the text file above into MALLET's internal format
        cmd = self.mallet_path + " import-file --preserve-case --keep-sequence --remove-stopwords --token-regex '\\S+' --input %s --output %s"
        if infer:
            cmd += ' --use-pipe-from ' + self.fcorpusmallet()
            cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet() + '.infer')
        else:
            cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet())
        logger.info("converting temporary corpus to MALLET format with %s" %
                    cmd)
        check_output(cmd, shell=True)
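For reference, a standalone sketch of the one-document-per-line format written above (document id, a dummy label, then each token repeated according to its BoW count); the toy id2word mapping is illustrative only:

id2word = {0: "graph", 1: "trees"}
docno, doc = 0, [(0, 2), (1, 1)]  # BoW: token 0 appears twice, token 1 once

tokens = [id2word[tokenid] for tokenid, cnt in doc for _ in range(int(cnt))]
line = "%s 0 %s\n" % (docno, " ".join(tokens))
print(line, end="")  # -> "0 0 graph graph trees"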
Example #3
    def convert_input(self, corpus, infer=False, serialize_corpus=True):
        """Convert corpus to Mallet format and save it to a temporary text file.

        Parameters
        ----------
        corpus : iterable of iterable of (int, int)
            Collection of texts in BoW format.
        infer : bool, optional
            ...
        serialize_corpus : bool, optional
            ...

        """
        if serialize_corpus:
            logger.info("serializing temporary corpus to %s",
                        self.fcorpustxt())
            with smart_open(self.fcorpustxt(), 'wb') as fout:
                self.corpus2mallet(corpus, fout)

        # convert the text file above into MALLET's internal format
        cmd = \
            self.mallet_path + \
            " import-file --preserve-case --keep-sequence " \
            "--remove-stopwords --token-regex \"\\S+\" --input %s --output %s"
        if infer:
            cmd += ' --use-pipe-from ' + self.fcorpusmallet()
            cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet() + '.infer')
        else:
            cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet())
        logger.info("converting temporary corpus to MALLET format with %s",
                    cmd)
        check_output(args=cmd, shell=True)
Example #4
    def train(self, corpus):
        """Train Mallet LDA.

        Parameters
        ----------
        corpus : iterable of iterable of (int, int)
            Corpus in BoW format

        """
        self.convert_input(corpus, infer=False)
        cmd = self.mallet_path + ' train-topics --input %s --num-topics %s  --alpha %s --optimize-interval %s '\
            '--num-threads %s --output-state %s --output-doc-topics %s --output-topic-keys %s '\
            '--num-iterations %s --inferencer-filename %s --doc-topics-threshold %s'
        cmd = cmd % (self.fcorpusmallet(), self.num_topics, self.alpha,
                     self.optimize_interval, self.workers, self.fstate(),
                     self.fdoctopics(), self.ftopickeys(), self.iterations,
                     self.finferencer(), self.topic_threshold)
        # NOTE "--keep-sequence-bigrams" / "--use-ngrams true" poorer results + runs out of memory
        logger.info("training MALLET LDA with %s", cmd)
        check_output(args=cmd, shell=True)
        self.word_topics = self.load_word_topics()
        # NOTE - we are still keeping the wordtopics variable to not break backward compatibility.
        # word_topics has replaced wordtopics throughout the code;
        # wordtopics just stores the values of word_topics when train is called.
        self.wordtopics = self.word_topics
Example #5
    def convert_input(self, corpus, infer=False, serialize_corpus=True):
        """
        Serialize documents (lists of unicode tokens) to a temporary text file,
        then convert that text file to MALLET format `outfile`.

        """
        if serialize_corpus:
            logger.info("serializing temporary corpus to %s",
                        self.fcorpustxt())
            with smart_open(self.fcorpustxt(), 'wb') as fout:
                self.corpus2mallet(corpus, fout)

        # convert the text file above into MALLET's internal format
        cmd = \
            self.mallet_path + \
            " import-file --preserve-case --keep-sequence " \
            "--remove-stopwords --token-regex \"\\S+\" --input %s --output %s"
        if infer:
            cmd += ' --use-pipe-from ' + self.fcorpusmallet()
            cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet() + '.infer')
        else:
            cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet())
        logger.info("converting temporary corpus to MALLET format with %s",
                    cmd)
        check_output(args=cmd, shell=True)
Example #6
    def __getitem__(self, bow, iterations=100):
        """Get vector for document(s).

        Parameters
        ----------
        bow : {list of (int, int), iterable of list of (int, int)}
            Document (or corpus) in BoW format.
        iterations : int, optional
            Number of iterations that will be used for inferring.

        Returns
        -------
        list of (int, float)
            LDA vector for document as sequence of (topic_id, topic_probability) **OR**
        list of list of (int, float)
            LDA vectors for corpus in same format.

        """
        is_corpus, corpus = utils.is_corpus(bow)
        if not is_corpus:
            # query is a single document => make a corpus out of it
            bow = [bow]

        self.convert_input(bow, infer=True)
        cmd = \
            self.mallet_path + ' infer-topics --input %s --inferencer %s ' \
                               '--output-doc-topics %s --num-iterations %s --doc-topics-threshold %s'
        cmd = cmd % (self.fcorpusmallet() + '.infer', self.finferencer(),
                     self.fdoctopics() + '.infer', iterations,
                     self.topic_threshold)
        logger.info("inferring topics with MALLET LDA '%s'", cmd)
        check_output(args=cmd, shell=True)
        result = list(self.read_doctopics(self.fdoctopics() + '.infer'))
        return result if is_corpus else result[0]
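Continuing the sketch given after Example #1 (a trained LdaMallet instance named lda and its dictionary), inferring topics for unseen documents is plain indexing, which routes through this __getitem__:

new_bow = dictionary.doc2bow(["human", "graph", "survey"])

print(lda[new_bow])             # single document -> list of (topic_id, probability)
print(lda[[new_bow, new_bow]])  # iterable of documents -> one list per document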
Example #7
    def convert_input(self, corpus, infer=False, serialize_corpus=True):
        """Convert corpus to Mallet format and save it to a temporary text file.

        Parameters
        ----------
        corpus : iterable of iterable of (int, int)
            Collection of texts in BoW format.
        infer : bool, optional
            ...
        serialize_corpus : bool, optional
            ...

        """
        if serialize_corpus:
            logger.info("serializing temporary corpus to %s", self.fcorpustxt())
            with smart_open(self.fcorpustxt(), 'wb') as fout:
                self.corpus2mallet(corpus, fout)

        # convert the text file above into MALLET's internal format
        cmd = \
            self.mallet_path + \
            " import-file --preserve-case --keep-sequence " \
            "--remove-stopwords --token-regex \"\\S+\" --input %s --output %s"
        if infer:
            cmd += ' --use-pipe-from ' + self.fcorpusmallet()
            cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet() + '.infer')
        else:
            cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet())
        logger.info("converting temporary corpus to MALLET format with %s", cmd)
        check_output(args=cmd, shell=True)
Example #8
    def convert_input(self, corpus, infer=False):
        """
        Serialize documents (lists of unicode tokens) to a temporary text file,
        then convert that text file to MALLET format `outfile`.

        """
        logger.info("serializing temporary corpus to %s" % self.fcorpustxt())
        # write out the corpus in a file format that MALLET understands: one document per line:
        # document id[SPACE]label (not used)[SPACE]whitespace delimited utf8-encoded tokens
        with utils.smart_open(self.fcorpustxt(), 'wb') as fout:
            for docno, doc in enumerate(corpus):
                if self.id2word:
                    tokens = sum(([self.id2word[tokenid]] * int(cnt) for tokenid, cnt in doc), [])
                else:
                    tokens = sum(([str(tokenid)] * int(cnt) for tokenid, cnt in doc), [])
                fout.write(utils.to_utf8("%s 0 %s\n" % (docno, ' '.join(tokens))))

        # convert the text file above into MALLET's internal format
        cmd = self.mallet_path + " import-file --preserve-case --keep-sequence --remove-stopwords --token-regex '\\S+' --input %s --output %s"
        if infer:
            cmd += ' --use-pipe-from ' + self.fcorpusmallet()
            cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet() + '.infer')
        else:
            cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet())
        logger.info("converting temporary corpus to MALLET format with %s" % cmd)
        check_output(cmd, shell=True)
Example #9
    def train(self, corpus: Iterable[Iterable[tuple[int, int]]]):
        """Train STTM model.

        Parameters
        ----------
        corpus : iterable of iterable of (int, int)
            Corpus in BoW format

        """
        self.convert_input(corpus)
        self.java_opts = '-Xmx1G'
        cmd = 'java {} -jar {} -model {} -corpus {} -ntopics {} -alpha {} -beta {} -niters {} -twords {} -name {} -sstep {}'
        cmd = cmd.format(
            self.java_opts,
            self.sstm_jar_path,
            self.model,
            self.text_corpus_filename(),
            self.num_topics,
            self.alpha[0],
            self.beta,
            self.iterations,
            self.twords,
            self.name,
            self.sstep,
        )

        if self.vectors is not None:
            cmd += ' -vectors {}'.format(self.vectors)

        logger.info("training STTM model with %s", cmd)
        check_output(args=cmd, shell=True)
        self.word_topics = self.load_word_topics()
        self.wordtopics = self.word_topics
Example #10
    def __getitem__(self, bow, iterations=100):
        """Get vector for document(s).

        Parameters
        ----------
        bow : {list of (int, int), iterable of list of (int, int)}
            Document (or corpus) in BoW format.
        iterations : int, optional
            Number of iterations that will be used for inferring.

        Returns
        -------
        list of (int, float)
            LDA vector for document as sequence of (topic_id, topic_probability) **OR**
        list of list of (int, float)
            LDA vectors for corpus in same format.

        """
        is_corpus, corpus = utils.is_corpus(bow)
        if not is_corpus:
            # query is a single document => make a corpus out of it
            bow = [bow]

        self.convert_input(bow, infer=True)
        cmd = \
            self.mallet_path + ' infer-topics --input %s --inferencer %s ' \
                               '--output-doc-topics %s --num-iterations %s --doc-topics-threshold %s'
        cmd = cmd % (
            self.fcorpusmallet() + '.infer', self.finferencer(),
            self.fdoctopics() + '.infer', iterations, self.topic_threshold
        )
        logger.info("inferring topics with MALLET LDA '%s'", cmd)
        check_output(args=cmd, shell=True)
        result = list(self.read_doctopics(self.fdoctopics() + '.infer'))
        return result if is_corpus else result[0]
Example #11
    def train(self, corpus, time_slices, mode, model):
        """
        Train DTM model using specified corpus and time slices.

        """
        self.convert_input(corpus, time_slices)

        arguments = "--ntopics={p0} --model={mofrl}  --mode={p1} --initialize_lda={p2} --corpus_prefix={p3} --outname={p4} --alpha={p5}".format(
            p0=self.num_topics, mofrl=model, p1=mode, p2=self.initialize_lda, p3=self.fcorpus(), p4=self.foutname(), p5=self.alpha)

        params = "--lda_max_em_iter={p0} --lda_sequence_min_iter={p1}  --lda_sequence_max_iter={p2} --top_chain_var={p3} --rng_seed={p4} ".format(
            p0=self.lda_max_em_iter, p1=self.lda_sequence_min_iter, p2=self.lda_sequence_max_iter, p3=self.top_chain_var, p4=self.rng_seed)

        arguments = arguments + " " + params
        logger.info("training DTM with args %s" % arguments)

        cmd = [self.dtm_path] + arguments.split()
        logger.info("Running command %s" % cmd)
        check_output(cmd, stderr=PIPE)

        self.em_steps = np.loadtxt(self.fem_steps())
        self.init_ss = np.loadtxt(self.flda_ss())

        if self.initialize_lda:
            self.init_alpha = np.loadtxt(self.finit_alpha())
            self.init_beta = np.loadtxt(self.finit_beta())

        self.lhood_ = np.loadtxt(self.fout_liklihoods())

        # document-topic proportions
        self.gamma_ = np.loadtxt(self.fout_gamma())
        # cast to correct shape, gamma[5, 10] is the proportion of the 10th topic
        # in doc 5
        self.gamma_.shape = (self.lencorpus, self.num_topics)
        # normalize proportions
        self.gamma_ /= self.gamma_.sum(axis=1)[:, np.newaxis]

        self.lambda_ = np.zeros((self.num_topics, self.num_terms * len(self.time_slices)))
        self.obs_ = np.zeros((self.num_topics, self.num_terms * len(self.time_slices)))

        for t in range(self.num_topics):
            topic = "%03d" % t
            self.lambda_[t, :] = np.loadtxt(self.fout_prob().format(i=topic))
            self.obs_[t, :] = np.loadtxt(self.fout_observations().format(i=topic))
        # cast to correct shape: lambda_[5, 10, 0] is the weight of term 10 in topic 5 at time 0
        self.lambda_.shape = (self.num_topics, self.num_terms, len(self.time_slices))
        self.obs_.shape = (self.num_topics, self.num_terms, len(self.time_slices))
        # extract document influence on topics for each time slice
        # influences_time[0] , influences at time 0
        if model == 'fixed':
            for k, t in enumerate(self.time_slices):
                stamp = "%03d" % k
                influence = np.loadtxt(self.fout_influence().format(i=stamp))
                influence.shape = (t, self.num_topics)
                # influence[2,5] influence of document 2 on topic 5
                self.influences_time.append(influence)
Example #12
    def train(self, corpus, time_slices, mode, model):
        """
        Train DTM model using specified corpus and time slices.

        """
        self.convert_input(corpus, time_slices)

        arguments = "--ntopics={p0} --model={mofrl}  --mode={p1} --initialize_lda={p2} --corpus_prefix={p3} --outname={p4} --alpha={p5}".format(
            p0=self.num_topics, mofrl=model, p1=mode, p2=self.initialize_lda, p3=self.fcorpus(), p4=self.foutname(), p5=self.alpha)

        params = "--lda_max_em_iter={p0} --lda_sequence_min_iter={p1}  --lda_sequence_max_iter={p2} --top_chain_var={p3} --rng_seed={p4} ".format(
            p0=self.lda_max_em_iter, p1=self.lda_sequence_min_iter, p2=self.lda_sequence_max_iter, p3=self.top_chain_var, p4=self.rng_seed)

        arguments = arguments + " " + params
        logger.info("training DTM with args %s" % arguments)

        cmd = [self.dtm_path] + arguments.split()
        logger.info("Running command %s" % cmd)
        check_output(cmd, stderr=PIPE)

        self.em_steps = np.loadtxt(self.fem_steps())
        self.init_ss = np.loadtxt(self.flda_ss())

        if self.initialize_lda:
            self.init_alpha = np.loadtxt(self.finit_alpha())
            self.init_beta = np.loadtxt(self.finit_beta())

        self.lhood_ = np.loadtxt(self.fout_liklihoods())

        # document-topic proportions
        self.gamma_ = np.loadtxt(self.fout_gamma())
        # cast to correct shape, gamma[5, 10] is the proportion of the 10th topic
        # in doc 5
        self.gamma_.shape = (self.lencorpus, self.num_topics)
        # normalize proportions
        self.gamma_ /= self.gamma_.sum(axis=1)[:, np.newaxis]

        self.lambda_ = np.zeros((self.num_topics, self.num_terms * len(self.time_slices)))
        self.obs_ = np.zeros((self.num_topics, self.num_terms * len(self.time_slices)))

        for t in range(self.num_topics):
            topic = "%03d" % t
            self.lambda_[t, :] = np.loadtxt(self.fout_prob().format(i=topic))
            self.obs_[t, :] = np.loadtxt(self.fout_observations().format(i=topic))
        # cast to correct shape: lambda_[5, 10, 0] is the weight of term 10 in topic 5 at time 0
        self.lambda_.shape = (self.num_topics, self.num_terms, len(self.time_slices))
        self.obs_.shape = (self.num_topics, self.num_terms, len(self.time_slices))
        # extract document influence on topics for each time slice
        # influences_time[0] , influences at time 0
        if model == 'fixed':
            for k, t in enumerate(self.time_slices):
                stamp = "%03d" % k
                influence = np.loadtxt(self.fout_influence().format(i=stamp))
                influence.shape = (t, self.num_topics)
                # influence[2,5] influence of document 2 on topic 5
                self.influences_time.append(influence)
Example #13
 def train(self, corpus):
     self.convert_input(corpus, infer=False)
     cmd = self.mallet_path + " train-topics --input %s --num-topics %s --optimize-interval %s "\
         "--num-threads %s --output-state %s --output-doc-topics %s --output-topic-keys %s "\
         "--num-iterations %s --inferencer-filename %s"
     cmd = cmd % (self.fcorpusmallet(), self.num_topics, self.optimize_interval, self.workers,
         self.fstate(), self.fdoctopics(), self.ftopickeys(), self.iterations, self.finferencer())
     # NOTE "--keep-sequence-bigrams" / "--use-ngrams true" poorer results + runs out of memory
     logger.info("training MALLET LDA with %s" % cmd)
     check_output(cmd, shell=True)
     self.word_topics = self.load_word_topics()
Example #14
 def train(self, corpus):
     self.convert_input(corpus, infer=False)
     cmd = self.mallet_path + " train-topics --input %s --num-topics %s  --alpha %s --optimize-interval %s "\
         "--num-threads %s --output-state %s --output-doc-topics %s --output-topic-keys %s "\
         "--num-iterations %s --inferencer-filename %s --doc-topics-threshold %s"
     cmd = cmd % (
         self.fcorpusmallet(), self.num_topics, self.alpha, self.optimize_interval, self.workers,
         self.fstate(), self.fdoctopics(), self.ftopickeys(), self.iterations, self.finferencer(), self.topic_threshold)
     # NOTE "--keep-sequence-bigrams" / "--use-ngrams true" poorer results + runs out of memory
     logger.info("training MALLET LDA with %s", cmd)
     check_output(cmd, shell=True)
     self.word_topics = self.load_word_topics()
Example #15
def dim(dtm_path, input_dir, output_dir, num_topics=40):

    print("Running DIM")

    command = "--mode=fit     --rng_seed=0     --model=fixed     --initialize_lda=true     --corpus_prefix=example/test     --outname=example/output     --time_resolution=2     --influence_flat_years=5     --top_obs_var=0.5     --top_chain_var=0.005     --sigma_d=0.0001     --sigma_l=0.0001     --alpha=0.01     --lda_sequence_min_iter=6     --lda_sequence_max_iter=20     --save_time=-1     --ntopics=10     --lda_max_em_iter=10"
    command = command.split()
    command[4] = "--corpus_prefix=" + input_dir + "/dim"
    command[5] = "--outname=" + output_dir
    command[16] = "--ntopics=" + str(num_topics)
    command.insert(0, dtm_path)
    check_output(command)

    print("Done with DIM")
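A hedged call sketch for the helper above; the binary and directory names are placeholders, and input_dir is assumed to already contain the corpus files matching the 'dim' prefix (e.g. dim-mult.dat and dim-seq.dat) that the compiled dtm binary expects:

dtm_binary = "/path/to/dtm/dtm/main"  # hypothetical path to Blei's compiled dtm/dim binary
dim(dtm_binary, input_dir="my_corpus", output_dir="dim_output", num_topics=20)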
Example #16
    def __getitem__(self, bow, iterations=100):
        is_corpus, corpus = utils.is_corpus(bow)
        if not is_corpus:
            # query is a single document => make a corpus out of it
            bow = [bow]

        self.convert_input(bow, infer=True)
        cmd = self.mallet_path + ' infer-topics --input %s --inferencer %s --output-doc-topics %s --num-iterations %s --doc-topics-threshold %s'
        cmd = cmd % (self.fcorpusmallet() + '.infer', self.finferencer(), self.fdoctopics() + '.infer', iterations, self.topic_threshold)
        logger.info("inferring topics with MALLET LDA '%s'", cmd)
        check_output(args=cmd, shell=True)
        result = list(self.read_doctopics(self.fdoctopics() + '.infer'))
        return result if is_corpus else result[0]
Example #17
 def testConversion(self):
     check_output(args=[
         sys.executable, '-m', 'gensim.scripts.glove2word2vec',
         '--input', self.datapath, '--output', self.output_file
     ])
     # test that the converted model loads successfully
     try:
         self.test_model = gensim.models.KeyedVectors.load_word2vec_format(self.output_file)
         self.assertTrue(numpy.allclose(self.test_model.n_similarity(['the', 'and'], ['and', 'the']), 1.0))
     except Exception:
         if os.path.isfile(os.path.join(self.output_file)):
             self.fail('model file %s was created but could not be loaded.' % self.output_file)
         else:
             self.fail(
                 'model file %s creation failed, check the parameters and input file format.' % self.output_file
             )
Example #18
 def testConversion(self):
     check_output(args=[
         sys.executable, '-m', 'gensim.scripts.glove2word2vec',
         '--input', self.datapath, '--output', self.output_file
     ])
     # test that the converted model loads successfully
     try:
         self.test_model = gensim.models.KeyedVectors.load_word2vec_format(self.output_file)
         self.assertTrue(numpy.allclose(self.test_model.n_similarity(['the', 'and'], ['and', 'the']), 1.0))
     except Exception:
         if os.path.isfile(os.path.join(self.output_file)):
             self.fail('model file %s was created but could not be loaded.' % self.output_file)
         else:
             self.fail(
                 'model file %s creation failed, check the parameters and input file format.' % self.output_file
             )
Example #19
    def convert_input(self, corpus, infer=False, serialize_corpus=True):
        """
        Serialize documents (lists of unicode tokens) to a temporary text file,
        then convert that text file to MALLET format `outfile`.

        """
        if serialize_corpus:
            logger.info("serializing temporary corpus to %s", self.fcorpustxt())
            with smart_open(self.fcorpustxt(), 'wb') as fout:
                self.corpus2mallet(corpus, fout)

        # convert the text file above into MALLET's internal format
        cmd = self.mallet_path + ' import-file --preserve-case --keep-sequence --remove-stopwords --token-regex "\\S+" --input %s --output %s'
        if infer:
            cmd += ' --use-pipe-from ' + self.fcorpusmallet()
            cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet() + '.infer')
        else:
            cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet())
        logger.info("converting temporary corpus to MALLET format with %s", cmd)
        check_output(args=cmd, shell=True)
Example #20
    def train(self, corpus: Iterable[Iterable[Tuple[int, int]]], **kwargs):
        """Train Mallet LDA.
        Parameters
        ----------
        corpus : iterable of iterable of (int, int)
            Corpus in BoW format
        """
        use_existing_corpus: bool = kwargs.get('use_existing_corpus', False)

        if os.path.isfile(self.mallet_corpus_filename()) and use_existing_corpus:
            logger.warning("using EXISTING corpus.mallet!")
        else:
            self.convert_input(corpus, infer=False)

        cmd: str = (
            f"{self.mallet_path} train-topics "
            f"--input {self.mallet_corpus_filename()} "
            f"--num-topics {self.num_topics} "
            f"--alpha {self.alpha} "
            f"--optimize-interval {self.optimize_interval} "
            f"--num-threads {self.workers} "
            f"--output-state {self.mallet_state_filename()} "
            f"--output-doc-topics {self.document_topics_filename()} "
            f"--output-topic-keys {self.topic_keys_filename()} "
            f"--num-top-words {self.num_top_words} "
            f"--diagnostics-file {self.diagnostics_filename()} "
            f"--num-iterations {self.iterations} "
            f"--inferencer-filename {self.inferencer_filename()} "
            f"--doc-topics-threshold {self.topic_threshold} "
            f"--random-seed {str(self.random_seed)} "
        )

        # f"--topic-word-weights-file {self.ftopicwordweights()} "

        logger.info(f"training MALLET LDA with {cmd}")
        check_output(args=cmd, shell=True)
        self.word_topics = self.load_word_topics()
        self.wordtopics = self.word_topics
Example #21
def fasttext_fit(train_file_path,
                 param_dict,
                 fasttext_path,
                 thread=1,
                 compress_model=False,
                 model_path='/dev/shm/model',
                 pretrained_vectors_path=None):
    """
    Trains a fastText supervised model. This is a wrapper around the fastText command line interface.

    :param train_file_path: path to the training dataset
    :param param_dict: dictionary mapping fasttext hyperparameters to their values
    :param fasttext_path: path to the fastText executable
    :param thread: int, the number of threads to use
    :param compress_model: indicates whether the fastText model should be compressed (using fastText's quantize).
    :param model_path: str, path to output model
    :param pretrained_vectors_path: str, path to pre-trained `.vec` file with word embeddings
    :return str: path to trained model
    """
    train_call, compress_call = get_fasttext_train_calls(
        train_file_path,
        param_dict,
        fasttext_path,
        model_path,
        thread,
        pretrained_vectors_path=pretrained_vectors_path)
    utils.check_output(args=train_call, stderr=subprocess.DEVNULL)
    if compress_model:
        utils.check_output(args=compress_call, stderr=subprocess.DEVNULL)
    model_file = model_path + '.bin'
    # remove auxiliary vectors file
    os.remove(model_path + '.vec')
    # remove non-compressed model file if compression was performed
    if compress_model:
        os.remove(model_file)
        model_file = model_path + '.ftz'
    return model_file
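A call sketch for the helper above. The hyperparameter keys are assumed to be passed through by get_fasttext_train_calls as fastText supervised-training CLI flags (epoch, lr, wordNgrams, ...), and all paths are placeholders:

params = {"epoch": 10, "lr": 0.5, "wordNgrams": 2}  # assumed to map one-to-one onto fastText CLI flags
model_file = fasttext_fit(
    "train.txt",           # one "__label__LABEL some text ..." line per example
    params,
    "/path/to/fasttext",   # fastText executable
    thread=4,
    compress_model=True,   # keep only the quantized .ftz model
)
print(model_file)          # with the default model_path this would be /dev/shm/model.ftz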
Example #22
def fasttext_predict(trained_model_path, test_file_path, fasttext_path,
                     probability_file_path):
    """
    Predicts class probabilities for a given dataset using a previously trained fastText model.

    :param trained_model_path: path to the trained fastText model
    :param test_file_path: path to the test dataset
    :param fasttext_path: path to the fastText executable
    :param probability_file_path: str, path to the output file with class probabilities for the test dataset;
        output written to this file will always be gzipped
    """
    predict_call = get_fasttext_test_calls(test_file_path, fasttext_path,
                                           trained_model_path)
    predictions = utils.check_output(args=predict_call,
                                     stderr=subprocess.DEVNULL)
    with gzip.open(probability_file_path, 'wb') as fout:
        fout.write(predictions)
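Because the raw stdout of the fastText call is gzipped verbatim, reading the predictions back is just decompression plus line parsing; a sketch assuming the usual predict-prob output of "__label__X probability" pairs, one line per input example, and a placeholder file name:

import gzip

with gzip.open("probabilities.gz", "rt", encoding="utf-8") as fin:
    for line in fin:
        fields = line.split()
        if fields:
            label, prob = fields[0], float(fields[1])  # first (label, probability) pair on the line
            print(label, prob)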
Example #23
    def train(cls, wr_path, corpus_file, out_path, size=100, window=15, symmetric=1, min_count=5, max_vocab_size=0,
              sgd_num=100, lrate=0.001, period=10, iter=91, epsilon=0.75, dump_period=10, reg=0, alpha=100,
              beta=99, loss='hinge', memory=4.0, cleanup_files=True, sorted_vocab=1, ensemble=0):
        """
        `wr_path` is the path to the Wordrank directory.
        `corpus_file` is the filename of the text file to be used for training the Wordrank model.
        Expects file to contain space-separated tokens in a single line
        `out_path` is the path to directory which will be created to save embeddings and training data.
        `size` is the dimensionality of the feature vectors.
        `window` is the number of context words to the left (and to the right, if symmetric = 1).
        `symmetric` if 0, only use left context words, else use left and right both.
        `min_count` = ignore all words with total frequency lower than this.
        `max_vocab_size` upper bound on vocabulary size, i.e. keep the <int> most frequent words. Default is 0 for no limit.
        `sgd_num` number of SGD taken for each data point.
        `lrate` is the learning rate (if set too high, training diverges and gives NaN).
        `period` is the period of xi variable updates.
        `iter` = number of iterations (epochs) over the corpus.
        `epsilon` is the power scaling value for weighting function.
        `dump_period` is the period after which embeddings should be dumped.
        `reg` is the value of regularization parameter.
        `alpha` is the alpha parameter of gamma distribution.
        `beta` is the beta parameter of gamma distribution.
        `loss` = name of the loss (logistic, hinge).
        `memory` = soft limit for memory consumption, in GB.
        `cleanup_files` if True, delete directory and files used by this wrapper, setting to False can be useful for debugging
        `sorted_vocab` = if 1 (default), sort the vocabulary by descending frequency before assigning word indexes.
        `ensemble` = 0 (default), use ensemble of word and context vectors
        """

        meta_data_path = 'matrix.meta'
        vocab_file = 'vocab.txt'
        temp_vocab_file = 'tempvocab.txt'
        cooccurrence_file = 'cooccurrence'
        cooccurrence_shuf_file = 'wiki.toy'
        meta_file = 'meta'

        # prepare training data (cooccurrence matrix and vocab)
        model_dir = os.path.join(wr_path, out_path)
        meta_dir = os.path.join(model_dir, 'meta')
        os.makedirs(meta_dir)
        logger.info("Dumped data will be stored in '%s'", model_dir)
        copyfile(corpus_file, os.path.join(meta_dir, corpus_file.split('/')[-1]))
        os.chdir(meta_dir)

        cmd_vocab_count = ['../../glove/vocab_count', '-min-count', str(min_count), '-max-vocab', str(max_vocab_size)]
        cmd_cooccurence_count = ['../../glove/cooccur', '-memory', str(memory), '-vocab-file', temp_vocab_file, '-window-size', str(window), '-symmetric', str(symmetric)]
        cmd_shuffle_cooccurences = ['../../glove/shuffle', '-memory', str(memory)]
        cmd_del_vocab_freq = ['cut', '-d', " ", '-f', '1', temp_vocab_file]

        commands = [cmd_vocab_count, cmd_cooccurence_count, cmd_shuffle_cooccurences]
        logger.info("Prepare training data using glove code '%s'", commands)
        input_fnames = [corpus_file.split('/')[-1], corpus_file.split('/')[-1], cooccurrence_file]
        output_fnames = [temp_vocab_file, cooccurrence_file, cooccurrence_shuf_file]

        for command, input_fname, output_fname in zip(commands, input_fnames, output_fnames):
            with smart_open(input_fname, 'rb') as r:
                with smart_open(output_fname, 'wb') as w:
                    utils.check_output(w, args=command, stdin=r)
        with smart_open(vocab_file, 'wb') as w:
            utils.check_output(w, args=cmd_del_vocab_freq)

        with smart_open(vocab_file, 'rb') as f:
            numwords = sum(1 for line in f)
        with smart_open(cooccurrence_shuf_file, 'rb') as f:
            numlines = sum(1 for line in f)
        with smart_open(meta_file, 'wb') as f:
            meta_info = "{0} {1}\n{2} {3}\n{4} {5}".format(numwords, numwords, numlines, cooccurrence_shuf_file, numwords, vocab_file)
            f.write(meta_info.encode('utf-8'))

        wr_args = {
            'path': 'meta',
            'nthread': multiprocessing.cpu_count(),
            'sgd_num': sgd_num,
            'lrate': lrate,
            'period': period,
            'iter': iter,
            'epsilon': epsilon,
            'dump_prefix': 'model',
            'dump_period': dump_period,
            'dim': size,
            'reg': reg,
            'alpha': alpha,
            'beta': beta,
            'loss': loss
        }

        os.chdir('..')
        # run wordrank executable with wr_args
        cmd = ['mpirun', '-np', '1', '../wordrank']
        for option, value in wr_args.items():
            cmd.append("--%s" % option)
            cmd.append(str(value))
        logger.info("Running wordrank binary '%s'", cmd)
        output = utils.check_output(args=cmd)

        # use embeddings from max. iteration's dump
        max_iter_dump = iter / dump_period * dump_period - 1
        copyfile('model_word_%d.txt' % max_iter_dump, 'wordrank.words')
        copyfile('model_context_%d.txt' % max_iter_dump, 'wordrank.contexts')
        model = cls.load_wordrank_model('wordrank.words', os.path.join('meta', vocab_file), 'wordrank.contexts', sorted_vocab, ensemble)
        os.chdir('../..')

        if cleanup_files:
            rmtree(model_dir)
        return model
Example #24
    def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1, min_count=5, max_vocab_size=0,
              sgd_num=100, lrate=0.001, period=10, iter=90, epsilon=0.75, dump_period=10, reg=0, alpha=100,
              beta=99, loss='hinge', memory=4.0, np=1, cleanup_files=False, sorted_vocab=1, ensemble=0):
        """
        The word and context embedding files are generated by the wordrank binary and saved in the "out_name" directory,
        which is created inside the wordrank directory. The vocab and cooccurrence files are generated using the glove code
        available inside the wordrank directory. These files are used by the wordrank binary for training.

        `wr_path` is the absolute path to the Wordrank directory.
        `corpus_file` is the filename of the text file to be used for training the Wordrank model.
        Expects file to contain space-separated tokens in a single line
        `out_name` is name of the directory which will be created (in wordrank folder) to save embeddings and training data.
        It will contain following contents:

            Word Embeddings saved after every dump_period and stored in a file model_word_current\ iter.txt
            Context Embeddings saved after every dump_period and stored in a file model_context_current\ iter.txt
            A meta directory which contains: 'vocab.txt' - vocab words, 'wiki.toy' - word-word co-occurrence values, 'meta' - vocab and co-occurrence lengths

        `size` is the dimensionality of the feature vectors.
        `window` is the number of context words to the left (and to the right, if symmetric = 1).
        `symmetric` if 0, only use left context words, else use left and right both.
        `min_count` = ignore all words with total frequency lower than this.
        `max_vocab_size` upper bound on vocabulary size, i.e. keep the <int> most frequent words. Default is 0 for no limit.
        `sgd_num` number of SGD taken for each data point.
        `lrate` is the learning rate (if set too high, training diverges and gives NaN).
        `period` is the period of xi variable updates.
        `iter` = number of iterations (epochs) over the corpus.
        `epsilon` is the power scaling value for weighting function.
        `dump_period` is the period after which embeddings should be dumped.
        `reg` is the value of regularization parameter.
        `alpha` is the alpha parameter of gamma distribution.
        `beta` is the beta parameter of gamma distribution.
        `loss` = name of the loss (logistic, hinge).
        `memory` = soft limit for memory consumption, in GB.
        `np` number of copies to execute. (mpirun option)
        `cleanup_files` if True, delete directory and files used by this wrapper, setting to False can be useful for debugging
        `sorted_vocab` = if 1 (default), sort the vocabulary by descending frequency before assigning word indexes.
        `ensemble` = 0 (default), use ensemble of word and context vectors
        """

        # prepare training data (cooccurrence matrix and vocab)
        model_dir = os.path.join(wr_path, out_name)
        meta_dir = os.path.join(model_dir, 'meta')
        os.makedirs(meta_dir)
        logger.info("Dumped data will be stored in '%s'", model_dir)
        copyfile(corpus_file, os.path.join(meta_dir, corpus_file.split('/')[-1]))

        vocab_file = os.path.join(meta_dir, 'vocab.txt')
        temp_vocab_file = os.path.join(meta_dir, 'tempvocab.txt')
        cooccurrence_file = os.path.join(meta_dir, 'cooccurrence')
        cooccurrence_shuf_file = os.path.join(meta_dir, 'wiki.toy')
        meta_file = os.path.join(meta_dir, 'meta')

        cmd_vocab_count = [
            os.path.join(wr_path, 'glove', 'vocab_count'),
            '-min-count', str(min_count), '-max-vocab', str(max_vocab_size)
        ]
        cmd_cooccurence_count = [
            os.path.join(wr_path, 'glove', 'cooccur'), '-memory', str(memory),
            '-vocab-file', temp_vocab_file, '-window-size', str(window), '-symmetric', str(symmetric)
        ]
        cmd_shuffle_cooccurences = [os.path.join(wr_path, 'glove', 'shuffle'), '-memory', str(memory)]
        cmd_del_vocab_freq = ['cut', '-d', " ", '-f', '1', temp_vocab_file]

        commands = [cmd_vocab_count, cmd_cooccurence_count, cmd_shuffle_cooccurences]
        input_fnames = [
            os.path.join(meta_dir, os.path.split(corpus_file)[-1]),
            os.path.join(meta_dir, os.path.split(corpus_file)[-1]),
            cooccurrence_file
        ]
        output_fnames = [temp_vocab_file, cooccurrence_file, cooccurrence_shuf_file]

        logger.info("Prepare training data (%s) using glove code", ", ".join(input_fnames))
        for command, input_fname, output_fname in zip(commands, input_fnames, output_fnames):
            with smart_open(input_fname, 'rb') as r:
                with smart_open(output_fname, 'wb') as w:
                    utils.check_output(w, args=command, stdin=r)

        logger.info("Deleting frequencies from vocab file")
        with smart_open(vocab_file, 'wb') as w:
            utils.check_output(w, args=cmd_del_vocab_freq)

        with smart_open(vocab_file, 'rb') as f:
            numwords = sum(1 for _ in f)
        with smart_open(cooccurrence_shuf_file, 'rb') as f:
            numlines = sum(1 for _ in f)
        with smart_open(meta_file, 'wb') as f:
            meta_info = "{0} {1}\n{2} {3}\n{4} {5}".format(
                numwords, numwords, numlines, cooccurrence_shuf_file.split('/')[-1],
                numwords, vocab_file.split('/')[-1]
            )
            f.write(meta_info.encode('utf-8'))

        if iter % dump_period == 0:
            iter += 1
        else:
            logger.warning(
                "Resultant embedding will be from %d iterations rather than the input %d iterations, as wordrank dumps the embedding only at dump_period intervals. "
                "Input an appropriate combination of parameters (iter, dump_period) such that \"iter mod dump_period\" is zero.",
                iter - (iter % dump_period), iter
            )

        wr_args = {
            'path': meta_dir,
            'nthread': multiprocessing.cpu_count(),
            'sgd_num': sgd_num,
            'lrate': lrate,
            'period': period,
            'iter': iter,
            'epsilon': epsilon,
            'dump_prefix': 'model',
            'dump_period': dump_period,
            'dim': size,
            'reg': reg,
            'alpha': alpha,
            'beta': beta,
            'loss': loss
        }

        # run wordrank executable with wr_args
        cmd = ['mpirun', '-np', str(np), os.path.join(wr_path, 'wordrank')]
        for option, value in wr_args.items():
            cmd.append('--%s' % option)
            cmd.append(str(value))
        logger.info("Running wordrank binary")
        utils.check_output(args=cmd)

        # use embeddings from max. iteration's dump
        max_iter_dump = iter - (iter % dump_period)
        os.rename('model_word_%d.txt' % max_iter_dump, os.path.join(model_dir, 'wordrank.words'))
        os.rename('model_context_%d.txt' % max_iter_dump, os.path.join(model_dir, 'wordrank.contexts'))
        model = cls.load_wordrank_model(
            os.path.join(model_dir, 'wordrank.words'), vocab_file,
            os.path.join(model_dir, 'wordrank.contexts'), sorted_vocab, ensemble
        )

        if cleanup_files:
            rmtree(model_dir)
        return model
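A usage sketch for this classmethod, assuming the pre-4.0 gensim wrapper gensim.models.wrappers.Wordrank, a compiled Wordrank checkout (including its glove/ helpers) at a placeholder path, and a whitespace-tokenized corpus file:

from gensim.models.wrappers import Wordrank  # available in gensim 3.x, removed in 4.0

wr_path = "/path/to/wordrank"  # checkout containing the wordrank binary and glove/ tools
model = Wordrank.train(wr_path, "corpus.txt", out_name="wr_model",
                       size=50, iter=20, dump_period=10, np=1)
print(model.most_similar("graph", topn=3))  # query any word that occurs in the corpus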
Example #25
    def train(cls, ft_path, corpus_file, output_file=None, model='cbow', size=100, alpha=0.025, window=5, min_count=5,
            loss='ns', sample=1e-3, negative=5, iter=5, min_n=3, max_n=6, sorted_vocab=1, threads=12):
        """
        `ft_path` is the path to the FastText executable, e.g. `/home/kofola/fastText/fasttext`.

        `corpus_file` is the filename of the text file to be used for training the FastText model.
        Expects file to contain utf-8 encoded text.

        `model` defines the training algorithm. By default, cbow is used. Accepted values are
        'cbow', 'skipgram'.

        `size` is the dimensionality of the feature vectors.

        `window` is the maximum distance between the current and predicted word within a sentence.

        `alpha` is the initial learning rate.

        `min_count` = ignore all words with total occurrences lower than this.

        `loss` = defines training objective. Allowed values are `hs` (hierarchical softmax),
        `ns` (negative sampling) and `softmax`. Defaults to `ns`

        `sample` = threshold for configuring which higher-frequency words are randomly downsampled;
            default is 1e-3, useful range is (0, 1e-5).

        `negative` = the value for negative specifies how many "noise words" should be drawn
        (usually between 5-20). Default is 5. If set to 0, no negative sampling is used.
        Only relevant when `loss` is set to `ns`

        `iter` = number of iterations (epochs) over the corpus. Default is 5.

        `min_n` = min length of char ngrams to be used for training word representations. Default is 3.

        `max_n` = max length of char ngrams to be used for training word representations. Set `max_n` to be
        lesser than `min_n` to avoid char ngrams being used. Default is 6.

        `sorted_vocab` = if 1 (default), sort the vocabulary by descending frequency before
        assigning word indexes.

        `threads` = number of threads to use. Default is 12.

        """
        ft_path = ft_path
        output_file = output_file or os.path.join(tempfile.gettempdir(), 'ft_model')
        ft_args = {
            'input': corpus_file,
            'output': output_file,
            'lr': alpha,
            'dim': size,
            'ws': window,
            'epoch': iter,
            'minCount': min_count,
            'neg': negative,
            'loss': loss,
            'minn': min_n,
            'maxn': max_n,
            'thread': threads,
            't': sample
        }
        cmd = [ft_path, model]
        for option, value in ft_args.items():
            cmd.append("-%s" % option)
            cmd.append(str(value))

        output = utils.check_output(args=cmd)
        model = cls.load_fasttext_format(output_file)
        cls.delete_training_files(output_file)
        return model
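A usage sketch, assuming the old CLI wrapper gensim.models.wrappers.FastText (gensim 2.x/3.x, later superseded by the native gensim.models.FastText) and a locally compiled fastText binary at a placeholder path:

from gensim.models.wrappers import FastText  # old CLI wrapper, not the native implementation

ft_path = "/home/user/fastText/fasttext"  # hypothetical path to the compiled binary
model = FastText.train(ft_path, corpus_file="corpus.txt", model="skipgram",
                       size=100, iter=5, min_count=5)
print(model.wv.most_similar("graph", topn=3))  # word vectors are exposed on .wv in these versions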
Example #26
    def train(self, corpus, time_slices, mode, model):
        """Train DTM model.

        Parameters
        ----------
        corpus : iterable of iterable of (int, int)
            Collection of texts in BoW format.
        time_slices : list of int
            Sequence of timestamps.
        mode : {'fit', 'time'}, optional
            Controls the mode of the wrapper: 'fit' is for training, 'time' is for analyzing documents through
            time according to an already trained DTM (essentially a held-out set).
        model : {'fixed', 'dtm'}, optional
            Controls which model will be run: 'fixed' is for DIM and 'dtm' for DTM.

        """
        self.convert_input(corpus, time_slices)

        arguments = \
            "--ntopics={p0} --model={mofrl}  --mode={p1} --initialize_lda={p2} --corpus_prefix={p3} " \
            "--outname={p4} --alpha={p5}".format(
                p0=self.num_topics, mofrl=model, p1=mode, p2=self.initialize_lda,
                p3=self.fcorpus(), p4=self.foutname(), p5=self.alpha
            )

        params = \
            "--lda_max_em_iter={p0} --lda_sequence_min_iter={p1}  --lda_sequence_max_iter={p2} " \
            "--top_chain_var={p3} --rng_seed={p4} ".format(
                p0=self.lda_max_em_iter, p1=self.lda_sequence_min_iter, p2=self.lda_sequence_max_iter,
                p3=self.top_chain_var, p4=self.rng_seed
            )

        arguments = arguments + " " + params
        logger.info("training DTM with args %s", arguments)

        cmd = [self.dtm_path] + arguments.split()
        logger.info("Running command %s", cmd)
        check_output(args=cmd, stderr=PIPE)

        self.em_steps = np.loadtxt(self.fem_steps())
        self.init_ss = np.loadtxt(self.flda_ss())

        if self.initialize_lda:
            self.init_alpha = np.loadtxt(self.finit_alpha())
            self.init_beta = np.loadtxt(self.finit_beta())

        self.lhood_ = np.loadtxt(self.fout_liklihoods())

        # document-topic proportions
        self.gamma_ = np.loadtxt(self.fout_gamma())
        # cast to correct shape, gamma[5, 10] is the proportion of the 10th topic
        # in doc 5
        self.gamma_.shape = (self.lencorpus, self.num_topics)
        # normalize proportions
        self.gamma_ /= self.gamma_.sum(axis=1)[:, np.newaxis]

        self.lambda_ = np.zeros((self.num_topics, self.num_terms * len(self.time_slices)))
        self.obs_ = np.zeros((self.num_topics, self.num_terms * len(self.time_slices)))

        for t in range(self.num_topics):
            topic = "%03d" % t
            self.lambda_[t, :] = np.loadtxt(self.fout_prob().format(i=topic))
            self.obs_[t, :] = np.loadtxt(self.fout_observations().format(i=topic))
        # cast to correct shape: lambda_[5, 10, 0] is the weight of term 10 in topic 5 at time 0
        self.lambda_.shape = (self.num_topics, self.num_terms, len(self.time_slices))
        self.obs_.shape = (self.num_topics, self.num_terms, len(self.time_slices))
        # extract document influence on topics for each time slice
        # influences_time[0] , influences at time 0
        if model == 'fixed':
            for k, t in enumerate(self.time_slices):
                stamp = "%03d" % k
                influence = np.loadtxt(self.fout_influence().format(i=stamp))
                influence.shape = (t, self.num_topics)
                # influence[2,5] influence of document 2 on topic 5
                self.influences_time.append(influence)
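A construction sketch for the wrapper this method belongs to, assuming the pre-4.0 gensim class gensim.models.wrappers.DtmModel, a compiled dtm binary at a placeholder path, and a toy corpus; time_slices gives how many documents fall into each consecutive time period, and train() runs as part of the constructor:

from gensim.corpora import Dictionary
from gensim.models.wrappers import DtmModel  # available in gensim 3.x, removed in 4.0

texts = [["economy", "bank"], ["bank", "crisis"], ["economy", "growth"], ["growth", "bank"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
time_slices = [2, 2]  # first two documents form slice 0, the last two slice 1

dtm_path = "/path/to/dtm/main"  # hypothetical path to the compiled DTM binary
model = DtmModel(dtm_path, corpus=corpus, time_slices=time_slices,
                 num_topics=2, id2word=dictionary, initialize_lda=True)
print(model.show_topic(topicid=0, time=0, topn=3))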
Example #27
        for el in id_lema:
            if len(el[0]):
                fout.write(el[0] + ' 0 ' + ' '.join(el[2]) + '\n')
            else:
                fout.write(el[1] + ' 0 ' + ' '.join(el[2]) + '\n')

    token_regexp=cf.get('CorpusGeneration','token_regexp')
    cmd = str(mallet_path) + \
              ' import-file --preserve-case --keep-sequence ' + \
              '--remove-stopwords --token-regex "' + token_regexp + '" ' + \
              '--input %s --output %s'
    cmd = cmd % (corpus_file, corpus_mallet)

    try:
        print(f'-- -- Running command {cmd}')
        check_output(args=cmd, shell=True)
    except Exception:
        print('-- -- Mallet failed to import data. Check the command')
    
    #############################################################
    # Generate corpus with procedure data for BioProtocol
    #############################################################
    corpus_dir = Path2corpus.joinpath('lemasWithProcedure')
    print('The corpus including procedure data will be saved in directory', corpus_dir)
    corpus_dir.mkdir()

    id_lema = BIO_df[['ProtocolID', 'S2paperID', 'LEMAS', 'LEMASprocedures']].values.tolist()

    import_config = corpus_dir.joinpath('import.config')
    with import_config.open('w', encoding='utf8') as fout:
        fout.write('min_lemas = ' + str(min_lemas) + '\n')
Example #28
    def train(cls,
              wr_path,
              corpus_file,
              out_name,
              size=100,
              window=15,
              symmetric=1,
              min_count=5,
              max_vocab_size=0,
              sgd_num=100,
              lrate=0.001,
              period=10,
              iter=90,
              epsilon=0.75,
              dump_period=10,
              reg=0,
              alpha=100,
              beta=99,
              loss='hinge',
              memory=4.0,
              np=1,
              cleanup_files=False,
              sorted_vocab=1,
              ensemble=0):
        """
        The word and context embedding files are generated by the wordrank binary and saved in the "out_name" directory,
        which is created inside the wordrank directory. The vocab and cooccurrence files are generated using the glove code
        available inside the wordrank directory. These files are used by the wordrank binary for training.

        `wr_path` is the absolute path to the Wordrank directory.
        `corpus_file` is the filename of the text file to be used for training the Wordrank model.
        Expects file to contain space-separated tokens in a single line
        `out_name` is name of the directory which will be created (in wordrank folder)
        to save embeddings and training data.
        It will contain following contents:

            Word Embeddings saved after every dump_period and stored in a file model_word_current\ iter.txt
            Context Embeddings saved after every dump_period and stored in a file model_context_current\ iter.txt
            A meta directory which contains: 'vocab.txt' - vocab words,
            'wiki.toy' - word-word co-occurrence values, 'meta' - vocab and co-occurrence lengths

        `size` is the dimensionality of the feature vectors.
        `window` is the number of context words to the left (and to the right, if symmetric = 1).
        `symmetric` if 0, only use left context words, else use left and right both.
        `min_count` = ignore all words with total frequency lower than this.
        `max_vocab_size` upper bound on vocabulary size, i.e. keep the <int> most frequent words.
        Default is 0 for no limit.
        `sgd_num` number of SGD taken for each data point.
        `lrate` is the learning rate (if set too high, training diverges and gives NaN).
        `period` is the period of xi variable updates.
        `iter` = number of iterations (epochs) over the corpus.
        `epsilon` is the power scaling value for weighting function.
        `dump_period` is the period after which embeddings should be dumped.
        `reg` is the value of regularization parameter.
        `alpha` is the alpha parameter of gamma distribution.
        `beta` is the beta parameter of gamma distribution.
        `loss` = name of the loss (logistic, hinge).
        `memory` = soft limit for memory consumption, in GB.
        `np` number of copies to execute. (mpirun option)
        `cleanup_files` if True, delete directory and files used by this wrapper,
        setting to False can be useful for debugging
        `sorted_vocab` = if 1 (default), sort the vocabulary by descending frequency before assigning word indexes.
        `ensemble` = 0 (default), use ensemble of word and context vectors
        """

        # prepare training data (cooccurrence matrix and vocab)
        model_dir = os.path.join(wr_path, out_name)
        meta_dir = os.path.join(model_dir, 'meta')
        os.makedirs(meta_dir)
        logger.info("Dumped data will be stored in '%s'", model_dir)
        copyfile(corpus_file, os.path.join(meta_dir,
                                           corpus_file.split('/')[-1]))

        vocab_file = os.path.join(meta_dir, 'vocab.txt')
        temp_vocab_file = os.path.join(meta_dir, 'tempvocab.txt')
        cooccurrence_file = os.path.join(meta_dir, 'cooccurrence')
        cooccurrence_shuf_file = os.path.join(meta_dir, 'wiki.toy')
        meta_file = os.path.join(meta_dir, 'meta')

        cmd_vocab_count = [
            os.path.join(wr_path, 'glove', 'vocab_count'), '-min-count',
            str(min_count), '-max-vocab',
            str(max_vocab_size)
        ]
        cmd_cooccurence_count = [
            os.path.join(wr_path, 'glove', 'cooccur'), '-memory',
            str(memory), '-vocab-file', temp_vocab_file, '-window-size',
            str(window), '-symmetric',
            str(symmetric)
        ]
        cmd_shuffle_cooccurences = [
            os.path.join(wr_path, 'glove', 'shuffle'), '-memory',
            str(memory)
        ]
        cmd_del_vocab_freq = ['cut', '-d', " ", '-f', '1', temp_vocab_file]

        commands = [
            cmd_vocab_count, cmd_cooccurence_count, cmd_shuffle_cooccurences
        ]
        input_fnames = [
            os.path.join(meta_dir,
                         os.path.split(corpus_file)[-1]),
            os.path.join(meta_dir,
                         os.path.split(corpus_file)[-1]), cooccurrence_file
        ]
        output_fnames = [
            temp_vocab_file, cooccurrence_file, cooccurrence_shuf_file
        ]

        logger.info("Prepare training data (%s) using glove code",
                    ", ".join(input_fnames))
        for command, input_fname, output_fname in zip(commands, input_fnames,
                                                      output_fnames):
            with smart_open(input_fname, 'rb') as r:
                with smart_open(output_fname, 'wb') as w:
                    utils.check_output(w, args=command, stdin=r)

        logger.info("Deleting frequencies from vocab file")
        with smart_open(vocab_file, 'wb') as w:
            utils.check_output(w, args=cmd_del_vocab_freq)

        with smart_open(vocab_file, 'rb') as f:
            numwords = sum(1 for _ in f)
        with smart_open(cooccurrence_shuf_file, 'rb') as f:
            numlines = sum(1 for _ in f)
        with smart_open(meta_file, 'wb') as f:
            meta_info = "{0} {1}\n{2} {3}\n{4} {5}".format(
                numwords, numwords, numlines,
                cooccurrence_shuf_file.split('/')[-1], numwords,
                vocab_file.split('/')[-1])
            f.write(meta_info.encode('utf-8'))

        if iter % dump_period == 0:
            iter += 1
        else:
            logger.warning(
                "Resultant embedding will be from %d iterations rather than the input %d iterations, "
                "as wordrank dumps the embedding only at dump_period intervals. "
                "Input an appropriate combination of parameters (iter, dump_period) "
                "such that \"iter mod dump_period\" is zero.",
                iter - (iter % dump_period), iter)

        wr_args = {
            'path': meta_dir,
            'nthread': multiprocessing.cpu_count(),
            'sgd_num': sgd_num,
            'lrate': lrate,
            'period': period,
            'iter': iter,
            'epsilon': epsilon,
            'dump_prefix': 'model',
            'dump_period': dump_period,
            'dim': size,
            'reg': reg,
            'alpha': alpha,
            'beta': beta,
            'loss': loss
        }

        # run wordrank executable with wr_args
        cmd = ['mpirun', '-np', str(np), os.path.join(wr_path, 'wordrank')]
        for option, value in wr_args.items():
            cmd.append('--%s' % option)
            cmd.append(str(value))
        logger.info("Running wordrank binary")
        utils.check_output(args=cmd)

        # use embeddings from max. iteration's dump
        max_iter_dump = iter - (iter % dump_period)
        os.rename('model_word_%d.txt' % max_iter_dump, os.path.join(model_dir, 'wordrank.words'))
        os.rename('model_context_%d.txt' % max_iter_dump, os.path.join(model_dir, 'wordrank.contexts'))
        model = cls.load_wordrank_model(
            os.path.join(model_dir, 'wordrank.words'), vocab_file,
            os.path.join(model_dir, 'wordrank.contexts'), sorted_vocab, ensemble
        )

        if cleanup_files:
            rmtree(model_dir)
        return model
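The preprocessing loop above chains the GloVe helper binaries by redirecting each tool's stdin and stdout through `utils.check_output`. A minimal standalone sketch of the first step with plain `subprocess` (paths and parameter values are placeholders, not taken from the example above):

import subprocess

# Equivalent of the vocab_count step: read the copied corpus on stdin, write the
# frequency-annotated vocabulary on stdout (placeholder paths and parameters).
with open('meta/corpus.txt', 'rb') as fin, open('meta/tempvocab.txt', 'wb') as fout:
    subprocess.check_call(
        ['glove/vocab_count', '-min-count', '5', '-max-vocab', '0'],
        stdin=fin, stdout=fout,
    )
# cooccur and shuffle are chained the same way: each reads the previous step's output
# on stdin and writes its own result to stdout, ending with meta/wiki.toy.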
Example #29
0
    def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1, min_count=5, max_vocab_size=0,
              sgd_num=100, lrate=0.001, period=10, iter=90, epsilon=0.75, dump_period=10, reg=0, alpha=100,
              beta=99, loss='hinge', memory=4.0, np=1, cleanup_files=False, sorted_vocab=1, ensemble=0):
        """Train model.

        Parameters
        ----------
        wr_path : str
            Absolute path to the Wordrank directory.
        corpus_file : str
            Path to the corpus file; expected format is space-separated tokens, one document per line.
        out_name : str
            Name of the directory which will be created (in wordrank folder) to save embeddings and training data:
                * ``model_word_current_<iter>.txt`` - Word Embeddings saved after every dump_period.
                * ``model_context_current_<iter>.txt`` - Context Embeddings saved after every dump_period.
                * ``meta/vocab.txt`` - vocab file.
                * ``meta/wiki.toy`` - word-word co-occurrence values.
        size : int, optional
            Dimensionality of the feature vectors.
        window : int, optional
            Number of context words to the left (and to the right, if `symmetric = 1`).
        symmetric : {0, 1}, optional
            If 1 - use symmetric windows, if 0 - use only left context words.
        min_count : int, optional
            Ignore all words with total frequency lower than `min_count`.
        max_vocab_size : int, optional
            Upper bound on vocabulary size, i.e. keep the <int> most frequent words. If 0 - no limit.
        sgd_num : int, optional
            Number of SGD steps taken for each data point.
        lrate : float, optional
            Learning rate (caution: too high a value diverges and gives NaN).
        period : int, optional
            Period of xi variable updates.
        iter : int, optional
            Number of iterations (epochs) over the corpus.
        epsilon : float, optional
            Power scaling value for weighting function.
        dump_period : int, optional
            Period after which embeddings should be dumped.
        reg : int, optional
            Value of regularization parameter.
        alpha : int, optional
            Alpha parameter of gamma distribution.
        beta : int, optional
            Beta parameter of gamma distribution.
        loss : {"logistic", "hinge"}, optional
            Name of the loss function.
        memory : float, optional
            Soft limit for memory consumption, in GB.
        np : int, optional
            Number of processes to launch (mpirun option).
        cleanup_files : bool, optional
            If True, delete directory and files used by this wrapper.
        sorted_vocab : {0, 1}, optional
            If 1 - sort the vocabulary by descending frequency before assigning word indexes, otherwise - do nothing.
        ensemble : {0, 1}, optional
            If 1 - use ensemble of word and context vectors.

        """

        # prepare training data (cooccurrence matrix and vocab)
        model_dir = os.path.join(wr_path, out_name)
        meta_dir = os.path.join(model_dir, 'meta')
        os.makedirs(meta_dir)
        logger.info("Dumped data will be stored in '%s'", model_dir)
        copyfile(corpus_file, os.path.join(meta_dir, corpus_file.split('/')[-1]))

        vocab_file = os.path.join(meta_dir, 'vocab.txt')
        temp_vocab_file = os.path.join(meta_dir, 'tempvocab.txt')
        cooccurrence_file = os.path.join(meta_dir, 'cooccurrence')
        cooccurrence_shuf_file = os.path.join(meta_dir, 'wiki.toy')
        meta_file = os.path.join(meta_dir, 'meta')

        cmd_vocab_count = [
            os.path.join(wr_path, 'glove', 'vocab_count'),
            '-min-count', str(min_count), '-max-vocab', str(max_vocab_size)
        ]
        cmd_cooccurence_count = [
            os.path.join(wr_path, 'glove', 'cooccur'), '-memory', str(memory),
            '-vocab-file', temp_vocab_file, '-window-size', str(window), '-symmetric', str(symmetric)
        ]
        cmd_shuffle_cooccurences = [os.path.join(wr_path, 'glove', 'shuffle'), '-memory', str(memory)]
        cmd_del_vocab_freq = ['cut', '-d', " ", '-f', '1', temp_vocab_file]

        commands = [cmd_vocab_count, cmd_cooccurence_count, cmd_shuffle_cooccurences]
        input_fnames = [
            os.path.join(meta_dir, os.path.split(corpus_file)[-1]),
            os.path.join(meta_dir, os.path.split(corpus_file)[-1]),
            cooccurrence_file
        ]
        output_fnames = [temp_vocab_file, cooccurrence_file, cooccurrence_shuf_file]

        logger.info("Prepare training data (%s) using glove code", ", ".join(input_fnames))
        for command, input_fname, output_fname in zip(commands, input_fnames, output_fnames):
            with smart_open(input_fname, 'rb') as r:
                with smart_open(output_fname, 'wb') as w:
                    utils.check_output(w, args=command, stdin=r)

        logger.info("Deleting frequencies from vocab file")
        with smart_open(vocab_file, 'wb') as w:
            utils.check_output(w, args=cmd_del_vocab_freq)

        with smart_open(vocab_file, 'rb') as f:
            numwords = sum(1 for _ in f)
        with smart_open(cooccurrence_shuf_file, 'rb') as f:
            numlines = sum(1 for _ in f)
        with smart_open(meta_file, 'wb') as f:
            meta_info = "{0} {1}\n{2} {3}\n{4} {5}".format(
                numwords, numwords, numlines, cooccurrence_shuf_file.split('/')[-1],
                numwords, vocab_file.split('/')[-1]
            )
            f.write(meta_info.encode('utf-8'))

        if iter % dump_period == 0:
            iter += 1
        else:
            logger.warning(
                "Resultant embedding will be from %d iterations rather than the input %d iterations, "
                "as wordrank dumps the embedding only at dump_period intervals. "
                "Input an appropriate combination of parameters (iter, dump_period) "
                "such that \"iter mod dump_period\" is zero.",
                iter - (iter % dump_period), iter
            )

        wr_args = {
            'path': meta_dir,
            'nthread': multiprocessing.cpu_count(),
            'sgd_num': sgd_num,
            'lrate': lrate,
            'period': period,
            'iter': iter,
            'epsilon': epsilon,
            'dump_prefix': 'model',
            'dump_period': dump_period,
            'dim': size,
            'reg': reg,
            'alpha': alpha,
            'beta': beta,
            'loss': loss
        }

        # run wordrank executable with wr_args
        cmd = ['mpirun', '-np', str(np), os.path.join(wr_path, 'wordrank')]
        for option, value in wr_args.items():
            cmd.append('--%s' % option)
            cmd.append(str(value))
        logger.info("Running wordrank binary")
        utils.check_output(args=cmd)

        # use embeddings from max. iteration's dump
        max_iter_dump = iter - (iter % dump_period)
        os.rename('model_word_%d.txt' % max_iter_dump, os.path.join(model_dir, 'wordrank.words'))
        os.rename('model_context_%d.txt' % max_iter_dump, os.path.join(model_dir, 'wordrank.contexts'))
        model = cls.load_wordrank_model(
            os.path.join(model_dir, 'wordrank.words'), vocab_file,
            os.path.join(model_dir, 'wordrank.contexts'), sorted_vocab, ensemble
        )

        if cleanup_files:
            rmtree(model_dir)
        return model
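A minimal usage sketch for this classmethod, assuming it is exposed as gensim's Wordrank wrapper (the import path, installation directory and corpus path are assumptions, not part of the example above):

from gensim.models.wrappers import Wordrank  # assumed wrapper location in older gensim releases

wr_path = '/home/user/wordrank'  # directory containing the compiled wordrank and glove binaries
model = Wordrank.train(
    wr_path, corpus_file='corpus.txt', out_name='wr_model',
    size=100, window=15, iter=90, dump_period=10,
)
# the returned object can then be queried like other gensim word-vector models,
# e.g. (assuming the usual KeyedVectors interface) model.most_similar('night')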
Example #30
0
    def train(cls, ft_path, corpus_file, output_file=None, model='cbow', size=100, alpha=0.025, window=5, min_count=5,
            word_ngrams=1, loss='ns', sample=1e-3, negative=5, iter=5, min_n=3, max_n=6, sorted_vocab=1, threads=12):
        """
        `ft_path` is the path to the FastText executable, e.g. `/home/kofola/fastText/fasttext`.

        `corpus_file` is the filename of the text file to be used for training the FastText model.
        Expects file to contain utf-8 encoded text.

        `model` defines the training algorithm. By default, cbow is used. Accepted values are
        'cbow', 'skipgram'.

        `size` is the dimensionality of the feature vectors.

        `window` is the maximum distance between the current and predicted word within a sentence.

        `alpha` is the initial learning rate.

        `min_count` = ignore all words with total occurrences lower than this.

        `word_ngrams` = max length of word ngram.

        `loss` = defines training objective. Allowed values are `hs` (hierarchical softmax),
        `ns` (negative sampling) and `softmax`. Defaults to `ns`

        `sample` = threshold for configuring which higher-frequency words are randomly downsampled;
            default is 1e-3, useful range is (0, 1e-5).

        `negative` = how many "noise words" should be drawn (usually between 5 and 20). Default is 5.
        If set to 0, no negative sampling is used. Only relevant when `loss` is set to `ns`.

        `iter` = number of iterations (epochs) over the corpus. Default is 5.

        `min_n` = min length of char ngrams to be used for training word representations. Default is 3.

        `max_n` = max length of char ngrams to be used for training word representations. Set `max_n`
        less than `min_n` to avoid char ngrams being used at all. Default is 6.

        `sorted_vocab` = if 1 (default), sort the vocabulary by descending frequency before
        assigning word indexes.

        `threads` = number of threads to use. Default is 12.

        """
        output_file = output_file or os.path.join(tempfile.gettempdir(), 'ft_model')
        ft_args = {
            'input': corpus_file,
            'output': output_file,
            'lr': alpha,
            'dim': size,
            'ws': window,
            'epoch': iter,
            'minCount': min_count,
            'wordNgrams': word_ngrams,
            'neg': negative,
            'loss': loss,
            'minn': min_n,
            'maxn': max_n,
            'thread': threads,
            't': sample
        }
        cmd = [ft_path, model]
        for option, value in ft_args.items():
            cmd.append("-%s" % option)
            cmd.append(str(value))

        output = utils.check_output(args=cmd)
        model = cls.load_fasttext_format(output_file)
        cls.delete_training_files(output_file)
        return model
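A minimal usage sketch for this wrapper, assuming it is exposed as gensim's FastText wrapper (the import path and binary location are assumptions):

from gensim.models.wrappers import FastText  # assumed wrapper location in older gensim releases

ft_path = '/home/user/fastText/fasttext'  # path to the compiled fastText binary
model = FastText.train(ft_path, corpus_file='corpus.txt', model='skipgram', size=100, iter=5)
# note that the temporary fastText output files are deleted after loading; the trained
# vectors can then be queried, e.g. (assuming the usual interface) model['night']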
Example #31
0
    def train(self, corpus, time_slices, mode, model):
        """Train DTM model.

        Parameters
        ----------
        corpus : iterable of iterable of (int, int)
            Collection of texts in BoW format.
        time_slices : list of int
            Sequence of timestamps.
        mode : {'fit', 'time'}, optional
            Controls the mode of operation: 'fit' trains a model, 'time' analyzes documents through time
            according to a previously trained DTM (essentially a held-out set).
        model : {'fixed', 'dtm'}, optional
            Controls which model will be run: 'fixed' for DIM, 'dtm' for DTM.

        """
        self.convert_input(corpus, time_slices)

        arguments = \
            "--ntopics={p0} --model={mofrl}  --mode={p1} --initialize_lda={p2} --corpus_prefix={p3} " \
            "--outname={p4} --alpha={p5}".format(
                p0=self.num_topics, mofrl=model, p1=mode, p2=self.initialize_lda,
                p3=self.fcorpus(), p4=self.foutname(), p5=self.alpha
            )

        params = \
            "--lda_max_em_iter={p0} --lda_sequence_min_iter={p1}  --lda_sequence_max_iter={p2} " \
            "--top_chain_var={p3} --rng_seed={p4} ".format(
                p0=self.lda_max_em_iter, p1=self.lda_sequence_min_iter, p2=self.lda_sequence_max_iter,
                p3=self.top_chain_var, p4=self.rng_seed
            )

        arguments = arguments + " " + params
        logger.info("training DTM with args %s", arguments)

        cmd = [self.dtm_path] + arguments.split()
        logger.info("Running command %s", cmd)
        check_output(args=cmd, stderr=PIPE)

        self.em_steps = np.loadtxt(self.fem_steps())
        self.init_ss = np.loadtxt(self.flda_ss())

        if self.initialize_lda:
            self.init_alpha = np.loadtxt(self.finit_alpha())
            self.init_beta = np.loadtxt(self.finit_beta())

        self.lhood_ = np.loadtxt(self.fout_liklihoods())

        # document-topic proportions
        self.gamma_ = np.loadtxt(self.fout_gamma())
        # cast to correct shape: gamma_[5, 10] is the proportion of the 10th topic in doc 5
        self.gamma_.shape = (self.lencorpus, self.num_topics)
        # normalize proportions
        self.gamma_ /= self.gamma_.sum(axis=1)[:, np.newaxis]

        self.lambda_ = np.zeros((self.num_topics, self.num_terms * len(self.time_slices)))
        self.obs_ = np.zeros((self.num_topics, self.num_terms * len(self.time_slices)))

        for t in range(self.num_topics):
            topic = "%03d" % t
            self.lambda_[t, :] = np.loadtxt(self.fout_prob().format(i=topic))
            self.obs_[t, :] = np.loadtxt(self.fout_observations().format(i=topic))
        # cast to correct shape: lambda_[5, 10, 0] is the weight of the 10th term in topic 5 at time slice 0
        self.lambda_.shape = (self.num_topics, self.num_terms, len(self.time_slices))
        self.obs_.shape = (self.num_topics, self.num_terms, len(self.time_slices))
        # extract document influence on topics for each time slice
        # influences_time[0] holds the influences at time slice 0
        if model == 'fixed':
            for k, t in enumerate(self.time_slices):
                stamp = "%03d" % k
                influence = np.loadtxt(self.fout_influence().format(i=stamp))
                influence.shape = (t, self.num_topics)
                # influence[2,5] influence of document 2 on topic 5
                self.influences_time.append(influence)
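To make the reshaping and normalization of the document-topic proportions above concrete, here is a tiny standalone sketch with made-up numbers (2 documents, 3 topics; not data produced by the example):

import numpy as np

# Hypothetical flat gamma as loaded from disk, reshaped to (num_documents, num_topics).
gamma = np.array([2.0, 1.0, 1.0, 0.5, 0.5, 1.0])
gamma.shape = (2, 3)
# Normalize each row so every document's topic proportions sum to 1.
gamma /= gamma.sum(axis=1)[:, np.newaxis]
print(gamma)
# [[0.5  0.25 0.25]
#  [0.25 0.25 0.5 ]]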
Example #32
0
    def train(cls,
              wr_path,
              corpus_file,
              out_name,
              size=100,
              window=15,
              symmetric=1,
              min_count=5,
              max_vocab_size=0,
              sgd_num=100,
              lrate=0.001,
              period=10,
              iter=90,
              epsilon=0.75,
              dump_period=10,
              reg=0,
              alpha=100,
              beta=99,
              loss='hinge',
              memory=4.0,
              np=1,
              cleanup_files=False,
              sorted_vocab=1,
              ensemble=0):
        """Train model.

        Parameters
        ----------
        wr_path : str
            Absolute path to the Wordrank directory.
        corpus_file : str
            Path to the corpus file; expected format is space-separated tokens, one document per line.
        out_name : str
            Name of the directory which will be created (in wordrank folder) to save embeddings and training data:
                * ``model_word_current_<iter>.txt`` - Word Embeddings saved after every dump_period.
                * ``model_context_current_<iter>.txt`` - Context Embeddings saved after every dump_period.
                * ``meta/vocab.txt`` - vocab file.
                * ``meta/wiki.toy`` - word-word co-occurrence values.
        size : int, optional
            Dimensionality of the feature vectors.
        window : int, optional
            Number of context words to the left (and to the right, if `symmetric = 1`).
        symmetric : {0, 1}, optional
            If 1 - use symmetric windows, if 0 - use only left context words.
        min_count : int, optional
            Ignore all words with total frequency lower than `min_count`.
        max_vocab_size : int, optional
            Upper bound on vocabulary size, i.e. keep the <int> most frequent words. If 0 - no limit.
        sgd_num : int, optional
            Number of SGD steps taken for each data point.
        lrate : float, optional
            Learning rate (caution: too high a value diverges and gives NaN).
        period : int, optional
            Period of xi variable updates.
        iter : int, optional
            Number of iterations (epochs) over the corpus.
        epsilon : float, optional
            Power scaling value for weighting function.
        dump_period : int, optional
            Period after which embeddings should be dumped.
        reg : int, optional
            Value of regularization parameter.
        alpha : int, optional
            Alpha parameter of gamma distribution.
        beta : int, optional
            Beta parameter of gamma distribution.
        loss : {"logistic", "hinge"}, optional
            Name of the loss function.
        memory : float, optional
            Soft limit for memory consumption, in GB.
        np : int, optional
            Number of processes to launch (mpirun option).
        cleanup_files : bool, optional
            If True, delete directory and files used by this wrapper.
        sorted_vocab : {0, 1}, optional
            If 1 - sort the vocabulary by descending frequency before assigning word indexes, otherwise - do nothing.
        ensemble : {0, 1}, optional
            If 1 - use ensemble of word and context vectors.

        """

        # prepare training data (cooccurrence matrix and vocab)
        model_dir = os.path.join(wr_path, out_name)
        meta_dir = os.path.join(model_dir, 'meta')
        os.makedirs(meta_dir)
        logger.info("Dumped data will be stored in '%s'", model_dir)
        copyfile(corpus_file, os.path.join(meta_dir, corpus_file.split('/')[-1]))

        vocab_file = os.path.join(meta_dir, 'vocab.txt')
        temp_vocab_file = os.path.join(meta_dir, 'tempvocab.txt')
        cooccurrence_file = os.path.join(meta_dir, 'cooccurrence')
        cooccurrence_shuf_file = os.path.join(meta_dir, 'wiki.toy')
        meta_file = os.path.join(meta_dir, 'meta')

        cmd_vocab_count = [
            os.path.join(wr_path, 'glove', 'vocab_count'),
            '-min-count', str(min_count), '-max-vocab', str(max_vocab_size)
        ]
        cmd_cooccurence_count = [
            os.path.join(wr_path, 'glove', 'cooccur'), '-memory', str(memory),
            '-vocab-file', temp_vocab_file, '-window-size', str(window), '-symmetric', str(symmetric)
        ]
        cmd_shuffle_cooccurences = [os.path.join(wr_path, 'glove', 'shuffle'), '-memory', str(memory)]
        cmd_del_vocab_freq = ['cut', '-d', " ", '-f', '1', temp_vocab_file]

        commands = [cmd_vocab_count, cmd_cooccurence_count, cmd_shuffle_cooccurences]
        input_fnames = [
            os.path.join(meta_dir, os.path.split(corpus_file)[-1]),
            os.path.join(meta_dir, os.path.split(corpus_file)[-1]),
            cooccurrence_file
        ]
        output_fnames = [temp_vocab_file, cooccurrence_file, cooccurrence_shuf_file]

        logger.info("Prepare training data (%s) using glove code", ", ".join(input_fnames))
        for command, input_fname, output_fname in zip(commands, input_fnames, output_fnames):
            with smart_open(input_fname, 'rb') as r:
                with smart_open(output_fname, 'wb') as w:
                    utils.check_output(w, args=command, stdin=r)

        logger.info("Deleting frequencies from vocab file")
        with smart_open(vocab_file, 'wb') as w:
            utils.check_output(w, args=cmd_del_vocab_freq)

        with smart_open(vocab_file, 'rb') as f:
            numwords = sum(1 for _ in f)
        with smart_open(cooccurrence_shuf_file, 'rb') as f:
            numlines = sum(1 for _ in f)
        with smart_open(meta_file, 'wb') as f:
            meta_info = "{0} {1}\n{2} {3}\n{4} {5}".format(
                numwords, numwords, numlines, cooccurrence_shuf_file.split('/')[-1],
                numwords, vocab_file.split('/')[-1]
            )
            f.write(meta_info.encode('utf-8'))

        if iter % dump_period == 0:
            iter += 1
        else:
            logger.warning(
                "Resultant embedding will be from %d iterations rather than the input %d iterations, "
                "as wordrank dumps the embedding only at dump_period intervals. "
                "Input an appropriate combination of parameters (iter, dump_period) "
                "such that \"iter mod dump_period\" is zero.",
                iter - (iter % dump_period), iter)

        wr_args = {
            'path': meta_dir,
            'nthread': multiprocessing.cpu_count(),
            'sgd_num': sgd_num,
            'lrate': lrate,
            'period': period,
            'iter': iter,
            'epsilon': epsilon,
            'dump_prefix': 'model',
            'dump_period': dump_period,
            'dim': size,
            'reg': reg,
            'alpha': alpha,
            'beta': beta,
            'loss': loss
        }

        # run wordrank executable with wr_args
        cmd = ['mpirun', '-np', str(np), os.path.join(wr_path, 'wordrank')]
        for option, value in wr_args.items():
            cmd.append('--%s' % option)
            cmd.append(str(value))
        logger.info("Running wordrank binary")
        utils.check_output(args=cmd)

        # use embeddings from max. iteration's dump
        max_iter_dump = iter - (iter % dump_period)
        os.rename('model_word_%d.txt' % max_iter_dump, os.path.join(model_dir, 'wordrank.words'))
        os.rename('model_context_%d.txt' % max_iter_dump, os.path.join(model_dir, 'wordrank.contexts'))
        model = cls.load_wordrank_model(
            os.path.join(model_dir, 'wordrank.words'), vocab_file,
            os.path.join(model_dir, 'wordrank.contexts'), sorted_vocab, ensemble
        )

        if cleanup_files:
            rmtree(model_dir)
        return model
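A quick way to see the command the loop above assembles is to build and print it with stand-in values before handing it to `utils.check_output` (the wordrank path and argument values below are placeholders):

# Sketch: expand a wr_args-style dict into the mpirun command line (placeholder values).
wr_args = {'path': 'model/meta', 'nthread': 4, 'iter': 91, 'dump_period': 10, 'dim': 100, 'loss': 'hinge'}
cmd = ['mpirun', '-np', '1', '/path/to/wordrank']
for option, value in wr_args.items():
    cmd.extend(['--%s' % option, str(value)])
print(' '.join(cmd))
# e.g. mpirun -np 1 /path/to/wordrank --path model/meta --nthread 4 --iter 91 --dump_period 10 --dim 100 --loss hinge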
Example #33
0
    def train(self, corpus, **_):
        """Train Mallet LDA.

        Parameters
        ----------
        corpus : iterable of iterable of (int, int)
            Corpus in BoW format

        For reference, the output of MALLET's ``train-topics --help TRUE``:

        A tool for estimating, saving and printing diagnostics for topic models, such as LDA.
        --help TRUE|FALSE
            Print this command line option usage information.  Give argument of TRUE for longer documentation
            Default is false
        --prefix-code 'JAVA CODE'
            Java code you want run before any other interpreted code.  Note that the text is interpreted without modification,
            so unlike some other Java code options, you need to include any necessary 'new's when creating objects.
            Default is null
        --config FILE
            Read command option values from a file
            Default is null
        --input FILENAME
            The filename from which to read the list of training instances.  Use - for stdin.
            The instances must be FeatureSequenceor FeatureSequenceWithBigrams, not FeatureVector
            Default is null
        --input-model FILENAME
            The filename from which to read the binary topic model. The --input option is ignored.
            By default this is null, indicating that no file will be read.
            Default is null
        --input-state FILENAME
            The filename from which to read the gzipped Gibbs sampling state created by --output-state.
            The original input file must be included, using --input. By default this is null, indicating that no file will be read.
            Default is null
        --output-model FILENAME
            The filename in which to write the binary topic model at the end of the iterations.
            By default this is null,indicating that no file will be written.
            Default is null
        --output-state FILENAME
            The filename in which to write the Gibbs sampling state at the end of the iterations.
            By default this is null, indicating that no file will be written.
            Default is null
        --output-model-interval INTEGER
            The number of iterations between writing the model (and its Gibbs sampling state) to a binary file.
            You must also set the --output-model to use this option, whose argument will be the prefix of the filenames.
            Default is 0
        --output-state-interval INTEGER
            The number of iterations between writing the sampling state to a text file.
            You must also set the --output-state to use this option, whose argument will be the prefix of the filenames.
            Default is 0
        --inferencer-filename FILENAME
            A topic inferencer applies a previously trained topic model to new documents.
            By default this is null, indicating that no file will be written.
            Default is null
        --evaluator-filename FILENAME
            A held-out likelihood evaluator for new documents.
            By default this is null, indicating that no file will be written.
            Default is null
        --output-topic-keys FILENAME
            The filename in which to write the top words for each topic and any Dirichlet parameters.
            By default this is null, indicating that no file will be written.
            Default is null
        --num-top-words INTEGER
            The number of most probable words to print for each topic after model estimation.
            Default is 20
        --show-topics-interval INTEGER
            The number of iterations between printing a brief summary of the topics so far.
            Default is 50
        --topic-word-weights-file FILENAME
            The filename in which to write unnormalized weights for every topic and word type.
            By default this is null, indicating that no file will be written.
            Default is null
        --word-topic-counts-file FILENAME
            The filename in which to write a sparse representation of topic-word assignments.
            By default this is null, indicating that no file will be written.
            Default is null
        --diagnostics-file FILENAME
            The filename in which to write measures of topic quality, in XML format.
            By default this is null, indicating that no file will be written.
            Default is null
        --xml-topic-report FILENAME
            The filename in which to write the top words for each topic and any Dirichlet parameters in XML format.
            By default this is null, indicating that no file will be written.
            Default is null
        --xml-topic-phrase-report FILENAME
            The filename in which to write the top words and phrases for each topic and any Dirichlet parameters in XML format.
            By default this is null, indicating that no file will be written.
            Default is null
        --output-topic-docs FILENAME
            The filename in which to write the most prominent documents for each topic, at the end of the iterations.
            By default this is null, indicating that no file will be written.
            Default is null
        --num-top-docs INTEGER
            When writing topic documents with --output-topic-docs, report this number of top documents.
            Default is 100
        --output-doc-topics FILENAME
            The filename in which to write the topic proportions per document, at the end of the iterations.
            By default this is null, indicating that no file will be written.
            Default is null
        --doc-topics-threshold DECIMAL
            When writing topic proportions per document with --output-doc-topics, do not print topics with proportions less than this threshold value.
            Default is 0.0
        --doc-topics-max INTEGER
            When writing topic proportions per document with --output-doc-topics, do not print more than INTEGER number of topics.
            A negative value indicates that all topics should be printed.
            Default is -1
        --num-topics INTEGER
            The number of topics to fit.
            Default is 10
        --num-threads INTEGER
            The number of threads for parallel training.
            Default is 1
        --num-iterations INTEGER
            The number of iterations of Gibbs sampling.
            Default is 1000
        --num-icm-iterations INTEGER
            The number of iterations of iterated conditional modes (topic maximization).
            Default is 0
        --no-inference true|false
            Do not perform inference, just load a saved model and create a report. Equivalent to --num-iterations 0.
            Default is false
        --random-seed INTEGER
            The random seed for the Gibbs sampler.  Default is 0, which will use the clock.
            Default is 0
        --optimize-interval INTEGER
            The number of iterations between reestimating dirichlet hyperparameters.
            Default is 0
        --optimize-burn-in INTEGER
            The number of iterations to run before first estimating dirichlet hyperparameters.
            Default is 200
        --use-symmetric-alpha true|false
            Only optimize the concentration parameter of the prior over document-topic distributions.
            This may reduce the number of very small, poorly estimated topics, but may disperse common words over several topics.
            Default is false
        --alpha DECIMAL
            SumAlpha parameter: sum over topics of smoothing over doc-topic distributions. alpha_k = [this value] / [num topics]
            Default is 5.0
        --beta DECIMAL
            Beta parameter: smoothing parameter for each topic-word. beta_w = [this value]
            Default is 0.01

        """
        self.convert_input(corpus, infer=False)
        cmd = (
            self.mallet_path +
            ' train-topics --input %s --num-topics %s  --alpha %s --optimize-interval %s '
            '--num-threads %s --output-state %s --output-doc-topics %s --output-topic-keys %s --num-top-words %s '
            '--num-iterations %s --inferencer-filename %s --doc-topics-threshold %s  --random-seed %s'
        )

        cmd = cmd % (
            self.mallet_corpus_filename(),
            self.num_topics,
            self.alpha,
            self.optimize_interval,
            self.workers,
            self.mallet_state_filename(),
            self.document_topics_filename(),
            self.topic_keys_filename(),
            self.num_top_words,
            self.iterations,
            self.inferencer_filename(),
            self.topic_threshold,
            str(self.random_seed),
        )
        # NOTE "--keep-sequence-bigrams" / "--use-ngrams true" poorer results + runs out of memory
        logger.info("training MALLET LDA with %s", cmd)
        check_output(args=cmd, shell=True)
        self.word_topics = self.load_word_topics()
        # NOTE - we are still keeping the wordtopics variable to not break backward compatibility.
        # word_topics has replaced wordtopics throughout the code;
        # wordtopics just stores the values of word_topics when train is called.
        self.wordtopics = self.word_topics
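A minimal end-to-end usage sketch for a wrapper exposing a train method like the one above, based on the older gensim LdaMallet API (the import path, MALLET location and toy corpus are assumptions; the example above may belong to a newer variant of that wrapper):

from gensim.corpora import Dictionary
from gensim.models.wrappers import LdaMallet  # assumed wrapper location in older gensim releases

texts = [['human', 'computer', 'interaction'], ['graph', 'trees', 'graph']]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

mallet_path = '/path/to/mallet-2.0.8/bin/mallet'  # placeholder
lda = LdaMallet(mallet_path, corpus=corpus, num_topics=2, id2word=dictionary, iterations=50)
print(lda.show_topics(num_topics=2, num_words=5))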