class LDAembedding(InputEmbedding):
    def __init__(self, workdir="./embedding-models", name="lda-embedding"):
        """
        Erstellt durch Aufruf von Pretrain ein Vokabular
        :param workdir:
        :param name:
        """
        super(LDAembedding, self).__init__(workdir=workdir, name=name)
        self._normalizer = TweetNormalisation()

    def _load(self):
        modeldir = self._workdir.joinpath("ldamodel_{}".format(self._name))
        if not modeldir.exists():
            return False
        self._lda = LdaMulticore.load(str(modeldir))
        self._dictionary = Dictionary.load(
            str(self._workdir.joinpath("dictionary_{}.gz".format(self._name))))
        return True

    def pretrain(self, texts: typing.Iterable[typing.Text]):
        texts = [self._normalizer(text).split() for text in tqdm(texts)]
        self._dictionary = Dictionary(texts, prune_at=200000)
        corpus = [self._dictionary.doc2bow(text) for text in tqdm(texts)]
        self._lda = LdaMulticore(corpus=corpus,
                                 id2word=self._dictionary,
                                 workers=15,
                                 num_topics=50)

        self._dictionary.save(
            str(self._workdir.joinpath("dictionary_{}.gz".format(self._name))))
        self._lda.save(
            str(self._workdir.joinpath("ldamodel_{}".format(self._name))))

    def get_train_data(self, texts: typing.Iterable[typing.Text]) -> np.ndarray:
        to_array = lambda x: np.array([
            v
            for _, v in self._lda.get_document_topics(x, minimum_probability=0)
        ])
        return np.stack([
            to_array(self._dictionary.doc2bow(self._normalizer(text).split()))
            for text in texts
        ])
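
# A minimal usage sketch for the class above, assuming the InputEmbedding base
# class and TweetNormalisation come from the surrounding project:
#
#     embedder = LDAembedding(workdir="./embedding-models", name="lda-embedding")
#     embedder.pretrain(train_texts)          # builds the dictionary and the 50-topic LDA model
#     X = embedder.get_train_data(new_texts)  # (n_docs, 50) array of topic probabilities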
Example #2
def make_ldamodel(pre_processed, num_topics=5, pylda=False):

    dictionary = corpora.Dictionary(pre_processed)
    corpus = [dictionary.doc2bow(text) for text in pre_processed]
    model = LdaMulticore(corpus,
                         num_topics=num_topics,
                         id2word=dictionary,
                         passes=30,
                         random_state=1)
    if pylda:
        lda_display = pyLDAvis.gensim.prepare(model, corpus, dictionary)
        with open('topic_associations.txt', 'w') as outfile:
            for f in sorted(glob.glob('letters/*.txt')):
                with open(f, 'r') as fp:
                    text = pre_process([fp.read()])
                # reuse the training dictionary so the token ids match the model's id2word
                outfile.write('%s - %s\n' %
                              (f, model.get_document_topics(
                                  dictionary.doc2bow(text[0]))))

        pyLDAvis.show(lda_display)
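
# A hypothetical call of make_ldamodel(), assuming pre_process() returns a list
# of token lists (one list per document), as the inner loop above suggests:
#
#     docs = pre_process([open(f).read() for f in sorted(glob.glob('letters/*.txt'))])
#     make_ldamodel(docs, num_topics=5, pylda=True)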
Example #3
# Build LDA model
# lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
#                                            id2word=id2word,
#                                            num_topics=35, 
#                                            random_state=100,
#                                            update_every=1,
#                                            chunksize=100,
#                                            passes=10,
#                                            alpha='auto',
#                                            per_word_topics=True)


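# NOTE: `model`, `corpus`, `texts`, `dictionary`, `id2word` and `lda_multicore`
# below are assumed to be defined in earlier cells of this notebook excerpt.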
pprint(model.print_topics())
doc_lda = model[corpus]
doc_lda[4]
model.get_document_topics(corpus)[1]

# Compute Perplexity
print('\nPerplexity: ', model.log_perplexity(corpus))  # a measure of how well the model fits; lower is better

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_multicore, corpus, dictionary)
vis

mallet_path = '/home/ubuntu/Signal/mallet-2.0.8/bin/mallet'
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=35, id2word=id2word)
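
# One way to sanity-check the Mallet model is to score it with the same coherence
# pipeline used above (this assumes `texts` and `id2word` are the tokenised corpus
# and dictionary from earlier in the notebook):
#
#     coherence_model_mallet = CoherenceModel(model=ldamallet, texts=texts,
#                                             dictionary=id2word, coherence='c_v')
#     print('\nMallet Coherence Score: ', coherence_model_mallet.get_coherence())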
Example #4
    def generate_embedings(self,
                           method="tf-idf",
                           tag=None,
                           tag_column=None,
                           return_model=False):
        # Collect the data for the embeddings and save it to a file for multiprocessing
        if tag != None and tag_column != None:
            if (tag_column not in self.df.columns):
                raise ValueError(f"Tag {tag_column} not found in dataset")
            elif tag not in self.df[tag_column].to_list():
                raise ValueError(
                    f"Tag {tag} not found in dataset column {tag_column}")
            texts = self.df[self.df[tag_column] == tag][self.text_column]
        else:
            texts = self.df[self.text_column]

        with open('storage/texts.txt', 'w', encoding='utf8') as file:
            for sentence in texts:
                file.write(" ".join([tok for tok in sentence]) + "\n")

        # Check whether the user made a mistake in the tag input
        if tag != None and tag_column == None:
            raise ValueError("if passing tag must pass tag_column as well")

        if tag_column != None and tag == None:
            raise ValueError("if passing tag_column must pass tag as well")

        # Check whether the vectors were already generated and the target is the whole corpus
        if method in self.embedings and tag == None:
            if return_model:
                return self.embedings[method]
            else:
                return self.embedings[method][0]

        # Compute TF-IDF
        if method == "tf-idf":
            model = TfidfVectorizer(min_df=5,
                                    max_df=0.9,
                                    max_features=5000,
                                    sublinear_tf=False,
                                    analyzer=lambda x: x)

            vectors = model.fit_transform(texts)

        # Train Word2Vec
        elif method == "word2vec" or method == "cbow":
            model = gensim.models.Word2Vec(corpus_file='storage/texts.txt',
                                           window=5,
                                           size=200,
                                           min_count=5,
                                           iter=100,
                                           workers=4)

            vectors = model.wv
            if tag == None:
                self.embedings["word2vec"] = vectors

            # Build CBOW-style document vectors (normalised sum of word vectors)
            if method == "cbow":
                vectors = []
                for text in texts:
                    vec = np.zeros(model.wv.vector_size)
                    for word in text:
                        if word in model.wv.vocab:
                            vec += model.wv.get_vector(word)

                    norm = np.linalg.norm(vec)
                    if norm > np.finfo(float).eps:
                        vec /= norm
                    vectors.append(vec)

                vectors = scipy.sparse.csr.csr_matrix(vectors)

        # Train Doc2Vec
        elif method == "doc2vec":

            model = gensim.models.Doc2Vec(corpus_file='storage/texts.txt',
                                          vector_size=200,
                                          window=5,
                                          min_count=5,
                                          workers=12,
                                          epochs=100)

            vectors = scipy.sparse.csr.csr_matrix(model.docvecs.vectors_docs)

        # Train LDA
        elif "lda" in method:
            if "_" in method:
                NUM_TOPICS = int(method.split("_")[-1])
            else:
                NUM_TOPICS = 20

            dictionary = Dictionary(texts)
            doc2bow = [dictionary.doc2bow(text) for text in texts]
            ldamodel = LdaMulticore(doc2bow,
                                    num_topics=NUM_TOPICS,
                                    id2word=dictionary,
                                    passes=30)

            raw_vecs = [ldamodel.get_document_topics(text) for text in doc2bow]
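            # get_document_topics() returns sparse (topic_id, probability) pairs;
            # the loop below expands them into dense vectors of length num_topics.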

            lda_vecs = []
            for vec in raw_vecs:
                this_vec = []
                curr = 0
                for i in range(ldamodel.num_topics):
                    if (i == vec[curr][0]):
                        this_vec.append(vec[curr][1])
                        curr += 1
                        if curr == len(vec):
                            curr = -1
                    else:
                        this_vec.append(0)
                lda_vecs.append(this_vec)

            vectors = scipy.sparse.csr.csr_matrix(lda_vecs)
            model = [ldamodel, doc2bow, dictionary]

        else:
            raise ValueError(f"Method {method} is not recognized")

        # If not building a tag-filtered version, cache the results
        if tag == None and not self.low_memory:
            self.embedings[method] = (vectors, model)

        if return_model:
            return vectors, model
        else:
            return vectors
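
# Hypothetical call sites for the method above, assuming `pre` is an instance of
# the enclosing preprocessing class with `df`, `text_column` and `embedings` set up:
#
#     tfidf_vectors = pre.generate_embedings(method="tf-idf")
#     lda_vectors, lda_model = pre.generate_embedings(method="lda_30", return_model=True)
#     # lda_model is [ldamodel, doc2bow, dictionary]; "lda_<N>" selects N topics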
Example #5
class LDA():

    corpus = None
    model = None
    dictionary = None
    util = None
    loaded = False
    topicLabelling = defaultdict(int)

    def __init__(self, utilObj=None, logfilename=None):
        if (utilObj != None):
            self.util = utilObj
        elif (logfilename != None):
            self.util = Utilities.Utility()
            self.util.setupLogFileLoc(logfilename)

        self.util.startTimeTrack()

    def labelTopics(self, modelFilename):

        if (os.path.exists(modelFilename + '.label')):
            f = open(modelFilename + '.label', "rb")
            self.topicLabelling = pickle.load(f)
            f.close()
        else:
            #Label file not available, performing manual labelling. (One time operation)
            topics = self.model.show_topics(num_topics=100, num_words=20)
            print(
                'You will be shown a series of words and asked to label the topic in the form of an integer\n'
            )
            for topic in topics:
                print('The words affiliated with this topic are as follows\n',
                      topic[1])
                print(
                    '\033[92m' +
                    'Please label as one of these \n(0) EDUCATION\n(1) SKILLS\n(2) PERSONAL DETAILS\n(3) WORK EXPERIENCE'
                    + '\033[0m')
                mappedTopicInt = input(
                    'Please enter a new integer for this topic: ')
                self.topicLabelling[topic[0]] = mappedTopicInt
            f = open(modelFilename + '.label', "wb")
            pickle.dump(self.topicLabelling, f)
            f.close()

    def buildCorpus(self, folderListOfCorpus=None, maxdocs=-1):
        """
        For each folder
            for each cvd2v in in folder
                Get tokens from Utility tokenise and then form into a string
                Append string into a list (This forms a document)
        """
        self.util.logDebug('LDA', 'Building and fitting corpus ')
        documentList = []
        maxDocPerFolder = int(maxdocs / len(folderListOfCorpus.split(',')))
        docCounter = 0
        for folder in folderListOfCorpus.split(','):
            self.util.logDebug('LDA', 'Processing ' + folder)
            for filename in sorted(glob.iglob(folder + '/*.cvd2v')):
                if (docCounter <= maxDocPerFolder):
                    fileContent = self.util.tokensToStr(
                        self.util.tokenize(
                            self.util.readFileContent(filename=filename),
                            removeStopwords=True,
                            toLowercase=True,
                            replaceSlash=True,
                            flatEmail=True,
                            flatMonth=True,
                            flatNumber=True,
                            lemmatize=True), ' ')
                    documentList.append(fileContent)
                    docCounter = docCounter + 1
                else:
                    docCounter = 0
                    break

        self.util.logDebug(
            'LDA',
            str(len(documentList)) + ' documents loaded in ' +
            self.util.stopTimeTrack())
        texts = [[word for word in document.lower().split()]
                 for document in documentList]
        self.util.logDebug('LDA', 'No of documents: ' + str(len(texts)))
        self.util.logDebug('LDA', 'Text example: ' + str(texts[0]))
        self.dictionary = Dictionary(texts)

        self.corpus = [self.dictionary.doc2bow(text) for text in texts]
        self.util.logDebug('LDA',
                           'Corpus built in ' + self.util.stopTimeTrack())

    def trainModel(self, noOfTopics=4, dstFilename=None):
        workers = 30
        eval_every = 10
        iterations = 400
        passes = 20

        self.util.logDebug('LDA', 'Training model...')
        self.model = LdaMulticore(self.corpus,
                                  workers=workers,
                                  num_topics=noOfTopics,
                                  id2word=self.dictionary,
                                  eval_every=None,
                                  iterations=iterations,
                                  passes=passes)
        self.util.logDebug('LDA',
                           'Model trained in ' + self.util.stopTimeTrack())
        print(self.model.print_topics())
        self.saveModel(dstFilename)
        self.loaded = True

    def saveModel(self, filename):
        self.util.logDebug('LDA', 'Saving model to ' + filename)
        self.model.save(filename)
        self.dictionary.save(filename + '.dict')
        MmCorpus.serialize(filename + '.corpus', self.corpus)
        self.util.logDebug('LDA', 'Saved in ' + self.util.stopTimeTrack())

    def loadModel(self, filename):
        self.util.logDebug('LDA', 'Loading model from ' + filename)
        self.model = LdaMulticore.load(fname=filename)
        self.dictionary = Dictionary.load(fname=filename + '.dict')
        self.corpus = MmCorpus(filename + '.corpus')
        print(self.dictionary)
        print(self.model.print_topic(0, topn=5))
        print(self.model.print_topic(1, topn=5))
        print(self.model.print_topic(2, topn=5))
        print(self.model.print_topic(3, topn=5))
        self.loaded = True
        self.util.logDebug('LDA',
                           'Model loaded in ' + self.util.stopTimeTrack())
        self.labelTopics(filename)

    def getTopTopic(self, inferenceOutput):
        thisDict = defaultdict(int)
        probList = []
        for topic, prob in inferenceOutput:
            thisDict[str(prob)] = topic
            probList.append(prob)
        largestProb = max(probList)
        mostLikelyTopic = thisDict[str(largestProb)]
        return mostLikelyTopic

    def infer_topic_proba(self, string):
        import numpy as np
        prediction = [0.0, 0.0, 0.0, 0.0]
        if (self.loaded):
            bow = self.dictionary.doc2bow(self.util.tokenize(string))
            results = self.model.get_document_topics(bow)
            for result in results:
                prediction[result[0]] = result[1]
        else:
            self.util.logError('LDA', 'Model is not loaded, cannot infer')
        prediction = np.array(prediction)
        return prediction

    def infer_topic(self, string):
        results = None
        if (self.loaded):
            bow = self.dictionary.doc2bow(self.util.tokenize(string))
            results = self.model.get_document_topics(bow)
        else:
            self.util.logError('LDA', 'Model is not loaded, cannot infer')
        results = self.getTopTopic(results)
        return results

    def visualizeLDA(self, filename):

        dictionary = Dictionary.load(filename + '.dict')
        corpus = MmCorpus(filename + '.corpus')
        lda = LdaMulticore.load(filename)
        self.util.logDebug('LDA', 'Preparing HTML ')
        ldavis = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
        self.util.logDebug('LDA',
                           'HTML prepared in ' + self.util.stopTimeTrack())
        pyLDAvis.save_html(ldavis, filename + '.html')
        self.util.logDebug('LDA', 'HTML saved in ' + self.util.stopTimeTrack())


#
# lda = LDA(logfilename='/home/kah1/test.log')
# lda.loadModel('/u01/bigdata/02d_d2vModel1/CvLda4TopicModel.model')
# lda.labelTopics()
Example #6
def lda_MULTICORE(texts,
                  n_topics_range,
                  iterations,
                  passes,
                  workers,
                  out_dir,
                  verbose=True,
                  save_doc_top=True):
    '''Fit topic models and search for optimal hyperparameters.

    Dirtier, multicore version for faster running of HOPE stuff.


    Parameters
    ----------
    texts : list
        preprocessed corpus, where texts[0] is a document
        and texts[0][0] is a token.

    n_topics_range : range of int
        range of integers to use as the number of topics
        in iterations of the topic model.

    iterations : int
        maximum number of iterations for each topic model

    passes : int
        maximum number of passes (restarts of the iterations) for each topic model

    workers : int
        number of CPU worker processes to use

    out_dir : str
        path to a directory where results will be saved (in child directories).

    verbose : bool
        print progress messages?

    save_doc_top : bool
        save document-topic matrices from the models?


    Exports
    -------
    out_dir/report_lines/*
        ndjson with model information
        (n topics, alpha, eta, training time, coherence, per-topic coherence)

    out_dir/trained_lda/*
        gensim objects, where the model is saved.

    out_dir/plots/*
        pyLDAvis visualizations of the model

    out_dir/doctop_mats/*
        document-topic matrices (ndjson), exported when save_doc_top is True
    '''
    # make sure out_dir exists and has the expected subfolders
    make_folders(out_dir)

    # if a single model is to be fitted,
    # make sure it can be "iterated"
    if isinstance(n_topics_range, int):
        n_topics_range = [n_topics_range]

    # input texts to gensim format
    dictionary = corpora.Dictionary(texts)
    bows = [dictionary.doc2bow(tl) for tl in texts]

    # iterate
    report_list = []
    for n_top in chain(n_topics_range):

        if verbose:
            print("{} topics".format(n_top))

        start_time = time()

        # paths for saving
        ## it's not very elegant to define the paths here
        ## when the make_folders function already exists
        filename = str(n_top) + "T_" + 'ASM'
        report_path = os.path.join(out_dir, 'report_lines',
                                   filename + '.ndjson')

        trained_path = os.path.join(out_dir, 'trained_lda',
                                    filename + '.model')

        pyldavis_path = os.path.join(out_dir, 'plots',
                                     filename + '_pyldavis.html')

        doctop_path = os.path.join(out_dir, 'doctop_mats',
                                   filename + '_mat.ndjson')

        model = LdaMulticore(
            corpus=bows,
            num_topics=n_top,
            id2word=dictionary,
            workers=workers,
            chunksize=2000,
            passes=passes,
            batch=False,
            alpha='symmetric',
            eta=None,
            decay=0.5,
            offset=1.0,
            eval_every=10,
            iterations=iterations,
            gamma_threshold=0.001,
            random_state=None,
            minimum_probability=0.01,
            minimum_phi_value=0.01,
            per_word_topics=False,
        )

        # track time usage
        training_time = time() - start_time
        if verbose:
            print('    Time: {}'.format(training_time))

        # coherence
        coherence_model = CoherenceModel(model=model,
                                         texts=texts,
                                         corpus=bows,
                                         coherence='c_v')

        coh_score = coherence_model.get_coherence()
        coh_topics = coherence_model.get_coherence_per_topic()

        if verbose:
            print('    Coherence: {}'.format(coh_score.round(2)))

        # save priors
        alpha = model.alpha.tolist()
        eta = model.eta.tolist()

        # save report
        report = (n_top, alpha, eta, training_time, coh_score, coh_topics)
        report_list.append(report)
        with open(report_path, 'w') as f:
            ndjson.dump(report, f)

        # save model
        model.save(trained_path)

        # produce a visualization
        # it is imperative that sort_topics should never be turned on!
        vis = pyLDAvis.gensim.prepare(model,
                                      bows,
                                      dictionary,
                                      sort_topics=False)

        pyLDAvis.save_html(vis, pyldavis_path)

        # save document-topic matrix
        if save_doc_top:
            # keep minimum_probability at 0 for a complete matrix
            doc_top = [
                model.get_document_topics(doc, minimum_probability=0)
                for doc in model[bows]
            ]

            # unnest (n topic, prob) tuples
            # float to convert from np.float32 which is not
            # JSON serializable
            doc_top_prob = [[float(prob) for i, prob in doc]
                            for doc in doc_top]

            # save the matrix as ndjson
            with open(doctop_path, 'w') as f:
                ndjson.dump(doc_top_prob, f)

        gc.collect()

    return None
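
# A hedged example of how lda_MULTICORE() might be invoked; `texts` must be a
# list of token lists as described in the docstring, and out_dir must be writable
# (the path used here is only a placeholder):
#
#     texts = [["topic", "modelling", "example"], ["another", "tokenised", "document"]]
#     lda_MULTICORE(texts,
#                   n_topics_range=range(10, 51, 10),
#                   iterations=400,
#                   passes=10,
#                   workers=4,
#                   out_dir='models/lda_runs')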