Example #1
    for c in lda_model[corpus[5:8]]:
        print("Document Topics      : ", c[0])  # [(Topics, Perc Contrib)]
        print("Word id, Topics      : ", c[1][:3])  # [(Word id, [Topics])]
        print("Phi Values (word id) : ",
              c[2][:2])  # [(Word id, [(Topic, Phi Value)])]
        print("Word, Topics         : ",
              [(dct[wd], topic)
               for wd, topic in c[1][:2]])  # [(Word, [Topics])]
        print("Phi Values (word)    : ",
              [(dct[wd], topic)
               for wd, topic in c[2][:2]])  # [(Word, [(Topic, Phi Value)])]
        print("------------------------------------------------------\n")

    train_vecs = []
    for i in range(len(train)):
        top_topics = lda_model.get_document_topics(corpus[i],
                                                   minimum_probability=0.0)
        topic_vec = [top_topics[i][1] for i in range(num_topics)]
        # topic_vec.extend([train.iloc[i].real_counts]) # counts of reviews for restaurant
        topic_vec.extend([len(train.iloc[i].comment)])  # length review
        train_vecs.append(topic_vec)
    print(train_vecs)
    X = np.array(train_vecs)
    y = np.array((train['label']).values)
    print(y)
    print(len(X), len(y))
    test_vecs = []
    for i in range(len(test)):
        i = i + len(train)
        top_topics = lda_model.get_document_topics(corpus[i],
                                                   minimum_probability=0.0)
        topic_vec = [top_topics[i][1] for i in range(num_topics)]
        topic_vec.extend([len(test.iloc[i - len(train)].comment)])  # length review
        test_vecs.append(topic_vec)

# Top topics per paragraph
    df = pd.DataFrame()
    df['referenceId'] = referenceIds
    df['paragraph'] = raw_paragraphs
    topicNumbers = []
    for c in range(len(corpus)):
        maxProbability = 0
        indexOfMax = 0
        topTopics = []
        topTopicProbabilities = []
        for topicNumber in lda_model.get_document_topics(corpus[c]):
            topTopics.append(topicNumber[0])
            topTopicProbabilities.append(topicNumber[1])
        topTopicsSorted = [
            x for _, x in sorted(zip(topTopicProbabilities, topTopics),
                                 reverse=True)
        ]
        topicNumbers.append(topTopicsSorted)
    df['topTopics'] = topicNumbers

    # Most probable topic per paragraph
    topTopics = []
    for index, row in df.iterrows():
        if (row['topTopics']):
            topTopics.append(row['topTopics'][0])
        else:
            topTopics.append(-1)
    df['topic'] = topTopics
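A possible follow-up, not part of the original snippet: once X and y have been built from the topic vectors, a simple classifier can be fit on them. The use of scikit-learn's LogisticRegression and 5-fold cross-validation below is an assumption for illustration, not the original author's choice.

    # Minimal sketch (assumes scikit-learn is installed): score a classifier on
    # the topic-vector features X and labels y built above.
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import cross_val_score

    clf = LogisticRegression(max_iter=1000)
    print(cross_val_score(clf, X, y, cv=5).mean())  # mean accuracy over 5 folds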
Example #3
class Lda:
    def __init__(self):
        self.logger = Logger.logger
        self.storage_path = Config.lda_storage_path

        # Filenames
        self.gensim_dictionary = 'dictionary.gensim'
        self.gensim_model = 'model.gensim'
        self.corpus_pickle = 'corpus.pkl'

    def persist_lda(self):
        """
        Persist corpus, dictionary and LDA model locally.
        :return:
        """
        if not os.path.exists(self.storage_path):
            os.makedirs(self.storage_path)
        self.logger.info("Persist corpus, dictionary and lda-model to file.")
        with open(os.path.join(self.storage_path, self.corpus_pickle),
                  'wb') as f:
            pickle.dump(self.bow_corpus, f)
        self.dictionary.save(
            os.path.join(self.storage_path, self.gensim_dictionary))
        self.ldamodel.save(os.path.join(self.storage_path, self.gensim_model))

    def load_lda(self):
        """
        Load corpus, dictionary and LDA model from local storage.
        :return:
        """
        self.logger.info("Loading corpus, dictionary and lda-model from file.")
        self.dictionary = corpora.Dictionary.load(
            os.path.join(self.storage_path, self.gensim_dictionary))
        with open(os.path.join(self.storage_path, self.corpus_pickle),
                  'rb') as f:
            self.bow_corpus = pickle.load(f)
        path = os.path.join(self.storage_path, self.gensim_model)
        self.ldamodel = LdaModel.load(path)

    def show_topics(self):
        topics = self.ldamodel.print_topics(num_words=5)
        for topic in topics:
            print(topic)

    def train_lda(self, texts, num_topics=5, n=None):
        self.logger.info("Create corpus, dictionary lda-model.")
        self.dictionary = corpora.Dictionary(texts)
        self.bow_corpus = [self.dictionary.doc2bow(text) for text in texts]
        tfidf = models.TfidfModel(self.bow_corpus)
        corpus_tfidf = tfidf[self.bow_corpus]

        self.ldamodel = LdaMulticore(
            corpus=corpus_tfidf,
            num_topics=num_topics,
            id2word=self.dictionary,
            passes=10,
            workers=3,
        )

    def classify(self, text):
        """
        Return a vector of topic probabilities for the given text.
        :param text:
        :return:
        """
        self.logger.info("Classify the given text.")
        new_doc_bow = self.dictionary.doc2bow(text)
        return self.ldamodel.get_document_topics(new_doc_bow)

    def export_html(self):
        self.logger.info("Export LDA to html file.")
        lda_display = pyLDAvis.gensim.prepare(self.ldamodel,
                                              self.bow_corpus,
                                              self.dictionary,
                                              sort_topics=True)
        pyLDAvis.save_html(lda_display, self.storage_path + "/index.html")

    def visualize(self):
        """
        Visualize the LDA model using pyLDAvis.
        :return:
        """
        lda_display = pyLDAvis.gensim.prepare(self.ldamodel,
                                              self.bow_corpus,
                                              self.dictionary,
                                              sort_topics=True)
        pyLDAvis.show(lda_display)
    def run(self, args):

        # mlflow logs
        experiment_name = "dev-LessonsClustering"
        if args.environment == "production":
            experiment_name = "LessonsClustering"
        elif args.environment == "staging":
            experiment_name = "staging-LessonsClustering"
        mlflow.set_experiment(experiment_name)
        client = mlflow.tracking.MlflowClient()

        with mlflow.start_run():
            log_param("environment", args.environment)
            log_param("mode", args.mode)
            log_param("update_related_lessons", args.update_related_lessons)

            # Get lessons data from database

            df = ef.getLessons(self.credentials)

            # Pre Processing
            lessonsData = df[df['isLesson'] == True]
            # keep only rows whose summary is not NaN (NaN != NaN)
            lessonsData = lessonsData[lessonsData['summary'] ==
                                      lessonsData['summary']]
            raw_paragraphs = lessonsData['paragraph']
            urls = lessonsData['urlToFile']
            raw_sentences = raw_paragraphs
            ids = lessonsData['_id']

            sentences = [line.split(' ') for line in raw_sentences]
            stop_words = stopwords.words('english')
            stop_words.extend(
                ['from', 'subject', 're', 'edu', 'use', 'äô', 'äù', 'äì'])
            words_to_remove = ['iii', 'project']

            def remove_stopwords(texts):
                return [[
                    word for word in simple_preprocess(str(doc))
                    if word not in stop_words
                ] for doc in texts]

            def remove_words(texts):
                return [[
                    word for word in simple_preprocess(str(doc))
                    if word not in words_to_remove
                ] for doc in texts]

            def remove_word_length_2(texts):
                allSentences = []
                for doc in texts:
                    newWords = []
                    for word in doc:
                        if len(word) > 2:
                            newWords.append(word)
                    allSentences.append(newWords)
                return allSentences

            def replace_adb_special_characters(texts):
                return [[
                    word.replace('’s',
                                 "'s ").replace('O’Smach', "0").replace(
                                     'äù', "").replace('äô',
                                                       "").replace('äì', "")
                    for word in doc
                ] for doc in texts]

            def get_wordnet_pos(word):
                tag = nltk.pos_tag([word])[0][1][0].upper()
                tag_dict = {
                    "J": wordnet.ADJ,
                    "N": wordnet.NOUN,
                    "V": wordnet.VERB,
                    "R": wordnet.ADV
                }

                return tag_dict.get(tag, wordnet.NOUN)

            sentences = replace_adb_special_characters(sentences)
            data_words_nostops = remove_stopwords(sentences)
            lemmatizer = WordNetLemmatizer()
            lemmatized_output = []
            for paragraph in data_words_nostops:
                lemmatized_output.append([
                    lemmatizer.lemmatize(word, get_wordnet_pos(word))
                    for word in paragraph
                ])
            sentences = remove_words(lemmatized_output)
            sentences_no_length_2 = remove_word_length_2(sentences)
            sentences = sentences_no_length_2

            id2word = corpora.Dictionary(sentences)
            texts = sentences
            corpus = [id2word.doc2bow(text) for text in texts]

            def compute_coherence_values(corpus, dictionary, k, a, b):
                lda_model = LdaMulticore(corpus=corpus,
                                         id2word=id2word,
                                         num_topics=k,
                                         random_state=100,
                                         chunksize=100,
                                         passes=10,
                                         alpha=a,
                                         eta=b,
                                         per_word_topics=True)
                coherence_model_lda = CoherenceModel(model=lda_model,
                                                     texts=sentences,
                                                     dictionary=id2word,
                                                     coherence='c_v')
                return coherence_model_lda.get_coherence()

    # Fine Tuning

            if args.mode == "fine_tuning":
                grid = {}
                grid['Validation_Set'] = {}

                # Topics range
                min_topics = 2
                max_topics = args.max_number_of_topics
                step_size = 1
                topics_range = range(min_topics, max_topics + 1, step_size)

                # Alpha parameter
                alpha = list(np.arange(0.01, 1, 0.3))
                # alpha.append('symmetric')
                # alpha.append('asymmetric')

                # Beta parameter
                beta = list(np.arange(0.01, 1, 0.3))
                # beta.append('symmetric')

                # Validation sets
                # num_of_docs = len(corpus)
                corpus_sets = [
                    # ClippedCorpus(corpus, int(num_of_docs*0.25)),
                    # ClippedCorpus(corpus, int(num_of_docs*0.5)),
                    # ClippedCorpus(corpus, int(num_of_docs*0.75)),
                    corpus
                ]
                # corpus_title = [
                #                 '25% Corpus'
                #                 '50% Corpus',
                #                 '75% Corpus'
                #                 '100% Corpus'
                # ]
                model_results = {
                    # 'Validation_Set': [],
                    'Number Of Topics': [],
                    'Alpha': [],
                    'Beta': [],
                    'Coherence': []
                }
                model_results_2 = {
                    'Number Of Topics': [],
                    'Average Coherence': []
                }
                maxCoherence = 0
                maxCoherenceK = 2
                maxCoherenceA = 0.01
                maxCoherenceB = 0.01
                for i in range(len(corpus_sets)):
                    for k in topics_range:
                        k_coherences = []
                        for a in alpha:
                            for b in beta:
                                cv = compute_coherence_values(
                                    corpus=corpus_sets[i],
                                    dictionary=id2word,
                                    k=k,
                                    a=a,
                                    b=b)
                                if cv > maxCoherence:
                                    maxCoherence = cv
                                    maxCoherenceK = k
                                    maxCoherenceA = a
                                    maxCoherenceB = b
                                # model_results['Validation_Set'].append(corpus_title[i])
                                model_results['Number Of Topics'].append(k)
                                model_results['Alpha'].append(a)
                                model_results['Beta'].append(b)
                                model_results['Coherence'].append(cv)
                                customStep = int(
                                    str(k) +
                                    "{:.2f}".format(a).replace(".", "") +
                                    "{:.2f}".format(b).replace(".", ""))
                                log_metric("coherence", cv, step=customStep)

                        avg_cv = sum(k_coherences) / len(k_coherences)
                        model_results_2['Number Of Topics'].append(k)
                        model_results_2['Average Coherence'].append(avg_cv)
                        log_metric("average_coherence", avg_cv, step=k)
                log_metric("max_coherence", maxCoherence)
                log_metric("number_of_topics_of_max_coherence", maxCoherenceK)
                log_metric("alpha_of_max_coherence", maxCoherenceA)
                log_metric("beta_of_max_coherence", maxCoherenceB)
                pd.DataFrame(model_results).to_csv(defaults.DATA_PATH +
                                                   "fine-tuning.csv",
                                                   index=False)
                pd.DataFrame(model_results_2).to_csv(defaults.DATA_PATH +
                                                     "fine-tuning-2.csv",
                                                     index=False)
                log_artifact(defaults.DATA_PATH + "fine-tuning.csv", "data/")
                log_artifact(defaults.DATA_PATH + "fine-tuning-2.csv", "data/")

    # Train LDA model
            elif args.mode == "train":
                log_metric("number_of_topics", args.number_of_topics)
                log_metric("alpha", args.alpha)
                log_metric("beta", args.beta)
                lda_model = LdaMulticore(corpus=corpus,
                                         id2word=id2word,
                                         num_topics=args.number_of_topics,
                                         random_state=200,
                                         chunksize=100,
                                         passes=10,
                                         alpha=args.alpha,
                                         eta=args.beta,
                                         per_word_topics=True)
                cv = compute_coherence_values(corpus=corpus,
                                              dictionary=id2word,
                                              k=args.number_of_topics,
                                              a=args.alpha,
                                              b=args.beta)
                log_metric("coherence", cv)
                lda_model.save(defaults.MODEL_PATH + "lda.model")
                log_artifact(defaults.MODEL_PATH + "lda.model", "models/")

    # Predict LDA model
            elif args.mode == "predict":
                log_param("run_id_model", args.run_id_model)
                number_of_topics = int(args.number_of_topics)
                if not args.run_id_model == "":
                    data = client.get_run(args.run_id_model).data
                    number_of_topics = int(data.params['number_of_topics'])
                    alpha = float(data.params['alpha'])
                    beta = float(data.params['beta'])
                    log_metric("number_of_topics", number_of_topics)
                    log_metric("alpha", alpha)
                    log_metric("beta", beta)
                    cv = compute_coherence_values(corpus=corpus,
                                                  dictionary=id2word,
                                                  k=number_of_topics,
                                                  a=alpha,
                                                  b=beta)
                    log_metric("coherence", cv)

    # Download and load the LDA model
                modelFilePath = defaults.MODEL_PATH + "lda.model"
                af.downloadLDAModel(args, modelFilePath)
                lda_model = LdaModel.load(modelFilePath)
                # lda_model.save(defaults.MODEL_PATH + "lda.model")
                # log_artifact(defaults.MODEL_PATH + "lda.model", "models/")

                # Keyword weights

                x = lda_model.show_topics(num_topics=number_of_topics,
                                          num_words=50,
                                          formatted=False)
                keywordWeights = []
                topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]
                for tp in x:
                    words = []
                    weights = []
                    for pair in tp[1]:
                        words.append(pair[0])
                        weights.append(int(pair[1] * 10000))
                    keywordWeights.append(weights)

    # Top topics per paragraph
                topicNumbers = []
                for c in range(len(corpus)):
                    maxProbability = 0
                    indexOfMax = 0
                    topTopics = []
                    topTopicProbabilities = []
                    for topicNumber in lda_model.get_document_topics(
                            corpus[c]):
                        topTopics.append(topicNumber[0])
                        topTopicProbabilities.append(topicNumber[1])
                    topTopicsSorted = [
                        x for _, x in sorted(zip(topTopicProbabilities,
                                                 topTopics),
                                             reverse=True)
                    ]
                    topicNumbers.append(topTopicsSorted)
                lessonsData['newTopTopics'] = topicNumbers
                lessonsData['topTopics'] = topicNumbers

                # Most probable topic per paragraph
                topTopics = []
                for index, row in lessonsData.iterrows():
                    if (row['topTopics']):
                        topTopics.append(row['topTopics'][0])
                    else:
                        topTopics.append(-1)
                lessonsData['topic'] = topTopics

                # Frequencies of topic keywords and number of PCRs per topic
                topics = pd.DataFrame()
                topicKeywords = []
                allKeywords = []
                topicIds = []
                for topic, words in topics_words:
                    allKeywords.append(words)
                    topicIds.append(topic)
                topics['key'] = topicIds
                topics['keywords'] = allKeywords
                topics['oldFrequencies'] = [[0] * len(keywords)
                                            for keywords in allKeywords]
                topics['numberOfLessons'] = 0
                topics['PCRs'] = [[] for i in range(len(topics))]
                topics['numberOfPCRs'] = 0

                for sentenceTopicNumbers, sentenceURL in zip(
                        topicNumbers, urls):
                    for topicNumber in sentenceTopicNumbers:
                        topics.at[topicNumber, 'numberOfLessons'] = topics.at[
                            topicNumber, 'numberOfLessons'] + 1
                        topics.at[topicNumber, 'PCRs'].append(sentenceURL)
                for index, row in topics.iterrows():
                    topics.at[index, 'numberOfPCRs'] = len(
                        set(topics.at[index, 'PCRs']))
                topics = topics.drop(columns=['PCRs'])

                # Frequencies of words per sentence per topic
                topics['oldFrequencies'] = [[0] * len(keywords)
                                            for keywords in allKeywords]
                for index, row in topics.iterrows():
                    topicNumber = topics.at[index, 'key']
                    topicKeywords = topics.at[index, 'keywords']
                    topicKeywordsFrequencies = topics.at[index,
                                                         'oldFrequencies']
                    for sentence, sentenceTopicNumbers in zip(
                            sentences, topicNumbers):
                        for sentenceTopicNumber in sentenceTopicNumbers:
                            if topicNumber == sentenceTopicNumber:
                                for word in sentence:
                                    if word in topicKeywords:
                                        indexOfWord = topicKeywords.index(word)
                                        topicKeywordsFrequencies[
                                            indexOfWord] = topicKeywordsFrequencies[
                                                indexOfWord] + 1
                    topics.at[index,
                              'oldFrequencies'] = topicKeywordsFrequencies
                topics['frequencies'] = keywordWeights

                # Top word per topic
                topicTopWords = []
                for index, row in topics.iterrows():
                    topicTopWords.append(row['keywords'][0])
                topics['topWord'] = topicTopWords

                # Adjacent topics
                # pyLDAvis.enable_notebook()
                vis = pyLDAvis.gensim.prepare(lda_model,
                                              corpus,
                                              dictionary=lda_model.id2word)
                topics['x'] = 1.0
                topics['y'] = 1.0
                for topic, x in zip(list(vis.topic_coordinates.index),
                                    list(vis.topic_coordinates.x)):
                    topics.at[topic, 'x'] = float(x)
                for topic, y in zip(list(vis.topic_coordinates.index),
                                    list(vis.topic_coordinates.y)):
                    topics.at[topic, 'y'] = float(y)

                import math

                def calculateDistance(x1, y1, x2, y2):
                    dist = math.sqrt((x2 - x1)**2 + (y2 - y1)**2)
                    return dist

                distanceMatrix = []
                allDistances = []
                c1 = 0
                topicsX = topics['x'].tolist()
                topicsY = topics['y'].tolist()
                for tx1, ty1 in zip(topicsX, topicsY):
                    distances = []
                    for tx2, ty2 in zip(topicsX, topicsY):
                        distance = calculateDistance(tx1, ty1, tx2, ty2)
                        if not distance:
                            # zero distance means the topic compared with itself;
                            # use a large sentinel so it never counts as adjacent
                            distance = 999
                        else:
                            allDistances.append(distance)
                        distances.append(distance)
                    distanceMatrix.append(distances)
                    c1 = c1 + 1

                percentile20 = np.percentile(allDistances, 20)
                numberOfAdjacent = 0
                numberOfNodes = len(distanceMatrix)
                allAdjacentTopics = []
                for distances in distanceMatrix:
                    adjacentTopics = []
                    for index, distance in zip(range(len(distances)),
                                               distances):
                        if distance <= percentile20:
                            adjacentTopics.append(index)
                    allAdjacentTopics.append(adjacentTopics)
                    numberOfAdjacent = numberOfAdjacent + len(adjacentTopics)
                numberOfAdjacent = numberOfAdjacent / 2
                pairs = []
                for index, adjacentTopicList in zip(
                        range(len(allAdjacentTopics)), allAdjacentTopics):
                    for adjacentTopic in adjacentTopicList:
                        pairs.append(sorted([index, adjacentTopic]))
                pairs.sort()
                dedupedPairs = list(pairs
                                    for pairs, _ in itertools.groupby(pairs))
                topWordPairs = []
                for pair in dedupedPairs:
                    topWordPairs.append(
                        [topicTopWords[pair[0]], topicTopWords[pair[1]]])
                topics['adjacentTopics'] = allAdjacentTopics

                # Save topics data
                ef.deleteIndex(self.credentials, "topics")
                ef.saveTopics(self.credentials, topics)

                # Lesson strength
                maxLessonStrength = topics['numberOfPCRs'].sum()
                lessonStrengths = []
                for index, row in lessonsData.iterrows():
                    topicNumbers = row['topTopics']
                    lessonStrength = 0
                    for topicNumber in topicNumbers:
                        lessonStrength = lessonStrength + topics.at[
                            topicNumber, 'numberOfPCRs']
                    lessonStrengths.append(lessonStrength / maxLessonStrength)
                lessonsData['lessonStrength'] = lessonStrengths

                # Save lessons data
                ef.updateSentences(self.credentials, lessonsData)
                mf.backupIndex(self.credentials, "sentences")
                mf.backupIndex(self.credentials, "topics")

    # Update related lessons

    # Get TFIDF model
        if args.update_related_lessons == "True":
            tfidf = TfidfModel(corpus, smartirs='ntc')
            tfidf_corpus = []
            for doc in corpus:
                tfidf_corpus.append(tfidf[doc])
            tfidf_mat = matutils.corpus2dense(tfidf_corpus,
                                              num_terms=len(id2word.token2id))
            tfidf_mat_transpose = tfidf_mat.transpose()
            tfidfDF = pd.DataFrame(
                data=tfidf_mat_transpose[0:, 0:],
                index=[i for i in range(tfidf_mat_transpose.shape[0])],
                columns=[
                    '' + str(i) for i in range(tfidf_mat_transpose.shape[1])
                ])
            tfidfDF['id'] = ids.tolist()

            # Save related lessons
            cf.updateRelatedLessons(self.credentials, tfidfDF)
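A minimal usage sketch for the Lda class above, assuming Logger and Config.lda_storage_path are configured as in the snippet; the sample token lists are invented for illustration.

# Hypothetical driver for the Lda class; the documents are made up.
texts = [
    ["topic", "model", "lda", "gensim"],
    ["restaurant", "review", "food", "service"],
    ["cluster", "lesson", "project", "report"],
]

lda = Lda()
lda.train_lda(texts, num_topics=2)  # builds dictionary, BOW corpus, TF-IDF corpus and model
lda.show_topics()                   # print the top 5 words per topic
lda.persist_lda()                   # save under Config.lda_storage_path
print(lda.classify(["food", "service", "review"]))  # topic-probability vector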
Example #5
class AmazonReviewDataset:
    def __init__(self,
                 dirpath,
                 word2vector_path,
                 max_vocab_size=10000,
                 num_topics=50,
                 num_topic_iterations=2000,
                 num_topic_passes=10,
                 reproc=False):
        self.dirpath = dirpath
        self.word2vector_path = word2vector_path
        self.max_vocab_size = max_vocab_size
        self.num_topics = num_topics
        self.num_topic_iterations = num_topic_iterations
        self.num_topic_passes = num_topic_passes
        self.reproc = reproc
        self.domains = os.listdir(self.dirpath)
        if not os.path.exists("./preproc_data"):
            os.makedirs("./preproc_data")
            print("Initialize the Pre-processed data")
        elif self.reproc:
            # os.rmdir only removes empty directories; shutil.rmtree (requires
            # `import shutil`) is needed to rebuild the cache from scratch
            shutil.rmtree("./preproc_data")
            os.makedirs("./preproc_data")
            print("Re-construct Pre-processed data")
        else:
            print("Re-use History Pre-processed data")

    "domain2data"

    ##################################################
    def load_domain2data(self):
        def file_parser(domain, split):
            def line_parser(line):
                features, review = line.split(' ')[:-1], []
                for feature in features:
                    ngram, count = feature.split(':')
                    for _ in range(int(count)):
                        review.append(ngram)
                return review

            file_path = os.path.join(self.dirpath, domain,
                                     '{}.review'.format(split))
            with open(file_path, "r") as f:
                reviews = [line_parser(line) for line in f]
            return reviews

        if os.path.exists("./preproc_data/domain2data.pkl"):
            with open("./preproc_data/domain2data.pkl", "rb") as filer:
                self.domain2data = pickle.load(filer)
        else:
            self.domain2data = {
                domain: {
                    "labeled": [],
                    "label": [],
                    "unlabeled": None
                }
                for domain in self.domains
            }
            for domain in self.domains:
                for split in ['positive', 'negative', 'unlabeled']:
                    reviews = file_parser(domain, split)
                    if split == 'unlabeled':
                        self.domain2data[domain]['unlabeled'] = reviews
                    else:
                        self.domain2data[domain]['labeled'] += reviews
                        self.domain2data[domain]['label'] += [
                            1 if split == "positive" else 0
                        ] * len(reviews)
                self.domain2data[domain]["label"] = np.array(
                    self.domain2data[domain]["label"])
            with open("./preproc_data/domain2data.pkl", "wb") as filew:
                pickle.dump(self.domain2data, filew)
        print("Load domain2data has done.")

    ##################################################
    def load_global_vocab(self):
        if os.path.exists("./preproc_data/vocab.txt"):
            self.word2id = {}
            with open("./preproc_data/vocab.txt", 'r') as f:
                for i, line in enumerate(f):
                    if i >= self.max_vocab_size: break
                    word, idx = line.split('\t')
                    self.word2id[word] = int(idx.strip())
            self.vocab_size = len(self.word2id)
            self.id2word = {
                index: word
                for word, index in self.word2id.items()
            }
        else:
            texts = []
            if not hasattr(self, "domain2data"):
                self.load_domain2data()
            for domain in self.domain2data:
                texts.extend(self.domain2data[domain]["labeled"])
                texts.extend(self.domain2data[domain]["unlabeled"])
            word_counts = Counter(itertools.chain(*texts))
            most_common = word_counts.most_common(n=self.max_vocab_size)
            self.word2id = {
                word: index
                for index, (word, _) in enumerate(most_common)
            }
            self.id2word = {
                index: word
                for word, index in self.word2id.items()
            }
            with open("./preproc_data/vocab.txt", 'w') as f:
                for word, index in sorted(self.word2id.items(),
                                          key=lambda d: d[1]):
                    f.write('%s\t%d\n' % (word, index))
            self.vocab_size = len(self.word2id)
        print("Load vocab has done.")

    "word2vector"

    ##################################################
    def load_word2vector(self):
        if os.path.exists("./preproc_data/word2vector.pkl"):
            with open("./preproc_data/word2vector.pkl", "rb") as filer:
                self.word2vector = pickle.load(filer)
        else:
            self.word2vector = {}
            if not hasattr(self, "word2id"):
                self.load_global_vocab()
            with open(self.word2vector_path, 'r') as f:
                for i, line in enumerate(f):
                    if i == 0: continue
                    word = line.split(' ')[0]
                    if word not in self.word2id: continue
                    line = ' '.join(line.split(' ')[1:]).strip()
                    vector = np.fromstring(line, dtype=float, sep=' ')
                    self.word2vector[word] = vector
            with open("./preproc_data/word2vector.pkl", "wb") as filer:
                pickle.dump(self.word2vector, filer)
        print("Load word2vector has done.")

    "topic_model"

    ##################################################
    def load_topic_model(self):
        if not hasattr(self, "word2id"):
            self.load_global_vocab()
        self.vectorizer = CountVectorizer(vocabulary=self.word2id,
                                          tokenizer=lambda x: x,
                                          preprocessor=lambda x: x)
        file_path = "./preproc_data/topic_model.pkl"
        if os.path.exists(file_path):
            self.topic_model = LdaModel.load(file_path)
        else:
            texts = []
            if not hasattr(self, "domain2data"):
                self.load_domain2data()
            for domain in self.domain2data:
                texts.extend(self.domain2data[domain]["labeled"])
                texts.extend(self.domain2data[domain]["unlabeled"])
            corpus = self.vectorizer.fit_transform(texts)
            corpus = Sparse2Corpus(corpus, documents_columns=False)
            self.topic_model = LdaMulticore(
                corpus=corpus,
                num_topics=self.num_topics,
                id2word=self.id2word,
                iterations=self.num_topic_iterations,
                passes=self.num_topic_passes)
            self.topic_model.save(file_path)

    "tfidf"

    ##################################################
    def load_domain2tfidf(self):
        if os.path.exists("./preproc_data/domain2tfidf.pkl"):
            with open("./preproc_data/domain2tfidf.pkl", "rb") as filer:
                self.domain2tfidf = pickle.load(filer)
        else:
            if not hasattr(self, "domain2data"):
                self.load_domain2data()
            if not hasattr(self, "word2id"):
                self.load_global_vocab()
            self.domain2tfidf = {
                domain: {
                    "labeled": [],
                    "label": [],
                    "unlabeled": []
                }
                for domain in self.domains
            }
            for domain in self.domain2data:
                vectorizer = TfidfVectorizer(vocabulary=self.word2id,
                                             tokenizer=lambda x: x,
                                             preprocessor=lambda x: x)
                vectorizer.fit(self.domain2data[domain]["labeled"] +
                               self.domain2data[domain]["unlabeled"])
                for key in self.domain2tfidf[domain]:
                    self.domain2tfidf[domain][key] = self.domain2data[domain][key] if key == "label" \
                    else vectorizer.transform(self.domain2data[domain][key])
            with open("./preproc_data/domain2tfidf.pkl", "wb") as filew:
                pickle.dump(self.domain2tfidf, filew)
        print("Load domain2tfidf has done.")

    "texts"

    ##################################################
    def get_texts(self, domains, unlabeled=True):
        texts = []
        if not hasattr(self, "domain2data"):
            self.load_domain2data()
        for domain in domains:
            texts.extend(self.domain2data[domain]["labeled"])
            if unlabeled:
                texts.extend(self.domain2data[domain]["unlabeled"])
        return texts

    "distribution"

    ##################################################
    def get_texts_term_distribution(self, texts):
        if not hasattr(self, "word2id"):
            self.load_global_vocab()
        term_distribution = np.zeros(len(self.word2id))
        for text in texts:
            for word in text:
                if word in self.word2id:
                    term_distribution[self.word2id[word]] += 1
        term_distribution /= np.sum(term_distribution)
        if np.isnan(np.sum(term_distribution)):
            term_distribution = np.zeros(self.vocab_size)
        return term_distribution

    "topic"

    ##################################################
    def get_texts_topic_distribution(self, texts):
        if not hasattr(self, "vectorizer"):
            self.load_topic_model()
        vectorized_corpus = self.vectorizer.transform(texts)
        gensim_corpus = Sparse2Corpus(vectorized_corpus,
                                      documents_columns=False)
        topic_representations = []
        for doc in gensim_corpus:
            topic_representations.append([
                topic_prob
                for (_, topic_prob) in self.topic_model.get_document_topics(
                    doc, minimum_probability=0.)
            ])
        return np.array(topic_representations)

    "word2vec"

    ##################################################
    def get_texts_word2vec_distribution(self, texts):
        if not hasattr(self, "word2vector"):
            self.load_word2vector()
        word_embeds, t = [], 10e-5
        texts_term_distribution_weights = self.get_texts_term_distribution(
            texts)
        for text in texts:
            word_count, doc_vector = 0, np.zeros(
                len(list(self.word2vector.values())[0]))
            for word in text:
                if word not in self.word2vector: continue
                doc_vector += np.sqrt(t / (texts_term_distribution_weights[
                    self.word2id[word]])) * self.word2vector[word]
                word_count += 1
            doc_vector = doc_vector if word_count == 0 else doc_vector / word_count
            word_embeds.append(doc_vector)
        return np.array(word_embeds)

    "model feature"

    ##################################################
    def get_model_feature(self, domains):
        if not hasattr(self, "domain2tfidf"):
            self.load_domain2tfidf()
        X, Y, D = [], [], []
        for domain in domains:
            X.extend(self.domain2tfidf[domain]["labeled"])
            Y.extend(self.domain2tfidf[domain]["label"])
            D.extend([domain] * self.domain2tfidf[domain]["labeled"].shape[0])
        X = scipy.sparse.vstack(X).toarray()
        Y = np.asarray(Y)
        return X, Y, D

    "metric feature"

    ###################################################
    def get_metric_feature(self, target_domain, metric_dict):
        metric_names = [(metric_type, metric_name)
                        for metric_type in metric_dict
                        for metric_name in metric_dict[metric_type]]
        feature = []
        if "term" in metric_dict:
            term_feature = self.get_term_feature(target_domain, metric_names)
            feature.append(term_feature)
        if "topic" in metric_dict:
            topic_feature = self.get_topic_feature(target_domain, metric_names)
            feature.append(topic_feature)
        if "word2vec" in metric_dict:
            word2vec_feature = self.get_word2vec_feature(
                target_domain, metric_names)
            feature.append(word2vec_feature)
        if "diversity" in metric_dict:
            diversity_feature = self.get_diversity_feature(
                target_domain, metric_names)
            feature.append(diversity_feature)
        feature = np.concatenate(feature, axis=1)
        return feature

    def get_term_feature(self, target_domain, metric_names):
        filepath = "./preproc_data/term_feature_{}.pkl".format(target_domain)
        if os.path.exists(filepath):
            with open(filepath, "rb") as filer:
                term_feature = pickle.load(filer)
        else:
            source_texts = self.get_texts(
                [domain for domain in self.domains if domain != target_domain],
                unlabeled=False)
            target_texts = self.get_texts([target_domain], unlabeled=False)
            texts_distribution = [
                self.get_texts_term_distribution([text])
                for text in source_texts
            ]
            domain_distribution = self.get_texts_term_distribution(
                target_texts)
            rvalues = []
            for text_distribution in texts_distribution:
                values = []
                for metric_name in metric_names:
                    metric_type, metric_func = metric_name
                    if metric_type != "term": continue
                    if metric_func in [
                            'jensen_shannon', 'renyi', 'cosine', 'euclidean',
                            'variational', 'bhattacharyya'
                    ]:
                        values.append(
                            getattr(Metric, metric_func)(text_distribution,
                                                         domain_distribution))
                rvalues.append(values)
            term_feature = np.asarray(rvalues)
            with open(filepath, "wb") as filew:
                pickle.dump(term_feature, filew)
        return term_feature

    def get_topic_feature(self, target_domain, metric_names):
        filepath = "./preproc_data/topic_feature_{}.pkl".format(target_domain)
        if os.path.exists(filepath):
            with open(filepath, "rb") as filer:
                topic_feature = pickle.load(filer)
        else:
            source_texts = self.get_texts(
                [domain for domain in self.domains if domain != target_domain],
                unlabeled=False)
            target_texts = self.get_texts([target_domain], unlabeled=False)
            texts_distribution = self.get_texts_topic_distribution(
                source_texts)
            domain_distribution = np.mean(
                self.get_texts_topic_distribution(target_texts), axis=0)
            rvalues = []
            for text_distribution in texts_distribution:
                values = []
                for metric_name in metric_names:
                    metric_type, metric_func = metric_name
                    if metric_type != "topic": continue
                    if metric_func in [
                            'jensen_shannon', 'renyi', 'cosine', 'euclidean',
                            'variational', 'bhattacharyya'
                    ]:
                        values.append(
                            getattr(Metric, metric_func)(text_distribution,
                                                         domain_distribution))
                rvalues.append(values)
            topic_feature = np.asarray(rvalues)
            with open(filepath, "wb") as filew:
                pickle.dump(topic_feature, filew)
        return topic_feature

    def get_word2vec_feature(self, target_domain, metric_names):
        filepath = "./preproc_data/word2vec_feature_{}.pkl".format(
            target_domain)
        if os.path.exists(filepath):
            with open(filepath, "rb") as filer:
                word2vec_feature = pickle.load(filer)
        else:
            source_texts = self.get_texts(
                [domain for domain in self.domains if domain != target_domain],
                unlabeled=False)
            target_texts = self.get_texts([target_domain], unlabeled=False)
            texts_distribution = self.get_texts_word2vec_distribution(
                source_texts)
            domain_distribution = np.mean(
                self.get_texts_word2vec_distribution(target_texts), axis=0)
            rvalues = []
            for text_distribution in texts_distribution:
                values = []
                for metric_name in metric_names:
                    metric_type, metric_func = metric_name
                    if metric_type != "word2vec": continue
                    if metric_func in ['cosine', 'euclidean', 'variational']:
                        values.append(
                            getattr(Metric, metric_func)(text_distribution,
                                                         domain_distribution))
                rvalues.append(values)
            word2vec_feature = np.asarray(rvalues)
            with open(filepath, "wb") as filew:
                pickle.dump(word2vec_feature, filew)
        return word2vec_feature

    def get_diversity_feature(self, target_domain, metric_names):
        filepath = "./preproc_data/diversity_feature_{}.pkl".format(
            target_domain)
        if os.path.exists(filepath):
            with open(filepath, "rb") as filer:
                diversity_feature = pickle.load(filer)
        else:
            if not hasattr(self, "word2vector"):
                self.load_word2vector()
            source_texts = self.get_texts(
                [domain for domain in self.domains if domain != target_domain],
                unlabeled=False)
            term_distribution = self.get_texts_term_distribution(source_texts)
            rvalues = []
            for source_text in source_texts:
                p_words, p_word_vector_pairs = [], []
                for word in set(source_text):
                    if word in self.word2id:
                        p_words.append(term_distribution[self.word2id[word]])
                        if word in self.word2vector:
                            p_word_vector_pairs.append(
                                (term_distribution[self.word2id[word]],
                                 self.word2vector[word]))
                    else:
                        p_words.append(0.0)
                values = []
                for metric_name in metric_names:
                    metric_type, metric_func = metric_name
                    if metric_type != "diversity": continue
                    if metric_func in ['num_word_types', 'type_token_ratio']:
                        values.append(
                            getattr(Metric, metric_func)(source_text))
                    elif metric_func in [
                            'entropy', 'simpsons_index', 'renyi_entropy'
                    ]:
                        values.append(getattr(Metric, metric_func)(p_words))
                    elif metric_func in ['quadratic_entropy']:
                        values.append(
                            getattr(Metric, metric_func)(p_word_vector_pairs))
                    else:
                        raise AttributeError()
                rvalues.append(values)
            diversity_feature = np.asarray(rvalues)
            with open(filepath, "wb") as filew:
                pickle.dump(diversity_feature, filew)
        return diversity_feature
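A hedged usage sketch for AmazonReviewDataset: the data directory, the embedding file path, the "books" target domain and the metric selection below are assumptions for illustration; only methods defined above are called.

# Hypothetical driver; paths and metric choices are illustrative only.
dataset = AmazonReviewDataset(dirpath="./amazon_reviews",
                              word2vector_path="./glove.6B.300d.txt",
                              max_vocab_size=10000,
                              num_topics=50)
dataset.load_domain2data()
dataset.load_global_vocab()

# Similarity/diversity features of each source review w.r.t. the target domain.
metric_dict = {
    "term": ["jensen_shannon", "cosine"],
    "topic": ["jensen_shannon"],
    "diversity": ["entropy", "type_token_ratio"],
}
features = dataset.get_metric_feature("books", metric_dict)

# TF-IDF design matrix, labels and domain tags for the source domains.
X, Y, D = dataset.get_model_feature(
    [d for d in dataset.domains if d != "books"])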
Example #6
class LDAMWBase:
    def __init__(self,
                 mtype='multiple',
                 resource=None,
                 lda_work_folder=None,
                 lda_model_filename=None,
                 lda_dict_filename=None,
                 lda_topic_word_count=0,
                 lda_topics_count=0,
                 resource_language=None,
                 data_type=None):

        #
        # TODO: German lemmatizer / stemmer
        #

        self.p_stemmer = PorterStemmer()
        self.wn_lemmatizer = WordNetLemmatizer()

        if resource is not None:
            # resource_lang defaults to 'en'
            resource_lang = 'en'

            # hope that resource is correct and exists
            if data_type == 'db':
                resource_lang = Resources.select(Resources.lang).where(
                    Resources.resource == resource).get()
                resource_lang = resource_lang.__data__['lang'].lower()

            elif data_type == 'csv':
                if resource_language is None:
                    raise Exception(
                        "Resource language must be defined for csv data type.")
                else:
                    resource_lang = resource_language
            else:
                pass

            self.stop_words = get_stop_words(resource_lang)

        self.resource_identifier_name = resource

        def _create_model_deps(model_name,
                               twordscount,
                               tcount,
                               mini=False,
                               mini_path=None):

            if not mini:
                mp = DEFAULT_PROJECT_PATH + 'topics_extractor/lda_data' + '/' + model_name
            else:
                mp = DEFAULT_PROJECT_PATH + 'topics_extractor/lda_data' + '/' + mini_path

            mn = 'lda_model' + '_' + model_name
            md = 'dictionary' + '_' + model_name
            ltwordscount = twordscount
            ltcount = tcount

            _short_model_report = "{}{}: {} \n{}{}: {}\n{}{}: {}\n{}{}: {}\n{}{}: {}\n{}".format(
                INFO_FLAG, colored("Model path", 'red', None,
                                   ['bold']), mp, INFO_FLAG,
                colored("Model name", 'red', None, ['bold']), mn, INFO_FLAG,
                colored("Model dictionary", 'red', None,
                        ['bold']), md, INFO_FLAG,
                colored("Topic words count", 'red', None,
                        ['bold']), ltwordscount, INFO_FLAG,
                colored("Topics count", 'red', None, ['bold']), ltcount,
                "-" * 88)
            if model_name != 'mini':
                print(_short_model_report)

            return mp, mn, md, ltwordscount, ltcount

        if mtype == 'multiple':
            if resource is not None:
                mpath, mname, mdict, lda_topic_word_count, lda_topics_count = _create_model_deps(
                    self.resource_identifier_name, LDA_TOPIC_WORD_COUNT,
                    LDA_TOPICS_COUNT)
            else:
                raise Exception(
                    "{}Resource must be defined. Exiting... \n".format(
                        EXCEPTION_FLAG))

        elif mtype == 'single_ltc':
            mpath, mname, mdict, lda_topic_word_count, lda_topics_count = _create_model_deps(
                "mini",
                MINI_LDA_TOPIC_WORD_COUNT,
                MINI_LDA_TOPICS_COUNT,
                mini=True,
                mini_path=self.resource_identifier_name + "/mini")

        if lda_work_folder is None:
            self.lda_work_folder = mpath
        else:
            self.lda_work_folder = lda_work_folder

        if not os.path.exists(self.lda_work_folder):
            os.mkdir(self.lda_work_folder)

        if lda_model_filename is None:
            self.lda_model_filename = os.path.join(self.lda_work_folder, mname)
        else:
            self.lda_model_filename = os.path.join(self.lda_work_folder,
                                                   lda_model_filename)

        if lda_dict_filename is None:
            self.lda_dict_filename = os.path.join(self.lda_work_folder, mdict)
        else:
            self.lda_dict_filename = os.path.join(self.lda_work_folder,
                                                  lda_dict_filename)

        self.lda_topics_count = lda_topics_count
        self.lda_topic_word_count = lda_topic_word_count

        self.dictionary = None
        self.lda_model = None
        self.lda_topics = []

    @staticmethod
    def load_csv_data(csv_file):
        df = pd.read_csv(csv_file)
        train_documents = df['content'].values

        return train_documents

    @staticmethod
    def load_single_ltc(ltc_data):
        train_documents = re.split(
            r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', ltc_data)

        return train_documents

    @staticmethod
    def load_db_data(resource=None):
        # if resource is None:
        #     art_content_stream = Articles.select()
        # else:
        art_content_stream = Articles.select().where(
            Articles.resource == resource)

        train_documents = (acs.content for acs in art_content_stream
                           if acs.content is not None)

        return train_documents

    def save_model(self,
                   as_name=None,
                   save_on_disk=True,
                   save_topics_into_db=False):
        if save_on_disk:
            if as_name is None:
                as_name = self.lda_model_filename
            self.lda_model.save(as_name)
            print(" \t-> Model was saved as [ {} ]".format(as_name))

        if save_topics_into_db:
            truncate_topics_tables(resource=self.resource_identifier_name)

            print(" \t-> Topics will be saved in database for [ {} ]".format(
                self.resource_identifier_name))

            model_numbers_topics = self._get_topics()

            try:
                for topic_info in model_numbers_topics:
                    tnum = topic_info[0]
                    tresourceid = topic_info[1]
                    tname = topic_info[2]

                    _topic = {
                        'ident_number': tnum,
                        'value': tname,
                        'created_at': dt.datetime.today().date()
                    }

                    t = Topics.create(**_topic)

                    t_id = t.__data__['topic']

                    _topic_resource = {
                        'resource': tresourceid,
                        'topic': t_id,
                        'created_at': dt.datetime.today().date()
                    }

                    tr = TopicsResources.create(**_topic_resource)

                print("{}[ {} ]".format(SUCCESS_FLAG,
                                        self.resource_identifier_name))
            except Exception as e:
                print("{}{}".format(EXCEPTION_FLAG, e))
                print("{}Failure: [ {} ]".format(
                    ERROR_FLAG, self.resource_identifier_name))

    def train_model(self,
                    data_type,
                    resource,
                    single_ltc_data=None,
                    data_file_path=None,
                    train_corpus=None,
                    train_dictionary=None,
                    save_model_as=None,
                    chunksize=LDA_CHUNKSIZE,
                    passes=LDA_PASSES):

        if train_corpus is not None:
            corpus = train_corpus

        elif data_type == 'db':
            corpus = self._make_corpus(data_type=data_type, resource=resource)

        elif data_type == 'single_ltc' and single_ltc_data is not None:
            corpus = self._make_corpus(data_type=data_type,
                                       ltc=single_ltc_data,
                                       resource=resource)

        elif data_type == 'csv' and data_file_path is not None:
            corpus = self._make_corpus(data_type=data_type,
                                       data_file_path=data_file_path,
                                       resource=resource)

        else:
            raise Exception("{}Corpus is None".format(EXCEPTION_FLAG))

        if train_dictionary is not None:
            dictionary = train_dictionary
        else:
            dictionary = self.dictionary
        """
			id2word parameter need to get words in topics instead of their indexes in dict
		"""
        _tcount = self.lda_topics_count

        # self.lda_model = LdaModel(corpus=corpus, num_topics=_tcount, id2word=dictionary, passes=passes, chunksize=chunksize)
        self.lda_model = LdaMulticore(corpus=corpus,
                                      num_topics=_tcount,
                                      id2word=dictionary,
                                      passes=passes,
                                      chunksize=chunksize)

        if save_model_as is not None and not single_ltc_data:
            self.save_model(save_model_as,
                            save_on_disk=True,
                            save_topics_into_db=False)

        elif single_ltc_data:
            self.save_model(self.lda_model_filename,
                            save_on_disk=True,
                            save_topics_into_db=False)
        elif data_type == 'csv':
            self.save_model(self.lda_model_filename,
                            save_on_disk=True,
                            save_topics_into_db=False)

        else:
            self.save_model(self.lda_model_filename,
                            save_on_disk=True,
                            save_topics_into_db=True)

        print("{}Trained".format(SUCCESS_FLAG))

    def load_model(self, model_file_path=None, dict_file_path=None):
        """
			load model and dictionary from file (need to save them in train function)
			uses to update model on another corpus
		"""

        if model_file_path is not None and os.path.exists(model_file_path):
            self.lda_model = LdaMulticore.load(model_file_path)
            # self.lda_model = LdaModel.load(model_file_path)
            self.dictionary = Dictionary.load(dict_file_path)
            print(" \t-> Loaded: [ {} ]".format(model_file_path))

        elif model_file_path is None and os.path.exists(
                self.lda_model_filename):
            self.lda_model = LdaMulticore.load(self.lda_model_filename)
            # self.lda_model = LdaModel.load(self.lda_model_filename)
            self.dictionary = Dictionary.load(self.lda_dict_filename)
            print(" \t-> Loaded: [ {} ]".format(self.lda_model_filename))

        else:
            print(
                "{}The given file path does not exist."
                "\n     Provide another one and retry."
                "\n     Exiting...".format(ERROR_FLAG))
            exit()

        for i in range(self.lda_model.num_topics):
            terms_id = self.lda_model.get_topic_terms(
                i, self.lda_topic_word_count)

            terms = [self.dictionary.get(x[0]) for x in terms_id]

            self.lda_topics.append(' '.join(terms))
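
    # Structure note (inferred from the loop above): get_topic_terms() returns
    # [(word_id, probability), ...] for a single topic, so self.lda_topics ends
    # up holding one space-joined string of top words per topic, e.g.
    # ['pizza service staff', 'room bed clean', ...] (words invented).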

    def update_model(self,
                     ondata_file_path=None,
                     resource=None,
                     data_type='db'):
        if ondata_file_path is not None and data_type == 'csv':
            corpus = self._make_corpus(data_file_path=ondata_file_path,
                                       data_type=data_type,
                                       resource=resource)
        elif data_type == 'db':
            corpus = self._make_corpus(data_file_path=None,
                                       data_type=data_type,
                                       resource=resource)
        else:
            raise Exception("{}Corpus is None".format(EXCEPTION_FLAG))

        self.lda_model.update(corpus)
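
    # Sketch of an incremental update (an assumption about intended use; the
    # resource name is invented): gensim's LdaMulticore.update() runs extra
    # training passes over the new bag-of-words corpus.
    #
    #   lda.load_model()                      # restore model + dictionary first
    #   lda.update_model(data_type='db', resource='reviews')
    #
    # Caveat visible in the code: _make_corpus() rebuilds self.dictionary, so
    # the new corpus ids may not line up with the loaded model's id2word map.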

    def process_record(self, text, data_type):
        """
			data_type - db / csv / single_ltc
		"""

        # Reload the model for every single_ltc record, or lazily when no
        # model has been loaded yet; both cases fall back to the same loader.
        if data_type == 'single_ltc' or self.lda_model is None:
            try:
                self.load_model()
            except Exception as e:
                print("{}{}".format(EXCEPTION_FLAG, e))

        if data_type == 'db':
            if self.lda_model is None:
                return dict()

            doc = self._prepare_single_document(text)

            if doc is not None:
                topics = self._get_document_topics(doc)

                top_topic = topics[0]

                return [('topic', self.lda_topics[top_topic])]

            return [('topic', "")]

        elif data_type == 'csv':
            doc = self._prepare_single_document(text)
            topics_in_count_by_ids = self._get_document_topics(doc)
            current_doc_topic_id, current_doc_other_topics = topics_in_count_by_ids[
                0], topics_in_count_by_ids[1:]

            result_topic_word_descr = re.sub(
                '[^A-Za-z]+', ' ', self._get_topic_by_id(current_doc_topic_id))

            return [('topic', result_topic_word_descr),
                    ('other_topics', current_doc_other_topics)]

        elif data_type == 'single_ltc':
            doc = self._prepare_single_document(text)
            topics_in_count_by_ids = self._get_document_topics(doc)
            if topics_in_count_by_ids is not None:
                current_doc_topic_id, current_doc_other_topics = topics_in_count_by_ids[
                    0], topics_in_count_by_ids[1:]

                result_topic_word_descr = re.sub(
                    '[^A-Za-z]+', ' ',
                    self._get_topic_by_id(current_doc_topic_id))

                return result_topic_word_descr, current_doc_other_topics
            else:
                return "", []

    def _get_metric_fields(self):
        if self.lda_model is None:
            return []

        else:
            return ['topic']

    def _get_document_topics(self, doc, count=5):
        if doc is not None:
            bow = self.dictionary.doc2bow(doc)
            topics = self.lda_model.get_document_topics(
                bow, minimum_probability=0.0)
            topics_in_count = list(
                ident_number for (ident_number, prob) in sorted(
                    topics, key=itemgetter(1), reverse=True)[:count])

            return topics_in_count
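
    # Standalone sketch of the gensim call used above (illustrative; the toy
    # documents are invented): get_document_topics(minimum_probability=0.0)
    # yields one (topic_id, probability) pair per topic, which is then sorted
    # by probability and truncated to the top `count` ids.
    #
    #   from gensim.corpora import Dictionary
    #   from gensim.models import LdaModel
    #   docs = [['cat', 'dog'], ['dog', 'fish'], ['cat', 'fish']]
    #   dct = Dictionary(docs)
    #   lda = LdaModel([dct.doc2bow(d) for d in docs], num_topics=2, id2word=dct)
    #   bow = dct.doc2bow(['cat', 'dog'])
    #   lda.get_document_topics(bow, minimum_probability=0.0)
    #   # -> e.g. [(0, 0.74...), (1, 0.25...)]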

    def _get_document_topic(self, doc_topics):
        topic_id_probs = {}

        for t_prob in doc_topics:
            topic_id_probs[t_prob[0]] = t_prob[1]

        doc_topic_id = sorted(topic_id_probs,
                              key=topic_id_probs.get,
                              reverse=True)[0]
        doc_topic_prob = topic_id_probs[doc_topic_id]

        return [doc_topic_id, doc_topic_prob]

    def _prepare_single_document(self, sd):
        # Missing values (e.g. NaN read from a CSV) arrive as floats, not strings.
        if sd is None or isinstance(sd, float):
            return None

        try:
            sd = sd.lower()
            sd = nltk.tokenize.word_tokenize(sd)
            sd = (word for word in sd if word.isalpha() and len(word) > 2)
            stopped_sd = (word for word in sd if word not in self.stop_words)

            lemmatized_doc = [
                self.wn_lemmatizer.lemmatize(word) for word in stopped_sd
            ]

            return lemmatized_doc

        except AttributeError as e:
            print("{}{}".format(EXCEPTION_FLAG, e))
            return None

    def _make_bow(self, text):
        if text is not None:
            d = self._prepare_single_document(text)

            return self.dictionary.doc2bow(d)

    def _make_corpus(self,
                     data_type,
                     resource,
                     data_file_path=None,
                     save_train_dict=True,
                     save_dict_as=None,
                     ltc=None):
        """
			data type can be csv or db # or new - single_ltc
		"""
        if data_type == 'db':
            documents = self.load_db_data(resource=resource)

        elif data_type == 'csv' and data_file_path is not None:
            documents = self.load_csv_data(data_file_path)

        elif data_type == 'single_ltc' and ltc is not None:

            ltc_text = " ".join(e if type(e) is str else "" for e in ltc)
            documents = self.load_single_ltc(ltc_text)

        else:
            documents = None

            print("{}documents is None. Exiting ... \n".format(ERROR_FLAG))
            exit()

        with Pool() as pool:
            # Materialise the results into a list: they are iterated twice
            # below (once for the dictionary, once for the corpus), which an
            # imap generator would not allow.
            processed_docs = [
                doc for doc in pool.imap(self._prepare_single_document,
                                         documents) if doc is not None
            ]

        self.dictionary = Dictionary(processed_docs)

        if save_dict_as is not None:
            self.dictionary.save(save_dict_as)
        elif save_train_dict:
            self.dictionary.save(self.lda_dict_filename)

        corpus = [
            self.dictionary.doc2bow(proc_doc) for proc_doc in processed_docs
        ]

        return corpus
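
    # Shape note: the returned corpus is one bag-of-words per document, i.e. a
    # list of (token_id, token_count) tuples such as
    # [[(0, 2), (3, 1)], [(1, 1)], ...], which is what LdaMulticore consumes in
    # train_model() above.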

    def _get_topic_by_id(self, topic_id):
        if self.lda_topic_word_count is not None:
            return self.lda_model.print_topic(topic_id,
                                              self.lda_topic_word_count)

        else:
            return self.lda_model.print_topic(topic_id, 6)

    def _get_topics(self, default_view=False, for_db=True):
        """
			2-tuples (probability * word) of most probable words in topics
			num_topics=-1 <--- to print all topics
		"""
        def _get_words(probabilities_words_string):
            _pre_topic_with_digits_trash = " ".join(
                re.findall(ALL_CHARS, probabilities_words_string))
            probaply_clean_topic = re.sub(r'\b\d+(?:\.\d+)?\s+', "",
                                          _pre_topic_with_digits_trash)

            return probaply_clean_topic  # " ".join(re.findall('[a-zA-Z]+', probabilities_words_string))

        if default_view:
            return self.lda_model.print_topics(num_topics=-1)

        if for_db:
            resource_id = Resources.select().where(
                Resources.resource == self.resource_identifier_name).first()
            resource_id = resource_id.__data__['resource']

            return [(elem[0], resource_id, _get_words(elem[1]))
                    for elem in self.lda_model.print_topics(
                        num_topics=self.lda_topics_count,
                        num_words=self.lda_topic_word_count)]

        return [(elem[0], _get_words(elem[1]))
                for elem in self.lda_model.print_topics(
                    num_topics=self.lda_topics_count,
                    num_words=self.lda_topic_word_count)]
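
    # Format sketch (hedged; example words invented and ALL_CHARS is defined
    # elsewhere in this example): print_topics() yields pairs such as
    # (0, '0.043*"pizza" + 0.031*"service" + ...'), so _get_words() should
    # reduce the string to roughly 'pizza service ...', and the for_db variant
    # prepends the resource id looked up from the Resources table.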