Example No. 1
    def main(self):
        print('Loading data')
        data = pd.read_csv('../../resources/abcnews-date-text.csv', error_bad_lines=False)  # on_bad_lines='skip' replaces this flag in pandas >= 1.3
        data_text = data[['headline_text']]
        data_text['index'] = data_text.index
        documents = data_text

        np.random.seed(2018)

        print('Preprocessing text')
        preprocessed_docs = documents['headline_text'].map(self.preprocess)

        print('Building bag of words corpus')
        dictionary = Dictionary(preprocessed_docs)   # list: token_id, token
        dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
        bow_corpus = [dictionary.doc2bow(doc) for doc in preprocessed_docs]     # list: token_id, token_count

        print(documents[documents['index'] == 4310].values[0][0])
        print(bow_corpus[4310])
        print(bow_corpus[:100])

        print('Building lda model from bag of words')
        lda_model_bow = LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, workers=self.workers)
        for idx, topic in lda_model_bow.print_topics(-1):
            print('Topic: {} \nWords: {}'.format(idx, topic))

        for index, score in sorted(lda_model_bow[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
            print("\nScore: {}\t \nTopic: {}".format(score, lda_model_bow.print_topic(index, 10)))

        print('Building tfidf corpus from bag of words corpus')
        tfidf = TfidfModel(bow_corpus)
        tfidf_corpus = tfidf[bow_corpus]
        from pprint import pprint
        for doc in tfidf_corpus:
            pprint(doc)
            break

        print('Building lda model from tfidf')
        lda_model_tfidf = LdaMulticore(tfidf_corpus, num_topics=10, id2word=dictionary, workers=self.workers)
        for idx, topic in lda_model_tfidf.print_topics(-1):
            print('Topic: {} \nWords: {}'.format(idx, topic))

        for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
            print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))

        print('Testing on unseen document')
        unseen_document = 'Facebook’s global lobbying against data privacy laws'
        bow_vector = dictionary.doc2bow(self.preprocess(unseen_document))

        print('Bow:')
        for index, score in sorted(lda_model_bow[bow_vector], key=lambda tup: -1*tup[1]):
            print("Score: {}\t Topic: {}".format(score, lda_model_bow.print_topic(index, 5)))

        print('TfIdf:')
        for index, score in sorted(lda_model_tfidf[bow_vector], key=lambda tup: -1*tup[1]):
            print("Score: {}\t Topic: {}".format(score, lda_model_tfidf.print_topic(index, 5)))
Example No. 2
def train(file=DATA_FILE, type=JSON):
    delete_previous_models()

    faq_df = get_dataframe(os.path.join(DATA_DIR, file), type=type)
    faq_df = clean_data(faq_df)
    faq_df[PROCESSED_QUESTION] = faq_df[CLEAN_QUESTION].apply(preprocess)
    faq_df[PROCESSED_ANSWER] = faq_df[CLEAN_ANSWER].apply(preprocess)
    print('Preprocessing Done')
    if DEBUG:
        print(faq_df.head())

    for mode in modes:
        model = modes[mode]
        dictionary = corpora.Dictionary(faq_df[model.column])
        dictionary.save(os.path.join(MODEL_DIR, model.dictionary))
        corpus = faq_df[model.column].map(dictionary.doc2bow)
        if DEBUG:
            print(f'{model.corpus} generated')
            print(corpus.head())
        corpora.MmCorpus.serialize(os.path.join(MODEL_DIR, model.corpus),
                                   corpus)
        tfidf_model = TfidfModel(corpus)
        if DEBUG:
            print(f'{model.tfidf} generated')
        tfidf_model.save(os.path.join(MODEL_DIR, model.tfidf))
        tfidf = tfidf_model[corpus]
        lda_model = LdaMulticore(corpus=tfidf,
                                 id2word=dictionary,
                                 num_topics=30)
        lda_model.save(os.path.join(MODEL_DIR, model.model))
        if DEBUG:
            print(f'{model.model} generated')
            print(lda_model.print_topics(5))
    print('Training completed')
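Once train() has run, the saved per-mode artifacts can be reused to score an incoming question. A hedged sketch of that query path, assuming the same MODEL_DIR, modes mapping and preprocess helper used by the training code above (the function name is illustrative):

import os
from gensim import corpora
from gensim.models import TfidfModel, LdaMulticore

def topics_for_question(question, mode):
    model = modes[mode]
    dictionary = corpora.Dictionary.load(os.path.join(MODEL_DIR, model.dictionary))
    tfidf_model = TfidfModel.load(os.path.join(MODEL_DIR, model.tfidf))
    lda_model = LdaMulticore.load(os.path.join(MODEL_DIR, model.model))

    # Same preprocessing as training, then bow -> tf-idf -> topic distribution.
    bow = dictionary.doc2bow(preprocess(question))
    return sorted(lda_model[tfidf_model[bow]], key=lambda t: -t[1])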
Example No. 3
    def load_model(self, phrase):
        processed_phrase = self.preprocessing(phrase)

        self.all_phrases.append(processed_phrase)
        # print(self.all_phrases)
        # dct = Dictionary(common_texts)
        dct = Dictionary(self.all_phrases)
        corpus = [dct.doc2bow(line) for line in self.all_phrases]
        lda_model = LdaMulticore(corpus=corpus,
                                 id2word=dct,
                                 random_state=100,
                                 num_topics=3,
                                 passes=10,
                                 chunksize=1000,
                                 batch=False,
                                 alpha="asymmetric",
                                 decay=0.5,
                                 offset=64,
                                 eta=None,
                                 eval_every=0,
                                 iterations=100,
                                 gamma_threshold=0.001,
                                 per_word_topics=True)

        topic_keywords = []
        topics = lda_model.print_topics(-1)

        for topic in topics[:3]:
            topics_str = topic[1]
            pattern = r"[^a-zA-Z+]"
            topics_list = re.sub(pattern, "", topics_str).split("+")
            topic_keywords += topics_list[:5]

        return topic_keywords
def updateLDA():
    api_file = "./newsapi.key"
    categories = ['business', 'entertainment', 'general', 'health', 'science', 'sports', 'technology']

    with open(api_file, "r") as apikey:
        newsapi = NewsApiClient(api_key=apikey.read().strip())

    headlines = {cat: newsapi.get_top_headlines(category=cat, language='en', country='in') for cat in categories}
    pp_docs = []

    for category in headlines:
        for article in headlines[category]['articles']:
            #print(lemma_pp(article['title']))
            pp_docs.append(lemma_pp(article['title']))

    if os.path.exists(MODEL_DIR + "corpus_dict.model"):
        corp_d = Dictionary.load(MODEL_DIR + "corpus_dict.model")
        corp_d.add_documents(pp_docs)
    else:
        corp_d = Dictionary(pp_docs)
        corp_d.filter_extremes(no_below=2, no_above=0.5)

    dtm = [corp_d.doc2bow(doc) for doc in pp_docs]

    tfidf = TfidfModel(dtm)
    corp_tfidf = tfidf[dtm]

    lda = LdaMulticore(corp_tfidf, num_topics=5, id2word=corp_d, passes=60, workers=3)
    print(lda.print_topics(num_topics=5, num_words=5))
    checkdir(MODEL_DIR)
    corp_d.save(MODEL_DIR + "corpus_dict.model")
    #corp_tfidf.save(MODEL_DIR + "corpus_tfidf.model")
    lda.save(MODEL_DIR + "lda.model")
Example No. 5
def extract_topics_from_text(text_tokenized):
    try:
        dictionary = Dictionary(text_tokenized)
        bow_corpus = [dictionary.doc2bow(doc) for doc in text_tokenized]
        lda_model = LdaMulticore(bow_corpus,
                                 num_topics=1,
                                 random_state=42,
                                 id2word=dictionary,
                                 workers=2)
        topics, topics_list = pretty_print_topics(
            lda_model.print_topics(num_topics=1, num_words=5))
    except ValueError:
        return 'No topics extracted'
    return topics
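This function (and Example No. 7 below) depends on a pretty_print_topics helper that is not shown. Judging by how its return values are used (each entry of topics_list is later split on ', '), it plausibly strips the weights from print_topics output; a purely illustrative version:

import re

def pretty_print_topics(raw_topics):
    # raw_topics is the list of (topic_id, '0.042*"word" + ...') pairs from print_topics().
    topics_list = []
    for _, topic_str in raw_topics:
        words = re.findall(r'"([^"]+)"', topic_str)  # keep only the quoted words
        topics_list.append(', '.join(words))
    topics = '; '.join(topics_list)
    return topics, topics_list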
def LDA(corpus, dictionary):
    lda_model = LdaMulticore(corpus=corpus,
                             id2word=dictionary,
                             random_state=100,
                             num_topics=250,
                             passes=10,
                             chunksize=1000,
                             batch=False,
                             alpha='asymmetric',
                             decay=0.5,
                             offset=64,
                             eta=None,
                             eval_every=0,
                             iterations=100,
                             gamma_threshold=0.001,
                             per_word_topics=True)
    return lda_model.print_topics(5, num_words=10)
Example No. 7
def create_topics(context_groups, num_topics):
    topics = {}
    topics_dist = defaultdict(lambda: {})
    word_counts = defaultdict(lambda: 0)
    for key, citation in context_groups.items():
        try:
            citations = citation.values()
            if type(citations) == str:
                citations = [citations]
            dictionary = Dictionary(citations)
            bow_corpus = [dictionary.doc2bow(doc) for doc in citations]
            lda_model = LdaMulticore(bow_corpus, num_topics=num_topics, id2word=dictionary, passes=2, workers=2)
            topics[key], topics_list = pretty_print_topics(lda_model.print_topics(num_topics=num_topics, num_words=5))
            topics_d = []
            probs = []
            topics_counts = defaultdict(lambda: [])
            for topic in topics_list:
                topic_words = topic.split(', ')
                for word in topic_words:
                    word_counts[word] += 1
            s = 0
            for i in range(len(bow_corpus)):
                pretty_output, probs_, topics_counts = print_topics_by_ids(lda_model[bow_corpus[i]], topics_list,
                                                                           list(citation.keys())[i], topics_counts)
                s += len(pretty_output)
                topics_d.extend(pretty_output)
                probs.extend(probs_)
            topics_counts_ = []
            for key_, value_ in topics_counts.items():
                temp_dict = {}
                temp_dict['topic'] = key_
                temp_dict['number'] = str(len(value_))
                temp_dict['probability_average'] = str(round(np.average(value_), 3))
                temp_dict['probability_std'] = str(round(np.std(value_), 3))
                topics_counts_.append(temp_dict)
            topics_dist[key]['topics'] = sorted(topics_counts_, key=lambda k: int(k['number']), reverse=True)
            topics_dist[key]['contexts'] = sorted(topics_d, key=lambda k: float(k['probability']), reverse=True)
            #         visdata = pyLDAvis.gensim.prepare(lda_model, bow_corpus, dictionary)
        #         pyLDAvis.save_html(visdata, path.join('..', 'data', 'new_vis', '{}_vis.html'.format(key)))
        except ValueError:
            continue
    return dict(topics_dist)
Example No. 8
def main():
    """
    Main file for Discere
    """

    if len(sys.argv) < 2 or not isinstance(sys.argv[1], str):
        print("You need to provide a path for a pdf file")
        return

    path = sys.argv[1]
    raw_pdf = parse_pdf(path)
    segments = segment(raw_pdf)
    processed_docs = clean_segments(segments)
    dictionary = Dictionary(processed_docs)
    bow = [dictionary.doc2bow(doc) for doc in processed_docs]
    lda_model = LdaMulticore(bow,
                             num_topics=20,
                             id2word=dictionary,
                             passes=10,
                             workers=2)

    print(lda_model.print_topics())
def lda3(corpus,dictionary):
  lda_model = LdaMulticore(corpus=corpus,
                          id2word=dictionary,
                          random_state=22,
                          num_topics=100,
                          passes=10,
                          chunksize=1000,
                          batch=False,
                          alpha='asymmetric',
                          decay=0.5,
                          offset=64,
                          eta=None,
                          eval_every=0,
                          iterations=100,
                          gamma_threshold=0.001,
                          per_word_topics=True)
  # save the model
  lda_model.save('lda_model.model')
  # See the topics
  for topic in lda_model.print_topics(100,20):
      print(topic)
# Assumed reconstruction of the call opening; `bow_corpus` is used further below and the
# dictionary variable name is an assumption.
lda_model = LdaMulticore(corpus=bow_corpus,
                         id2word=dictionary,
                         num_topics=200,
                         passes=50,
                         chunksize=1000,
                         batch=False,
                         alpha='asymmetric',
                         decay=0.5,
                         offset=64,
                         eta=None,
                         eval_every=0,
                         iterations=100,
                         gamma_threshold=0.001,
                         per_word_topics=True)

corpus_lda = lda_model[bow_corpus]

topics = lda_model.print_topics(5, num_words=10)
for topic in topics:
    print(topic)

speech = ''
decade = []
for i in range(data.shape[0]):
    year = int(data['year'][i])
    speech = speech + " " + data['speech'][i]
    if (year + 1) % 10 == 0:
        decade.append(speech)
        speech = ''
decade.append(speech)

len(decade)
Example No. 11
        'parking_lot', 'trail_starts', 'mile_turn', 'north_south',
        'mountain_bike', 'mountain_biking', 'single_track',
        'mountain_bike_trail', 'trail_head'
    ])
    second_stopwords = my_stopwords.union(STOPWORDS).union(bitri_stops)

    # Gensim LDA
    st_featurizer = Featurizer(first_stopwords=first_stopwords,
                               second_stopwords=second_stopwords,
                               bigrams=True,
                               trigrams=True)
    processed_docs = st_featurizer.featurize(X)
    bow_corpus, id2word = make_gensim_bow(processed_docs,
                                          no_below=3,
                                          no_above=0.6,
                                          keep_n=10000)

    k = 6
    lda_model = LdaMulticore(bow_corpus,
                             num_topics=k,
                             id2word=id2word,
                             passes=5,
                             workers=2,
                             iterations=100)
    perplexity, coherence = get_perplexity_coherence(lda_model, bow_corpus,
                                                     processed_docs, id2word)
    print(
        f'LDA with {k} topics: Perplexity is {perplexity:0.2} and coherence is {coherence:0.2}.'
    )
    pprint(lda_model.print_topics())
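Example No. 11 calls two helpers, make_gensim_bow and get_perplexity_coherence, that are not shown. Sketches of what they plausibly do, with signatures inferred from the call sites above (gensim's Dictionary and CoherenceModel are the only real APIs assumed):

from gensim.corpora import Dictionary
from gensim.models import CoherenceModel

def make_gensim_bow(processed_docs, no_below=3, no_above=0.6, keep_n=10000):
    # Build a filtered Dictionary and the matching bag-of-words corpus.
    id2word = Dictionary(processed_docs)
    id2word.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
    bow_corpus = [id2word.doc2bow(doc) for doc in processed_docs]
    return bow_corpus, id2word

def get_perplexity_coherence(lda_model, bow_corpus, processed_docs, id2word):
    # Per-word likelihood bound plus c_v coherence for a fitted model.
    perplexity = lda_model.log_perplexity(bow_corpus)
    coherence = CoherenceModel(model=lda_model, texts=processed_docs,
                               dictionary=id2word, coherence='c_v').get_coherence()
    return perplexity, coherence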
        with open("temp_corpus.pickle", "rb") as f:
            comments_corpus, dictionary = np.array(pickle.load(f))

    print("created corpus")
    print('Number of unique tokens: %d' % len(dictionary))
    print('Number of documents: %d' % len(comments_corpus))

    num_topics = 150
    if args.load:
        model = LdaMulticore.load("topic_models/model_comments")
    else:
        model = LdaMulticore(comments_corpus, id2word=dictionary, num_topics=num_topics)
        print("model done")
        model.save("topic_models/model_comments")

    print(model.print_topics(20))

    top_topics = model.top_topics(comments_corpus) #, num_words=20)

    # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
    avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
    print('Average topic coherence: %.4f.' % avg_topic_coherence)

    #from pprint import pprint
    #pprint(top_topics)

    for _ in range(10):
        idx = np.random.randint(0, len(comments_text))

        print("comment: {} - topics: {}".format(comments_text[idx],
                [(model.show_topic(tid, topn=10), v) for tid, v
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

# Train LDA model.

# Set training parameters.
num_topics = 15
chunksize = 20000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token


# LdaMulticore does not support alpha='auto' (auto-tuned alpha is only available in the
# single-core LdaModel), so an asymmetric prior is used here instead.
model = LdaMulticore(corpus=corpus, id2word=id2word, chunksize=chunksize,
                     alpha='asymmetric', eta='auto',
                     iterations=iterations, num_topics=num_topics,
                     passes=passes, eval_every=eval_every, workers=3)

#model = LdaModel(corpus, num_topics=num_topics, id2word = id2word, passes=passes)
model.save('reddit_autism_pass20_topic15_iter400_lda.model')

#hdp = HdpModel(corpus, dictionary)
#hdp.save('reddit_autism_hdp.model')

print(model.print_topics(num_topics=num_topics, num_words=12))
print("Program Ended at: " + str(datetime.now()))
Example No. 14
class model:
    def __init__(self, data):
        print('model class instantiating')
        self.__data = data
        self.__modelfilename = 'topicmodel.pkl'

    def createbasemodel(self):
        print('Creating base model')
        #Topics	Alpha	Beta	Coherence
        #6	asymmetric	symmetric	0.723863804

        self.__model = LdaMulticore(corpus=self.__data.corpus_tfidf,
                                    id2word=self.__data.id2word,
                                    num_topics=6,
                                    alpha='asymmetric',
                                    eta='symmetric',
                                    workers=2,
                                    random_state=100,
                                    chunksize=100,
                                    passes=10,
                                    per_word_topics=True)
        print(self.__model.print_topics())
        print(self.__model[self.__data.gensim_bow])
        print('calculating coherence')
        __cohe_model = CoherenceModel(model=self.__model,
                                      texts=self.__data.processeddata,
                                      dictionary=self.__data.id2word,
                                      coherence='c_v')
        __cohe = __cohe_model.get_coherence()
        print('coherence :', __cohe)
        #print('hyper param tuning')
        #self.__hyperparamtunning()
        print('saving model')
        self.__savemodel()

    def __savemodel(self):
        with open(self.__modelfilename, 'wb') as file:
            pickle.dump(self.__model, file)

    def __getcoh(self, corpus, dictionary, k, a, b):

        __model = LdaMulticore(corpus=corpus,
                               id2word=dictionary,
                               num_topics=k,
                               alpha=a,
                               eta=b,
                               random_state=100,
                               chunksize=100,
                               passes=10,
                               per_word_topics=True)

        __cohe_model = CoherenceModel(model=__model,
                                      texts=self.__data.processeddata,
                                      dictionary=dictionary,
                                      coherence='c_v')

        return __cohe_model.get_coherence()

    def __hyperparamtunning(self):
        print('hyper param tuning')
        topics_range = list(np.arange(2, 10, 1))
        alpha_range = list(np.arange(0.01, 1, 0.3))
        alpha_range.extend(['symmetric', 'asymmetric'])
        beta_range = list(np.arange(0.01, 1, 0.3))
        beta_range.extend(['symmetric'])

        noofdocs = len(self.__data.processeddata)
        corpus = self.__data.corpus_tfidf
        print('no of docs : ', noofdocs)
        print('dividing corpus into 0.25, 0.5, 0.75 and 1.0 shares for testing')
        corpus_sets = [  #gensim.utils.ClippedCorpus(corpus,noofdocs* 0.25),
            #gensim.utils.ClippedCorpus(corpus,noofdocs* 0.5),
            #gensim.utils.ClippedCorpus(corpus,noofdocs* 0.75),
            corpus
        ]
        corpus_title = ['100% corpus']
        model_results = {
            'Validation_Set': [],
            'Topics': [],
            'Alpha': [],
            'Beta': [],
            'Coherence': []
        }
        if 1 == 1:
            pbar = tqdm.tqdm(total=len(corpus_sets) * len(topics_range) * len(alpha_range) * len(beta_range))
            for i in range(len(corpus_sets)):
                for k in topics_range:
                    for a in alpha_range:
                        for b in beta_range:
                            cv = self.__getcoh(corpus_sets[i],
                                               self.__data.id2word, k, a, b)
                            model_results['Validation_Set'].append(
                                corpus_title[i])
                            model_results['Topics'].append(k)
                            model_results['Alpha'].append(a)
                            model_results['Beta'].append(b)
                            model_results['Coherence'].append(cv)
                            pbar.update(1)

            results = pd.DataFrame(model_results)
            results.to_csv('lda_tuning_results.csv', index=False)
            print(results)
            pbar.close()
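After the grid search has written lda_tuning_results.csv, a single best configuration can be read back out; a small illustrative follow-up step:

import pandas as pd

# Pick the (Topics, Alpha, Beta) row with the highest coherence from the tuning results.
results = pd.read_csv('lda_tuning_results.csv')
best = results.loc[results['Coherence'].idxmax()]
print('Best configuration -> topics: {}, alpha: {}, beta: {}, coherence: {:.3f}'.format(
    best['Topics'], best['Alpha'], best['Beta'], best['Coherence']))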
Example No. 15
###############################################################################
###  LDA Code
###############################################################################

#LDA using bag of words
dictionary = corpora.Dictionary(processed_text)
corpus = [dictionary.doc2bow(doc) for doc in processed_text]

ldamodel = LdaMulticore(corpus,
                        num_topics=3,
                        id2word=dictionary,
                        passes=2,
                        workers=2)

for idx, topic in ldamodel.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

#LDA using TFIDF
tfidf = TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
ldamodel = LdaMulticore(corpus_tfidf,
                        num_topics=3,
                        id2word=dictionary,
                        passes=2,
                        workers=2)

for idx, topic in ldamodel.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

###############################################################################
Example No. 16
import numpy as np
from gensim import corpora
from gensim.models import LdaMulticore as LdaModel
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Train LDA model')
    parser.add_argument('tweet_file', help=('path to twitter downloader dump where each line is a cleaned tweet'))
    parser.add_argument('out_dir', help=('path output file to save the model'))
    parser.add_argument('--include_list', nargs='?', default=None)  # see preprocess for default list
    parser.add_argument('--exclude_list', nargs='?', default=None)  # see preprocess for default list

    parser.add_argument('num_topics', type=int)
    parser.add_argument('--npasses', type=int, default=50)
    parser.add_argument('--decay', type=float, default=.5)
    parser.add_argument('--chunksize', type=int, default=2000)

    lda_filename    = 'lda/middle_east_100.lda'
    args = parser.parse_args()
    corpus = corpora.MmCorpus('lda/lda_middle_east.mm')
    dictionary = corpora.Dictionary.load('lda/lda_middle_east.dict')

    # Note: the hyperparameters below are hard-coded; the parsed command-line arguments
    # (num_topics, npasses, decay, chunksize) are not applied to this call.
    lda = LdaModel(corpus, num_topics=100,
                   alpha=1. / 100, eta=.2, chunksize=10000,
                   workers=5, passes=100, decay=0.75,
                   id2word=dictionary)

    print('Saving model')
    lda.print_topics()
    lda.save(lda_filename)
    print("lda saved in %s " % lda_filename)
Example No. 17
class ModelTraining(NewsPipeline):
    def __init__(self):
        super().__init__()
        self.__modelfilename = 'BaseModel.pkl'
        print('ModelTraining instantiated')

    @mlflowtimed
    def _process(self, features):
        self.__config = PipelineConfig.getPipelineConfig(self)
        self.id2word, self.gensim_bow, self.tfidfmodel, self.processeddata = features[
            0], features[1], features[2], features[3]
        self.corpus_tfidf = self.tfidfmodel[self.gensim_bow]
        self.__createbasemodel()

        self._storeMLflowData()
        return features

    def __createbasemodel(self):
        print('Creating base model')
        #Topics	Alpha	Beta	Coherence
        #6	asymmetric	symmetric	0.723863804

        self.__model = LdaMulticore(corpus=self.corpus_tfidf,
                                    id2word=self.id2word,
                                    num_topics=6,
                                    alpha='asymmetric',
                                    eta='symmetric',
                                    workers=2,
                                    random_state=100,
                                    chunksize=100,
                                    passes=10,
                                    per_word_topics=True)
        if self.__config['Storemodel']:
            self.__savemodel()
        print(self.__model.print_topics())
        print(self.__model[self.gensim_bow])
        print('calculating coherence')
        #__cohe_model = CoherenceModel(model=self.__model,texts=self.processeddata,dictionary=self.id2word,coherence='c_v')
        __cohe_model = CoherenceModel(model=self.__model,
                                      corpus=self.corpus_tfidf,
                                      coherence='u_mass')
        __cohe = __cohe_model.get_coherence()
        print('coherence :', __cohe)

        self._addMLflowMetric('BaseModel.Coherence', __cohe)
        return self.__model

    def fit(self, x, y=None):
        print('ModelTraining.fit')
        return self

    def transform(self, x):
        print('ModelTraining.transform')
        return self._process(x)

    def __savemodel(self):
        print('storing topic model')
        try:
            today = datetime.today().strftime('%Y-%m-%d')
            topicmodelfile = os.path.join(DATA_PATH, today,
                                          self.__modelfilename)
            if os.path.isfile(topicmodelfile):
                os.remove(topicmodelfile)
            with open(topicmodelfile, 'wb') as plkfile:
                pickle.dump(self.__model, plkfile)

            return True
        except Exception as ex:
            print(ex)
            return False
Example No. 18
class ReviewLDA():
    def __init__(self):
        self.__tokenizer_type = None
        self.dataset = None
        self.context = []
        self.vocab_dict = None
        self.lda_model = None
        self.regex = re.compile('[^ 가-힣]+')

    def __check_dataset(self, dataset):
        if type(dataset) != pd.core.frame.DataFrame:
            raise TypeError('Check the data format: a pandas DataFrame is required')
        if '리뷰' not in dataset.columns:
            raise ValueError('The data has no "리뷰" (review) column.')

    def load_pkl(self, dataset_path):
        """Pickle 형식의 데이터를 읽습니다."""
        with open(dataset_path, 'rb') as f:
            dataset = pickle.load(f)
        self.__check_dataset(dataset)
        self.dataset = dataset['리뷰'].values

    def load_excel(self, dataset_path):
        dataset = pd.read_excel(dataset_path)
        self.__check_dataset(dataset)
        self.dataset = dataset['리뷰'].values

    def load_csv(self, dataset_path):
        dataset = pd.read_csv(dataset_path)
        self.__check_dataset(dataset)
        self.dataset = dataset['리뷰'].values

    def load_tokenizer(self, method, spm_path=None):
        """method(str) : spm, mecab
           spm_path(str) : method가 spm이면 spm 모델 경로 입력"""
        if method not in ['spm', 'mecab', 'okt']:
            raise ValueError('잘못된 method 입력됨')
        self.__tokenizer_type = method
        if method == 'spm':
            if not spm_path:
                raise ValueError('spm_path was not provided')
            self.tokenizer = spm.SentencePieceProcessor()
            self.tokenizer.load(spm_path)
        elif method == 'okt':
            self.tokenizer = Okt()
        else:
            self.tokenizer = Mecab()

    def __tokenize(self, text):
        text = self.regex.sub('', text)
        if not self.__tokenizer_type:
            raise ValueError('Load a tokenizer first with load_tokenizer')
        if self.__tokenizer_type == 'spm':
            return [
                repeat_normalize(token.replace("▁", ""))
                for token in self.tokenizer.EncodeAsPieces(text)
            ]
        elif self.__tokenizer_type == 'okt':
            tag_list = ['Noun', 'Verb', 'Adjective', 'Adverb']
            return [
                repeat_normalize(token)
                for token, pos in self.tokenizer.pos(text) if pos in tag_list
            ]
        else:
            return [
                repeat_normalize(token) for token in self.tokenizer.nouns(text)
            ]

    def kor_preprocess(self):
        for review in tqdm(self.dataset):
            self.context.append(self.__tokenize(review))

    def make_ngram(self, n, min_count=10):
        result = list()
        bigram = Phrases(self.context, min_count=min_count, threshold=10)
        if n == 2:
            for doc in tqdm(self.context):
                result.append(bigram[doc])
        elif n == 3:
            trigram = Phrases(bigram[self.context])
            for doc in tqdm(self.context):
                # Trigrams are built on top of the bigram-merged documents.
                result.append(trigram[bigram[doc]])
        else:
            raise ValueError('n-gram size must be 2 or 3')
        self.context = result

    def make_vocab(self):
        count_dict = Counter()
        for review in tqdm(self.context):
            count_dict.update(Counter(review))
        self.vocab_dict = dict(
            sorted(count_dict.items(), key=(lambda x: x[1]), reverse=True))

    def get_vocab(self, reverse=True):
        if reverse:
            return self.vocab_dict
        else:
            return dict(
                sorted(self.vocab_dict.items(),
                       key=(lambda x: x[1]),
                       reverse=False))

    def filter_vocab(self, min_count=0, max_count=None):
        """min_count (int) : 이 이상 등장한 단어만 사용
           max_count (int) : 이 이하 등장한 단어만 사용"""
        if not self.vocab_dict:
            raise ValueError("vocab을 먼저 생성해주십시요 (make_vocab method)")
        if not max_count:
            max_count = len(self.context)
        self.vocab_dict = dict([(k, v) for k, v in self.vocab_dict.items()
                                if v > min_count if v < max_count])

    def do_lda(self, num_topics, workers=8, iterations=400, passes=15):
        self.id2word = dict([(i, k)
                             for i, k in enumerate(self.vocab_dict.keys())])
        self.word2id = dict([(k, i)
                             for i, k in enumerate(self.vocab_dict.keys())])
        self.corpus = list()

        for review in tqdm(self.context):
            self.corpus.append(self.__get_doc2bow(review))

        print('Fitting Start.')
        self.lda_model = LdaMulticore(corpus=self.corpus,
                                      num_topics=num_topics,
                                      id2word=Dictionary().from_corpus(
                                          self.corpus, self.id2word),
                                      workers=workers,
                                      iterations=iterations,
                                      passes=passes)
        print('Model Fitted.')

    def print_lda(self):
        if not self.lda_model:
            raise ValueError('Train the model first (do_lda)')
        pprint.pprint(self.lda_model.print_topics())

    def __get_doc2bow(self, review):
        # Bag-of-words for a single review, restricted to the filtered vocabulary.
        counter = dict()
        for token in review:
            if token in self.word2id:
                token_id = self.word2id[token]
                counter[token_id] = counter.get(token_id, 0) + 1
        return list(counter.items())
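A short usage sketch of the ReviewLDA class above, assuming a CSV file with a '리뷰' (review) column and the Mecab tokenizer; the file name and parameter values are illustrative only:

review_lda = ReviewLDA()
review_lda.load_csv('reviews.csv')       # the file must contain a '리뷰' column
review_lda.load_tokenizer('mecab')       # or 'spm' (with spm_path), or 'okt'
review_lda.kor_preprocess()              # tokenize every review
review_lda.make_ngram(2, min_count=10)   # merge frequent bigrams
review_lda.make_vocab()
review_lda.filter_vocab(min_count=5)     # drop rare tokens
review_lda.do_lda(num_topics=10, workers=4)
review_lda.print_lda()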
Example No. 19
class LDAMWBase:
    def __init__(self,
                 mtype='multiple',
                 resource=None,
                 lda_work_folder=None,
                 lda_model_filename=None,
                 lda_dict_filename=None,
                 lda_topic_word_count=0,
                 lda_topics_count=0,
                 resource_language=None,
                 data_type=None):

        #
        # todo Deutsch Lemmatizer / Stemmer !!!
        #

        self.p_stemmer = PorterStemmer()
        self.wn_lemmatizer = WordNetLemmatizer()

        if resource is not None:
            # resource_lang == 'en' as default
            resource_lang = 'en'

            # hope that resource is correct and exists
            if data_type == 'db':
                resource_lang = Resources.select(Resources.lang).where(
                    Resources.resource == resource).get()
                resource_lang = resource_lang.__data__['lang'].lower()

            elif data_type == 'csv':
                if resource_language is None:
                    raise Exception(
                        "Resource language must be defined for csv data type.")
                else:
                    resource_lang = resource_language
            else:
                pass

            self.stop_words = get_stop_words(resource_lang)

        self.resource_identifier_name = resource

        def _create_model_deps(model_name,
                               twordscount,
                               tcount,
                               mini=False,
                               mini_path=None):

            if not mini:
                mp = DEFAULT_PROJECT_PATH + 'topics_extractor/lda_data' + '/' + model_name
            else:
                mp = DEFAULT_PROJECT_PATH + 'topics_extractor/lda_data' + '/' + mini_path

            mn = 'lda_model' + '_' + model_name
            md = 'dictionary' + '_' + model_name
            ltwordscount = twordscount
            ltcount = tcount

            _short_model_report = "{}{}: {} \n{}{}: {}\n{}{}: {}\n{}{}: {}\n{}{}: {}\n{}".format(
                INFO_FLAG, colored("Model path", 'red', None,
                                   ['bold']), mp, INFO_FLAG,
                colored("Model name", 'red', None, ['bold']), mn, INFO_FLAG,
                colored("Model dictionary", 'red', None,
                        ['bold']), md, INFO_FLAG,
                colored("Topic words count", 'red', None,
                        ['bold']), ltwordscount, INFO_FLAG,
                colored("Topics count", 'red', None, ['bold']), ltcount,
                "-" * 88)
            if model_name != 'mini':
                print(_short_model_report)

            return mp, mn, md, ltwordscount, ltcount

        if mtype == 'multiple':
            if resource is not None:
                mpath, mname, mdict, lda_topic_word_count, lda_topics_count = _create_model_deps(
                    self.resource_identifier_name, LDA_TOPIC_WORD_COUNT,
                    LDA_TOPICS_COUNT)
            else:
                raise Exception(
                    "{}Resource must be defined. Exiting... \n".format(
                        EXCEPTION_FLAG))

        elif mtype == 'single_ltc':
            mpath, mname, mdict, lda_topic_word_count, lda_topics_count = _create_model_deps(
                "mini",
                MINI_LDA_TOPIC_WORD_COUNT,
                MINI_LDA_TOPICS_COUNT,
                mini=True,
                mini_path=self.resource_identifier_name + "/mini")

        if lda_work_folder is None:
            self.lda_work_folder = mpath
        else:
            self.lda_work_folder = lda_work_folder

        if not os.path.exists(self.lda_work_folder):
            os.mkdir(self.lda_work_folder)

        if lda_model_filename is None:
            self.lda_model_filename = os.path.join(self.lda_work_folder, mname)
        else:
            self.lda_model_filename = os.path.join(self.lda_work_folder,
                                                   lda_model_filename)

        if lda_dict_filename is None:
            self.lda_dict_filename = os.path.join(self.lda_work_folder, mdict)
        else:
            self.lda_dict_filename = os.path.join(self.lda_work_folder,
                                                  lda_dict_filename)

        self.lda_topics_count = lda_topics_count
        self.lda_topic_word_count = lda_topic_word_count

        self.dictionary = None
        self.lda_model = None
        self.lda_topics = []

    @staticmethod
    def load_csv_data(csv_file):
        df = pd.read_csv(csv_file)
        train_documents = df['content'].values

        return train_documents

    @staticmethod
    def load_single_ltc(ltc_data):
        train_documents = re.split(
            r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', ltc_data)

        return train_documents

    @staticmethod
    def load_db_data(resource=None):
        # if resource is None:
        #     art_content_stream = Articles.select()
        # else:
        art_content_stream = Articles.select().where(
            Articles.resource == resource)

        train_documents = (acs.content for acs in art_content_stream
                           if acs.content is not None)

        return train_documents

    def save_model(self,
                   as_name=None,
                   save_on_disk=True,
                   save_topics_into_db=False):
        if save_on_disk:
            if as_name is not None:
                self.lda_model.save(as_name)
                print(" \t-> Model was saved as [ {} ]".format(as_name))
            else:
                self.save_model(self.lda_model_filename)

        if save_topics_into_db:
            truncate_topics_tables(resource=self.resource_identifier_name)

            print(" \t-> Topics will be saved in database for [ {} ]".format(
                self.resource_identifier_name))

            model_numbers_topics = self._get_topics()

            try:
                for topic_info in model_numbers_topics:
                    tnum = topic_info[0]
                    tresourceid = topic_info[1]
                    tname = topic_info[2]

                    _topic = {
                        'ident_number': tnum,
                        'value': tname,
                        'created_at': dt.datetime.today().date()
                    }

                    t = Topics.create(**_topic)

                    t_id = t.__data__['topic']

                    _topic_resource = {
                        'resource': tresourceid,
                        'topic': t_id,
                        'created_at': dt.datetime.today().date()
                    }

                    tr = TopicsResources.create(**_topic_resource)

                print("{}[ {} ]".format(SUCCESS_FLAG,
                                        self.resource_identifier_name))
            except Exception as e:
                print("{}{}".format(EXCEPTION_FLAG, e))
                print("{}Failure: [ {} ]".format(
                    ERROR_FLAG, self.resource_identifier_name))

    def train_model(self,
                    data_type,
                    resource,
                    single_ltc_data=None,
                    data_file_path=None,
                    train_corpus=None,
                    train_dictionary=None,
                    save_model_as=None,
                    chunksize=LDA_CHUNKSIZE,
                    passes=LDA_PASSES):

        if train_corpus is not None:
            corpus = train_corpus

        elif data_type == 'db':
            corpus = self._make_corpus(data_type=data_type, resource=resource)

        elif data_type == 'single_ltc' and single_ltc_data is not None:
            corpus = self._make_corpus(data_type=data_type,
                                       ltc=single_ltc_data,
                                       resource=resource)

        elif data_type == 'csv' and data_file_path is not None:
            corpus = self._make_corpus(data_type=data_type,
                                       data_file_path=data_file_path,
                                       resource=resource)

        else:
            raise Exception("{}Corpus is None".format(EXCEPTION_FLAG))

        if train_dictionary is not None:
            dictionary = train_dictionary
        else:
            dictionary = self.dictionary
        """
			id2word parameter need to get words in topics instead of their indexes in dict
		"""
        _tcount = self.lda_topics_count

        # self.lda_model = LdaModel(corpus=corpus, num_topics=_tcount, id2word=dictionary, passes=passes, chunksize=chunksize)
        self.lda_model = LdaMulticore(corpus=corpus,
                                      num_topics=_tcount,
                                      id2word=dictionary,
                                      passes=passes,
                                      chunksize=chunksize)

        if save_model_as is not None and not single_ltc_data:
            self.save_model(save_model_as,
                            save_on_disk=True,
                            save_topics_into_db=False)

        elif single_ltc_data:
            self.save_model(self.lda_model_filename,
                            save_on_disk=True,
                            save_topics_into_db=False)
        elif data_type == 'csv':
            self.save_model(self.lda_model_filename,
                            save_on_disk=True,
                            save_topics_into_db=False)

        else:
            self.save_model(self.lda_model_filename,
                            save_on_disk=True,
                            save_topics_into_db=True)

        print("{}Trained".format(SUCCESS_FLAG))

    def load_model(self, model_file_path=None, dict_file_path=None):
        """
			load model and dictionary from file (need to save them in train function)
			uses to update model on another corpus
		"""

        if model_file_path is not None and os.path.exists(model_file_path):
            self.lda_model = LdaMulticore.load(model_file_path)
            # self.lda_model = LdaModel.load(model_file_path)
            self.dictionary = Dictionary.load(dict_file_path)
            print(" \t-> Loaded: [ {} ]".format(model_file_path))

        elif model_file_path is None and os.path.exists(
                self.lda_model_filename):
            self.lda_model = LdaMulticore.load(self.lda_model_filename)
            # self.lda_model = LdaModel.load(self.lda_model_filename)
            self.dictionary = Dictionary.load(self.lda_dict_filename)
            print(" \t-> Loaded: [ {} ]".format(self.lda_model_filename))

        else:
            print(
                "{}Filepath you gave is incorrect. \n     Give another one and retry."
                "\n     Exiting...".format(ERROR_FLAG))
            exit()

        for i in range(self.lda_model.num_topics):
            terms_id = self.lda_model.get_topic_terms(
                i, self.lda_topic_word_count)

            terms = [self.dictionary.get(x[0]) for x in terms_id]

            self.lda_topics.append(' '.join(terms))

    def update_model(self,
                     ondata_file_path=None,
                     resource=None,
                     data_type='db'):
        if ondata_file_path is not None and data_type == 'csv':
            corpus = self._make_corpus(data_file_path=ondata_file_path,
                                       data_type=data_type,
                                       resource=resource)
        elif data_type == 'db':
            corpus = self._make_corpus(data_file_path=None,
                                       data_type=data_type,
                                       resource=resource)
        else:
            raise Exception("{}Corpus is None".format(EXCEPTION_FLAG))

        self.lda_model.update(corpus)

    def process_record(self, text, data_type):
        """
			data_type - db / csv / single_ltc
		"""

        if data_type == 'single_ltc':
            try:
                self.load_model()
            except Exception as e:
                print("{}{}".format(EXCEPTION_FLAG, e))
                pass

        elif self.lda_model is None:

            try:
                self.load_model()
            except Exception as e:
                print("{}{}".format(EXCEPTION_FLAG, e))
                pass

        if data_type == 'db':
            if self.lda_model is None:
                return dict()

            doc = self._prepare_single_document(text)

            if doc is not None:
                topics = self._get_document_topics(doc)

                top_topic = topics[0]

                return [('topic', self.lda_topics[top_topic])]

            return [('topic', "")]

        elif data_type == 'csv':
            doc = self._prepare_single_document(text)
            topics_in_count_by_ids = self._get_document_topics(doc)
            current_doc_topic_id, current_doc_other_topics = topics_in_count_by_ids[
                0], topics_in_count_by_ids[1:]

            result_topic_word_descr = re.sub(
                '[^A-Za-z]+', ' ', self._get_topic_by_id(current_doc_topic_id))

            return [('topic', result_topic_word_descr),
                    ('other_topics', current_doc_other_topics)]

        elif data_type == 'single_ltc':
            doc = self._prepare_single_document(text)
            topics_in_count_by_ids = self._get_document_topics(doc)
            if topics_in_count_by_ids is not None:
                current_doc_topic_id, current_doc_other_topics = topics_in_count_by_ids[
                    0], topics_in_count_by_ids[1:]

                result_topic_word_descr = re.sub(
                    '[^A-Za-z]+', ' ',
                    self._get_topic_by_id(current_doc_topic_id))

                return result_topic_word_descr, current_doc_other_topics
            else:
                return "", []

    def _get_metric_fields(self):
        if self.lda_model is None:
            return []

        else:
            return ['topic']

    def _get_document_topics(self, doc, count=5):
        if doc is not None:
            bow = self.dictionary.doc2bow(doc)
            topics = self.lda_model.get_document_topics(
                bow, minimum_probability=0.0)
            topics_in_count = list(
                ident_number for (ident_number, prob) in sorted(
                    topics, key=itemgetter(1), reverse=True)[:count])

            return topics_in_count

    def _get_document_topic(self, doc_topics):
        topic_id_probs = {}

        for t_prob in doc_topics:
            topic_id_probs[t_prob[0]] = t_prob[1]

        doc_topic_id = sorted(topic_id_probs,
                              key=topic_id_probs.get,
                              reverse=True)[0]
        doc_topic_prob = topic_id_probs[doc_topic_id]

        return [doc_topic_id, doc_topic_prob]

    def _prepare_single_document(self, sd):
        # Missing values from pandas arrive as floats (NaN); avoid the removed np.float alias.
        if sd is None or isinstance(sd, float):
            return None

        try:
            sd = sd.lower()
            sd = nltk.tokenize.word_tokenize(sd)
            sd = (word for word in sd if word.isalpha() and len(word) > 2)
            stopped_sd = (word for word in sd if word not in self.stop_words)

            lemmatized_doc = [
                self.wn_lemmatizer.lemmatize(word) for word in stopped_sd
            ]

            return lemmatized_doc

        except AttributeError as e:
            print("{}{}".format(EXCEPTION_FLAG, e))
            return None

    def _make_bow(self, text):
        if text is not None:
            d = self._prepare_single_document(text)

            return self.dictionary.doc2bow(d)

    def _make_corpus(self,
                     data_type,
                     resource,
                     data_file_path=None,
                     save_train_dict=True,
                     save_dict_as=None,
                     ltc=None):
        """
			data type can be csv or db # or new - single_ltc
		"""
        if data_type == 'db':
            documents = self.load_db_data(resource=resource)

        elif data_type == 'csv' and data_file_path is not None:
            documents = self.load_csv_data(data_file_path)

        elif data_type == 'single_ltc' and ltc is not None:

            ltc_text = " ".join(e if type(e) is str else "" for e in ltc)
            documents = self.load_single_ltc(ltc_text)

        else:
            documents = None

            print("{}documents is None. Exiting ... \n".format(ERROR_FLAG))
            exit()

        with Pool() as pool:
            # Materialize results while the pool is alive; imap is lazy and the pool
            # is terminated as soon as the with-block exits.
            processed_docs = list(pool.imap(self._prepare_single_document,
                                            documents))

        # Use a list, not a generator: it is iterated twice below (Dictionary + doc2bow).
        processed_docs = [doc for doc in processed_docs if doc is not None]
        self.dictionary = Dictionary(processed_docs)

        if save_dict_as is not None:
            self.dictionary.save(save_dict_as)
        elif save_train_dict:
            self.dictionary.save(self.lda_dict_filename)

        corpus = [
            self.dictionary.doc2bow(proc_doc) for proc_doc in processed_docs
        ]

        return corpus

    def _get_topic_by_id(self, topic_id):
        if self.lda_topic_word_count is not None:
            return self.lda_model.print_topic(topic_id,
                                              self.lda_topic_word_count)

        else:
            return self.lda_model.print_topic(topic_id, 6)

    def _get_topics(self, default_view=False, for_db=True):
        """
			2-tuples (probability * word) of most probable words in topics
			num_topics=-1 <--- to print all topics
		"""
        def _get_words(probabilities_words_string):
            _pre_topic_with_digits_trash = " ".join(
                re.findall(ALL_CHARS, probabilities_words_string))
            probably_clean_topic = re.sub(r'\b\d+(?:\.\d+)?\s+', "",
                                          _pre_topic_with_digits_trash)

            return probably_clean_topic  # " ".join(re.findall('[a-zA-Z]+', probabilities_words_string))

        if default_view:
            return self.lda_model.print_topics(num_topics=-1)

        if for_db:
            resource_id = Resources.select().where(
                Resources.resource == self.resource_identifier_name).first()
            resource_id = resource_id.__data__['resource']

            return [(elem[0], resource_id, _get_words(elem[1]))
                    for elem in self.lda_model.print_topics(
                        num_topics=self.lda_topics_count,
                        num_words=self.lda_topic_word_count)]

        return [(elem[0], _get_words(elem[1]))
                for elem in self.lda_model.print_topics(
                    num_topics=self.lda_topics_count,
                    num_words=self.lda_topic_word_count)]
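A minimal sketch of the CSV workflow LDAMWBase supports, with placeholder resource name and file path; the module-level constants (DEFAULT_PROJECT_PATH, LDA_TOPIC_WORD_COUNT, etc.) are assumed to be configured elsewhere:

extractor = LDAMWBase(mtype='multiple',
                      resource='example_resource',      # placeholder resource identifier
                      resource_language='en',
                      data_type='csv')
extractor.train_model(data_type='csv',
                      resource='example_resource',
                      data_file_path='articles.csv')    # CSV must contain a 'content' column
extractor.load_model()
print(extractor.process_record('Some article text ...', data_type='csv'))
# -> [('topic', '...'), ('other_topics', [...])]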
Example No. 20
    passes = random.randint(100, 120)
    eval_every = None
    seed = np.random.randint(0, 999999)
    print("Seed:", seed, "\n")

    ldaModel = LdaMulticore(corpus,
                            num_topics=numberOfTopics,
                            id2word=dictionary,
                            passes=passes,
                            alpha='asymmetric',
                            eval_every=eval_every,
                            workers=3,
                            random_state=seed)

    # Check resulting topics.
    listOfTopics = ldaModel.print_topics(num_topics=numberOfTopics,
                                         num_words=15)
    for index, i in enumerate(listOfTopics):
        string = str(i[1])
        for c in "0123456789+*\".":
            string = string.replace(c, "")
        string = string.replace("  ", " ")
        print(string)
    # calculate & display perplexity
    # log_perplexity returns the per-word likelihood bound; perplexity itself is
    # 2 ** (-bound), and lower perplexity indicates a better model.
    print('\nPerplexity: ', ldaModel.log_perplexity(corpus))

    # calculate & display coherence
    coherenceModel = CoherenceModel(model=ldaModel,
                                    texts=document,
                                    dictionary=dictionary,
                                    coherence='c_v')
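    # Illustrative completion of the "calculate & display coherence" step:
    # get_coherence() returns the c_v score (higher is better).
    print('\nCoherence Score: ', coherenceModel.get_coherence())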
# Assumed reconstruction of the training call this parameter note refers to: the surrounding
# comments use BOW_lda_model, BOW_corpus and a 10-topic model; the dictionary name is assumed.
BOW_lda_model = LdaMulticore(corpus=BOW_corpus,   # corpus to perform the LDA on
                             num_topics=10,       # number of topics (see section 6 below)
                             id2word=dictionary,  # assumed dictionary mapping ids to words
                             alpha='symmetric',   # LDA params for document-topics (all = 0.1)
                             eta=None)            # LDA params for topic-words (all = 0.1)
'''
: param corpus     : corpus to perform the LDA on
: param num_topics : assumed number of topics present in the corpus
: param id2word    : dictionary mapping word ids (int) to actual words (str)
: param alpha      : list of parameters for the Dirichlet distribution of topics per document
                    --> # of parameters = num_topics  (# of topics)
                    --> if 'symmetric', all parameters = 0.1
: param eta        : list of parameters for the Dirichlet distribution of words per topic
                    --> # of parameters = len(id2word) (# of unique words)
                    --> if not specified, all parameters = 0.1
'''
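As a concrete illustration of the alpha/eta description above, explicit Dirichlet priors can be passed as arrays or scalars; the values, topic count and variable names here are examples only, reusing BOW_corpus and the dictionary from this snippet:

import numpy as np

n_topics = 10
alpha_prior = np.linspace(0.5, 0.05, n_topics)  # one prior value per topic (asymmetric)
eta_prior = 0.1                                 # a single scalar prior applied to every word

example_model = LdaMulticore(corpus=BOW_corpus,
                             num_topics=n_topics,
                             id2word=dictionary,
                             alpha=alpha_prior,
                             eta=eta_prior)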

############### 6. PRINT OUT DETECTED TOPICS & ASSOCIATED WORDS ###############
# The following prints out the words occurring in each of the 10 topics & their relative weights
for i, topic in BOW_lda_model.print_topics(-1):
    print("Topic {}: \n{}\n".format(i, topic))

############### 7. PREDICT A TOPIC CLASS FOR A SAMPLE DOCUMENT ################
# Use BOW_lda_model to predict which topic this document belongs to:
sample_doc_i = 827

for i, score in sorted(BOW_lda_model[BOW_corpus[sample_doc_i]],
                       key=lambda tup: -1 * tup[1]):
    print("\nScore: {}\nTopic: {}".format(score,
                                          BOW_lda_model.print_topic(i, 10)))

################# 8. PREDICT A TOPIC CLASS FOR A NEW DOCUMENT #################
# Use BOW_lda_model to predict which topic a new document belongs to:
new_doc = "Syria gets terrorist attack kills 22 people"
Example No. 22
    def lda(self,
            cat_list: list,
            below: int = 100,
            above: float = 0.1,
            eta: float = 0.9):

        assert set(cat_list).issubset(set(self.table.category.unique()))

        df_topic2 = self.table[self.table.category.isin(
            cat_list)].reset_index().iloc[:, 1:]
        instances = df_topic2.clean_text.apply(str.split)
        d = Dictionary(instances)
        print("Dictionary is:", d)
        d.filter_extremes(no_below=below, no_above=above)
        print("Dictionary after filtering:", d)
        ldacorpus = [d.doc2bow(text) for text in instances]
        tfidfmodel = TfidfModel(ldacorpus)
        model_corpus = tfidfmodel[ldacorpus]
        num_topics = len(df_topic2.groupby(['category']).count())
        temp = df_topic2.groupby(['category']).count()
        prior_probabilities = temp["app"] / temp["app"].sum()
        alpha = prior_probabilities.values
        print("Prior probabilities of the topics -alpha- are:", alpha)
        num_passes = 10
        chunk_size = int(len(model_corpus) * num_passes / 200)  # chunksize must be an integer
        print("Preliminary steps to prepare the model done")
        model = LdaMulticore(
            num_topics=num_topics,  # number of topics
            corpus=model_corpus,  # what to train on 
            id2word=d,  # mapping from IDs to words
            workers=min(10,
                        multiprocessing.cpu_count() -
                        1),  # choose 10 cores, or whatever computer has
            passes=num_passes,  # make this many passes over data
            chunksize=chunk_size,  # update after this many instances
            alpha=alpha,
            eta=eta,
            random_state=5)
        print("Model is ready")
        topic_corpus = model[model_corpus]
        topic_sep = re.compile(r"0\.[0-9]{3}\*")
        model_topics = [(topic_no, re.sub(topic_sep, '',
                                          model_topic).split(' + '))
                        for topic_no, model_topic in model.print_topics(
                            num_topics=num_topics, num_words=5)]

        descriptors = []
        for i, m in model_topics:
            print(i + 1, ", ".join(m[:3]))
            descriptors.append(", ".join(m[:2]).replace('"', ''))
        print(descriptors)
        scores = [[t[1] for t in topic_corpus[entry]]
                  for entry in range(len(instances))]
        topic_distros = pd.DataFrame(data=scores, columns=descriptors)
        topic_distros['category'] = df_topic2['category']
        #%matplotlib inline

        print("Preparing graph")

        sns.set_context('poster')

        fig, ax = plt.subplots(figsize=(20, 10))

        aggregate_by_category = topic_distros.groupby(
            topic_distros.category).mean()

        aggregate_by_category[descriptors].plot.bar(ax=ax)

        fig.set_size_inches(30, 30)
        plt.legend(loc='center left',
                   bbox_to_anchor=(1.0, 0.5),
                   prop={'size': 25})
Example No. 23
def train_LDA_model(data, num_topics, CPUs):

    # Pre-processing
    sentences = [nltk.tokenize.sent_tokenize(doc) for doc in data]
    sentences = [val for sublist in sentences for val in sublist]
    data_words = list(sent_to_words(sentences))

    # Remove Stop Words
    data_words_nostops = remove_stopwords(data_words)

    # Initialize spaCy English model, keeping only the tagger component (for efficiency)
    # python3 -m spacy download en_core_web_sm
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

    # Do lemmatization keeping only noun, adj, vb, adv
    data_lemmatized = lemmatization(
        data_words_nostops, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    # Create Dictionary
    id2word = corpora.Dictionary(data_lemmatized)

    # Create Corpus
    texts = data_lemmatized

    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in texts]

    # ## Train LDA Model

    # Build LDA model
    lda_model = LdaMulticore(corpus=corpus,
                             id2word=id2word,
                             num_topics=num_topics,
                             random_state=50,
                             chunksize=100,
                             passes=10,
                             per_word_topics=True,
                             workers=CPUs)

    model_dest = lda_data_dir + 'LDA_model/all_years_2007_2017/lda_model_all_years.model'
    lda_model.save(model_dest)

    # Print the Keyword in the 10 topics
    pprint(lda_model.print_topics())
    doc_lda = lda_model[corpus]

    # Visualize the topics
    vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    storage_dest_lda_html = lda_data_dir + 'LDA_model/all_years_2007_2017/all_years_2007_2017_local_lda.html'
    pyLDAvis.save_html(vis, storage_dest_lda_html)

    wordcloud_dest = lda_data_dir + 'LDA_model/all_years_2007_2017/wordclouds/'

    for t in range(lda_model.num_topics):
        plt.figure()
        dictionary = {}
        plt.imshow(WordCloud().fit_words(
            Convert(lda_model.show_topic(t, 30), dictionary)))
        plt.axis("off")
        plt.title("Topic_" + str(t))
        plt.savefig(wordcloud_dest + "Topic #" + str(t) +
                    '.png')  # save before show(), which may clear the figure
        plt.show()

    return lda_model
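
The word-cloud loop above relies on a Convert helper that is not shown. Judging from the call Convert(lda_model.show_topic(t, 30), dictionary) and from WordCloud().fit_words(), which expects a word-to-weight mapping, it most likely folds the (word, weight) pairs returned by show_topic into a dict. A minimal sketch of such a helper (the name and signature come from the call above; the body is an assumption):

def Convert(topic_terms, per_topic_dict):
    # Fold a list of (word, weight) pairs, as returned by lda_model.show_topic(),
    # into the dict expected by WordCloud().fit_words().
    for word, weight in topic_terms:
        per_topic_dict[word] = weight
    return per_topic_dict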
Ejemplo n.º 24
0
                             chunksize=1000,
                             batch=False,
                             alpha='asymmetric',
                             decay=0.5,
                             offset=64,
                             eta=None,
                             eval_every=0,
                             iterations=100,
                             gamma_threshold=0.001,
                             per_word_topics=True)

    # save the model
    lda_model.save('tmp/lda_model.model')

    # See the topics
    pprint(lda_model.print_topics(-1))

    for c in lda_model[corpus[5:8]]:
        print("Document Topics      : ", c[0])  # [(Topics, Perc Contrib)]
        print("Word id, Topics      : ", c[1][:3])  # [(Word id, [Topics])]
        print("Phi Values (word id) : ",
              c[2][:2])  # [(Word id, [(Topic, Phi Value)])]
        print("Word, Topics         : ",
              [(dct[wd], topic)
               for wd, topic in c[1][:2]])  # [(Word, [Topics])]
        print("Phi Values (word)    : ",
              [(dct[wd], topic)
               for wd, topic in c[2][:2]])  # [(Word, [(Topic, Phi Value)])]
        print("------------------------------------------------------\n")

    train_vecs = []
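
The fragment above begins in the middle of the LdaMulticore call, so the corpus, dictionary and topic count it was trained with are not visible. A self-contained sketch with the same hyperparameters, using a toy corpus (dct, corpus and the topic count are placeholders standing in for the original data):

from gensim.corpora import Dictionary
from gensim.models import LdaMulticore

texts = [['human', 'computer', 'interface'],
         ['survey', 'user', 'computer', 'system'],
         ['graph', 'trees', 'minors', 'survey']]
dct = Dictionary(texts)
corpus = [dct.doc2bow(text) for text in texts]

lda_model = LdaMulticore(corpus=corpus,
                         id2word=dct,
                         num_topics=2,
                         chunksize=1000,
                         batch=False,
                         alpha='asymmetric',
                         decay=0.5,
                         offset=64,
                         eta=None,
                         eval_every=0,
                         iterations=100,
                         gamma_threshold=0.001,
                         per_word_topics=True)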
Ejemplo n.º 25
0
class TopicModel:
    """
    Create a topic model.

    Filtering parts of speech is currently done using tools.Farasa.
    """
    def __init__(self,
                 pos_to_use: List[str],
                 stop_words: Union[Set[str], List[str], str],
                 min_df: Union[int, float] = 5,
                 max_df: Union[int, float] = 0.85,
                 num_workers: int = 1):
        """
        Initialize model.

        :param pos_to_use: Parts of speech to use, possible values are (Farasa-specific)
        ['S', 'E',
         'V', 'NOUN', 'PRON', 'ADJ', 'NUM',
         'CONJ', 'PART', 'NSUFF', 'CASE', 'FOREIGN',
         'DET', 'PREP', 'ABBREV', 'PUNC']

        :param stop_words: list/set of stop words or filepath to the file containing the stop words.

        :param max_df: When building the vocabulary ignore terms that have a document
        frequency strictly higher than the given threshold (corpus-specific
        stop words).

        :param min_df: When building the vocabulary ignore terms that have a document
        frequency strictly lower than the given threshold. This value is also
        called cut-off in the literature.

        :param num_workers: Number of workers to use for preprocessing and training.
        """
        if isinstance(stop_words, str):
            stop_words = open(stop_words).read().split('\n')

        if isinstance(stop_words, list):
            stop_words = set(stop_words)

        self.pos_to_use = pos_to_use
        self.num_workers = num_workers

        self.min_df = min_df
        self.max_df = max_df

        self.stop_words = stop_words

        self.vectorizer = TfidfVectorizer(stop_words=stop_words,
                                          max_df=self.max_df)

        self.bigram_model: Optional[Phraser] = None
        self.trigram_model: Optional[Phraser] = None
        self.id2word: Optional[Dict] = None

        self._farasa: Optional[Farasa] = None

    @staticmethod
    def _init_pool():
        """
        Initialize a pool worker.

        Runs only once per worker process.
        """
        global farasa
        farasa = Farasa(singelton=False)

    def preprocess_document(self, document: str) -> str:
        """
        Preprocess document.

        :param document: document to preprocess.
        """
        return _preprocess_arabic_text(document,
                                       remove_non_arabic=True,
                                       remove_punctuation=True,
                                       remove_numbers=True,
                                       remove_emails_urls_html=True,
                                       remove_hashtags_mentions=True)

    def _unit_of_work(self, pos_to_use: List[str], document: str) -> str:
        """
        Apply unit of work.

        :param pos_to_use: Parts of speech to keep.
        :param document: Document to process.
        """
        global farasa

        return farasa.filter_pos(  # type: ignore
            self.preprocess_document(document),
            parts_of_speech_to_keep=pos_to_use)

    def preprocess_documents(
            self, documents: Sequence[str]) -> Generator[str, None, None]:
        """
        Preprocess documents.

        :param documents: documents to preprocess.
        """
        progress = tqdm(total=len(documents))

        LOGGER.info('Launching %d workers..', self.num_workers)
        pool = Pool(self.num_workers, initializer=self._init_pool)

        LOGGER.info('Preprocessing documents using %d workers..',
                    self.num_workers)

        results = []
        for document in documents:
            result = pool.apply_async(self._unit_of_work,
                                      (self.pos_to_use, document),
                                      callback=lambda *args: progress.update())
            results.append(result)

        for result in results:
            document = result.get()

            if document != '':
                yield document

        LOGGER.info('Preprocessing is done.')

    def tokenize(self, document: str) -> List[str]:
        """
        Tokenize a document.

        Uses NLTK word tokenizer.
        """
        tokens = word_tokenize(document)
        return [token for token in tokens if token not in self.stop_words]

    def create_trigrams(self, tokens: List[str]) -> List[str]:
        """
        Create trigrams.

        :param tokens: list of tokens.
        :returns: n-grams where n is between 1 and 3.
        """
        if self.trigram_model and self.bigram_model:
            return self.trigram_model[self.bigram_model[tokens]]

        raise ValueError('trigram model is not fitted yet!')

    def build_vocab(
            self,
            documents_tokens: List[List[str]]) -> Tuple[List[List[str]], Dict]:
        """
        Build vocabulary.

        :param documents_tokens: documents as list of tokens, e.g. [
            ['the', 'brown', 'fox'],
            ['another', 'word', ..],
            ...
        ]

        :returns: a tuple consisting of list of documents as word counts (Bag-of-words),
        and Id2Word dictionary.
        """
        LOGGER.info('Fitting bigram model..')
        bigram = Phrases(documents_tokens,
                         min_count=self.min_df,
                         threshold=100,
                         progress_per=100,
                         common_terms=self.stop_words)

        self.bigram_model = Phraser(bigram)

        LOGGER.info('Fitting trigram model..')
        self.trigram_model = Phraser(
            Phrases(bigram[documents_tokens], threshold=100))

        documents_trigrams = []

        LOGGER.info('Creating trigrams..')
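        # Walk the token lists in reverse and pop each one after use so memory is freed as trigrams are built.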
        for index in range(len(documents_tokens) - 1, -1, -1):
            documents_trigrams.append(
                self.create_trigrams(documents_tokens[index]))
            documents_tokens.pop()

        id2word = Dictionary(documents_trigrams)
        return [id2word.doc2bow(text) for text in documents_trigrams], id2word

    def fit(self,
            documents: Sequence[str],
            preprocess: bool,
            passes: int,
            random_state: int,
            num_topics: int,
            chunksize: int = 1000):
        """
        Fit model.

        :param documents: documents to fit the model on.
        :param preprocess: whether to preprocess documents before training the model.
        :param passes: number of passes over the training dataset, 1 is enough if dataset is large.
        :param random_state: random state seed for reproducibility.
        :param num_topics: number of topics.
        :param chunksize: number of documents to use per update.
        """
        self.vectorizer = self.vectorizer.fit(documents)
        self.stop_words |= self.vectorizer.stop_words_

        documents_iter: Iterable = documents if not preprocess else self.preprocess_documents(
            documents)

        LOGGER.info('Building vocab..')
        corpus, self.id2word = self.build_vocab(
            [self.tokenize(x) for x in documents_iter])

        LOGGER.info('Fitting lda..')
        self._lda_model = LdaMulticore(
            corpus=corpus,
            id2word=self.id2word,
            num_topics=num_topics,
            random_state=random_state,
            chunksize=chunksize,
            passes=passes,
            per_word_topics=True,
            workers=self.num_workers,
        )

        self.topics = self._lda_model.print_topics(num_topics=num_topics,
                                                   num_words=100)

    def predict(self, document, topics_map: Dict[int, str],
                num_topics: int) -> List[str]:
        """
        Predict topics distribution for a document.

        :param document: document to predict topics for.
        :param topics_map: a mapping of topic number to topic name.
        :param num_topics: return the top num_topics.
        :returns: a list of topic numbers sorted by their probabilities.
        """
        tokens = (
            seq([document]).map(self.preprocess_document).map(
                lemmatize)  # type: ignore
            .map(self.tokenize).map(self.create_trigrams).flat_map(
                self.id2word.doc2bow)  # type: ignore
            .to_list())

        topics = (seq(
            self._lda_model[tokens][0]).sorted(key=lambda x: -x[1]).map(
                get(0)).filter(None).distinct().take(num_topics))

        if topics_map:
            topics = topics.map(lambda topic: topics_map[topic])

        return topics.to_list()

    @staticmethod
    def load(path: str) -> 'TopicModel':
        """
        Load model.

        :param path: path to the model.
        """
        return dill.load(open(path, 'rb'))

    def save(self, path: str):
        """
        Save model.

        :param path: path to save the model to.
        """
        farasa: Farasa = self.__dict__.pop('_farasa')
        dill.dump(self, open(path, 'wb'))

        self._farasa = farasa
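
For orientation, a minimal usage sketch of the TopicModel class above. The document list, stop-word file path and parameter values are placeholders, and it assumes the module-level helpers the class relies on (tools.Farasa, _preprocess_arabic_text, lemmatize) are importable as in the original project:

docs = [
    'first raw document ...',
    'second raw document ...',
]  # placeholder corpus

tm = TopicModel(pos_to_use=['NOUN', 'ADJ', 'V'],
                stop_words='stop_words.txt',  # hypothetical stop-word file
                min_df=5,
                max_df=0.85,
                num_workers=2)

# preprocess=True routes the documents through the Farasa-based POS filter.
tm.fit(docs, preprocess=True, passes=1, random_state=5, num_topics=10)
tm.save('topic_model.dill')

# Predict the top 3 topics of an unseen document.
print(tm.predict('a new unseen document ...', topics_map={}, num_topics=3))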
class GensimMalletTopicExtractor:
    def __init__(self, language='english', stopwords_extent=None):
        self.language2la = {
            'english': 'en',
            'french': 'fr',
            'spanish': 'es'
        }
        if language not in self.language2la:
            raise ValueError('Language must be "english", "french" or "spanish"')
        self.language = language
        self.stop_words = stopwords.words(self.language)
        if isinstance(stopwords_extent, (str, list)):
            self.stop_words.extend(stopwords_extent)
        self.df_topic_sents_keywords = None
        self.bigram = None
        self.bigram_phraser = None
        self.trigram = None
        self.trigram_phraser = None
        self.vis = None
        self.data = None
        self.data_words = None
        self.data_words_nostops = None
        self.data_words_bigrams = None
        self.data_words_trigrams = None
        self.nlp = None
        self.data_lemmatized = None
        self.id2word = None
        self.texts = None
        self.corpus = None
        self.mallet_path = None
        self.lda_model = None
        self.coherence_model_lda = None
        self.coherence_lda = None
        self.coherence_values = []
        self.model_list = []
        self.optimal_number_of_topics = None
        self.optimal_model = None
        self.optimal_topics = None

    @staticmethod
    def sent_to_words(sentences, remove_punctuation=True):
        for sentence in sentences:
            # deacc=True removes punctuations
            yield(simple_preprocess(str(sentence), deacc=remove_punctuation))

    def remove_stopwords(self, texts):
        return [[word for word in simple_preprocess(str(doc)) if word not in self.stop_words] for doc in texts]

    def make_bigrams(self, texts):
        self.bigram = Phrases(self.data_words, min_count=5, threshold=100)
        self.bigram_phraser = Phraser(self.bigram)
        return [self.bigram_phraser[doc] for doc in texts]

    def make_trigrams(self, texts):
        tokens_ = self.bigram_phraser[texts]
        self.trigram = Phrases(tokens_, threshold=100)
        self.trigram_phraser = Phraser(self.trigram)
        return [self.trigram_phraser[self.bigram_phraser[doc]] for doc in texts]

    def lemmatization(self, texts, allowed_postags=None):
        if allowed_postags is None:
            allowed_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']
        """https://spacy.io/api/annotation"""
        texts_out = []
        for sent in texts:
            doc = self.nlp(" ".join(sent))
            texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
        return texts_out

    def view_terms_frequency(self, text_id, first_words=20):
        # Human readable format of corpus (term-frequency)
        list_ = [(self.id2word[id_], freq) for id_, freq in self.corpus[text_id][:first_words]]
        pprint(list_)

    def visualize_lda(self):
        # Visualize the topics
        # pyLDAvis.enable_notebook()
        self.vis = pyLDAvis.gensim.prepare(self.lda_model, self.corpus, self.id2word)
        print(self.vis)

    def instanciate_model(self, num_topics, passes, iterations,
                          enable_mallet, optimize_interval, topic_threshold, show_topics_on_creation=False):
        if enable_mallet is True:
            # Download File: http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
            os.environ.update({'MALLET_HOME': r'C:/mallet-2.0.8/'})
            self.mallet_path = 'C:\\mallet-2.0.8\\bin\\mallet'  # update this path
            self.lda_model = LdaMallet(self.mallet_path,
                                       corpus=self.corpus,
                                       num_topics=num_topics,
                                       id2word=self.id2word,
                                       iterations=iterations,
                                       optimize_interval=optimize_interval,
                                       topic_threshold=topic_threshold)
            print('Mallet LDA model built\n')
            if show_topics_on_creation is True:
                pprint(self.lda_model.show_topics(formatted=False))
        else:
            self.lda_model = LdaMulticore(corpus=self.corpus,
                                          id2word=self.id2word,
                                          num_topics=num_topics,
                                          random_state=100,
                                          chunksize=500,
                                          passes=passes,
                                          iterations=iterations,
                                          per_word_topics=True)
            print('LDA_MultiCore model built\n')
            if show_topics_on_creation is True:
                pprint(self.lda_model.print_topics())

    def extract_topics(self, data, num_topics, passes=10, iterations=500,
                       enable_mallet=True, optimize_interval=0,
                       topic_threshold=0.0):
        self.data = data
        print('\nEXTRACTING ' + str(num_topics) + ' TOPICS')
        self.data_words = list(self.sent_to_words(self.data, True))
        # Remove Stop Words
        print('\nRemoving stopwords')
        self.data_words_nostops = self.remove_stopwords(self.data_words)
        # Form Bigrams
        print('Looking for bigrams')
        self.data_words_bigrams = self.make_bigrams(self.data_words_nostops)
        # Form Trigrams
        print('Looking for trigrams')
        self.data_words_trigrams = self.make_trigrams(self.data_words_nostops)
        # Initialize spacy 'en' model, keeping only tagger component (for efficiency)
        # python3 -m spacy download en
        print('Loading Spacy with ' + self.language + ' dictionary')
        self.nlp = spacy.load(self.language2la[self.language], disable=['parser', 'ner'])
        # Do lemmatization keeping only noun, adj, vb, adv
        print('Lemmatizing')
        self.data_lemmatized = self.lemmatization(self.data_words_trigrams,
                                                  allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
        # Create Dictionary
        print('Creating dictionary')
        self.id2word = corpora.Dictionary(self.data_lemmatized)
        # Create Corpus
        print('Creating corpus')
        self.texts = self.data_lemmatized
        # Term Document Frequency
        print('Computing document frequency')
        self.corpus = [self.id2word.doc2bow(text) for text in self.texts]
        # Build LDA model
        print('\nEnable_mallet is', enable_mallet, '\n')
        self.instanciate_model(num_topics, passes, iterations,
                               enable_mallet, optimize_interval, topic_threshold,
                               show_topics_on_creation=True)
        # print(self.lda_model[self.corpus])
        # Compute Perplexity
        # a measure of how good the model is. lower the better.
        if hasattr(self.lda_model, 'log_perplexity'):
            print('\nPerplexity: ', self.lda_model.log_perplexity(self.corpus))

        # Compute Coherence Score
        print('\nComputing coherence model')
        self.coherence_model_lda = CoherenceModel(model=self.lda_model,
                                                  texts=self.data_lemmatized,
                                                  dictionary=self.id2word,
                                                  coherence='c_v')
        print('Getting coherence')
        self.coherence_lda = self.coherence_model_lda.get_coherence()
        print('\nCoherence Score: ', self.coherence_lda)

        if enable_mallet is False:
            self.visualize_lda()

    def view_optimal_topics(self, num_words=20):
        pprint(self.optimal_model.print_topics(num_words=num_words))

    def compute_coherence_values(self, limit, start=2, step=3, passes=10,
                                 iterations=500, enable_mallet=True,
                                 optimize_interval=0, topic_threshold=0.0):
        """
        Compute c_v coherence for various number of topics

        Parameters:
        ----------
        limit : Max num of topics

        Returns:
        -------
        model_list : List of LDA topic models
        coherence_values : Coherence values corresponding to the LDA model with respective number of topics
        """
        for num_topics in range(start, limit, step):
            print('\n' + '*'*10 + ' COMPUTING COHERENCE FOR ' + str(num_topics) + ' TOPICS ' + '*'*10)
            self.instanciate_model(num_topics, passes, iterations,
                                   enable_mallet, optimize_interval, topic_threshold,
                                   show_topics_on_creation=False)
            self.model_list.append(self.lda_model)
            coherence_model = CoherenceModel(model=self.lda_model,
                                             texts=self.data_lemmatized,
                                             dictionary=self.id2word,
                                             coherence='c_v')
            self.coherence_values.append(coherence_model.get_coherence())

        # Show graph
        x = range(start, limit, step)
        plt.plot(x, self.coherence_values)
        plt.xlabel("Num Topics")
        plt.ylabel("Coherence score")
        plt.legend("coherence_values", loc='best')
        plt.show()

        # Print the coherence scores
        for m, cv in zip(x, self.coherence_values):
            print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

        optimal_model_index = self.coherence_values.index(max(self.coherence_values))
        self.optimal_number_of_topics = start + optimal_model_index * step
        self.optimal_model = self.model_list[optimal_model_index]
        print('\nOptimal number of topics is ' + str(self.optimal_number_of_topics) +
              ' with coherence score : ' + str(self.coherence_values[optimal_model_index]))
        self.optimal_topics = self.optimal_model.show_topics(num_topics=self.optimal_number_of_topics,
                                                             num_words=20, formatted=False)
        self.view_optimal_topics()

    def format_topics_sentences(self, ldamodel=None):
        if ldamodel is None and self.optimal_model is not None:
            ldamodel = self.optimal_model
        elif ldamodel is None and self.lda_model is not None:
            ldamodel = self.lda_model
        # Init output
        sent_topics_df = pd.DataFrame()

        # Get main topic in each document
        for i, row in enumerate(ldamodel[self.corpus]):
            row = sorted(row, key=lambda x: (x[1]), reverse=True)
            # Get the Dominant topic, Perc Contribution and Keywords for each document
            for j, (topic_num, prop_topic) in enumerate(row):
                if j == 0:  # => dominant topic
                    wp = ldamodel.show_topic(topic_num)
                    topic_keywords = ", ".join([word for word, prop in wp])
                    sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num),
                                                                      round(prop_topic, 4),
                                                                      topic_keywords]),
                                                           ignore_index=True)
                else:
                    break
        sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

        # Add original text to the end of the output
        contents = pd.Series(self.data)
        sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
        return sent_topics_df

    def get_most_representative_documents(self):
        # Group top 5 sentences under each topic
        sent_topics_sorteddf_mallet = pd.DataFrame()

        if self.df_topic_sents_keywords is None:
            self.df_topic_sents_keywords = self.format_topics_sentences()
        # Format
        df_dominant_topic = self.df_topic_sents_keywords.reset_index()
        df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
        sent_topics_outdf_grpd = self.df_topic_sents_keywords.groupby('Dominant_Topic')

        for i, grp in sent_topics_outdf_grpd:
            sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet,
                                                     grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)],
                                                    axis=0)

        # Reset Index
        sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)
        # Format
        sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]
        # Show
        print(sent_topics_sorteddf_mallet.head())

        for i in range(len(sent_topics_sorteddf_mallet)):
            print(i, sent_topics_sorteddf_mallet.loc[i, 'Text'])

    def get_topic_distribution(self):
        if self.df_topic_sents_keywords is None:
            self.df_topic_sents_keywords = self.format_topics_sentences()
        # Number of Documents for Each Topic
        topic_counts = self.df_topic_sents_keywords['Dominant_Topic'].value_counts()
        # Percentage of Documents for Each Topic
        topic_contribution = round(topic_counts/topic_counts.sum(), 4)
        # Topic Number and Keywords
        topic_num_keywords = self.df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']]
        # Concatenate Column wise
        df_dominant_topics = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)
        # Change Column names
        df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']
        # Show
        print(df_dominant_topics)
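
A short usage sketch of the GensimMalletTopicExtractor defined above, with enable_mallet=False so that only gensim's LdaMulticore is needed (no Mallet download); the document list, topic counts and sweep range are placeholders:

docs = ['first raw document ...', 'second raw document ...']  # placeholder corpus

extractor = GensimMalletTopicExtractor(language='english')

# Fit a single model with a fixed number of topics.
extractor.extract_topics(docs, num_topics=5, passes=10,
                         iterations=100, enable_mallet=False)

# Or sweep the number of topics and keep the model with the best c_v coherence.
extractor.compute_coherence_values(limit=12, start=2, step=3,
                                   passes=10, iterations=100,
                                   enable_mallet=False)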
                     id2word=dictionary_big,
                     workers=min(10,
                                 multiprocessing.cpu_count() - 1),
                     passes=num_passes,
                     chunksize=chunk_size,
                     alpha=0.5)

print("done in {}".format(time.time() - start), flush=True)

topic_corpus = model[model_corpus]
#topic_corpus[0]

# Print the topics in a more readable format, transforming them with a regex
topic_sep = re.compile(r"0\.[0-9]{3}\*")
model_topics = [(topic_no, re.sub(topic_sep, '', model_topic).split(' + '))
                for topic_no, model_topic in model.print_topics(
                    num_topics=num_topics, num_words=5)]
descriptors = []
for i, m in model_topics:
    print(i + 1, ", ".join(m[:5]))
    descriptors.append(", ".join(m[:2]).replace('"', ''))

# #### 2.5.2- DYNAMIC TOPIC MODELING -- LdaSeqModel
# <a id="dynamic"></a>

# In[41]:
'''Analyzing the changes of three topics between the two halves of Harry Potter 1 and the two halves of Harry Potter 7.
'''

# Create an object toklist_17 containing the tokens for Book 1 and Book 7 together
toklist_17 = corpustot.loc[corpustot['Book'].isin(
    ['Harry Potter 1', 'Harry Potter 7'])]['Tokens'].to_list()
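
The heading above announces dynamic topic modeling with LdaSeqModel, but the call itself lies beyond this excerpt. A minimal sketch of how LdaSeqModel is typically fitted over two time slices (the token lists, slice sizes and topic count below are placeholders, not the Harry Potter data):

from gensim.corpora import Dictionary
from gensim.models import LdaSeqModel

# Placeholder token lists standing in for the two halves of the corpus.
slice_1 = [['wand', 'spell', 'school'], ['letter', 'owl', 'school']]
slice_2 = [['horcrux', 'battle', 'wand'], ['forest', 'battle', 'spell']]
docs = slice_1 + slice_2

dictionary_seq = Dictionary(docs)
corpus_seq = [dictionary_seq.doc2bow(doc) for doc in docs]

# time_slice gives the number of documents in each consecutive period.
ldaseq = LdaSeqModel(corpus=corpus_seq,
                     id2word=dictionary_seq,
                     time_slice=[len(slice_1), len(slice_2)],
                     num_topics=2)

# Topic distributions for the first period.
print(ldaseq.print_topics(time=0))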