import argparse
import logging
from pprint import pformat

import numpy as np
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.wrappers.ldamallet import LdaMallet

# module-level logger (assumed; the original configures logging elsewhere)
logger = logging.getLogger(__name__)


def main():
    parser = argparse.ArgumentParser(
        description=
        'trains a topic model from a given bag-of-words corpus file and an id2word dictionary'
    )
    parser.add_argument(
        '--bow',
        type=argparse.FileType('r'),
        help=
        'path to input text-based MatrixMarket bow corpus file (.mm/.mm.bz2)',
        required=True)
    parser.add_argument(
        '--id2word',
        type=argparse.FileType('r'),
        help='path to input text-based id2word dictionary file (.txt/.txt.bz2)',
        required=True)
    parser.add_argument('--mallet',
                        type=argparse.FileType('r'),
                        help='path to java mallet executable',
                        required=True)
    parser.add_argument('--model-prefix',
                        type=argparse.FileType('w'),
                        help='prefix of output binary lda model files',
                        required=True)
    parser.add_argument('--num-topics',
                        type=int,
                        help='number of latent topics',
                        required=True)
    parser.add_argument('--num-iterations',
                        type=int,
                        help='number of training iterations',
                        required=True)
    parser.add_argument('--alpha',
                        type=float,
                        help='symmetric lda prior value',
                        required=True)
    #parser.add_argument('--beta', type=valid_gensim_prior, help='distribution-over-vocabulary prior: must be float, "symmetric" or "auto"', required=True)
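    # A possible implementation of the `valid_gensim_prior` argparse type referenced
    # in the commented-out --beta option above (hypothetical sketch, not part of the
    # original snippet):
    #
    # def valid_gensim_prior(value):
    #     if value in ('symmetric', 'auto'):
    #         return value
    #     try:
    #         return float(value)
    #     except ValueError:
    #         raise argparse.ArgumentTypeError(
    #             'prior must be a float, "symmetric" or "auto"')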

    args = parser.parse_args()
    input_bow_path = args.bow.name
    input_id2word_path = args.id2word.name
    input_mallet_path = args.mallet.name
    output_model_prefix = args.model_prefix.name
    num_topics = args.num_topics
    num_iterations = args.num_iterations
    alpha = args.alpha

    logger.info('running topic model with \n{}'.format(
        pformat({
            'input_bow_path': input_bow_path,
            'input_id2word_path': input_id2word_path,
            'input_mallet_path': input_mallet_path,
            'output_model_prefix': output_model_prefix,
            'num_topics': num_topics,
            'num_iterations': num_iterations,
            'alpha': alpha
        })))

    # load the BOW corpus & vocabulary, build & save the LDA model
    bow = MmCorpus(input_bow_path)
    id2word = Dictionary.load_from_text(input_id2word_path)
    lda_model = LdaMallet(input_mallet_path,
                          alpha=alpha,
                          corpus=bow,
                          num_topics=num_topics,
                          id2word=id2word,
                          workers=8,
                          prefix=output_model_prefix,
                          optimize_interval=50,
                          iterations=num_iterations)
    logger.info(
        'saving model with output prefix {}'.format(output_model_prefix))
    lda_model.save(output_model_prefix)  # saves ONLY the model files, not the underlying data

    # print the most characteristic terms of each topic
    max_printed_terms = 10
    for topicid in range(num_topics):
        logger.info('topic nr. {}: {}'.format(
            topicid, lda_model.print_topic(topicid, topn=max_printed_terms)))

    # compute mean and standard deviation of the theta sums (i.e., the topics' shares per document)
    theta_sums = [None] * bow.num_docs
    for doc, doc_topics in enumerate(lda_model[bow]):
        theta_sums[doc] = sum(theta for topic, theta in doc_topics)
    theta_sums = np.array(theta_sums)
    logger.info('mean theta sum {}'.format(np.mean(theta_sums)))
    logger.info('stddev theta sum {}'.format(np.std(theta_sums)))

    # compute mean and standard deviation of the phi sums (i.e., the terms' shares per topic)
    phi = lda_model.get_topics()
    logger.info('phi shape {}'.format(phi.shape))
    phi_sums = phi.sum(1)
    logger.info('phi sums shape {}'.format(phi_sums.shape))
    logger.info('mean phi sum {}'.format(np.mean(phi_sums)))
    logger.info('stddev phi sum {}'.format(np.std(phi_sums)))
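
# Invocation sketch (assumed; the original snippet's entry point is not shown):
# configure basic logging and run main() when executed as a script.
if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    main()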
Example #2
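# NOTE: this snippet assumes module-level imports and project configuration such as:
#   from multiprocessing import cpu_count
#   import numpy as np
#   import pandas as pd
#   from gensim.corpora import Dictionary, MmCorpus
#   from gensim.models import CoherenceModel, LdaModel
#   from gensim.models.word2vec import LineSentence
#   from gensim.models.wrappers.ldamallet import LdaMallet
# plus globals like data_dir_processed, mallet_path, min_absolute_frequency,
# max_relative_frequency, max_features, verbose, multicore and logger.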
def topic_model_gensim_lda(col: str, prefix=None, min_topics=19, max_topics=19, step=2) -> None:
    def trigram_bow_generator(filepath: str):
        '''
        generator function to read docs from a file
        and yield a bag-of-words representation
        '''
        for doc in LineSentence(filepath):
            yield trigram_dictionary.doc2bow(doc)

    if prefix is None:
        prefix = ''
    # for topic modeling
    
    trigram_docs_filepath = data_dir_processed / f'{prefix}{col}_transformed_docs_all.txt'
    print(f'Loading input file {trigram_docs_filepath}')
    trigram_dictionary_filepath = data_dir_processed / f'{prefix}{col}_trigram_dict_all.dict'
    trigram_bow_filepath = data_dir_processed / f'{prefix}{col}_trigram_bow_corpus_all.mm'

    #resp_whytfa_trigram_transformed_docs_all.txt

    # convert to POSIX string paths until gensim supports pathlib paths
    trigram_docs_filepath = trigram_docs_filepath.as_posix()
    trigram_dictionary_filepath = trigram_dictionary_filepath.as_posix()
    trigram_bow_filepath = trigram_bow_filepath.as_posix()

    # TODO - change 1 == 1 lines to overwrite_interim

    # this is a bit time consuming - make the if statement True
    # if you want to learn the dictionary yourself.
    if 1 == 1:
        trigram_docs = LineSentence(trigram_docs_filepath)
        # learn the dictionary by iterating over all of the docs
        trigram_dictionary = Dictionary(trigram_docs)
        print(trigram_dictionary)
        #for k, v in trigram_dictionary.iteritems():
        #    print (f'{k}, {v}')


        # filter tokens that are very rare or too common from
        # the dictionary (filter_extremes) and reassign integer ids (compactify)
        trigram_dictionary.filter_extremes(no_below=min_absolute_frequency,
                                           no_above=max_relative_frequency,
                                           keep_n=max_features,
                                           )
        trigram_dictionary.compactify()
        print(trigram_dictionary)
        #for k, v in trigram_dictionary.iteritems():
        #    print (f'{k}, {v}')

        if verbose:
            logger.info(f'Saving trigram dictionary: {trigram_dictionary_filepath} {len(trigram_dictionary)}')
        trigram_dictionary.save(trigram_dictionary_filepath)

    # load the finished dictionary from disk
    if verbose:
        logger.info(f'Loading trigram dictionary: {trigram_dictionary_filepath}')
    trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)

    # this is a bit time consuming - make the if statement True
    # if you want to build the bag-of-words corpus yourself.
    if 1 == 1:
        # generate bag-of-words representations for
        # all docs and save them as a matrix
        if verbose:
            print(f'Saving corpus: {trigram_bow_filepath}')
        MmCorpus.serialize(trigram_bow_filepath,
                           trigram_bow_generator(trigram_docs_filepath))
    # load the finished bag-of-words corpus from disk
    if verbose:
        print(f'Loading corpus: {trigram_bow_filepath}')
    trigram_bow_corpus = MmCorpus(trigram_bow_filepath)
    num_topics_range = range(min_topics, max_topics + 1, step)

    #iterations = 2000
    #chunksize = 100  # more than the number of docs?

    passes = 10
    # iterations = 400
    iterations = 100
    # chunksize = len(trigram_bow_corpus)
    chunksize = 100  # more than the number of docs?
    eta = 'auto'
    #eval_every = None  # Don't evaluate model perplexity, takes too much time.
    workers = 1
    print(f'cpu_count: {cpu_count()}')
    alpha = 'auto'
    if multicore:
        # for multicore: use one fewer worker than the number of cores
        workers = cpu_count() - 1
        if verbose:
            print(f'Multiprocessing with {workers} workers (one fewer than the number of cores)')
    else:
        # for single core; alpha='auto' cannot be used with the multicore implementation
        alpha = 'auto'

    # now_str = datetime.now(timezone('US/Pacific')).strftime('%Y-%m-%d-%H-%M-%S')
    now_str = ''  # datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    save_dir = data_dir_processed / f'{prefix}{col}_gensim_lda_models_{now_str}'
    if not save_dir.exists():
        save_dir.mkdir(parents=True, exist_ok=True)
    # save_dir_s3 = f'{data_dir_processed_s3}/{prefix}{col}_gensim_lda_models_{now_str}'

    # lm_list = []
    c_v = []
    u_mass = []
    perp = []
    # alg = 'LDA'
    alg = 'Mallet'

    for num_topics in num_topics_range:

        if alg == 'Mallet':
            logger.info('Using Mallet...')
            # try the Mallet implementation
            ldamallet = LdaMallet(mallet_path,
                                  corpus=trigram_bow_corpus,
                                  num_topics=num_topics,
                                  id2word=trigram_dictionary,
                                  workers=workers,
                                  iterations=iterations)

            ldamallet_filepath = (save_dir / f'gensim_ldamallet_{num_topics}_topics').as_posix()
            ldamallet.save(ldamallet_filepath)

            for t in ldamallet.show_topics(num_topics=-1, num_words=10, formatted=False):
                words = [w[0] for w in t[1]]
                logger.info('topic {:2d}\t{}'.format(t[0], ' '.join(words)))

            # Show Topics
            #print(ldamallet.show_topics(formatted=False))

            # Compute Coherence Score
            cm = CoherenceModel(model=ldamallet, texts=trigram_docs, dictionary=trigram_dictionary, coherence='c_v')
            c_v.append(cm.get_coherence())
            cm = CoherenceModel(model=ldamallet, corpus=trigram_bow_corpus,
                            dictionary=trigram_dictionary, coherence='u_mass')#, processes=workers)
            u_mass.append(cm.get_coherence())
            #perp_lower_bound = ldamallet.log_perplexity(trigram_bow_corpus)
            #perp.append(2**(-perp_lower_bound))
            perp.append(0)

        else:
            logger.info('Using LDA...')
            #TODO: try with and without alpha
            ldamodel = LdaModel(corpus=trigram_bow_corpus, id2word=trigram_dictionary,
                                num_topics=num_topics, passes=passes, iterations=iterations,
                                chunksize=chunksize, eta=eta, #eval_every=eval_every,
                                alpha=alpha,
                                random_state=np.random.RandomState(seed=10101010),
                                )
            #ldamodel = LdaMulticore(corpus=trigram_bow_corpus, id2word=trigram_dictionary,
            #                     num_topics=num_topics, passes=passes, iterations=iterations,
            #                     chunksize=chunksize, eta=eta, #eval_every=eval_every,
            #                     random_state=np.random.RandomState(seed=10101010),
            #                     workers=workers
            #                     )                                 
             
            ldamodel_filepath = (save_dir / f'gensim_lda_{num_topics}_topics').as_posix()
            ldamodel.save(ldamodel_filepath)

            for t in ldamodel.show_topics(num_topics=-1, num_words=50, formatted=False):
                words = [w[0] for w in t[1]]
                logger.info('topic {:2d}\t{}'.format(t[0], ' '.join(words)))

            cm = CoherenceModel(model=ldamodel, texts=trigram_docs,
                            dictionary=trigram_dictionary, coherence='c_v')#, processes=workers)
            c_v.append(cm.get_coherence())
            cm = CoherenceModel(model=ldamodel, corpus=trigram_bow_corpus,
                            dictionary=trigram_dictionary, coherence='u_mass') #, processes=workers)
            u_mass.append(cm.get_coherence())
            perp_lower_bound = ldamodel.log_perplexity(trigram_bow_corpus)
            perp.append(2**(-perp_lower_bound))

    coh_perp = pd.DataFrame(
        data=np.array([c_v, u_mass, perp]).T,
        columns=['c_v', 'u_mass', 'perp'],
        index=list(num_topics_range))
    coh_perp.index.name = 'num_topics'
    coh_perp_filepath = save_dir / 'coherence_perplexity.csv'
    coh_perp.to_csv(coh_perp_filepath)
    logger.info('coherence_docs={0}, coherence_corpus={1}, perplexity={2}'.format(c_v, u_mass, perp))
Example #3
import json
from collections import Counter
from multiprocessing import cpu_count

from gensim import corpora
from gensim.models.wrappers.ldamallet import LdaMallet
from gensim.corpora import Dictionary

MALLET_PATH = '/usr/local/Cellar/mallet/2.0.7/bin/mallet'
DATA_PATH = '../data/arXiv'

if __name__ == '__main__':
    all_words = []
    ts = 17
    ti = 30
    topic_frequency = []
    dictionary = Dictionary.load(DATA_PATH + '/arxiv_dict.dict')
    for y in range(2000, 2017):
        lst = json.load(
            open(DATA_PATH + '/processed/processed_{}.json'.format(y)))
        # constructing a document-term matrix
        mallet = LdaMallet.load(
            DATA_PATH + '/mallet_files/arxiv_{}_mallet_model'.format(y))
        corpus = [dictionary.doc2bow(x) for x in lst]
        result = mallet[corpus]
        # keep the topic id of the first (topic_id, weight) pair per document
        topics = [x[0][0] for x in result]

        topic_frequency.append(topics)
    print(topic_frequency)
    json.dump(topic_frequency, open('topic_inference.json', 'w'))
Example #4
from gensim.models.wrappers.ldamallet import LdaMallet
import json

DATA_PATH = '../data/enron'

result = []
for y in [2000, 2001]:
    for m in range(1, 13):
        topic_keys = []
        mallet_name = 'mallet_models/{}-{}_mallet'.format(y, m)
        lda = LdaMallet.load(mallet_name)
        for i in range(30):
            topic_keys.append(
                {w: str(p)
                 for p, w in lda.show_topic(i, num_words=30)})
        result.append(topic_keys)

json.dump(result, open('json_files/topic_keys.json', 'w'))
Example #5
import json
from multiprocessing import cpu_count

from gensim import corpora
from gensim.models.wrappers.ldamallet import LdaMallet

MALLET_PATH = '/usr/local/Cellar/mallet/2.0.7/bin/mallet'
DATA_PATH = '../data/arXiv'

if __name__ == '__main__':
    entire_corpus = []
    for y in range(2000, 2017):
        lst = json.load(
            open(DATA_PATH + '/processed/processed_{}.json'.format(y)))
        entire_corpus.extend(lst)
    # constructing a document-term matrix
    dictionary = corpora.Dictionary(entire_corpus)
    dictionary.filter_extremes(5, 0.1)
    dictionary.save(DATA_PATH + '/arxiv_dict.dict')
    dictionary = corpora.Dictionary.load(DATA_PATH + '/arxiv_dict.dict')
    for y in range(2000, 2017):
        print(y)
        lst = json.load(
            open(DATA_PATH + '/processed/processed_{}.json'.format(y)))
        # print(dictionary)
        corpus = [dictionary.doc2bow(x) for x in lst]

        lda = LdaMallet(
            mallet_path=MALLET_PATH,
            corpus=corpus,
            id2word=dictionary,
            num_topics=30,
            optimize_interval=10,
            iterations=2000,
            workers=cpu_count(),
        )
        lda.save(DATA_PATH + '/mallet_files/arxiv_{}_mallet_model'.format(y))
Example #6
import json
from gensim import corpora
from gensim.models.wrappers.ldamallet import LdaMallet
from multiprocessing import cpu_count

MALLET_PATH = '/usr/local/Cellar/mallet/2.0.7/bin/mallet'
DATA_PATH = '../data/arXiv/mallet_files'
# For debugging

if __name__ == '__main__':
    # lst = json.load(open('tmp.json'))
    # print(lst)
    # dictionary = corpora.Dictionary(lst)
    # dictionary.filter_extremes(5, 0.1)
    # corpus = [dictionary.doc2bow(x) for x in lst]
    #
    # lda = LdaMallet(
    #     mallet_path=MALLET_PATH,
    #     corpus=corpus,
    #     id2word=dictionary,
    #     num_topics=30,
    #     optimize_interval=10,
    #     iterations=2000,
    #     workers=cpu_count(),
    # )
    lda = LdaMallet.load(DATA_PATH + '/arxiv_2013_mallet_model')
    for i in range(30):
        print([x for x, p in lda.show_topic(i, num_words=15)])
Example #7
def compute_sentiment():
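    # NOTE: assumed module-level imports for this snippet: csv, re,
    # nltk.corpus.stopwords and gensim.models.wrappers.ldamallet.LdaMallet.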
    # load the trained LDA model
    lda = LdaMallet.load('../data/lda_8_topic_neighborhood_review_final.lda')
    lda_topic_object_array = lda.show_topics(num_topics=-1, num_words=20, log=False, formatted=False)      
    lda.print_topics(num_topics=-1, num_words=20)
    
    
    # load the sentiment dictionary, and convert the initial scale from -5 to 5 to 1 to 5
    sentiment_dict = {}
    with open('../data/AFINN-111.txt') as AFINN_file: 
        for line in AFINN_file:
            splitted = line.split('\t')
            word = splitted[0].strip()
            init_score = float(splitted[1].strip())
            new_score = (((init_score - (-5)) * (5 - 1)) / (5 - (-5))) + 1
            sentiment_dict[word] = new_score
            
      
    default_stopwords = stopwords.words('english')
    # avoid place names being treated as ordinary words
    additional_stopwords = ["manhattan", "new york", "nyc", "brooklyn", "bronx", "queens"]

    # file for storing the word-based sentiment analysis result
    wordbased_output_file = open('../data/word_sentiment.csv', 'w')
    
    # file for storing the naive sentiment analysis result
    naive_output_file = open('../data/naive_sentiment.csv', 'w')
    
    fieldnames = ['reviewID', 'neighborhood', 'reviewer', 'rating', 'content',
                  'crimeSafety', 'housingCondition', 'transportationConvenience',
                  'employmentOpportunity', 'lifeConvenience', 'localWeather',
                  'cultureDiversity', 'communityFriendliness']
    topic_mapping_dict = {'crimeSafety': 5, 'housingCondition': 1,
                          'transportationConvenience': 2, 'employmentOpportunity': 7,
                          'lifeConvenience': 3, 'localWeather': 6,
                          'cultureDiversity': 0, 'communityFriendliness': 4}
    
    review_writer = csv.DictWriter(wordbased_output_file, fieldnames=fieldnames)
    review_writer.writeheader()
    
    naive_review_writer = csv.DictWriter(naive_output_file, fieldnames=fieldnames)
    naive_review_writer.writeheader()

    
    with open('../data/all_reviews.csv', 'r') as csvfile:
        csvreader = csv.DictReader(csvfile)   
        for row in csvreader:
            #review_obj = {'reviewID':row['reviewID'],"neighborhood":row["neighborhood"], "authorID":row["authorID"],"overall_rating": float(row["overall_rating"]),"review_content":row["review_content"]}
            #print(review_obj)                
            #reviews.append(review_obj)
            #print("The review is: "+row["review_content"])
            
            neighborhood_name = row["neighborhood"].lower().strip()
            review_content = row["review_content"].lower()
            review_content = review_content.replace(neighborhood_name,'')
            
            for special_word in additional_stopwords:
                review_content = review_content.replace(special_word, '')
            review_content = re.sub(r'\s+', ' ', review_content)
            review_content = review_content.strip()
            
            # since sentences are split on commas, question marks, etc., short
            # fragments are concatenated back onto the previous sentence
            review_sentence_array_raw = re.split('[;!?.,]', review_content)
            
            review_sentence_array = []
            for raw_sentence in review_sentence_array_raw:
                if len(raw_sentence.split()) <= 3:
                    if len(review_sentence_array) > 0:
                        review_sentence_array[-1] += " " + raw_sentence.strip()
                    else:
                        review_sentence_array.append(raw_sentence.strip())
                else:
                    review_sentence_array.append(raw_sentence)
                    
                    
            # create a dict object to store the sentiments of various aspects; -1 is for overall      
            review_senti_score_dict = {-1:{'count':0,'score':0}}
            for this_topic_object in lda_topic_object_array:
                review_senti_score_dict[this_topic_object[0]] = {'count':0,'score':0}
            
            
            for sentence in review_sentence_array:
                #print('The sentence is: ' + sentence)
                sentence = re.sub('[^a-zA-Z]', ' ', sentence)
                sentence = re.sub(r'\s+', ' ', sentence)
                sentence = sentence.strip()
                if len(sentence) == 0:
                    continue
                sentence_words = [word for word in sentence.split() if word not in default_stopwords and len(word) > 1]
                
                # judge which topic this sentence is about
                sentence_topic_dict = {}
                for this_topic_object in lda_topic_object_array:
                    sentence_topic_dict[this_topic_object[0]] = 0
                    for this_keyword_object in this_topic_object[1]:
                        for this_sentence_word in sentence_words:
                            if this_keyword_object[0] == this_sentence_word:
                                sentence_topic_dict[this_topic_object[0]] += this_keyword_object[1]
      
                sentence_final_topic = -1
                maxi_topic_value = 0
                for topic in sentence_topic_dict:
                    if (sentence_topic_dict[topic] >= maxi_topic_value) and (sentence_topic_dict[topic]!= 0):
                        
                        if sentence_topic_dict[topic] == maxi_topic_value:
                            print("find tie "+ str(sentence_topic_dict[topic]))
                            
                        sentence_final_topic = topic
                        maxi_topic_value = sentence_topic_dict[topic]
                
                
                
                # add the sentiment score for each sentence
                for this_sentence_word in sentence_words:
                    if this_sentence_word in sentiment_dict:
                        if sentence_final_topic != -1:
                            review_senti_score_dict[sentence_final_topic]['score'] += sentiment_dict[this_sentence_word]
                            review_senti_score_dict[sentence_final_topic]['count'] += 1
                            
                        review_senti_score_dict[-1]['score'] += sentiment_dict[this_sentence_word]
                        review_senti_score_dict[-1]['count'] += 1
                
            
            
            review_senti_result = ''
            naive_review_senti_result = ''
            overallRating = float(row["overall_rating"])
            
            naive_review_senti_score_dict = {}  # this is for the naive approach (where all the aspects are the same)
            
            for topic in review_senti_score_dict:
                count = review_senti_score_dict[topic]['count']
                score = review_senti_score_dict[topic]['score']
                
                if count > 0:
                    avg_score = float(score)/float(count)
                    review_senti_score_dict[topic]['score'] = avg_score
                    review_senti_result += " topic:"+str(topic)+", score:"+str(avg_score)+"; "
                    
                    naive_review_senti_score_dict[topic] = overallRating
                    naive_review_senti_result += " topic:"+str(topic)+", score:"+str(overallRating)+"; "
                    
                else:
                    review_senti_score_dict[topic]['score'] = -1
                    naive_review_senti_score_dict[topic] = -1
            
            print(review_senti_result.strip())
            
            
            print(str(review_senti_score_dict))
            
            
            review_writer.writerow({
                'reviewID': row['reviewID'],
                'neighborhood': row["neighborhood"],
                'reviewer': row["authorID"],
                'rating': float(row["overall_rating"]),
                'content': row["review_content"],
                'crimeSafety': review_senti_score_dict[topic_mapping_dict['crimeSafety']]['score'],
                'housingCondition': review_senti_score_dict[topic_mapping_dict['housingCondition']]['score'],
                'transportationConvenience': review_senti_score_dict[topic_mapping_dict['transportationConvenience']]['score'],
                'employmentOpportunity': review_senti_score_dict[topic_mapping_dict['employmentOpportunity']]['score'],
                'lifeConvenience': review_senti_score_dict[topic_mapping_dict['lifeConvenience']]['score'],
                'localWeather': review_senti_score_dict[topic_mapping_dict['localWeather']]['score'],
                'cultureDiversity': review_senti_score_dict[topic_mapping_dict['cultureDiversity']]['score'],
                'communityFriendliness': review_senti_score_dict[topic_mapping_dict['communityFriendliness']]['score'],
            })
            naive_review_writer.writerow({
                'reviewID': row['reviewID'],
                'neighborhood': row["neighborhood"],
                'reviewer': row["authorID"],
                'rating': float(row["overall_rating"]),
                'content': row["review_content"],
                'crimeSafety': naive_review_senti_score_dict[topic_mapping_dict['crimeSafety']],
                'housingCondition': naive_review_senti_score_dict[topic_mapping_dict['housingCondition']],
                'transportationConvenience': naive_review_senti_score_dict[topic_mapping_dict['transportationConvenience']],
                'employmentOpportunity': naive_review_senti_score_dict[topic_mapping_dict['employmentOpportunity']],
                'lifeConvenience': naive_review_senti_score_dict[topic_mapping_dict['lifeConvenience']],
                'localWeather': naive_review_senti_score_dict[topic_mapping_dict['localWeather']],
                'cultureDiversity': naive_review_senti_score_dict[topic_mapping_dict['cultureDiversity']],
                'communityFriendliness': naive_review_senti_score_dict[topic_mapping_dict['communityFriendliness']],
            })
              
                
    wordbased_output_file.close() 
    naive_output_file.close()          
Example #8
from gensim.models.wrappers.ldamallet import LdaMallet
import json

DATA_PATH = '/Users/ranxiao/Desktop/data/arXiv'
result = []
for y in range(2000, 2017):
    topic_keys = []
    model_path = DATA_PATH + '/mallet_files/arxiv_{}_mallet_model'.format(y)
    lda = LdaMallet.load(model_path)
    for i in range(30):  # num of topics
        topic_keys.append(
            {w: str(p)
             for w, p in lda.show_topic(i, num_words=100)})
    result.append(topic_keys)

json.dump(result, open('topic_keys.json', 'w'))
Example #9
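# NOTE: this snippet assumes module-level imports and helpers from the original
# project, e.g. os, numpy as np, LdaMallet and malletmodel2ldamodel from
# gensim.models.wrappers.ldamallet, and a project-specific unpickle() helper;
# Topics.retrieve() expects a spaCy Doc.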
class Topics(object):
    __dict_path = os.path.join(os.path.dirname(__file__),
                               'models/mallet-dict.pkl')
    __model_path = os.path.join(os.path.dirname(__file__),
                                'models/mallet-model.model')
    __mallet_path = os.path.join(os.path.dirname(__file__),
                                 'models/mallet/bin/mallet')
    __topic_file_path = os.path.join(os.path.dirname(__file__),
                                     'models/topic-files/')

    dictionary = unpickle(__dict_path)
    model = LdaMallet.load(__model_path)
    model.mallet_path = __mallet_path
    model.prefix = __topic_file_path

    model_fast = malletmodel2ldamodel(model, 0.1, 1000)

    topic_map = {
        0: 'education',
        1: 'dating',
        2: 'change',
        3: 'communication',
        4: 'broken relationship',  # relationship status
        5: 'finances and accounting',
        6: 'excessive thoughts',
        7: 'politics',
        8: 'financial investments',
        9: 'physical health',
        10: 'work',
        11: 'sleep',
        12: 'emotions',
        13: 'medication regimen',
        14: 'past experiences / decisions',  # or decisions
        15: 'general apathy',
        16: 'NaN',  # ignore
        17: 'relocation',
        18: 'social stressors',
        19: 'memories',
        20: 'financial decisions',
        21: 'family',
        22: 'nutrition and weight',
        23: 'relationships',
        24: 'marital issues',
        25: 'religion and belief systems',
        26: 'experiences',
        27: 'financial pressure',
        28: 'romantic relationship',
        29: 'relationship issues',
        30: 'routines',
        31: 'taxes and claims',  # income and benefits
        32: 'symptoms of mental illness',
        33: 'dispute and argument',
        34: 'lack of motivation',
        35: 'reflection and mindfulness',
        36: 'event or festivity',
        37: 'self-harm',  # suicide
        38: 'resources and information',
        39: 'addiction',
        40: 'addiction recovery',
        41: 'leisure'
    }

    def get_topics(self, topics):
        top_topics = topics[:, 1].argsort()[-5:][::-1]

        # TODO: weight down scoring
        scores = 0.
        results = {}
        for idx, entry in enumerate(top_topics):
            topic = int(topics[entry][0])
            score = topics[entry][1]

            if idx == 0 and score <= .1:
                return None

            if scores < .55:
                if self.topic_map[topic] != 'NaN':
                    results[self.topic_map[topic]] = score
                scores += score
            else:
                break

        return results

    # def retrieve(self, doc):
    #    tokens = self.get_tokens(doc)
    #    bow = self.dictionary.doc2bow(tokens)
    #    topics = np.array(self.model[bow])
    #    return self.get_topics(topics)

    def retrieve(self, doc):
        tokens = self.get_tokens(doc)
        bow = self.dictionary.doc2bow(tokens)
        topics = np.array(self.model_fast[bow])
        return self.get_topics(topics)

    @staticmethod
    def get_tokens(doc):
        result = []
        for tok in doc:

            # skip prepositions, modals and cardinal numbers
            if tok.pos_ in ['IN', 'MD', 'CD']:
                continue

            if tok.is_digit or tok.like_num:
                continue

            if tok.is_punct:
                continue

            elif tok.is_stop:
                continue

            else:
                result.append(tok.text.lower())

        return result
Example #10
import json
from gensim import corpora
from gensim.models.wrappers.ldamallet import LdaMallet
from multiprocessing import cpu_count

MALLET_PATH = '/usr/local/mallet-2.0.6/bin/mallet'
#DATA_PATH = '/Users/ranxiao/Desktop/data/arXiv'
# For debugging

if __name__ == '__main__':
    lst = json.load(open('processed_2000_1.json'))
    dictionary = corpora.Dictionary(lst)
    dictionary.filter_extremes(5, 0.1)
    corpus = [dictionary.doc2bow(x) for x in lst]

    lda = LdaMallet(
        mallet_path=MALLET_PATH,
        corpus=corpus,
        id2word=dictionary,
        num_topics=30,
        optimize_interval=10,
        iterations=4000,
        workers=cpu_count(),
    )