Example #1
import json

from gensim.corpora import Dictionary
from gensim.models.wrappers.ldamallet import LdaMallet

MALLET_PATH = '/usr/local/Cellar/mallet/2.0.7/bin/mallet'
DATA_PATH = '../data/arXiv'

if __name__ == '__main__':
    topic_frequency = []
    dictionary = Dictionary.load(DATA_PATH + '/arxiv_dict.dict')
    for y in range(2000, 2017):
        with open(DATA_PATH + '/processed/processed_{}.json'.format(y)) as f:
            lst = json.load(f)
        # construct the bag-of-words corpus (document-term matrix)
        corpus = [dictionary.doc2bow(x) for x in lst]
        mallet = LdaMallet.load(
            DATA_PATH + '/mallet_files/arxiv_{}_mallet_model'.format(y))
        # infer per-document topic distributions and keep the topic id of
        # the first (topic id, weight) pair for each document
        result = mallet[corpus]
        topics = [x[0][0] for x in result]

        topic_frequency.append(topics)
    print(topic_frequency)
    with open('topic_inference.json', 'w') as f:
        json.dump(topic_frequency, f)
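Note that mallet[corpus] yields a list of (topic id, weight) pairs per document, so x[0][0] keeps whichever pair happens to be listed first. If your gensim version does not sort these pairs by weight, a small variant (a sketch, not part of the original snippet) selects the dominant topic explicitly:

# pick the highest-weight topic per document instead of the first listed pair
topics = [max(doc, key=lambda pair: pair[1])[0] for doc in result]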
Example #2
from gensim.models.wrappers.ldamallet import LdaMallet
import json

DATA_PATH = '../data/enron'

result = []
for y in [2000, 2001]:
    for m in range(1, 13):
        topic_keys = []
        mallet_name = 'mallet_models/{}-{}_mallet'.format(y, m)
        lda = LdaMallet.load(mallet_name)
        for i in range(30):  # number of topics in each model
            # show_topic returns (word, probability) pairs in recent gensim
            # versions; older versions returned (probability, word)
            topic_keys.append(
                {w: str(p)
                 for w, p in lda.show_topic(i, num_words=30)})
        result.append(topic_keys)

with open('json_files/topic_keys.json', 'w') as f:
    json.dump(result, f)
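The dump can be read back for inspection. A minimal sketch (assuming the file layout produced above) that prints the ten most probable words of the first topic in the first month:

import json

with open('json_files/topic_keys.json') as f:
    months = json.load(f)
first_topic = months[0][0]  # word -> probability (stored as strings)
top = sorted(first_topic.items(), key=lambda kv: float(kv[1]), reverse=True)
print([w for w, _ in top[:10]])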
Example #3
import json
from gensim import corpora
from gensim.models.wrappers.ldamallet import LdaMallet
from multiprocessing import cpu_count

MALLET_PATH = '/usr/local/Cellar/mallet/2.0.7/bin/mallet'
DATA_PATH = '../data/arXiv/mallet_files'
# For debugging

if __name__ == '__main__':
    # Original training code, kept commented out for reference:
    # lst = json.load(open('tmp.json'))
    # print(lst)
    # dictionary = corpora.Dictionary(lst)
    # dictionary.filter_extremes(5, 0.1)
    # corpus = [dictionary.doc2bow(x) for x in lst]
    #
    # lda = LdaMallet(
    #     mallet_path=MALLET_PATH,
    #     corpus=corpus,
    #     id2word=dictionary,
    #     num_topics=30,
    #     optimize_interval=10,
    #     iterations=2000,
    #     workers=cpu_count(),
    # )
    lda = LdaMallet.load(DATA_PATH + '/arxiv_2013_mallet_model')
    for i in range(30):
        print([x for x, p in lda.show_topic(i, num_words=15)])
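To score a new document against one of these models, a minimal sketch (assuming the arxiv_dict.dict dictionary from Example #1 matches the model; inference shells out to the mallet binary recorded in the loaded model):

from gensim.corpora import Dictionary
from gensim.models.wrappers.ldamallet import LdaMallet

dictionary = Dictionary.load('../data/arXiv/arxiv_dict.dict')
lda = LdaMallet.load('../data/arXiv/mallet_files/arxiv_2013_mallet_model')
bow = dictionary.doc2bow(['quantum', 'entanglement', 'qubit'])  # hypothetical tokens
print(lda[bow])  # (topic id, weight) pairs for this document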
Example #4
import csv
import re

from gensim.models.wrappers.ldamallet import LdaMallet
from nltk.corpus import stopwords


def compute_sentiment():
    # load the trained LDA model
    lda = LdaMallet.load('../data/lda_8_topic_neighborhood_review_final.lda')
    # (topic id, [(word, weight), ...]) pairs for every topic
    lda_topic_object_array = lda.show_topics(num_topics=-1, num_words=20, log=False, formatted=False)
    lda.print_topics(num_topics=-1, num_words=20)
    
    
    # load the AFINN sentiment dictionary, rescaling each score linearly
    # from [-5, 5] to [1, 5] (e.g. -5 -> 1.0, 0 -> 3.0, +5 -> 5.0)
    sentiment_dict = {}
    with open('../data/AFINN-111.txt') as AFINN_file: 
        for line in AFINN_file:
            splitted = line.split('\t')
            word = splitted[0].strip()
            init_score = float(splitted[1].strip())
            new_score = (((init_score - (-5)) * (5 - 1)) / (5 - (-5))) + 1
            sentiment_dict[word] = new_score
            
      
    default_stopwords = stopwords.words('english')
    # keep place names from being treated as ordinary words
    additional_stopwords = ["manhattan", "new york", "nyc", "brooklyn", "bronx", "queens"]

    # file for storing the word-based sentiment analysis result
    wordbased_output_file = open('../data/word_sentiment.csv', 'w', newline='')
    
    # file for storing the naive sentiment analysis result
    naive_output_file = open('../data/naive_sentiment.csv', 'w', newline='')
    
    fieldnames = ['reviewID', 'neighborhood', 'reviewer', 'rating', 'content',
                  'crimeSafety', 'housingCondition', 'transportationConvenience',
                  'employmentOpportunity', 'lifeConvenience', 'localWeather',
                  'cultureDiversity', 'communityFriendliness']
    # aspect name -> LDA topic id
    topic_mapping_dict = {'crimeSafety': 5, 'housingCondition': 1,
                          'transportationConvenience': 2, 'employmentOpportunity': 7,
                          'lifeConvenience': 3, 'localWeather': 6,
                          'cultureDiversity': 0, 'communityFriendliness': 4}
    
    review_writer = csv.DictWriter(wordbased_output_file, fieldnames=fieldnames)
    review_writer.writeheader()
    
    naive_review_writer = csv.DictWriter(naive_output_file, fieldnames=fieldnames)
    naive_review_writer.writeheader()

    
    with open('../data/all_reviews.csv', 'r', newline='') as csvfile:
        csvreader = csv.DictReader(csvfile)
        for row in csvreader:
            #review_obj = {'reviewID':row['reviewID'],"neighborhood":row["neighborhood"], "authorID":row["authorID"],"overall_rating": float(row["overall_rating"]),"review_content":row["review_content"]}
            #print(review_obj)                
            #reviews.append(review_obj)
            #print("The review is: "+row["review_content"])
            
            neighborhood_name = row["neighborhood"].lower().strip()
            review_content = row["review_content"].lower()
            # remove the neighborhood's own name and other place names so
            # they cannot influence topic matching
            review_content = review_content.replace(neighborhood_name, '')

            for special_word in additional_stopwords:
                review_content = review_content.replace(special_word, '')
            review_content = re.sub(r'\s+', ' ', review_content)
            review_content = review_content.strip()
            
            # sentences are split on several punctuation marks (including
            # commas), so very short fragments are merged back into the
            # preceding sentence below
            review_sentence_array_raw = re.split(r'[;!?.,]', review_content)
            
            review_sentence_array = []
            for raw_sentence in review_sentence_array_raw:
                if len(raw_sentence.split()) <= 3:
                    if len(review_sentence_array) > 0:
                        review_sentence_array[-1] += " " + raw_sentence.strip()
                    else:
                        review_sentence_array.append(raw_sentence.strip())
                else:
                    review_sentence_array.append(raw_sentence)
                    
                    
            # dict storing per-aspect sentiment scores; key -1 is the overall score
            review_senti_score_dict = {-1: {'count': 0, 'score': 0}}
            for this_topic_object in lda_topic_object_array:
                review_senti_score_dict[this_topic_object[0]] = {'count': 0, 'score': 0}
            
            
            for sentence in review_sentence_array:
                #print('The sentence is: ' + sentence)
                sentence = re.sub(r'[^a-zA-Z]', ' ', sentence)
                sentence = re.sub(r'\s+', ' ', sentence)
                sentence = sentence.strip()
                if len(sentence) == 0:
                    continue
                sentence_words = [word for word in sentence.split() if word not in default_stopwords and len(word) > 1]
                
                # score each topic by summing the weights of its keywords
                # that appear in this sentence
                sentence_topic_dict = {}
                for this_topic_object in lda_topic_object_array:
                    sentence_topic_dict[this_topic_object[0]] = 0
                    for this_keyword_object in this_topic_object[1]:
                        for this_sentence_word in sentence_words:
                            if this_keyword_object[0] == this_sentence_word:
                                sentence_topic_dict[this_topic_object[0]] += this_keyword_object[1]
      
                # pick the highest-scoring topic; -1 means no topic matched
                sentence_final_topic = -1
                maxi_topic_value = 0
                for topic in sentence_topic_dict:
                    if sentence_topic_dict[topic] >= maxi_topic_value and sentence_topic_dict[topic] != 0:
                        if sentence_topic_dict[topic] == maxi_topic_value:
                            print("found a tie: " + str(sentence_topic_dict[topic]))
                        sentence_final_topic = topic
                        maxi_topic_value = sentence_topic_dict[topic]
                
                
                
                # accumulate sentiment scores for the matched aspect and overall
                for this_sentence_word in sentence_words:
                    if this_sentence_word in sentiment_dict:
                        if sentence_final_topic != -1:
                            review_senti_score_dict[sentence_final_topic]['score'] += sentiment_dict[this_sentence_word]
                            review_senti_score_dict[sentence_final_topic]['count'] += 1

                        review_senti_score_dict[-1]['score'] += sentiment_dict[this_sentence_word]
                        review_senti_score_dict[-1]['count'] += 1
                
            
            
            review_senti_result = ''
            naive_review_senti_result = ''
            overallRating = float(row["overall_rating"])

            # naive baseline: every matched aspect just gets the overall rating
            naive_review_senti_score_dict = {}
            
            for topic in review_senti_score_dict:
                count = review_senti_score_dict[topic]['count']
                score = review_senti_score_dict[topic]['score']
                
                if count > 0:
                    avg_score = float(score)/float(count)
                    review_senti_score_dict[topic]['score'] = avg_score
                    review_senti_result += " topic:"+str(topic)+", score:"+str(avg_score)+"; "
                    
                    naive_review_senti_score_dict[topic] = overallRating
                    naive_review_senti_result += " topic:"+str(topic)+", score:"+str(overallRating)+"; "
                    
                else:
                    review_senti_score_dict[topic]['score'] = -1
                    naive_review_senti_score_dict[topic] = -1
            
            print(review_senti_result.strip())
            print(str(review_senti_score_dict))
            
            base_row = {'reviewID': row['reviewID'],
                        'neighborhood': row["neighborhood"],
                        'reviewer': row["authorID"],
                        'rating': float(row["overall_rating"]),
                        'content': row["review_content"]}
            word_row = dict(base_row)
            naive_row = dict(base_row)
            for aspect, topic_id in topic_mapping_dict.items():
                word_row[aspect] = review_senti_score_dict[topic_id]['score']
                naive_row[aspect] = naive_review_senti_score_dict[topic_id]
            review_writer.writerow(word_row)
            naive_review_writer.writerow(naive_row)
              
                
    wordbased_output_file.close()
    naive_output_file.close()
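A minimal entry point for the function above (hypothetical; the original module may invoke it differently):

if __name__ == '__main__':
    compute_sentiment()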
Example #5
from gensim.models.wrappers.ldamallet import LdaMallet
import json

DATA_PATH = '/Users/ranxiao/Desktop/data/arXiv'
result = []
for y in range(2000, 2017):
    topic_keys = []
    model_path = DATA_PATH + '/mallet_files/arxiv_{}_mallet_model'.format(y)
    lda = LdaMallet.load(model_path)
    for i in range(30):  # num of topics
        topic_keys.append(
            {w: str(p)
             for w, p in lda.show_topic(i, num_words=100)})
    result.append(topic_keys)

with open('topic_keys.json', 'w') as f:
    json.dump(result, f)
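One caveat when comparing the dumped keys across years: each model was trained independently, so topic ids are not aligned from one year to the next. A small sketch (the word choice is hypothetical) that traces one word's weight in topic 0 over time:

for year_offset, topic_keys in enumerate(result):
    print(2000 + year_offset, topic_keys[0].get('quantum'))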
Example #6
import os

import numpy as np
from gensim.models.wrappers.ldamallet import LdaMallet, malletmodel2ldamodel
from gensim.utils import unpickle  # or a local helper of the same name


class Topics(object):
    __dict_path = os.path.join(os.path.dirname(__file__),
                               'models/mallet-dict.pkl')
    __model_path = os.path.join(os.path.dirname(__file__),
                                'models/mallet-model.model')
    __mallet_path = os.path.join(os.path.dirname(__file__),
                                 'models/mallet/bin/mallet')
    __topic_file_path = os.path.join(os.path.dirname(__file__),
                                     'models/topic-files/')

    dictionary = unpickle(__dict_path)
    model = LdaMallet.load(__model_path)
    model.mallet_path = __mallet_path
    model.prefix = __topic_file_path

    # convert to a native LdaModel for faster inference (no mallet subprocess)
    model_fast = malletmodel2ldamodel(model, gamma_threshold=0.1, iterations=1000)

    topic_map = {
        0: 'education',
        1: 'dating',
        2: 'change',
        3: 'communication',
        4: 'broken relationship',  # relationship status
        5: 'finances and accounting',
        6: 'excessive thoughts',
        7: 'politics',
        8: 'financial investments',
        9: 'physical health',
        10: 'work',
        11: 'sleep',
        12: 'emotions',
        13: 'medication regimen',
        14: 'past experiences / decisions',  # or decisions
        15: 'general apathy',
        16: 'NaN',  # ignore
        17: 'relocation',
        18: 'social stressors',
        19: 'memories',
        20: 'financial decisions',
        21: 'family',
        22: 'nutrition and weight',
        23: 'relationships',
        24: 'marital issues',
        25: 'religion and belief systems',
        26: 'experiences',
        27: 'financial pressure',
        28: 'romantic relationship',
        29: 'relationship issues',
        30: 'routines',
        31: 'taxes and claims',  # income and benefits
        32: 'symptoms of mental illness',
        33: 'dispute and argument',
        34: 'lack of motivation',
        35: 'reflection and mindfulness',
        36: 'event or festivity',
        37: 'self-harm',  # suicide
        38: 'resources and information',
        39: 'addiction',
        40: 'addiction recovery',
        41: 'leisure'
    }

    def get_topics(self, topics):
        # indices of the five highest-scoring topics, best first
        top_topics = topics[:, 1].argsort()[-5:][::-1]

        # TODO: weight down scoring
        scores = 0.
        results = {}
        for idx, entry in enumerate(top_topics):
            topic = int(topics[entry][0])
            score = topics[entry][1]

            # give up if even the best topic is weak
            if idx == 0 and score <= .1:
                return None

            # keep adding topics until about 55% of the mass is covered
            if scores < .55:
                if self.topic_map[topic] != 'NaN':
                    results[self.topic_map[topic]] = score
                scores += score
            else:
                break

        return results

    # def retrieve(self, doc):
    #    tokens = self.get_tokens(doc)
    #    bow = self.dictionary.doc2bow(tokens)
    #    topics = np.array(self.model[bow])
    #    return self.get_topics(topics)

    def retrieve(self, doc):
        tokens = self.get_tokens(doc)
        bow = self.dictionary.doc2bow(tokens)
        topics = np.array(self.model_fast[bow])
        return self.get_topics(topics)

    @staticmethod
    def get_tokens(doc):
        result = []
        for tok in doc:
            # 'IN', 'MD' and 'CD' are Penn Treebank tags, which spaCy exposes
            # as tok.tag_ (tok.pos_ holds coarse tags such as 'ADP' or 'VERB')
            if tok.tag_ in ['IN', 'MD', 'CD']:
                continue
            if tok.is_digit or tok.like_num:
                continue
            if tok.is_punct or tok.is_stop:
                continue
            result.append(tok.text.lower())

        return result
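A minimal usage sketch, assuming the pickled dictionary and model files exist and that documents come from a spaCy pipeline (the pipeline name is a placeholder):

import spacy

nlp = spacy.load('en_core_web_sm')  # hypothetical model choice
doc = nlp("I can't sleep and I'm constantly worried about money.")
print(Topics().retrieve(doc))  # e.g. {'sleep': ..., 'financial pressure': ...}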