def large_scale_visual_sentiment(vg_en_tn_prdct):
    lexicon = Empath()
    vg_en_tn_prdct_sentiments = defaultdict(int)
    for row in vg_en_tn_prdct:
        for tensorproduct in row:
            tpedges = tensorproduct.edges()
            tpnodes = tensorproduct.nodes()
            print("Edges:", tpedges)
            print("Nodes:", tpnodes)
            for tpedge in tpedges:
                # each edge endpoint is itself a pair of labels; analyze all four
                # (labels are assumed to already be str, so no .decode is needed)
                for label in (tpedge[0][0], tpedge[0][1], tpedge[1][0], tpedge[1][1]):
                    sentiment = lexicon.analyze(label)
                    for k, v in sentiment.items():
                        vg_en_tn_prdct_sentiments[k] += v
    print("Sentiment Analysis of the Video:", sorted(
        vg_en_tn_prdct_sentiments.items(),
        key=operator.itemgetter(0),
        reverse=True))
    return vg_en_tn_prdct_sentiments
Example No. 2
def n_analyze_emotion():
    print("Analyzing emotions from News API")
    global CATS, CATS_DICT
    lexicon = Empath()
    descrip_list = []
    DBNAME = 'final.db'
    conn = sqlite3.connect(DBNAME)
    cur = conn.cursor()
    statement = 'SELECT text from news_api'
    cur.execute(statement)
    for row in cur: 
        descrip_list.append(row[0])
    
     ### entire corpus
    str1 = ''.join(descrip_list)
    print(str1)
    n_empath_dict = lexicon.analyze(str1, categories=CATS, normalize=True)
    n_empath_dict_new = {}
    for key in n_empath_dict:
        n_empath_dict_new[key] = n_empath_dict[key]*1000

    ## Row by Row: 
    counter = 0
    for row in descrip_list: 
        n_row_empath_dict = lexicon.analyze(row, categories=CATS, normalize=True)
        #print(len(list(n_row_empath_dict.keys()))) # This is 17 long
        n_row_empath_dict_new = {}
        for key in n_row_empath_dict:
            n_row_empath_dict_new[key] = n_row_empath_dict[key]*1000
        #print(len(list(n_row_empath_dict_new.keys()))) # This is 17 long
        counter +=1
        vals_list = list(n_row_empath_dict_new.values())
        vals_list.insert(0, None)
        vals_list.insert(1, 999)
        vals_list.insert(2, 999)
        vals_list.insert(3, counter)
        vals_list.insert(4, "news_api")
        insertion = tuple(vals_list)
        statement = ''' 
            INSERT INTO "Emotions"
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            '''
        cur.execute(statement, insertion)
        conn.commit()

    return n_empath_dict_new
Example No. 3
def command(self, dataframe, selector, aggregate_scores):
    documents = selector.to_matrix().flatten()
    print(documents)
    import numpy as np
    from empath import Empath
    lexicon = Empath()
    to_df = []
    if aggregate_scores == "aggregate":
        out_dict = lexicon.analyze(documents.tolist(), normalize=True)
        for k, v in sorted(out_dict.items(), key=lambda x: x[1], reverse=True):
            to_df.append([k, v])
        return iris_objects.IrisDataframe(column_names=["category", "normalized_count"], column_types=["String", "Number"], data=to_df)
    else:
        out_scores = [order_keys(lexicon.analyze(d, normalize=True)) for d in documents.tolist()]
        return iris_objects.IrisDataframe(column_names=order_keys.s_keys, data=out_scores)
Example No. 4
def get_post_metrics(df, post_file):

    #define empath
    lexicon = Empath()
    # the eleven categories we will use
    eleven_categories = [
        'family', 'friends', 'home', 'sexual', 'swears', 'work', 'leisure',
        'money', 'body', 'religion', 'health'
    ]

    # clean the post data
    post_df = cleaning(post_file)  # need the cleaning function from above!

    # find post frequency for each friend
    df['post frequency'] = 0
    df['empath'] = 0
    for friend in df.name:
        ind = df.loc[df.name == friend].index[0]
        title = post_df[post_df['title'].str.contains(friend, na=False)]
        tags = post_df[post_df['tags'].str.contains(friend, na=False)]
        friend_post = pd.concat([title, tags])
        df.at[ind, 'post frequency'] = len(friend_post.index)

        # if there is a post, find empath analysis
        if df.loc[ind, 'post frequency'] != 0:
            # sum the empath analysis for each post on eleven categories
            friend_post['empath'] = friend_post['post'].apply(lambda x: sum(
                lexicon.analyze(x, categories=eleven_categories).values()))
            # find the average of the empath score for each post
            df.at[ind, 'empath'] = np.mean(friend_post['empath'])
    return df
def hateLikeMaker(tweets):
    lexicon = Empath()
    likeness = defaultdict(int)

    for i in tweets:
        sents = sent_tokenize(i)
        for j in sents:
            j = re.sub(r'[^\w\s]', '', j)
            a = lexicon.analyze(j)
            if a['negative_emotion'] == 1:
                print("TRUE")
                for k, l in a.items():
                    # skip the negative_emotion category itself
                    if l == 1 and k != 'negative_emotion':
                        likeness[k] -= 1
            else:
                print("FALSE")
                for k, l in a.items():
                    if l == 1:
                        likeness[k] += 1

    # drop categories we never want to surface, then rank the rest
    likeness.pop('hate', None)
    likeness.pop('envy', None)
    ranked = sorted(likeness.items(), key=operator.itemgetter(1))
    dislikes = dict(ranked[:3])
    likes = dict(ranked[-3:])
    return likes, dislikes
Example No. 6
def train_text_classification(file):
    import pandas as pd
    train = pd.read_csv(file)
    train = train.dropna()
    train = pd.DataFrame(train.labeldata.str.split('\r\r\n').tolist(), index=train.labelname).stack()
    train = train.reset_index()[[0, 'labelname']] # var1 variable is currently labeled 0
    train.columns = ['labeldata', 'labelname'] # renaming var1
    import html
    # html.unescape works on strings, so apply it to the text column
    train['labeldata'] = train['labeldata'].apply(html.unescape)
    # Shuffle data
    train = train.sample(frac=1, random_state=1).reset_index(drop=True)
    train = train.dropna()
    train = train[train['labeldata'] != '']

    from empath import Empath
    lexicon = Empath()
    
    train_features = []
    for data in train['labeldata']:
        feature = lexicon.analyze(data, normalize=True)
        train_features.append(feature)
        
    train_features = pd.DataFrame(train_features)  
    from sklearn.linear_model import LogisticRegression
    model = LogisticRegression(solver='sag')
    
    model.fit(train_features,train['labelname'])

    return model
Example No. 7
def empath_extraction(text):
    # You need to 'pip install empath' first
    from empath import Empath
    lexicon = Empath()
    # Get the Empath result
    '''Note: the order of categories in the Empath output is not fixed.'''
    result = lexicon.analyze(text, normalize=True)
    # Filter out those equal to zero
    output = dict()
    for (k, v) in result.items():
        if (v > 0):
            output[k] = v
    '''You can either print the result directly,
        in the format '#the class#': '#the score#'.'''
    # Change the form of output as you need
    # for (k,v) in output.items():
    #     print(k + "-" + str(v))
    '''Or you can read the predefined classes from ./empathClass.txt
        and convert each class name into a numeric id, which may be more
        convenient and efficient for other programs to process,
        at the cost of slowing this script down.'''
    class_empath = dict()
    with open(r"./empathClass.txt") as file:
        for line in file:
            tmp = line.split(",")
        for index, item in enumerate(tmp):
            class_empath[item] = index
    output_convert = []
    for (k, v) in output.items():
        output_convert.append([str(class_empath[k]), str(v)])
    # Change the form of output as you need
    for item in output_convert:
        print("-".join(item))
Example No. 8
def subcommand_sentiment(texts, docnames, args):
    nlp = spacy.load('en')
    lexicon = Empath()
    if args.posneg_only:
        cats = ['positive_emotion','negative_emotion']
    else:
        cats = None # all the categories

    analyze = lambda t: lexicon.analyze(t, categories=cats, normalize= not args.no_normalize)
    sentiments = [analyze(t) for t in texts]


    df = pd.DataFrame(sentiments,index=docnames)
    summarydf = make_summary(df)

    sheets = list()
    if args.human_readable:
        hdf = make_human_report(df)
        sheets.append( ('report',hdf) )
    else:
        sheets.append( ('report',df))
    sheets.append(('summary',summarydf))

    final_fname = write_report(
        args.outfile, 
        sheets, 
        hdf_if_fail=not args.nohdfonfail and not args.human_readable, 
        verbose=True,
    )

    return final_fname
Example No. 9
def use_text_classification(file, model):    
    import pandas as pd
    test = pd.read_csv(file)
    test = test.dropna()
    test = pd.DataFrame(test.labeldata.str.split('\r\r\n').tolist(), index=test.labelname).stack()
    test = test.reset_index()[[0, 'labelname']] # var1 variable is currently labeled 0
    test.columns = ['labeldata', 'labelname'] # renaming var1
    
    import html
    # html.unescape works on strings, so apply it to the text column
    test['labeldata'] = test['labeldata'].apply(html.unescape)
    test = test.dropna()
    test = test[test['labeldata'] != '']
    
    from empath import Empath
    lexicon = Empath()
    
    test_features = []
    for sentence in test['labeldata']:
        feature = lexicon.analyze(sentence, normalize=True)
        test_features.append(feature)
        
    test_features = pd.DataFrame(test_features)
    prediction = model.predict(test_features)   
    test['prediction'] = prediction
	
    return test
Example No. 10
def count_unconnect(u):

    # hopefully this is a fairly diverse group
    lexicon = Empath()
    # print(len(u))

    lexicon.create_category("support", support, model="nytimes")
    lexicon.create_category("conflict", conflict, model="nytimes")
    lexicon.create_category("conclusion", conclusion, model="nytimes")
    lexicon.create_category("complementary", complementary, model="nytimes")
    lexicon.create_category("causal_argument",
                            causal_argument,
                            model="nytimes")
    lexicon.create_category("verbs_hedging", verbs_hedging, model="nytimes")

    #["because", "only", "before", "so", "if", "though", "then", "until", "once", "even", "since", "although", "so", "while", "having", "because", "already", "thus", "time", "unless", "now", "actually", "eventually"]
    #["though", "although", "except", "yet", "but", "even", "because", "only", "Though", "Although", "Yet", "either", "nevertheless", "whereas", "though", "fact", "however", "unlike", "Furthermore", "because", "nonetheless", "And", "However", "none", "either", "still", "Even", "despite", "if", "so", "Yet", "meaning", "indeed", "consequently"]
    #[]
    #["while", "whereas", "though", "only", "yet", "While", "thus", "even", "Thus", "Instead", "although", "instead", "Though", "Moreover", "actually", "nevertheless", "sometimes", "still", "rather"]
    #["means", "therefore", "means", "merely", "mechanism", "democratic_process", "Therefore", "simply", "free_market", "consequence", "because"]
    # cat_all = lexicon.analyze(u, categories = ["support", "conflict", "conclusion", "complementary", "causal_argument"], normalize=True)
    cat_all = lexicon.analyze(u, categories=['verbs_hedging'], normalize=True)
    #cat_all = {}
    #for arg in u:
    #   cat = lexicon.analyze(arg)
    #   if cat["children"] != 0:
    #       print(arg, cat["children"])
    return cat_all
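The seed lists (support, conflict, conclusion, complementary, causal_argument, verbs_hedging) are defined elsewhere in the original module. A rough sketch of how they might look, loosely based on the commented-out word lists above; the exact seeds are an assumption, and create_category contacts the Empath service to expand them:

# Assumed seed lists for the custom Empath categories used above; the real
# module defines its own (likely longer) lists.
support = ["because", "since", "so", "thus", "therefore", "once"]
conflict = ["though", "although", "yet", "but", "however", "nevertheless"]
conclusion = ["therefore", "consequently", "thus", "means", "merely"]
complementary = ["while", "whereas", "moreover", "instead", "rather"]
causal_argument = ["because", "consequence", "mechanism", "therefore"]
verbs_hedging = ["suggest", "indicate", "appear", "seem", "may", "might"]

# Example call: analyze a small set of argument sentences for hedging verbs.
print(count_unconnect([
    "The results might suggest a weak effect.",
    "This appears to indicate a broader trend.",
]))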
Example No. 11
def analyze_tokens(word_list, topk=10):
    lexicon = Empath()

    word_list_analyzed = lexicon.analyze(word_list, normalize=True)

    return sorted(word_list_analyzed.items(),
                  key=lambda kv: kv[1],
                  reverse=True)[:topk]
def create_empath_cats(text):
    lexicon = Empath()
    try:
        cat_scores = lexicon.analyze(text, normalize=True)
    except Exception as e:
        print(e)
        return 0
    return pd.Series(cat_scores)
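Because create_empath_cats returns a pandas Series, it can be applied row-wise to a text column to build a wide feature frame; a small usage sketch:

import pandas as pd

df = pd.DataFrame({"text": ["The soldiers fought bravely.", "We danced all night."]})
# Each row expands into one column per Empath category.
empath_features = df["text"].apply(create_empath_cats)
print(empath_features[["war", "dance"]])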
Example No. 13
def executeEmpathOnISEAR(ISEAR, DATADIR):

	try:
		corpus = pd.read_csv(ISEAR, sep=',',header=None)

		if not os.path.isfile(DATADIR + "/labels_empath_on_ISEAR.txt"):

			lexicon = Empath()		#instance of empath analyser
			emotions_list = ['fear', 'joy', 'anger', 'sadness', 'disgust']
			model = "reddit"

			res = {}

			best_em = []		# will contain empath analysis results
			emotions_results = []

			for i in range(len(emotions_list)):			# creates a category for each emotion
				lexicon.create_category(emotions_list[i],[emotions_list[i]], model=model)

			for sentence in corpus[1]:
				for k in range(len(emotions_list)):			# tokenizes and analyzes the sentences
					tokens = nltk.word_tokenize(sentence)
					emotions_results = lexicon.analyze(tokens, normalize=True, categories=[emotions_list[k]])
					res = {**res, **emotions_results}		# merge all results in one dictionary


				max_likely_emotions_empath = max(res.items(), key=operator.itemgetter(1))[0]

				if res[max_likely_emotions_empath] != 0.0:
					best_em.append(max_likely_emotions_empath)
				else:
					best_em.append('no_idea')

			best_em = np.asarray(best_em)
			np.savetxt(DATADIR + "/labels_empath_on_ISEAR.txt", best_em, fmt="%s")      #saves empath detection

		# ---------------------------------- if labels already exist: --------------------------------

		ISEAR_labels = corpus[0]

		empath_labels = pd.read_csv(DATADIR + '/labels_empath_on_ISEAR.txt', sep=',',header=None)

		detected_labels = [ISEAR_labels[i] for i in range(len(ISEAR_labels)) if empath_labels[0][i] != 'no_idea']
		matches = [ISEAR_labels[i] for i in range(len(ISEAR_labels)) if empath_labels[0][i] == ISEAR_labels[i]]

		detected_percentage = len(detected_labels)/len(ISEAR_labels)
		overall_accuracy = len(matches)/len(ISEAR_labels)
		detected_accuracy = len(matches)/len(detected_labels)

		print('detected_percentage:', detected_percentage)
		print('detected_accuracy:', detected_accuracy)
		print('overall_accuracy:', overall_accuracy)
		return 0
		
	except Exception as e:
		print(str(e))
		return 51
Example No. 14
def Empath_List(List):
    lexicon = Empath()
    EMs = {}
    for text in List:
        EM = lexicon.analyze(text, normalize=True)
        #v = max(EM, key=EM.get)
        EMs[text] = [(k, v) for k, v in EM.items() if v != 0]

    return EMs
Example No. 15
def count_connect(u):

    cat_all = {}
    lexicon = Empath()
    lexicon.create_category("support", support, model="nytimes")
    lexicon.create_category("conflict", conflict, model="nytimes")
    lexicon.create_category("conclusion", conclusion, model="nytimes")
    lexicon.create_category("complementary", complementary, model="nytimes")
    lexicon.create_category("causal_argument",
                            causal_argument,
                            model="nytimes")
    lexicon.create_category("verbs_hedging", verbs_hedging, model="nytimes")

    heads = []
    not_heads = []

    for (arg1, arg2) in u:
        heads.append(arg1)
        not_heads.append(arg2)

    norep_heads = list(set(heads))
    norep_not_heads = list(set(not_heads))
    args_conn = list(set(heads) | set(not_heads))

    #cat_heads = lexicon.analyze(norep_heads, categories = ["support", "conflict", "conclusion", "complementary", "causal_argument"], normalize=True)
    cat_heads = lexicon.analyze(norep_heads,
                                categories=['verbs_hedging'],
                                normalize=True)
    # cat_heads = {}
    # for h in norep_heads:
    #    cat_heads = lexicon.analyze(h, normalize=True)
    #    if cat_heads["fun"] != 0:
    #        print(h, cat_heads["fun"])
    # cat_not_heads = lexicon.analyze(norep_not_heads,categories = ["support", "conflict", "conclusion", "complementary", "causal_argument"], normalize=True)
    cat_not_heads = lexicon.analyze(norep_not_heads,
                                    categories=['verbs_hedging'],
                                    normalize=True)
    # cat_all = lexicon.analyze(args_conn,categories = ["support", "conflict", "conclusion", "complementary", "causal_argument"], normalize=True)
    cat_all = lexicon.analyze(args_conn,
                              categories=['verbs_hedging'],
                              normalize=True)

    return cat_heads, cat_not_heads, cat_all
Example No. 16
def u_analyze_emotion():
    print("Analyzing emotions from SOTU")
    global CATS, CATS_DICT
    lexicon = Empath()
    descrip_list = []
    DBNAME = 'final.db'
    conn = sqlite3.connect(DBNAME)
    cur = conn.cursor()
    statement = 'SELECT text from sotu'
    cur.execute(statement)
    for row in cur: 
        descrip_list.append(row[0])
    
    ### entire corpus
    str1 = ''.join(descrip_list)
    u_empath_dict = lexicon.analyze(str1, categories=CATS, normalize=True)
    u_empath_dict_new = {}
    for key in u_empath_dict:
        #print (key, 'corresponds to', u_empath_dict[key]*1000)
        u_empath_dict_new[key] = u_empath_dict[key]*1000

    ## Row by Row: 
    counter = 0
    for row in descrip_list: 
        u_row_empath_dict = lexicon.analyze(row, categories=CATS, normalize=True)
        u_row_empath_dict_new = {}
        for key in u_row_empath_dict:
            u_row_empath_dict_new[key] = u_row_empath_dict[key]*1000
        counter +=1
        vals_list = list(u_row_empath_dict_new.values())
        vals_list.insert(0, None)
        vals_list.insert(1, 999)
        vals_list.insert(2, counter)
        vals_list.insert(3, 999)
        vals_list.insert(4, "sotu")
        insertion = tuple(vals_list)
        statement = ''' 
            INSERT INTO "Emotions"
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            '''
        cur.execute(statement, insertion)
        conn.commit()

    return u_empath_dict_new
Example No. 17
def command(self, documents):
    documents = documents.to_matrix().flatten()
    import numpy as np
    from empath import Empath
    lexicon = Empath()
    to_df = []
    out_dict = lexicon.analyze(documents.tolist(), normalize=True)
    for k, v in sorted(out_dict.items(), key=lambda x: x[1], reverse=True):
        to_df.append([k, v])
    return iris_objects.IrisDataframe(column_names=["category", "normalized_count"], column_types=["String", "Number"], data=to_df)
def get_raw_empath_categories_for_topics(model):
    lex = Empath()
    categories = []
    for topic in model["topics"]:
        word_categories = {}
        for word in topic:
            result = lex.analyze(word)
            word_categories[word] = [
                key for key in result.keys() if result[key] > 0
            ]
        categories.append(word_categories)
    return categories
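A brief usage sketch for get_raw_empath_categories_for_topics; the input is assumed to be a dict with a "topics" key holding lists of topic words (e.g. the top words of an LDA topic model):

# Toy input in the assumed shape; each inner list is one topic's top words.
toy_model = {
    "topics": [
        ["war", "soldier", "battle"],
        ["family", "home", "love"],
    ]
}

# For every topic word, list the Empath categories it activates.
for word_categories in get_raw_empath_categories_for_topics(toy_model):
    print(word_categories)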
def process_lexicon(texts):
    lexicon = Empath()
    data = {}

    for i, text in texts.items():
        data[i] = [
            k for k, v in lexicon.analyze(text, normalize=False).items()
            if v > 0
        ]

        print("{:<5}%".format(round(i * 100 / len(texts), 2)), end='\r')
    return data
    def parseInput(self, input):
        self.history.append(input)

        # topic modeling and additional topic generation
        lexicon = Empath()
        topicVector = lexicon.analyze(input, normalize=False)

        topics = []
        for key in topicVector.keys():
            if topicVector[key] > 0:
                topics.append(key)
        self.topics = topics
        return topics
def semantics(df):

    df['Clean_Text'] = df['Text'].apply(lambda x: clean_text(x))
    df.dropna(inplace=True)

    lexicon = Empath()
    semantic = []

    # adding it to respective categories

    for article in df['Clean_Text']:
        d = lexicon.analyze(article, normalize=False)
        x = []
        for key, value in d.items():
            x.append(value)
        x = np.asarray(x)
        semantic.append(x)
    df['Semantic'] = semantic

    categories = []
    a = lexicon.analyze("")
    for key, value in a.items():
        categories.append(key)

    # replacing test with categories
    sem = []
    for i in range(df.shape[0]):
        a = []
        for j in range(len(semantic[0])):
            for k in range(int(semantic[i][j])):
                a.append(categories[j])
        b = " ".join(a)
        sem.append(b)
    df['Semantics'] = sem
    data = df['Semantics']

    return data
Example No. 22
def command(self, documents, top_n):
    import numpy as np
    from empath import Empath
    lexicon = Empath()
    data = np.array([
        order_keys(lexicon.analyze(doc, normalize=True))
        for doc in documents
    ])
    types_ = ["Number" for _ in order_keys.s_keys]
    return top_n, iris_objects.IrisDataframe(
        column_names=order_keys.s_keys,
        column_types=types_,
        data=data,
        do_conversion=False)
Example No. 23
def empath_vector(text):
    """
    Returns a normalised vector (list) of 15 hand-picked categories from Empath: http://empath.stanford.edu/
    """
    categories = [
        'hate', 'aggression', 'dispute', 'swearing_terms', 'ridicule',
        'exasperation', 'fight', 'politeness', 'disgust', 'rage', 'warmth',
        'sadness', 'shame', 'negative_emotion', 'positive_emotion'
    ]
    lex = Empath()
    d = lex.analyze(text, categories=categories, normalize=True)
    if d is None:
        return 15 * [0.0]
    return list(d.values())
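A quick usage sketch for empath_vector. On Python 3.7+ the dict returned by analyze appears to preserve the order of the requested categories, so each index of the vector consistently maps to the same category in the list above:

vec = empath_vector("I hate how rude and aggressive that reply was.")
print(len(vec))                                     # 15
print(dict(zip(['hate', 'aggression'], vec[:2])))   # scores for the first two categories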
Example No. 24
def empath_analytics(speech: str) -> list:
    categories_to_include = ['hate', 'cheerfulness', 'aggression', 'envy', 'anticipation', 'masculine', 'pride',
                             'dispute', 'nervousness', 'weakness', 'horror', 'swearing_terms', 'suffering', 'art',
                             'ridicule', 'optimism', 'divine', 'fear', 'religion', 'worship', 'confusion', 'death',
                             'violence', 'dominant_heirarchical', 'neglect', 'dominant_personality', 'love', 'order',
                             'sympathy', 'trust', 'deception', 'politeness', 'disgust', 'sadness', 'ugliness', 'lust',
                             'torment', 'politics', 'power', 'disappointment', 'pain', 'negative_emotion', 'competing',
                             'friends', 'achievement', 'feminine', 'positive_emotion']
    lexicon = Empath()
    results = lexicon.analyze(speech, categories=categories_to_include)
    output = {}
    for (key, value) in results.items():
        if value != 0:
            output[key] = value
    return sorted(output, key=output.get, reverse=True)[0:5]
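A short usage sketch for empath_analytics (the example sentence is arbitrary):

# Returns the five categories with the largest counts for the given speech.
top5 = empath_analytics("We will fight with pride, and we will not give in to fear.")
print(top5)  # e.g. ['pride', 'fear', ...] depending on the scores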
def ts_mod(tokens):
    """
    This function implements the topic signal approach of Empath. Empath uses a trained (Neuronal Networks) word category list with the aim to detect topic signals in tokenized text. It then sorts the topics by value and shortlist it to the 10 highest ranked topics.
    :param tokens:  tokenized list of words f.ex.: ["cheese","fighting","dog","cold","man","war"]
    :return:        dictionary (created by empath) shortlist of the 10 highest ranked topics - key: detected topic / value: calculated value of importance
    """
    lexicon = Empath()
    lexicon = lexicon.analyze(tokens, normalize=True)

    if lexicon == None:
        return

    topics = threshold_filter(lexicon)
    topics_sorted = sort_topics_by_value(topics)
    topics_shortlist = shortlist_topics(topics_sorted)
    return topics_shortlist
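The helpers threshold_filter, sort_topics_by_value and shortlist_topics are not shown in this snippet; a minimal sketch of equivalent behaviour under that assumption:

from empath import Empath

def ts_mod_sketch(tokens, top_k=10, threshold=0.0):
    # analyze() returns None when the token list is empty and normalize=True
    scores = Empath().analyze(tokens, normalize=True)
    if scores is None:
        return None
    # keep only topics scoring above the threshold, then the top_k highest-ranked
    kept = {topic: score for topic, score in scores.items() if score > threshold}
    return dict(sorted(kept.items(), key=lambda kv: kv[1], reverse=True)[:top_k])

print(ts_mod_sketch(["cheese", "fighting", "dog", "cold", "man", "war"]))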
Example No. 27
def analyze_tweets_liwc(tweets):
    """ Uses the Empath library to gather topics found in labeled tweet data
        Keyword arguments:
        tweets -- list of labeled tweet objects
    """
    lexicon = Empath()

    results = {CATEGORY_HATE: {}, CATEGORY_NON_HATE: {}}

    num_hate = num_non_hate = 0

    for tweet in tweets:
        category = ""
        text = clean_tweet_text(get_tweet_text(tweet))

        if (tweet["hate_speech"]):
            category = CATEGORY_HATE
            num_hate += 1
        else:
            category = CATEGORY_NON_HATE
            num_non_hate += 1
        topics = lexicon.analyze(text, normalize=False)
        for topic in topics.keys():
            if topics[topic] > 0:
                if topic in results[category]:
                    results[category][topic] += topics[topic]
                else:
                    results[category][topic] = topics[topic]

    # Sort the topics by total raw counts
    results[CATEGORY_HATE] = sorted(results[CATEGORY_HATE].items(),
                                    key=lambda kv: (kv[1], kv[0]),
                                    reverse=True)
    results[CATEGORY_NON_HATE] = sorted(results[CATEGORY_NON_HATE].items(),
                                        key=lambda kv: (kv[1], kv[0]),
                                        reverse=True)

    # Normalize topic counts by dividing by the total number of tweets in each category
    results[CATEGORY_HATE] = [(x, y / num_hate)
                              for x, y in results[CATEGORY_HATE]]
    results[CATEGORY_NON_HATE] = [(x, y / num_non_hate)
                                  for x, y in results[CATEGORY_NON_HATE]]

    return results
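A usage sketch for analyze_tweets_liwc, assuming each tweet is a dict with a boolean "hate_speech" flag and that get_tweet_text / clean_tweet_text yield plain text:

# Hypothetical labeled tweets in the assumed shape.
sample_tweets = [
    {"text": "I hate you and everyone like you", "hate_speech": True},
    {"text": "What a lovely sunny day with friends", "hate_speech": False},
]

results = analyze_tweets_liwc(sample_tweets)
print(results[CATEGORY_HATE][:5])      # top topics in hateful tweets
print(results[CATEGORY_NON_HATE][:5])  # top topics in non-hateful tweets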
Example No. 28
class LexiconFeatures():
    def __init__(self):
      self.lexicon = Empath()

    def tokenize(self, text):
      text = [str(w) for w in tokenizer(text)]
      return text

    def get_features(self, text):
      features = list(self.lexicon.analyze(text, normalize=True).values())
      features = torch.as_tensor([features])
      return(features)

    def parse_sentences(self, sentences) :
      sent_features = []
      for sent in sentences:
        sent_features.append(self.get_features(sent))
      sent_features = torch.cat(sent_features, dim=0)
      print("Empath features: {}".format(sent_features.shape))
      return sent_features
Example No. 29
    def get_empath(self, empathCol):
        """Get empath score """

        tweet_dict = self.convert_dict()
        lexicon = Empath()

        empath_dict = {}
        for tweetid, tweet in tweet_dict.items():

            result = lexicon.analyze(tweet, normalize=True)
            empath_dict[tweet] = result[empathCol]

        with open(self.path + 'empath.json', 'a') as f:
            json.dump(empath_dict, f)

        # empath_df = pd.DataFrame.from_dict(data, orient='index')
        # empath_df['tweet_id'] = empath_df.index
        # empath_df.columns = [empathCol, 'tweet_id']

        return empath_dict
Example No. 30
class LexiconFeatures():
    def __init__(self):
        self.lexicon = Empath()

    def tokenize(self, text):
        text = [str(w) for w in tokenizer(text)]
        return text

    def get_features(self, text):
        features = list(self.lexicon.analyze(text,normalize=True).values())
        features = torch.as_tensor([features])
        return(features)

    def parse_sentences(self, sentences) :
        temp = []
        for i in tqdm(range(len(sentences))):
            sent = sentences[i]
            temp.append(self.get_features(sent))
        temp = torch.cat(temp, dim=0)
        print("liwc features: {}".format(temp.shape))
        return temp
Example No. 31
def process_csv(in_file, out_file, column):
    start_time = time.time()
    
    f = open(in_file, "r", encoding='utf-8')
    csv_reader = csv.reader(f, delimiter=',', quotechar='"')
    
    result = open(out_file, mode='w', encoding='utf-8')
    csv_writer = csv.writer(result)
    
    review_col = int(column)
    empath_column = 'categories'
    lexicon = Empath()
    line_count = 0
    
    for row in csv_reader:
        if line_count == 0:
            row.append(empath_column)
            csv_writer.writerow(row)
            line_count += 1
        else: 
            content = list(row)
            review = row[review_col]
            categories = lexicon.analyze(review, normalize=True)
            trimmed_categories = dict()
            if categories:
                for category, score in categories.items():
                    if score != 0:
                        trimmed_categories[category] = score
            
            row.append(trimmed_categories)
 
            #print(row)
            csv_writer.writerow(row)
            line_count += 1

    print('Processed', line_count, 'rows in', "{:.2f}".format(time.time() - start_time), 'seconds.')
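A brief usage sketch for process_csv; the file names are hypothetical and the column index points at the review-text column of the input CSV:

# Append an Empath 'categories' column to every row of the input file.
process_csv("reviews.csv", "reviews_with_empath.csv", column=2)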
from empath import Empath
import sys
lexicon = Empath()

lyric_filename = sys.argv[1]

with open(lyric_filename) as f:
    content = f.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line
content = [x.strip() for x in content] 

# process the lines in pairs: print one line, then analyze the next
i = 0
while i < len(content):
    print(content[i])
    i += 1
    print(lexicon.analyze(content[i], normalize=True))
    i += 1
topN = []

#For each interest i,
for i,w in interests_and_weights.items():
    iTopN = [None]*count
    max_tweets = 500 # the number of requests consumed will be this number / 100 * the number of interests
    #Search the latest tweets t_i related to i
    searched_tweets = [status for status in tweepy.Cursor(api.search, rpp=100, q=i + "  -filter:retweets", since=since_date,languages=["en"],tweet_mode='extended', count=max_tweets).items(max_tweets)]

    for idx,tweet in enumerate(searched_tweets):
        if len(tweet.full_text) < 100:
            continue
        #print idx,tweet.text
        #For each tweet t_i, compute the empath vector
        empath_vec = lexicon.analyze(tweet.full_text, normalize=True)
        #print empath_vec
        score = emotional_score(empath_vec) 
        #print "Score:",score,"*",w,"=",(score*w)
        score *= w
        for j,topI in enumerate(iTopN):
            if topI is None or score > topI[1]:
                tweet.persona_interest = i
                iTopN.insert(j,[tweet,score,empath_vec])
                iTopN.pop()
                break
    topN += iTopN

shuffle(topN)

topN = topN[0:count]
	# Not sure if these count. If you don't include them we have 16 emotions!
	#answer.append(lexicon.create_category("ambiguous",["no","different","disagree"]))
	#answer.append(lexicon.create_category("neutral",["whatever","alright","anything"]))
	return answer

# Removes the emotional categories with scores of 0.0
def removeZeros(analysisDict):
	ans = {}
	for key,value in analysisDict.items():
		if value > 0.0:
			ans[key] = value
	return ans

# Start of Script
emotionalCats = getEmotionalCategories()
for filename in os.listdir("processedTweets"):
	with open('processedTweets/'+filename) as csv_file:
		csv_reader = csv.reader(csv_file, delimiter=',')
		for row in csv_reader:
			row = " ".join(row) # remove commas, and turn token list into string
			try:
				# Analyze text over all set emotional categories normalized by words in each tweet:
				analysis=removeZeros(lexicon.analyze(row, categories=emotionalCats, normalize=True))
				if len(analysis) !=0:
					print(analysis)
			except:
				pass
	break # remove break to keep iterating through directory 'processedTweets'
			
			
Example No. 35
from empath import Empath
from textblob import TextBlob
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

lexicon = Empath()

pos_txt = open('persuasiveargs.txt', 'r').read()
neg_txt = open('notpersuasiveargs.txt', 'r').read()

cat_pos = lexicon.analyze(pos_txt, normalize=True)
cat_neg = lexicon.analyze(neg_txt, normalize=True)

for k in cat_pos:
    if cat_neg[k] != 0:
        r = cat_pos[k] / cat_neg[k]
        if r > 2:
            print('1 Category: ', k, 'Pos:', cat_pos[k], 'Neg:', cat_neg[k])

    if cat_pos[k] != 0:
        r = cat_neg[k] / cat_pos[k]
        if r > 2:
            print('2 Category: ', k, 'Pos:', cat_pos[k], 'Neg:', cat_neg[k])

blob_pos = TextBlob(pos_txt)
polarity_pos = []
subjectivity_pos = []
for sentence in blob_pos.sentences:
    polarity_pos.append(sentence.sentiment.polarity)
from empath import Empath
import csv
import html
import re
import sys

lexicon = Empath()

def cleanhtml(raw_html):
  cleanr = re.compile('<.*?>')
  cleantext = re.sub(cleanr, '', raw_html)
  return cleantext.replace("&", "and")

lyric_filename = sys.argv[1]

with open(lyric_filename, "r") as f:
    reader = csv.reader(f, delimiter=",")
    for i, line in enumerate(reader):
        if i == 0:
            line.append('empath_vec')
            print(line)
        else:
            line[3] = cleanhtml(html.unescape(line[3]))
            line.append(lexicon.analyze(line[3], normalize=True))
            print(line)