import operator
import re
from collections import defaultdict

from empath import Empath
from nltk.tokenize import sent_tokenize


def hateLikeMaker(tweets):
    lexicon = Empath()
    likeness = defaultdict(int)

    for tweet in tweets:
        for sent in sent_tokenize(tweet):
            # Strip punctuation before running Empath
            sent = re.sub(r'[^\w\s]', '', sent)
            scores = lexicon.analyze(sent)
            if scores['negative_emotion'] == 1:
                # Negative sentence: every category it hits counts against likeness
                for category, hit in scores.items():
                    if hit == 1 and category != 'negative_emotion':
                        likeness[category] -= 1
            else:
                # Non-negative sentence: every category it hits counts towards likeness
                for category, hit in scores.items():
                    if hit == 1:
                        likeness[category] += 1

    likeness.pop('hate', None)
    likeness.pop('envy', None)

    ranked = sorted(likeness.items(), key=operator.itemgetter(1))
    dislikes = dict(ranked[:3])
    likes = dict(ranked[-3:])
    return likes, dislikes
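A minimal usage sketch for hateLikeMaker, assuming the imports above plus NLTK's punkt tokenizer data; the tweets are made up for illustration:

if __name__ == "__main__":
    # requires: pip install empath nltk, and nltk.download('punkt') for sent_tokenize
    sample_tweets = [
        "I love hiking with friends. The beach was beautiful today!",
        "Traffic was awful and the meeting ran late. What a terrible day.",
    ]
    likes, dislikes = hateLikeMaker(sample_tweets)
    print("likes:", likes)
    print("dislikes:", dislikes)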
Example 2
def use_text_classification(file, model):
    import pandas as pd
    test = pd.read_csv(file)
    test = test.dropna()
    # Split multi-line label data into one row per line, keyed by label name
    test = pd.DataFrame(test.labeldata.str.split('\r\r\n').tolist(), index=test.labelname).stack()
    test = test.reset_index()[[0, 'labelname']]  # the stacked values column is currently named 0
    test.columns = ['labeldata', 'labelname']    # rename it to labeldata

    import html
    test['labeldata'] = test['labeldata'].apply(html.unescape)
    # Drop empty rows
    test = test.dropna()
    test = test[test['labeldata'] != '']

    from empath import Empath
    lexicon = Empath()

    # Build an Empath feature vector for each row of label data
    test_features = []
    for sentence in test['labeldata']:
        feature = lexicon.analyze(sentence, normalize=True)
        test_features.append(feature)

    test_features = pd.DataFrame(test_features)
    prediction = model.predict(test_features)
    test['prediction'] = prediction

    return test
Example 3
class CustomEmpUser(User):
    emp = Empath()

    def __init__(self, username):
        super().__init__(username)

    def process_data(self, clean):
        # Try to get topic list if it exists
        try:
            with open("data/topics.json", "r") as file:
                topics = json.load(file)
            raw_data = self.emp.analyze(clean, categories=topics["topics"])

        # Otherwise, run empath without topics filter
        except IOError as err:
            print(err, "\r\nUsing all categories")
            raw_data = self.emp.analyze(clean)

        # Delete interests generated if they have a hit score of 0
        keys_to_delete = []
        for key in raw_data:
            if raw_data[key] <= 0:
                keys_to_delete.append(key)
        for key in keys_to_delete:
            del raw_data[key]

        return raw_data
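process_data above restricts the Empath analysis to the categories listed under the "topics" key of data/topics.json when that file exists. A minimal sketch of creating such a file; the three category names are just illustrative (they are valid Empath categories used elsewhere in these examples):

import json
import os

# Hypothetical topics.json in the layout process_data expects
os.makedirs("data", exist_ok=True)
with open("data/topics.json", "w") as f:
    json.dump({"topics": ["health", "money", "leisure"]}, f)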
def large_scale_visual_sentiment(vg_en_tn_prdct):
    lexicon = Empath()
    vg_en_tn_prdct_sentiments = defaultdict(int)
    for row in vg_en_tn_prdct:
        for tensorproduct in row:
            tpedges = tensorproduct.edges()
            tpnodes = tensorproduct.nodes()
            print("Edges:", tpedges)
            print("Nodes:", tpnodes)
            for tpedge in tpedges:
                # Each edge connects two nodes, and each node is itself a pair of
                # labels; run Empath on every label and accumulate the scores
                for node in tpedge:
                    for label in node:
                        if isinstance(label, bytes):
                            label = label.decode("utf-8")
                        sentiment = lexicon.analyze(label)
                        for k, v in sentiment.items():
                            vg_en_tn_prdct_sentiments[k] += v
    print("Sentiment Analysis of the Video:",
          sorted(vg_en_tn_prdct_sentiments.items(),
                 key=operator.itemgetter(0),
                 reverse=True))
    return vg_en_tn_prdct_sentiments
Example 5
def count_unconnect(u):

    # hopefully this is a fairly diverse group
    lexicon = Empath()
    # print(len(u))

    lexicon.create_category("support", support, model="nytimes")
    lexicon.create_category("conflict", conflict, model="nytimes")
    lexicon.create_category("conclusion", conclusion, model="nytimes")
    lexicon.create_category("complementary", complementary, model="nytimes")
    lexicon.create_category("causal_argument",
                            causal_argument,
                            model="nytimes")
    lexicon.create_category("verbs_hedging", verbs_hedging, model="nytimes")

    #["because", "only", "before", "so", "if", "though", "then", "until", "once", "even", "since", "although", "so", "while", "having", "because", "already", "thus", "time", "unless", "now", "actually", "eventually"]
    #["though", "although", "except", "yet", "but", "even", "because", "only", "Though", "Although", "Yet", "either", "nevertheless", "whereas", "though", "fact", "however", "unlike", "Furthermore", "because", "nonetheless", "And", "However", "none", "either", "still", "Even", "despite", "if", "so", "Yet", "meaning", "indeed", "consequently"]
    #[]
    #["while", "whereas", "though", "only", "yet", "While", "thus", "even", "Thus", "Instead", "although", "instead", "Though", "Moreover", "actually", "nevertheless", "sometimes", "still", "rather"]
    #["means", "therefore", "means", "merely", "mechanism", "democratic_process", "Therefore", "simply", "free_market", "consequence", "because"]
    # cat_all = lexicon.analyze(u, categories = ["support", "conflict", "conclusion", "complementary", "causal_argument"], normalize=True)
    cat_all = lexicon.analyze(u, categories=['verbs_hedging'], normalize=True)
    #cat_all = {}
    #for arg in u:
    #   cat = lexicon.analyze(arg)
    #   if cat["children"] != 0:
    #       print(arg, cat["children"])
    return cat_all
Example 6
def get_post_metrics(df, post_file):

    # define the Empath lexicon
    lexicon = Empath()
    # these are the eleven categories we will use
    eleven_categories = [
        'family', 'friends', 'home', 'sexual', 'swears', 'work', 'leisure',
        'money', 'body', 'religion', 'health'
    ]

    # clean the post data
    post_df = cleaning(post_file)  # need the cleaning function from above!

    # find post frequency for each friend
    df['post frequency'] = 0
    df['empath'] = 0
    for friend in df.name:
        ind = df.loc[df.name == friend].index[0]
        title = post_df[post_df['title'].str.contains(friend, na=False)]
        tags = post_df[post_df['tags'].str.contains(friend, na=False)]
        friend_post = pd.concat([title, tags])
        df.at[ind, 'post frequency'] = len(friend_post.index)

        # if there is a post, find empath analysis
        if df.loc[ind, 'post frequency'] != 0:
            # sum the empath analysis for each post on eleven categories
            friend_post['empath'] = friend_post['post'].apply(lambda x: sum(
                lexicon.analyze(x, categories=eleven_categories).values()))
            # find the average of the empath score for each post
            df.at[ind, 'empath'] = np.mean(friend_post['empath'])
    return df
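A rough usage sketch for get_post_metrics, assuming the cleaning() helper referenced above is available and that df has a name column; both the friend names and the posts file name are hypothetical:

import pandas as pd

friends_df = pd.DataFrame({'name': ['Alice', 'Bob']})   # hypothetical friends DataFrame
result = get_post_metrics(friends_df, 'posts.json')      # hypothetical post export file
print(result[['name', 'post frequency', 'empath']])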
Example 7
def subcommand_sentiment(texts, docnames, args):
    nlp = spacy.load('en')  # note: this spaCy pipeline is loaded but not used below
    lexicon = Empath()
    if args.posneg_only:
        cats = ['positive_emotion','negative_emotion']
    else:
        cats = None # all the categories

    analyze = lambda t: lexicon.analyze(t, categories=cats, normalize=not args.no_normalize)
    sentiments = [analyze(t) for t in texts]


    df = pd.DataFrame(sentiments,index=docnames)
    summarydf = make_summary(df)

    sheets = list()
    if args.human_readable:
        hdf = make_human_report(df)
        sheets.append( ('report',hdf) )
    else:
        sheets.append( ('report',df))
    sheets.append(('summary',summarydf))

    final_fname = write_report(
        args.outfile, 
        sheets, 
        hdf_if_fail=not args.nohdfonfail and not args.human_readable, 
        verbose=True,
    )

    return final_fname
Example 8
def train_text_classification(file):
    import pandas as pd
    train = pd.read_csv(file)
    train = train.dropna()
    # Split multi-line label data into one row per line, keyed by label name
    train = pd.DataFrame(train.labeldata.str.split('\r\r\n').tolist(), index=train.labelname).stack()
    train = train.reset_index()[[0, 'labelname']]  # the stacked values column is currently named 0
    train.columns = ['labeldata', 'labelname']     # rename it to labeldata

    import html
    train['labeldata'] = train['labeldata'].apply(html.unescape)
    # Shuffle data
    train = train.sample(frac=1, random_state=1).reset_index(drop=True)
    train = train.dropna()
    train = train[train['labeldata'] != '']

    from empath import Empath
    lexicon = Empath()

    # Build an Empath feature vector for each row of label data
    train_features = []
    for data in train['labeldata']:
        feature = lexicon.analyze(data, normalize=True)
        train_features.append(feature)

    train_features = pd.DataFrame(train_features)
    from sklearn.linear_model import LogisticRegression
    model = LogisticRegression(solver='sag')

    model.fit(train_features, train['labelname'])

    return model
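Example 2 (use_text_classification) and Example 8 (train_text_classification) are meant to be used together: fit a LogisticRegression on Empath features from a training CSV, then score a test CSV with the fitted model. A minimal sketch with hypothetical file names; both CSVs are assumed to have labeldata and labelname columns:

model = train_text_classification('train.csv')          # hypothetical training CSV
scored = use_text_classification('test.csv', model)     # hypothetical test CSV
print(scored[['labeldata', 'labelname', 'prediction']].head())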
Example 9
def empath_extraction(text):
    # You need to 'pip install empath' first
    from empath import Empath
    lexicon = Empath()
    # Get the Empath result
    '''Note: the order of categories in the Empath output is not fixed.'''
    result = lexicon.analyze(text, normalize=True)
    # Keep only categories with a non-zero score
    output = dict()
    for k, v in result.items():
        if v > 0:
            output[k] = v
    '''You can either print the result directly,
        in the format  '#the class#':#the score#'''
    # Change the form of output as you need
    # for k, v in output.items():
    #     print(k + "-" + str(v))
    '''Or you can map each class name to an index using the predefined classes
        in ./empathClass.txt, which may be more convenient and efficient for
        other programs to process, at the cost of slowing this code down.'''
    class_empath = dict()
    # empathClass.txt is assumed to hold the category names on a single
    # comma-separated line; only the last line of the file is used here
    with open(r"./empathClass.txt") as f:
        for line in f:
            tmp = line.split(",")
        for index, item in enumerate(tmp):
            class_empath[item] = index
    output_convert = []
    for k, v in output.items():
        output_convert.append([str(class_empath[k]), str(v)])
    # Change the form of output as you need
    for item in output_convert:
        print("-".join(item))
Example 10
def analyze_tokens(word_list, topk=10):
    lexicon = Empath()

    word_list_analyzed = lexicon.analyze(word_list, normalize=True)

    return sorted(word_list_analyzed.items(),
                  key=lambda kv: kv[1],
                  reverse=True)[:topk]
def create_empath_cats(text):
    lexicon = Empath()
    try:
        cat_scores = lexicon.analyze(text, normalize=True)
    except Exception as e:
        print(e)
        return 0
    return pd.Series(cat_scores)
Example 12
def executeEmpathOnISEAR(ISEAR, DATADIR):

	try:
		corpus = pd.read_csv(ISEAR, sep=',',header=None)

		if not os.path.isfile(DATADIR + "/labels_empath_on_ISEAR.txt"):

			lexicon = Empath()		#instance of empath analyser
			emotions_list = ['fear', 'joy', 'anger', 'sadness', 'disgust']
			model = "reddit"

			res = {}

			best_em = []		# will contain empath analysis results
			emotions_results = []

			for i in range(len(emotions_list)):			# creates a category for each emotion
				lexicon.create_category(emotions_list[i],[emotions_list[i]], model=model)

			for sentence in corpus[1]:
				for k in range(len(emotions_list)):			# tokenizes and analyzes the sentences
					tokens = nltk.word_tokenize(sentence)
					emotions_results = lexicon.analyze(tokens, normalize=True, categories=[emotions_list[k]])
					res = {**res, **emotions_results}		# merge all results in one dictionary


				max_likely_emotions_empath = max(res.items(), key=operator.itemgetter(1))[0]

				if res[max_likely_emotions_empath] != 0.0:
					best_em.append(max_likely_emotions_empath)
				else:
					best_em.append('no_idea')

			best_em = np.asarray(best_em)
			np.savetxt(DATADIR + "/labels_empath_on_ISEAR.txt", best_em, fmt="%s")      #saves empath detection

		# ---------------------------------- if labels already exist: --------------------------------

		ISEAR_labels = corpus[0]

		empath_labels = pd.read_csv(DATADIR + '/labels_empath_on_ISEAR.txt', sep=',',header=None)

		detected_labels = [ISEAR_labels[i] for i in range(len(ISEAR_labels)) if empath_labels[0][i] != 'no_idea']
		matches = [ISEAR_labels[i] for i in range(len(ISEAR_labels)) if empath_labels[0][i] == ISEAR_labels[i]]

		detected_percentage = len(detected_labels)/len(ISEAR_labels)
		overall_accuracy = len(matches)/len(ISEAR_labels)
		detected_accuracy = len(matches)/len(detected_labels)

		print('detected_percentage:', detected_percentage)
		print('detected_accuracy:', detected_accuracy)
		print('overall_accuracy:', overall_accuracy)
		return 0
		
	except Exception as e:
		print(str(e))
		return 51
Example 13
def Empath_List(List):
    lexicon = Empath()
    EMs = {}
    for text in List:
        EM = lexicon.analyze(text, normalize=True)
        #v = max(EM, key=EM.get)
        EMs[text] = [(k, v) for k, v in EM.items() if v != 0]

    return EMs
Example 14
def count_connect(u):

    cat_all = {}
    lexicon = Empath()
    lexicon.create_category("support", support, model="nytimes")
    lexicon.create_category("conflict", conflict, model="nytimes")
    lexicon.create_category("conclusion", conclusion, model="nytimes")
    lexicon.create_category("complementary", complementary, model="nytimes")
    lexicon.create_category("causal_argument",
                            causal_argument,
                            model="nytimes")
    lexicon.create_category("verbs_hedging", verbs_hedging, model="nytimes")

    heads = []
    not_heads = []

    for (arg1, arg2) in u:
        heads.append(arg1)
        not_heads.append(arg2)

    norep_heads = list(set(heads))
    norep_not_heads = list(set(not_heads))
    args_conn = list(set(heads) | set(not_heads))

    #cat_heads = lexicon.analyze(norep_heads, categories = ["support", "conflict", "conclusion", "complementary", "causal_argument"], normalize=True)
    cat_heads = lexicon.analyze(norep_heads,
                                categories=['verbs_hedging'],
                                normalize=True)
    # cat_heads = {}
    # for h in norep_heads:
    #    cat_heads = lexicon.analyze(h, normalize=True)
    #    if cat_heads["fun"] != 0:
    #        print(h, cat_heads["fun"])
    # cat_not_heads = lexicon.analyze(norep_not_heads,categories = ["support", "conflict", "conclusion", "complementary", "causal_argument"], normalize=True)
    cat_not_heads = lexicon.analyze(norep_not_heads,
                                    categories=['verbs_hedging'],
                                    normalize=True)
    # cat_all = lexicon.analyze(args_conn,categories = ["support", "conflict", "conclusion", "complementary", "causal_argument"], normalize=True)
    cat_all = lexicon.analyze(args_conn,
                              categories=['verbs_hedging'],
                              normalize=True)

    return cat_heads, cat_not_heads, cat_all
Example 15
 def command(self, documents):
     documents = documents.to_matrix().flatten()
     import numpy as np
     from empath import Empath
     lexicon = Empath()
     to_df = []
     out_dict = lexicon.analyze(documents.tolist(), normalize=True)
     for k,v in sorted(out_dict.items(), key=lambda x: x[1], reverse=True):
         to_df.append([k,v])
     return iris_objects.IrisDataframe(column_names=["category", "normalized_count"], column_types=["String", "Number"], data=to_df)
def get_raw_empath_categories_for_topics(model):
    lex = Empath()
    categories = []
    for topic in model["topics"]:
        word_categories = {}
        for word in topic:
            result = lex.analyze(word)
            word_categories[word] = [
                key for key in result.keys() if result[key] > 0
            ]
        categories.append(word_categories)
    return categories
def process_lexicon(texts):
    lexicon = Empath()
    data = {}

    for i, text in texts.items():
        data[i] = [
            k for k, v in lexicon.analyze(text, normalize=False).items()
            if v > 0
        ]

        print("{:<5}%".format(round(i * 100 / len(texts), 2)), end='\r')
    return data
Example 18
 def __init__(self,
              token_indexers: Optional[Dict[str, TokenIndexer]] = None,
              max_doc: int = 50,
              max_sent: int = 16,
              max_word: int = 64,
              lazy: bool = False) -> None:
     super().__init__(token_indexers=token_indexers,
                      max_doc=max_doc,
                      max_sent=max_sent,
                      max_word=max_word,
                      lazy=lazy)
     self.empath_lexicon = Empath()
     self.lexicon_categories = sorted(list(self.empath_lexicon.cats.keys()))
    def parseInput(self, input):
        self.history.append(input)

        # topic modeling and additional topic generation
        lexicon = Empath()
        topicVector = lexicon.analyze(input, normalize=False)

        topics = []
        for key in topicVector.keys():
            if topicVector[key] > 0:
                topics.append(key)
        self.topics = topics
        return topics
Example 20
 def command(self, documents, top_n):
     import numpy as np
     from empath import Empath
     lexicon = Empath()
     data = np.array([
         order_keys(lexicon.analyze(doc, normalize=True))
         for doc in documents
     ])
     types_ = ["Number" for _ in order_keys.s_keys]
     return top_n, iris_objects.IrisDataframe(
         column_names=order_keys.s_keys,
         column_types=types_,
         data=data,
         do_conversion=False)
Example 21
def empath_vector(text):
    """
    Returns a normalised vector (list) of 15 hand-picked categories from Empath: http://empath.stanford.edu/
    """
    categories = [
        'hate', 'aggression', 'dispute', 'swearing_terms', 'ridicule',
        'exasperation', 'fight', 'politeness', 'disgust', 'rage', 'warmth',
        'sadness', 'shame', 'negative_emotion', 'positive_emotion'
    ]
    lex = Empath()
    d = lex.analyze(text, categories=categories, normalize=True)
    if d is None:
        return 15 * [0.0]
    return list(d.values())
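A small usage sketch for empath_vector; the sentence is illustrative:

vec = empath_vector("I hate how this argument always turns into a shouting match")
print(len(vec))  # 15 scores, one per hand-picked category
print(vec)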
Example 22
def n_analyze_emotion():
    print("Analyzing emotions from News API")
    global CATS, CATS_DICT
    lexicon = Empath()
    descrip_list = []
    DBNAME = 'final.db'
    conn = sqlite3.connect(DBNAME)
    cur = conn.cursor()
    statement = 'SELECT text from news_api'
    cur.execute(statement)
    for row in cur: 
        descrip_list.append(row[0])
    
     ### entire corpus
    str1 = ' '.join(descrip_list)  # join with spaces so adjacent rows don't merge into one token
    print(str1)
    n_empath_dict = lexicon.analyze(str1, categories=CATS, normalize=True)
    n_empath_dict_new = {}
    for key in n_empath_dict:
        n_empath_dict_new[key] = n_empath_dict[key]*1000

    ## Row by Row: 
    counter = 0
    for row in descrip_list: 
        n_row_empath_dict = lexicon.analyze(row, categories=CATS, normalize=True)
        #print(len(list(n_row_empath_dict.keys())))  # this is 17 long
        n_row_empath_dict_new = {}
        for key in n_row_empath_dict:
            n_row_empath_dict_new[key] = n_row_empath_dict[key]*1000
        #print(len(list(n_row_empath_dict_new.keys())))  # this is 17 long
        counter +=1
        vals_list = list(n_row_empath_dict_new.values())
        vals_list.insert(0, None)
        vals_list.insert(1, 999)
        vals_list.insert(2, 999)
        vals_list.insert(3, counter)
        vals_list.insert(4, "news_api")
        insertion = tuple(vals_list)
        DBNAME = 'final.db'
        conn = sqlite3.connect(DBNAME)
        cur = conn.cursor()
        statement = ''' 
            INSERT INTO "Emotions"
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            '''
        cur.execute(statement, insertion)
        conn.commit()

    return n_empath_dict_new
Example 23
def empath_analytics(speech: str) -> list:
    categories_to_include = ['hate', 'cheerfulness', 'aggression', 'envy', 'anticipation', 'masculine', 'pride',
                             'dispute', 'nervousness', 'weakness', 'horror', 'swearing_terms', 'suffering', 'art',
                             'ridicule', 'optimism', 'divine', 'fear', 'religion', 'worship', 'confusion', 'death',
                             'violence', 'dominant_heirarchical', 'neglect', 'dominant_personality', 'love', 'order',
                             'sympathy', 'trust', 'deception', 'politeness', 'disgust', 'sadness', 'ugliness', 'lust',
                             'torment', 'politics', 'power', 'disappointment', 'pain', 'negative_emotion', 'competing',
                             'friends', 'achievement', 'feminine', 'positive_emotion']
    lexicon = Empath()
    results = lexicon.analyze(speech, categories=categories_to_include)
    output = {}
    for (key, value) in results.items():
        if value != 0:
            output[key] = value
    return sorted(output, key=output.get, reverse=True)[0:5]
Example 24
 def command(self, dataframe, selector, aggregate_scores):
     documents = selector.to_matrix().flatten()
     print(documents)
     import numpy as np
     from empath import Empath
     lexicon = Empath()
     to_df = []
     if aggregate_scores == "aggregate":
         out_dict = lexicon.analyze(documents.tolist(), normalize=True)
         for k,v in sorted(out_dict.items(), key=lambda x: x[1], reverse=True):
             to_df.append([k,v])
         return iris_objects.IrisDataframe(column_names=["category", "normalized_count"], column_types=["String", "Number"], data=to_df)
     else:
         out_scores = [order_keys(lexicon.analyze(d, normalize=True)) for d in documents.tolist()]
         return iris_objects.IrisDataframe(column_names=order_keys.s_keys, data=out_scores)
def ts_mod(tokens):
    """
    Implements the topic-signal approach of Empath. Empath uses a word-category lexicon (built with neural word embeddings) to detect topic signals in tokenized text. The detected topics are then sorted by value and shortlisted to the 10 highest-ranked topics.
    :param tokens:  tokenized list of words, e.g. ["cheese", "fighting", "dog", "cold", "man", "war"]
    :return:        dictionary (created by Empath) shortlisting the 10 highest-ranked topics - key: detected topic / value: calculated importance score
    """
    lexicon = Empath()
    scores = lexicon.analyze(tokens, normalize=True)

    if scores is None:
        return

    topics = threshold_filter(scores)
    topics_sorted = sort_topics_by_value(topics)
    topics_shortlist = shortlist_topics(topics_sorted)
    return topics_shortlist
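ts_mod relies on three helpers that this example does not define: threshold_filter, sort_topics_by_value, and shortlist_topics. A hedged sketch of what they might look like, based only on the docstring (drop zero-score topics, sort by value, keep the 10 highest); the cut-off in threshold_filter is an assumption:

def threshold_filter(scores, threshold=0.0):
    # Keep only topics whose normalized score exceeds the (assumed) cut-off
    return {topic: value for topic, value in scores.items() if value > threshold}


def sort_topics_by_value(topics):
    # Sort topics by score, highest first
    return sorted(topics.items(), key=lambda kv: kv[1], reverse=True)


def shortlist_topics(topics_sorted, top_n=10):
    # Keep the 10 highest-ranked topics, as the ts_mod docstring describes
    return dict(topics_sorted[:top_n])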
Example 26
def u_analyze_emotion():
    print("Analyzing emotions from SOTU")
    global CATS, CATS_DICT
    lexicon = Empath()
    descrip_list = []
    DBNAME = 'final.db'
    conn = sqlite3.connect(DBNAME)
    cur = conn.cursor()
    statement = 'SELECT text from sotu'
    cur.execute(statement)
    for row in cur: 
        descrip_list.append(row[0])
    
    ### entire corpus
    str1 = ' '.join(descrip_list)  # join with spaces so adjacent rows don't merge into one token
    u_empath_dict = lexicon.analyze(str1, categories=CATS, normalize=True)
    u_empath_dict_new = {}
    for key in u_empath_dict:
        #print (key, 'corresponds to', u_empath_dict[key]*1000)
        u_empath_dict_new[key] = u_empath_dict[key]*1000

    ## Row by Row: 
    counter = 0
    for row in descrip_list: 
        u_row_empath_dict = lexicon.analyze(row, categories=CATS, normalize=True)
        u_row_empath_dict_new = {}
        for key in u_row_empath_dict:
            u_row_empath_dict_new[key] = u_row_empath_dict[key]*1000
        counter +=1
        vals_list = list(u_row_empath_dict_new.values())
        vals_list.insert(0, None)
        vals_list.insert(1, 999)
        vals_list.insert(2, counter)
        vals_list.insert(3, 999)
        vals_list.insert(4, "sotu")
        insertion = tuple(vals_list)
        statement = ''' 
            INSERT INTO "Emotions"
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            '''
        cur.execute(statement, insertion)
        conn.commit()

    return u_empath_dict_new
Example 27
def analyze_tweets_liwc(tweets):
    """ Uses the Empath library to gather topics found in labeled tweet data
        Keyword arguments:
        tweets -- list of labeled tweet objects
    """
    lexicon = Empath()

    results = {CATEGORY_HATE: {}, CATEGORY_NON_HATE: {}}

    num_hate = num_non_hate = 0

    for tweet in tweets:
        category = ""
        text = clean_tweet_text(get_tweet_text(tweet))

        if (tweet["hate_speech"]):
            category = CATEGORY_HATE
            num_hate += 1
        else:
            category = CATEGORY_NON_HATE
            num_non_hate += 1
        topics = lexicon.analyze(text, normalize=False)
        for topic in topics.keys():
            if topics[topic] > 0:
                if topic in results[category]:
                    results[category][topic] += topics[topic]
                else:
                    results[category][topic] = topics[topic]

    # Sort the topics by total raw counts
    results[CATEGORY_HATE] = sorted(results[CATEGORY_HATE].items(),
                                    key=lambda kv: (kv[1], kv[0]),
                                    reverse=True)
    results[CATEGORY_NON_HATE] = sorted(results[CATEGORY_NON_HATE].items(),
                                        key=lambda kv: (kv[1], kv[0]),
                                        reverse=True)

    # Normalize topic counts by dividing by the total number of tweets in each category
    results[CATEGORY_HATE] = [(x, y / num_hate)
                              for x, y in results[CATEGORY_HATE]]
    results[CATEGORY_NON_HATE] = [(x, y / num_non_hate)
                                  for x, y in results[CATEGORY_NON_HATE]]

    return results
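analyze_tweets_liwc depends on two category constants and two tweet helpers that are not shown in this example. A hedged sketch of the assumed pieces, enough to call the function on a small list of labeled tweet dicts; the 'text' key and the cleaning rules are assumptions:

import re

CATEGORY_HATE = "hate"
CATEGORY_NON_HATE = "non_hate"

def get_tweet_text(tweet):
    # Assumes each labeled tweet is a dict carrying its text under 'text'
    return tweet["text"]

def clean_tweet_text(text):
    # Minimal cleanup: strip URLs, @mentions, and extra whitespace
    text = re.sub(r"https?://\S+|@\w+", "", text)
    return re.sub(r"\s+", " ", text).strip()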
Example 28
    def get_empath(self, empathCol):
        """Get empath score """

        tweet_dict = self.convert_dict()
        lexicon = Empath()

        empath_dict = {}
        for tweetid, tweet in tweet_dict.items():

            result = lexicon.analyze(tweet, normalize=True)
            empath_dict[tweetid] = result[empathCol]

        with open(self.path + 'empath.json', 'a') as f:
            json.dump(empath_dict, f)

        # empath_df = pd.DataFrame.from_dict(data, orient='index')
        # empath_df['tweet_id'] = empath_df.index
        # empath_df.columns = [empathCol, 'tweet_id']

        return empath_dict
Example 29
def appendEmpath(frame, targetCol):
    assert targetCol in frame.columns
    empathScore = Empath().analyze
    tagDict = dict(J='a', N='n', V='v', R='r')

    pBar = ProgBar(range(len(frame) + 1))

    def scoreRow(row):
        pBar.makeProg()
        tb = TextBlob(row[targetCol])
        # USING POS_TAG + LEMMATIZE
        words_and_tags = [(w, tagDict.get(pos[0], 'n')) for w, pos in tb.tags]
        lemmatized_list = [word.lemmatize(tag) for word, tag in words_and_tags]
        lemmaSent = " ".join(lemmatized_list)
        scored = empathScore(lemmaSent, normalize=True)
        row = pd.concat([row, pd.Series(data=scored)])
        return row

    # APPEND EMPATHS COLUMNS TO FRAME
    print("Doing empath on", len(frame), "rows...")
    frame = frame.apply(scoreRow, axis='columns')
    return frame
def semantics(df):

    df['Clean_Text'] = df['Text'].apply(lambda x: clean_text(x))
    df.dropna(inplace=True)

    lexicon = Empath()
    semantic = []

    # adding it to respective categories

    for article in df['Clean_Text']:
        d = lexicon.analyze(article, normalize=False)
        x = []
        for key, value in d.items():
            x.append(value)
        x = np.asarray(x)
        semantic.append(x)
    df['Semantic'] = semantic

    # collect the Empath category names (analyze("") returns a zero score for every category)
    categories = []
    a = lexicon.analyze("")
    for key, value in a.items():
        categories.append(key)

    # replace each article's text with its Empath category names, repeated by count
    sem = []
    for i in range(df.shape[0]):
        a = []
        for j in range(len(semantic[0])):
            for k in range(int(semantic[i][j])):
                a.append(categories[j])
        b = " ".join(a)
        sem.append(b)
    df['Semantics'] = sem
    data = df['Semantics']

    return data