def get_feature_array(tweets):
    sentiment_analyzer = VS()
    feats=[]

    for t in tweets:
        feats.append(other_features(t, sentiment_analyzer))
    return np.array(feats)
Example #2
def getscores():
	SCOPES = 'https://www.googleapis.com/auth/gmail.readonly'
	store = file.Storage('storage.json')
	creds = store.get()
	# nltk.download ('stopwords')
	# stop_words = get_stop_words('en')
	analyzer = VS()
	# stopwords_list = stopwords.words('english') + stopwords.words('portuguese')
	# tvd = TfidfVectorizer(analyzer='word',ngram_range=(1, 3),min_df=0.003,max_df=0.01,max_features=5000,norm='l1',stop_words=stopwords_list)
	if not creds or creds.invalid:
		flow = client.flow_from_clientsecrets('client_secret.json',SCOPES)
		creds = tools.run_flow(flow,store)
	GMAIL = discovery.build('gmail','v1',http=creds.authorize(Http()))

	label = 'INBOX'

	Spl_char = [",","-",".",";",">","<","=","&",":","#","!"]
	threads = ListThreadsWithLabels(GMAIL,'me',[label])
	all_threads = []
	for t in threads:
		conversation, subject = GetThread(GMAIL,'me',t['id'])
		score = []
		for msg in conversation:
			if "Team" not in msg:
				vs = analyzer.polarity_scores(msg)
				score.append(vs['compound'])

		one = [subject, conversation, score]

		all_threads.append(one)
	return all_threads
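
A minimal usage sketch for getscores() above, assuming the Gmail OAuth files
(client_secret.json, storage.json) and the ListThreadsWithLabels / GetThread
helpers it references are available:

# Each entry is [subject, conversation, compound_scores] for one thread.
for subject, conversation, compound_scores in getscores():
    print(subject, compound_scores)
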
def other_features(tweet):
    ##SENTIMENT
    sentiment = VS().polarity_scores(tweet)
    ##READABILITY
    #See https://pypi.python.org/pypi/textstat/
    flesch = round(textstat.flesch_reading_ease(tweet), 3)
    flesch_kincaid = round(textstat.flesch_kincaid_grade(tweet), 3)
    gunning_fog = round(textstat.gunning_fog(tweet), 3)
    ##TEXT-BASED
    length = len(tweet)
    num_terms = len(tweet.split())
    ##TWITTER SPECIFIC TEXT FEATURES
    hashtag_count = tweet.count("#")
    mention_count = tweet.count("@")
    url_count = tweet.count("http")
    retweet = 0
    if tweet.lower().startswith("rt"):
        retweet = 1
    #Checking if RT is in the tweet
    words = tweet.lower().split()
    if "rt" in words or "#rt" in words:
        retweet = 1
    features = [
        sentiment['compound'], flesch, flesch_kincaid, gunning_fog, length,
        num_terms, hashtag_count, mention_count, url_count, retweet
    ]
    return features
Example #4
def other_features(tweet):
    """This function takes a string and returns a list of features.
    These include Sentiment scores, Text and Readability scores,
    as well as Twitter specific features"""
    ##SENTIMENT
    sentiment_analyzer = VS()
    sentiment = sentiment_analyzer.polarity_scores(tweet)
    
    words = preprocess(tweet) #Get text only
    
    syllables = textstat.syllable_count(words) #count syllables in words
    num_chars = sum(len(w) for w in words) #num chars in words
    num_chars_total = len(tweet)
    num_terms = len(tweet.split())
    num_words = len(words.split())
    avg_syl = round(float((syllables+0.001))/float(num_words+0.001),4)
    num_unique_terms = len(set(words.split()))
    
    ###Modified FK grade, where avg words per sentence is just num words/1
    FKRA = round(float(0.39 * float(num_words)/1.0) + float(11.8 * avg_syl) - 15.59,1)
    ##Modified FRE score, where sentence fixed to 1
    FRE = round(206.835 - 1.015*(float(num_words)/1.0) - (84.6*float(avg_syl)),2)
    
    twitter_objs = count_twitter_objs(tweet) #Count #, @, and http://
    retweet = 0
    if "rt" in words:
        retweet = 1
    features = [FKRA, FRE,syllables, avg_syl, num_chars, num_chars_total, num_terms, num_words,
                num_unique_terms, sentiment['neg'], sentiment['pos'], sentiment['neu'], sentiment['compound'],
                twitter_objs[2], twitter_objs[1],
                twitter_objs[0], retweet]
    #features = pandas.DataFrame(features)
    return features
Example #5
def get_sentiment(message):
    sentiment_analyzer = VS()
    res = sentiment_analyzer.polarity_scores(message)
    results = {}
    results['Positive'] = res['pos'] * 100
    results['Negative'] = res['neg'] * 100
    results['Neutral'] = res['neu'] * 100
    return results
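
A quick usage sketch, assuming VS is vaderSentiment's SentimentIntensityAnalyzer
as imported in the other examples:

# Scores are returned as percentages rather than the raw 0-1 VADER values.
print(get_sentiment("VADER is smart, handsome, and funny!"))
# e.g. {'Positive': 75.2, 'Negative': 0.0, 'Neutral': 24.8}
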
Example #6
    def __init__(self):
        cur_path = os.path.dirname(os.path.abspath(__file__))
        self.model = joblib.load(os.path.join(cur_path, 'final_model.pkl'))
        self.tf_vectorizer = joblib.load(
            os.path.join(cur_path, 'final_tfidf.pkl'))
        self.idf_vector = joblib.load(os.path.join(cur_path, 'final_idf.pkl'))
        self.pos_vectorizer = joblib.load(
            os.path.join(cur_path, 'final_pos.pkl'))

        self.stemmer = PorterStemmer()
        self.sentiment_analyzer = VS()
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.stem.porter import *
import string
import re
import csv

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as VS
from textstat.textstat import *

stopwords = nltk.corpus.stopwords.words("english")

other_exclusions = ["#ff", "ff", "rt"]
stopwords.extend(other_exclusions)

sentiment_analyzer = VS()

stemmer = PorterStemmer()


def preprocess(text_string):
    """
    Accepts a text string and replaces:
    1) urls with URLHERE
    2) lots of whitespace with one instance
    3) mentions with MENTIONHERE

    This allows us to get standardized counts of urls and mentions
    without caring about the specific people mentioned.
    """
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text)
    parsed_text = re.sub(mention_regex, 'MENTIONHERE', parsed_text)
    return parsed_text
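
A quick sanity check for preprocess() above (illustrative only; the exact
output follows from the regexes as written):

example = preprocess("RT @user check https://t.co/abc out")
print(example)  # expected: "RT MENTIONHERE check URLHERE out"
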
def logregress_linsvc(input):

    # nltk.download('averaged_perceptron_tagger')
    # nltk.download('punkt')
    # nltk.download('wordnet')

    #base_tweets=input['text']
    #tweets = [x for x in base_tweets if type(x) == str]

    stopwords = nltk.corpus.stopwords.words("english")

    other_exclusions = ["#ff", "ff", "rt"]
    stopwords.extend(other_exclusions)

    # stemmer = PorterStemmer()

    #################################################################################################
    '''Preprocess tweets, tokenize, and gather feature,POS tags'''
    ################################################################################################


    def basic_tokenize(tweet):
        """Same as tokenize but without the stemming"""
        tweet = " ".join(re.split("[^a-zA-Z.,!?]*", tweet.lower())).strip()
        return tweet.split()

    sentiment_analyzer = VS()

    def count_twitter_objs(text_string):
        """
        Accepts a text string and replaces:
        1) urls with URLHERE
        2) lots of whitespace with one instance
        3) mentions with MENTIONHERE
        4) hashtags with HASHTAGHERE

        This allows us to get standardized counts of urls and mentions
        Without caring about specific people mentioned.

        Returns counts of urls, mentions, and hashtags.
        """
        space_pattern = '\s+'
        giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
            '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        mention_regex = '@[\w\-]+'
        hashtag_regex = '#[\w\-]+'
        parsed_text = re.sub(space_pattern, ' ', text_string)
        parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text)
        parsed_text = re.sub(mention_regex, 'MENTIONHERE', parsed_text)
        parsed_text = re.sub(hashtag_regex, 'HASHTAGHERE', parsed_text)
        return(parsed_text.count('URLHERE'),parsed_text.count('MENTIONHERE'),parsed_text.count('HASHTAGHERE'))

    def other_features(tweet):
        """This function takes a string and returns a list of features.
        These include Sentiment scores, Text and Readability scores,
        as well as Twitter specific features"""
        ##SENTIMENT
        sentiment = sentiment_analyzer.polarity_scores(tweet)

        words = preprocess(tweet) #Get text only

        syllables = textstat.syllable_count(words) #count syllables in words
        num_chars = sum(len(w) for w in words) #num chars in words
        num_chars_total = len(tweet)
        num_terms = len(tweet.split())
        num_words = len(words.split())
        avg_syl = round(float((syllables+0.001))/float(num_words+0.001),4)
        num_unique_terms = len(set(words.split()))

        ###Modified FK grade, where avg words per sentence is just num words/1
        FKRA = round(float(0.39 * float(num_words)/1.0) + float(11.8 * avg_syl) - 15.59,1)
        ##Modified FRE score, where sentence fixed to 1
        FRE = round(206.835 - 1.015*(float(num_words)/1.0) - (84.6*float(avg_syl)),2)

        twitter_objs = count_twitter_objs(tweet) #Count #, @, and http://
        features = [FKRA, FRE, syllables, avg_syl, num_chars, num_chars_total, num_terms, num_words,
                    num_unique_terms,sentiment['neu'], sentiment['compound'],
                    twitter_objs[2],twitter_objs[1],twitter_objs[0]]

        #features = pandas.DataFrame(features)
        return features

    def get_feature_array(tweets):
        feats=[]
        for t in tweets:
            feats.append(other_features(t))
        return np.array(feats)

    def get_pos_tags(tweets):
        """Takes a list of strings (tweets) and
        returns a list of strings of (POS tags).
        """
        tweet_tags = []
        for t in tweets:
            tokens = basic_tokenize(preprocess(t))
            tags = nltk.pos_tag(tokens)
            tag_list = [x[1] for x in tags]
            #for i in range(0, len(tokens)):
            tag_str = " ".join(tag_list)
            tweet_tags.append(tag_str)
        return tweet_tags

    ###########################################################################################
    '''Preprocess End'''
    ###########################################################################################


    warnings.simplefilter(action='ignore', category=FutureWarning)



    #######################################################
    '''Construct tfidf matrix and get relevant scores'''
    #######################################################
    tf_array = tf_vectorizer.fit_transform(input['text']).toarray()
    tfidf_array = tf_array*idf_vector
    print ("Built TF-IDF array")

    #################################################
    '''Construct POS TF matrix and get vocab dict'''
    #################################################
    pos_tags = get_pos_tags(input['text'])
    pos_array = pos_vectorizer.fit_transform(pos_tags).toarray()
    print ("Built POS array")

    ###################################################
    ''' Get features'''
    ###################################################
    other_feats = get_feature_array(input['text'])
    print ("Built other features array")

    #Now join them all up
    X = np.concatenate([tfidf_array,pos_array,other_feats],axis=1)

    print(X.shape)


    #####################################################
    '''Running the Model'''
    #####################################################
    print ("Running classification model...")
    y_preds = model.predict(X)

    print ("Loading data to classify...")

    # def class_to_name(class_label):
    #     """
    #     This function can be used to map a numeric
    #     feature name to a particular class.
    #     """
    #     if class_label == 0:
    #         return "Hate speech"
    #     elif class_label == 1:
    #         return "Offensive language"
    #     elif class_label == 2:
    #         return "Neither"
    #     else:
    #         return "No label"
    hate = 0
    hurtful = 0
    neither = 0
    for x in y_preds:
        if str(x) == '0':
            hate += 1
        elif str(x) == '1':
            hurtful += 1
        elif str(x) == '2':
            neither += 1
    total = hate + hurtful + neither
    print ("Printing predicted values: ")

    print(f'Hateful tweets: {hate}; % of total: {hate/total:.1%}')
    print(f'Hurtful tweets: {hurtful}; % of total: {hurtful/total:.1%}')
    print(f'Neither tweets: {neither}; % of total: {neither/total:.1%}')

    results = {
                'hate_data': {'count': hate,
                              'percentTotal': int((hate/total)*100)},
                'hurt_data': {'count': hurtful,
                              'percentTotal': int((hurtful/total)*100)},
                'neither_data': {'count': neither,
                                 'percentTotal': int((neither/total)*100)},
                'total_count': total
                }
    return results
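
A hedged driver sketch for logregress_linsvc above. The function picks up
model, tf_vectorizer, idf_vector and pos_vectorizer from the enclosing scope,
so they must be loaded first; the pickle file names below mirror the other
examples and are assumptions here:

import joblib
import pandas as pd

model = joblib.load('final_model.pkl')
tf_vectorizer = joblib.load('final_tfidf.pkl')
idf_vector = joblib.load('final_idf.pkl')
pos_vectorizer = joblib.load('final_pos.pkl')

sample = pd.DataFrame({'text': ["some tweet to score", "another example tweet"]})
print(logregress_linsvc(sample))
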
Example #9
    def __init__(self):
        self.sentiment_analyzer = VS()
Example #10
from json import loads
from hatebase import HatebaseAPI
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as VS

# hatebase = HatebaseAPI({"key": key})
# filters = {'about_nationality': '1', 'language': 'eng'}
# output = "json"
# query_type = "sightings"
# response = hatebase.performRequest(filters, output, query_type)
#
# # convert to Python object
# response = loads(response)
# print response
sentences = [
    "VADER is full of crap.",  # positive sentence example
    "VADER is not smart, handsome, nor funny.",  # negation sentence example
    "VADER is smart, handsome, and funny!",  # punctuation emphasis handled correctly (sentiment intensity adjusted)
    "VADER is very smart, handsome, and funny."
]
analyzer = VS()
for sentence in sentences:
    vs = analyzer.polarity_scores(sentence)
    # print("{:-<65} {}".format(sentence, str(vs)))
    print(sentence, vs)
Example #11
class SentimentVectorizer(BaseEstimator, TransformerMixin):
    sentiment_analyzer = VS()

    def count_twitter_objs(self, text_string):
        """
        Accepts a text string and replaces:
        1) urls with URLHERE
        2) lots of whitespace with one instance
        3) mentions with MENTIONHERE
        4) hashtags with HASHTAGHERE

        This allows us to get standardized counts of urls and mentions
        Without caring about specific people mentioned.

        Returns counts of urls, mentions, and hashtags.
        """
        space_pattern = '\s+'
        giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
            '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        mention_regex = '@[\w\-]+'
        hashtag_regex = '#[\w\-]+'
        parsed_text = re.sub(space_pattern, ' ', text_string)
        parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text)
        parsed_text = re.sub(mention_regex, 'MENTIONHERE', parsed_text)
        parsed_text = re.sub(hashtag_regex, 'HASHTAGHERE', parsed_text)
        return(parsed_text.count('URLHERE'),parsed_text.count('MENTIONHERE'),parsed_text.count('HASHTAGHERE'))

    def other_features(self, tweet):
        """This function takes a string and returns a list of features.
        These include Sentiment scores, Text and Readability scores,
        as well as Twitter specific features"""
        sentiment = self.sentiment_analyzer.polarity_scores(tweet)

        words = preprocess(tweet) #Get text only

        syllables = textstat.syllable_count(words)
        num_chars = sum(len(w) for w in words)
        num_chars_total = len(tweet)
        num_terms = len(tweet.split())
        num_words = len(words.split())
        avg_syl = round(float((syllables+0.001))/float(num_words+0.001),4)
        num_unique_terms = len(set(words.split()))

        ###Modified FK grade, where avg words per sentence is just num words/1
        FKRA = round(float(0.39 * float(num_words)/1.0) + float(11.8 * avg_syl) - 15.59,1)
        ##Modified FRE score, where sentence fixed to 1
        FRE = round(206.835 - 1.015*(float(num_words)/1.0) - (84.6*float(avg_syl)),2)

        twitter_objs = self.count_twitter_objs(tweet)
        retweet = 0
        if "rt" in words:
            retweet = 1
        features = [FKRA, FRE,syllables, avg_syl, num_chars, num_chars_total, num_terms, num_words,
                    num_unique_terms, sentiment['neg'], sentiment['pos'], sentiment['neu'], sentiment['compound'],
                    twitter_objs[2], twitter_objs[1],
                    twitter_objs[0], retweet]
        return features

    def get_feature_array(self, tweets):
        feats=[]
        for t in tweets:
            feats.append(self.other_features(t))
        return np.array(feats)

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return self.get_feature_array(X)
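
Sketch of using SentimentVectorizer like any other scikit-learn transformer,
assuming preprocess, textstat and numpy are importable as in the surrounding
examples:

vec = SentimentVectorizer()
X_sent = vec.fit_transform(["RT @user this is great http://t.co/x"])
print(X_sent.shape)  # (1, 17): one row of hand-crafted sentiment/readability features
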
def predict(x):
    model = load("model.ml")
    tweets = x
    stopwords = nltk.corpus.stopwords.words("english")

    other_exclusions = ["#ff", "ff", "rt"]
    stopwords.extend(other_exclusions)

    stemmer = PorterStemmer()

    def preprocess(text_string):
        """
        Accepts a text string and replaces:
        1) urls with URLHERE
        2) lots of whitespace with one instance
        3) mentions with MENTIONHERE

        This allows us to get standardized counts of urls and mentions
        Without caring about specific people mentioned
        """
        space_pattern = '\s+'
        giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
                           '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        mention_regex = '@[\w\-]+'
        parsed_text = re.sub(space_pattern, ' ', text_string)
        parsed_text = re.sub(giant_url_regex, '', parsed_text)
        parsed_text = re.sub(mention_regex, '', parsed_text)
        return parsed_text

    def tokenize(tweet):
        """Removes punctuation & excess whitespace, sets to lowercase,
        and stems tweets. Returns a list of stemmed tokens."""
        tweet = " ".join(re.split("[^a-zA-Z]*", tweet.lower())).strip()
        tokens = [stemmer.stem(t) for t in tweet.split()]
        return tokens

    def basic_tokenize(tweet):
        """Same as tokenize but without the stemming"""
        tweet = " ".join(re.split("[^a-zA-Z.,!?]*", tweet.lower())).strip()
        return tweet.split()

    vectorizer = TfidfVectorizer(
        tokenizer=tokenize,
        preprocessor=preprocess,
        ngram_range=(1, 3),
        stop_words=stopwords,
        use_idf=True,
        smooth_idf=False,
        norm=None,
        decode_error='replace',
        max_features=10000,
        min_df=5,
        max_df=0.75
    )
    # Construct tfidf matrix and get relevant scores
    tfidf = vectorizer.fit_transform(tweets).toarray()
    vocab = {v: i for i, v in enumerate(vectorizer.get_feature_names())}
    idf_vals = vectorizer.idf_
    idf_dict = {i: idf_vals[i] for i in vocab.values()}  # keys are indices; values are IDF scores

    # Get POS tags for tweets and save as a string
    import nltk
    nltk.download('averaged_perceptron_tagger')

    tweet_tags = []
    for t in tweets:
        tokens = basic_tokenize(preprocess(t))
        tags = nltk.pos_tag(tokens)
        tag_list = [x[1] for x in tags]
        tag_str = " ".join(tag_list)
        tweet_tags.append(tag_str)

    # We can use the TFIDF vectorizer to get a token matrix for the POS tags
    pos_vectorizer = TfidfVectorizer(
        tokenizer=None,
        lowercase=False,
        preprocessor=None,
        ngram_range=(1, 3),
        stop_words=None,
        use_idf=False,
        smooth_idf=False,
        norm=None,
        decode_error='replace',
        max_features=5000,
        min_df=5,
        max_df=0.75,
    )

    # Construct POS TF matrix and get vocab dict
    pos = pos_vectorizer.fit_transform(pd.Series(tweet_tags)).toarray()
    pos_vocab = {v: i for i, v in enumerate(pos_vectorizer.get_feature_names())}

    # Now get other features
    sentiment_analyzer = VS()

    def count_twitter_objs(text_string):
        """
        Accepts a text string and replaces:
        1) urls with URLHERE
        2) lots of whitespace with one instance
        3) mentions with MENTIONHERE
        4) hashtags with HASHTAGHERE

        This allows us to get standardized counts of urls and mentions
        Without caring about specific people mentioned.

        Returns counts of urls, mentions, and hashtags.
        """
        space_pattern = '\s+'
        giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
                           '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        mention_regex = '@[\w\-]+'
        hashtag_regex = '#[\w\-]+'
        parsed_text = re.sub(space_pattern, ' ', text_string)
        parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text)
        parsed_text = re.sub(mention_regex, 'MENTIONHERE', parsed_text)
        parsed_text = re.sub(hashtag_regex, 'HASHTAGHERE', parsed_text)
        return (parsed_text.count('URLHERE'), parsed_text.count('MENTIONHERE'), parsed_text.count('HASHTAGHERE'))

    def other_features(tweet):
        """This function takes a string and returns a list of features.
        These include Sentiment scores, Text and Readability scores,
        as well as Twitter specific features"""
        sentiment = sentiment_analyzer.polarity_scores(tweet)

        words = preprocess(tweet)  # Get text only

        syllables = textstat.syllable_count(words)
        num_chars = sum(len(w) for w in words)
        num_chars_total = len(tweet)
        num_terms = len(tweet.split())
        num_words = len(words.split())
        avg_syl = round(float((syllables + 0.001)) / float(num_words + 0.001), 4)
        num_unique_terms = len(set(words.split()))

        ###Modified FK grade, where avg words per sentence is just num words/1
        FKRA = round(float(0.39 * float(num_words) / 1.0) + float(11.8 * avg_syl) - 15.59, 1)
        ##Modified FRE score, where sentence fixed to 1
        FRE = round(206.835 - 1.015 * (float(num_words) / 1.0) - (84.6 * float(avg_syl)), 2)

        twitter_objs = count_twitter_objs(tweet)
        retweet = 0
        if "rt" in words:
            retweet = 1
        features = [FKRA, FRE, syllables, avg_syl, num_chars, num_chars_total, num_terms, num_words,
                    num_unique_terms, sentiment['neg'], sentiment['pos'], sentiment['neu'], sentiment['compound'],
                    twitter_objs[2], twitter_objs[1],
                    twitter_objs[0], retweet]
        # features = pandas.DataFrame(features)
        return features

    def get_feature_array(tweets):
        feats = []
        for t in tweets:
            feats.append(other_features(t))
        return np.array(feats)

    other_features_names = ["FKRA", "FRE", "num_syllables", "avg_syl_per_word", "num_chars", "num_chars_total",
                            "num_terms", "num_words", "num_unique_words", "vader neg", "vader pos", "vader neu",
                            "vader compound", "num_hashtags", "num_mentions", "num_urls", "is_retweet"]

    feats = get_feature_array(tweets)

    # Now join them all up
    M = np.concatenate([tfidf, pos, feats], axis=1)

    X = pd.DataFrame(M)

    return model.predict(X)[0]
Example #13
def get_features(tweets):  
    sentiment_analyzer = VS()
    
    stopwords = nltk.corpus.stopwords.words("english")
    other_exclusions = ["#ff", "ff", "rt"]
    stopwords.extend(other_exclusions)

    vectorizer = TfidfVectorizer(
        tokenizer=tokenize,
        preprocessor=preprocess,
        ngram_range=(1, 3),
        stop_words=stopwords,  
        use_idf=True,
        smooth_idf=False,
        norm=None,
        decode_error='replace',
        max_features=10000,
        max_df = 0.501,
        min_df=5,   
        )

    # Construct tfidf matrix and get relevant scores
    tfidf = vectorizer.fit_transform(tweets).toarray()
    vocab = {v:i for i, v in enumerate(vectorizer.get_feature_names())}
    idf_vals = vectorizer.idf_
    idf_dict = {i:idf_vals[i] for i in vocab.values()} #keys are indices; values are IDF scores

    # Get POS tags for tweets and save as a string
    tweet_tags = []
    for t in tweets:
        tokens = basic_tokenize(preprocess(t))
        tags = nltk.pos_tag(tokens)
        tag_list = [x[1] for x in tags]
        tag_str = " ".join(tag_list)
        tweet_tags.append(tag_str)

    # We can use the TFIDF vectorizer to get a token matrix for the POS tags
    pos_vectorizer = TfidfVectorizer(
        tokenizer=None,
        lowercase=False,
        preprocessor=None,
        ngram_range=(1, 3),
        stop_words=None, #We do better when we keep stopwords
        use_idf=False,
        smooth_idf=False,
        norm=None, #No normalization is applied
        decode_error='replace',
        max_features=5000,
        min_df=5,
        max_df = 0.501
        )

    pos = pos_vectorizer.fit_transform(pd.Series(tweet_tags)).toarray()
    pos_vocab = {v:i for i, v in enumerate(pos_vectorizer.get_feature_names())}

    sentiment_analyzer = VS()

    other_features_names = ["FKRA", "FRE","num_syllables", "avg_syl_per_word", "num_chars", "num_chars_total", \
                        "num_terms", "num_words", "num_unique_words", "vader neg","vader pos","vader neu", "vader compound", \
                        "num_hashtags", "num_mentions", "num_urls", "is_retweet"]

    feats = get_feature_array(tweets)
    X = np.concatenate([tfidf,pos,feats],axis=1)
    return X, vectorizer, tfidf, pos_vectorizer
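
A hedged usage note for get_features above: tokenize, preprocess, basic_tokenize
and get_feature_array from the other examples must be in scope, and the corpus
needs to be large enough for min_df=5 / max_df=0.501 to prune sensibly.
load_my_tweets below is a hypothetical loader returning a list of strings:

corpus = load_my_tweets()  # hypothetical: returns a list of tweet strings
X, vectorizer, tfidf, pos_vectorizer = get_features(corpus)
print(X.shape)  # (n_tweets, n_tfidf_terms + n_pos_terms + 17)
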
def wrangle2(text):
    stopwords = pd.read_table("english", header=None)[0].tolist()  # assumes one stopword per line

    other_exclusions = ["#ff", "ff", "rt"]
    stopwords.extend(other_exclusions)

    stemmer = PorterStemmer()


    def clean(text):
        spaces = '\s+'
        urls = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
            '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        mentions = '@[\w\-]+'
        parsed_text = re.sub(spaces, ' ', text)
        parsed_text = re.sub(urls, '', parsed_text)
        parsed_text = re.sub(mentions, '', parsed_text)
        return parsed_text
    
    def tokenize(text):
        text = " ".join(re.split("[^a-zA-Z]*", text.lower())).strip()
        tokens = [stemmer.stem(t) for t in text.split()]
        return tokens

    vectorizer = TfidfVectorizer(
        tokenizer=tokenize,
        preprocessor=clean,
        ngram_range=(1, 3),
        stop_words=stopwords,
        use_idf=True,
        smooth_idf=False,
        norm=None,
        decode_error='replace',
        max_features=10000,
        min_df=5,
        max_df=0.501
        )

    tfidf = vectorizer.fit_transform(text).toarray()
    vocab = {v:i for i, v in enumerate(vectorizer.get_feature_names())}
    idf_vals = vectorizer.idf_
    idf_dict = {i:idf_vals[i] for i in vocab.values()}
    
    sentiment_analyzer = VS()

    def count_twitter_objs(text):
    
        space_pattern = '\s+'
        giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
            '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        mention_regex = '@[\w\-]+'
        hashtag_regex = '#[\w\-]+'
        parsed_text = re.sub(space_pattern, ' ', text)
        parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text)
        parsed_text = re.sub(mention_regex, 'MENTIONHERE', parsed_text)
        parsed_text = re.sub(hashtag_regex, 'HASHTAGHERE', parsed_text)
        return(parsed_text.count('URLHERE'),parsed_text.count('MENTIONHERE'),parsed_text.count('HASHTAGHERE'))

    def other_features(text):
        sentiment = sentiment_analyzer.polarity_scores(text)

        words = clean(text) #Get text only

        syllables = textstat.syllable_count(words) #count syllables in words
        num_chars = sum(len(w) for w in words) #num chars in words
        num_chars_total = len(text)
        num_terms = len(text.split())
        num_words = len(words.split())
        avg_syl = round(float((syllables+0.001))/float(num_words+0.001),4)
        num_unique_terms = len(set(words.split()))

        FKRA = round(float(0.39 * float(num_words)/1.0) + float(11.8 * avg_syl) - 15.59,1)

        FRE = round(206.835 - 1.015*(float(num_words)/1.0) - (84.6*float(avg_syl)),2)

        twitter_objs = count_twitter_objs(text) #Count #, @, and http://
        retweet = 0
        if "rt" in words:
            retweet = 1
        features = [FKRA, FRE,syllables, avg_syl, num_chars, num_chars_total, num_terms, num_words,
                    num_unique_terms, sentiment['neg'], sentiment['pos'], sentiment['neu'], sentiment['compound'],
                    twitter_objs[2], twitter_objs[1],
                    twitter_objs[0], retweet]

        return features

    def get_feature_array(text):
        feats=[]
        for t in text:
            feats.append(other_features(t))
        return np.array(feats)
    
    feats = get_feature_array(text)
    
    all_feats = np.concatenate([tfidf,feats],axis=1)
    
    other_features_names = ["FKRA", "FRE","num_syllables", "avg_syl_per_word", "num_chars", "num_chars_total", \
                        "num_terms", "num_words", "num_unique_words", "vader neg","vader pos","vader neu", "vader compound", \
                        "num_hashtags", "num_mentions", "num_urls", "is_retweet"]

    variables = ['']*len(vocab)
    for k,v in vocab.items():
        variables[v] = k

    features = variables+other_features_names
    X = pd.DataFrame(all_feats, columns=features)
    
    
    return X
Example #15
def getValue():
        stopwords = nltk.corpus.stopwords.words("english")

        other_exclusions = ["#ff", "ff", "rt"]
        stopwords.extend(other_exclusions)

        sentiment_analyzer = VS()

        stemmer = PorterStemmer()
        def preprocess(text_string):
            """
            Accepts a text string and replaces:
            1) urls with URLHERE
            2) lots of whitespace with one instance
            3) mentions with MENTIONHERE

            This allows us to get standardized counts of urls and mentions
            Without caring about specific people mentioned
            """
            space_pattern = '\s+'
            giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
                '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
            mention_regex = '@[\w\-]+'
            parsed_text = re.sub(space_pattern,' ', str(text_string))
            parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text)
            parsed_text = re.sub(mention_regex, 'MENTIONHERE', parsed_text)
            #parsed_text = parsed_text.code("utf-8", errors='ignore')
            return parsed_text

        def tokenize(tweet):
            """Removes punctuation & excess whitespace, sets to lowercase,
            and stems tweets. Returns a list of stemmed tokens."""
            tweet = " ".join(re.split("[^a-zA-Z]*", tweet.lower())).strip()
            #tokens = re.split("[^a-zA-Z]*", tweet.lower())
            tokens = [stemmer.stem(t) for t in tweet.split()]
            return tokens

        def basic_tokenize(tweet):
            """Same as tokenize but without the stemming"""
            tweet = " ".join(re.split("[^a-zA-Z.,!?]*", tweet.lower())).strip()
            #print(tweet)
            return tweet.split()

        def get_pos_tags(tweets):
            """Takes a list of strings (tweets) and
            returns a list of strings of (POS tags).
            """
            tweet_tags = []
            #for t in tweets:
            tokens = basic_tokenize(preprocess(tweets))
            tags = nltk.pos_tag(tokens)
            tag_list = [x[1] for x in tags]
            #for i in range(0, len(tokens)):
            tag_str = " ".join(tag_list)
            tweet_tags.append(tag_str)
            return tweet_tags

        def count_twitter_objs(text_string):
            """
            Accepts a text string and replaces:
            1) urls with URLHERE
            2) lots of whitespace with one instance
            3) mentions with MENTIONHERE
            4) hashtags with HASHTAGHERE

            This allows us to get standardized counts of urls and mentions
            Without caring about specific people mentioned.

            Returns counts of urls, mentions, and hashtags.
            """
            space_pattern = '\s+'
            giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
                '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
            mention_regex = '@[\w\-]+'
            hashtag_regex = '#[\w\-]+'
            parsed_text = re.sub(space_pattern, ' ', text_string)
            parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text)
            parsed_text = re.sub(mention_regex, 'MENTIONHERE', parsed_text)
            parsed_text = re.sub(hashtag_regex, 'HASHTAGHERE', parsed_text)
            return(parsed_text.count('URLHERE'),parsed_text.count('MENTIONHERE'),parsed_text.count('HASHTAGHERE'))

        def other_features_(tweet):
            """This function takes a string and returns a list of features.
            These include Sentiment scores, Text and Readability scores,
            as well as Twitter specific features.

            This is modified to only include those features in the final
            model."""

            sentiment = sentiment_analyzer.polarity_scores(tweet)

            words = preprocess(tweet) #Get text only

            syllables = textstat.syllable_count(words) #count syllables in words
            num_chars = sum(len(w) for w in words) #num chars in words
            num_chars_total = len(tweet)
            num_terms = len(tweet.split())
            num_words = len(words.split())
            avg_syl = round(float((syllables+0.001))/float(num_words+0.001),4)
            num_unique_terms = len(set(words.split()))

            ###Modified FK grade, where avg words per sentence is just num words/1
            FKRA = round(float(0.39 * float(num_words)/1.0) + float(11.8 * avg_syl) - 15.59,1)
            ##Modified FRE score, where sentence fixed to 1
            FRE = round(206.835 - 1.015*(float(num_words)/1.0) - (84.6*float(avg_syl)),2)

            twitter_objs = count_twitter_objs(tweet) #Count #, @, and http://
            features = [FKRA, FRE, syllables, num_chars, num_chars_total, num_terms, num_words,
                        num_unique_terms, sentiment['compound'],
                        twitter_objs[2], twitter_objs[1],]
            #features = pandas.DataFrame(features)
            return features

        def get_oth_features(tweets):
            """Takes a list of tweets, generates features for
            each tweet, and returns a numpy array of tweet x features"""
            feats=[]
            for t in tweets:
                feats.append(other_features_(t))
            return np.array(feats)


        def transform_inputs(tweets, tf_vectorizer, idf_vector, pos_vectorizer):
            """
            This function takes a list of tweets, along with the trained
            TF-IDF vectorizer, IDF vector, and POS vectorizer used to
            transform the tweets into the format accepted by the model.

            Each tweet is decomposed into
            (a) An array of TF-IDF scores for a set of n-grams in the tweet.
            (b) An array of POS tag sequences in the tweet.
            (c) An array of features including sentiment, vocab, and readability.

            Returns a pandas dataframe where each row is the set of features
            for a tweet. The features are a subset selected using a Logistic
            Regression with L1-regularization on the training data.

            """
            tf_array = tf_vectorizer.fit_transform(tweets).toarray()
            tfidf_array = tf_array*idf_vector
            print ("Built TF-IDF array")
            #print tweets
            pos_tags = get_pos_tags(tweets)
            #print get_pos_tags(tweets)
            pos_array = pos_vectorizer.fit_transform(pos_tags).toarray()
            print ("Built POS array")
            #print pos_vectorizer.fit_transform(pos_tags).toarray()
            oth_array = get_oth_features(tweets)
            print ("Built other feature array")
            #print get_oth_features(tweets)
            M = np.concatenate([tfidf_array, pos_array, oth_array],axis=1)
            return pd.DataFrame(M)

        def predictions(X, model):
            """
            This function calls the predict function on
            the trained model to generate a predicted y
            value for each observation.
            """
            y_preds = model.predict(X)
            return y_preds

        def class_to_name(class_label):
            """
            This function can be used to map a numeric
            feature name to a particular class.
            """
            if class_label == 0:
                return "Hate speech"
            elif class_label == 1:
                return "Offensive language"
            elif class_label == 2:
                return "Neither"
            else:
                return "No label"

        def get_tweets_predictions(tweets, perform_prints=True):
            fixed_tweets = []
            #for i, t_orig in enumerate(tweets):
            s = tweets
            # Normalize the input to a clean unicode string (Python 3: bytes -> str).
            if isinstance(s, bytes):
                fixed_tweets.append(s.decode("utf-8", errors="ignore"))
            else:
                fixed_tweets.append(s)
            #print(fixed_tweets)
            print(tweets)
            #assert len(tweets) == len(fixed_tweets), "shouldn't remove any tweets"
            tweets = fixed_tweets
            print (len(tweets), " tweets to classify")

            print ("Loading trained classifier... ")
            model = joblib.load('final_model.pkl')

            print ("Loading other information...")
            tf_vectorizer = joblib.load('final_tfidf.pkl')
            idf_vector = joblib.load('final_idf.pkl')
            pos_vectorizer = joblib.load('final_pos.pkl')
            #Load ngram dict
            #Load pos dictionary
            #Load function to transform data

            print ("Transforming inputs...")
            X = transform_inputs(tweets, tf_vectorizer, idf_vector, pos_vectorizer)

            print ("Running classification model...")
            predicted_class = predictions(X, model)

            return predicted_class	

        def initial(comt):
            print ("Loading data to classify...")
            #Tweets obtained here: https://github.com/sashaperigo/Trump-Tweets

            trump_tweets = comt
            trump_predictions = get_tweets_predictions(trump_tweets)

            print ("Printing predicted values: ")
            #for i,t in enumerate(trump_tweets):
            #print(t)
            ans = class_to_name(trump_predictions[0])
            print(ans)
            return ans
        if request.form['submit_but'] == 'predict':
            comt = request.form['comt']
            my_prediction = initial(comt)
            return render_template('demoForm.html', val=comt, res=my_prediction)
        if request.form['submit_but'] == 'twitter':
            tweet2classify = rt.randtweet()
            #print(tweet2classify)
            # Normalize to a clean unicode string (Python 3: bytes -> str).
            if isinstance(tweet2classify, bytes):
                tweet2classify = tweet2classify.decode("utf-8", errors="ignore")
            my_prediction = initial(tweet2classify)
            return render_template('demoForm.html', val=tweet2classify, res=my_prediction)
    def __init__(self,lang='en'):
        self.lang = lang
        self.stopwords = None
        self.stemmer = None
        self.sentiment_analyzer = None
        self.text_processor = None        
        INDIC_NLP_RESOURCES=r"../model/indic_nlp_resources/"        
        common.set_resources_path(INDIC_NLP_RESOURCES)
        self.pos_tagger = None



        if lang == 'hi':
            self.ht = HindiTokenizer.Tokenizer()
            self.sentiment_analyzer = load_learner(path="../model/hi-sentiment")
            self.stopwords = [x.strip() for x in open("../data/stopwords.txt").readlines()]	
            other_exclusions = ["#ff", "ff", "rt"]
            self.stopwords.extend(other_exclusions)
            self.stemmer = None
            self.text_processor = TextPreProcessor(
                # terms that will be normalized
                normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                    'time', 'date', 'number'],
                # terms that will be annotated
                annotate={"hashtag", "allcaps", "elongated", "repeated",
                    'emphasis', 'censored'},
                fix_html=True,  # fix HTML tokens
            )
            loader.load()
            train_data = indian.tagged_sents('hindi.pos')
            self.tnt_pos_tagger = tnt.TnT()
            self.tnt_pos_tagger.train(train_data)

        if lang == 'en':
            self.sentiment_analyzer = VS()
            self.stopwords = nltk.corpus.stopwords.words("english")
            other_exclusions = ["#ff", "ff", "rt"]
            self.stopwords.extend(other_exclusions)
            self.stemmer = PorterStemmer()
            self.text_processor = TextPreProcessor(
                # terms that will be normalized
                normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                    'time', 'date', 'number'],
                # terms that will be annotated
                annotate={"hashtag", "allcaps", "elongated", "repeated",
                    'emphasis', 'censored'},
                fix_html=True,  # fix HTML tokens

                # corpus from which the word statistics are going to be used 
                # for word segmentation 
                segmenter="twitter", 

                # corpus from which the word statistics are going to be used 
                # for spell correction
                corrector="twitter", 

                unpack_hashtags=True,  # perform word segmentation on hashtags
                unpack_contractions=True,  # Unpack contractions (can't -> can not)
                spell_correct_elong=False,  # spell correction for elongated words

                # select a tokenizer. You can use SocialTokenizer, or pass your own
                # the tokenizer, should take as input a string and return a list of tokens
                tokenizer=SocialTokenizer(lowercase=True).tokenize,

                # list of dictionaries, for replacing tokens extracted from the text,
                # with other expressions. You can pass more than one dictionaries.
                dicts=[emoticons,slang]
            )