def get_feature_array(tweets):
    sentiment_analyzer = VS()
    feats = []
    for t in tweets:
        feats.append(other_features(t, sentiment_analyzer))
    return np.array(feats)
def getscores():
    SCOPES = 'https://www.googleapis.com/auth/gmail.readonly'
    store = file.Storage('storage.json')
    creds = store.get()
    analyzer = VS()
    if not creds or creds.invalid:
        flow = client.flow_from_clientsecrets('client_secret.json', SCOPES)
        creds = tools.run_flow(flow, store)
    GMAIL = discovery.build('gmail', 'v1', http=creds.authorize(Http()))
    label = 'INBOX'
    Spl_char = [",", "-", ".", ";", ">", "<", "=", "&", ":", "#", "!"]  # currently unused
    threads = ListThreadsWithLabels(GMAIL, 'me', [label])
    all_scores = []
    for t in threads:
        conversation, subject = GetThread(GMAIL, 'me', t['id'])
        score = []
        for msg in conversation:
            if "Team" not in msg:
                vs = analyzer.polarity_scores(msg)
                score.append(vs['compound'])
        all_scores.append([subject, conversation, score])
    return all_scores
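ListThreadsWithLabels and GetThread are helpers assumed by getscores; they follow the Gmail API Python samples. A minimal sketch of what they might look like — the threads().list / threads().get calls are the real Gmail API, but the subject/body extraction is an assumption about the data shape the snippet expects:

import base64

def ListThreadsWithLabels(service, user_id, label_ids):
    # Returns the threads carrying the given labels (pagination omitted).
    response = service.users().threads().list(
        userId=user_id, labelIds=label_ids).execute()
    return response.get('threads', [])

def GetThread(service, user_id, thread_id):
    # Returns (list of message bodies, subject of the first message).
    thread = service.users().threads().get(
        userId=user_id, id=thread_id).execute()
    subject = ''
    bodies = []
    for msg in thread.get('messages', []):
        for header in msg['payload'].get('headers', []):
            if header['name'] == 'Subject' and not subject:
                subject = header['value']
        data = msg['payload'].get('body', {}).get('data')
        if data:  # only simple, non-multipart messages are handled here
            bodies.append(base64.urlsafe_b64decode(data).decode('utf-8', 'ignore'))
    return bodies, subject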
def other_features(tweet):
    """Takes a string and returns a list of sentiment, readability,
    text-based, and Twitter-specific features."""
    ## SENTIMENT
    sentiment_analyzer = VS()
    sentiment = sentiment_analyzer.polarity_scores(tweet)

    ## READABILITY -- see https://pypi.python.org/pypi/textstat/
    flesch = round(textstat.flesch_reading_ease(tweet), 3)
    flesch_kincaid = round(textstat.flesch_kincaid_grade(tweet), 3)
    gunning_fog = round(textstat.gunning_fog(tweet), 3)

    ## TEXT-BASED
    length = len(tweet)
    num_terms = len(tweet.split())

    ## TWITTER-SPECIFIC
    hashtag_count = tweet.count("#")
    mention_count = tweet.count("@")
    url_count = tweet.count("http")

    retweet = 0
    if tweet.lower().startswith("rt"):
        retweet = 1
    # Also check whether "rt" appears as a standalone token
    words = tweet.lower().split()
    if "rt" in words or "#rt" in words:
        retweet = 1

    features = [sentiment['compound'], flesch, flesch_kincaid, gunning_fog,
                length, num_terms, hashtag_count, mention_count, url_count,
                retweet]
    return features
def other_features(tweet): """This function takes a string and returns a list of features. These include Sentiment scores, Text and Readability scores, as well as Twitter specific features""" ##SENTIMENT sentiment_analyzer = VS() sentiment = sentiment_analyzer.polarity_scores(tweet) words = preprocess(tweet) #Get text only syllables = textstat.syllable_count(words) #count syllables in words num_chars = sum(len(w) for w in words) #num chars in words num_chars_total = len(tweet) num_terms = len(tweet.split()) num_words = len(words.split()) avg_syl = round(float((syllables+0.001))/float(num_words+0.001),4) num_unique_terms = len(set(words.split())) ###Modified FK grade, where avg words per sentence is just num words/1 FKRA = round(float(0.39 * float(num_words)/1.0) + float(11.8 * avg_syl) - 15.59,1) ##Modified FRE score, where sentence fixed to 1 FRE = round(206.835 - 1.015*(float(num_words)/1.0) - (84.6*float(avg_syl)),2) twitter_objs = count_twitter_objs(tweet) #Count #, @, and http:// retweet = 0 if "rt" in words: retweet = 1 features = [FKRA, FRE,syllables, avg_syl, num_chars, num_chars_total, num_terms, num_words, num_unique_terms, sentiment['neg'], sentiment['pos'], sentiment['neu'], sentiment['compound'], twitter_objs[2], twitter_objs[1], twitter_objs[0], retweet] #features = pandas.DataFrame(features) return features
def get_sentiment(message):
    sentiment_analyzer = VS()
    res = sentiment_analyzer.polarity_scores(message)
    results = {}
    results['Positive'] = res['pos'] * 100
    results['Negative'] = res['neg'] * 100
    results['Neutral'] = res['neu'] * 100
    return results
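A quick usage sketch for get_sentiment; VADER's pos/neg/neu scores sum to roughly 1.0, so the scaled values read naturally as percentages (the example sentence is from VADER's own README):

print(get_sentiment("VADER is smart, handsome, and funny!"))
# -> {'Positive': 74.6, 'Negative': 0.0, 'Neutral': 25.4}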
def __init__(self):
    cur_path = os.path.dirname(os.path.abspath(__file__))
    self.model = joblib.load(os.path.join(cur_path, 'final_model.pkl'))
    self.tf_vectorizer = joblib.load(os.path.join(cur_path, 'final_tfidf.pkl'))
    self.idf_vector = joblib.load(os.path.join(cur_path, 'final_idf.pkl'))
    self.pos_vectorizer = joblib.load(os.path.join(cur_path, 'final_pos.pkl'))
    self.stemmer = PorterStemmer()
    self.sentiment_analyzer = VS()
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.stem.porter import PorterStemmer
import string
import re
import csv
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as VS
from textstat.textstat import *

stopwords = nltk.corpus.stopwords.words("english")
other_exclusions = ["#ff", "ff", "rt"]
stopwords.extend(other_exclusions)

sentiment_analyzer = VS()
stemmer = PorterStemmer()

def preprocess(text_string):
    """
    Accepts a text string and replaces:
    1) urls with URLHERE
    2) lots of whitespace with one instance
    3) mentions with MENTIONHERE

    This allows us to get standardized counts of urls and mentions
    without caring about the specific people mentioned.
    """
    space_pattern = r'\s+'
    giant_url_regex = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
                       r'[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = r'@[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text)
    parsed_text = re.sub(mention_regex, 'MENTIONHERE', parsed_text)
    return parsed_text
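For illustration, preprocess collapses runs of whitespace and swaps urls and mentions for fixed placeholder tokens, so later counts stay stable:

print(preprocess("rt @some_user: check this   out http://example.com"))
# -> "rt MENTIONHERE: check this out URLHERE"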
def logregress_linsvc(input):
    # One-time downloads, if needed:
    # nltk.download('averaged_perceptron_tagger')
    # nltk.download('punkt')
    # nltk.download('wordnet')
    stopwords = nltk.corpus.stopwords.words("english")
    other_exclusions = ["#ff", "ff", "rt"]
    stopwords.extend(other_exclusions)

    ####################################################################
    '''Preprocess tweets, tokenize, and gather features and POS tags'''
    ####################################################################
    def basic_tokenize(tweet):
        """Same as tokenize but without the stemming."""
        tweet = " ".join(re.split("[^a-zA-Z.,!?]*", tweet.lower())).strip()
        return tweet.split()

    sentiment_analyzer = VS()

    def count_twitter_objs(text_string):
        """
        Accepts a text string and replaces:
        1) urls with URLHERE
        2) lots of whitespace with one instance
        3) mentions with MENTIONHERE
        4) hashtags with HASHTAGHERE

        This allows us to get standardized counts of urls and mentions
        without caring about the specific people mentioned.

        Returns counts of urls, mentions, and hashtags.
        """
        space_pattern = r'\s+'
        giant_url_regex = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
                           r'[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        mention_regex = r'@[\w\-]+'
        hashtag_regex = r'#[\w\-]+'
        parsed_text = re.sub(space_pattern, ' ', text_string)
        parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text)
        parsed_text = re.sub(mention_regex, 'MENTIONHERE', parsed_text)
        parsed_text = re.sub(hashtag_regex, 'HASHTAGHERE', parsed_text)
        return (parsed_text.count('URLHERE'),
                parsed_text.count('MENTIONHERE'),
                parsed_text.count('HASHTAGHERE'))

    def other_features(tweet):
        """This function takes a string and returns a list of features.
        These include sentiment scores, text and readability scores,
        as well as Twitter-specific features."""
        ## SENTIMENT
        sentiment = sentiment_analyzer.polarity_scores(tweet)
        words = preprocess(tweet)  # text only; a string
        syllables = textstat.syllable_count(words)
        num_chars = sum(len(w) for w in words.split())  # chars in words
        num_chars_total = len(tweet)
        num_terms = len(tweet.split())
        num_words = len(words.split())
        avg_syl = round(float(syllables + 0.001) / float(num_words + 0.001), 4)
        num_unique_terms = len(set(words.split()))
        # Modified FK grade, where avg words per sentence is just num_words / 1
        FKRA = round(0.39 * num_words + 11.8 * avg_syl - 15.59, 1)
        # Modified FRE score, where sentence count is fixed to 1
        FRE = round(206.835 - 1.015 * num_words - 84.6 * avg_syl, 2)
        twitter_objs = count_twitter_objs(tweet)  # counts of urls, mentions, hashtags
        features = [FKRA, FRE, syllables, avg_syl, num_chars, num_chars_total,
                    num_terms, num_words, num_unique_terms,
                    sentiment['neu'], sentiment['compound'],
                    twitter_objs[2], twitter_objs[1], twitter_objs[0]]
        return features

    def get_feature_array(tweets):
        feats = []
        for t in tweets:
            feats.append(other_features(t))
        return np.array(feats)

    def get_pos_tags(tweets):
        """Takes a list of strings (tweets) and returns a list of
        POS-tag strings."""
        tweet_tags = []
        for t in tweets:
            tokens = basic_tokenize(preprocess(t))
            tags = nltk.pos_tag(tokens)
            tag_list = [x[1] for x in tags]
            tag_str = " ".join(tag_list)
            tweet_tags.append(tag_str)
        return tweet_tags

    ######################
    '''Preprocess end'''
    ######################
    warnings.simplefilter(action='ignore', category=FutureWarning)

    # Construct tfidf matrix and get relevant scores.
    # tf_vectorizer, idf_vector, pos_vectorizer, and model are expected to
    # exist at module level (e.g. loaded via joblib).
    tf_array = tf_vectorizer.fit_transform(input['text']).toarray()
    tfidf_array = tf_array * idf_vector
    print("Built TF-IDF array")

    # Construct POS TF matrix and get vocab dict
    pos_tags = get_pos_tags(input['text'])
    pos_array = pos_vectorizer.fit_transform(pos_tags).toarray()
    print("Built POS array")

    # Get other features
    other_feats = get_feature_array(input['text'])
    print("Built other features array")

    # Now join them all up
    X = np.concatenate([tfidf_array, pos_array, other_feats], axis=1)
    print(X.shape)

    # Run the model
    print("Running classification model...")
    y_preds = model.predict(X)

    hate = hurtful = neither = 0
    for x in y_preds:
        if str(x) == '0':
            hate += 1
        elif str(x) == '1':
            hurtful += 1
        elif str(x) == '2':
            neither += 1

    total = hate + hurtful + neither
    print("Printing predicted values: ")
    print(f'Hateful tweets: {hate}; % of total: {hate/total}')
    print(f'Hurtful tweets: {hurtful}; % of total: {hurtful/total}')
    print(f'Neither tweets: {neither}; % of total: {neither/total}')

    results = {
        'hate_data': {'count': hate, 'percentTotal': int((hate/total)*100)},
        'hurt_data': {'count': hurtful, 'percentTotal': int((hurtful/total)*100)},
        'neither_data': {'count': neither, 'percentTotal': int((neither/total)*100)},
        'total_count': total,
    }
    return results
def __init__(self):
    self.sentiment_analyzer = VS()
from json import loads
from hatebase import HatebaseAPI
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as VS

# Example Hatebase lookup, kept for reference:
# hatebase = HatebaseAPI({"key": key})
# filters = {'about_nationality': '1', 'language': 'eng'}
# output = "json"
# query_type = "sightings"
# response = hatebase.performRequest(filters, output, query_type)
# response = loads(response)  # convert to a Python object
# print(response)

sentences = [
    "VADER is full of crap.",                     # negative sentence example
    "VADER is not smart, handsome, nor funny.",   # negation sentence example
    "VADER is smart, handsome, and funny!",       # punctuation emphasis handled correctly (sentiment intensity adjusted)
    "VADER is very smart, handsome, and funny.",
]

analyzer = VS()
for sentence in sentences:
    vs = analyzer.polarity_scores(sentence)
    # print("{:-<65} {}".format(sentence, str(vs)))
    print(sentence, vs)
class SentimentVectorizer(BaseEstimator, TransformerMixin):
    sentiment_analyzer = VS()

    def count_twitter_objs(self, text_string):
        """
        Accepts a text string and replaces:
        1) urls with URLHERE
        2) lots of whitespace with one instance
        3) mentions with MENTIONHERE
        4) hashtags with HASHTAGHERE

        This allows us to get standardized counts of urls and mentions
        without caring about the specific people mentioned.

        Returns counts of urls, mentions, and hashtags.
        """
        space_pattern = r'\s+'
        giant_url_regex = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
                           r'[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        mention_regex = r'@[\w\-]+'
        hashtag_regex = r'#[\w\-]+'
        parsed_text = re.sub(space_pattern, ' ', text_string)
        parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text)
        parsed_text = re.sub(mention_regex, 'MENTIONHERE', parsed_text)
        parsed_text = re.sub(hashtag_regex, 'HASHTAGHERE', parsed_text)
        return (parsed_text.count('URLHERE'),
                parsed_text.count('MENTIONHERE'),
                parsed_text.count('HASHTAGHERE'))

    def other_features(self, tweet):
        """This function takes a string and returns a list of features.
        These include sentiment scores, text and readability scores,
        as well as Twitter-specific features."""
        sentiment = self.sentiment_analyzer.polarity_scores(tweet)
        words = preprocess(tweet)  # text only; a string
        syllables = textstat.syllable_count(words)
        num_chars = sum(len(w) for w in words.split())  # chars in words
        num_chars_total = len(tweet)
        num_terms = len(tweet.split())
        num_words = len(words.split())
        avg_syl = round(float(syllables + 0.001) / float(num_words + 0.001), 4)
        num_unique_terms = len(set(words.split()))
        # Modified FK grade, where avg words per sentence is just num_words / 1
        FKRA = round(0.39 * num_words + 11.8 * avg_syl - 15.59, 1)
        # Modified FRE score, where sentence count is fixed to 1
        FRE = round(206.835 - 1.015 * num_words - 84.6 * avg_syl, 2)
        twitter_objs = self.count_twitter_objs(tweet)
        retweet = 1 if "rt" in words.split() else 0  # token check, not substring
        features = [FKRA, FRE, syllables, avg_syl, num_chars, num_chars_total,
                    num_terms, num_words, num_unique_terms,
                    sentiment['neg'], sentiment['pos'], sentiment['neu'],
                    sentiment['compound'],
                    twitter_objs[2], twitter_objs[1], twitter_objs[0], retweet]
        return features

    def get_feature_array(self, tweets):
        feats = []
        for t in tweets:
            feats.append(self.other_features(t))
        return np.array(feats)

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return self.get_feature_array(X)
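Because SentimentVectorizer implements fit/transform, it can sit inside a scikit-learn FeatureUnion next to a plain text vectorizer. A minimal sketch, with toy data and a LogisticRegression chosen only for illustration (preprocess and textstat must be importable, as in the snippets above):

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

pipeline = Pipeline([
    ('features', FeatureUnion([
        ('tfidf', TfidfVectorizer(ngram_range=(1, 3))),
        ('handcrafted', SentimentVectorizer()),
    ])),
    ('clf', LogisticRegression(max_iter=1000)),
])

texts = ["rt @user: you are awful", "what a lovely day!"]
labels = [1, 2]  # toy labels in the 0/1/2 scheme used elsewhere here
pipeline.fit(texts, labels)
print(pipeline.predict(["have a great day"]))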
def predict(x):
    model = load("model.ml")
    tweets = x
    stopwords = nltk.corpus.stopwords.words("english")
    other_exclusions = ["#ff", "ff", "rt"]
    stopwords.extend(other_exclusions)
    stemmer = PorterStemmer()

    def preprocess(text_string):
        """Collapses runs of whitespace and strips urls and mentions, so
        counts are standardized without caring about the specific people
        mentioned."""
        space_pattern = r'\s+'
        giant_url_regex = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
                           r'[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        mention_regex = r'@[\w\-]+'
        parsed_text = re.sub(space_pattern, ' ', text_string)
        parsed_text = re.sub(giant_url_regex, '', parsed_text)
        parsed_text = re.sub(mention_regex, '', parsed_text)
        return parsed_text

    def tokenize(tweet):
        """Removes punctuation & excess whitespace, sets to lowercase,
        and stems tweets. Returns a list of stemmed tokens."""
        tweet = " ".join(re.split("[^a-zA-Z]*", tweet.lower())).strip()
        tokens = [stemmer.stem(t) for t in tweet.split()]
        return tokens

    def basic_tokenize(tweet):
        """Same as tokenize but without the stemming."""
        tweet = " ".join(re.split("[^a-zA-Z.,!?]*", tweet.lower())).strip()
        return tweet.split()

    vectorizer = TfidfVectorizer(
        tokenizer=tokenize,
        preprocessor=preprocess,
        ngram_range=(1, 3),
        stop_words=stopwords,
        use_idf=True,
        smooth_idf=False,
        norm=None,
        decode_error='replace',
        max_features=10000,
        min_df=5,
        max_df=0.75,
    )

    # Construct tfidf matrix and get relevant scores
    tfidf = vectorizer.fit_transform(tweets).toarray()
    vocab = {v: i for i, v in enumerate(vectorizer.get_feature_names())}
    idf_vals = vectorizer.idf_
    idf_dict = {i: idf_vals[i] for i in vocab.values()}  # keys are indices; values are IDF scores

    # Get POS tags for tweets and save as a string
    nltk.download('averaged_perceptron_tagger')  # one-time download
    tweet_tags = []
    for t in tweets:
        tokens = basic_tokenize(preprocess(t))
        tags = nltk.pos_tag(tokens)
        tag_list = [x[1] for x in tags]
        tag_str = " ".join(tag_list)
        tweet_tags.append(tag_str)

    # We can use the TFIDF vectorizer to get a token matrix for the POS tags
    pos_vectorizer = TfidfVectorizer(
        tokenizer=None,
        lowercase=False,
        preprocessor=None,
        ngram_range=(1, 3),
        stop_words=None,
        use_idf=False,
        smooth_idf=False,
        norm=None,
        decode_error='replace',
        max_features=5000,
        min_df=5,
        max_df=0.75,
    )

    # Construct POS TF matrix and get vocab dict
    pos = pos_vectorizer.fit_transform(pd.Series(tweet_tags)).toarray()
    pos_vocab = {v: i for i, v in enumerate(pos_vectorizer.get_feature_names())}

    # Now get other features
    sentiment_analyzer = VS()

    def count_twitter_objs(text_string):
        """
        Accepts a text string and replaces:
        1) urls with URLHERE
        2) lots of whitespace with one instance
        3) mentions with MENTIONHERE
        4) hashtags with HASHTAGHERE

        Returns counts of urls, mentions, and hashtags.
        """
        space_pattern = r'\s+'
        giant_url_regex = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
                           r'[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        mention_regex = r'@[\w\-]+'
        hashtag_regex = r'#[\w\-]+'
        parsed_text = re.sub(space_pattern, ' ', text_string)
        parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text)
        parsed_text = re.sub(mention_regex, 'MENTIONHERE', parsed_text)
        parsed_text = re.sub(hashtag_regex, 'HASHTAGHERE', parsed_text)
        return (parsed_text.count('URLHERE'),
                parsed_text.count('MENTIONHERE'),
                parsed_text.count('HASHTAGHERE'))

    def other_features(tweet):
        """This function takes a string and returns a list of features.
        These include sentiment scores, text and readability scores,
        as well as Twitter-specific features."""
        sentiment = sentiment_analyzer.polarity_scores(tweet)
        words = preprocess(tweet)  # text only; a string
        syllables = textstat.syllable_count(words)
        num_chars = sum(len(w) for w in words.split())  # chars in words
        num_chars_total = len(tweet)
        num_terms = len(tweet.split())
        num_words = len(words.split())
        avg_syl = round(float(syllables + 0.001) / float(num_words + 0.001), 4)
        num_unique_terms = len(set(words.split()))
        # Modified FK grade, where avg words per sentence is just num_words / 1
        FKRA = round(0.39 * num_words + 11.8 * avg_syl - 15.59, 1)
        # Modified FRE score, where sentence count is fixed to 1
        FRE = round(206.835 - 1.015 * num_words - 84.6 * avg_syl, 2)
        twitter_objs = count_twitter_objs(tweet)
        retweet = 1 if "rt" in words.split() else 0  # token check, not substring
        features = [FKRA, FRE, syllables, avg_syl, num_chars, num_chars_total,
                    num_terms, num_words, num_unique_terms,
                    sentiment['neg'], sentiment['pos'], sentiment['neu'],
                    sentiment['compound'],
                    twitter_objs[2], twitter_objs[1], twitter_objs[0], retweet]
        return features

    def get_feature_array(tweets):
        feats = []
        for t in tweets:
            feats.append(other_features(t))
        return np.array(feats)

    other_features_names = ["FKRA", "FRE", "num_syllables", "avg_syl_per_word",
                            "num_chars", "num_chars_total", "num_terms",
                            "num_words", "num_unique_words", "vader neg",
                            "vader pos", "vader neu", "vader compound",
                            "num_hashtags", "num_mentions", "num_urls",
                            "is_retweet"]
    feats = get_feature_array(tweets)

    # Now join them all up
    M = np.concatenate([tfidf, pos, feats], axis=1)
    X = pd.DataFrame(M)
    # Pass the feature frame directly; wrapping it as [[X]] would hand the
    # model a nested list rather than a 2-D feature matrix.
    return model.predict(X)
def get_features(tweets):
    sentiment_analyzer = VS()
    stopwords = nltk.corpus.stopwords.words("english")
    other_exclusions = ["#ff", "ff", "rt"]
    stopwords.extend(other_exclusions)

    vectorizer = TfidfVectorizer(
        tokenizer=tokenize,
        preprocessor=preprocess,
        ngram_range=(1, 3),
        stop_words=stopwords,
        use_idf=True,
        smooth_idf=False,
        norm=None,
        decode_error='replace',
        max_features=10000,
        max_df=0.501,
        min_df=5,
    )

    # Construct tfidf matrix and get relevant scores
    tfidf = vectorizer.fit_transform(tweets).toarray()
    vocab = {v: i for i, v in enumerate(vectorizer.get_feature_names())}
    idf_vals = vectorizer.idf_
    idf_dict = {i: idf_vals[i] for i in vocab.values()}  # keys are indices; values are IDF scores

    # Get POS tags for tweets and save as a string
    tweet_tags = []
    for t in tweets:
        tokens = basic_tokenize(preprocess(t))
        tags = nltk.pos_tag(tokens)
        tag_list = [x[1] for x in tags]
        tag_str = " ".join(tag_list)
        tweet_tags.append(tag_str)

    # We can use the TFIDF vectorizer to get a token matrix for the POS tags
    pos_vectorizer = TfidfVectorizer(
        tokenizer=None,
        lowercase=False,
        preprocessor=None,
        ngram_range=(1, 3),
        stop_words=None,       # we do better when we keep stopwords
        use_idf=False,
        smooth_idf=False,
        norm=None,             # no normalization applied
        decode_error='replace',
        max_features=5000,
        min_df=5,
        max_df=0.501,
    )
    pos = pos_vectorizer.fit_transform(pd.Series(tweet_tags)).toarray()
    pos_vocab = {v: i for i, v in enumerate(pos_vectorizer.get_feature_names())}

    other_features_names = ["FKRA", "FRE", "num_syllables", "avg_syl_per_word",
                            "num_chars", "num_chars_total", "num_terms",
                            "num_words", "num_unique_words", "vader neg",
                            "vader pos", "vader neu", "vader compound",
                            "num_hashtags", "num_mentions", "num_urls",
                            "is_retweet"]
    feats = get_feature_array(tweets)

    X = np.concatenate([tfidf, pos, feats], axis=1)
    return X, vectorizer, tfidf, pos_vectorizer
def wrangle2(text):
    # Stopword file "english" holds one word per line
    stopwords = pd.read_table("english", header=None)[0].tolist()
    other_exclusions = ["#ff", "ff", "rt"]
    stopwords.extend(other_exclusions)
    stemmer = PorterStemmer()

    def clean(text):
        spaces = r'\s+'
        urls = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
                r'[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        mentions = r'@[\w\-]+'
        parsed_text = re.sub(spaces, ' ', text)
        parsed_text = re.sub(urls, '', parsed_text)
        parsed_text = re.sub(mentions, '', parsed_text)
        return parsed_text

    def tokenize(text):
        text = " ".join(re.split("[^a-zA-Z]*", text.lower())).strip()
        tokens = [stemmer.stem(t) for t in text.split()]
        return tokens

    vectorizer = TfidfVectorizer(
        tokenizer=tokenize,
        preprocessor=clean,
        ngram_range=(1, 3),
        stop_words=stopwords,
        use_idf=True,
        smooth_idf=False,
        norm=None,
        decode_error='replace',
        max_features=10000,
        min_df=5,
        max_df=0.501,
    )
    # fit_transform, not transform: the vectorizer has not been fitted yet,
    # and idf_ below is only available after fitting
    tfidf = vectorizer.fit_transform(text).toarray()
    vocab = {v: i for i, v in enumerate(vectorizer.get_feature_names())}
    idf_vals = vectorizer.idf_
    idf_dict = {i: idf_vals[i] for i in vocab.values()}

    sentiment_analyzer = VS()

    def count_twitter_objs(text):
        space_pattern = r'\s+'
        giant_url_regex = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
                           r'[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        mention_regex = r'@[\w\-]+'
        hashtag_regex = r'#[\w\-]+'
        parsed_text = re.sub(space_pattern, ' ', text)
        parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text)
        parsed_text = re.sub(mention_regex, 'MENTIONHERE', parsed_text)
        parsed_text = re.sub(hashtag_regex, 'HASHTAGHERE', parsed_text)
        return (parsed_text.count('URLHERE'),
                parsed_text.count('MENTIONHERE'),
                parsed_text.count('HASHTAGHERE'))

    def other_features(text):
        sentiment = sentiment_analyzer.polarity_scores(text)
        words = clean(text)  # text only; a string
        syllables = textstat.syllable_count(words)
        num_chars = sum(len(w) for w in words.split())  # chars in words
        num_chars_total = len(text)
        num_terms = len(text.split())
        num_words = len(words.split())
        avg_syl = round(float(syllables + 0.001) / float(num_words + 0.001), 4)
        num_unique_terms = len(set(words.split()))
        FKRA = round(0.39 * num_words + 11.8 * avg_syl - 15.59, 1)
        FRE = round(206.835 - 1.015 * num_words - 84.6 * avg_syl, 2)
        twitter_objs = count_twitter_objs(text)  # counts of #, @, and http://
        retweet = 1 if "rt" in words.split() else 0  # token check, not substring
        features = [FKRA, FRE, syllables, avg_syl, num_chars, num_chars_total,
                    num_terms, num_words, num_unique_terms,
                    sentiment['neg'], sentiment['pos'], sentiment['neu'],
                    sentiment['compound'],
                    twitter_objs[2], twitter_objs[1], twitter_objs[0], retweet]
        return features

    def get_feature_array(text):
        feats = []
        for t in text:
            feats.append(other_features(t))
        return np.array(feats)

    feats = get_feature_array(text)
    combined = np.concatenate([tfidf, feats], axis=1)

    other_features_names = ["FKRA", "FRE", "num_syllables", "avg_syl_per_word",
                            "num_chars", "num_chars_total", "num_terms",
                            "num_words", "num_unique_words", "vader neg",
                            "vader pos", "vader neu", "vader compound",
                            "num_hashtags", "num_mentions", "num_urls",
                            "is_retweet"]
    variables = [''] * len(vocab)
    for k, v in vocab.items():
        variables[v] = k
    features = variables + other_features_names

    X = pd.DataFrame(combined)
    return X
def getValue():
    stopwords = nltk.corpus.stopwords.words("english")
    other_exclusions = ["#ff", "ff", "rt"]
    stopwords.extend(other_exclusions)
    sentiment_analyzer = VS()
    stemmer = PorterStemmer()

    def preprocess(text_string):
        """
        Accepts a text string and replaces:
        1) urls with URLHERE
        2) lots of whitespace with one instance
        3) mentions with MENTIONHERE

        This allows us to get standardized counts of urls and mentions
        without caring about the specific people mentioned.
        """
        space_pattern = r'\s+'
        giant_url_regex = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
                           r'[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        mention_regex = r'@[\w\-]+'
        parsed_text = re.sub(space_pattern, ' ', str(text_string))
        parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text)
        parsed_text = re.sub(mention_regex, 'MENTIONHERE', parsed_text)
        return parsed_text

    def tokenize(tweet):
        """Removes punctuation & excess whitespace, sets to lowercase,
        and stems tweets. Returns a list of stemmed tokens."""
        tweet = " ".join(re.split("[^a-zA-Z]*", tweet.lower())).strip()
        tokens = [stemmer.stem(t) for t in tweet.split()]
        return tokens

    def basic_tokenize(tweet):
        """Same as tokenize but without the stemming. Returns a list of
        tokens (nltk.pos_tag below needs tokens, not a raw string)."""
        tweet = " ".join(re.split("[^a-zA-Z.,!?]*", tweet.lower())).strip()
        return tweet.split()

    def get_pos_tags(tweets):
        """Takes a list of strings (tweets) and returns a list of
        POS-tag strings."""
        tweet_tags = []
        for t in tweets:
            tokens = basic_tokenize(preprocess(t))
            tags = nltk.pos_tag(tokens)
            tag_list = [x[1] for x in tags]
            tag_str = " ".join(tag_list)
            tweet_tags.append(tag_str)
        return tweet_tags

    def count_twitter_objs(text_string):
        """
        Accepts a text string and replaces:
        1) urls with URLHERE
        2) lots of whitespace with one instance
        3) mentions with MENTIONHERE
        4) hashtags with HASHTAGHERE

        Returns counts of urls, mentions, and hashtags.
        """
        space_pattern = r'\s+'
        giant_url_regex = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
                           r'[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        mention_regex = r'@[\w\-]+'
        hashtag_regex = r'#[\w\-]+'
        parsed_text = re.sub(space_pattern, ' ', text_string)
        parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text)
        parsed_text = re.sub(mention_regex, 'MENTIONHERE', parsed_text)
        parsed_text = re.sub(hashtag_regex, 'HASHTAGHERE', parsed_text)
        return (parsed_text.count('URLHERE'),
                parsed_text.count('MENTIONHERE'),
                parsed_text.count('HASHTAGHERE'))

    def other_features_(tweet):
        """This function takes a string and returns a list of features.
        These include sentiment scores, text and readability scores, as
        well as Twitter-specific features. This is modified to only
        include the features used in the final model."""
        sentiment = sentiment_analyzer.polarity_scores(tweet)
        words = preprocess(tweet)  # text only; a string
        syllables = textstat.syllable_count(words)
        num_chars = sum(len(w) for w in words.split())  # chars in words
        num_chars_total = len(tweet)
        num_terms = len(tweet.split())
        num_words = len(words.split())
        avg_syl = round(float(syllables + 0.001) / float(num_words + 0.001), 4)
        num_unique_terms = len(set(words.split()))
        # Modified FK grade, where avg words per sentence is just num_words / 1
        FKRA = round(0.39 * num_words + 11.8 * avg_syl - 15.59, 1)
        # Modified FRE score, where sentence count is fixed to 1
        FRE = round(206.835 - 1.015 * num_words - 84.6 * avg_syl, 2)
        twitter_objs = count_twitter_objs(tweet)
        features = [FKRA, FRE, syllables, num_chars, num_chars_total,
                    num_terms, num_words, num_unique_terms,
                    sentiment['compound'],
                    twitter_objs[2], twitter_objs[1]]
        return features

    def get_oth_features(tweets):
        """Takes a list of tweets, generates features for each tweet,
        and returns a numpy array of tweet x features."""
        feats = []
        for t in tweets:
            feats.append(other_features_(t))
        return np.array(feats)

    def transform_inputs(tweets, tf_vectorizer, idf_vector, pos_vectorizer):
        """
        Takes a list of tweets, along with the fitted objects used to
        transform the tweets into the format accepted by the model.

        Each tweet is decomposed into:
        (a) an array of TF-IDF scores for a set of n-grams in the tweet,
        (b) an array of POS tag sequences in the tweet,
        (c) an array of features including sentiment, vocab, and readability.

        Returns a pandas DataFrame where each row is the set of features
        for a tweet. The features are a subset selected using a Logistic
        Regression with L1-regularization on the training data.
        """
        tf_array = tf_vectorizer.fit_transform(tweets).toarray()
        tfidf_array = tf_array * idf_vector
        print("Built TF-IDF array")
        pos_tags = get_pos_tags(tweets)
        pos_array = pos_vectorizer.fit_transform(pos_tags).toarray()
        print("Built POS array")
        oth_array = get_oth_features(tweets)
        print("Built other feature array")
        M = np.concatenate([tfidf_array, pos_array, oth_array], axis=1)
        return pd.DataFrame(M)

    def predictions(X, model):
        """Calls predict on the trained model to generate a predicted
        y value for each observation."""
        y_preds = model.predict(X)
        return y_preds

    def class_to_name(class_label):
        """Maps a numeric class label to its name."""
        if class_label == 0:
            return "Hate speech"
        elif class_label == 1:
            return "Offensive language"
        elif class_label == 2:
            return "Neither"
        else:
            return "No label"

    def get_tweets_predictions(tweets, perform_prints=True):
        # Python 3 strings are already unicode, so the original
        # latin1/utf-8 re-encoding is unnecessary; just wrap the single
        # input string in a list.
        tweets = [tweets]
        print(len(tweets), " tweets to classify")
        print("Loading trained classifier... ")
        model = joblib.load('final_model.pkl')
        print("Loading other information...")
        tf_vectorizer = joblib.load('final_tfidf.pkl')
        idf_vector = joblib.load('final_idf.pkl')
        pos_vectorizer = joblib.load('final_pos.pkl')
        print("Transforming inputs...")
        X = transform_inputs(tweets, tf_vectorizer, idf_vector, pos_vectorizer)
        print("Running classification model...")
        predicted_class = predictions(X, model)
        return predicted_class

    def initial(comt):
        print("Loading data to classify...")
        # Tweets obtained here: https://github.com/sashaperigo/Trump-Tweets
        trump_tweets = comt
        trump_predictions = get_tweets_predictions(trump_tweets)
        print("Printing predicted values: ")
        ans = class_to_name(trump_predictions[0])  # single tweet, single label
        print(ans)
        return ans

    # These branches assume getValue() runs inside a Flask request context.
    if request.form['submit_but'] == 'predict':
        comt = request.form['comt']
        my_prediction = initial(comt)
        return render_template('demoForm.html', val=comt, res=my_prediction)
    if request.form['submit_but'] == 'twitter':
        tweet2classify = rt.randtweet()
        my_prediction = initial(tweet2classify)
        return render_template('demoForm.html', val=tweet2classify,
                               res=my_prediction)
def __init__(self, lang='en'):
    self.lang = lang
    self.stopwords = None
    self.stemmer = None
    self.sentiment_analyzer = None
    self.text_processor = None
    INDIC_NLP_RESOURCES = r"../model/indic_nlp_resources/"
    common.set_resources_path(INDIC_NLP_RESOURCES)
    self.pos_tagger = None
    if lang == 'hi':
        self.ht = HindiTokenizer.Tokenizer()
        self.sentiment_analyzer = load_learner(path="../model/hi-sentiment")
        self.stopwords = [x.strip() for x in open("../data/stopwords.txt").readlines()]
        other_exclusions = ["#ff", "ff", "rt"]
        self.stopwords.extend(other_exclusions)
        self.stemmer = None
        self.text_processor = TextPreProcessor(
            # terms that will be normalized
            normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                       'time', 'date', 'number'],
            # terms that will be annotated
            annotate={"hashtag", "allcaps", "elongated", "repeated",
                      'emphasis', 'censored'},
            fix_html=True,  # fix HTML tokens
        )
        loader.load()
        train_data = indian.tagged_sents('hindi.pos')
        self.tnt_pos_tagger = tnt.TnT()
        self.tnt_pos_tagger.train(train_data)
    if lang == 'en':
        self.sentiment_analyzer = VS()
        self.stopwords = nltk.corpus.stopwords.words("english")
        other_exclusions = ["#ff", "ff", "rt"]
        self.stopwords.extend(other_exclusions)
        self.stemmer = PorterStemmer()
        self.text_processor = TextPreProcessor(
            # terms that will be normalized
            normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                       'time', 'date', 'number'],
            # terms that will be annotated
            annotate={"hashtag", "allcaps", "elongated", "repeated",
                      'emphasis', 'censored'},
            fix_html=True,  # fix HTML tokens
            # corpus from which word statistics are used for word segmentation
            segmenter="twitter",
            # corpus from which word statistics are used for spell correction
            corrector="twitter",
            unpack_hashtags=True,       # perform word segmentation on hashtags
            unpack_contractions=True,   # unpack contractions (can't -> can not)
            spell_correct_elong=False,  # spell correction for elongated words
            # the tokenizer should take a string and return a list of tokens
            tokenizer=SocialTokenizer(lowercase=True).tokenize,
            # list of dicts for replacing tokens extracted from the text;
            # more than one dict can be passed
            dicts=[emoticons, slang],
        )
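A usage sketch for this constructor; the enclosing class name is not shown in the snippet, so `Preprocessor` here is hypothetical. pre_process_doc is the standard entry point of ekphrasis's TextPreProcessor:

p = Preprocessor(lang='en')  # hypothetical class name
print(p.text_processor.pre_process_doc("RT @user: caaan't wait!!! #excited"))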