def prep_text(txt):
    '''
    Process a text string into a list of non-stopwords
    (tokens intended for lookup in the Google word2vec vocabulary).
    :param txt: text string
    :return: list of words that are not stopwords
    '''
    # keep only the tokens that are not stopwords
    return [word for word in Sentence(txt.lower()).words if word not in STOPWORDS]
def WM_prep_text(txt):
    '''
    Process a text string into lemmatized non-stopwords.
    :param txt: text string
    :return: Sentence built from the non-stopword, lemmatized tokens
    '''
    word_list = []
    postags = Sentence(txt.lower()).pos_tags
    for word, pos in postags:
        if word in STOPWORDS:
            continue
        if pos[0] in POSMAP:
            # lemmatize with the WordNet POS mapped from the Penn Treebank tag prefix
            word_list.append(Word(word).lemmatize(POSMAP[pos[0]]))
        else:
            word_list.append(word)
    return Sentence(' '.join(word_list))
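# Usage sketch for the two preprocessors above. STOPWORDS and POSMAP are assumed
# here (a plain set of stopwords and a Penn-Treebank-prefix -> WordNet-POS mapping);
# the real project may define them differently.
from textblob import Sentence, Word

STOPWORDS = {'the', 'is', 'a', 'an'}
POSMAP = {'N': 'n', 'V': 'v', 'J': 'a', 'R': 'r'}

print(prep_text('The cats are sleeping'))     # -> ['cats', 'are', 'sleeping']
print(WM_prep_text('The cats are sleeping'))  # roughly Sentence("cat be sleep"), depending on the POS tagger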
def get_features(features, operation='train'):
    row = features.shape[0]
    phrase_vectors1 = translate(features[:, 0].astype(str), table=translator)
    phrase_vectors2 = translate(features[:, 1].astype(str), table=translator)

    filename = os.path.join(dir_path, 'data', 'sentiment_vectors_' + operation)
    if not os.path.exists(filename):
        sentiment_vector1 = np.array(
            [Sentence(each).polarity for each in phrase_vectors1]).reshape(row, 1)
        sentiment_vector2 = np.array(
            [Sentence(each).polarity for each in phrase_vectors2]).reshape(row, 1)
        with open(filename, 'wb') as f:
            pickle.dump(sentiment_vector1, f)
            pickle.dump(sentiment_vector2, f)
    else:
        with open(filename, 'rb') as f:
            sentiment_vector1 = pickle.load(f)
            sentiment_vector2 = pickle.load(f)

    filename = os.path.join(dir_path, 'data', 'raw_phrase_vectors_' + operation)
    if not os.path.exists(filename):
        phrase_vectors1 = np.vectorize(get_phrase_vector_obj)(phrase_vectors1)
        phrase_vectors2 = np.vectorize(get_phrase_vector_obj)(phrase_vectors2)
        with open(filename, 'wb') as f:
            pickle.dump(phrase_vectors1, f)
            pickle.dump(phrase_vectors2, f)
    else:
        with open(filename, 'rb') as f:
            phrase_vectors1 = pickle.load(f)
            phrase_vectors2 = pickle.load(f)

    filename = os.path.join(dir_path, 'data', 'processed_phrase_vectors_' + operation)
    if not os.path.exists(filename):
        phrase_vectors1 = get_phrase_vector(phrase_vectors1).reshape(row, 300)
        phrase_vectors2 = get_phrase_vector(phrase_vectors2).reshape(row, 300)
        with open(filename, 'wb') as f:
            pickle.dump(phrase_vectors1, f)
            pickle.dump(phrase_vectors2, f)
    else:
        with open(filename, 'rb') as f:
            phrase_vectors1 = pickle.load(f)
            phrase_vectors2 = pickle.load(f)

    features = np.concatenate(
        (sentiment_vector1, sentiment_vector2, phrase_vectors1, phrase_vectors2),
        axis=1)
    return features
def get_random_lorem_ipsum_sentance() -> str:
    """get lorem ipsum sentence"""
    lorem_sentence = lorem.sentence()
    if decision(lorem_ipsum_fuck_probability):
        lorem_sentence = fix_punctuation_spacing(
            TreebankWordDetokenizer().detokenize(
                recumpile_sentence(Sentence(lorem_sentence))
            )
        )
    return lorem_sentence
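# The decision() gate used above (and throughout recumpile_sentence below) is not
# defined in these snippets; a minimal sketch, assuming it is a simple Bernoulli
# draw against a probability in [0, 1]:
import random

def decision(probability: float) -> bool:
    # return True with the given probability
    return random.random() < probability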
def generate_wordlist_emb(txt):
    '''
    Process a text string into a list of word embeddings.
    'I love you' -> ['I', 'love', 'you'] -> [emb('I'), emb('love'), emb('you')]
    :param txt: text string
    :return: list of embedded words
    '''
    word_seq = Sentence(txt.lower()).words
    wordlist = []
    for word in word_seq:
        if word in WORD_EMB.vocab:
            wordlist.append(WORD_EMB[word])
        else:
            # shape depends on the embedding dimension;
            # use [1., 1., 1., ...] to represent an unknown word
            wordlist.append(
                np.full(shape=[300], fill_value=1.0, dtype=np.float32))
    return wordlist
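# Usage sketch, assuming WORD_EMB is a 300-dimensional gensim KeyedVectors model
# (e.g. the GoogleNews word2vec vectors); the loading path is an assumption.
# Note that .vocab is the gensim 3.x attribute; on gensim 4.x the membership test
# would be `word in WORD_EMB.key_to_index`.
import numpy as np
from gensim.models import KeyedVectors

WORD_EMB = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin', binary=True)

vectors = generate_wordlist_emb('I love you')
print(len(vectors), vectors[0].shape)  # 3 (300,)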
def get_sentiment(df_headlines):
    ls_titles = df_headlines['title'].tolist()
    project_dir = str(Path.cwd()) + "/finbert/finBERT"
    cl_path = project_dir + "/models/classifier_model/finbert-sentiment"
    model = BertForSequenceClassification.from_pretrained(cl_path,
                                                          cache_dir=None,
                                                          num_labels=3)
    ldf_bert_title_sentiments = [predict(title, model) for title in ls_titles]
    df_bert_title_sentiments = pd.concat(ldf_bert_title_sentiments)
    df_bert_title_sentiments.reset_index(inplace=True, drop=True)

    # use TextBlob to separate the strings into sentences, then evaluate their
    # sentiment with TextBlob alongside the finBERT predictions
    blob = TextBlob(ls_titles[0])
    for title in ls_titles[1:]:
        blob.sentences.append(Sentence(title))
    ss_titles = pd.Series([sentence.raw for sentence in blob.sentences])
    sf_title_sentiment = pd.Series(
        [sentence.sentiment.polarity for sentence in blob.sentences])
    df_textblob_title_sentiments = pd.DataFrame()
    df_textblob_title_sentiments['title'] = ss_titles
    df_textblob_title_sentiments['textblob_sentiment_prediction'] = sf_title_sentiment

    i_temp_len = len(df_bert_title_sentiments)
    df_bert_title_sentiments['title'] = df_bert_title_sentiments['sentence']
    del df_bert_title_sentiments['sentence']
    df_bert_title_sentiments = df_bert_title_sentiments.merge(
        df_textblob_title_sentiments, on='title', how='inner')
    print(str((len(ldf_bert_title_sentiments) / i_temp_len) * 100) +
          "% of bert/textblob sentiments merged!")
    print("Tuned BERT model complete!! " + str(len(df_bert_title_sentiments)) +
          " financial headlines have been processed successfully!")
    print(f'Average headline sentiment is '
          f'{df_bert_title_sentiments.sentiment_score.mean():.2f}.')
    return df_bert_title_sentiments
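# Usage sketch for get_sentiment(). It only requires a DataFrame with a 'title'
# column; predict(), BertForSequenceClassification, and the finBERT classifier
# weights at the path above come from the surrounding project and are assumed to
# be available here.
import pandas as pd

df_headlines = pd.DataFrame({
    'title': [
        'Company profits beat expectations this quarter.',
        'Regulators open an investigation into the bank.',
    ]
})
df_scored = get_sentiment(df_headlines)
print(df_scored[['title', 'sentiment_score', 'textblob_sentiment_prediction']])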
def create_pandas_dataframe_from_text_par(texts_dic, selectedTerms, ndici, titlename):
    dfst = pd.DataFrame(
        columns=["%s selected terms" % titlename, "Frequencies"])
    dflines = pd.DataFrame(columns=[
        "start", 'end', 'sentence_length', 'sentence', 'narrator', 'protagonists',
        '#_of_protagonists', 'polarity', 'subjectivity'
    ])
    u = 1
    selectedTermsDic = {}
    selectedTermsDics = Counter()
    occurlist = []
    coccurlist = []
    occurdict = Counter()
    # all_sents=blobbook.sentences
    sec_prot = nx.MultiGraph()
    uu = 0
    # for sen in all_sents:
    #     dd=sen.dict
    for actor in texts_dic:
        for countr, sent in texts_dic[actor].items():
            sen = Sentence(sent)
            dd = sen.dict
            ssdd = [i for i in dd['noun_phrases'] if i in selectedTerms]
            # for ssdi in issdd:
            #     print ssdi,actor,issdd
            #     ssdd=[actor,ssdi]
            #     ssdd.append(actor)
            nssdd = list(set([ndici[i] for i in ssdd]))
            narrat = ndici[actor]
            selectedTermsDics[narrat] += 1
            # print dd['start_index']+countr,dd['end_index']+countr,dd['end_index']-dd['start_index'],dd['raw'],nssdd,len(nssdd),dd['polarity'],dd['subjectivity']
            # print uu
            dflines.loc[uu] = [
                dd['start_index'] + countr, dd['end_index'] + countr,
                dd['end_index'] - dd['start_index'], dd['raw'], narrat, nssdd,
                len(nssdd), dd['polarity'], dd['subjectivity']
            ]
            if len(nssdd) > 0:
                for j in nssdd:
                    selectedTermsDics[j] += 1
                    coccurlist.append([[narrat, j], dd['polarity'], dd['subjectivity']])
                    occurlist.append([(narrat, j), dd['polarity'], dd['subjectivity']])
                    sec_prot.add_edge(uu, j)
                    sec_prot.add_node(uu, polarity=dd['polarity'],
                                      subjectivity=dd['subjectivity'])
            # if len(ssdd)==2:
            #     coccurlist.append([[ndici[ssdd[0]],ndici[ssdd[1]]],dd['polarity'],dd['subjectivity']])
            #     occurlist.append([tuple(sorted([ndici[ssdd[0]],ndici[ssdd[1]]])),dd['polarity'],dd['subjectivity']])
            #     for jk in nssdd:
            #         sec_prot.add_edge(uu,jk)
            #         sec_prot.add_node(uu,polarity=dd['polarity'],subjectivity=dd['subjectivity'])
            # elif len(ssdd)>2:
            #     for jj in it.combinations(ssdd,2):
            #         occurlist.append([tuple(sorted([ndici[jj[0]],ndici[jj[1]]])),dd['polarity'],dd['subjectivity']])
            #         coccurlist.append([[ndici[jj[0]],ndici[jj[1]]],dd['polarity'],dd['subjectivity']])
            #     for jk in nssdd:
            #         sec_prot.add_edge(uu,jk)
            #         sec_prot.add_node(uu,polarity=dd['polarity'],subjectivity=dd['subjectivity'])
            uu += 1
    for i in occurlist:
        occurdict[i[0]] += 1
    u = 0
    for l, v in selectedTermsDics.items():
        dfst.loc[u] = [l, v]
        u += 1
    return dfst, sec_prot, coccurlist, occurlist, dflines
def recumpile_sentence(sentence: Sentence) -> List[str]:
    new_tokens = []

    # TODO: determine mood classifier for sentence and add respective emoji
    sentiment_emoji = None
    if decision(0.89):
        sentiment_emoji = get_sentiment_emoji(sentence)

    for token in sentence.tokenize(TweetWordTokenizer()):
        # TODO: this is only for discord so we don't break tokenization
        if re.match(
            r"@everyone|@here|<:[^:\s]+:[0-9]+>|<a:[^:\s]+:[0-9]+>|<(?:@!?\d+|:[A-Za-z0-9]+:)\w+>",
            token,
        ):
            new_tokens.append(token)
            continue

        emoji = None
        alias_emoji = get_cheap_emoji_alias(token)

        # TODO: refactor into its own mutator
        if decision(0.9) and (re.match("among", token, flags=re.IGNORECASE)
                              or re.match("amogus", token, flags=re.IGNORECASE)
                              or re.match(r"su+s", token, flags=re.IGNORECASE)):
            emoji = "ඞ"

        emoticon = get_emoticon(token)

        if alias_emoji:
            if decision(0.1) or (len(str(token)) == 1 and decision(0.9)):
                new_tokens.append(alias_emoji)
                continue
            else:
                if decision(0.5):
                    new_tokens.append(alias_emoji)

        if decision(0.5):
            emoji = get_emoji_from_data(token)
        if decision(0.3):
            emoji = get_gloveword_emoji(token)
        if emoji:
            if decision(0.5):
                new_tokens.append(emoji)

        if decision(random_synonym_probability):
            token = replace_with_random_synonym(token)

        if decision(0.5) and profanity.contains_profanity(token):
            token = token.upper()

        if decision(censor_profanity_probability) and profanity.contains_profanity(token):
            if decision(0.1):
                token = custom_censoring(token, 1)
            else:
                token = custom_censoring(token, censor_profanity_percent)
        elif decision(random_censor_probability):
            token = custom_censoring(token, random_censor_percent)

        add_husky = bool(re.match("musk", token, flags=re.IGNORECASE))

        # processing
        recumpiled_token = recumpile_token(token)

        # post processing
        new_tokens.append(recumpiled_token)

        if emoji and decision(0.8):
            new_tokens.append(emoji)
        if alias_emoji and decision(0.8):
            new_tokens.append(alias_emoji)
        if emoticon and decision(0.8):
            new_tokens.append(emoticon)

        if add_husky:
            new_tokens.append(recumpile_token("husky"))

        if add_random_garbage and decision(add_random_garbage_probability):
            new_tokens.append(recumpile_token(add_random_garbage_token()))
        if add_randomly_text_face_emoji and decision(
                add_randomly_text_face_emoji_probability):
            new_tokens.append(get_random_text_face_emojis())
        # TODO: use textblob to determine mood of text and insert faces accordingly;
        # likely need to do this after reconstruction of the text blob and go through
        # it sentence by sentence rather than word by word.
        if add_random_simple_text_emoji and decision(
                add_random_simple_text_emoji_probability):
            new_tokens.append(get_random_simple_text_emojis())
        if add_random_rp_action and decision(
                add_random_rp_mid_sentence_action_probability):
            new_tokens.append(get_random_rp_action_sentence())

    if add_random_rp_action and decision(
            add_random_rp_end_sentence_action_probability):
        new_tokens.append(get_random_rp_action_sentence())

    if sentiment_emoji:
        new_tokens.append(sentiment_emoji)
        if decision(0.4):
            for _ in range(5):
                if decision(0.3):
                    new_tokens.append(sentiment_emoji)
                else:
                    break

    return new_tokens
def get_features(features, operation='train'):
    row = features.shape[0]
    operation = operation + '.den'
    phrase_vectors1 = translate(features[:, 0].astype(str), table=translator)
    phrase_vectors2 = translate(features[:, 1].astype(str), table=translator)

    filename = os.path.join(dir_path, 'data', 'sentiment_vectors_' + operation)
    if not os.path.exists(filename):
        sentiment_vector1 = np.array(
            [Sentence(each).polarity for each in phrase_vectors1]).reshape(row, 1)
        sentiment_vector2 = np.array(
            [Sentence(each).polarity for each in phrase_vectors2]).reshape(row, 1)
        with open(filename, 'wb') as f:
            pickle.dump(sentiment_vector1, f)
            pickle.dump(sentiment_vector2, f)
    else:
        with open(filename, 'rb') as f:
            sentiment_vector1 = pickle.load(f)
            sentiment_vector2 = pickle.load(f)

    filename = os.path.join(dir_path, 'data', 'subjective_vectors_' + operation)
    if not os.path.exists(filename):
        subjective_vectors1 = np.array(
            [Sentence(each).subjectivity for each in phrase_vectors1]).reshape(row, 1)
        subjective_vectors2 = np.array(
            [Sentence(each).subjectivity for each in phrase_vectors2]).reshape(row, 1)
        with open(filename, 'wb') as f:
            pickle.dump(subjective_vectors1, f)
            pickle.dump(subjective_vectors2, f)
    else:
        with open(filename, 'rb') as f:
            subjective_vectors1 = pickle.load(f)
            subjective_vectors2 = pickle.load(f)

    filename = os.path.join(dir_path, 'data', 'shared_tokens_' + operation)
    if not os.path.exists(filename):
        shared_token_vector = [0] * len(phrase_vectors1)
        current_index = 0
        for each_question1, each_question2 in zip(phrase_vectors1, phrase_vectors2):
            shared_tokens = 0
            for each_word1 in each_question1.split(' '):
                if each_word1 not in stopwords.words('english'):
                    for each_word2 in each_question2.split(' '):
                        try:
                            wv1 = word_model[each_word1]
                            wv2 = word_model[each_word2]
                            if each_word2 not in stopwords.words('english') and \
                                    cosine_similarity(wv1, wv2)[0][0] >= 0.6:
                                shared_tokens += 1
                        except:
                            # out-of-vocabulary word: fall back to exact string match
                            if each_word1 == each_word2:
                                shared_tokens += 1
            shared_token_vector[current_index] = shared_tokens
            current_index += 1
        with open(filename, 'wb') as f:
            pickle.dump(shared_token_vector, f)
    else:
        with open(filename, 'rb') as f:
            shared_token_vector = pickle.load(f)

    filename = os.path.join(dir_path, 'data', 'fuzzy_wuzzy_partial_ratio_' + operation)
    if not os.path.exists(filename):
        partial_ratio_vector1 = get_fuzzy_partial_vector(phrase_vectors1).reshape(row, 1)
        partial_ratio_vector2 = get_fuzzy_partial_vector(phrase_vectors2).reshape(row, 1)
        with open(filename, 'wb') as f:
            pickle.dump(partial_ratio_vector1, f)
            pickle.dump(partial_ratio_vector2, f)
    else:
        with open(filename, 'rb') as f:
            partial_ratio_vector1 = pickle.load(f)
            partial_ratio_vector2 = pickle.load(f)

    filename = os.path.join(dir_path, 'data', 'raw_phrase_vectors_' + operation)
    if not os.path.exists(filename):
        phrase_vectors1 = np.vectorize(get_phrase_vector_obj)(phrase_vectors1)
        phrase_vectors2 = np.vectorize(get_phrase_vector_obj)(phrase_vectors2)
        with open(filename, 'wb') as f:
            pickle.dump(phrase_vectors1, f)
            pickle.dump(phrase_vectors2, f)
    else:
        with open(filename, 'rb') as f:
            phrase_vectors1 = pickle.load(f)
            phrase_vectors2 = pickle.load(f)

    filename = os.path.join(dir_path, 'data', 'cosine_similarity_vector_' + operation)
    if not os.path.exists(filename):
        cosine_similarity_vector = get_cosine_similarity_vector(
            phrase_vectors1, phrase_vectors2).reshape(row, 1)
        with open(filename, 'wb') as f:
            pickle.dump(cosine_similarity_vector, f)
    else:
        with open(filename, 'rb') as f:
            cosine_similarity_vector = pickle.load(f)

    filename = os.path.join(dir_path, 'data', 'processed_phrase_vectors_' + operation)
    if not os.path.exists(filename):
        phrase_vectors1 = get_phrase_vector(phrase_vectors1).reshape(row, 300)
        phrase_vectors2 = get_phrase_vector(phrase_vectors2).reshape(row, 300)
        with open(filename, 'wb') as f:
            pickle.dump(phrase_vectors1, f)
            pickle.dump(phrase_vectors2, f)
    else:
        with open(filename, 'rb') as f:
            phrase_vectors1 = pickle.load(f)
            phrase_vectors2 = pickle.load(f)

    # shared_token_vector is built as a plain list above, so reshape it to (row, 1)
    # before stacking it with the 2-D feature columns
    features = np.concatenate(
        (np.array(shared_token_vector).reshape(row, 1), cosine_similarity_vector,
         partial_ratio_vector1, partial_ratio_vector2, subjective_vectors1,
         subjective_vectors2, sentiment_vector1, sentiment_vector2,
         phrase_vectors1, phrase_vectors2),
        axis=1)
    return features
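# Usage sketch for get_features(). `features` is assumed to be an (N, 2) array of
# question/phrase pairs, dir_path/'data' must exist so the pickle caches can be
# written, and translate/translator/word_model/get_phrase_vector come from the
# surrounding module.
import numpy as np

pairs = np.array([
    ['How do I learn Python?', 'What is the best way to learn Python?'],
    ['Is the earth flat?', 'How far away is the moon?'],
], dtype=object)

X = get_features(pairs, operation='train')
print(X.shape)  # (2, 608) given the reshapes above: 8 scalar scores + two 300-d phrase vectors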
def text(self, text):
    try:
        # check for text, transform to unicode if necessary
        if text is not None:
            if not (type(text) is unicode or type(text) is str):
                raise TypeError(
                    "supplied text object of type that is not str or unicode")
            else:
                if not type(text) is unicode:
                    text = text.decode('utf8')  # unicode? unicode.
            blob = TextBlob(text)
            blob_nonalpha_thresh = self.nonalpha_thresh(blob)
        else:
            raise ValueError("no input text supplied")

        # replace all instances of sentences broken by newline
        # to ensure that we're dealing with contiguous text
        s1_text = re.sub(r'([a-z\,]+)[\n\r]+?([^A-Z0-9]+?)', r'\1 \2', text)
        s1_list = list()

        # try to remove header-type section labels through the use of some convoluted
        # rule bs. really not elegant, but it works.
        stopwords = sw.words("english")
        for sentence in s1_text.split('\n'):
            # line begins or ends with a number - maybe ToC or heading
            if re.match('(?:^[0-9]+?|[0-9]+?[\n\r]+?$)', sentence):
                continue
            words = sentence.split()
            if len(words) <= 3:
                continue
            for word in words:
                # boring word
                if word.lower() in stopwords:
                    continue
                # not a word - unicode bullets or other nonsense
                if not re.match(r'\w+?', word):
                    continue
                # links
                if re.match(r'^[a-zA-Z]+\:\/\/', word):
                    continue
                # no title case headings
                if not re.match('^[0-9A-Z]{1}', word):
                    s1_list.append(sentence)
                    break

        # let's clear out anything with a nonalpha token ratio higher than the threshold
        s2_list = [
            s for s in s1_list if self.nonalpha_pct(Sentence(s)) < (len(s) / 4)
        ]

        # now that we've got a semi-clean set of data, we can do some statistical analysis
        # to determine if we've got a lot of repeat data like headers/footers/copyrights
        # that can skew our keyword stats
        sentence_counts = Counter(s2_list)
        sc_series = [v for (k, v) in sentence_counts.iteritems()]
        sc_std = np.std(sc_series)
        sc_median = np.median(sc_series)

        # if we have repeating text, rebuild it minus that noise, or anything
        # specified in the global blacklist
        if sc_median >= 1:
            final_list = []
            # some edge cases force us to break outlier "sentences" into smaller units
            # for comparison later
            #
            # once the list is built, we have to check a few different ways to ensure
            # we are removing all the noise we can
            sentence_outliers = [
                k.strip().lower() for (k, v) in sentence_counts.iteritems()
                if v >= (sc_median + (sc_std * 2)) > 1
            ]
            self.global_filterlist += sentence_outliers
            for s in s2_list:
                if s.lower() in self.global_filterlist:
                    continue
                for o in sentence_outliers:
                    if distance(o, s.lower()) < float(len(s) * .35):
                        break
                    elif o in s.lower():
                        break
                else:
                    final_list.append(s)
        # text had no repeats or noise to filter (rare)
        else:
            final_list = s2_list

        # we out
        return " ".join(final_list)
    except Exception as e:
        raise e
num_sentences = 0
num_documents = 0
sentence_text_dict = {}
all_sentences = []
sentence_to_docid = {}

# Read the data and count words / sentences
for inputTextFile in listOfFiles:
    print(inputTextFile)
    with open(inputTextFile, 'r', encoding='utf-8') as content_file:
        csvReader = csv.reader(content_file)
        sentences = [Sentence(sentenceText)
                     for row in csvReader
                     for sentenceText in row]
        all_sentences += sentences
        for sentence in sentences:
            sentence_text_dict[num_sentences] = sentence
            sentence_to_docid[num_sentences] = num_documents
            num_sentences += 1
            for word in sentence.words:
                string = word.encode("utf-8")
                get_word_id(string)
    num_documents += 1

print(all_sentences)

# Generate tf.idf scores and fill in the sparse matrix
tfidfMatrix = scipy.sparse.dok_matrix((num_sentences, nextWordId))
for sentence_id in sentence_text_dict:
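    # The original snippet is truncated at this loop; what follows is a minimal
    # sketch of the tf-idf fill, assuming get_word_id() returns the column index
    # used above and that document_frequency[word_id] holds the number of
    # sentences containing the word (both are assumptions, not part of the
    # snippet). Requires `import math`.
    sentence = sentence_text_dict[sentence_id]
    word_ids = [get_word_id(word.encode("utf-8")) for word in sentence.words]
    for word_id in set(word_ids):
        tf = word_ids.count(word_id) / len(word_ids)
        idf = math.log(num_sentences / (1 + document_frequency[word_id]))
        tfidfMatrix[sentence_id, word_id] = tf * idf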
blob.sentiment.polarity

blob.sentiment.subjectivity

# Getting the Sentiment of a Sentence
for sentence in blob.sentences:
    print(sentence.sentiment)

# Section 12.2.5 Self Check snippets

# Exercise 1
from textblob import Sentence

Sentence('The food is not good.').sentiment

Sentence('The movie was not bad.').sentiment

Sentence('The movie was excellent!').sentiment

# Section 12.2.6 snippets
from textblob.sentiments import NaiveBayesAnalyzer

blob = TextBlob(text, analyzer=NaiveBayesAnalyzer())

blob

blob.sentiment
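# Note: NaiveBayesAnalyzer trains on NLTK's movie-reviews corpus the first time it
# is used, so the corpora must be downloaded once (`python -m textblob.download_corpora`).
# Its result is a Sentiment(classification, p_pos, p_neg) namedtuple rather than the
# default analyzer's (polarity, subjectivity) pair. A small check, with sample_text
# standing in for the section's text:
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer

sample_text = 'The food is not good. The movie was excellent!'
nb_blob = TextBlob(sample_text, analyzer=NaiveBayesAnalyzer())
print(nb_blob.sentiment)  # e.g. Sentiment(classification='pos', p_pos=..., p_neg=...)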
def sentim_sent(stri):
    try:
        tt = Sentence(stri).sentiment
    except Exception:
        tt = (None, None)
    return tt
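# Usage sketch: Sentence(...).sentiment yields a (polarity, subjectivity) namedtuple,
# and the except branch keeps the same two-element shape for unusable input.
print(sentim_sent('The movie was excellent!'))  # roughly Sentiment(polarity=1.0, subjectivity=1.0)
print(sentim_sent(None))                        # (None, None)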