def initialize_document(self, doc, docs_list_mode=False):
    """Tokenize *doc* and extend the token list with its bi- and trigrams.

    When docs_list_mode is False the results are stored on the instance
    (self.doc/self.blob/self.tokens/...) and nothing is returned; when True
    the extended token list is returned without touching instance state.

    BUG FIX: in docs_list_mode the trigrams were previously built from a
    token list that had already been extended with bigrams, while the
    instance-mode branch builds them from the plain unigrams. Both branches
    now derive n-grams from the unigram tokens only.
    """
    if not docs_list_mode:
        self.doc = doc.lower()
        self.blob = TextBlob(text=self.doc, tokenizer=self.tokenizer)
        self.tokens = copy.deepcopy(self.blob.tokens)
        self.bigrams = self.bigramify(self.blob.tokens)
        self.tokens.extend(self.bigrams)
        # Trigrams come from the original unigram tokens, not tokens+bigrams.
        self.trigrams = self.trigramify(self.blob.tokens)
        self.tokens.extend(self.trigrams)
    else:
        doc = doc.lower()
        blob = TextBlob(text=doc, tokenizer=self.tokenizer)
        tokens = copy.deepcopy(blob.tokens)
        base_tokens = list(tokens)  # unigrams only, used for n-gram extraction
        tokens.extend(self.bigramify(tokens=base_tokens))
        tokens.extend(self.trigramify(tokens=base_tokens))
        return tokens
def get_half_contrast(feat, text):
    """Set feat['half_contrast'] to the polarity contrast between the first
    and second half of *text*, scaled to [0, 1].

    0 = no difference in polarity
    1 = high difference in polarity
    """
    tokens = nltk.word_tokenize(text, language='english', preserve_line=False)
    if len(tokens) == 1:
        # A single token has no halves to contrast.
        feat['half_contrast'] = 0.0
        return
    mid = int(len(tokens) / 2)
    first_half_polarity = _half_polarity(tokens[:mid])
    second_half_polarity = _half_polarity(tokens[mid:])
    # Polarities are each in [-1, 1], so the difference spans [0, 2]; halve it.
    feat['half_contrast'] = np.abs(first_half_polarity - second_half_polarity) / 2


def _half_polarity(tokens):
    """Rejoin *tokens* into text (gluing punctuation) and return TextBlob's
    polarity, or 0.0 when analysis fails.

    Narrowed from a bare ``except:`` so SystemExit/KeyboardInterrupt are not
    swallowed.
    """
    try:
        joined = "".join(
            " " + tok if tok not in string.punctuation else tok for tok in tokens
        ).strip()
        return TextBlob(joined).sentiment.polarity
    except Exception:
        return 0.0
def get_third_contrast(feat, text):
    """Set feat['third_contrast_12'/'_13'/'_23'] to the pairwise polarity
    contrasts between the three thirds of *text*, each scaled to [0, 1].

    0 = no difference in polarity
    1 = high difference in polarity

    Non-string input yields all-zero features.

    BUG FIX: the 3+-token branch previously passed raw token *lists* to
    TextBlob(), which raised and silently forced every polarity to 0.0; the
    tokens are now joined back into text first (as the 2-token branch did).
    """
    if type(text) is not str:
        feat['third_contrast_12'] = 0
        feat['third_contrast_13'] = 0
        feat['third_contrast_23'] = 0
        return
    tokens = nltk.word_tokenize(text, language='english', preserve_line=False)
    if len(tokens) < 2:
        # Nothing to contrast.
        feat['third_contrast_12'] = 0.0
        feat['third_contrast_13'] = 0.0
        feat['third_contrast_23'] = 0.0
    elif len(tokens) == 2:
        p1 = _segment_polarity([tokens[0]])
        p2 = _segment_polarity([tokens[1]])
        feat['third_contrast_13'] = 0.0
        feat['third_contrast_23'] = 0.0
        feat['third_contrast_12'] = np.abs(p1 - p2) / 2
    else:
        third = int(len(tokens) / 3)
        p1 = _segment_polarity(tokens[0:third])
        p2 = _segment_polarity(tokens[third:2 * third])
        p3 = _segment_polarity(tokens[2 * third:])
        # Each polarity is in [-1, 1]; halving maps the difference into [0, 1].
        feat['third_contrast_12'] = np.abs(p1 - p2) / 2
        feat['third_contrast_13'] = np.abs(p1 - p3) / 2
        feat['third_contrast_23'] = np.abs(p2 - p3) / 2


def _segment_polarity(tokens):
    """Rejoin *tokens* into text (gluing punctuation) and return TextBlob's
    polarity, or 0.0 when analysis fails."""
    try:
        joined = "".join(
            " " + tok if tok not in string.punctuation else tok for tok in tokens
        ).strip()
        return TextBlob(joined).sentiment.polarity
    except Exception:
        return 0.0
def flat_doc(document, model, extremes=None):
    """Flatten a Solr document into a list of preprocessed vocabulary words.

    Text-bearing fields are translated to English first (best effort).
    Words outside *model*'s vocabulary, stop words, and optionally the
    *extremes* set are removed. Returns ['book'] for an otherwise empty
    result so downstream code never sees an empty document.
    """
    translated_fields = (
        'author.authors.authorName',
        'author.authorBio',
        'description',
        'quotes.quoteText',
    )
    text = ""
    for field in document:
        # Skip non-list fields such as Solr's auto-generated 'id'/'_version_'.
        if not isinstance(document[field], list):
            continue
        for value in document[field]:
            # Language detection + best-effort translation to English.
            if field in translated_fields:
                value_blob = TextBlob(value)
                try:
                    if value_blob.detect_language() != 'en':
                        try:
                            value = value_blob.translate(to='en')
                        except Exception:
                            # NotTranslated: API returned the input unchanged.
                            pass
                except Exception:
                    # TranslatorError: needs at least 3 characters.
                    pass
            text += str(value) + ' '  # flatten the whole document into one string
    words = preprocess_string(text, CUSTOM_FILTERS)
    words = [w for w in words if w not in stop_words]
    if extremes:
        words = [w for w in words if w not in extremes]
    words = [w for w in words if w in model.vocab]
    # Guarantee a non-empty result for downstream consumers.
    return words if words else ['book']
def file_translator(file, language):
    """Print every line of *file* translated to English ("english") or
    Spanish (anything else).

    BUG FIX: converted Python-2 ``print`` statements to ``print()`` calls so
    the function is valid under Python 3, matching the rest of the module.
    """
    with open(file, 'r') as fh:
        for line in fh:
            sentence = TextBlob(line)
            if language == "english":
                print(sentence.translate(to="en"))
            else:
                print(sentence.translate(to="es"))
def word_translator(words):
    """Detect whether *words* is English or Spanish and print its translation
    into the other language.

    BUG FIX: converted Python-2 ``print`` statements to ``print()`` calls.
    Also hoisted ``detect_language()`` so the detection API is hit once
    instead of up to twice.
    """
    b = TextBlob(words)
    language = b.detect_language()
    if language == "en":
        print("The word " + words + " is in english and means",
              b.translate(to="es"))
    elif language == "es":
        print("La palabra " + words +
              " esta en espanol y en ingles significa", b.translate(to="en"))
def get_translate(self, text):
    """Translate the incoming message to English (or to Farsi when it is
    already English), send the result, and remember the sent message so it
    can be edited later via telepot."""
    stripped = text.replace("text:", "")
    blob = TextBlob(stripped)
    target = "en" if blob.detect_language() != 'en' else "fa"
    translated = blob.translate(to=target)
    sent = self.sender.sendMessage(str(translated))
    # Keep an editor + identifier so a later update can modify this message.
    self._editor = telepot.helper.Editor(self.bot, sent)
    self._edit_msg_ident = telepot.message_identifier(sent)
def get_average_contrast(feat, text):
    """Set feat['average_contrast'] to True when the gap between the average
    positive and average negative sentence polarity is at least 0.5 (on a
    0-1 scale), else False.

    0 = no contrast
    1 = high contrast

    BUG FIX: the original only assigned the feature inside nested
    ``if negCount > 0: if posCount > 0:`` blocks, so texts with no negative
    (or no positive) sentences left feat['average_contrast'] unset.
    """
    feat['average_contrast'] = False  # default covers every degenerate path
    if type(text) is not str:
        return
    neg_count = 0
    pos_count = 0
    neg_total = 0.0
    pos_total = 0.0
    try:
        blob = TextBlob(text.strip())
        for sentence in blob.sentences:
            polarity = sentence.sentiment.polarity
            if polarity < 0:
                neg_total += polarity
                neg_count += 1
            elif polarity > 0:
                pos_total += polarity
                pos_count += 1
        if neg_count > 0 and pos_count > 0:
            # Averages are in [-1, 1]; halving maps the spread into [0, 1].
            polarity_dif = ((pos_total / pos_count) - (neg_total / neg_count)) / 2
            feat['average_contrast'] = polarity_dif >= 0.5
    except Exception:
        feat['average_contrast'] = False
def del_a_comment(insta_username):
    """Fetch the comments on the user's post and delete the ones classified
    negative by TextBlob's NaiveBayesAnalyzer.

    BUG FIXES:
    - the comments URL contained the literal text 'media-id' instead of
      interpolating the media id fetched via get_post_id();
    - ``print('..{}').format(x)`` called .format() on print's None return;
    - '%s' placeholders were mixed with str.format(), printing them verbatim.
    """
    media_id = get_post_id(insta_username)
    url = BASE_URL + "/media/{}/comments?access_token={}".format(
        media_id, ACCESS_TOKEN)
    print(url)
    comment_info = requests.get(url).json()
    if comment_info['meta']['code'] != 200:
        print('Status code other than 200 received!')
        return
    if not len(comment_info['data']):
        print('There are no existing comments on the post!')
        return
    # Here's a naive implementation of how to delete the negative comments:
    for comment in comment_info['data']:
        comment_id = comment['id']
        comment_text = comment['text']
        blob = TextBlob(comment_text, analyzer=NaiveBayesAnalyzer())
        if blob.sentiment.p_neg > blob.sentiment.p_pos:
            print('Negative comment : {}'.format(comment_text))
            delete_url = BASE_URL + "/media/{}/comments/{}/?access_token={}".format(
                media_id, comment_id, ACCESS_TOKEN)
            print('DELETE request url : {}'.format(delete_url))
            delete_info = requests.delete(delete_url).json()
            if delete_info['meta']['code'] == 200:
                print('Comment successfully deleted!\n')
            else:
                print('Unable to delete comment!')
        else:
            print('Positive comment : {}\n'.format(comment_text))
def func_5(filename):
    """Scatter-plot text length against sentiment polarity for each line of
    *filename* (tab-separated; last field is the text), report their Pearson
    correlation, and save the figure as 'sentiment&length.png'.

    BUG FIXES:
    - converted Python-2 ``print`` statements to ``print()`` calls;
    - ``np.corrcoef`` returns the 2x2 correlation matrix, but the original
      printed its rows as "r" and "p". The coefficient is the off-diagonal
      entry, and corrcoef does not compute a p-value at all.
    """
    lengths = []
    polarities = []
    with open(filename, 'r') as f:
        for line in f.readlines():
            # The text is the last tab-separated field on each line.
            text = line.split('\t')[-1]
            lengths.append(len(text))
            polarities.append(TextBlob(text).sentiment.polarity)
    plt.scatter(lengths, polarities, c='g')
    plt.title('sentiment&length')
    plt.xlabel('length')
    plt.ylabel('sentiment')
    lengths = np.array(lengths)
    polarities = np.array(polarities)
    r = np.corrcoef(lengths, polarities)[0, 1]  # off-diagonal = Pearson's r
    print('question 5:')
    print('Correlation between length/polarity: r = {}'.format(r))
    plt.savefig('sentiment&length.png')
    plt.show()
def get_extreme_contrast(feat, text):
    """Set feat['extreme_contrast'] to True when the spread between the most
    positive and most negative sentence polarity is at least 0.5 (on a 0-1
    scale), else False. Non-string input yields False.

    0 = no contrast
    1 = high contrast
    """
    if type(text) is not str:
        feat['extreme_contrast'] = False
        return
    min_polarity = 0.0
    max_polarity = 0.0
    try:
        blob = TextBlob(text.strip())
        for sentence in blob.sentences:
            polarity = sentence.sentiment.polarity
            if polarity > max_polarity:
                max_polarity = polarity
            elif polarity < min_polarity:
                min_polarity = polarity
        # Polarities span [-1, 1]; halving maps the spread into [0, 1].
        score = (max_polarity - min_polarity) / 2
        feat['extreme_contrast'] = score >= 0.5
    except Exception:
        # Narrowed from bare except: don't swallow SystemExit/KeyboardInterrupt.
        feat['extreme_contrast'] = False
def get_sentiment_for_text(text):
    """
    Returns the sentiment score for the given text.

    Uses TextBlob's NaiveBayesAnalyzer; the result is a
    (classification, p_pos, p_neg) tuple.
    """
    sentiment = TextBlob(text, analyzer=NaiveBayesAnalyzer()).sentiment
    return (sentiment.classification, sentiment.p_pos, sentiment.p_neg)
def find_concepts(self, paragraph: str) -> dict:
    """Run MedScan markup over each sentence of *paragraph* and collect the
    annotated concepts.

    Each sentence is marked up via self.markup_sentence(); 'ID{...=...}'
    annotations are parsed out of the markup and grouped by id_range
    (MedScan IDs bucketed by millions).

    Returns: {medscan_markup: {id_range: {msid: obj_name}}}.
    """
    blob = TextBlob(paragraph)
    sentences = list(blob.sentences)
    to_return = dict()
    for sentence in sentences:
        medscan_markup = self.markup_sentence(str(sentence))
        range2dict = dict()  #{id_range:{id:obj_name}}
        # Scan the markup for 'ID{msid[,msid...]=...}' annotations.
        # NOTE(review): '> 0' skips an annotation at position 0 — confirm intended.
        markup_pos = medscan_markup.find('ID{')
        while markup_pos > 0:
            markup_start = markup_pos + 3
            id_end = medscan_markup.find('=', markup_start)
            msids = list(medscan_markup[markup_start:id_end].split(','))
            # Bucket by the millions digit of the first ID.
            id_range = (int(msids[0]) // 1000000) * 1000000
            # When several IDs are listed the first one is skipped —
            # presumably a parent/umbrella ID; TODO confirm.
            first_msid = 0 if len(msids) == 1 else 1
            markup_end = medscan_markup.find('}', markup_start + 5)
            if markup_end < 0:
                break  #hack for broken markup
            for i in range(first_msid, len(msids)):
                msid = msids[i]
                try:
                    obj_name = self.objnames[msid]
                except KeyError:
                    # Fall back to the literal text between '=' and '}'.
                    obj_name = medscan_markup[id_end + 1:markup_end]
                    print(
                        '"%s" with MedScan ID %s doesn\'t have object name'
                        % (obj_name, msid))
                try:
                    range2dict[id_range][msid] = obj_name
                except KeyError:
                    range2dict[id_range] = {msid: obj_name}
            markup_pos = medscan_markup.find('ID{', markup_end + 1)
        to_return[medscan_markup] = range2dict
    return to_return  # {medscan_markup:{id_range:{id:obj_name}}}, str
def handle_message_event(event):
    """LINE bot message dispatcher.

    Records the sender (user or group) via set_send_id(), then replies based
    on keywords in the message text: status queries, report lookups, a few
    canned replies, and finally — if the text is detected as Russian — its
    Traditional Chinese translation.
    """
    print(event)
    text = event.message.text
    source = event.source
    # NOTE(review): 'id' shadows the builtin; kept as-is in this doc-only pass.
    id = ''
    if isinstance(source, SourceUser):
        id = source.user_id
    elif isinstance(source, SourceGroup):
        id = source.group_id
    set_send_id(id)
    blob = TextBlob(text)
    if '狀態' in text:
        # Status query: remainder of the text (if any) selects one name.
        text = text.replace('狀態', '')
        if text == '':
            line_bot_api.reply_message(
                event.reply_token,
                TextSendMessage(text=get_all_messages()))
        else:
            name = text
            line_bot_api.reply_message(event.reply_token,
                                       TextSendMessage(text=get_message(name)))
    elif '報告' in text:
        # Report lookup: '<name>報告<number>' — name and number both optional.
        matches = re.search('(.*)報告(\d*)', text)
        if matches.group(1) == '' and matches.group(2) == '':
            line_bot_api.reply_message(event.reply_token,
                                       TextSendMessage(text=get_all_reports()))
        elif matches.group(2) == '':
            line_bot_api.reply_message(
                event.reply_token,
                TextSendMessage(text=get_report(matches.group(1))))
        else:
            line_bot_api.reply_message(
                event.reply_token,
                TextSendMessage(text=get_report_url(matches.group(1),
                                                    int(matches.group(2)))))
    elif '敬禮' in text:
        # Canned echo replies below.
        line_bot_api.reply_message(event.reply_token, TextSendMessage(text='敬禮'))
    elif '安安' in text:
        line_bot_api.reply_message(event.reply_token, TextSendMessage(text='安'))
    elif '0.0' in text:
        line_bot_api.reply_message(event.reply_token, TextSendMessage(text='0.0'))
    elif blob.detect_language() == 'ru':
        # Russian text: reply with its Traditional Chinese translation.
        line_bot_api.reply_message(
            event.reply_token,
            TextSendMessage(text=str(blob.translate(to='zh-TW'))))
def __get_blob(self, text):
    """
    Translate text with current user locale
    @param text as str
    """
    try:
        # First entry of the user's locale list, e.g. 'en_US.UTF-8' -> 'en_US'.
        user_code = GLib.get_language_names()[0].split(".")[0]
        try:
            from textblob.blob import TextBlob
        except:
            return _("You need to install python3-textblob module")
        return str(TextBlob(text).translate(to=user_code))
    except Exception as e:
        Logger.error("LyricsView::__get_blob(): %s", e)
        return _("Can't translate this lyrics")
def test_tag_textblob(self):
    """TextBlob with a trained PerceptronTagger tags the words and drops
    punctuation."""
    tagger = PerceptronTagger()
    blob = TextBlob(self.text, pos_tagger=tagger)
    expected = [
        'Simple', 'is', 'better', 'than', 'complex',
        'Complex', 'is', 'better', 'than', 'complicated',
    ]
    # Punctuation is excluded from blob.tags.
    assert_equal([word for word, _tag in blob.tags], expected)
def get_subjectivity_score(feat, text):
    """Set feat['subjectivity'] to TextBlob's subjectivity for *text*.

    0 = very objective
    1 = very subjective

    Falls back to 0.0 when the text cannot be analysed.
    """
    try:
        feat['subjectivity'] = TextBlob(text.strip()).sentiment.subjectivity
    except Exception:
        # Narrowed from bare except: don't swallow SystemExit/KeyboardInterrupt.
        feat['subjectivity'] = 0.0
def get_polarity_score(feat, text):
    """Set feat['polarity'] to TextBlob's polarity for *text*.

    -1 = very negative
     1 = very positive

    Falls back to 0.0 when the text cannot be analysed.
    """
    try:
        feat['polarity'] = TextBlob(text.strip()).sentiment.polarity
    except Exception:
        # Narrowed from bare except: don't swallow SystemExit/KeyboardInterrupt.
        feat['polarity'] = 0.0
def import_text(text, title=None):
    """
    Import a text.

    Creates and saves a Document with detected language, sentiment polarity
    and subjectivity. When *title* is omitted it is derived from the first
    sentence, truncated at a word boundary (with '...') when longer than 50
    characters.

    BUG FIX: short first sentences (<= 50 chars) previously left *title* as
    None; they now become the title verbatim.
    """
    blob = TextBlob(text)
    document = Document()
    if title is None:
        first_sentence = blob.sentences[0]
        if len(first_sentence) > 50:
            # Truncate at the last space before character 47, then add '...'.
            chunk = first_sentence[0:47]
            last_space = chunk.rfind(" ")
            title = "{0}...".format(first_sentence[0:last_space])
        else:
            title = str(first_sentence)
    document.title = title
    document.text = text
    document.language = blob.detect_language()
    sentiment = blob.sentiment
    document.polarity = sentiment[0]
    document.subjectivity = sentiment[1]
    document.is_tagged = False
    document.save()
    return document
def get_subjectivity_score(feat, text):
    """Set feat['subjectivity'] to TextBlob's subjectivity as an integer
    percentage (0-100). Non-string or unanalysable input yields 0.

    0 = very objective
    1 = very subjective (before scaling)
    """
    if type(text) is not str:
        feat['subjectivity'] = 0
        return
    try:
        blob = TextBlob(text.strip())
        feat['subjectivity'] = int(blob.sentiment.subjectivity * 100)
    except Exception:
        # Narrowed from bare except: don't swallow SystemExit/KeyboardInterrupt.
        feat['subjectivity'] = 0
def freq(self, word, docs=None):
    """Count occurrences of *word* (including in bi-/trigrams).

    With docs=None the pre-initialized token list is searched; otherwise
    *docs* (a string, or an iterable of strings that gets joined) is
    tokenized on the fly and extended with its n-grams first.
    """
    if docs is None:
        return self.tokens.count(word)
    if not isinstance(docs, str):
        # Join an iterable of docs into one string. Like the original
        # accumulator, the result keeps a leading space.
        docs = "".join(" %s" % (item,) for item in docs)
    blob = TextBlob(text=docs, tokenizer=self.tokenizer)
    blob.tokens.extend(self.bigramify(blob))
    blob.tokens.extend(self.trigramify(blob))
    return blob.tokens.count(word)
def get_polarity_score(feat, text):
    """Set feat['polarity'] to the *magnitude* of TextBlob's polarity as an
    integer percentage (0-100). Non-string or unanalysable input yields 0.

    -1 = very negative
     1 = very positive (before abs + scaling)
    """
    if type(text) is not str:
        feat['polarity'] = 0
        return
    try:
        blob = TextBlob(text.strip())
        feat['polarity'] = int(np.abs(blob.sentiment.polarity) * 100)
    except Exception:
        # Narrowed from bare except: don't swallow SystemExit/KeyboardInterrupt.
        feat['polarity'] = 0
def main():
    """Read one article via read_in(), translate its Chinese content to
    English, then call the local Flask API (port 5000) to update the CSV,
    retrain, and predict.

    Note: in the dev environment 8080 is the node app port, while 5000 is
    the python flask app port. The /update endpoint generates the id field
    itself from the existing CSV.
    """
    article = read_in()  # [title, author, date, content]
    title = article[0]
    author = article[1]
    date = article[2]
    en_content = TextBlob(article[3]).translate(from_lang="zh-CN", to='en')
    info("Translated texts: " + str(en_content))  # log to web console
    headers = {'X-API-TOKEN': 'FOOBAR1'}
    # Combine the translated result with ada-content-en.csv to produce a new csv.
    resp = requests.get("http://localhost:5000/update",
                        headers=headers,
                        data={'title': title, 'author': author,
                              'date': date, 'content': en_content})
    info("INFO: " + resp.text)
    # Retrain on the updated backup.csv.
    resp_train = requests.get("http://localhost:5000/train",
                              headers=headers,
                              data={'data-url': 'backup.csv'})
    info("INFO: " + resp_train.text)
    # Predict on the updated backup.csv.
    resp_predict = requests.post("http://localhost:5000/predict",
                                 headers=headers,
                                 data={'item': '-1', 'num': 2,
                                       'data-url': 'backup.csv'})
    info("INFO: " + resp_predict.text)
def removeStopWords(definition):
    """Tokenise *definition*, drop stop words, and return parallel lists
    (filtered_words, pos_tags).

    BUG FIX: the original regex ``[A-Za-z0-9]*`` also matched empty strings,
    producing runs of spaces that a no-op ``.replace(" ", " ")`` failed to
    collapse; ``+`` keeps only real alphanumeric tokens so single spacing is
    guaranteed.
    """
    # Strip special signs: keep alphanumeric runs, space-separated.
    definition = " ".join(re.findall(r"[A-Za-z0-9]+", definition))
    blob = TextBlob(definition)
    filtered_sentence = []  # words kept from the definition
    word_cluster = []       # POS tag ("word cluster") for each kept word
    # Tag every word and keep the ones that are not stop words.
    for word, pos in blob.tags:
        if word not in stop_words:
            filtered_sentence.append(word)
            word_cluster.append(pos)
    return (filtered_sentence, word_cluster)
def get_extreme_contrast(feat, text):
    """Set feat['extreme_contrast'] to the spread between the most positive
    and most negative sentence polarity, scaled to [0, 1].

    0 = no contrast
    1 = high contrast

    Falls back to 0.0 when the text cannot be analysed.
    """
    min_polarity = 0.0
    max_polarity = 0.0
    try:
        blob = TextBlob(text.strip())
        for sentence in blob.sentences:
            polarity = sentence.sentiment.polarity
            if polarity > max_polarity:
                max_polarity = polarity
            elif polarity < min_polarity:
                min_polarity = polarity
        # Polarities span [-1, 1]; halving maps the spread into [0, 1].
        feat['extreme_contrast'] = (max_polarity - min_polarity) / 2
    except Exception:
        # Narrowed from bare except: don't swallow SystemExit/KeyboardInterrupt.
        feat['extreme_contrast'] = 0.0
def __get_features_from_tweet_text(self, tweet_text):
    """This function returns the following features from the tweet text:
    - Adjectives and their corresponding frequencies found in the tweet.
      Each adjective is a separate feature.
    - Subjectivity and polarity as determined by TextBlob.

    :returns: (key,value) map of all features found.

    BUG FIX: ``dict(a.items() + b.items())`` is Python-2 only — dict_items
    objects do not support '+' in Python 3. The maps are now merged via
    copy-and-update, which works on both versions.
    """
    text_blob = TextBlob(tweet_text,
                         np_extractor=self.np_extractor,
                         pos_tagger=self.pos_tagger)
    # Frequencies over the *set* of (word, tag) pairs, as before.
    adjective_map = dict(
        Counter(word for word, tag in set(text_blob.pos_tags)
                if tag == self.ADJECTIVE))
    polarity = text_blob.sentiment[0]
    subjectivity = text_blob.sentiment[1]
    features = dict(adjective_map)
    features[self.POLARITY_FEATURE_KEY] = polarity
    features[self.SUBJECTIVITY_FEATURE_KEY] = subjectivity
    return features
def get_polarity_score(feat, text):
    """Set boolean feat['positivity'] / feat['negativity'] flags from
    TextBlob's polarity for *text*. Neutral, non-string, or unanalysable
    input leaves both flags False.

    -1 = very negative
     1 = very positive

    BUG FIX: the original tested ``score is 0`` — an identity check that is
    effectively never true for a computed float — so neutral texts fell
    through to the ``else`` branch and were marked positive.
    """
    feat['positivity'] = False
    feat['negativity'] = False
    if type(text) is not str:
        return
    try:
        score = TextBlob(text.strip()).sentiment.polarity
        if score < 0:
            feat['negativity'] = True
        elif score > 0:
            feat['positivity'] = True
        # score == 0: both flags stay False (neutral).
    except Exception:
        feat['positivity'] = False
        feat['negativity'] = False
def get_subjectivity_score(feat, text):
    """Set boolean feat['subjectivity'] / feat['objectivity'] flags from
    TextBlob's subjectivity for *text*. An exactly neutral score (0.5),
    non-string, or unanalysable input leaves both flags False.

    0 = very objective
    1 = very subjective

    BUG FIX: the original tested ``score is 0.5`` — float identity, which is
    always False for a computed value — so exactly-neutral texts were
    classified subjective instead of neutral.
    """
    feat['subjectivity'] = False
    feat['objectivity'] = False
    if type(text) is not str:
        return
    try:
        score = TextBlob(text.strip()).sentiment.subjectivity
        if score < 0.5:
            feat['objectivity'] = True
        elif score > 0.5:
            feat['subjectivity'] = True
        # score == 0.5: both flags stay False (neutral).
    except Exception:
        feat['subjectivity'] = False
        feat['objectivity'] = False
def translate(p):
    """Translate *p* from Polish via TextBlob; return 'Translation error'
    when the translator rejects or cannot change the input."""
    try:
        blob = TextBlob(p)
        return str(blob.translate(from_lang='pl'))
    except (NotTranslated, TranslatorError):
        return 'Translation error'
# Scrape each URL row into ada-content.csv (reader/writer come from the
# enclosing scope above this chunk).
for row in reader:
    combo = url2content(row['url'])
    writer.writerow({'id': row['id'], 'title': combo['title'],
                     'author': combo['author'], 'date': combo['date'],
                     'url': row['url'], 'content': combo['combined_string']})
    # BUG FIX (this chunk): Python-2 print statements converted to print().
    print('Processing scraper NO.' + str(row['id']))

### Connect with ada-content.csv to translate content to english version.
with open('ada-content-en.csv', 'w') as target:
    fieldnames = ['id', 'title', 'author', 'date', 'url', 'content']
    writer = csv.DictWriter(target, fieldnames=fieldnames)
    writer.writeheader()
    with open('ada-content.csv') as source:
        reader = csv.DictReader(source.read().splitlines())
        for row in reader:
            # BUG FIX: csv yields str (not bytes) in Python 3, so the former
            # .decode('utf-8') call would raise AttributeError.
            chinese_blob = TextBlob(row['content'])
            en_content = chinese_blob.translate(from_lang="zh-CN", to='en')
            writer.writerow({'id': row['id'], 'title': row['title'],
                             'author': row['author'], 'date': row['date'],
                             'url': row['url'], 'content': en_content})
            print('Processing translator NO. ' + str(row['id']))
from textblob.blob import TextBlob

# Translate the word 'Уровень' into Traditional Chinese and print the result.
print(str(TextBlob('Уровень').translate(to='zh-TW')))
def sentimentTB(sentence):
    """Return TextBlob's polarity score for *sentence*."""
    from textblob.blob import TextBlob
    return TextBlob(sentence).sentiment.polarity