def initialize_document(self, doc, docs_list_mode=False):
    """Tokenize *doc* and augment the unigram tokens with bigrams and trigrams.

    Parameters
    ----------
    doc : str
        Raw document text; lowercased before tokenization.
    docs_list_mode : bool
        When False, results are stored on the instance (self.doc, self.blob,
        self.tokens, self.bigrams, self.trigrams) and None is returned.
        When True, nothing is stored and the combined token list is returned.

    Returns
    -------
    list or None
        The unigram+bigram+trigram token list in docs_list_mode, else None.
    """
    if not docs_list_mode:
        self.doc = doc.lower()
        self.blob = TextBlob(text=self.doc, tokenizer=self.tokenizer)
        self.tokens = copy.deepcopy(self.blob.tokens)
        # Both n-gram sets are derived from the original unigram tokens.
        self.bigrams = self.bigramify(self.blob.tokens)
        self.tokens.extend(self.bigrams)
        self.trigrams = self.trigramify(self.blob.tokens)
        self.tokens.extend(self.trigrams)
    else:
        doc = doc.lower()
        blob = TextBlob(text=doc, tokenizer=self.tokenizer)
        tokens = copy.deepcopy(blob.tokens)
        # BUG FIX: previously the bigrams were appended to `tokens` *before*
        # trigramify ran, so trigrams were computed over unigrams+bigrams —
        # inconsistent with the non-list branch above. Compute both n-gram
        # sets from the pristine unigram tokens first, then extend.
        bigram = self.bigramify(tokens=blob.tokens)
        trigram = self.trigramify(tokens=blob.tokens)
        tokens.extend(bigram)
        tokens.extend(trigram)
        return tokens
def test_tag_blob(self):
    """POS-tag self.text with the custom tagger and verify that the tagged
    tokens line up with the whitespace-split words of the source text."""
    tagged = TextBlob(self.text, pos_tagger=self.tagger).tags
    logging.debug("tags: {0}".format(tagged))
    expected_words = self.text.split()
    for idx, pair in enumerate(tagged):
        assert_equal(pair[0], expected_words[idx])
def sentences_sentiment():
    """Return per-sentence polarity scores for the request text as JSON."""
    blob = TextBlob(get_text(request))
    results = []
    for sent in blob.sentences:
        results.append({
            "sentence": unicode(sent),
            "sentiment": sent.sentiment[0],
        })
    return jsonify({"result": results})
def sentence_to_words(sentence):
    """Convert *sentence* into a list of lowercase words, excluding stop words.

    Parameters
    ----------
    sentence : str
        The sentence to split into words.

    Returns
    -------
    list of str
        Lowercased words with English stop words removed.
    """
    blob = TextBlob(sentence)
    # BUG FIX: the stop-word test previously compared the *original-case*
    # word against NLTK's all-lowercase stopword list, so capitalized stop
    # words ("The", "And", ...) slipped through while their lowercase form
    # was returned. Compare the lowercased word instead.
    # Also hoist the corpus load out of the loop and use a set for O(1)
    # membership tests (stopwords.words() re-reads the list on every call).
    stop_words = set(stopwords.words('english'))
    return [
        word.lower() for word in blob.words
        if word.lower() not in stop_words
    ]
def create_blob(self, request_json):
    """Build a TextBlob from *request_json*, wiring in any custom components
    the request asked for by name (analyzer, np_extractor, pos_tagger)."""
    analyzers = {'NaiveBayesAnalyzer': self.naive_bayes_analyzer}
    extractors = {'ConllExtractor': self.conll_extractor}
    taggers = {
        'NLTKTagger': self.nltk_tagger,
        'PerceptronTagger': self.perceptron_tagger,
    }

    options = {}
    requested_analyzer = request_json.get('analyzer')
    if requested_analyzer in analyzers:
        options['analyzer'] = analyzers[requested_analyzer]
    requested_extractor = request_json.get('np_extractor')
    if requested_extractor in extractors:
        options['np_extractor'] = extractors[requested_extractor]
    requested_tagger = request_json.get('pos_tagger')
    if requested_tagger in taggers:
        options['pos_tagger'] = taggers[requested_tagger]

    return TextBlob(request_json['text'], **options)
def cache_sentences(self):
    """Parse every non-filtered instance attribute into sentences and store
    them in self.cache_list, marking the instance as cached."""
    self.cached = True
    self.cache_list = []
    for attr_name, attr_value in self.__dict__.items():
        if attr_name in filterTag:
            continue
        try:
            self.cache_list.extend(TextBlob(attr_value).sentences)
        except Exception as e:
            # Best-effort: attributes that TextBlob cannot parse are skipped.
            logger.debug("textblob error| %s:%s" % (attr_name, attr_value))
def one_sentence_from(self, quote):
    """Reduce the given quote to a single sentence.

    The choice is biased against the first sentence, which is less
    likely to be the start of a real in-text sentence.
    """
    # NOTE(review): Python 2 `except ..., e` syntax below; would need
    # `except Exception as e` under Python 3.
    blob = TextBlob(quote)
    try:
        sentences = blob.sentences
    except Exception, e:
        # TextBlob can't parse this. Just return the whole string
        return quote
    # NOTE(review): the sentence-selection logic that consumes `sentences`
    # is not visible in this chunk; as shown, the success path falls through
    # and returns None — confirm against the full source.
def to_sentences(self):
    """Yield sentences either from the prebuilt cache or by parsing each
    non-filtered instance attribute on the fly."""
    if self.cached:
        for cached_sentence in self.cache_list:
            yield cached_sentence
        return
    for attr_name, attr_value in self.__dict__.items():
        if attr_name in filterTag:
            continue
        try:
            # .sentences may raise during parsing, so the whole loop stays
            # inside the try, matching the cached-list fallback behavior.
            for parsed_sentence in TextBlob(attr_value).sentences:
                yield parsed_sentence
        except Exception as e:
            logger.debug("textblob error| %s:%s" % (attr_name, attr_value))
def truncate_at_stopword(self, string):
    # Truncate a string at the last stopword not preceded by
    # another stopword.
    # print "%s =>" % string
    if type(string) == Sentence:
        words = string.words
    else:
        try:
            # NOTE(review): this assigns TextBlob(...).sentences to a
            # variable named `words`, while the Sentence branch above uses
            # `.words` — this looks like it should be `.words`; confirm
            # against the full source before changing.
            words = TextBlob(string).sentences
        except Exception, e:
            # TextBlob can't parse this. Just return the whole string
            return string
    # NOTE(review): the truncation logic that consumes `words` is not
    # visible in this chunk — the snippet appears truncated here.
def freq(self, word, docs=None):
    """Count occurrences of *word* (a unigram, bigram, or trigram string).

    Parameters
    ----------
    word : str
        Token to count.
    docs : None, str, or iterable of str
        When None, count within this instance's own token list. Otherwise
        tokenize *docs* (joining an iterable into one string first) and
        count within that ad-hoc document.

    Returns
    -------
    int
        Number of occurrences of *word*.
    """
    if docs is None:
        return self.tokens.count(word)
    if not isinstance(docs, str):
        # Join an iterable of fragments into one document string.
        # (Replaces the previous quadratic "%s %s" accumulation loop.)
        docs = " ".join("%s" % item for item in docs)
    blob = TextBlob(text=docs, tokenizer=self.tokenizer)
    # BUG FIX: bigramify/trigramify were previously handed the TextBlob
    # object itself rather than its token list (elsewhere in this class they
    # receive tokens), and trigrams were computed after the bigrams had
    # already been appended. Compute both n-gram sets from the pristine
    # unigram tokens, then extend.
    bigrams = self.bigramify(blob.tokens)
    trigrams = self.trigramify(blob.tokens)
    blob.tokens.extend(bigrams)
    blob.tokens.extend(trigrams)
    return blob.tokens.count(word)
def __init__(self):
    """Instantiate the custom NLP components and, outside of DEV_ENV,
    warm them up so first-request latency is paid at startup."""
    self.naive_bayes_analyzer = NaiveBayesAnalyzer()
    self.conll_extractor = ConllExtractor()
    self.nltk_tagger = NLTKTagger()
    self.perceptron_tagger = PerceptronTagger()
    if DEV_ENV:
        return
    # Touch each lazy property once so the underlying models are
    # loaded/trained now rather than on the first real request.
    text = 'TextBlob blobs great!'
    warmup_blobs = (
        TextBlob(text),
        TextBlob(text,
                 analyzer=self.naive_bayes_analyzer,
                 np_extractor=self.conll_extractor,
                 pos_tagger=self.nltk_tagger),
    )
    for blob in warmup_blobs:
        blob.sentiment
        blob.noun_phrases
        blob.pos_tags
    TextBlob(text, pos_tagger=self.perceptron_tagger).pos_tags
def rate(cls, s, base_score=1.0, frequencies=None, obscurity_cutoff=None):
    "Rate a string's suitability as an _ebook quote."
    # NOTE(review): Python 2 code (`s.decode("utf8")`, `except ..., e`).
    # The `frequencies` and `obscurity_cutoff` parameters are not used in
    # the portion visible here.
    s = s.strip()
    score = float(base_score)
    # print s
    # print " Starting rating: %.2f" % score
    # People like very short or very long quotes.
    # if len(s) < 40:
    #     score *= 2
    if len(s) > 128:
        score *= 2
    # print " Length bonus: %.2f" % score
    blob = TextBlob(s.decode("utf8"))
    try:
        words = blob.words
    except Exception, e:
        # TODO: I'm sick of trying to get TextBlob to parse
        # strings that include things like ". . . ". Just return
        # the current score.
        return score
    # NOTE(review): the scoring logic that consumes `words` is not visible
    # in this chunk; as shown the success path falls through and returns
    # None — confirm against the full source.
def quotes_in(self, paragraph):
    """Generate candidate _ebook quotes from *paragraph*.

    The paragraph is wrapped to self.wrap_at columns and scanned line by
    line. With some probability a quote is started (possibly backtracking a
    few lines on a keyword match), grown line by line, then randomly
    truncated/cleaned and yielded if it falls within the configured size
    bounds. Capitalized proper-noun-light phrases may also be yielded
    directly.

    Yields
    ------
    unicode
        Candidate quotes.
    """
    para = textwrap.wrap(paragraph, self.wrap_at)
    if len(para) == 0:
        return
    probability = self.probability
    if para[0][0].upper() == para[0][0]:
        # We greatly prefer lines that start with capital letters.
        probability *= 5
    else:
        probability /= 4
    gathering = False
    in_progress = None
    last_yield = None
    for i in range(len(para)):
        line = para[i]
        if gathering:
            # We are currently putting together a quote.
            done = False
            if (random.random() < self.truncate_chance
                    and len(in_progress) >= self.minimum_quote_size):
                # Yield a truncated quote.
                done = True
            else:
                potential = in_progress + ' ' + line.strip()
                if len(potential) >= self.maximum_quote_size:
                    # That would be too long. We're done.
                    done = True
                else:
                    in_progress = potential
            if done:
                quote = in_progress
                in_progress = None
                gathering = done = False
                # Miscellaneous tweaks to increase the chance that
                # the quote will be funny.
                if random.random() < 0.6:
                    quote = self.one_sentence_from(quote)
                if random.random() < 0.4:
                    quote = self.truncate_at_stopword(quote)
                # Quotes that end with two consecutive stopwords
                # are not funny. It would be best to parse every
                # single quote and make sure it doesn't end with
                # two consecutive stopwords. But in practice it's
                # much faster to just check for the biggest
                # offenders, which all end in 'the', and then trim
                # the 'the'.
                low = quote.lower()
                for ending in ('of the', 'in the', 'and the',
                               'on the', 'for the'):
                    if low.endswith(ending):
                        # BUG FIX: this used to be quote[:len(" the")-1],
                        # i.e. quote[:3], which truncated the quote to its
                        # first three characters. Trim the trailing " the"
                        # instead. (Also removed a duplicate 'in the' from
                        # the tuple above.)
                        quote = quote[:-len(" the")]
                        break
                quote = unicode(quote)
                quote = self.remove_ending_punctuation(quote)
                quote = self.remove_beginning_punctuation(quote)
                if random.random() > 0.75:
                    quote = self.truncate_to_common_word(quote)
                if (len(quote) >= self.minimum_quote_size
                        and len(quote) <= self.maximum_quote_size
                        and self.ONE_LETTER.search(quote)):
                    yield quote
                    last_yield = quote
                continue
        else:
            # We are not currently gathering a quote. Should we be?
            # NOTE(review): `r` is drawn but never used — the test below
            # draws a fresh random number. Possibly `r < probability` was
            # intended; kept as-is to preserve behavior.
            r = random.random()
            if random.random() < probability:
                # Run the regular expression and see if it matches.
                m = self.SEVERAL_CAPITALIZED_WORDS.search(line)
                if m is not None:
                    phrase = m.groups()[0]
                    if "Gutenberg" in phrase or "Proofreader" in phrase:
                        # Part of the meta, not part of text.
                        continue
                    # Tag the text to see if it's a proper noun.
                    blob = TextBlob(phrase)
                    tags = blob.tags
                    proper_nouns = [x for x, tag in tags
                                    if tag.startswith('NNP')]
                    if len(proper_nouns) < len(tags) / 3.0:
                        # We're good.
                        yield phrase
                        continue
            matches = self._line_matches(line)
            if matches or random.random() < probability:
                gathering = True
                if matches:
                    # A keyword match! Start gathering a quote either
                    # at this line or some earlier line.
                    maximum_backtrack = (
                        self.maximum_quote_size / self.wrap_at) - 1
                    backtrack = random.randint(0, maximum_backtrack)
                    start_at = max(0, i - backtrack)
                    in_progress = " ".join(
                        [x.strip() for x in para[start_at:i+1]])
                else:
                    in_progress = line.strip()
         ('I am tired of this stuff.', 'neg'),
         ("I can't deal with this", 'neg'),
         ('He is my sworn enemy!', 'neg'),
         ('My boss is horrible.', 'neg')]
# Held-out labeled examples used only to measure classifier accuracy.
test = [('The beer was good.', 'pos'),
        ('I do not enjoy my job', 'neg'),
        ("I ain't feeling dandy today.", 'neg'),
        ("I feel amazing!", 'pos'),
        ('Gary is a friend of mine.', 'pos'),
        ("I can't believe I'm doing this.", 'neg')]
# Train a Naive Bayes classifier on the labeled `train` list
# (opened before this chunk).
cl = NaiveBayesClassifier(train)
# Classify some text
print(cl.classify("Their burgers are amazing."))  # "pos"
print(cl.classify("I don't like their pizza."))   # "neg"
# Classify a TextBlob
blob = TextBlob(
    "The beer was amazing. But the hangover was horrible. "
    "My boss was not pleased.",
    classifier=cl)
print(blob)
print(blob.classify())
# Each individual sentence can also be classified via the blob's classifier.
for sentence in blob.sentences:
    print(sentence)
    print(sentence.classify())
# Compute accuracy
print("Accuracy: {0}".format(cl.accuracy(test)))
# Show 5 most informative features
cl.show_informative_features(5)
def sentiment():
    """Return the polarity score of the request text as JSON."""
    blob = TextBlob(get_text(request))
    polarity = blob.sentiment[0]  # index 0 of the sentiment tuple is polarity
    return jsonify({"result": polarity})
def noun_phrases():
    """Return the unique noun phrases of the request text as JSON,
    punctuation-stripped and limited to phrases of at most five words."""
    unique_phrases = set(TextBlob(get_text(request)).noun_phrases)
    stripped = []
    for phrase in unique_phrases:
        # Exclude long phrases, strip punctuation from the ends of the rest.
        if len(phrase.split()) <= 5:
            stripped.append(strip_punc(phrase))
    return jsonify({"result": stripped})
def test_blob_analyze(self):
    """With the custom analyzer, positive text scores above zero polarity
    and negative text scores below zero."""
    for sample, expect_positive in ((self.pos, True), (self.neg, False)):
        polarity = TextBlob(sample, analyzer=self.analyzer).sentiment[0]
        if expect_positive:
            assert_true(polarity > 0.0)
        else:
            assert_true(polarity < 0.0)
# writer.writerow(columns) global_row = 0 # xlsxwriter doesn't have a writerow(function), so we have to keep track of what row we're on columns = ['unit', 'id_article', 'position (+1)', 'unit_content', 'adjectives', 'verbs', 'article_title', 'article_content_no_tags', 'article_url'] workbook = xlsxwriter.Workbook(publication_prefix+'full_report.xlsx') # Create new spreadsheet worksheet = workbook.add_worksheet() # Make new worksheet for col in range(0, len(columns)): worksheet.write(global_row, col, columns[col]) global_row += 1 for row in ngo_mentions: # Loop through all rows in the database results # Use TextBlob to parse the article # blob.tags returns the following parts of speech (some are missing, like VBN, etc.): # noun (NN), adjective (JJ), determiner (DT), verb (VB), noun phrase (NP), # sentence subject (SBJ), and prepositional noun phrase (PNP) blob = TextBlob(row['article_content_no_tags']) # Split the article into paragraphs paragraphs = (re.split('(\n)+', row['article_content_no_tags'])) paragraphs = [paragraph for paragraph in paragraphs if paragraph != "\n"] paragraphs_lower = [paragraph.lower() for paragraph in paragraphs] # Add line numbers # enumerate(list, 1) results in (list1, 1), (list2, 2), etc. article_numbered = ['(' + str(paragraph[0]) + ') ' + paragraph[1] for paragraph in enumerate(paragraphs, 1)] csv_article = '\n'.join(article_numbered) # Get a list of all the paragraphs that mention one of the organizations paragraph_position = [i for i, x in enumerate(paragraphs_lower) if any(org.lower() in x for org in organizations)] # Split the article into sentences
def content_to_sentences(text):
    """Split *text* into sentences and return them as plain strings."""
    blob = TextBlob(text)
    sentences = []
    for sentence in blob.sentences:
        sentences.append(str(sentence))
    return sentences
# -*- coding: utf-8 -*- """ Created on Fri Oct 4 09:44:50 2013 @author: ozdemircili """ from text.blob import TextBlob text = TextBlob( "Once upon a time a there was a program called Pycheat.It was one of the cheats" ) text.tags text.noun_phrases text.sentiment text.words text.sentences text.title text.words[-1].singularize() text.words[3].pluralize() from text.blob import Word from text.blob import Verb
def text(self, tweetObject):
    """Print a greeting followed by the sentiment of *tweetObject*."""
    analysis_blob = TextBlob(tweetObject)
    print("HELLO")
    print(analysis_blob.sentiment)