import os, sys
sys.path.insert(0, os.path.join("..", ".."))

from pattern.search import search, Pattern, Constraint
from pattern.en import Sentence, parse

# This example demonstrates an interesting search pattern that mines for comparisons.
# Notice the use of the constraint "be".
# If the output from the parser includes word lemmas (e.g. "doing" => "do")
# these will also be matched. Using "be" then matches "is", "being", "are", ...
# and if underspecification is used "could be", "will be", "definitely was", ...
p = Pattern.fromstring("NP be (more) ADJP|ADVP than NP")

for s in ("the turtle was faster than the hare",
          "Arnold Schwarzenegger is more dangerous than Dolph Lundgren"):
    s = Sentence(parse(s, lemmata=True))  # parse with lemmas
    m = p.search(s)
    print s
    print
    print m
    print
    if m:
        print m[0].constituents()                      # Words grouped by chunk whenever possible.
        print m[0].constraints(chunk=s.chunks[0])      # The constraints that match the given chunk.
        print m[0].constituents(constraint=p[0])       # Constituents for the given constraint.
        print m[0].constituents(constraint=[0, 3, 5])  # Constituents for the given constraint indices.
        print
def processor(self, minePackage):
    print '####SEARCH_KEY:', minePackage['searchKey']
    s = Sentence(parse(minePackage['searchKey']))
    minePackage['searchKey'] = count(words(s), stemmer=PORTER)  # Returns a {word: count} dictionary.
    return minePackage['searchKey']
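# A minimal driver for the method above. count(), words() and PORTER come from
# pattern.vector; the 'minePackage' dictionary and the standalone call are
# hypothetical stand-ins for whatever the surrounding miner normally passes in:
from pattern.vector import count, words, PORTER
from pattern.en import Sentence, parse

minePackage = {'searchKey': 'fast red cars and faster red bikes'}
print processor(None, minePackage)  # 'self' is unused above, so None will do
# => a {word: count} dictionary with stemmed keys, e.g. {u'red': 2, ...}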
from pattern.web import Yahoo, plaintext
from pattern.search import Pattern
from pattern.en import Sentence, parse
from pattern.table import Table, pprint

# "X IS MORE IMPORTANT THAN Y"
# Here is a rough example of how to build a web miner.
# It mines comparative statements from Yahoo! and stores the results in a table,
# which can be saved as a text file for further processing later on.
# Pattern matching also works with Sentence objects from the MBSP module.
# MBSP's parser is much more robust (but also slower).
#from MBSP import Sentence, parse

q = '"more important than"'           # Yahoo search query.
p = "NP (VP) more important than NP"  # Search pattern.
p = Pattern.fromstring(p)
t = Table()

engine = Yahoo(license=None)
for i in range(1):  # max=10
    for result in engine.search(q, start=i + 1, count=100, cached=True):
        s = result.description
        s = plaintext(s)
        s = Sentence(parse(s))
        for m in p.search(s):
            a = m.constituents(constraint=0)[-1]  # Left NP.
            b = m.constituents(constraint=5)[0]   # Right NP.
            t.append((a.string.lower(), b.string.lower()))

pprint(t)
print
print len(t), "results."
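# The mined pairs can be ranked by frequency before saving them; a plain
# stdlib sketch, assuming that iterating the Table yields the appended rows:
from collections import Counter

ranking = Counter(tuple(row) for row in t)
for (a, b), n in ranking.most_common(10):
    print "%s > %s (%dx)" % (a, b, n)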
# Imports for this snippet (module paths may vary with installed versions).
# The lexicon lists used below (coherence, modifiers, hedges, factives, ...,
# liwc_*) are loaded elsewhere in the source; a sketch of the two count_*
# helpers follows after the function.
import nltk
from pattern.en import parse, Sentence, modality, mood
from pattern.en import sentiment as pattern_sentiment
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textstat.textstat import textstat

vader_sentiment_analysis = SentimentIntensityAnalyzer()


def extract_bias_features(text):
    features = {}
    txt_lwr = str(text).lower()
    words = nltk.word_tokenize(txt_lwr)
    words = [w for w in words if len(w) > 0 and w not in '.?!,;:\'s"$']
    if len(words) < 1:
        return None
    unigrams = sorted(list(set(words)))
    bigram_tokens = nltk.bigrams(words)
    bigrams = [" ".join([w1, w2]) for w1, w2 in sorted(set(bigram_tokens))]
    trigram_tokens = nltk.trigrams(words)
    trigrams = [" ".join([w1, w2, w3]) for w1, w2, w3 in sorted(set(trigram_tokens))]

    # word count
    features['word_count'] = float(len(words))

    # unique word count
    features['unique_word_count'] = float(len(unigrams))

    # coherence marker count
    count, instances = count_feature_list_freq(coherence, words, bigrams, trigrams)
    features['coherence_marker_count'] = count
    features['coherence_marker_prop'] = round(float(count) / float(len(words)), 4)
    features['coherence_marker_list'] = instances

    # degree modifier count
    count, instances = count_feature_list_freq(modifiers, words, bigrams, trigrams)
    features['degree_modifier_count'] = count
    features['degree_modifier_prop'] = round(float(count) / float(len(words)), 4)
    features['degree_modifier_list'] = instances

    # hedge word count
    count, instances = count_feature_list_freq(hedges, words, bigrams, trigrams)
    features['hedge_word_count'] = count
    features['hedge_word_prop'] = round(float(count) / float(len(words)), 4)
    features['hedge_word_list'] = instances

    # factive verb count
    count, instances = count_feature_list_freq(factives, words, bigrams, trigrams)
    features['factive_verb_count'] = count
    features['factive_verb_prop'] = round(float(count) / float(len(words)), 4)
    features['factive_verb_list'] = instances

    # assertive verb count
    count, instances = count_feature_list_freq(assertives, words, bigrams, trigrams)
    features['assertive_verb_count'] = count
    features['assertive_verb_prop'] = round(float(count) / float(len(words)), 4)
    features['assertive_verb_list'] = instances

    # implicative verb count
    count, instances = count_feature_list_freq(implicatives, words, bigrams, trigrams)
    features['implicative_verb_count'] = count
    features['implicative_verb_prop'] = round(float(count) / float(len(words)), 4)
    features['implicative_verb_list'] = instances

    # bias words and phrases count
    count, instances = count_feature_list_freq(biased, words, bigrams, trigrams)
    features['bias_count'] = count
    features['bias_prop'] = round(float(count) / float(len(words)), 4)
    features['bias_list'] = instances

    # opinion word count
    count, instances = count_feature_list_freq(opinionLaden, words, bigrams, trigrams)
    features['opinion_count'] = count
    features['opinion_prop'] = round(float(count) / float(len(words)), 4)
    features['opinion_list'] = instances

    # weak subjective word count
    count, instances = count_feature_list_freq(subj_weak, words, bigrams, trigrams)
    features['subjective_weak_count'] = count
    features['subjective_weak_prop'] = round(float(count) / float(len(words)), 4)
    features['subjective_weak_list'] = instances

    # strong subjective word count
    count, instances = count_feature_list_freq(subj_strong, words, bigrams, trigrams)
    features['subjective_strong_count'] = count
    features['subjective_strong_prop'] = round(float(count) / float(len(words)), 4)
    features['subjective_strong_list'] = instances

    # composite sentiment score using the VADER sentiment analysis package
    compound_sentiment = vader_sentiment_analysis.polarity_scores(text)['compound']
    features['vader_composite_sentiment'] = float(compound_sentiment)

    # subjectivity score using pattern.en
    pattern_subjectivity = pattern_sentiment(text)[1]
    features['subjectivity_score'] = round(pattern_subjectivity, 4)

    # modality (certainty) score and mood using http://www.clips.ua.ac.be/pages/pattern-en#modality
    sentence = parse(text, lemmata=True)
    sentenceObj = Sentence(sentence)
    features['modality'] = round(modality(sentenceObj), 4)
    try:
        features['mood'] = mood(sentenceObj)
    except IndexError as e:
        print "IndexError: %s" % e
        print "Ignoring..."
        features['mood'] = 'err'

    # Flesch-Kincaid Grade Level (reading difficulty) using textstat
    try:
        features['flesch-kincaid_grade_level'] = float(textstat.flesch_kincaid_grade(text))
    except TypeError as e:
        print "TypeError: %s" % e
        print "Ignoring..."
        features['flesch-kincaid_grade_level'] = 0.0

    # LIWC 3rd person pronoun count (combines S/he and They)
    count, instances = count_liwc_list_freq(liwc_3pp, words)
    features['liwc_3rd_person_pronoun_count'] = count
    features['liwc_3rd_person_pronoun_prop'] = round(float(count) / float(len(words)), 4)
    features['liwc_3rd_person_pronoun_list'] = instances

    # LIWC auxiliary verb count
    count, instances = count_liwc_list_freq(liwc_aux, words)
    features['liwc_auxiliary_verb_count'] = count
    features['liwc_auxiliary_verb_prop'] = round(float(count) / float(len(words)), 4)
    features['liwc_auxiliary_verb_list'] = instances

    # LIWC adverb count
    count, instances = count_liwc_list_freq(liwc_adv, words)
    features['liwc_adverb_count'] = count
    features['liwc_adverb_prop'] = round(float(count) / float(len(words)), 4)
    features['liwc_adverb_list'] = instances

    # LIWC preposition count
    count, instances = count_liwc_list_freq(liwc_prep, words)
    features['liwc_preposition_count'] = count
    features['liwc_preposition_prop'] = round(float(count) / float(len(words)), 4)
    features['liwc_preposition_list'] = instances

    # LIWC conjunction count
    count, instances = count_liwc_list_freq(liwc_conj, words)
    features['liwc_conjunction_count'] = count
    features['liwc_conjunction_prop'] = round(float(count) / float(len(words)), 4)
    features['liwc_conjunction_list'] = instances

    # LIWC discrepancy word count
    count, instances = count_liwc_list_freq(liwc_discr, words)
    features['liwc_discrepency_word_count'] = count
    features['liwc_discrepency_word_prop'] = round(float(count) / float(len(words)), 4)
    features['liwc_discrepency_word_list'] = instances

    # LIWC tentative word count
    count, instances = count_liwc_list_freq(liwc_tent, words)
    features['liwc_tentative_word_count'] = count
    features['liwc_tentative_word_prop'] = round(float(count) / float(len(words)), 4)
    features['liwc_tentative_word_list'] = instances

    # LIWC certainty word count
    count, instances = count_liwc_list_freq(liwc_cert, words)
    features['liwc_certainty_word_count'] = count
    features['liwc_certainty_word_prop'] = round(float(count) / float(len(words)), 4)
    features['liwc_certainty_word_list'] = instances

    # LIWC causation word count
    count, instances = count_liwc_list_freq(liwc_causn, words)
    features['liwc_causation_word_count'] = count
    features['liwc_causation_word_prop'] = round(float(count) / float(len(words)), 4)
    features['liwc_causation_word_list'] = instances

    # LIWC work word count
    count, instances = count_liwc_list_freq(liwc_work, words)
    features['liwc_work_word_count'] = count
    features['liwc_work_word_prop'] = round(float(count) / float(len(words)), 4)
    features['liwc_work_word_list'] = instances

    # LIWC achievement word count
    count, instances = count_liwc_list_freq(liwc_achiev, words)
    features['liwc_achievement_word_count'] = count
    features['liwc_achievement_word_prop'] = round(float(count) / float(len(words)), 4)
    features['liwc_achievement_word_list'] = instances

    return features
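# The lexicon helpers used above are not defined in this excerpt; a plausible
# sketch, assuming each lexicon is a list of lower-case words/phrases, with
# LIWC entries optionally ending in a '*' prefix wildcard (e.g. "caus*"):
def count_feature_list_freq(lexicon, words, bigrams, trigrams):
    # Collect every unigram/bigram/trigram of the text that appears in the lexicon.
    instances = [t for t in words + bigrams + trigrams if t in lexicon]
    return len(instances), instances

def count_liwc_list_freq(lexicon, words):
    # LIWC entries ending in '*' match any word with that prefix.
    instances = []
    for w in words:
        for entry in lexicon:
            if w == entry or (entry.endswith('*') and w.startswith(entry[:-1])):
                instances.append(w)
                break
    return len(instances), instances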
print search("rabbit", "big white rabbit") print # Search words can contain wildcard characters: print search("rabbit*", "big white rabbit") print search("rabbit*", "big white rabbits") print # Search words can contain different options: print search("rabbit|cony|bunny", "big black bunny") print # Things become more interesting if we involve the pattern.en.parser module. # The parser takes a string, identifies words, and assigns a part-of-speech tag # to each word, for example NN (noun) or JJ (adjective). # A parsed sentence can be scanned for part-of-speech tags: s = Sentence(parse("big white rabbit")) print search("JJ", s) # all adjectives print search("NN", s) # all nouns print search("NP", s) # all noun phrases print # Since the search() is case-insensitive, uppercase search words # are always considered to be tags (or taxonomy terms - see further examples). # The return value is a Match object, # where Match.words is a list of Word objects that matched: m = search("NP", s) for word in m[0].words: print word.string, word.tag
sentList = [
    # ... sentence strings (elided in this excerpt) ...
]

for strSentence in sentList:
    for word, pos in tag(strSentence):
        if pos in ("VB", "VBD", "VBG", "VBN", "VBP", "VBZ"):  # Retrieve all verbs.
            print("=====================>>>>> ", word, pos)
        else:
            print(word, pos)

    print(strSentence)
    a = parse(strSentence, relations=True, lemmata=True)
    pprint(a)
    sentence = Sentence(a)
    print(sentence.verbs)
    print
    print
    #print(sentence.relations)
    #print(sentence.subjects)
    #print(sentence.objects)
    #print(sentence.verbs)
    #print(sentence.chunk)
    sentScore = sid.polarity_scores(strSentence)
    # sqlite3 insert : subject / objects / verbs / CPC / Sentiment
    # genre, wordCount, filename, sentence
    # subject : Chunk('he/NP-SBJ-1'), Chunk('you/NP-SBJ-2')
from pattern.en import Sentence, parse, sentiment, modality
from pattern.en import wordnet

# A chunk is constructed as: Chunk(sentence, words=[], type=None, role=None, relation=None).
# Prepositional noun phrases (PNP) group a PP chunk with the NP chunks that follow it:
s = Sentence(parse("I sat on the sofa with a book.", lemmata=True))  # example sentence
for pnp in s.pnp:
    print pnp.string  # String of words (Unicode).
    print pnp.chunks  # List of Chunk objects.
    # print pnp.preposition  # First PP chunk in the PNP.

# sentiment
print sentiment(
    "The movie attempts to be surreal by incorporating various time paradoxes, "
    "but it's presented in such a ridiculous way it's seriously boring.")
print sentiment('Wonderfully awful! :-)').assessments

# mood and modality
s = "Some amino acids tend to be acidic while others may be basic."  # weaseling
s = parse(s, lemmata=True)
s = Sentence(s)
print modality(s)

# wordnet
s = wordnet.synsets('bird')[0]
print 'Definition:', s.gloss      # Definition string.
print ' Synonyms:', s.synonyms    # List of word forms (i.e., synonyms).
print 'Hypernyms:', s.hypernyms() # List of parent synsets (i.e., more general), the semantic parents.
print 'Hypernyms:', s.hypernyms(recursive=False, depth=None)
print ' Hyponyms:', s.hyponyms()  # List of child synsets (i.e., more specific).
print ' Hyponyms:', s.hyponyms(recursive=False, depth=None)
print ' Holonyms:', s.holonyms()  # List of synsets of which this is a member.
print ' Meronyms:', s.meronyms()  # List of synsets (members/parts).
print '      POS:', s.pos         # Part-of-speech: NOUN | VERB | ADJECTIVE | ADVERB.
from pattern.web import Twitter
from pattern.en import Sentence, parse
from pattern.search import search
from pattern.vector import Document, Model, KNN

# Once a classifier is trained on texts that are labeled by type
# (mail/spam, positive/negative, language, author's age, ...),
# you can predict the type of other "unknown" texts.
# The k-Nearest Neighbor algorithm classifies texts according
# to the k documents that are most similar (cosine similarity) to the given input document.

m = Model()
t = Twitter()

# First, we mine a model of ~900 tweets (9 pages of 100).
# We'll use hashtags as type.
for page in range(1, 10):
    for tweet in t.search('#win OR #fail', start=page, count=100, cached=True):
        # If the tweet contains the #win hashtag, we'll set its type to 'WIN':
        s = tweet.text.lower()                 # tweet in lowercase
        p = 'WIN' if '#win' in s else 'FAIL'   # document label
        s = Sentence(parse(s))                 # parse tree with part-of-speech tags
        s = search('JJ', s)                    # adjectives in the tweet
        s = [match[0].string for match in s]   # adjectives as a list of strings
        s = " ".join(s)                        # adjectives as a string
        if len(s) > 0:
            m.append(Document(s, type=p, stemmer=None))

# Train k-Nearest Neighbor on the model.
# Note that this is only a simple example: to build a robust classifier
# you would need a lot more training data (e.g., tens of thousands of tweets).
# The more training data, the more statistically reliable the classifier becomes.
# The only way to really know if your classifier is working correctly
# is to test it with testing data, see the documentation for Classifier.test().
classifier = KNN(baseline=None)  # By default, baseline=MAJORITY
for document in m:               # (classify unknown documents with the most frequent type).
    classifier.train(document)
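# With the classifier trained we can label unseen text by its adjectives.
# Cross-validation is sketched here on the assumption that Classifier.test()
# takes the document list and a folds parameter (check your pattern version):
print classifier.classify(Document('sweet potato burger'))  # 'WIN' or 'FAIL'
print KNN.test(m.documents, folds=10)  # (accuracy, precision, recall, F1)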
def test_match(self):
    # Assert Pattern.match()
    P = search.Pattern.fromstring
    X = search.STRICT
    S = lambda s: Sentence(parse(s, relations=True, lemmata=True))
    for i, (pattern, test, match) in enumerate((
      (P("^rabbit"),            "white rabbit",            None),                  #  0
      (P("^rabbit"),            "rabbit",                  "rabbit"),              #  1
      (P("rabbit"),             "big white rabbit",        "rabbit"),              #  2
      (P("rabbit*"),            "big white rabbits",       "rabbits"),             #  3
      (P("JJ|NN"),              S("big white rabbits"),    "big"),                 #  4
      (P("JJ+"),                S("big white rabbits"),    "big white"),           #  5
      (P("JJ+ NN*"),            S("big white rabbits"),    "big white rabbits"),   #  6
      (P("JJ black|white NN*"), S("big white rabbits"),    "big white rabbits"),   #  7
      (P("NP"),                 S("big white rabbit"),     "big white rabbit"),    #  8
      (P("big? rabbit", X),     S("big white rabbit"),     "rabbit"),              #  9 strict
      (P("big? rabbit|NN"),     S("big white rabbit"),     "rabbit"),              # 10 explicit
      (P("big? rabbit"),        S("big white rabbit"),     "big white rabbit"),    # 11 greedy
      (P("rabbit VP JJ"),       S("the rabbit was huge"),  "the rabbit was huge"), # 12
      (P("rabbit be JJ"),       S("the rabbit was huge"),  "the rabbit was huge"), # 13 lemma
      (P("rabbit be JJ", X),    S("the rabbit was huge"),  "rabbit was huge"),     # 14
      (P("rabbit is JJ"),       S("the rabbit was huge"),  None),                  # 15
      (P("the NP"),             S("the rabid rodents"),    "the rabid rodents"),   # 16 overlap
      (P("t*|r*+"),             S("the rabid rodents"),    "the rabid rodents"),   # 17
      (P("(DT) JJ? NN*"),       S("the rabid rodents"),    "the rabid rodents"),   # 18
      (P("(DT) JJ? NN*"),       S("the rabbit"),           "the rabbit"),          # 19
      (P("rabbit"),             S("the big rabbit"),       "the big rabbit"),      # 20 greedy
      (P("eat carrot"),         S("is eating a carrot"),   "is eating a carrot"),  # 21
      (P("eat carrot|NP"),      S("is eating a carrot"),   "is eating a carrot"),  # 22
      (P("eat NP"),             S("is eating a carrot"),   "is eating a carrot"),  # 23
      (P("eat a"),              S("is eating a carrot"),   "is eating a"),         # 24
      (P("!NP carrot"),         S("is eating a carrot"),   "is eating a carrot"),  # 25
      (P("eat !pizza"),         S("is eating a carrot"),   "is eating a carrot"),  # 26
      (P("eating a"),           S("is eating a carrot"),   "is eating a"),         # 27
      (P("eating !carrot", X),  S("is eating a carrot"),   "eating a"),            # 28
      (P("eat !carrot"),        S("is eating a carrot"),   None),                  # 29 NP chunk is a carrot
      (P("eat !DT"),            S("is eating a carrot"),   None),                  # 30 eat followed by DT
      (P("eat !NN"),            S("is eating a carrot"),   "is eating a"),         # 31 a/DT is not NN
      (P("!be carrot"),         S("is eating a carrot"),   "is eating a carrot"),  # 32 is eating == eat != is
      (P("!eat|VP carrot"),     S("is eating a carrot"),   None),                  # 33 VP chunk == eat
      (P("white_rabbit"),       S("big white rabbit"),     None),                  # 34
      (P("[white rabbit]"),     S("big white rabbit"),     None),                  # 35
      (P("[* white rabbit]"),   S("big white rabbit"),     "big white rabbit"),    # 36
      (P("[big * rabbit]"),     S("big white rabbit"),     "big white rabbit"),    # 37
      (P("big [big * rabbit]"), S("big white rabbit"),     "big white rabbit"),    # 38
      (P("[*+ rabbit]"),        S("big white rabbit"),     None),                  # 39 bad pattern: "+" is literal
    )):
        m = pattern.match(test)
        self.assertTrue(getattr(m, "string", None) == match)
    # Assert chunk with head at the front.
    s = S("Felix the cat")
    self.assertEqual(P("felix").match(s).string, "Felix the cat")
    # Assert negation + custom greedy() function.
    s = S("the big white rabbit")
    g = lambda chunk, constraint: len([w for w in chunk if not constraint.match(w)]) == 0
    self.assertEqual(P("!white").match(s).string, "the big white rabbit")  # a rabbit != white
    self.assertEqual(P("!white", greedy=g).match(s), None)                 # a white rabbit == white
    # Assert taxonomy items with spaces.
    s = S("Bugs Bunny is a giant talking rabbit.")
    t = search.Taxonomy()
    t.append("rabbit", type="rodent")
    t.append("Bugs Bunny", type="rabbit")
    self.assertEqual(P("RABBIT", taxonomy=t).match(s).string, "Bugs Bunny")
    # Assert None, the syntax cannot handle taxonomy items that span multiple chunks.
    s = S("Elmer Fudd fires a cannon")
    t = search.Taxonomy()
    t.append("fire cannon", type="violence")
    self.assertEqual(P("VIOLENCE").match(s), None)
    # Assert regular expressions.
    s = S("a sack with 3.5 rabbits")
    p = search.Pattern.fromstring("[] NNS")
    p[0].words.append(re.compile(r"[0-9|\.]+"))
    self.assertEqual(p.match(s).string, "3.5 rabbits")
    print("pattern.search.Pattern.match()")
import os, sys
sys.path.insert(0, os.path.join("..", "..", ".."))

from pattern.search import Pattern
from pattern.en import Sentence, parse

# Constraints ending in + match one or more words.
# Pattern.search() uses a "greedy" approach:
# it will attempt to match as many words as possible.
# The following pattern means:
# one or more words starting with "t",
# followed by one or more words starting with "f".
p = Pattern.fromstring("t*+ f*+")
s = Sentence(parse("one two three four five six"))
m = p.search(s)
print s
print m
print

for w in m[0].words:
    print w, "matches", m[0].constraint(w)

# Pattern.fromstring("*") matches each word in the sentence.
# This yields a list with a Match object for each word.
print
print "* =>", Pattern.fromstring("*").search(s)

# Pattern.fromstring("*+") matches all words.
# This yields a list with one Match object containing all words.
print "*+ =>", Pattern.fromstring("*+").search(s)
print m
print("")

# Sentence chunks can be matched by tag (e.g. NP, VP, ADJP).
# The pattern below matches anything from
# "the rabbit gnaws at your fingers" to
# "the white rabbit looks at the carrots":
p = Pattern.fromstring("rabbit VP at NP")
m = p.search(s)
print(m)
print("")

if m:
    for w in m[0].words:
        print("%s\t=> %s" % (w, m[0].constraint(w)))

print("")
print("-------------------------------------------------------------")
# Finally, constraints can also include regular expressions.
# To include them we need to use the full syntax instead of the search() function:
import re
r = re.compile(r"[0-9|\.]+")  # all numbers
p = Pattern()
p.sequence.append(Constraint(words=[r]))
p.sequence.append(Constraint(tags=["NN*"]))

s = Sentence(parse("I have 9.5 rabbits."))
print(s)
print(p.search(s))
print("")
import os, sys
sys.path.insert(0, os.path.join("..", "..", ".."))

from pattern.search import search, Pattern, Constraint
from pattern.en import Sentence, parse

# What we call a "search word" in example 01-search.py
# is actually called a constraint, because it can contain different options.
# Options are separated by "|".
# The next search pattern retrieves words that are a noun OR an adjective:
s = Sentence(parse("big white rabbit"))
print search("NN|JJ", s)
print

# This pattern yields phrases containing an adjective followed by a noun.
# Consecutive constraints are separated by a space:
print search("JJ NN", s)
print

# Or a noun preceded by any number of adjectives:
print search("(JJ)+ NN", s)
print

# Note: NN marks singular nouns, NNS marks plural nouns.
# If you want to include both, use "NN*" as a constraint
# (a quick check follows at the end of this example).
# This works for NN*, VB*, JJ*, RB*.
s = Sentence(parse("When I sleep the big white rabbit will stare at my feet."))
m = search("rabbit stare at my", s)
print s
print m
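# A quick check of the NN* note above: the wildcard covers singular and
# plural nouns alike.
print search("NN*", Sentence(parse("big white rabbits")))  # matches "rabbits" (NNS) too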
from pattern.en import lemma, conjugate, tenses, PAST, PL
from pattern.en import quantify, ngrams, parse, pprint, tag
from pattern.en import sentiment, polarity, subjectivity, modality, Sentence

print lemma('running')
print conjugate('purred', '3sg')
print PAST in tenses('purred')  # 'p' in tenses() also works.
print (PAST, 1, PL) in tenses('purred')

print 'Quantification'
print quantify(['goose', 'goose', 'duck', 'chicken', 'chicken', 'chicken'])
print quantify('carrot', amount=90)
print quantify({'carrot': 100, 'parrot': 20})

print 'ngrams'
print ngrams("I am eating a pizza.", n=2)

# parse
s = parse('I eat pizza with a fork.')
pprint(s)

# tag
for word, t in tag('The cat felt happy.'):
    print word + ' is ' + t

s = "The movie attempts to be surreal by incorporating various time paradoxes, but it's presented in such a ridiculous way it's seriously boring."
print sentiment(s)
print polarity(s)
print subjectivity(s)

# The modality() function returns a value between -1.0 and +1.0,
# expressing the degree of certainty.
s2 = "Some amino acids tend to be acidic while others may be basic."  # weaseling
se = Sentence(parse(s2, chunks=False, lemmata=True))
print modality(se)
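# A hedged sentence scores lower on modality() than a confident one
# (exact values depend on the pattern version; the contrast is what matters):
print modality(Sentence(parse("I know that it is true.", lemmata=True)))    # high certainty
print modality(Sentence(parse("I guess it could be true.", lemmata=True)))  # low certainty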
def chunklist_(self):
    sentence_obj = Sentence(self.PARSE)
    return sentence_obj.chunks
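# A minimal sketch of a host class for the method above; the attribute name
# self.PARSE is taken from the snippet, the class itself is hypothetical:
from pattern.en import Sentence, parse

class Chunker(object):
    def __init__(self, text):
        self.PARSE = parse(text, relations=True, lemmata=True)
    chunklist_ = chunklist_  # reuse the method defined above

print Chunker("The black cat sat on the mat.").chunklist_()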
from pattern.search import Pattern
from pattern.en import Sentence, parse

# Constraints wrapped in () are optional, matching one or no word.
# Pattern.search() uses a "greedy" approach:
# it will attempt to include as many optional constraints as possible.
# The following pattern scans for words whose part-of-speech tag is NN (i.e., nouns).
# A preceding determiner, adverb or adjective is picked up as well.
p = Pattern.fromstring("(DT) (RB) (JJ) NN+")
for s in ("the cat",                                # DT NN
          "the very black cat",                     # DT RB JJ NN
          "tasty cat food",                         # JJ NN NN
          "the funny black cat",                    # DT JJ JJ NN
          "very funny",                             # RB JJ => no match, since there is no noun.
          "my cat is black and your cat is white"): # NN + NN
    s = Sentence(parse(s))
    m = p.search(s)
    print
    print s
    print m
    if m:
        for w in m[0].words:
            print w, "matches", m[0].constraint(w)

# Note: the above pattern could also be written as "(DT|RB|JJ)+ NN+"
# to include multiple adverbs/adjectives.
# By combining * () and + patterns can become quite complex.
# Optional constraints are useful for very specific patterns, but slow.
# Also, depending on which parser you use (e.g., MBSP), words can be tagged differently
# and may not match in the way you expect.
# Consider using a robust Pattern.fromstring("NP") instead, as shown below.
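# The same phrases matched with the simpler, more robust chunk-based pattern:
p = Pattern.fromstring("NP")
for s in ("the very black cat", "tasty cat food"):
    print p.search(Sentence(parse(s)))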
# Pattern.fromstring("rose|lily|daisy|daffodil|begonia"). # A better approach is to use the taxonomy: for flower in ("rose", "lily", "daisy", "daffodil", "begonia"): taxonomy.append(flower, type="flower") print taxonomy.children("flower") print taxonomy.parents("rose") print taxonomy.classify("rose") # Yields the most recently added parent. print # Taxonomy terms can be included in a pattern: p = Pattern([Constraint(taxa=["flower"])]) # or p = Pattern.fromstring("FLOWER") s = Sentence(parse("A field of white daffodils.", lemmata=True)) m = p.search(s) print s print m print from pattern.search import search taxonomy.append("chicken", type="food") taxonomy.append("chicken", type="bird") taxonomy.append("penguin", type="bird") taxonomy.append("bird", type="animal") print taxonomy.parents("chicken") print taxonomy.children("animal", recursive=True) print search("FOOD", "I'm eating chicken.") print