Example #1
import os, sys

sys.path.insert(0, os.path.join("..", ".."))

from pattern.search import search, Pattern, Constraint
from pattern.en import Sentence, parse

# This example demonstrates an interesting search pattern that mines for comparisons.
# Notice the use of the constraint "be".
# If the output from the parser includes word lemmas (e.g. "doing" => "do")
# these will also be matched. Using "be" then matches "is", "being", "are", ...
# and if underspecification is used "could be", "will be", "definitely was", ...

p = Pattern.fromstring("NP be (more) ADJP|ADVP than NP")

for s in ("the turtle was faster than the hare",
          "Arnold Schwarzenegger is more dangerous than Dolph Lundgren"):
    s = Sentence(parse(s, lemmata=True))  # parse, including lemmas
    m = p.search(s)
    print s
    print
    print m
    print
    if m:
        print m[0].constituents()  # Words grouped by chunk whenever possible.
        print m[0].constraints(
            chunk=s.chunks[0])  # The constraints that match the given chunk.
        print m[0].constituents(
            constraint=p[0])  # Constituents for the given constraint.
        print m[0].constituents(
            constraint=[0, 3,
                        5])  # Constituents for the given constraint indices.
        print
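
# A quick check of the underspecification mentioned in the comments above
# (a hedged sketch; the sentence is hypothetical, and "could be" is matched
# through the lemma "be"):
s = Sentence(parse("the turtle could be faster than the hare", lemmata=True))
print p.search(s)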
Example #2
 def processor(self, minePackage):
     print '####SEARCH_KEY:', minePackage['searchKey']
     s = Sentence(parse(minePackage['searchKey']))
     minePackage['searchKey'] = count(
         words(s), stemmer=PORTER)  # Returns a dictionary {word: count}.
     return minePackage['searchKey']
Example #3
from pattern.web import Yahoo, plaintext
from pattern.search import Pattern
from pattern.en import Sentence, parse
from pattern.table import Table, pprint

# "X IS MORE IMPORTANT THAN Y"
# Here is a rough example of how to build a web miner.
# It mines comparative statements from Yahoo! and stores the results in a table,
# which can be saved as a text file for further processing later on.

# Pattern matching also works with Sentence objects from the MBSP module.
# MBSP's parser is much more robust (but also slower).
#from MBSP import Sentence, parse

q = '"more important than"'  # Yahoo search query
p = "NP (VP) more important than NP"  # Search pattern.
p = Pattern.fromstring(p)
t = Table()

engine = Yahoo(license=None)
for i in range(1):  # max=10
    for result in engine.search(q, start=i + 1, count=100, cached=True):
        s = result.description
        s = plaintext(s)
        s = Sentence(parse(s))
        for m in p.search(s):
            a = m.constituents(constraint=0)[-1]  # Left NP.
            b = m.constituents(constraint=5)[0]  # Right NP.
            t.append((a.string.lower(), b.string.lower()))

pprint(t)

print
print len(t), "results."
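
# The comments above mention saving the table as a text file for later processing.
# A minimal sketch (the filename is hypothetical):
import codecs
f = codecs.open("important-than.txt", "w", encoding="utf-8")
for row in t:
    f.write("\t".join(row) + "\n")
f.close()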
Example #4
def extract_bias_features(text):
    features = {}
    txt_lwr = str(text).lower()
    words = nltk.word_tokenize(txt_lwr)
    words = [w for w in words if len(w) > 0 and w not in '.?!,;:\'s"$']
    if len(words) < 1:
        return None
    unigrams = sorted(list(set(words)))
    bigram_tokens = nltk.bigrams(words)
    bigrams = [" ".join([w1, w2]) for w1, w2 in sorted(set(bigram_tokens))]
    trigram_tokens = nltk.trigrams(words)
    trigrams = [" ".join([w1, w2, w3]) for w1, w2, w3 in sorted(set(trigram_tokens))]
    # print words
    # print unigrams
    # print bigrams
    # print trigrams
    # print "----------------------"

    # word count
    features['word_count'] = float(len(words))

    # unique word count
    features['unique_word_count'] = float(len(unigrams))

    # coherence marker count
    count, instances = count_feature_list_freq(coherence, words, bigrams, trigrams)
    # if count > 0:
    features['coherence_marker_count'] = count
    features['coherence_marker_prop'] = round(float(count) / float(len(words)), 4)
    features['coherence_marker_list'] = instances

    # degree modifier count
    count, instances = count_feature_list_freq(modifiers, words, bigrams, trigrams)
    #if count > 0:
    features['degree_modifier_count'] = count
    features['degree_modifier_prop'] = round(float(count) / float(len(words)), 4)
    features['degree_modifier_list'] = instances

    # hedge word count
    count, instances = count_feature_list_freq(hedges, words, bigrams, trigrams)
    #if count > 0:
    features['hedge_word_count'] = count
    features['hedge_word_prop'] = round(float(count) / float(len(words)), 4)
    features['hedge_word_list'] = instances

    # factive verb count
    count, instances = count_feature_list_freq(factives, words, bigrams, trigrams)
    #if count > 0:
    features['factive_verb_count'] = count
    features['factive_verb_prop'] = round(float(count) / float(len(words)), 4)
    features['factive_verb_list'] = instances

    # assertive verb count
    count, instances = count_feature_list_freq(assertives, words, bigrams, trigrams)
    #if count > 0:
    features['assertive_verb_count'] = count
    features['assertive_verb_prop'] = round(float(count) / float(len(words)), 4)
    features['assertive_verb_list'] = instances

    # implicative verb count
    count, instances = count_feature_list_freq(implicatives, words, bigrams, trigrams)
    #if count > 0:
    features['implicative_verb_count'] = count
    features['implicative_verb_prop'] = round(float(count) / float(len(words)), 4)
    features['implicative_verb_list'] = instances

    # bias words and phrases count
    count, instances = count_feature_list_freq(biased, words, bigrams, trigrams)
    #if count > 0:
    features['bias_count'] = count
    features['bias_prop'] = round(float(count) / float(len(words)), 4)
    features['bias_list'] = instances

    # opinion word count
    count, instances = count_feature_list_freq(opinionLaden, words, bigrams, trigrams)
    #if count > 0:
    features['opinion_count'] = count
    features['opinion_prop'] = round(float(count) / float(len(words)), 4)
    features['opinion_list'] = instances

    # weak subjective word count
    count, instances = count_feature_list_freq(subj_weak, words, bigrams, trigrams)
    #if count > 0:
    features['subjective_weak_count'] = count
    features['subjective_weak_prop'] = round(float(count) / float(len(words)), 4)
    features['subjective_weak_list'] = instances

    # strong subjective word count
    count, instances = count_feature_list_freq(subj_strong, words, bigrams, trigrams)
    #if count > 0:
    features['subjective_strong_count'] = count
    features['subjective_strong_prop'] = round(float(count) / float(len(words)), 4)
    features['subjective_strong_list'] = instances

    # composite sentiment score using VADER sentiment analysis package
    compound_sentiment = vader_sentiment_analysis.polarity_scores(text)['compound']
    features['vader_composite_sentiment'] = float(compound_sentiment)

    # subjectivity score using Pattern.en
    pattern_subjectivity = pattern_sentiment(text)[1]
    features['subjectivity_score'] = round(pattern_subjectivity, 4)

    # modality (certainty) score and mood using http://www.clips.ua.ac.be/pages/pattern-en#modality
    sentence = parse(text, lemmata=True)
    sentenceObj = Sentence(sentence)
    features['modality'] = round(modality(sentenceObj), 4)
    try:
        features['mood'] = mood(sentenceObj)
    except IndexError as e:
        print "IndexError: %s" % e
        print "Ignoring..."
        features['mood'] = 'err'

    # Flesch-Kincaid Grade Level (reading difficulty) using textstat
    try:
        features['flesch-kincaid_grade_level'] = float(textstat.flesch_kincaid_grade(text))
    except TypeError as e:
        print "TypeError: %s" % e
        print "Ignoring..."
        features['flesch-kincaid_grade_level'] = 0.0

    # liwc 3rd person pronoun count (combines S/he and They)
    count, instances = count_liwc_list_freq(liwc_3pp, words)
    #if count > 0:
    features['liwc_3rd_person_pronoun_count'] = count
    features['liwc_3rd_person_pronoun_prop'] = round(float(count) / float(len(words)), 4)
    features['liwc_3rd_person_pronoun_list'] = instances

    # liwc auxiliary verb count
    count, instances = count_liwc_list_freq(liwc_aux, words)
    #if count > 0:
    features['liwc_auxiliary_verb_count'] = count
    features['liwc_auxiliary_verb_prop'] = round(float(count) / float(len(words)), 4)
    features['liwc_auxiliary_verb_list'] = instances

    # liwc adverb count
    count, instances = count_liwc_list_freq(liwc_adv, words)
    #if count > 0:
    features['liwc_adverb_count'] = count
    features['liwc_adverb_prop'] = round(float(count) / float(len(words)), 4)
    features['liwc_adverb_list'] = instances

    # liwc preposition count
    count, instances = count_liwc_list_freq(liwc_prep, words)
    #if count > 0:
    features['liwc_preposition_count'] = count
    features['liwc_preposition_prop'] = round(float(count) / float(len(words)), 4)
    features['liwc_preposition_list'] = instances

    # liwc conjunction count
    count, instances = count_liwc_list_freq(liwc_conj, words)
    #if count > 0:
    features['liwc_conjunction_count'] = count
    features['liwc_conjunction_prop'] = round(float(count) / float(len(words)), 4)
    features['liwc_conjunction_list'] = instances

    # liwc discrepency word count
    count, instances = count_liwc_list_freq(liwc_discr, words)
    #if count > 0:
    features['liwc_discrepency_word_count'] = count
    features['liwc_discrepency_word_prop'] = round(float(count) / float(len(words)), 4)
    features['liwc_discrepency_word_list'] = instances

    # liwc tentative word count
    count, instances = count_liwc_list_freq(liwc_tent, words)
    #if count > 0:
    features['liwc_tentative_word_count'] = count
    features['liwc_tentative_word_prop'] = round(float(count) / float(len(words)), 4)
    features['liwc_tentative_word_list'] = instances

    # liwc certainty word count
    count, instances = count_liwc_list_freq(liwc_cert, words)
    #if count > 0:
    features['liwc_certainty_word_count'] = count
    features['liwc_certainty_word_prop'] = round(float(count) / float(len(words)), 4)
    features['liwc_certainty_word_list'] = instances

    # liwc causation word count
    count, instances = count_liwc_list_freq(liwc_causn, words)
    #if count > 0:
    features['liwc_causation_word_count'] = count
    features['liwc_causation_word_prop'] = round(float(count) / float(len(words)), 4)
    features['liwc_causation_word_list'] = instances

    # liwc work word count
    count, instances = count_liwc_list_freq(liwc_work, words)
    #if count > 0:
    features['liwc_work_word_count'] = count
    features['liwc_work_word_prop'] = round(float(count) / float(len(words)), 4)
    features['liwc_work_word_list'] = instances

    # liwc achievement word count
    count, instances = count_liwc_list_freq(liwc_achiev, words)
    #if count > 0:
    features['liwc_achievement_word_count'] = count
    features['liwc_achievement_word_prop'] = round(float(count) / float(len(words)), 4)
    features['liwc_achievement_word_list'] = instances

    return features
Example #5
print search("rabbit", "big white rabbit")
print

# Search words can contain wildcard characters:
print search("rabbit*", "big white rabbit")
print search("rabbit*", "big white rabbits")
print

# Search words can contain different options:
print search("rabbit|cony|bunny", "big black bunny")
print

# Things become more interesting if we involve the pattern.en.parser module.
# The parser takes a string, identifies words, and assigns a part-of-speech tag
# to each word, for example NN (noun) or JJ (adjective).
# A parsed sentence can be scanned for part-of-speech tags:
s = Sentence(parse("big white rabbit"))
print search("JJ", s)  # all adjectives
print search("NN", s)  # all nouns
print search("NP", s)  # all noun phrases
print

# Since search() is case-insensitive, uppercase search words
# are always considered to be tags (or taxonomy terms - see further examples).

# The return value is a Match object,
# where Match.words is a list of Word objects that matched:
m = search("NP", s)
for word in m[0].words:
    print word.string, word.tag
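
# A lowercase search word, by contrast, is matched as a literal word rather
# than as a tag (a quick sketch using the same parsed sentence):
print search("rabbit", s)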
Example #6
]

for strSentence in sentList:

    for word, pos in tag(strSentence):
        if pos in ("VB", "VBD", "VBG", "VBN", "VBP",
                   "VBZ"):  # Retrieve all adjectives.
            print("=====================>>>>> ", word, pos)
        else:
            print(word, pos)

    print(strSentence)
    a = parse(strSentence, relations=True, lemmata=True)
    pprint(a)

    sentence = Sentence(a)
    print(sentence.verbs)
    print
    print

    #print(sentence.relations)
    #print(sentence.subjects)
    #print(sentence.objects)
    #print(sentence.verbs)
    #print(sentence.chunk)

    sentScore = sid.polarity_scores(strSentence)

    # sqlite3 insert : subject / objects / verbs / CPC / Sentiment
    #   genre, wordCount, filename, sentence
    #   subject : Chunk('he/NP-SBJ-1'), Chunk('you/NP-SBJ-2')]
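
    # A minimal sketch of the insert described above, storing just the sentence
    # and its compound sentiment score (database path, table and columns are
    # hypothetical, not part of the original code):
    import sqlite3
    con = sqlite3.connect("sentences.db")
    con.execute("CREATE TABLE IF NOT EXISTS sentences (sentence TEXT, compound REAL)")
    con.execute("INSERT INTO sentences (sentence, compound) VALUES (?, ?)",
                (strSentence, sentScore["compound"]))
    con.commit()
    con.close()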
Example #7
print pnp.string  # String of words (Unicode).
print pnp.chunks  # List of Chunk objects.
# print pnp.preposition            # First PP chunk in the PNP.
# sentiment
print sentiment(
    "The movie attempts to be surreal by incorporating various time paradoxes, "
    "but it's presented in such a ridiculous way it's seriously boring.")
print sentiment('Wonderfully awful! :-)').assessments
# modality
s = "Some amino acids tend to be acidic while others may be basic."  # weaseling
s = parse(s, lemmata=True)
s = Sentence(s)
print modality(s)
# wordnet
s = wordnet.synsets('bird')[0]
print 'Definition:', s.gloss  # Definition string.
print '  Synonyms:', s.synonyms  # List of word forms (i.e., synonyms)
print ' Hypernyms:', s.hypernyms()  # List of parent synsets (i.e., more general).
print ' Hypernyms:', s.hypernyms(recursive=False, depth=None)
print '  Hyponyms:', s.hyponyms()  # List of child synsets (i.e., more specific).
print '  Hyponyms:', s.hyponyms(recursive=False, depth=None)
print '  Holonyms:', s.holonyms()  # List of synsets of which this synset is a member.
print '  Meronyms:', s.meronyms()  # List of synsets that are members / parts.
print '       POS:', s.pos  # Part-of-speech: NOUN | VERB | ADJECTIVE | ADVERB.
Example #8
# (mail/spam, positive/negative, language, author's age, ...),
# you can predict the type of other "unknown" texts.
# The k-Nearest Neighbor algorithm classifies texts according
# to the k documents that are most similar (cosine similarity) to the given input document.

from pattern.web import Twitter
from pattern.en import Sentence, parse
from pattern.search import search
from pattern.vector import Document, Model, KNN

m = Model()
t = Twitter()

# First, we mine a model of 1,000 tweets.
# We'll use hashtags as the type.
for page in range(1, 10):
    for tweet in t.search('#win OR #fail', start=page, count=100, cached=True):
        # If the tweet contains #win hashtag, we'll set its type to 'WIN':
        s = tweet.text.lower()  # tweet in lowercase
        p = '#win' in s and 'WIN' or 'FAIL'  # document labels
        s = Sentence(parse(s))  # parse tree with part-of-speech tags
        s = search('JJ', s)  # adjectives in the tweet
        s = [match[0].string for match in s]  # adjectives as a list of strings
        s = " ".join(s)  # adjectives as string
        if len(s) > 0:
            m.append(Document(s, type=p, stemmer=None))

# Train k-Nearest Neighbor on the model.
# Note that this is only a simple example: to build a robust classifier
# you would need a lot more training data (e.g., tens of thousands of tweets).
# The more training data, the more statistically reliable the classifier becomes.
# The only way to really know if your classifier is working correctly
# is to test it with testing data, see the documentation for Classifier.test().
classifier = KNN(baseline=None)  # By default, baseline=MAJORITY
for document in m:  # (classify unknown documents with the most frequent type).
    classifier.train(document)
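
# The comments above mention testing with held-out data. A minimal, hedged sketch
# (hypothetical 70/30 split; uses KNN.train() / KNN.classify() and Document.type
# from pattern.vector):
documents = list(m)
split = int(len(documents) * 0.7)
knn = KNN()
for d in documents[:split]:
    knn.train(d)
correct = sum(1 for d in documents[split:] if knn.classify(d) == d.type)
print "held-out accuracy:", float(correct) / max(1, len(documents) - split)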
Example #9
 def test_match(self):
     # Assert Pattern.match()
     P = search.Pattern.fromstring
     X = search.STRICT
     S = lambda s: Sentence(parse(s, relations=True, lemmata=True))
     for i, (pattern, test, match) in enumerate((
       (P("^rabbit"),                  "white rabbit",     None),                  #  0
       (P("^rabbit"),                        "rabbit",     "rabbit"),              #  1
       (P("rabbit"),               "big white rabbit",     "rabbit"),              #  2
       (P("rabbit*"),              "big white rabbits",    "rabbits"),             #  3
       (P("JJ|NN"),              S("big white rabbits"),   "big"),                 #  4
       (P("JJ+"),                S("big white rabbits"),   "big white"),           #  5
       (P("JJ+ NN*"),            S("big white rabbits"),   "big white rabbits"),   #  6
       (P("JJ black|white NN*"), S("big white rabbits"),   "big white rabbits"),   #  7
       (P("NP"),                 S("big white rabbit"),    "big white rabbit"),    #  8
       (P("big? rabbit", X),     S("big white rabbit"),    "rabbit"),              #  9 strict
       (P("big? rabbit|NN"),     S("big white rabbit"),    "rabbit"),              # 10 explicit
       (P("big? rabbit"),        S("big white rabbit"),    "big white rabbit"),    # 11 greedy
       (P("rabbit VP JJ"),       S("the rabbit was huge"), "the rabbit was huge"), # 12
       (P("rabbit be JJ"),       S("the rabbit was huge"), "the rabbit was huge"), # 13 lemma
       (P("rabbit be JJ", X),    S("the rabbit was huge"), "rabbit was huge"),     # 14
       (P("rabbit is JJ"),       S("the rabbit was huge"), None),                  # 15
       (P("the NP"),             S("the rabid rodents"),   "the rabid rodents"),   # 16 overlap
       (P("t*|r*+"),             S("the rabid rodents"),   "the rabid rodents"),   # 17
       (P("(DT) JJ? NN*"),       S("the rabid rodents"),   "the rabid rodents"),   # 18
       (P("(DT) JJ? NN*"),       S("the rabbit"),          "the rabbit"),          # 19
       (P("rabbit"),             S("the big rabbit"),      "the big rabbit"),      # 20 greedy
       (P("eat carrot"),         S("is eating a carrot"),  "is eating a carrot"),  # 21
       (P("eat carrot|NP"),      S("is eating a carrot"),  "is eating a carrot"),  # 22
       (P("eat NP"),             S("is eating a carrot"),  "is eating a carrot"),  # 23
       (P("eat a"),              S("is eating a carrot"),  "is eating a"),         # 24
       (P("!NP carrot"),         S("is eating a carrot"),  "is eating a carrot"),  # 25
       (P("eat !pizza"),         S("is eating a carrot"),  "is eating a carrot"),  # 26
       (P("eating a"),           S("is eating a carrot"),  "is eating a"),         # 27
       (P("eating !carrot", X),  S("is eating a carrot"),  "eating a"),            # 28
       (P("eat !carrot"),        S("is eating a carrot"),  None),                  # 28 NP chunk is a carrot
       (P("eat !DT"),            S("is eating a carrot"),  None),                  # 30 eat followed by DT
       (P("eat !NN"),            S("is eating a carrot"),  "is eating a"),         # 31 a/DT is not NN
       (P("!be carrot"),         S("is eating a carrot"),  "is eating a carrot"),  # 32 is eating == eat != is
       (P("!eat|VP carrot"),     S("is eating a carrot"),  None),                  # 33 VP chunk == eat
       (P("white_rabbit"),       S("big white rabbit"),    None),                  # 34
       (P("[white rabbit]"),     S("big white rabbit"),    None),                  # 35
       (P("[* white rabbit]"),   S("big white rabbit"),    "big white rabbit"),    # 36
       (P("[big * rabbit]"),     S("big white rabbit"),    "big white rabbit"),    # 37
       (P("big [big * rabbit]"), S("big white rabbit"),    "big white rabbit"),    # 38
       (P("[*+ rabbit]"),        S("big white rabbit"),    None),                  # 39 bad pattern: "+" is literal
     )):
         m = pattern.match(test)
         self.assertTrue(getattr(m, "string", None) == match)
     # Assert chunk with head at the front.
     s = S("Felix the cat")
     self.assertEqual(P("felix").match(s).string, "Felix the cat")
     # Assert negation + custom greedy() function.
     s = S("the big white rabbit")
     g = lambda chunk, constraint: len([w for w in chunk if not constraint.match(w)]) == 0
     self.assertEqual(P("!white").match(s).string, "the big white rabbit") # a rabbit != white
     self.assertEqual(P("!white", greedy=g).match(s), None)                # a white rabbit == white
     # Assert taxonomy items with spaces.
     s = S("Bugs Bunny is a giant talking rabbit.")
     t = search.Taxonomy()
     t.append("rabbit", type="rodent")
     t.append("Bugs Bunny", type="rabbit")
     self.assertEqual(P("RABBIT", taxonomy=t).match(s).string, "Bugs Bunny")
     # Assert None, the syntax cannot handle taxonomy items that span multiple chunks.
     s = S("Elmer Fudd fires a cannon")
     t = search.Taxonomy()
     t.append("fire cannon", type="violence")
     self.assertEqual(P("VIOLENCE").match(s), None)
     # Assert regular expressions.
     s = S("a sack with 3.5 rabbits")
     p = search.Pattern.fromstring("[] NNS")
     p[0].words.append(re.compile(r"[0-9|\.]+"))
     self.assertEqual(p.match(s).string, "3.5 rabbits")
     print("pattern.search.Pattern.match()")
Example #10
import os, sys

sys.path.insert(0, os.path.join("..", "..", ".."))

from pattern.search import Pattern
from pattern.en import Sentence, parse

# Constraints ending in + match one or more words.
# Pattern.search() uses a "greedy" approach:
# it will attempt to match as many words as possible.

# The following pattern means:
# one or more words starting with "t",
# followed by one or more words starting with "f".
p = Pattern.fromstring("t*+ f*+")
s = Sentence(parse("one two three four five six"))
m = p.search(s)
print s
print m
print

for w in m[0].words:
    print w, "matches", m[0].constraint(w)

# Pattern.fromstring("*") matches each word in the sentence.
# This yields a list with a Match object for each word.
print
print "* =>", Pattern.fromstring("*").search(s)

# Pattern.fromstring("*+") matches all words.
# This yields a list with one Match object containing all words.
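
# For instance (a short sketch following the comment above):
print "*+ =>", Pattern.fromstring("*+").search(s)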
Example #11
print m
print("")

# Sentence chunks can be matched by tag (e.g. NP, VP, ADJP).
# The pattern below matches anything from
# "the rabbit gnaws at your fingers" to
# "the white rabbit looks at the carrots":
p = Pattern.fromstring("rabbit VP at NP", s)
m = p.search(s)
print(m)
print("")

if m:
    for w in m[0].words:
        print("%s\t=> %s" % (w, m[0].constraint(w)))

print("")
print("-------------------------------------------------------------")
# Finally, constraints can also include regular expressions.
# To include them we need to use the full syntax instead of the search() function:
import re

r = re.compile(r"[0-9|\.]+")  # all numbers
p = Pattern()
p.sequence.append(Constraint(words=[r]))
p.sequence.append(Constraint(tags=["NN*"]))

s = Sentence(parse("I have 9.5 rabbits."))
print(s)
print(p.search(s))
print("")
Example #12
import os, sys
sys.path.insert(0, os.path.join("..", "..", ".."))

from pattern.search import search, Pattern, Constraint
from pattern.en import Sentence, parse

# What we call a "search word" in example 01-search.py
# is actually called a constraint, because it can contain different options.
# Options are separated by "|".
# The next search pattern retrieves words that are a noun OR an adjective:
s = Sentence(parse("big white rabbit"))
print search("NN|JJ", s)
print

# This pattern yields phrases containing an adjective followed by a noun.
# Consecutive constraints are separated by a space:
print search("JJ NN", s)
print

# Or a noun preceded by any number of adjectives:
print search("(JJ)+ NN", s)
print

# Note: NN marks singular nouns, NNS marks plural nouns.
# If you want to include both, use "NN*" as a constraint.
# This works for NN*, VB*, JJ*, RB*.
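
# For instance, "NN*" also matches the plural (a quick sketch; the sentence is hypothetical):
print search("JJ NN*", Sentence(parse("big white rabbits")))
print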

s = Sentence(parse("When I sleep the big white rabbit will stare at my feet."))
m = search("rabbit stare at my", s)
print s
print m
Example #13
print lemma('running')
print conjugate('purred', '3sg')
print PAST in tenses('purred')  # 'p' in tenses() also works.
print(PAST, 1, PL) in tenses('purred')

print 'Quantification'

print quantify(['goose', 'goose', 'duck', 'chicken', 'chicken', 'chicken'])
print quantify('carrot', amount=90)
print quantify({'carrot': 100, 'parrot': 20})

print 'ngrams'
print ngrams("I am eating a pizza.", n=2)

#parse
s = parse('I eat pizza with a fork.')
pprint(s)

#tag
for word, t in tag('The cat felt happy.'):
    print word + ' is ' + t

s = "The movie attempts to be surreal by incorporating various time paradoxes, but it's presented in such a ridiculous way it's seriously boring."
print sentiment(s)
print polarity(s)
print subjectivity(s)

#The modality() function returns a value between -1.0 and +1.0, expressing the degree of certainty
s2 = "Some amino acids tend to be acidic while others may be basic."  # weaseling
se = Sentence(parse(s2, chunks=False, lemmata=True))
print modality(se)
Example #14
 def chunklist_(self):
     sentence_obj = Sentence(self.PARSE)
     return sentence_obj.chunks
Example #15
# Constraints wrapped in () are optional, matching one or no word.
# Pattern.search() uses a "greedy" approach: 
# it will attempt to include as many optional constraints as possible.

# The following pattern scans for words whose part-of-speech tag is NN (i.e. nouns).
# A preceding adjective, adverb or determiner is picked up as well.
p = Pattern.fromstring("(DT) (RB) (JJ) NN+")
for s in (
  "the cat",             # DT NN
  "the very black cat",  # DT RB JJ NN
  "tasty cat food",      # JJ NN NN
  "the funny black cat", # JJ NN
  "very funny",          # RB JJ => no match, since there is no noun.
  "my cat is black and your cat is white"): # NN + NN  
    s = Sentence(parse(s))
    m = p.search(s)
    print
    print s
    print m
    if m:
        for w in m[0].words:
            print w, "matches", m[0].constraint(w)

# Note: the above pattern could also be written as "(DT|RB|JJ)+ NN+"
# to include multiple adverbs/adjectives.
# By combining * () and + patterns can become quite complex.
# Optional constraints are useful for very specific patterns, but slow.
# Also, depending on which parser you use (e.g. MBSP), words can be tagged differently
# and may not match in the way you expect.
# Consider using a robust Pattern.fromstring("NP").
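
# A quick sketch of the "(DT|RB|JJ)+ NN+" alternative mentioned above
# (reusing one of the sentences from the loop):
p = Pattern.fromstring("(DT|RB|JJ)+ NN+")
print p.search(Sentence(parse("the very black cat")))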
Example #16
# Pattern.fromstring("rose|lily|daisy|daffodil|begonia").

# A better approach is to use the taxonomy:
for flower in ("rose", "lily", "daisy", "daffodil", "begonia"):
    taxonomy.append(flower, type="flower")

print taxonomy.children("flower")
print taxonomy.parents("rose")
print taxonomy.classify("rose")  # Yields the most recently added parent.
print

# Taxonomy terms can be included in a pattern:
p = Pattern([Constraint(taxa=["flower"])])  # or
p = Pattern.fromstring("FLOWER")

s = Sentence(parse("A field of white daffodils.", lemmata=True))
m = p.search(s)
print s
print m
print

from pattern.search import search
taxonomy.append("chicken", type="food")
taxonomy.append("chicken", type="bird")
taxonomy.append("penguin", type="bird")
taxonomy.append("bird", type="animal")
print taxonomy.parents("chicken")
print taxonomy.children("animal", recursive=True)
print search("FOOD", "I'm eating chicken.")
print
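
# As noted earlier, classify() yields the most recently added parent,
# so "chicken" (added as "food", then as "bird") should classify as "bird":
print taxonomy.classify("chicken")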