Example #1
from pattern.search import Pattern
from pattern.en import parsetree

def getDescription(article):
    # Parse the article's introduction and match "be DT ..." (e.g. "is a ...").
    intro = parsetree(article.sections[0].string, lemmata=True)
    pattern = Pattern.fromstring('be DT *+')
    match = pattern.match(intro)
    if match is None:
        # No determiner after "be"; fall back to the relaxed pattern.
        pattern = Pattern.fromstring('be *+')
        match = pattern.match(intro)
    return match.string
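# Hedged usage sketch: getDescription() assumes an article object exposing
# .sections whose items have a .string, e.g. pattern.web's Wikipedia articles.
from pattern.web import Wikipedia
article = Wikipedia().search('Alan Turing')
print getDescription(article)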
Example #2

    def _extract_reporters(self):
        """ Extract the reporters and entities from those sentences of the text
            where a reported-speech verb is used.
        """
        # search for those sentences with reported speech verbs
        sentences = [s for s in self.__tree if search('RPTVRB|según', s)]
        # search for proper nouns that are not locations
        pattern = Pattern.fromstring('!LOCATION|NNP+',
                                     STRICT, taxonomy=TAXONOMY)

        for s in sentences:
            matches = pattern.search(s)

            for m in matches:
                for w in m.words:
                    # chunks with roles (SBJ, OBJ) connected to a reporter verb
                    if ((w.chunk.role is not None) and
                        (w.chunk.verb.head.lemma in taxonomy)):
                        if self._is_composed_noun(w):
                            self.__reporters.append(w.previous())
                        self.__reporters.append(w)
                    # proper nouns not spotlighted as reported
                    else:
                        if self._is_composed_noun(w):
                            self.__entities.append(w.previous())
                        self.__entities.append(w)
Example #3
from pattern.en import Sentence, parse
from pattern.search import Pattern

def myExtract(statement):
    s = Sentence(parse(statement, relations=True, lemmata=True, light=True))
    p = Pattern.fromstring('There be DT NN+')
    return p.search(s)
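# Quick usage sketch: 'There be DT NN+' matches existential sentences, e.g.:
print myExtract("There is a black cat in the garden.")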
Example #4
def notneeded(word):
    # rmpat is assumed to be a list of POS-pattern strings defined elsewhere.
    print word,
    for pos in rmpat:
        p = Pattern.fromstring(pos)
        if p.scan(word):  # quick pre-check whether the pattern can occur
            print " " + pos
            return True
    return False
Example #5
def test_pattern():
    from pattern.search import Pattern
    from pattern.en import parsetree

    t = parsetree('Chuck Norris is cooler than Dolph.', lemmata=True)
    p = Pattern.fromstring('{NP} be * than {NP}')
    m = p.match(t)
    print m.group(1)
    print m.group(2)
    print t
Example #6
from pattern.search import Pattern
from pattern.en import parsetree
from pattern.text.tree import Text  # the parse-tree class returned by parsetree()

def pattern_match(pattern, sentence):
    # Accept either a plain string or an already-parsed tree.
    if not isinstance(sentence, Text):
        sentence = parsetree(sentence, lemmata=True)
    p = Pattern.fromstring(pattern)
    try:
        return p.match(sentence)
    except Exception:
        return None
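# Quick usage sketch (illustrative input); returns the first Match or None:
print pattern_match('DT JJ NN', 'The lazy dog sleeps.')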
Example #7

    def _extract_sources(self):
        """ Extract the well-known sources from the text.
        """
        # search for well-known sources in the tree
        pattern = Pattern.fromstring('SOURCE', STRICT, taxonomy=TAXONOMY)

        for sentence in self.__tree:
            matches = pattern.search(sentence)

            for m in matches:
                for w in m.words:
                    self.__sources.append(w)
Example #8
def __init__(self,
             entity_type,
             expression,
             variables=None,
             negation=False,
             taxonomy=None,
             strict=False,
             exclude=None):
    self.entity_type = entity_type
    self.expression = expression
    if variables is None:
        variables = {}
    self.variables = variables
    if negation is None:
        negation = False
    self.negation = negation
    self.taxonomy = taxonomy
    self.strict = strict
    self.exclude = set() if exclude is None else set(exclude)
    self.pattern = Pattern.fromstring(expression,
                                      taxonomy=taxonomy,
                                      strict=strict)
Example #9
def __init__(self,
             expression,
             type,
             config=None,
             negation=False,
             taxonomy=None,
             strict=False,
             exclude=None):
    self.expression = expression
    self.type = type
    self.taxonomy = taxonomy
    if config is None:
        config = {'value': 1}
    self.config = config
    if negation is None:
        negation = False
    self.negation = negation
    self.pattern = Pattern.fromstring(expression,
                                      taxonomy=taxonomy,
                                      strict=strict)
    self.strict = strict
    self.exclude = set() if exclude is None else set(exclude)
Example #10
from pattern.search import search, Pattern
from pattern.en import Sentence, parse

s = Sentence(parse("When I sleep the big white rabbit will stare at my feet."))
m = search("rabbit stare at my", s)
print s
print m
print
# Why does this work?
# The word "will" is included in the result, even though the pattern does not define it.
# You might expect the match to fail because "stare" does not directly follow "rabbit".
# It works because "will stare" is one verb chunk:
# the "stare" constraint matches the head word of the chunk ("stare"),
# so "will stare" is considered an overspecified version of "stare".
# The same happens with the "rabbit" constraint,
# which matches the overspecified chunk "the big white rabbit".
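# A quick way to see this: print each chunk with its head word (sketch).
for chunk in s.chunks:
    print chunk.type, chunk.head.string, "<=", chunk.string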

p = Pattern.fromstring("rabbit stare at my", s)
p.strict = True # Now it matches only what the pattern explicitly defines.
m = p.search(s)
print m
print

# Sentence chunks can be matched by tag (e.g. NP, VP, ADJP).
# The pattern below matches anything from
# "the rabbit gnaws at your fingers" to
# "the white rabbit looks at the carrots":
p = Pattern.fromstring("rabbit VP at NP", s)
m = p.search(s)
print m
print

if m:
    for w in m[0].words:
        print w, "matches", m[0].constraint(w)
Example #11

import pattern.search as PS
from pattern.search import Pattern
from pattern.en import parsetree

taxonomy = PS.Taxonomy()
taxonomy.append('looks', type='perception')
taxonomy.append('appears', type='perception')


s = "Kiko foreign glitter that looks great in the shade."
s = "I'm also thinking this polish would look amazing over black!"
s = "Oh this is a great brush. Fluffy soft bristles and works like a charm."

pattern = Pattern.fromstring('{SBJ?} * {PERCEPTION} * {JJ?} * {OBJ?} {OBJ?}', taxonomy=taxonomy, strict=True)

documents = [s]  # the corpus; here just the sample sentence above

for document in documents:
    parsed = parsetree(document, lemmata=True, relations=True)
    for sentence in parsed.sentences:
        matches = pattern.search(sentence)
        if matches:
            print sentence.string
            for match in matches:
                for c in match.constituents():
                    print c
            print ''

Example #12

# Example of pattern: http://www.clips.ua.ac.be/pages/pattern

from pattern.web    import Bing, plaintext
from pattern.en     import Sentence, Chunk, parse
from pattern.search import Pattern
from pattern.graph  import Graph, Node, Edge, export
 
g = Graph()
for i in range(1):
    print "--------------", i
    for r in Bing().search('"more important than"', start=i+1, count=50):
        s = plaintext(r.description.lower())
        print s
        s = Sentence(parse(s))
        print s    
        p = Pattern.fromstring('NP (VP) more important than NP')
        for m in p.search(s):
            a = m.constituents(p[+0])[-1] # Left NP.
            b = m.constituents(p[-1])[+0] # Right NP.
            a = (isinstance(a, Chunk) and a.head or a).string
            b = (isinstance(b, Chunk) and b.head or b).string
            if a and b:
                if a not in g:
                    g.add_node(a, radius=5, stroke=(0,0,0,0.8))
                if b not in g:
                    g.add_node(b, radius=5, stroke=(0,0,0,0.8))
                g.add_edge(g[b], g[a], stroke=(0,0,0,0.6))

g = g.split()[0] # Largest subgraph.
 
for n in g.sorted()[:40]: # Sorted by Node.weight.
    n.fill = (0.0, 0.5, 1.0, 0.7 * n.weight)
Example #13
        sentence = " ".join(word_list)
        output.append(sentence)

        # print "\n", sentence, "\n\n"
    return output

results = find_all_matches_by_ziyu(whole_text, 'VB JJ NNS|NN')
print "RESULTS:"
for result in results:
    print result

sys.exit()

t = parsetree('Dolph Lundgren is cooler than Frank.', lemmata=True)
# p = Pattern.fromstring('{VB} {TO} {VB} {IN} {NN}')
p = Pattern.fromstring('{V*}')

m = p.match(t)
print m

sys.exit()
# NAME OF DRUG
# take the title's first word, first three letters
# combine with suffixes ["phrin","ytril","syn","xyzal","yrhil","nexx"]

print('\n' * 4)
print " "*10 + "NAME OF DRUG" + ":"


# SHORT DESCRIPTION OF DRUG
# example: 100% grass fed supplement for cultural materialism
Example #14
import os, sys; sys.path.insert(0, os.path.join("..", "..", ".."))

from pattern.search import Pattern
from pattern.en     import Sentence, parse

# Constraints wrapped in () are optional, matching one or no word.
# Pattern.search() uses a "greedy" approach: 
# it will attempt to include as many optional constraints as possible.

# The following pattern scans for words whose part-of-speech tag is NN (i.e. nouns).
# A preceding determiner, adverb or adjective is picked up as well.
p = Pattern.fromstring("(DT) (RB) (JJ) NN+")
for s in (
  "the cat",             # DT NN
  "the very black cat",  # DT RB JJ NN
  "tasty cat food",      # JJ NN NN
  "the funny black cat", # JJ NN
  "very funny",          # RB JJ => no match, since there is no noun.
  "my cat is black and your cat is white"): # NN + NN  
    s = Sentence(parse(s))
    m = p.search(s)
    print
    print s
    print m
    if m:
        for w in m[0].words:
            print w, "matches", m[0].constraint(w)

# Note: the above pattern could also be written as "(DT|RB|JJ)+ NN+"
# to include multiple adverbs/adjectives.
# By combining * () and + patterns can become quite complex.
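# A minimal sketch of that alternative form (same imports as above):
p2 = Pattern.fromstring("(DT|RB|JJ)+ NN+")
print p2.search(Sentence(parse("the very funny black cat")))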
Example #15
from pattern.search import Pattern, Constraint, taxonomy
from pattern.en import Sentence, parse

# The pattern.search module includes a Taxonomy class
# that can be used to define semantic word types.
# For example, consider that you want to extract flower names from a text.
# Enumerating them all in a pattern quickly becomes unwieldy, e.g.:
# Pattern.fromstring("rose|lily|daisy|daffodil|begonia").

# A better approach is to use the taxonomy:
for flower in ("rose", "lily", "daisy", "daffodil", "begonia"):
    taxonomy.append(flower, type="flower")
    
print taxonomy.children("flower")
print taxonomy.parents("rose")
print taxonomy.classify("rose") # Yields the most recently added parent.
print
    
# Taxonomy terms can be included in a pattern:
p = Pattern([Constraint(taxa=["flower"])]) # or
p = Pattern.fromstring("FLOWER")

s = Sentence(parse("A field of white daffodils.", lemmata=True))
m = p.search(s)
print s
print m
print

from pattern.search import search
taxonomy.append("chicken", type="food")
taxonomy.append("chicken", type="bird")
taxonomy.append("penguin", type="bird")
taxonomy.append("bird", type="animal")
print taxonomy.parents("chicken")
print taxonomy.children("animal", recursive=True)
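# Since pattern constraints fall back to the global taxonomy, the types
# defined above can be used in a pattern string right away (sketch):
s = Sentence(parse("The penguin ate a chicken.", lemmata=True))
print search("BIRD", s)  # should match both "penguin" and "chicken"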
Example #16
import os, sys; sys.path.insert(0, os.path.join("..", ".."))

from pattern.search import Pattern
from pattern.en     import Sentence, parse

# Constraints ending in + match one or more words.
# Pattern.search() uses a "greedy" approach: 
# it will attempt to match as many words as possible.

# The following pattern means:
# one or more words starting with "t", 
# followed by one or more words starting with "f".
p = Pattern.fromstring("t*+ f*+")
s = Sentence(parse("one two three four five six"))
m = p.search(s)
print s
print m
print

for w in m[0].words:
    print w, "matches", m[0].constraint(w)

# Pattern.fromstring("*") matches each word in the sentence.
# This yields a list with a Match object for each word.
print
print "* =>",  Pattern.fromstring("*").search(s)

# Pattern.fromstring("*+") matches all words.
# This yields a list with one Match object containing all words.
print
print "*+ =>", Pattern.fromstring("*+").search(s)
Example #17
from pattern.web    import Bing, plaintext
from pattern.en     import Sentence, parse
from pattern.search import Pattern
from pattern.db     import Datasheet, pprint

# "X IS MORE IMPORTANT THAN Y"
# Here is a rough example of how to build a web miner.
# It mines comparative statements from Bing and stores the results in a table,
# which can be saved as a text file for further processing later on.

# Pattern matching also works with Sentence objects from the MBSP module.
# MBSP's parser is much more robust (but also slower).
#from MBSP import Sentence, parse

q = '"more important than"'          # Bing search query
p = "NP (VP) more important than NP" # Search pattern.
p = Pattern.fromstring(p)
d = Datasheet()

engine = Bing(license=None)
for i in range(1): # max=10
    for result in engine.search(q, start=i+1, count=100, cached=True):
        s = result.description
        s = plaintext(s)
        s = Sentence(parse(s))
        for m in p.search(s):
            a = m.constituents(constraint=0)[-1] # Left NP.
            b = m.constituents(constraint=5)[ 0] # Right NP.
            d.append((
                a.string.lower(),
                b.string.lower()))

pprint(d)  # print the collected table (pprint is imported above)
Example #19
import os, sys; sys.path.insert(0, os.path.join("..", "..", ".."))

from pattern.search import search, Pattern, Constraint
from pattern.en     import Sentence, parse

# This example demonstrates an interesting search pattern that mines for comparisons.
# Notice the use of the constraint "be".
# If the output from the parser includes word lemmas (e.g. "doing" => "do")
# these will also be matched. Using "be" then matches "is", "being", "are", ...
# and if underspecification is used "could be", "will be", "definitely was", ...

p = Pattern.fromstring("NP be (more) ADJP|ADVP than NP")

for s in (
  "the turtle was faster than the hare",
  "Arnold Schwarzenegger is more dangerous than Dolph Lundgren"):
  s = Sentence(parse(s, lemmata=True)) # parse lemmas
    m = p.search(s)
    print s
    print
    print m
    print
    if m:
        print m[0].constituents()                   # Words grouped by chunk whenever possible.
        print m[0].constraints(chunk=s.chunks[0])   # The constraints that match the given chunk.
        print m[0].constituents(constraint=p[0])    # Constituents for the given constraint.
        print m[0].constituents(constraint=[0,3,5]) # Constituents for the given constraint indices.
        print
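# The underspecified forms mentioned above can be checked the same way
# (sketch; whether it matches depends on how the parser chunks "could be"):
s2 = Sentence(parse("The hare could be faster than the turtle.", lemmata=True))
print p.search(s2)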
        
Example #20
	def process(self,results, pattern, download):
		risk_results = []
		body = ''
		for r in results:
			p = Pattern.fromstring(pattern)
			url = URL(r.url)
			s = Sentence(parse(r.description))
			p_search = p.search(s)
			
			if download == DownloadType.Full or (download == DownloadType.Dynamic and len(p_search) == 0):
				try:
					# mimetype checks and URL downloads can throw 4xx/5xx errors; in those
					# cases we escape the exception gracefully without halting the search.
					if url.mimetype == "text/html":
						body = str(r.download(timeout=110, cached=True, proxy=None).encode("utf-8"))
						body = plaintext(body)
				except Exception:
					# There are cases where plaintext() fails to extract just the text; we catch
					# that exception. The body then remains HTML, so a pattern search over it
					# may not be reliable. Our choices are to skip this result entirely or to
					# attempt to extract the pattern anyway. For now we skip this search result.
					continue
			
				p_search = p.search(Sentence(parse(body)))

			else:
				body = ''
		
			result = Result(url=None)
			result.url = url
			result.url_content = (body or "")
			result.query = r.query
			result.sentence = s

			risky_terms = []

			for m in p_search:
				
				rightNP = ''
				for chunk in m.constituents(p[-1]): #Right NP, get all NP elements in the list
					rightNP += chunk.string + " "
				
				risky_terms.append(rightNP)
				
				"""
				leftNP = m.constituents(p[+0])[-1] # Left NP.
				leftNP = (isinstance(leftNP, Chunk) and leftNP.head or leftNP).string

				c = leftNP

				if leftNP and rightNP:
					if leftNP not in g:
						g.add_node(leftNP, radius=4, stroke=(0,0,0,0.8))
					if rightNP not in g:
						g.add_node(rightNP, radius=4, stroke=(0,0,0,0.8))
					if c not in g:
						g.add_node(c, radius=4, stroke=(0,0,0,0.8))

					g.add_edge(g[leftNP], g[c], stroke=(0,0,0,0.6))
					g.add_edge(g[c], g[rightNP], stroke=(0,0,0,0.6))
				 """
			
			if len(risky_terms) > 0:
				result.risky_terms = risky_terms
				risk_results.append(result)

		"""
		g = g.split()[0] # Largest subgraph.

		for n in g.sorted()[:40]: # Sorted by Node.weight.
			n.fill = (0.0, 0.5, 1.0, 0.7 * n.weight)
		export(g, 'testtest', directed=True, weighted=0.6, distance=14, force=0.05, repulsion=150)
		"""

		return risk_results
Example #21
def __init__(self, expression, type, taxonomy=None):
    self.expression = expression
    self.type = type
    self.taxonomy = taxonomy
    self.pattern = Pattern.fromstring(expression, taxonomy=taxonomy)
Example #25
from pattern.search import Pattern, search
from pattern.en import parsetree

s = parsetree("When I sleep the big white rabbit will stare at my feet.")
m = search("rabbit stare at feet", s)
print(s)
print(m)
print()
# Why does this work?
# The word "will" is included in the result, even though the pattern does not define it.
# You might expect the match to fail because "stare" does not directly follow "rabbit".
# It works because "will stare" is one verb chunk:
# the "stare" constraint matches the head word of the chunk ("stare"),
# so "will stare" is considered an overspecified version of "stare".
# The same happens with "my feet" and the "rabbit" constraint,
# which matches the overspecified chunk "the big white rabbit".

p = Pattern.fromstring("rabbit stare at feet", s)
# Now it matches only what the pattern explicitly defines (=no match).
p.strict = True
m = p.search(s)
print(m)
print()

# Sentence chunks can be matched by tag (e.g. NP, VP, ADJP).
# The pattern below matches anything from
# "the rabbit gnaws at your fingers" to
# "the white rabbit looks at the carrots":
p = Pattern.fromstring("rabbit VP at NP", s)
m = p.search(s)
print(m)
print()
Example #26
from pattern.search import Pattern
from pattern.en import parsetree

t = parsetree('Chuck Norris is cooler than Dolph Lundgren.', lemmata=True)
p = Pattern.fromstring('{NP} be * than {NP}')
m = p.match(t)
print m.group(1)
print m.group(2)





from nltk.stem.wordnet import WordNetLemmatizer
lmtzr = WordNetLemmatizer()
print lmtzr.lemmatize('humidity')


from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()
print st.stem('humidity')