Example #1
 def test_document(self):
     # Assert Document properties.
     # Test with different input types.
     for constructor, w in (
             (vector.Document, "The cats sit on the mat."),
             (vector.Document, ["The", "cats", "sit", "on", "the", "mat"]),
             (vector.Document, {"cat": 1, "mat": 1, "sit": 1}),
             (vector.Document, Text(parse("The cats sat on the mat."))),
             (vector.Document, Sentence(parse("The cats sat on the mat.")))):
         # Test copy.
         v = constructor(
             w, stemmer=vector.LEMMA, stopwords=False, name="Cat", type="CAT")
         v = v.copy()
         # Test properties.
         self.assertEqual(v.name, "Cat")
         self.assertEqual(v.type, "CAT")
         self.assertEqual(v.count, 3)
         self.assertEqual(v.terms, {"cat": 1, "mat": 1, "sit": 1})
         # Test iterator decoration.
         self.assertEqual(sorted(v.features), ["cat", "mat", "sit"])
         self.assertEqual(sorted(v), ["cat", "mat", "sit"])
         self.assertEqual(len(v), 3)
         self.assertEqual(v["cat"], 1)
         self.assertEqual("cat" in v, True)
     print("pattern.vector.Document")
Example #2
 def test_modality(self):
     # Assert -1.0 => +1.0 representing the degree of certainty.
     v = en.modality(en.Sentence(en.parse("I wish it would stop raining.")))
     self.assertTrue(v < 0)
     v = en.modality(
         en.Sentence(en.parse("It will surely stop raining soon.")))
     self.assertTrue(v > 0)
     # Assert the accuracy of the modality algorithm.
     # Given are the scores for the CoNLL-2010 Shared Task 1 Wikipedia uncertainty data:
     # http://www.inf.u-szeged.hu/rgai/conll2010st/tasks.html#task1
     # The baseline should increase (not decrease) when the algorithm is
     # modified.
     from pattern.db import Datasheet
     from pattern.metrics import test
     sentences = []
     for certain, sentence in Datasheet.load(os.path.join(PATH, "corpora", "uncertainty-conll2010.csv")):
         sentence = en.parse(sentence, chunks=False, light=True)
         sentence = en.Sentence(sentence)
         sentences.append((sentence, int(certain) > 0))
     A, P, R, F = test(
         lambda sentence: en.modality(sentence) > 0.5, sentences)
     #print(A, P, R, F)
     self.assertTrue(A > 0.69)
     self.assertTrue(P > 0.72)
     self.assertTrue(R > 0.64)
     self.assertTrue(F > 0.68)
     print("pattern.en.modality()")
Example #3
  def update_with_sentences(self, user_input, text):
    combined_text = user_input + ' ' + text
    # Parse sentences (lazily doing user parsing twice)
    parsed_combined = parse(combined_text, chunks = False)
    parsed_user = parse(user_input, chunks = False)
    # Take out new lines
    parsed_combined = parsed_combined.replace('\n', ' ')

    self.topic_state.update_with_sentences(parsed_combined)
    self.character_state.update_with_sentences(parsed_user)
Example #4
def testParse2():
    
    from pattern.en import parse
    result = parse('I eat pizza with a fork.')
    result = parse('I eat pizza with a fork. I ate pizza.', tokenize=True, split=True  )
   
    for s in result :
        print s
        print "-----------"
    return
Example #5
def testParse3():
    
    from pattern.en import parse
    result = parse('I eat pizza with a fork.')
    result = parse('The new Control Center design might not be final, or it might even go back to the old design. ', tokenize=True, chunks=True, split=True  )
   
    for s in result :
        print s
        print "-----------"
    return
Example #6
 def test_mood(self):
     # Assert imperative mood.
     v = en.mood(en.Sentence(en.parse("Do your homework!")))
     self.assertEqual(v, en.IMPERATIVE)
     # Assert conditional mood.
     v = en.mood(en.Sentence(en.parse("We ought to help him.")))
     self.assertEqual(v, en.CONDITIONAL)
     # Assert subjunctive mood.
     v = en.mood(en.Sentence(en.parse("I wouldn't do that if I were you.")))
     self.assertEqual(v, en.SUBJUNCTIVE)
     # Assert indicative mood.
     v = en.mood(en.Sentence(en.parse("The weather is nice today.")))
     self.assertEqual(v, en.INDICATIVE)
     print "pattern.en.mood()"
Example #7
def testParse():
    
    from pattern.en import parse
    result = parse('I eat pizza with a fork.')
    result = parse('I eat pizza with a fork. I ate pizza.', tokenize=True )
        
    
    for s in result.split():
        print s
    return
    
    print type(result)
    print isinstance(result, unicode)
    print isinstance(result, basestring)
    print result.tags
Example #8
 def test_find_prepositions(self):
     # Assert preposition tag annotation (PP + NP).
     v = en.parser.find_prepositions([
         ["", "", "NP"],
         ["", "", "VP"],
         ["", "", "PP"],
         ["", "", "NP"], 
         ["", "", "NP"],])
     self.assertEqual(v, [
         ["", "", "NP", "O"], 
         ["", "", "VP", "O"], 
         ["", "", "PP", "B-PNP"], 
         ["", "", "NP", "I-PNP"], 
         ["", "", "NP", "I-PNP"]])
     # Assert PNP's with consecutive PP's.
     v = en.parse("The cat was looking at me from up on the roof with interest.", prepositions=True)
     self.assertEqual(v,
         "The/DT/B-NP/O cat/NN/I-NP/O " \
         "was/VBD/B-VP/O looking/VBG/I-VP/O " \
         "at/IN/B-PP/B-PNP me/PRP/B-NP/I-PNP " \
         "from/IN/B-PP/B-PNP up/IN/I-PP/I-PNP on/IN/I-PP/I-PNP the/DT/B-NP/I-PNP roof/NN/I-NP/I-PNP " \
         "with/IN/B-PP/B-PNP interest/NN/B-NP/I-PNP " \
         "././O/O"
     )
     print "pattern.en.parser.find_prepositions()"
Example #9
def myExtract(statement):

  s = Sentence(parse(statement, relations=True, lemmata=True, light=True))
  p = Pattern.fromstring('There be DT NN+')
  match = p.search(s)
  #raise Exception(match)
  return match
Example #10
 def test_search_function(self):
     # Assert search() function.
     s = Sentence(parse("Go on Bors, chop his head off!"))
     m = search.search("PRP*? NN*", s)
     self.assertEqual(m[0].string, "Bors")
     self.assertEqual(m[1].string, "his head")
     print "pattern.search.search()"
Example #11
def test_findVerb():
    from pattern.en import parse, Text, Sentence
    from pattern.en import pprint 
    
   
    sent = "Bachelor's in Computer Science, Information Systems or a related study, is required."
    sent = 'I ate pizza.'
    sent = "Bachelor's in Computer Science is required."
    sent = "Bachelor 's Degree or 4 years equivalent professional experience ."
    sent = "A Master ’ s Degree or equivalent in Electrical Engineering , Computer Science , or other technical/engineering field with related programming experience and applicable work experience is required ."
    sent = "A Master's Degree or equivalent in Electrical Engineering , Computer Science , or other technical/engineering field with related programming experience and applicable work experience is required ."
    sent = "Bachelor ’ s degree in Computer Science or equivalent"
    sent = "Bachelor ' s degree in Computer Science or equivalent"
       
    
    result = parse(sent,
         tokenize = True,  # Tokenize the input, i.e. split punctuation from words.
             tags = True,  # Find part-of-speech tags.
            )
    pprint(result) 
    
  #  print type(result)
  #  print result         
    sen = Sentence(result)
  #  for word in sen:
 #       print word, word.type
    
    vlist = [ word.string for word in sen if word.type.startswith("V") ]
    print vlist
Example #12
def add_modality(tdb):
        for tweet in tdb:
                s = parse(tweet[2], lemmata=True)
                s = Sentence(s)
                (form, score) = (mood(s), modality(s))
                tweet.extend((form, score))
        return tdb
Example #13
 def trainVectorizers(self,document):
     '''
     Train the Vectorizers with a document that should be tokenized into sentences and words
     
     **Warning: All listed items will be concatenated to a single matrix**
     
     *Required Parameters*
     :param document: the document (text) or list of documents (file paths) to build count and tfidf vectorizers with (be as representative as possible)
     '''
     
     self.buildVectorizer('count')
     self.buildVectorizer('tfidf')
     
     if type(document) is str:
         self.__tfidf.fit(self.__vectorizer.fit_transform(document))
     else:
         uvecs=None
         sentences=[]
         for doc in document:
             if os.path.exists(doc) is True:
                 sentences=[]
                  with open(doc, 'r') as fp:
                     sentences.extend([parse(x,tags=False,chunks=False).split(" ") for x in self.__sent_tokenizer.tokenize(fp.read())])
     
         if uvecs is not None:
              self.__tfidf.fit(self.__vectorizer.fit_transform(sentences))
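The vectorizers built by buildVectorizer() are not shown here; assuming they are scikit-learn's CountVectorizer and TfidfTransformer (an assumption, not confirmed by the snippet), the intended flow can be sketched standalone like this:

from pattern.en import parse
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

docs = ["The cats sit on the mat.", "I eat pizza with a fork."]
tokenized = [parse(d, tags=False, chunks=False) for d in docs]  # plain tokenized strings

counter = CountVectorizer()
counts = counter.fit_transform(tokenized)   # document-term count matrix
tfidf = TfidfTransformer().fit(counts)      # learn tf-idf weights from the counts
print(tfidf.transform(counts).shape)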
Example #14
 def setUp(self):
     # Parse sentences to test on.
     # Creating a Text creates Sentence, Chunk, PNP and Word.
     # Creating a Sentence tests Sentence.append() and
     # Sentence.parse_token().
     self.text = "I'm eating pizza with a fork. What a tasty pizza!"
     self.text = en.Text(en.parse(self.text, relations=True, lemmata=True))
Example #15
 def test_match(self):
     # Assert Match properties.
     s = Sentence(parse("Death awaits you all with nasty, big, pointy teeth."))
     p = search.Pattern(sequence=[
         search.Constraint(tags=["JJ"], optional=True),
         search.Constraint(tags=["NN*"])])
     m = p.search(s)
     self.assertTrue(isinstance(m, list))
     self.assertEqual(m[0].pattern, p)
     self.assertEqual(m[1].pattern, p)
     self.assertEqual(m[0].words, [s.words[0]])
     self.assertEqual(m[1].words, [s.words[-3], s.words[-2]])
      # Assert constraint "NN*" links to "Death" and "teeth", and "JJ" to "pointy".
     self.assertEqual(m[0].constraint(s.words[ 0]), p[1])
     self.assertEqual(m[1].constraint(s.words[-3]), p[0])
     self.assertEqual(m[1].constraint(s.words[-2]), p[1])
     # Assert constraints "JJ NN*" links to chunk "pointy teeth".
     self.assertEqual(m[1].constraints(s.chunks[6]), [p[0], p[1]])
     # Assert Match.constituents() by constraint, constraint index and list of indices.
     self.assertEqual(m[1].constituents(), [s.chunks[6]])
     self.assertEqual(m[1].constituents(constraint=p[0]), [s.words[-3]])
     self.assertEqual(m[1].constituents(constraint=1), [s.words[-2]])
     self.assertEqual(m[1].constituents(constraint=(0,1)), [s.chunks[6]])
     # Assert Match.string.
     self.assertEqual(m[1].string, "pointy teeth")
     print "pattern.search.Match"
Example #16
 def test_sentiment(self):
     # Assert < 0 for negative adjectives and > 0 for positive adjectives.
     self.assertTrue(en.sentiment("wonderful")[0] > 0)
     self.assertTrue(en.sentiment("horrible")[0] < 0)
     self.assertTrue(en.sentiment(en.wordnet.synsets("horrible", pos="JJ")[0])[0] < 0)
     self.assertTrue(en.sentiment(en.Text(en.parse("A bad book. Really horrible.")))[0] < 0)
     # Assert that :) and :( are recognized.
     self.assertTrue(en.sentiment(":)")[0] > 0)
     self.assertTrue(en.sentiment(":(")[0] < 0)
     # Assert the accuracy of the sentiment analysis (for the positive class).
     # Given are the scores for Pang & Lee's polarity dataset v2.0:
     # http://www.cs.cornell.edu/people/pabo/movie-review-data/
     # The baseline should increase (not decrease) when the algorithm is modified.
     from pattern.db import Datasheet
     from pattern.metrics import test
     reviews = []
     for score, review in Datasheet.load(os.path.join(PATH, "corpora", "polarity-en-pang&lee1.csv")):
         reviews.append((review, int(score) > 0))
     A, P, R, F = test(lambda review: en.positive(review), reviews)
     self.assertTrue(A > 0.755)
     self.assertTrue(P > 0.760)
     self.assertTrue(R > 0.747)
     self.assertTrue(F > 0.754)
     # Assert the accuracy of the sentiment analysis on short text (for the positive class).
     # Given are the scores for Pang & Lee's sentence polarity dataset v1.0:
     # http://www.cs.cornell.edu/people/pabo/movie-review-data/
     reviews = []
     for score, review in Datasheet.load(os.path.join(PATH, "corpora", "polarity-en-pang&lee2.csv")):
         reviews.append((review, int(score) > 0))
     A, P, R, F = test(lambda review: en.positive(review), reviews)
     self.assertTrue(A > 0.642)
     self.assertTrue(P > 0.653)
     self.assertTrue(R > 0.607)
     self.assertTrue(F > 0.629)
     print "pattern.en.sentiment()"
Example #17
File: faq.py Project: VRDate/twss
def process(statement,database_name = DATABASE_NAME):
  ''' Allows us to create entities via statements like "There is a course CSCI4702 called Mobile Programming" 
      and modify entities with statements like "CSCI4702 has a start date of Jan 31st 2013"
      
      A statement like "There is a game engine Unity3d" already gives us trouble:
      it seems we need named entity recognition to extract types like that ... or perhaps rely on capitalization,
      which doesn't really work for things like CTO as a category of items, hmm
      
      >>> sent = "There is a game engine Unreal Engine".split()
      >>> print nltk.ne_chunk(nltk.pos_tag(sent))
      '''
  # this runs real fast, but it doesn't quite get the NN/NNP combination I hoped for from "There is a game engine Unity3D"
  # although it does now with light=True setting, but now it doesn't get the NNP in "There is a game engine Source"

  s = parse(statement, relations=True, lemmata=True, light=True) 
  s = split(s)

  #result = search('There be DT NN+ (DT) (RB) (JJ) NNP+ (call) (DT) (RB) (JJ) (NNPS|NNP)+', s)
  s, result = extract(statement)
  if result:
    #try:
      noun = search('(NN)+', s)[0].string
      table = pluralize(noun.replace(' ','_'))
      result = search('(JJ|NNPS|NNP)+', s) # this pulls in adjectives, but there's supposed to be a better fix coming
      ident = result[0].string
      name = result[1].string if len(result) > 1 else ident
      #raise Exception(table+"; "+ident+"; "+name)
      return newTable(table,ident,name,database_name)
    #except:
      #return regexMatch(statement,database_name)
  else:
    return regexMatch(statement,database_name)
Example #18
def tokenize_line(line):
    res = line.lower()
    for regexp, replacement in TOKEN_MAPPINGS:
        res = regexp.sub(replacement, res)

    sentence = parse(res,tokenize=True,tags=False, chunks=False, relations= False, lemmata=True).split()

    # initialize the Variables
    allowed_tags = re.compile('(NN|VB|JJ|RB)')
    stopwords = frozenset()
    min_length = 2
    max_length = 15
    result = []

    # lemmatization of the words
    try:
        sentence = sentence[0]
    except IndexError:
        pass

    for token, tag, lemma in sentence:
        if min_length <= len(lemma) <= max_length and lemma not in stopwords:
            if allowed_tags.match(tag):
                lemma += "/" + tag[:2]
                result.append(lemma.encode('utf8'))
    res = result
    logging.info("That's how res looks %s" %res)
    return res
Example #19
 def getData(self, params):
     if self.now_cache is not None:
         if (self.now_cache + datetime.timedelta(minutes=5)) < datetime.datetime.now():
             self.data_cache = None
             self.today_cache = None
             self.now_cache = None
     if self.data_cache is None:
         tweets = []
         for cand in candidates:
             tweets.append({'tweets': api.user_timeline(cand['user'], count=20), 
                             'name': cand['name'], 
                             'party': cand['party']})
         all_tweets = []
         for tweet_data in tweets:
             name = tweet_data['name']
             party = tweet_data['party']
             for tweet in tweet_data['tweets']:
                 all_tweets.append( {'Name': name,
                                     'Tweet': tweet.text, 
                                     'Favorites': tweet.favorite_count, 
                                     'Retweets': tweet.retweet_count} )
         dfs = pd.DataFrame(all_tweets)
         sentiments = [sentiment(tweet) for tweet in dfs['Tweet']]
         dfs['Polarity'] = [sent[0] for sent in sentiments]
         dfs['Subjectivity'] = [sent[1] for sent in sentiments]
         modal = [modality(Sentence(parse(tweet, lemmata=True))) for tweet in dfs['Tweet']]
         dfs['Certainty'] = modal
         today = date.strftime(datetime.datetime.now(), format='%m/%d/%Y, %H:%M')
         now = datetime.datetime.now()
         self.data_cache = dfs
         self.today_cache = today
         self.now_cache = now
     return self.data_cache
Example #20
def basicExtract(statement):

  #s = Sentence(parse(statement, relations=True, lemmata=True, light=True))
  #p = Pattern.fromstring('(DT) (RB) (JJ) NN+')
  s = Sentence(parse(statement, lemmata=True))
  m = search("There be DT {JJ? NN}", s)
  return m
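A small sketch of the curly-brace group syntax used in the match string above; the sentence is made up for illustration, and group(1) returns only the words captured inside the braces.

from pattern.en import parse, Sentence
from pattern.search import search

s = Sentence(parse("There is a big cat", lemmata=True))
for match in search("There be DT {JJ? NN}", s):
    print(match.string)            # the full match
    print(match.group(1).string)   # only the braced part, e.g. "big cat"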
Example #21
def get_parts(thetext, punctuation):
    # generate stopwords list & regexes for 2+ periods or 2+ dashes
    stop = stopwords.words('english')
    regex1=re.compile(r"\.{2,}")
    regex2=re.compile(r"\-{2,}")
    thetext=re.sub(regex1, ' ', thetext)
    thetext=re.sub(regex2, ' ', thetext)
    nouns=[]
    descriptives=[]
    for i,sentence in enumerate(parse(thetext, tokenize=True, lemmata=True).split()):
        nouns.append([])
        descriptives.append([])
        for token in sentence:
            if len(token[4]) >0:
                if token[1] in ['JJ', 'JJR', 'JJS']:
                    if token[4] in stop or token[4][0] in punctuation or token[4][-1] in punctuation or len(token[4])==1:
                        continue
                    descriptives[i].append(token[4])
                elif token[1] in ['NN', 'NNS']:
                    if token[4] in stop or token[4][0] in punctuation or token[4][-1] in punctuation or len(token[4])==1:
                        continue
                    nouns[i].append(token[4])
    out=zip(nouns, descriptives)
    nouns2=[]
    descriptives2=[]
    for n,d in out:
        if len(n)!=0 and len(d)!=0:
            nouns2.append(n)
            descriptives2.append(d)
    return nouns2, descriptives2
Example #22
def get_word_types(words):
    """Determine the occurrences of pos types.

    Args:
        words (list): A list of words

    Returns:
        dict: The data and summary results.
    """
    new_arr = []
    for val in words:
        try:
            val = parse(
                val,
                encoding='utf-8',
                tokenize=False,
                light=False,
                tags=True,
                chunks=False,
                relations=False,
                lemmata=False)
            new_arr.append(val)
        except IndexError:
            continue
    return {
        'data': new_arr,
        'summary': None
    }
Example #23
    def lemmatize(content, allowed_tags=re.compile('(NN|VB|JJ|RB)'), light=False):
        """
        This function is only available when the optional 'pattern' package is installed.

        Use the English lemmatizer from `pattern` to extract tokens in
        their base form=lemma, e.g. "are, is, being" -> "be" etc.
        This is a smarter version of stemming. Only consider nouns, verbs, adjectives
        and adverbs by default (=all other lemmas are discarded).

        >>> lemmatize('Hello World! How is it going?! Nonexistentword, 21')
        ['world/NN', 'be/VB', 'go/VB', 'nonexistentword/NN']

        """
        if light:
            import warnings
            warnings.warn("The light flag is no longer supported by pattern.")

        # tokenization in `pattern` is weird; it gets thrown off by non-letters,
        # producing '==relate/VBN' or '**/NN'... try to preprocess the text a little
        # FIXME this throws away all fancy parsing cues, including sentence structure,
        # abbreviations etc.
        content = u' '.join(tokenize(content, lower=True, errors='ignore'))

        parsed = parse(content, lemmata=True, collapse=False)
        result = []
        for sentence in parsed:
            for token, tag, _, _, lemma in sentence:
                if 2 <= len(lemma) <= 15 and not lemma.startswith('_'):
                    if allowed_tags.match(tag):
                        lemma += "/" + tag[:2]
                        result.append(lemma.encode('utf8'))
        return result
Example #24
def gender_feature(text, feature_vect):
    """
    Extract the gender features
    :param text:
    :param feature_vect: contains a bag of words and a list of bigrams
    :return: a dictionary which contains the feature and its computed value
    """
    #sentence length and vocab features
    tokens = word_tokenize(text.lower())
    sentences = sent_tokenize(text.lower())
    words_per_sent = np.asarray([len(word_tokenize(s)) for s in sentences])

    #bag_of_word features
    bag_dict = {}
    for bag in feature_vect[:29]:
        bag_dict[bag] = bag in tokens

    #bigrams features
    bigram_dict = {}
    for big in feature_vect[29:]:
        bigram_dict[big] = big in bigrams(tokens)

    #POS tagging features
    POS_tag = ['ADJ', 'ADV', 'DET', 'NOUN', 'PRT', 'VERB', '.']
    tagged_word = parse(text, chunks=False, tagset='UNIVERSAL').split()
    simplified_tagged_word = [(tag[0], map_tag('en-ptb', 'universal', tag[1])) for s in tagged_word for tag in s]
    freq_POS = nltk.FreqDist(tag[1] for tag in simplified_tagged_word if tag[1] in POS_tag)

    d = dict({'sentence_length_variation': words_per_sent.std()}, **bag_dict)

    return dict(dict(d, **bigram_dict), **freq_POS)
Example #25
def run(o):
	""" STM is shortcuts to the short_term_memory operators """

	STM_PATH = './bin/%s/brain/short_term_memory' % o.o['name']
	
	WM_PLACES_PATH = './bin/%s/brain/working_memory/PLACES' % o.o['name']

	import os, sys;
	mydirs = os.listdir( STM_PATH )

	from pattern.en import parse, pprint, tag

	import shutil

	for word in mydirs:
		
		ignore = [".DS_Store",".gitignore","README.txt"]
		if word in ignore:
			continue

		#print word
		s = parse(word,tags=True)
		print s
		#pprint(s)

		tagged = s.split('/')[1]

		if tagged == "NNP-LOC":
			from_path = "%s/%s" % (STM_PATH,word)
			to_path = "%s/" % WM_PLACES_PATH
			
			os.system( "rsync -avrz %s %s" % (from_path,to_path) )
Example #26
def findVerb(sent):
    result = parse(sent,tokenize = True, tags = True, )
    sen = Sentence(result) 
    vlist = [ word.string for word in sen if word.type.startswith("V") ]
    print vlist
    vlist = [ word.string for word in sen if word.type.startswith("V") ]
    return vlist
Example #27
def get_NNPs(text, counts = False):
    '''Extract proper nouns from text.
    
    :param text: Text to parse
    :type text: str
    :param counts: Return counts for each extracted NNP
    :type counts: bool
    :returns: list -- List containing either only the extracted NNP's or (NNP, count) -pairs sorted by count.
    '''
    parsed_text = parse(text).split()
    nnps = [] 
    for sent in parsed_text:
        for word in sent:
            if word[1].startswith('NNP'):
                nnps.append(word[0])
                
    ctr = Counter(nnps)
    if counts:
        ctri = ctr.items()
        ctri = sorted(ctri, key = lambda x: x[1], reverse = True)
    else:
        ctri = ctr.keys()
    return ctri
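A hypothetical usage example for the helper above (assuming parse and Counter are imported at module level, as the function requires); with counts=True it returns (NNP, count) pairs sorted by frequency.

text = "Alice met Bob in Paris. Alice liked Paris."
print(get_NNPs(text))                # e.g. ["Alice", "Paris", "Bob"] (unordered keys)
print(get_NNPs(text, counts=True))   # e.g. [("Alice", 2), ("Paris", 2), ("Bob", 1)]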
    
    
Example #28
def getLemma(dico):

	# we get every chapter
	for chapter in dico:
		#We create the new part in it
		dico[chapter]["lemma"] = {}

		#Then we get every sentence
		for sentence in dico[chapter]["sentences"]:

			#We parse it
			temp = parse(dico[chapter]["sentences"][sentence], tokenize = True, tags = True, chunks = False, relations = False, lemmata = True, default = 'NN', light = True)

			#We split it
			temp = temp.split()
			for weird in temp:
				for temp2 in temp:
					for triple in temp2:
						key = sentence + " " + triple[0]
						dico[chapter]["lemma"][key] = (triple[1],triple[2])

				#print(each)

			#print(temp)
		#End of sentence loop

	#End of chapter loop

	#We return our data
	return dico
Example #29
 def test_match(self):
     # Assert Constraint-Word matching.
     R = search.Constraint.fromstring
     S = lambda s: Sentence(parse(s, relations=True, lemmata=True))
     W = lambda s, tag=None, index=0: search.Word(None, s, tag, index)
     for constraint, tests in (
       (R("cat|dog"),  [(W("cat"), 1), (W("dog"), 1), (W("fish"), 0)]),
       (R("cat*"),     [(W("cats"), 1)]),
       (R("*cat"),     [(W("tomcat"), 1)]),
       (R("c*t|d*g"),  [(W("cat"), 1), (W("cut"), 1), (W("dog"), 1), (W("dig"), 1)]),
       (R("cats|NN*"), [(W("cats", "NNS"), 1), (W("cats"), 0)]),
       (R("^cat"),     [(W("cat", "NN", index=0), 1),(W("cat", "NN", index=1), 0)]),
       (R("*|!cat"),   [(W("cat"), 0), (W("dog"), 1), (W("fish"), 1)]),
       (R("my cat"),   [(W("cat"), 0)]),
       (R("my cat"),   [(S("my cat").words[1], 1)]),  # "my cat" is an overspecification of "cat"
       (R("my_cat"),   [(S("my cat").words[1], 1)]),
       (R("cat|NP"),   [(S("my cat").words[1], 1)]),
       (R("dog|VP"),   [(S("my dog").words[1], 0)]),
       (R("cat|SBJ"),  [(S("the cat is sleeping").words[1], 1)]),
       (R("dog"),      [(S("MY DOGS").words[1], 1)]), # lemma matches
       (R("dog"),      [(S("MY DOG").words[1], 1)])): # case-insensitive
         for test, b in tests:
             self.assertEqual(constraint.match(test), bool(b))
     # Assert Constraint-Taxa matching.
     t = search.Taxonomy()
     t.append("Tweety", type="bird")
     t.append("Steven", type="bird")
     v = search.Constraint.fromstring("BIRD", taxonomy=t)
     self.assertTrue(v.match(W("bird")))
     self.assertTrue(v.match(S("tweeties")[0]))
     self.assertTrue(v.match(W("Steven")))
     print "pattern.search.Constraint.match()"
Example #30
    def normalize(self, text):
        """Normalizes a given string by:
            * singularizing any plurals.
            * getting the base form of any verb
            * eliminating all capitals"""

        if self.language == self.LANG_EN:
            from pattern.en import parse
        elif self.language == self.LANG_NL:
            from pattern.nl import parse
        elif self.language == self.LANG_DE:
            from pattern.de import parse
        elif self.language == self.LANG_ES:
            from pattern.es import parse
        elif self.language == self.LANG_UNKNOWN:
            # Don't do any parsing.
            return text.lower()
        else:
            raise Exception("Unsupported language: %s" % repr(self.language))

        parsed = parse(text, lemmata=True, chunks=False)
        parsed = [x for y in parsed.split() for x in y]  # Flatten
        normalized = map(lambda w: w[-1], parsed)
        normalized = filter(lambda w: w not in string.punctuation, normalized)
        normalized = ' '.join(normalized)
        return normalized
Example #31
def get_sentiment_bigrams(paths):
    bigrams_list = []

    for path in paths:
        f = open(path, "r+")
        review_text = f.read()
        f.close()

        sentences = review_text.split(".")

        for sentence in sentences:
            tagged_sentence = parse(sentence.lower()).split(" ")
            sentence_bigrams = get_bigrams(tagged_sentence)

            bigrams_list = combine_lists(bigrams_list, sentence_bigrams)

    return bigrams_list
Example #32
    def lemmatize(content,
                  allowed_tags=re.compile('(NN|VB|JJ|RB)'),
                  light=False,
                  stopwords=frozenset()):
        """
        This function is only available when the optional 'pattern' package is installed.

        Use the English lemmatizer from `pattern` to extract tokens in
        their base form=lemma, e.g. "are, is, being" -> "be" etc.
        This is a smarter version of stemming, taking word context into account.

        Only considers nouns, verbs, adjectives and adverbs by default (=all other lemmas are discarded).

        >>> lemmatize('Hello World! How is it going?! Nonexistentword, 21')
        ['world/NN', 'be/VB', 'go/VB', 'nonexistentword/NN']

        >>> lemmatize('The study ranks high.')
        ['study/NN', 'rank/VB', 'high/JJ']

        >>> lemmatize('The ranks study hard.')
        ['rank/NN', 'study/VB', 'hard/RB']

        """
        if light:
            import warnings
            warnings.warn("The light flag is no longer supported by pattern.")

        # tokenization in `pattern` is weird; it gets thrown off by non-letters,
        # producing '==relate/VBN' or '**/NN'... try to preprocess the text a little
        # FIXME this throws away all fancy parsing cues, including sentence structure,
        # abbreviations etc.
        content = u(' ').join(tokenize(content, lower=True, errors='ignore'))

        parsed = parse(content, lemmata=True, collapse=False)
        result = []
        for sentence in parsed:
            for token, tag, _, _, lemma in sentence:
                if 2 <= len(lemma) <= 15 and not lemma.startswith(
                        '_') and lemma not in stopwords:
                    if allowed_tags.match(tag):
                        lemma += "/" + tag[:2]
                        result.append(lemma.encode('utf8'))
        return result


#endif HAS_PATTERN
Example #33
def lemmatize(content):
    """
	Use the English lemmatizer from `pattern` to extract tokens in
	their base form=lemma, e.g. "are, is, being" -> "be" etc.
	This is a smarter version of stemming, taking word context into account.

	Only considers nouns, verbs, adjectives and adverbs by default (=all other lemmas are discarded).
    """
    content = u' '.join(utils.tokenize(content, lower=True, errors='ignore'))
    parsed = parse(content, lemmata=True, collapse=False)
    result = []
    for sentence in parsed:
        for token, tag, _, _, lemma in sentence:
            if 2 <= len(lemma) <= 15 and not lemma.startswith('_'):
                if utils.ALLOWED_TAGS.match(tag):
                    result.append(lemma.encode('utf8'))
    return result
Example #34
 def test_group(self):
     # Assert Match groups.
     s = Sentence(parse("the big black cat eats a tasty fish"))
     m = search.search("DT {JJ+} NN", s)
     self.assertEqual(m[0].group(1).string, "big black")
     self.assertEqual(m[1].group(1).string, "tasty")
     # Assert nested groups (and syntax with additional spaces).
     m = search.search("DT { JJ { JJ { NN }}}", s)
     self.assertEqual(m[0].group(1).string, "big black cat")
     self.assertEqual(m[0].group(2).string, "black cat")
     self.assertEqual(m[0].group(3).string, "cat")
     # Assert chunked groups.
     m = search.search("NP {VP NP}", s)
     v = m[0].group(1, chunked=True)
     self.assertEqual(v[0].string, "eats")
     self.assertEqual(v[1].string, "a tasty fish")
     print("pattern.search.Match.group()")
Example #35
 def test_parse(self):
     # Assert parsed output with Penn Treebank II tags (slash-formatted).
     # 1) "the black cat" is a noun phrase, "on the mat" is a prepositional noun phrase.
     v = en.parser.parse("The black cat sat on the mat.")
     self.assertEqual(v,
         "The/DT/B-NP/O black/JJ/I-NP/O cat/NN/I-NP/O " + \
         "sat/VBD/B-VP/O " + \
         "on/IN/B-PP/B-PNP the/DT/B-NP/I-PNP mat/NN/I-NP/I-PNP ././O/O"
     )
     # 2) "the black cat" is the subject, "a fish" is the object.
     v = en.parser.parse("The black cat is eating a fish.", relations=True)
     self.assertEqual(v,
         "The/DT/B-NP/O/NP-SBJ-1 black/JJ/I-NP/O/NP-SBJ-1 cat/NN/I-NP/O/NP-SBJ-1 " + \
         "is/VBZ/B-VP/O/VP-1 eating/VBG/I-VP/O/VP-1 " + \
         "a/DT/B-NP/O/NP-OBJ-1 fish/NN/I-NP/O/NP-OBJ-1 ././O/O/O"
     )
     # 3) "chasing" and "mice" lemmata are "chase" and "mouse".
     v = en.parser.parse("The black cat is chasing mice.", lemmata=True)
     self.assertEqual(v,
         "The/DT/B-NP/O/the black/JJ/I-NP/O/black cat/NN/I-NP/O/cat " + \
         "is/VBZ/B-VP/O/be chasing/VBG/I-VP/O/chase " + \
         "mice/NNS/B-NP/O/mouse ././O/O/."
     )
     # 4) Assert unicode.
     self.assertTrue(isinstance(v, unicode))
     # 5) Assert unicode for faulty input (bytestring with unicode characters).
     self.assertTrue(isinstance(en.parse("ø ü"), unicode))
     self.assertTrue(isinstance(en.parse("ø ü", tokenize=True,  tags=False, chunks=False), unicode))
     self.assertTrue(isinstance(en.parse("ø ü", tokenize=False, tags=False, chunks=False), unicode))
     self.assertTrue(isinstance(en.parse("o u", encoding="ascii"), unicode))
     # 6) Assert optional parameters (i.e., setting all to False).
     self.assertEqual(en.parse("ø ü.", tokenize=True,  tags=False, chunks=False), u"ø ü .")
     self.assertEqual(en.parse("ø ü.", tokenize=False, tags=False, chunks=False), u"ø ü.")
     # 7) Assert the accuracy of the English tagger.
     i, n = 0, 0
     for sentence in open(os.path.join(PATH, "corpora", "tagged-en-penntreebank.txt")).readlines():
         sentence = sentence.decode("utf-8").strip()
         s1 = [w.split("/") for w in sentence.split(" ")]
         s2 = [[w for w, pos in s1]]
         s2 = en.parse(s2, tokenize=False)
         s2 = [w.split("/") for w in s2.split(" ")]
         for j in range(len(s1)):
             if s1[j][1] == s2[j][1].split("-")[0]:
                 i += 1
             n += 1
     #print float(i) / n
     self.assertTrue(float(i) / n > 0.945)
     print "pattern.en.parse()"
Example #36
 def getData(self, params):
     if self.now_cache is not None:
         if (self.now_cache +
                 datetime.timedelta(minutes=5)) < datetime.datetime.now():
             self.data_cache = None
             self.today_cache = None
             self.now_cache = None
     if self.data_cache is None:
         tweets = []
         for cand in candidates:
             tweets.append({
                 'tweets':
                 api.user_timeline(cand['user'], count=20),
                 'name':
                 cand['name'],
                 'party':
                 cand['party']
             })
         all_tweets = []
         for tweet_data in tweets:
             name = tweet_data['name']
             party = tweet_data['party']
             for tweet in tweet_data['tweets']:
                 all_tweets.append({
                     'Name': name,
                     'Tweet': tweet.text,
                     'Favorites': tweet.favorite_count,
                     'Retweets': tweet.retweet_count
                 })
         dfs = pd.DataFrame(all_tweets)
         sentiments = [sentiment(tweet) for tweet in dfs['Tweet']]
         dfs['Polarity'] = [sent[0] for sent in sentiments]
         dfs['Subjectivity'] = [sent[1] for sent in sentiments]
         modal = [
             modality(Sentence(parse(tweet, lemmata=True)))
             for tweet in dfs['Tweet']
         ]
         dfs['Certainty'] = modal
         today = date.strftime(datetime.datetime.now(),
                               format='%m/%d/%Y, %H:%M')
         now = datetime.datetime.now()
         self.data_cache = dfs
         self.today_cache = today
         self.now_cache = now
     return self.data_cache
Example #37
def corrupt_remove_prep(sent):
    # parse
    P = parse(sent)
    parsed = sum(P.split(), [])

    # chunk
    new = []
    for word in parsed:
        chunktag = word[3]
        if chunktag != 'B-PNP':
            new.append(word[0])

    corr = ' '.join(new)

    if sorted(tokenize(sent)) == sorted(tokenize(corr)):
        return None

    return corr
Example #38
def team_sentiment_analysis(stats):
	for s in stats.sentences:
		this_sentiment = sentiment(s)
		polarity = float("{0:.2f}".format(this_sentiment[0]))
		subjectivity = float("{0:.2f}".format(this_sentiment[1]))
		polarity_10 = float("{0:.1f}".format(this_sentiment[0]))
		subjectivity_10 = float("{0:.1f}".format(this_sentiment[1]))
		stats.polarity_counts[polarity] += 1
		stats.subjectivity_counts[subjectivity] += 1
		stats.polarity_counts_10s[polarity_10] += 1
		stats.subjectivity_counts_10s[subjectivity_10] += 1

		s = Sentence(parse(s, lemmata=True))
		stats.mood_counts[mood(s)] += 1
		rounded_modality = float("{0:.2f}".format(modality(s)))
		rounded_modality_10 = float("{0:.1f}".format(modality(s)))
		stats.modality_counts[rounded_modality] += 1
		stats.modality_counts_10s[rounded_modality_10] += 1
Example #39
    def calculate_phrase_sentiment(self, phrases):
        # print "Rating phrases sentiment..."
        valence_list = []
        arousal_list = []
        for p in phrases:
            pol = sentiment(p)[0]
            sent = parse(p, lemmata=True)
            mod = modality(Sentence(sent))
            print mod
            valence_list.append(10 * pol)
            arousal_list.append(5 * mod)

        valence = max(valence_list)
        arousal = max(arousal_list)

        print "Valence: " + str(valence)
        print "arousal: " + str(arousal)
        return ((valence, arousal))
Example #40
def getBroken(word):
    global duplicateWordForms
    #     print('in get broken. word: ', word)
    ''' returns word parsed into array of caps chars, word root, and POS tag if any'''
    if word.isspace():
        return [word]
    returner = []
    if word in words_to_ignore:
        return getCapsChars(word) + [word.lower()]
    patternParsedList = parse(word, relations=True, lemmata=True).split()[0]
    for patternParsed in patternParsedList:
        word, root, pos = postParse(patternParsed)
        capsChars = []
        wordLower = word.lower()
        wordHasCaps = not word.islower()
        wordLower = word.lower() if wordHasCaps else word
        #         print("wordHasCaps", wordHasCaps)
        #         print(word, root, pos)
        if wordHasCaps:
            capsChars = getCapsChars(word)
#         if pos in unsupportedPoss:
#             returner += capsChars + [wordLower]
        if root == wordLower:
            ''' this means there are no POS tags we need to keep '''
            returner += capsChars + [wordLower]
        else:
            if root in roots_to_ignore or "'" in word or "‘" in word or "’" in word:
                returner += capsChars + [
                    wordLower
                ]  #was, were, am, are -- these words get tokenized/untokenized unreliably. :(
            else:
                useParsed = True
                key = root + pos
                if key in m_rootPos_word:
                    value = m_rootPos_word[key]
                    if value != wordLower:
                        duplicateWordForms += [(key, value, wordLower)]
                        useParsed = False
                if useParsed:
                    m_rootPos_word[root + pos] = wordLower
                    returner += capsChars + [root, pos]
                else:
                    returner += capsChars + [wordLower]
    return returner
Example #41
 def test_sentiment(self):
     # Assert < 0 for negative adjectives and > 0 for positive adjectives.
     self.assertTrue(en.sentiment("wonderful")[0] > 0)
     self.assertTrue(en.sentiment("horrible")[0] < 0)
     self.assertTrue(
         en.sentiment(en.wordnet.synsets("horrible", pos="JJ")[0])[0] < 0)
     self.assertTrue(
         en.sentiment(en.Text(en.parse("A bad book. Really horrible.")))[0]
         < 0)
     # Assert that :) and :( are recognized.
     self.assertTrue(en.sentiment(":)")[0] > 0)
     self.assertTrue(en.sentiment(":(")[0] < 0)
     # Assert the accuracy of the sentiment analysis (for the positive class).
     # Given are the scores for Pang & Lee's polarity dataset v2.0:
     # http://www.cs.cornell.edu/people/pabo/movie-review-data/
     # The baseline should increase (not decrease) when the algorithm is modified.
     from pattern.db import Datasheet
     from pattern.metrics import test
     reviews = []
     for score, review in Datasheet.load(
             os.path.join(PATH, "corpora", "polarity-en-pang&lee1.csv")):
         reviews.append((review, int(score) > 0))
     from time import time
     t = time()
     A, P, R, F = test(lambda review: en.positive(review), reviews)
     #print A, P, R, F
     self.assertTrue(A > 0.753)
     self.assertTrue(P > 0.768)
     self.assertTrue(R > 0.725)
     self.assertTrue(F > 0.746)
     # Assert the accuracy of the sentiment analysis on short text (for the positive class).
     # Given are the scores for Pang & Lee's sentence polarity dataset v1.0:
     # http://www.cs.cornell.edu/people/pabo/movie-review-data/
     reviews = []
     for score, review in Datasheet.load(
             os.path.join(PATH, "corpora", "polarity-en-pang&lee2.csv")):
         reviews.append((review, int(score) > 0))
     A, P, R, F = test(lambda review: en.positive(review), reviews)
     #print A, P, R, F
     self.assertTrue(A > 0.654)
     self.assertTrue(P > 0.660)
     self.assertTrue(R > 0.636)
     self.assertTrue(F > 0.648)
     print "pattern.en.sentiment()"
Example #42
def mood(sentence, **kwargs):
    """Returns IMPERATIVE (command), CONDITIONAL (possibility), SUBJUNCTIVE
    (wish) or INDICATIVE (fact)."""
    if isinstance(sentence, basestring):
        try:
            # A Sentence is expected but a string given.
            # Attempt to parse the string on-the-fly.
            from pattern.en import parse, Sentence
            sentence = Sentence(parse(sentence))
        except ImportError:
            pass
    if imperative(sentence, **kwargs):
        return IMPERATIVE
    if conditional(sentence, **kwargs):
        return CONDITIONAL
    if subjunctive(sentence, **kwargs):
        return SUBJUNCTIVE
    else:
        return INDICATIVE
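A minimal sketch of calling mood() directly with a plain string, which works because of the on-the-fly parse shown above; the expected labels are the ones asserted in the earlier test_mood example.

from pattern.en import mood

print(mood("Do your homework!"))                  # IMPERATIVE
print(mood("I wouldn't do that if I were you."))  # SUBJUNCTIVE
print(mood("The weather is nice today."))         # INDICATIVE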
Example #43
def getPosTag(in_filename, out_filename):    
    # open files for input/output: fhnd[0] - in, fhnd[1] - out
    fhnd = open_files(in_filename, out_filename)
    
    tagged_line = ""
    
    from pattern.en import parse
    
    for line in fhnd[0]:
        word_list = line.split() #line to list
        for word in word_list:
            tagged_line += " " + parse(word, relations=False, lemmata=False)
        
        fhnd[1].write(tagged_line + "\n")
        tagged_line = ""
        
        
      
    return 0
Example #44
def pos_counts(sentence, ngram=4):
	counts = {}
	sentence = " ".join(sentence.strip().split())
	words = sentence.split();
	if len(words) < ngram:
		return counts

	pos = parse(sentence, chunks=False).split()[0]
	pos = filter(lambda x: re.match('^[\w-]+$', x[1]) is not None, pos)
	pos = [x[1] for x in pos]

	for w in range(len(pos) - ngram + 1):
		postag = "|".join(pos[w:w + ngram])
		if postag in counts:
			counts[postag] = counts[postag] + 1
		else:
			counts[postag] = 1

	return counts
Example #45
 def test_search(self):
     # Assert one match containing all words.
     v = search.Pattern.fromstring("*+")
     v = v.search("one two three")
     self.assertEqual(v[0].string, "one two three")
     # Assert one match for each word.
     v = search.Pattern.fromstring("*")
     v = v.search("one two three")
     self.assertEqual(v[0].string, "one")
     self.assertEqual(v[1].string, "two")
     self.assertEqual(v[2].string, "three")
     # Assert all variations are matched (sentence starts with a NN* which must be caught).
     v = search.Pattern.fromstring("(DT) JJ?+ NN*")
     v = v.search(Sentence(parse("dogs, black cats and a big white rabbit")))
     self.assertEqual(v[0].string, "dogs")
     self.assertEqual(v[1].string, "black cats")
     self.assertEqual(v[2].string, "a big white rabbit")
     v = search.Pattern.fromstring("NN*")
     print "pattern.search.Pattern.search()"
Example #46
    def lemmatize(content,
                  light=False,
                  allowed_tags=re.compile('(NN|VB|JJ|RB)')):
        """
        This function is only available when the optional 'pattern' package is installed.

        Use the English lemmatizer from `pattern` to extract tokens in
        their base form=lemma, e.g. "are, is, being" -> "be" etc.
        This is a smarter version of stemming. Only consider nouns, verbs, adjectives
        and adverbs by default (=all other lemmas are discarded).

        >>> lemmatize('Hello World! How is it going?! Nonexistentword, 21')
        ['world/NN', 'be/VB', 'go/VB', 'nonexistentword/NN']

        From http://www.clips.ua.ac.be/pages/pattern-en#parser :

            The parser is built on a Brill lexicon of tagged words and rules to
            improve the tags context-wise. With light=False, it uses Brill's contextual
            rules. With light=True it uses Jason Wiener's simpler ruleset. This
            ruleset is 5-10x faster but also 25% less accurate.

        """
        # tokenization in `pattern` is weird; it gets thrown off by non-letters,
        # producing '==relate/VBN' or '**/NN'... try to preprocess the text a little
        # FIXME this throws away all fancy parsing cues, including sentence structure,
        # abbreviations etc.
        content = u' '.join(tokenize(content, lower=True, errors='ignore'))

        # use simpler, modified pattern.text.en.text.parser.parse that doesn't
        # collapse the output at the end: https://github.com/piskvorky/pattern
        parsed = parse(content, lemmata=True, collapse=False, light=light)
        result = []
        for sentence in parsed:
            for token, tag, _, _, lemma in sentence:
                if 2 <= len(lemma) <= 15 and not lemma.startswith('_'):
                    if allowed_tags.match(tag):
                        lemma += "/" + tag[:2]
                        result.append(lemma.encode('utf8'))
        return result


#endif HAS_PATTERN
Example #47
    def add_keywords(self, phrase):

        sent = en.Sentence(en.parse(phrase))
        nouns = search('NN', sent)
        self.blackboard.pool.nouns.update(
            set(Word(en.singularize(n[0].string)) for n in nouns))
        adjs = search('JJ', sent)
        self.blackboard.pool.adjectives.update(
            set(Word(en.lemma(a[0].string)) for a in adjs))

        try:
            nps = search('NP', sent)
            for np in nps:
                self.blackboard.pool.epithets.update({
                    Word(en.singularize(w.string), "NN"):
                    [Word(jj.string, "JJ") for jj in np if "JJ" in jj.tag]
                    for w in np if "NN" in w.tag
                })
        except IndexError:
            pass
Example #48
def testTokenize():
    s = "I eat pizza with a fork."

    s = "B.S. in Computer Science, a related degree or its equivalent "
    s = "What's this? This is a book."

    s = "Bachelor's degree in Computer Science or equivalent"
    s = "Bachelor’s degree in Computer Science or equivalent"

    s = parse(
        s,
        tokenize=True,  # Tokenize the input, i.e. split punctuation from words.
        tags=False,  # Find part-of-speech tags.
        chunks=
        False,  # Find chunk tags, e.g. "the black cat" = NP = noun phrase.
        relations=False,  # Find relations between chunks.
        lemmata=False,  # Find word lemmata.
        light=False)

    print s.split()
Example #49
def sentiment(content):
    from pattern.en import parse, split, wordnet
    wordnet.sentiment.load()
    relevant_types = [
        'JJ',
        'VB',
        'VBD',
        'VBN',
        'VBG',
        'RB',
    ]
    score = 0
    sentences = split(parse(content, lemmata=True))
    for sentence in sentences:
        for word in sentence.words:
            if word.type in relevant_types:
                pos, neg, obj = wordnet.sentiment[word.lemma]
                score = score + ((pos - neg) * (1 - obj))
    #return 1 if score >= 0 else -1
    return score
Example #50
def extract(statement):

  s = Sentence(parse(statement, lemmata=True))

  '''c1 = Constraint.fromstring("There be DT")
  c2 = Constraint.fromstring("NN+")
  c3 = Constraint.fromstring("(DT)")
  c4 = Constraint.fromstring("(RB) (JJ) NNP+")
  c5 = Constraint.fromstring("(call) (DT)")
  c6 = Constraint.fromstring("(RB) (JJ) (NNPS|NNP)+")
  p = Pattern(sequence=[c1, c2, c3, c4, c5, c6]) 
 
  match = p.search(s)
   '''
  s = find_entities(s)
   
   # not sure about this "be" thing - happy to match plural (is/are) but not sure about past tense ...
  match = search(MATCH_STRING, s)
  #raise Exception(match)
  return s, match
Example #51
def tagSentence(sent):
    result = parse(
        sent,
        tokenize=True,  # Tokenize the input, i.e. split punctuation from words.
        tags=True,  # Find part-of-speech tags.
        chunks=
        False,  # Find chunk tags, e.g. "the black cat" = NP = noun phrase.
        relations=False,  # Find relations between chunks.
        lemmata=False,  # Find word lemmata.
        light=False)
    #    pprint(result)

    array = str(result).split(" ")
    tokens = []
    posTags = []
    for a in array:
        b = a.split("/")
        tokens.append(b[0])
        posTags.append(b[1])

#  print tokens
#  print posTags
    return (tokens, posTags)
Example #52
    def lemmatize(content):
        """
        Use the English lemmatizer from the `pattern` package to extract tokens in
        their base form (lemmas: "are, is, being"->"be" etc.).
        This is a smarter version of stemming.
        """
        # tokenization in `pattern` is weird; it gets thrown off by non-letters,
        # producing '==relate/VBN' or '**/NN'... try to preprocess the text a little
        # FIXME this throws away all fancy parsing cues, including sentence structure,
        # abbreviations etc.
        content = u' '.join(tokenize(content, lower=True, errors='ignore'))

        # use simpler, modified pattern.text.en.text.parser.parse that doesn't
        # collapse the output at the end: https://github.com/piskvorky/pattern
        parsed = parse(content, lemmata=True, collapse=False)
        result = []
        for sentence in parsed:
            for token, tag, _, _, lemma in sentence:
                if 2 <= len(lemma) <= 15 and not lemma.startswith('_'):
                    if ALLOWED_TAGS.match(tag):
                        lemma += "/" + tag[:2]
                        result.append(lemma.encode('utf8'))
        return result
Example #53
def convert_pattern_format(text):
    """
    Text is parsed through pattern's parsing function into a standardized format.
    """
    parsed_text = []
    # parse text via Pattern's parser
    pattern_parsed_text = Text(parse(text, relations=True, lemmata=True))
    for sentence in pattern_parsed_text:
        s = Sentence()
        s.string = remove_blanks(sentence.string)
        for word in sentence:
            # Pattern's tags for each word in the sentence are stored in a new Word-object
            w = Word()
            w.string = word.string
            w.lemma = word.lemma
            w.index = word.index
            w.tag = word.type
            w.entity = ""
            # each word is appended to a Sentence-object
            s.words.append(w)
        # each Sentence-object is appended to an array
        parsed_text.append(s)
    return parsed_text
Example #54
def test_parse():
    from pattern.en import parse, Text, Sentence
    from pattern.en import pprint 
    
   
    sent = "Experience with mobile application development a plus: iPhone/iPad, Android, or Blackberry."
    sent = "3+ years web software development experience."
    sent = "Bachelor's in Computer Science, Information Systems or a related study, is required."
    sent = 'I ate pizza.'
    sent = "Bachelor's in Computer Science is required."
    sent = "Bachelor 's Degree or 4 years equivalent professional experience ."
    sent = "A Master ’ s Degree or equivalent in Electrical Engineering , Computer Science , or other technical/engineering field with related programming experience and applicable work experience is required ."
    sent = "A Master's Degree or equivalent in Electrical Engineering , Computer Science , or other technical/engineering field with related programming experience and applicable work experience is required ."
    sent = "BS degree ( BSEE or BSCS strongly preferred , MSCS a plus ) and/or the equivalent in training and experience ."      
    
    result = parse(sent,
         tokenize = True,  # Tokenize the input, i.e. split punctuation from words.
             tags = True,  # Find part-of-speech tags.
           chunks = True,  # Find chunk tags, e.g. "the black cat" = NP = noun phrase.
        relations = True,  # Find relations between chunks.
          lemmata = True,  # Find word lemmata.
            light = True)
    pprint(result) 
Example #55
def getLemma(dico):

    # we get every chapter
    for chapter in dico:
        #We create the new part in it
        dico[chapter]["lemma"] = {}

        #Then we get every sentence
        for sentence in dico[chapter]["sentences"]:

            #We parse it
            temp = parse(dico[chapter]["sentences"][sentence],
                         tokenize=True,
                         tags=True,
                         chunks=False,
                         relations=False,
                         lemmata=True,
                         default='NN',
                         light=True)

            #We split it
            temp = temp.split()
            for weird in temp:
                for temp2 in temp:
                    for triple in temp2:
                        key = sentence + " " + triple[0]
                        dico[chapter]["lemma"][key] = (triple[1], triple[2])

                #print(each)

            #print(temp)
        #End of sentence loop

    #End of chapter loop

    #We return our data
    return dico
Example #56
 def test_parse(self):
     # Assert parsed output with Penn Treebank II tags (slash-formatted).
     # 1) "the black cat" is a noun phrase, "on the mat" is a prepositional noun phrase.
     v = en.parser.parse("The black cat sat on the mat.")
     self.assertEqual(v,
         "The/DT/B-NP/O black/JJ/I-NP/O cat/NN/I-NP/O " + \
         "sat/VBD/B-VP/O " + \
         "on/IN/B-PP/B-PNP the/DT/B-NP/I-PNP mat/NN/I-NP/I-PNP ././O/O"
     )
     # 2) "the black cat" is the subject, "a fish" is the object.
     v = en.parser.parse("The black cat is eating a fish.", relations=True)
     self.assertEqual(v,
         "The/DT/B-NP/O/NP-SBJ-1 black/JJ/I-NP/O/NP-SBJ-1 cat/NN/I-NP/O/NP-SBJ-1 " + \
         "is/VBZ/B-VP/O/VP-1 eating/VBG/I-VP/O/VP-1 " + \
         "a/DT/B-NP/O/NP-OBJ-1 fish/NN/I-NP/O/NP-OBJ-1 ././O/O/O"
     )
     # 3) "chasing" and "mice" lemmata are "chase" and "mouse".
     v = en.parser.parse("The black cat is chasing mice.", lemmata=True)
     self.assertEqual(v,
         "The/DT/B-NP/O/the black/JJ/I-NP/O/black cat/NN/I-NP/O/cat " + \
         "is/VBZ/B-VP/O/be chasing/VBG/I-VP/O/chase " + \
         "mice/NNS/B-NP/O/mouse ././O/O/."
     )
     # 4) Assert unicode.
     self.assertTrue(isinstance(v, unicode))
     # 5) Assert unicode for faulty input (bytestring with unicode characters).
     self.assertTrue(isinstance(en.parse("ø ü"), unicode))
     self.assertTrue(
         isinstance(
             en.parse("ø ü", tokenize=True, tags=False, chunks=False),
             unicode))
     self.assertTrue(
         isinstance(
             en.parse("ø ü", tokenize=False, tags=False, chunks=False),
             unicode))
     self.assertTrue(isinstance(en.parse("o u", encoding="ascii"), unicode))
     # 6) Assert optional parameters (i.e., setting all to False).
     self.assertEqual(
         en.parse("ø ü.", tokenize=True, tags=False, chunks=False),
         u"ø ü .")
     self.assertEqual(
         en.parse("ø ü.", tokenize=False, tags=False, chunks=False),
         u"ø ü.")
     print "pattern.en.parser.parse()"
Example #57
def get_word_types(words):
    """Determine the occurrences of pos types.

    Args:
        words (list): A list of words

    Returns:
        dict: The data and summary results.
    """
    new_arr = []
    for val in words:
        try:
            val = parse(val,
                        encoding='utf-8',
                        tokenize=False,
                        light=False,
                        tags=True,
                        chunks=False,
                        relations=False,
                        lemmata=False)
            new_arr.append(val)
        except IndexError:
            continue
    return {'data': new_arr, 'summary': None}
Example #58
    def process(self, message):
        # print pattern_en.suggest(message) -- suggestions
        if message == ">!train":
            self.train()
            return "It is nice to learn new stuff."
        if message == ">!forget":
            memory.clear()
            return "I am reborn. So much free space :) maybe you will use files to store memory and not RAM..."
        if message == ">!load_page":
            if sessionId not in memory:
                response = "Hello! My name is Chad and I am passionate about music."
                response += "We can share our experiences and maybe we can get along."
                response += "Would you mind telling me your name first?"
                expect[sessionId] = "name"
                memory[sessionId] = dict()
            else:
                response = "Welcome back!"
                search.search("new songs")
                with open('results.json') as data_file:
                    data = json.load(data_file)
                    for i in range(10):
                        if 'musicrecording' in data['items'][i]['pagemap']:
                            mr = data['items'][i]['pagemap']['musicrecording']
                            which = random.randint(0, len(mr) - 1)
                            if 'name' not in mr[which]:
                                response += " Did you know that " + mr[which][
                                    'byartist'] + " has released a new song?"
                            else:
                                response += " You can check out this cool song, " + mr[which]['name'] + ", by " + \
                                            mr[which]['byartist']
            return response

        s = nlp.get_sentences(message)

        doc = spacy_nlp(message)
        for w in doc:
            print "(", w, w.dep_, w.pos_, w.head, ")"

        aiml_sent_type = []
        aiml_responses = []
        memory_responses = []
        sentence_types = []
        emotions = []

        for sentence in s:
            sentence_type = self.instant_classifier.classify(
                dialogue_act_features(sentence))

            sentence_types.append(sentence_type)

            polarity, subjective = pattern_en.sentiment(sentence)
            sent = pattern_en.parse(sentence, lemmata=True)
            sent = pattern_en.Sentence(sent)
            modality = pattern_en.modality(sent)
            mood = pattern_en.mood(sent)

            # Check thresholds from most extreme to least so every branch is reachable.
            if polarity > 0.8:
                emotions.append("SUPER HAPPY")
            elif polarity > 0.4:
                emotions.append("COOL")
            elif polarity > 0.3:
                emotions.append("GOOD SURPRISE")
            elif polarity < -0.7:
                emotions.append("ANGER")
            elif polarity < -0.4:
                emotions.append("FEAR")
            elif polarity < -0.1:
                emotions.append("SAD")
            else:
                emotions.append("NEUTER")

            print sentence_type, polarity, subjective, modality, mood

            if sentence_type not in ["whQuestion", "ynQuestion"]:
                try:
                    aiml_sent_type_res = self.kernel.respond(
                        sentence_type, sessionId)
                except:
                    aiml_sent_type_res = ""
                aiml_sent_type.append(aiml_sent_type_res)

            verbs_subj = set()
            sentence = sentence[0].upper() + sentence[1:]
            doc = spacy_nlp(sentence)
            for possible_subject in doc:
                if (possible_subject.dep == nsubj or possible_subject.dep
                        == nsubjpass) and possible_subject.head.pos == VERB:
                    verbs_subj.add((possible_subject, possible_subject.head))

            try:
                aiml_response = self.kernel.respond(sentence, sessionId)
            except:
                aiml_response = ""
            aiml_responses.append(aiml_response)

            # MEMORY MODULE
            memory_msg = ""
            if sentence_type == "Statement":
                # insert into memory
                for i in verbs_subj:
                    subjs = []
                    subjects = [i[0]]
                    for tok in i[0].children:
                        if tok.dep == conj:
                            subjects.append(tok)

                    for subj in subjects:
                        predec = ""
                        for tok in subj.children:
                            if tok.dep_ == "poss" or tok.dep == amod:
                                predec += tok.lower_
                        if len(predec) > 0:
                            subjs.append(predec + " " + subj.lower_)
                        else:
                            subjs.append(subj.lower_)

                    vb = i[1].lower_
                    if vb not in memory[sessionId]:
                        memory[sessionId][vb] = dict()
                    for subj in subjs:
                        for c in i[1].children:
                            if c.dep in [prep]:
                                memory[sessionId][vb][subj] = c.lower_ + " "
                                for c_prep in c.children:
                                    if c_prep.dep in [dobj, pobj, attr]:
                                        memory[sessionId][vb][
                                            subj] += c_prep.text
                                        memory_responses.append(
                                            self.kernel.respond(
                                                "memorate", sessionId))
                            elif c.dep in [dobj, pobj, attr]:
                                memory[sessionId][vb][subj] = c.text
                                memory_responses.append(
                                    self.kernel.respond("memorate", sessionId))
            elif sentence_type == "whQuestion":
                for i in verbs_subj:
                    subjs = []
                    subjects = [i[0]]
                    for tok in i[0].children:
                        if tok.dep == conj:
                            subjects.append(tok)

                    for subj in subjects:
                        predec = ""
                        for tok in subj.children:
                            if tok.dep_ == "poss" or tok.dep == amod:
                                predec += tok.lower_
                        if len(predec) > 0:
                            subjs.append(predec + " " + subj.lower_)
                        else:
                            subjs.append(subj.lower_)

                    max_similarity = 0
                    verb = i[1].lower_
                    for j in memory[sessionId]:
                        p_word = spacy_nlp(j)
                        similarity = i[1].similarity(p_word[0])
                        if similarity > max_similarity:
                            max_similarity = similarity
                            verb = j
                    if max_similarity > 0.5 and verb in memory[sessionId]:
                        num_subjs = len(subjs)
                        memory_msg = ""
                        for subj in subjs:
                            if subj in memory[sessionId][verb]:
                                toks = nlp.tokenize_text(subj)
                                memory_msg = ""
                                for t in toks:
                                    if t in first_person:
                                        memory_msg += pron_translate[t] + " "
                                    else:
                                        memory_msg += t + " "
                                num_subjs -= 1
                                if num_subjs > 2:
                                    memory_msg += ", "
                                elif num_subjs == 1:
                                    memory_msg += "and "
                        if len(memory_msg) > 0:
                            memory_msg += verb + " "
                            if num_subjs != len(subjs):
                                memory_msg += memory[sessionId][verb][
                                    subjs[-1]] + "."
            memory_responses.append(memory_msg)

        arr_response = []

        for i in aiml_sent_type:
            if len(i) > 0:
                arr_response.append(i)

        for i in aiml_responses:
            if len(i) > 0:
                arr_response.append(i)

        for i in memory_responses:
            if len(i) > 0:
                arr_response.append(i)

        if len(arr_response) == 0:
            data = search.search(message)
            snip = data['items'][0]['snippet']
            sents = nlp.get_sentences(snip)
            arr_response.append(sents[0])

        response = ""

        for i in emotions:
            try:
                emoi = self.kernel.respond(i, sessionId)
            except:
                emoi = None
            if emoi is not None:
                if random.randint(0, 100) < 50:
                    response += " " + emoi + "."
                    break

        for res in arr_response:
            if len(res) > 1:
                response += res + " "

        # generic response, if no response
        restoks = nlp.tokenize_text(response)
        if len(restoks) == 0:
            idx = random.randint(0, len(sentence_types) - 1)
            try:
                aiml_response = self.kernel.respond(sentence_types[idx],
                                                    sessionId)
            except:
                aiml_response = ""
            response += aiml_response

        # polarity, subjective = pattern_en.sentiment(response)
        # sent = pattern_en.parse(sentence, lemmata=True)
        # sent = pattern_en.Sentence(sent)
        # modality = pattern_en.modality(sent)
        # mood = pattern_en.mood(sent)
        # sentence_type = self.instant_classifier.classify(dialogue_act_features(response))
        # print response, polarity, subjective, modality, mood

        return response
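
For reference, a minimal standalone sketch (the function name and sample sentence are illustrative) of the per-sentence pattern.en analysis that process() performs before the AIML and memory lookups:

from pattern import en as pattern_en

def analyze(sentence):
    # Polarity/subjectivity of the raw text, plus modality and mood of the parsed sentence.
    polarity, subjectivity = pattern_en.sentiment(sentence)
    parsed = pattern_en.Sentence(pattern_en.parse(sentence, lemmata=True))
    return polarity, subjectivity, pattern_en.modality(parsed), pattern_en.mood(parsed)

print(analyze("It will surely stop raining soon."))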
Ejemplo n.º 59
0
# An imperative sentence does not use modal verbs such as "could" and "would":
# "You could eat your dinner!" is not a command but a bubbly suggestion.

# We can create a pattern that scans for infinitive verbs (VB),
# and use "!" to exclude certain words:
# "!could|!would|!should|!to+ VB" = infinitive not preceded by modal or "to".
# This works fine except in one case: if the sentence starts with a verb.
# So we need a second rule "^VB" to catch this.
# Note that the example below contains a third rule: "^do|VB*".
# This catches all sentences that start with a "do" verb, regardless of whether
# it is tagged as infinitive, because the parser sometimes tags an infinitive
# "do" incorrectly.


def imperative(sentence):
    for p in ("!could|!would|!should|!to+ VB", "^VB", "^do|VB*"):
        m = match(p, sentence)
        # Exclude questions: the sentence must end in "." or "!".
        if m and sentence.string.endswith((".", "!")):
            return True
    return False


for s in ("Just stop it!", "Look out!", "Do your homework!",
          "You should do your homework.", "Could you stop it.",
          "To be, or not to be."):
    s = parse(s)
    s = Sentence(s)
    print(s)
    print(imperative(s))
    print("")
Ejemplo n.º 60
0
 def test_chunk_modifiers(self):
     # Assert list of nearby adjectives and adverbs with no role, for VP.
     v = en.Sentence(en.parse("Perhaps you should go."))
     self.assertEqual(v.chunk[2].modifiers, [v.chunk[0]]) # should <=> perhaps
     print("pattern.en.Chunk.modifiers")