def test_findVerb():
    from pattern.en import parse, Text, Sentence
    from pattern.en import pprint
    sent = "Bachelor's in Computer Science, Information Systems or a related study, is required."
    sent = 'I ate pizza.'
    sent = "Bachelor's in Computer Science is required."
    sent = "Bachelor 's Degree or 4 years equivalent professional experience ."
    sent = "A Master ’ s Degree or equivalent in Electrical Engineering , Computer Science , or other technical/engineering field with related programming experience and applicable work experience is required ."
    sent = "A Master's Degree or equivalent in Electrical Engineering , Computer Science , or other technical/engineering field with related programming experience and applicable work experience is required ."
    sent = "Bachelor ’ s degree in Computer Science or equivalent"
    sent = "Bachelor ' s degree in Computer Science or equivalent"
    result = parse(sent,
        tokenize = True,  # Tokenize the input, i.e. split punctuation from words.
        tags = True,      # Find part-of-speech tags.
    )
    pprint(result)
    # print type(result)
    # print result
    sen = Sentence(result)
    # for word in sen:
    #     print word, word.type
    vlist = [word.string for word in sen if word.type.startswith("V")]
    print vlist
def test_sentence():
    from pattern.en import parse, Text, Sentence
    from pattern.en import pprint
    sent1 = "BS degree ( BSEE or BSCS strongly preferred , MSCS a plus ) and/or the equivalent in training and experience ."
    sent2 = "Bachelor's degree in Computer Science is required."
    sent3 = "He created the robot and broke it after making it."
    sent4 = "A Computer Science or related degree "
    sent5 = "bachelors degree in Computer Science or Information Systems and/or related experience required"
    result = parse(sent5,
        tokenize = True,   # Tokenize the input, i.e. split punctuation from words.
        tags = True,       # Find part-of-speech tags.
        chunks = True,     # Find chunk tags, e.g. "the black cat" = NP = noun phrase.
        relations = True,  # Find relations between chunks.
        lemmata = True,    # Find word lemmata.
        light = True)
    pprint(result)
    sen = Sentence(result)
    # print type(sen)
    print sen
    for chunk in sen.chunks:
        print chunk.type, [(w.string, w.type) for w in chunk.words]
def run(o):
    # https://github.com/clips/pattern/blob/master/examples/03-en/03-parse.py
    import os, sys  # sys.path.insert(0, os.path.join("..", ".."))
    from pattern.en import parse, pprint, tag
    # The en module contains a fast regular expressions-based parser.
    # A parser identifies words in a sentence, word part-of-speech tags (e.g. noun, verb)
    # and groups of words that belong together (e.g. noun phrases).
    # Common part-of-speech tags: NN (noun), VB (verb), JJ (adjective), PP (preposition).
    # A tag can have a suffix, for example NNS (plural noun) or VBG (gerund verb).
    # Overview of tags: http://www.clips.ua.ac.be/pages/mbsp-tags
    s = "I eat pizza with a fork. one more test 1 Africa James Bob England Surrey Essex"
    s = parse(s,
        tokenize = True,   # Tokenize the input, i.e. split punctuation from words.
        tags = True,       # Find part-of-speech tags.
        chunks = True,     # Find chunk tags, e.g. "the black cat" = NP = noun phrase.
        relations = True,  # Find relations between chunks.
        lemmata = True,    # Find word lemmata.
        light = False)
    # The light parameter determines how unknown words are handled.
    # By default, unknown words are tagged NN and then improved with a set of rules.
    # light=False uses Brill's lexical and contextual rules,
    # light=True uses a set of custom rules that is less accurate but faster (5x-10x).
    # The output is a string with each sentence on a new line.
    # Words in a sentence have been annotated with tags,
    # for example: fork/NN/I-NP/I-PNP
    # NN = noun, NP = part of a noun phrase, PNP = part of a prepositional phrase.
    print s
    print
    # Prettier output can be obtained with the pprint() command:
    pprint(s)
    print
    # The string's split() method will (unless a split character is given)
    # split into a list of sentences, where each sentence is a list of words
    # and each word is a list with the word + its tags.
    print s.split()
    print
    # The tag() command returns a list of (word, POS-tag)-tuples.
    # With light=True, this is the fastest and simplest way to get an idea
    # of a sentence's constituents:
    s = "I eat pizza with a fork. one more test 1 Africa James Bob England Surrey Essex"
    s = tag(s)
    print s
    for word, tag in s:
        if tag == "NN":  # Find all nouns in the input string.
            print word
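# A minimal sketch of walking the nested structure returned by split() above:
# a parse is a list of sentences, each sentence a list of tokens, and each token
# a list of [word, POS, chunk, ...] tags (assuming the default parse options).
from pattern.en import parse
for sentence in parse("I eat pizza with a fork.").split():
    for token in sentence:
        print(token[0] + "/" + token[1])  # word and its part-of-speech tag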
def grammatical_tagging():
    from pattern.en import tag, parse, parsetree, pprint  # imports needed by the calls below
    sentence = "The white house is at the top of the hill"
    sentences = "The white house is at the top of the hill. My house is not"
    print( tag(sentence) )  # The result is a list of (word, tag) tuples labeling each word (verb, noun, etc.)
    print(parse(sentence))
    #pprint(parse(sentence))
    pprint(parsetree(sentences))
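# A minimal sketch of walking the parsetree() result from above: a Text is a
# list of Sentence objects, each exposing its chunks and their words.
from pattern.en import parsetree
for sent in parsetree("The white house is at the top of the hill. My house is not"):
    for chunk in sent.chunks:
        print(chunk.type, [w.string for w in chunk.words])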
def gather_question_bits(sentence):
    # gather_bits_by_id() and gather_bits_by_role() are helpers defined elsewhere.
    question_bits = []
    a_parse = parse(sentence, relations=True)
    print a_parse
    pprint(a_parse)
    all_bits = a_parse.split(' ')
    ids = gather_bits_by_id(all_bits)
    for id in ids:
        roles = gather_bits_by_role(ids[id])
        if 'SBJ' in roles and 'VP' in roles and 'OBJ' in roles:
            question_bits.append(roles)
    return question_bits
def process(self):
    text = self._regex.replace(self._text)
    pt = english.parsetree(text, lemmata=True)
    processed = []
    vm = NateVm()
    english.pprint(pt)
    for sentence in pt:
        words = sentence
        pos = 0
        last = len(words)
        while pos < last:
            for pattern, code in self._logic:
                matched = pattern.match(words, start=pos)
                if matched:
                    vm.run(matched, code)
                    pos = matched.stop
                    processed += vm.get()
                    break
            else:
                processed.append(words[pos])
                pos += 1
    self.rebuild_text(processed)
def test_parse():
    from pattern.en import parse, Text, Sentence
    from pattern.en import pprint
    sent = "Experience with mobile application development a plus: iPhone/iPad, Android, or Blackberry."
    sent = "3+ years web software development experience."
    sent = "Bachelor's in Computer Science, Information Systems or a related study, is required."
    sent = 'I ate pizza.'
    sent = "Bachelor's in Computer Science is required."
    sent = "Bachelor 's Degree or 4 years equivalent professional experience ."
    sent = "A Master ’ s Degree or equivalent in Electrical Engineering , Computer Science , or other technical/engineering field with related programming experience and applicable work experience is required ."
    sent = "A Master's Degree or equivalent in Electrical Engineering , Computer Science , or other technical/engineering field with related programming experience and applicable work experience is required ."
    sent = "BS degree ( BSEE or BSCS strongly preferred , MSCS a plus ) and/or the equivalent in training and experience ."
    result = parse(sent,
        tokenize = True,   # Tokenize the input, i.e. split punctuation from words.
        tags = True,       # Find part-of-speech tags.
        chunks = True,     # Find chunk tags, e.g. "the black cat" = NP = noun phrase.
        relations = True,  # Find relations between chunks.
        lemmata = True,    # Find word lemmata.
        light = True)
    pprint(result)
def run(o):
    """ STM is a set of shortcuts to the short_term_memory operators. """
    STM_PATH = './bin/%s/brain/short_term_memory' % o.o['name']
    WM_PATH = './bin/%s/brain/working_memory/' % o.o['name']
    import os, sys
    mydirs = os.listdir(STM_PATH)
    from pattern.en import parse, pprint, tag
    import shutil
    for word in mydirs:
        ignore = [".DS_Store", ".gitignore", "README.txt"]
        if word in ignore:
            continue
        #print word
        s = parse(word, tags=True)
        #print s
        pprint(s)
        tagged = s.split('/')[1]
        #print tagged
        from_path = "%s/%s" % (STM_PATH, word)
        # TODO - ask do you want to move numbers
        #if tagged != "NNP":
        #    pprint(s)
        #    to_path = "%s/%s" % (WM_PATH, "NUMBERS")
        #    os.system("rsync -avrz %s %s" % (from_path, to_path))
        #    shutil.rmtree(from_path)
# - Even though it's not as popular as spaCy or NLTK, the Pattern library has unique
#   functionality, such as finding superlatives and comparatives and fact/opinion
#   detection, which other NLP libraries lack [1].

## installation
# !pip install pattern

# # Python for NLP: Introduction to the Pattern Library [1]

# ## Pattern Library Functions for NLP

# ### Tokenizing, POS Tagging, and Chunking
from pattern.en import parse
from pattern.en import pprint

pprint( parse('I drove my car to the hospital yesterday', relations=True, lemmata=True))
print( parse('I drove my car to the hospital yesterday', relations=True, lemmata=True).split())

# ### Pluralizing and Singularizing the Tokens
from pattern.en import pluralize, singularize
print(pluralize('leaf'))
print(singularize('thieves'))

# ### Converting Adjective to Comparative and Superlative Degrees
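# The section above ends before its code; a minimal sketch using pattern.en's
# comparative() and superlative() helpers (the example adjective is illustrative):
from pattern.en import comparative, superlative
print(comparative('good'))  # better (irregular forms are handled)
print(superlative('good'))  # best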
    light = False)
# The light parameter determines how unknown words are handled.
# By default, unknown words are tagged NN and then improved with a set of rules.
# light=False uses Brill's lexical and contextual rules,
# light=True uses a set of custom rules that is less accurate but faster (5x-10x).
# The output is a string with each sentence on a new line.
# Words in a sentence have been annotated with tags,
# for example: fork/NN/I-NP/I-PNP
# NN = noun, NP = part of a noun phrase, PNP = part of a prepositional phrase.
print(s)
print("")
# Prettier output can be obtained with the pprint() command:
pprint(s)
print("")
# The string's split() method will (unless a split character is given)
# split into a list of sentences, where each sentence is a list of words
# and each word is a list with the word + its tags.
print(s.split())
print("")
# The tag() command returns a list of (word, POS-tag)-tuples.
# With light=True, this is the fastest and simplest way to get an idea
# of a sentence's constituents:
s = "I eat pizza with a fork."
s = tag(s)
print(s)
for word, tag in s:
def test_pprint():
    from pattern.en import parse
    from pattern.en import pprint
    result = parse('I ate pizza.', relations=True, lemmata=True)
    pprint(result)
# -*- coding: utf-8 -*-
"""
Created on Wed Dec 16 11:24:05 2020

@author: praja
"""

from pattern.en import parse
from pattern.en import pprint

pprint(parse('He went to park', relations=True, lemmata=True))
print("successful!!!")
x.replace("\n", " ") for x in nltk.sent_tokenize(plotText.replace("\t", "")) ] for strSentence in sentList: for word, pos in tag(strSentence): if pos in ("VB", "VBD", "VBG", "VBN", "VBP", "VBZ"): # Retrieve all adjectives. print("=====================>>>>> ", word, pos) else: print(word, pos) print(strSentence) a = parse(strSentence, relations=True, lemmata=True) pprint(a) sentence = Sentence(a) print(sentence.verbs) print print #print(sentence.relations) #print(sentence.subjects) #print(sentence.objects) #print(sentence.verbs) #print(sentence.chunk) sentScore = sid.polarity_scores(strSentence) # sqlite3 insert : subject / objects / verbs / CPC / Sentiment
#https://stackabuse.com/python-for-nlp-introduction-to-the-pattern-library/

#standard libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pydataset import data
import seaborn as sns

#pip install pattern
from pattern.en import parse
from pattern.en import pprint

parse('Hello Everyone and Welcome to Analytics India Magazine')
#The parse function labels each word in the sentence as a noun, verb, subject, or object.
#We can also use the pprint function from the pattern library to display the parsed
#sentence in a readable form.
pprint( parse('Hello Everyone and Welcome to Analytics India Magazine', relations=True, tokenize=True, lemmata=True))

#%% ngrams
# n-grams are "n"-word combinations of consecutive words in a sentence.
from pattern.en import ngrams
print(ngrams("Hello Everyone and Welcome to Analytics India Magazine", n=3))
print(ngrams("He goes to hospital", n=2))

#sentiment
#Sentiment refers to an opinion or feeling towards a certain thing. The sentiment
#object is used to find the polarity (positivity or negativity) of a text along
#with its subjectivity.
from pattern.en import sentiment
print(sentiment("He is a good boy but sometimes he behaves miserably"))
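# sentiment() returns a (polarity, subjectivity) pair; a minimal sketch of
# unpacking it and inspecting which words drove the score via .assessments:
from pattern.en import sentiment
score = sentiment("He is a good boy but sometimes he behaves miserably")
polarity, subjectivity = score
print(polarity)           # in [-1.0, 1.0]: negative to positive
print(subjectivity)       # in [0.0, 1.0]: objective to subjective
print(score.assessments)  # the scored words/phrases with their individual polarities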
    tokenize = True,     # Split punctuation marks from words?
    tags = True,         # Parse part-of-speech tags? (NN, JJ, ...)
    chunks = True,       # Parse chunks? (NP, VP, PNP, ...)
    relations = False,   # Parse chunk relations? (-SBJ, -OBJ, ...)
    lemmata = False,     # Parse lemmata? (ate => eat)
    encoding = 'utf-8',  # Input string encoding.
    tagset = None)       # Penn Treebank II (default) or UNIVERSAL.

# parser, tagger and tokenizer
for word, pos in tag('I feel *happy*!', tokenize=True, encoding='utf-8'):
    if pos == "JJ":  # Retrieve all adjectives.
        print word

print tokenize('I feel *happy*!', punctuation=".,;:!?()[]{}`''\"@#$^&*+-|=~_", replace={})

# parser output
pprint(parse('I ate pizza.', relations=True, lemmata=True))

# parse trees
s = parsetree('The cat sat on the mat.',
    tokenize = True,     # Split punctuation marks from words?
    tags = True,         # Parse part-of-speech tags? (NN, JJ, ...)
    chunks = True,       # Parse chunks? (NP, VP, PNP, ...)
    relations = False,   # Parse chunk relations? (-SBJ, -OBJ, ...)
    lemmata = False,     # Parse lemmata? (ate => eat)
    encoding = 'utf-8',  # Input string encoding.
    tagset = None)       # Penn Treebank II (default) or UNIVERSAL.
print repr(s)

for sentence in s:
    for chunk in sentence.chunks:
        print chunk.type, [(w.string, w.type) for w in chunk.words]

for sentence in tree(open('data/input/tagged.txt'),
    light=False)
# The light parameter determines how unknown words are handled.
# By default, unknown words are tagged NN and then improved with a set of rules.
# light=False uses Brill's lexical and contextual rules,
# light=True uses a set of custom rules that is less accurate but faster (5x-10x).
# The output is a string with each sentence on a new line.
# Words in a sentence have been annotated with tags,
# for example: fork/NN/I-NP/I-PNP
# NN = noun, NP = part of a noun phrase, PNP = part of a prepositional phrase.
print s
print
# Prettier output can be obtained with the pprint() command:
pprint(s)
print
# The string's split() method will (unless a split character is given)
# split into a list of sentences, where each sentence is a list of words
# and each word is a list with the word + its tags.
print s.split()
print
# The tag() command returns a list of (word, POS-tag)-tuples.
# With light=True, this is the fastest and simplest way to get an idea
# of a sentence's constituents:
s = "I eat pizza with a fork."
s = tag(s, light=True)
print s
for word, tag in s:
# main ref: http://www.academypublisher.com/jetwi/vol01/no1/jetwi01016076.pdf
# Draw a parse tree recursively.
from textblob import TextBlob
wiki = TextBlob(open('full.txt', 'rU').read())
a = wiki.tags

import nltk
sentence = a
pattern = """NP: {<DT>?<JJ>*<NN>}
             VBD: {<VBD>}
             IN: {<IN>}"""
NPChunker = nltk.RegexpParser(pattern)
result = NPChunker.parse(sentence)
result.draw()

# Recursive array input for POS tagging.
from pattern.en import parse
from pattern.en import pprint
with open('spam.txt', 'rU') as ins:
    array = []
    for line in ins:
        array.append(line)
for i in array:
    pprint(parse(i, relations=True, lemmata=True))

# new reference: https://www.academia.edu/11692120/Human_Intentions_Mining_Through_Natural_Language_Text_Survey
# In the lexical word-list approach the accuracy may be lower, because giving equal
# weight to all the data is a disadvantage; appropriate weights need to be assigned
# to the data.
Make a Nonet (first iteration)
1st line: contains 9 syllables
2nd line: contains 8 syllables
3rd line: contains 7 syllables
...
9th line: contains 1 syllable
"""
from pattern.en import parsetree
from pattern.en import tag
from pattern.en import pprint
import pickle


def word_eval(string):
    pprint(parsetree(string, relations = True))
    for word, pos in tag(string):
        if pos == "NN":
            print word


def gutenberg_text_gather(current_URL):
    from pattern.web import *
    buddhist_psalm_text = URL(current_URL).download()
    print buddhist_psalm_text
    # Save data to a file (will be part of your data fetching script)
    f = open('buddhist_psalm_text.pickle', 'w')
    pickle.dump(buddhist_psalm_text, f)
    f.close()
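# pattern.en does not ship a syllable counter, so budgeting the Nonet's line lengths
# needs one; a minimal sketch using a naive vowel-group heuristic (an assumption,
# not an exact phonetic count):
import re

def count_syllables(word):
    # Count maximal runs of vowels; treat a common trailing silent 'e' as non-syllabic.
    word = word.lower()
    if word.endswith("e") and not word.endswith(("le", "ee")):
        word = word[:-1]
    return max(1, len(re.findall(r"[aeiouy]+", word)))

# e.g. count_syllables("syllable") -> 3, count_syllables("line") -> 1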