def main():
    with open("test.txt", 'r', encoding="utf-8") as f:
        text = f.read()

    if (False):
        sentences = array(split_into_sentences(text, True))
        if (not len(sentences)):
            print("Nothing found")
            exit(-1)
        tags = pos_tag_sents(map(word_tokenize, sentences))
        lemmatized = lemmatize_sents(deepcopy(tags))  # Only for aesthetic reasons
        chunker = RegexpParser("AC: {(<CD>?<TO|IN>?<CD>)+}\n "
                               "AN: {(<NNP>+<DT|NNP|JJ>*)+}\n "
                               "}<DT>+{\n "
                               "PH: {<[B-Z]+>+}\n "
                               "}<DT|CC|PRP|EX|WDT>+{")
        chunked = list(chunker.parse_sents(lemmatized))
        droped = setup_search_structure(chunked, tuple)
        if (True):
            num_print = input("Full data of:[None] ")
            if (num_print):
                num_print = int(num_print)
                print()
                for num_print in range(num_print, num_print + 10):
                    print(sentences[num_print])
                    print()
                    print(tags[num_print])
                    print()
                    print(lemmatized[num_print])
                    print()
                    # chunks = ne_chunk_sents(tags)
                    # iob = [tree2conlltags(chunk) for chunk in chunks]
                    # iob = tree2conlltags(chunks)
                    # print(iob[num_print])
                    # print()
                    # tree = [conlltags2tree(i) for i in iob]
                    # print(tree[num_print])
                    # print()
                    # "NP: {<IN|TO>?((<IN>?<DT>?<JJ.?>*<CD>?<NN.*>+<POS>?)+<CD>?<FW>?)+}\n "
                    # "VP: {((<WP>?<PRP>?<MD>?<VB.?>?<JJ>*<TO>?<VB.?>+<RB>*(<JJ>*<TO>?)*)+<CC>?)+}\n "
                    print(chunked[num_print])
                    print("\n###\n")
                    print(droped[0][num_print])
                    print()
                    if (input(f"({num_print}) ?> ")):
                        break

    ### Search params
    to_search = input("Search: ") or "work"
    tag = {
        '1': 'n',
        '2': 'v',
        '3': 'a',
        '4': 'r'
    }.get(
        input(f"\nWhat '{to_search}'?\n"
              "[1]: Noun\n"
              "[2]: Verb\n"
              "[3]: Adjective\n"
              "[4]: Adverb\n\n"
              "> "), None)
    syn = 'y' in input("\nFind related words too? ").lower()
    exact = 'y' in input("\nFind exact word? ").lower()
    print()

    _, ph_num_ls, sentences = analize_text(text, exact_words=exact)
    num = 1000000
    num2 = 10

    if (to_search):
        if (syn):
            w_rel = words_related(to_search, tag)
        else:
            w_rel = to_search
        ph_nums = find(w_rel, ph_num_ls)
        print()
        if (not len(ph_nums)):
            print(f"{to_search} not in text.")
            exit(0)

        if (False):
            print(f"Looking for \"{to_search}\" {num} times...\n")
            print(timeit.timeit("find(w_rel, ph_num_ls)",
                                number=num,
                                globals={**globals(), **locals()}),
                  end=' seconds\n\n')
        if (False):
            print(f"{num2} times text setup...\n")
            print(timeit.timeit("analize_text(text)",
                                number=num2,
                                globals={**globals(), **locals()}),
                  end=' seconds \n')

        if ("y" in input("Show found instances?[No] ")):
            from colorama import init as color_init
            color_init()
            print()
            if (ph_nums is not None):  # Unnecessary, but clean
                for ph in ph_nums:
                    print(_color_sent(sentences[ph], w_rel))
                    print()
    else:
        print("You did not specify any search param")
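# main() leans on helpers defined elsewhere (split_into_sentences,
# setup_search_structure, analize_text, words_related, find, _color_sent).
# They are not shown in this file; as an illustration only, here is a minimal
# sketch of what _color_sent might look like, assuming w_rel is either a
# single word or a collection of words to highlight:
from colorama import Fore, Style

def _color_sent(sentence, w_rel):
    """Hypothetical sketch: highlight search hits in a sentence."""
    words = w_rel if isinstance(w_rel, (list, set, tuple)) else [w_rel]
    out = []
    for token in sentence.split():
        # compare case-insensitively, ignoring trailing punctuation
        if token.lower().strip(".,;:!?") in words:
            out.append(Fore.RED + token + Style.RESET_ALL)
        else:
            out.append(token)
    return " ".join(out)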
    def __init__(self, formats: list):
        self.__text = "\n".join(formats)
        self.__parse = RegexpParser(self.__text)
    def setParser(self, formats):
        if isinstance(formats, list):
            self.__text = "\n".join(formats)
        else:
            self.__text = formats
        self.__parse = RegexpParser(self.__text)
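# The two methods above appear to belong to a thin wrapper around
# RegexpParser whose enclosing class is not shown in this file. A minimal
# self-contained sketch (the class name ChunkerWrapper and the convenience
# parse() method are assumed, not part of the original):
from nltk import RegexpParser, pos_tag, word_tokenize

class ChunkerWrapper:
    def __init__(self, formats: list):
        self.__text = "\n".join(formats)
        self.__parse = RegexpParser(self.__text)

    def setParser(self, formats):
        self.__text = "\n".join(formats) if isinstance(formats, list) else formats
        self.__parse = RegexpParser(self.__text)

    def parse(self, sentence: str):
        # convenience method added for the sketch only
        return self.__parse.parse(pos_tag(word_tokenize(sentence)))

# Example: chunk a sentence with two cascaded rules
wrapper = ChunkerWrapper(["NP: {<DT>?<JJ>*<NN>}", "VP: {<VB.*><NP>?}"])
print(wrapper.parse("The dog chased a cat"))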
class TextParser:
    """
    Utility class for processing text content.
    """

    substitutions = [
        (r"\b(im|i'm)\b", "i am"),
        (r"\b(id|i'd)\b", "i would"),
        (r"\b(i'll)\b", "i will"),
        (r"\bbf|b/f\b", "boyfriend"),
        (r"\bgf|g/f\b", "girlfriend"),
        (r"\byoure\b", "you are"),
        (r"\b(dont|don't)\b", "do not"),
        (r"\b(didnt|didn't)\b", "did not"),
        (r"\b(wasnt|wasn't)\b", "was not"),
        (r"\b(isnt|isn't)\b", "is not"),
        (r"\b(arent|aren't)\b", "are not"),
        (r"\b(werent|weren't)\b", "were not"),
        (r"\b(havent|haven't)\b", "have not"),
        (r"\b(couldnt|couldn't)\b", "could not"),
        (r"\b(hadnt|hadn't)\b", "had not"),
        (r"\b(wouldnt|wouldn't)\b", "would not"),
        (r"\bgotta\b", "have to"),
        (r"\bgonna\b", "going to"),
        (r"\bwanna\b", "want to"),
        (r"\b(kinda|kind of)\b", ""),
        (r"\b(sorta|sort of)\b", ""),
        (r"\b(dunno|donno)\b", "do not know"),
        (r"\b(cos|coz|cus|cuz)\b", "because"),
        (r"\bfave\b", "favorite"),
        (r"\bhubby\b", "husband"),
        (r"\bheres\b", "here is"),
        (r"\btheres\b", "there is"),
        (r"\bwheres\b", "where is"),
        # Common acronyms, abbreviations and slang terms
        (r"\birl\b", "in real life"),
        (r"\biar\b", "in a relationship"),
        (r"\btotes\b", "totally"),
        (r",", " and "),
        # Remove fluff phrases
        (r"\b(btw|by the way)\b", ""),
        (r"\b(tbh|to be honest)\b", ""),
        (r"\b(imh?o|in my( humble)? opinion)\b", ""),
        # Default POS tagger seems to always tag "like"
        # (and sometimes "love") as a noun - this is a bandaid fix for now
        (r"\bprefer\b", ""),
        (r"\b(like|love)\b", "prefer"),
    ]

    # Skip if any of these is the *only* attribute - for instance,
    # "I'm a big fan of Queen" makes sense, but "I'm a fan" doesn't.
    skip_lone_attributes = [
        "fan", "expert", "person", "advocate", "customer",
    ]

    # A select set of attributes we want to exclude.
    skip_attributes = [
        "supporter", "believer", "gender", "backer", "sucker", "chapter",
        "passenger", "super", "water", "sitter", "killer", "stranger",
        "monster", "leather", "holder", "creeper", "shower", "member",
        "wonder", "hungover", "sniper", "silver", "beginner", "lurker",
        "loser", "number", "stupider", "outlier", "molester", "hitler",
        "beer", "cucumber", "earlier", "denier", "lumber", "hamster",
        "abuser", "murderer", "dealer", "consumer", "wallpaper", "paper",
        "madder", "uber", "computer", "rubber", "door", "liquor", "traitor",
        "favor", "year", "ear", "liar", "rapist", "racist", "misogynist",
        "apologist", "sexist", "satan", "batman", "veteran", "ban",
        "hypocrite", "candidate", "lot", "f****t", "teapot", "shot", "foot",
        "idiot", "bigot", "robot",
    ]

    # A select set of attributes we want to include.
    include_attributes = [
        "geek", "nerd", "nurse", "cook", "student", "consultant", "mom",
        "dad", "marine", "chef", "sophomore", "catholic", "mod",
        # TODO - These make sense only when accompanied by
        # at least another noun
        # "person", "enthusiast", "fanboy", "player", "advocate",
    ]

    # Super awesome logic - if noun ends in any of these, it's *probably*
    # something we want to include/exclude. TODO - This is terrible logic,
    # see if we can implement actual NLP.
    include_attribute_endings = ("er", "or", "ar", "ist", "an", "ert",
                                 "ese", "te", "ot")
    exclude_attribute_endings = ("ing", "f****r")

    # "Filler" words (in sentences such as "I think...", "I guess...", etc.)
    skip_verbs = ["were", "think", "guess", "mean"]
    skip_prepositions = ["that"]
    skip_adjectives = ["sure", "glad", "happy", "afraid", "sorry", "certain"]
    skip_nouns = [
        "right", "way", "everything", "everyone", "things", "thing",
        "mine", "stuff", "lot",
    ]

    # Should _N include conjunctions?
grammar = r""" # adverb* verb adverb* # - really think, strongly suggest, look intensely _VP: {<RB.*>*<V.*>+<RB.*>*} # determiner adjective noun(s) # - a beautiful house, the strongest fighter _N0: {(<DT>*<JJ.*>*<NN.*>+(?!<POS>))+} _N: {<_N0>+} # noun to/in noun # - newcomer to physics, big fan of Queen, newbie in gaming _N_PREP_N: {<_N>((<TO>|<IN>)<_N>)+} # my adjective noun(s) # - my awesome phone POSS: {<PRP\$><_N>} # I verb in* adjective* noun # - I am a great chef, I like cute animals, # - I work in beautiful* New York, I live in the suburbs ACT1: {<PRP><_VP><IN>*<_N>} # Above + to/in noun # - I am a fan of Jaymay, I have trouble with flannel ACT2: {<PRP><_VP><IN>*<_N_PREP_N>} """ chunker = RegexpParser(grammar) def clean_up(self, text): """ Removes unnecessary words from text and replaces common misspellings/contractions with expanded words. """ for original, rep in self.substitutions: text = re.sub(original, rep, text, flags=re.I) return text def normalize(self, word, tag="N"): """ Normalizes word using given tag. If no tag is given, NOUN is assumed. """ kind = NOUN if tag.startswith("V"): kind = VERB elif tag.startswith("RB"): kind = ADV elif tag.startswith("J"): kind = ADJ return Word(word).lemmatize(kind).lower() def pet_animal(self, word): """ Returns word if word is in a predefined list of pet animals. """ word = word.lower() if re.match(r"\b(dog|cat|hamster|fish|pig|snake|rat|parrot)\b", word): return word else: return None def family_member(self, word): """ Returns normalized word if word is in a predefined list of family members. """ word = word.lower() if re.match(r"\b(mom|mother|mum|mommy)\b", word): return "mother" elif re.match(r"\b(dad|father|pa|daddy)\b", word): return "father" elif re.match(r"\b(brother|sister|son|daughter)s?\b", word): return word else: return None def relationship_partner(self, word): """ Returns word if word is in a predefined list of relationship partners. """ word = word.lower() if re.match(r"\b(ex-)*(boyfriend|girlfriend|so|wife|husband)\b", word): return word else: return None def gender(self, word): """ Returns normalized word if word is in a predefined list of genders. """ word = word.lower() if re.match(r"\b(girl|woman|female|lady|she)\b", word): return "female" elif re.match(r"\b(guy|man|male|he|dude)\b", word): return "male" else: return None def orientation(self, word): """ Returns word if word is in a predefined list of sexual orientations. """ word = word.lower() if re.match(r"\b(gay|straight|bi|bisexual|homosexual)\b", word): return word else: return None def process_verb_phrase(self, verb_tree): """ Returns list of (word,tag) tuples given a verb tree. """ if verb_tree.label() != "_VP": return None verb_phrase = [(w.lower(), t) for w, t in verb_tree.leaves()] return verb_phrase def process_noun_phrase(self, noun_tree): """ Returns list of (word,tag) tuples given a noun tree. """ if noun_tree.label() != "_N": return [] if any(n in self.skip_nouns + stopwords for n, t in noun_tree.leaves() if t.startswith("N")): return [] noun_phrase = [(w.lower(), t) for w, t in noun_tree.leaves()] return noun_phrase def process_npn_phrase(self, npn_tree): """ Given a phrase of the form noun-preposition-noun, returns noun and preposition-noun phrases. 
""" if npn_tree.label() != "_N_PREP_N": return None noun_phrase = [] prep_noun_phrase = [] for i in range(len(npn_tree)): node = npn_tree[i] # we have hit the prepositions in a prep noun phrase if type(node) is tuple: w, t = node w = w.lower() prep_noun_phrase.append((w, t)) else: if prep_noun_phrase: prep_noun_phrase += self.process_noun_phrase(node) else: noun_phrase = self.process_noun_phrase(node) return (noun_phrase, prep_noun_phrase) def process_possession(self, phrase): """ Given a phrase, checks and returns a possession/belonging (my <word>) if exists. """ noun_phrase = [] for i in range(len(phrase)): node = phrase[i] if type(node) is tuple: # word can only be pronoun w, t = node if t == "PRP$" and w.lower() != "my": return None else: # type has to be nltk.tree.Tree if node.label() == "_N": noun_phrase = self.process_noun_phrase(node) else: # what could this be? pass if noun_phrase: return {"kind": "possession", "noun_phrase": noun_phrase} else: return None def process_action(self, phrase): """ Given a phrase, checks and returns an action (I <verb-phrase>) if exists. """ verb_phrase = [] prepositions = [] noun_phrase = [] prep_noun_phrase = [] for i in range(len(phrase)): node = phrase[i] if type(node) is tuple: # word is either pronoun or preposition w, t = node if t == "PRP" and w.lower() != "i": return None elif t == "IN": prepositions.append((w.lower(), t)) else: # what could this be?! pass else: if node.label() == "_VP": verb_phrase = self.process_verb_phrase(node) elif node.label() == "_N": noun_phrase = self.process_noun_phrase(node) elif node.label() == "_N_PREP_N": noun_phrase, prep_noun_phrase = ( self.process_npn_phrase(node)) if noun_phrase: return { "kind": "action", "verb_phrase": verb_phrase, "prepositions": prepositions, "noun_phrase": noun_phrase, "prep_noun_phrase": prep_noun_phrase } else: return None def extract_chunks(self, text): """ Given a block of text, extracts and returns useful chunks. TODO - Should sentiments be excluded here? """ chunks = [] sentiments = [] text = self.clean_up(text) blob = TextBlob(text, pos_tagger=pattern_tagger, analyzer=naive_bayes_analyzer) for sentence in blob.sentences: if (not sentence.tags or not re.search(r"\b(i|my)\b", str(sentence), re.I)): continue tree = self.chunker.parse(sentence.tags) for subtree in tree.subtrees( filter=lambda t: t.label() in ['POSS', 'ACT1', 'ACT2']): phrase = [(w.lower(), t) for w, t in subtree.leaves()] phrase_type = subtree.label() if not any( x in [("i", "PRP"), ("my", "PRP$")] for x in [(w, t) for w, t in phrase] ) or (phrase_type in ["ACT1", "ACT2"] and (any(word in self.skip_verbs for word in [w for w, t in phrase if t.startswith("V")]) or any(word in self.skip_prepositions for word in [w for w, t in phrase if t == "IN"]) or any(word in self.skip_adjectives for word in [w for w, t in phrase if t == "JJ"]))): continue if subtree.label() == "POSS": chunk = self.process_possession(subtree) if chunk: chunks.append(chunk) elif subtree.label() in ["ACT1", "ACT2"]: chunk = self.process_action(subtree) if chunk: chunks.append(chunk) return (chunks, sentiments) def ngrams(self, text, n=2): """ Returns a list of ngrams for given text. """ return [" ".join(w) for w in TextBlob(text).ngrams(n=n)] def noun_phrases(self, text): """ Returns list of TextBlob-derived noun phrases. """ return TextBlob(text).noun_phrases def common_words(self, text): """ Given a text, splits it into words and returns as a list after excluding stop words. 
""" return [ word for word in list(TextBlob(text).words) if (word not in stopwords and word.isalpha()) ] def total_word_count(self, text): """ Returns total word count of a given text. """ return len(list(TextBlob(text).words)) def unique_word_count(self, text): """ Returns unique word count of a given text. """ return len(set(list(TextBlob(text).words))) def longest_word(self, text): """ Returns longest word in a given text. """ return max((list(TextBlob(text).words)), key=len) @staticmethod def test_sentence(sentence): """ Prints TextBlob-derived tags for a given sentence. For testing purposes only. """ print TextBlob(sentence).tags
    CA: { <JJR><VB.*>|<RB>?<JJ> }

    # Adjectives
    AJ: { <CA>(<CC>?<CA>)* }

    # Entities
    EN: {<AJ>?<NN.*|FW>+}

    # Noun-phrases
    NP: {<DT>?<CC>?(<CC><CD>)*<EN>(<CC>?<EN>)*}

    # Rest should be considered as a Verb-Phrase Chunk
    VP: {<.*>+}
        }<NP>+{
'''

PARSER = RegexpParser(GRAMMAR)
LEMMATIZER = WordNetLemmatizer()
STOPWORDS = stopwords.words('english')


class TextParser:
    @staticmethod
    def calculate_similarity(a, b) -> float:
        return SequenceMatcher(None, a, b).ratio()

    @staticmethod
    def generate_pos_tag_sets(input_string: str) -> list:
        """
        Break given string into sentences, and return their pos-tagged lists.

        **REQUIRES AN ACTIVE POS TAGGER TO BE RUNNING!!**

        :param input_string: input string. may contain one or more sentences
pos_tagged_text = list()

# create a for loop through each word tokenized sentence here
for sentence in word_tokenized_text:
    # part-of-speech tag each sentence and append to the list of pos-tagged sentences here
    pos_tagged_text.append(pos_tag(sentence))

# store and print any part-of-speech tagged sentence here
single_pos_sentence = pos_tagged_text[100]
print(single_pos_sentence)

# define noun phrase chunk grammar here
np_chunk_grammar = "NP: {<DT>?<JJ>*<NN>}"

# create noun phrase RegexpParser object here
np_chunk_parser = RegexpParser(np_chunk_grammar)

# define verb phrase chunk grammar here
vp_chunk_grammar = "VP: {<DT>?<JJ>*<NN><VB><RB>?}"

# create verb phrase RegexpParser object here
vp_chunk_parser = RegexpParser(vp_chunk_grammar)

# create a list to hold noun phrase chunked sentences and a list to hold
# verb phrase chunked sentences here
np_chunked_text = list()
vp_chunked_text = list()

# create a for loop through each pos-tagged sentence here
for sentence in pos_tagged_text:
    # chunk each sentence and append to the lists here
    np_chunked_text.append(np_chunk_parser.parse(sentence))
    vp_chunked_text.append(vp_chunk_parser.parse(sentence))
GRAMMAR = 'NP: {<DT|PP\$>?<JJ.*>*<NN.*>+}\n{<JJ.*>*<NN*><CC>*<NN*>+}\n{<NNP>+}\n{<NN>+}'

# require
from nltk import RegexpParser, sent_tokenize, pos_tag, word_tokenize, FreqDist
import sys

# sanity check
if len(sys.argv) != 2:
    sys.stderr.write('Usage: ' + sys.argv[0] + " <file>\n")
    quit()

# get input
file = sys.argv[1]

# initialize
parser = RegexpParser(GRAMMAR)

# open and read the input
handle = open(file, 'r')
data = handle.read()

# get all sentences and process them
sentences = sent_tokenize(data)
phrases = []
for sentence in sentences:
    # tokenize and tag the sentence
    sentence = pos_tag(word_tokenize(sentence))

    # parse the sentence and process each noun phrase
    tree = parser.parse(sentence)
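    # The snippet ends mid-loop; a plausible completion (an assumption, not
    # part of the original) that collects each NP chunk and then ranks the
    # phrases with the already-imported FreqDist:
    for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
        phrases.append(' '.join(word for word, tag in subtree.leaves()))

for phrase, count in FreqDist(phrases).most_common(20):
    print(phrase, count)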
def getParse(sentence) -> str:
    # Preset
    nlp = StanfordCoreNLP('stanford-corenlp-4.2.0/', memory='8g')
    cc = OpenCC('t2s')
    # sentence = 'Those two splendid old electric trains.'
    print("##################################################################################")

    # POS
    print('POS:', nlp.pos_tag(sentence))
    print("##################################################################################")

    # Tokenize
    print('Tokenize:', nlp.word_tokenize(sentence))
    print("##################################################################################")

    # NER
    print('NER:', nlp.ner(sentence))
    print("##################################################################################")

    # Parser
    tree = nlp.parse(sentence)
    parse_string = ' '.join(str(tree).split())
    print(parse_string)

    # ParserTest
    print('Parser:')
    print(nlp.parse(sentence))
    print("##################################################################################")

    # TREE Graph
    tagged = pos_tag(word_tokenize(sentence))

    # Extract all parts of speech from any text
    chunker = RegexpParser("""
    NP: {<DT>?<JJ>*<NN>}   # To extract Noun Phrases
    P: {<IN>}              # To extract Prepositions
    V: {<V.*>}             # To extract Verbs
    PP: {<P> <NP>}         # To extract Prepositional Phrases
    VP: {<V> <NP|PP>*}     # To extract Verb Phrases
    """)

    # Print all parts of speech in above sentence
    output = chunker.parse(tagged)
    print("After Extracting\n", output)

    # To draw the parse tree
    output.draw()
    print("##################################################################################")

    # Close Stanford Parser
    nlp.close()
    return str(parse_string)
def chunk_it_up(tagged_text):
    chunk_pattern = "Chunk: {<DT>?<JJ>*<NN>}"
    chunk_parser = RegexpParser(chunk_pattern)
    chunked = chunk_parser.parse(tagged_text)
    chunked.draw()
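# Example call (assumes NLTK's punkt and averaged_perceptron_tagger models
# are available; .draw() opens a Tkinter window, so a desktop session is needed):
from nltk import pos_tag, word_tokenize
chunk_it_up(pos_tag(word_tokenize("The quick brown fox jumps over the lazy dog")))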
from nlp_consol import stanford_tree

# def stanford_tree(line):
#     output = nlp.annotate(line, properties={
#         'annotators': 'tokenize,ssplit,pos,parse',
#         'outputFormat': 'json'
#     })
#     try:
#         return output['sentences']
#     except IndexError:
#         pass

NN_grammar = r"""
Noun_phrase: {<NN.*>+}
"""
np_parser = RegexpParser(NN_grammar)


def get_np(parse_tree):
    if isinstance(parse_tree, Tree):
        all_np = []
        get_tokens = parse_tree.pos()
        fish_np = np_parser.parse(get_tokens)
        for obj in fish_np:
            if isinstance(obj, Tree):
                np_items = [x[0] for x in obj]
                all_np.append(' '.join(np_items))
        return all_np

# read_unmatched = open('whatsapp_unmatched.txt', 'r').read().split('\n')
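# Example with a hand-built parse tree standing in for stanford_tree output
# (the Tree import is assumed to be among the file's truncated imports):
from nltk.tree import Tree

sample = Tree.fromstring(
    "(S (NP (DT the) (NN weather) (NN report)) (VP (VBZ looks) (JJ good)))")
print(get_np(sample))  # -> ['weather report']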
    # {<J.*>+<N.*>+}
    {<J.*>?<N.*>+}
    # {<N.*>+<OF>?<N.*>+}
    {<N.*>+<IN>?<DT>?<J.*>+<N.*>+}
    {<NNP>+<IN>?<DT>?<J.*>?<NNP>+}
    {<N.*>+<CC>?<DT>?<J.*>+<N.*>+}
    {<N.*>+<CC>?<DT>?<J.*>?<N.*>+}
    <``>{<.*>+}<''>
    <BRA>{<.*>+}<BRB>
    NUM: {<CD>+}
"""

rg_parser = RegexpParser(grammar=grammar)


def guess(qa):
    qd = qa['q']
    ast = qa['as']

    def cg(tg):
        if tg[1] == '(':
            return tg[0], 'BRA'
        if tg[1] == ')':
            return tg[0], 'BRB'
        if tg[0] == 'of':
            return 'of', 'OF'
        return tg
def calculate_calories(descriptions):
    NP = "NP: {(<V\w+>|<NN\w?>)}"
    chunker = RegexpParser(NP)
    items = get_continuous_chunks(descriptions.lower(), chunker.parse)
    # items = ''.join(nouns)

    API_KEY = 'FEjjqylAG6cqOjq8n2sO1y3njopvccXmVPwIJYGs'
    url = 'https://api.nal.usda.gov/fdc/v1/foods/search?'

    total = 0
    for item in items:
        r = requests.get(url + 'api_key={}&query={}'.format(API_KEY, item))
        res = r.json()
        nutrients = res['foods'][0]['foodNutrients']
        calorie = 0  # default if no Energy nutrient is found
        for nutrient in nutrients:
            if nutrient['nutrientName'] == 'Energy':
                calorie = nutrient['value']
                break
        if item == 'spicy':
            calorie = 0
        if item == 'cream':
            calorie = calorie / 5
        if item == 'sauce contains bacon':
            calorie = 0
        if item == 'note':
            calorie = 0
        if item == 'rib':
            calorie = 0
        if 'cheese' in item:
            calorie = calorie / 5
        if 'seeds' in item:
            calorie = calorie / 15
        if 'quinoa' in item:
            calorie = calorie / 5
        if 'dressing' in item:
            calorie = calorie / 2
        if 'sourdough' in item:
            calorie = calorie / 8
        if 'lemon' in item:
            calorie = calorie / 10
        if 'crushed' in item:
            calorie = calorie / 2
        if 'butter' in item:
            calorie = calorie / 100
        if 'granola' in item:
            calorie = calorie / 10
        if 'fruit' in item:
            calorie = calorie / 10
        if 'honey' in item:
            calorie = calorie / 10
        if 'compote' in item:
            calorie = calorie / 5
        if 'mayo' in item:
            calorie = calorie / 2
        if 'fried egg' in item:
            calorie = calorie / 10
        if 'potato bun' in item:
            calorie = calorie / 4
        if 'sauce' in item:
            calorie = calorie / 2
        if 'rigatoni' in item:
            calorie = calorie / 2
        if 'pesto' in item:
            calorie = calorie / 2
        if 'breadcrumbs' in item:
            calorie = calorie / 10
        if 'flakes' in item:
            calorie = calorie / 10
        if 'bacon' in item:
            calorie = calorie / 5
        if 'pappardelle' in item:
            calorie = calorie / 2
        total += calorie

    return jsonify({'calories': total})
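# calculate_calories() relies on get_continuous_chunks(), which is defined
# elsewhere. A common implementation of that helper (an assumption, not the
# original) joins the leaves of each chunk produced by the parse function:
import nltk
from nltk import pos_tag, word_tokenize

def get_continuous_chunks(text, chunk_func):
    chunked = chunk_func(pos_tag(word_tokenize(text)))
    chunks, current = [], []
    for item in chunked:
        if isinstance(item, nltk.Tree):
            # inside a chunk: accumulate its words
            current.extend(leaf[0] for leaf in item.leaves())
        elif current:
            # chunk ended: flush the accumulated words
            chunks.append(" ".join(current))
            current = []
    if current:
        chunks.append(" ".join(current))
    return chunks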
print("After Lemmatization :") print(lemmatized_tokens) pos_tagged_word_list = pos_tag(lemmatized_tokens) print("After POS Tagging :") print(pos_tagged_word_list) grammar = """ NP: {<DT>?<JJ>*<NN>} {<NNP>+} {<NN><NN>} {<NNS><VBP>} {<V.*> <TO> <V.*>} {<N.*>(4,)} """ NPChunker = RegexpParser(grammar) chunked_result = NPChunker.parse(pos_tagged_word_list) shallow_parsed_set = list() for sub_tree in chunked_result: if type(sub_tree) is nltk.tree.Tree: if sub_tree.label() == 'NP': for w, t in sub_tree.leaves(): if 'NN' in t: shallow_parsed_set.append(w) print("After Chunking (Shallow Parsing) :") print(shallow_parsed_set) hypernym_parsed_set = list() meronym_parsed_set = list()
def taggerAndResultBuilder(emailInput):
    # Use a sent tokenizer (to maintain things like colons, for times, etc.)
    sentences = sent_tokenize(emailInput)
    sentencesBeforeTagging = [word_tokenize(sent) for sent in sentences]
    sentences = [pos_tag(sent) for sent in sentencesBeforeTagging]

    # This was the best that I could possibly come up with given the time I had.
    overallGrammar = """
    CLAUSE0: {<IN>?<NNP>+<CD><CD>?}
    CLAUSE1: {<DT><CD>}
    DATE: {<CLAUSE0|CLAUSE1>}
    CLAUSE2: {<VBZ>?<TO><CD><CC|NN|VBP|VBZ>?}
    CLAUSE3: {<IN|VB><RB><CD|IN><CD>?<NN|NNS>}
    CLAUSE4: {<IN><IN><CD><NN>?}
    TIME_END: {<CLAUSE2|CLAUSE3|CLAUSE4>}
    CLAUSE5: {<IN><DT>?<NN>*<NNP>+<NNPS>*<NN>?}
    CLAUSE6: {<IN><DT><NN>}
    CLAUSE7: {<TO><NNP>}
    LOCATION: {<CLAUSE5|CLAUSE6|CLAUSE7>}
    TIME_START: {<CD><NN|VBP|VBZ>?}
    """
    # DA1: {<IN>?<NNP>+<CD><CD>?}
    # DA2: {<DT><CD>}
    # DATE: {<DA1|DA2>}
    # TE3: {<IN><RB><CD><NN|NNS>}
    # TE4: {<VB><RB><IN><CD>}
    # TIME_END: {<TE1|TE2|TE3|TE4>}
    # TS1: {<CD><NN|VBP|VBZ>?}
    # TS2: {<VBZ><IN><CD>}
    # TIME_START: {TS1|TS2}
    # L1: {<IN><DT>?<NN>*<NNP>+<NNPS>*<NN>?}
    # L2: {<IN><DT><NN>}
    # L3: {<TO><NNP>}
    # LOCATION: {<L1|L2|L3>}
    # """

    # LOCATION has an optional noun at the end in case a word like "building"
    # or "place" is included.
    # In the off case that someone enters "am" instead of "A.M.", it can
    # actually be mistaken for a verb; that's why there are cases for
    # VBP and VBZ in TIME_START.

    # Grammar for nouns like "tonight", "tomorrow", "this afternoon",
    # "this evening", etc. Check whether these nouns exist; if they do,
    # compare against the overall grammar. If there is no date, record these.
    dateNounGrammar = """
    DATE1: {<JJ><NN>+}
    DATE2: {<DT><NN>+}
    DATE3: {<DT><NNP>+}
    DATE4: {<DT|JJ|NN><VBG>}
    DATE5: {<NN>+}
    DATE6: {<JJ><NNP>}
    """
    # DATE1 for catching things like "friday night" or "thursday night" where
    #   the day isn't capitalized and thus is JJ
    # DATE2 for "the evening time" or something like that.
    # DATE3 for "this Friday" (the tagger messes up the classification of
    #   capitalized days, etc.)
    # DATE4 for "this evening", or "(t/T)hursday evening"
    # DATE5 for "tonight", "tomorrow night", "this afternoon", "this evening",
    #   "lunch", "dinner", etc.
    # DATE6 for "this Friday", etc.

    # -----------------------------------------------------------------------
    # This is now the grammar that will be used to extract events.
    # Keep in mind that it is often the first noun in the scheduling email
    # that will be found. This is a known fact in information extraction.
    #
    # For example see:
    # http://www.iosrjournals.org/iosr-jce/papers/Conf-%20ICFTE%E2%80%9916/Volume-1/12.%2072-79.pdf?id=7557
    #
    # I was also able to come up with a grammar based on all of the random
    # sentences I generate.
    # -----------------------------------------------------------------------
    eventGrammar = """
    EVENT1: {<DT><NN><VBG><NN>}
    EVENT2: {<DT|VBG><NN>+}
    EVENT3: {<VB|VBG><IN><NN>+}
    EVENT4: {<VBG|VBP><NNP>?<NNS>}
    EVENT5: {<NNS><VBP>}
    EVENT6: {<VB><NN|RP>}
    EVENT7: {<VB><DT><NN>}
    EVENT8: {<DT><NN><VBG><NN>}
    EVENT9: {<NN>}
    """
    # EVENT1 for "a cake eating contest"
    # EVENT2 for "having lunch" or "a meeting", or "curriculum meeting", etc.
    # EVENT3 for "wrestling in space" or "wrestle in space" or "going for ice cream", etc.
    # EVENT4 for things like "buying Guinness beer"
    # EVENT5 for "doctor's appointment"
    # EVENT6 for "drive home" or "run away"
    # EVENT7 for "running the tap"
    # EVENT8 for "lunch" or "dinner", etc. This is last because the other POS
    #   sequences should have priority.
    # EVENT9 for pretty much everything else that could be valid.

    # Extra location grammar
    # file_object = open(homeDirectory + "testerDataOutput.txt", "a")

    dateTimeLocationAndEventList = []
    parser1 = RegexpParser(overallGrammar)
    parser2 = RegexpParser(dateNounGrammar)
    parser3 = RegexpParser(eventGrammar)

    for sentence in sentences:
        result1 = parser1.parse(sentence)
        result2 = parser2.parse(sentence)
        result3 = parser3.parse(sentence)
        dateTimeLocationAndEventResult = cleanTaggedExpressions(
            result1, result2, result3, sentencesBeforeTagging, emailInput)
        dateTimeLocationAndEventList.append(dateTimeLocationAndEventResult)

    resultString = ""
    for result in dateTimeLocationAndEventList:
        for iter in result:
            if ("undetermined" not in iter) and (iter not in resultString):
                resultString += iter
                resultString += ", "

    for info in infoTypes:
        if info not in resultString:
            resultString += info + ": undetermined"
            resultString += ", "

    resultString = resultString.rstrip(" ")
    resultString = resultString.lstrip(" ")
    resultString = resultString.rstrip(",")
    resultString = resultString.lstrip(",")

    # If multiple dates were found by the tagger, just offer the other date
    # as additional info; this ultimately makes the program more robust!
    # Forget about checking times, because this is already double-checked by regex!
    for info in infoTypes:
        if (info == 'DATE'):
            checkerInfo = info + ":"
            count = resultString.count(info)
            if (count > 1):
                newString = resultString.rsplit(info, resultString.count(info) - 1)
                new = info + "_ADDITIONAL_INFO_FOUND"
                resultString = new.join(newString)

    return resultString
lemmatizer = WordNetLemmatizer()
for word in big_words:
    lemmatized_tokens.append(lemmatizer.lemmatize(word))

# Add POS tagging to each lemmatized word
pos_tagged_list = pos_tag(lemmatized_tokens)  # [('Abu', 'NN')]

# ------------------ Chunking (Shallow Parsing) -----------------------
grammar = """
    NP: {<DT>?<JJ>*<NN>}
        {<NNP>+}
        {<NN><NN>}
        {<NNS><VBP>}
        {<V.*> <TO> <V.*>}
        {<N.*><N.*><N.*><N.*>+}  # four or more nouns
"""
# Chunking rule: return the best chunk structure for the given tokens
# as a tree.
# http://www.bogotobogo.com/python/NLTK/chunking_NLTK.php
NPChunker = RegexpParser(grammar)
chunked_result = NPChunker.parse(pos_tagged_list)
'''
(S
  (NP accenture/NN)
  (NP plc/NN)
  (NP global/JJ management/NN)
'''

# ------------------ Shallow Parsed List -----------------------
# We only need nouns that contribute to the sentence. Ignore everything else.
shallow_parsed_list = list()
content = inputFile.read()
inputFile.close()

nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

contentSplit = word_tokenize(content)
print("After Split:", contentSplit)

tokens_tag = pos_tag(contentSplit)
print("After Token:", tokens_tag)

patterns = """groupeNom: {<JJ.*><NN.*>}
              {<NN.*><NN.*>}
              {<JJ.*><JJ.*><NN.*>}
              {<JJ.*><NN.*><NN.*>}
"""
chunker = RegexpParser(patterns)
print("After Regex:", chunker)
output = chunker.parse(tokens_tag)

# for outputBuf in output:
#     print("After Chunking", outputBuf)

# Creating output files:
outFile = open(outputPath, "w")
for outputBuffer in output:
    if (len(outputBuffer[0][0]) > 1):
        for outBufferSplit in outputBuffer:
            if (len(outBufferSplit[0]) > 2):
                outFile.write(outBufferSplit[0] + "\t" + outBufferSplit[1] + "\t")
        outFile.write("\n")
outFile.close()
from nltk import RegexpParser
from pos_tagged_oz import pos_tagged_oz
from np_chunk_counter import np_chunk_counter

# define noun-phrase chunk grammar here
chunk_grammar = "NP: {<DT>?<JJ>*<NN>}"

# create RegexpParser object here
chunk_parser = RegexpParser(chunk_grammar)

# create a list to hold noun-phrase chunked sentences
np_chunked_oz = list()

# create a for loop through each pos-tagged sentence in pos_tagged_oz here
for words in pos_tagged_oz:
    np_chunked_oz.append(chunk_parser.parse(words))

# store and print the most common np-chunks here
most_common_np_chunks = np_chunk_counter(np_chunked_oz)
print(most_common_np_chunks)
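# np_chunk_counter comes from a course helper module that is not shown here.
# A minimal stand-in (an assumption, not the original) that tallies how often
# each NP chunk occurs across the chunked sentences:
from collections import Counter

def np_chunk_counter_sketch(chunked_sentences, most_common=30):
    chunks = []
    for tree in chunked_sentences:
        for subtree in tree.subtrees(filter=lambda t: t.label() == "NP"):
            # leaves are (word, tag) tuples, so a tuple of them is hashable
            chunks.append(tuple(subtree))
    return Counter(chunks).most_common(most_common)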
            if first_idx is None:
                first_idx = int(token_range.split(':')[0])
            new_token_range = ':'.join(
                [str(int(x) - first_idx) for x in token_range.split(':')])
            new_line = line.strip()[:-len(token_range)] + new_token_range
            # print(token_range, new_token_range, first_idx, new_line)
            new_lines.append(new_line)
        new_texts.append('\n'.join(new_lines).strip())
    return new_texts


errors_to_correct = [
    (('Prepositions', 'Prepositional_verb', 'Prepositional_adjective',
      'Prepositional_adv', 'Prepositional_noun'),
     ('Spelling', ),
     PrepositionCorrector(),
     prepositions,
     RegexpParser('NP: {<IN|TO>?<DT|JJ.?|PRP\$|POS|RB.|CD|NN.*>*<NN.*|PRP>}'))
]
# (('Articles', ('Spelling', 'Prepositions', 'Prepositional_verb',
#   'Prepositional_adjective', 'Prepositional_adv', 'Prepositional_noun'),
#   ArticleCorrector(), ['a', 'an', 'the', 'zero'],
#   RegexpParser(r'NP: {<DT|JJ.?|PRP\$|POS|RB.|CD|NN.*>*<NN.*>}')))

# regexp-based chunker
for err, preverr, corrector, options, chunker in errors_to_correct:
    predsp = None
    predst = None
    correct = []
    all_sents = []
    tagged_sents = []
    init_sents = []
    tn = 0
def GetVerbPhrase(sentence):
    # print('GetVerbPhrase is called')
    output = ''
    verb_token = ''

    # Parse only VB or VBP because RegexpParser is inaccurate at times
    grammar = 'VP: {<VB> | <VBP>}'

    # Create the parser object
    cp = RegexpParser(grammar)

    # Tokenize the input and get parts of speech
    pos = pos_tag(word_tokenize(sentence))
    result = cp.parse(pos)

    # Debug: look at the tree formed
    # result.draw()
    # print(result)

    # Loop through the tree data structure and pull the values under the VP
    # node we created for the result
    for subtree in result.subtrees(filter=lambda t: t.label() == 'VP'):
        verb_token = ' '.join(item[0] for item in subtree.leaves())
        # print('verb found:' + verb_token)

    misclassified_verbs = ['is', 'are', 'am', 'do']
    if verb_token in misclassified_verbs:
        return ''  # if it is a verb that cannot be converted, just return blank

    if (len(verb_token.strip()) == 0):
        return verb_token.strip()  # if there's no verb, just return blank

    # Second half of the program
    # Begin with creating a wordnet library object
    wn = wordnet

    # debugging
    # wl = WordNetLemmatizer()
    # wn.lemma('give.v.01.give').derivationally_related_forms()

    # Use try/except because some verbs do not have a noun form and raise
    # an exception
    try:
        # create a lemma key of the form verb + '.v.01.' + verb => this is
        # what wordnet's lemma method takes
        lemma_word = verb_token + '.v.01.' + verb_token
        # debug to try
        # wn.lemma('perform.v.01.perform').derivationally_related_forms()

        # Call the lemma function and then derivationally_related_forms() to
        # get all the applicable word forms wordnet can give us
        lemma_output = wn.lemma(lemma_word).derivationally_related_forms()
        # debug
        # print(lemma_output)

        # if we find a noun form ending with ing, ial, ion we want it!
        for x in lemma_output:
            # print(x.name())
            if (re.search(r'ing$|ial$|ion$', x.name())):
                return x.name()

        # if it's not one of the three above, return the first noun form found
        output = lemma_output[0].name()
    except:
        output = ''  # Ideally handle the exception; in this case we return a blank
        # print("Oops!", sys.exc_info()[0], "occurred.")

    return output
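# Example calls (assume NLTK's tagger models and the WordNet corpus are
# downloaded; exact outputs depend on WordNet's lemma ordering):
#   GetVerbPhrase("They perform the task")  # -> a noun form such as 'performance'
#   GetVerbPhrase("He is here")             # -> '' (no convertible verb found)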
# "Colonel Min Byeong-sam was summoned for his act of insubordination"
text = '민병삼 대령의 항명행위로 초치했다'

from konlpy.tag import Okt

twitter = Okt()
words = twitter.pos(text, stem=True)
print(words)

from nltk import RegexpParser

grammar = """
NP: {<N.*>*<Suffix>?}  # define a noun phrase
VP: {<V.*>*}           # define a verb phrase
AP: {<A.*>*}           # define an adjective phrase
"""
parser = RegexpParser(grammar)
print(parser)

chunks = parser.parse(words)
print(chunks)  # inspect the chunk tree
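# To pull just the noun-phrase chunks out of the parse result:
for subtree in chunks.subtrees(filter=lambda t: t.label() == "NP"):
    print(" ".join(word for word, tag in subtree.leaves()))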