def Chunk(self, sentence, node='NP', grammer=r"""
    NP: {<DT|PP\$>?<JJ>*<NN>}
        {<NNP>+}
    """):
    '''
    Chunk POS-tagged sentences and return the matched phrases as strings.

    Each sentence is parsed with an NLTK RegexpParser built from ``grammer``;
    the phrases are whatever ``self.sub_leaves`` yields for the parsed tree
    and the requested node label.

    @param sentence: iterable of sentences, each one in the form accepted by
                     RegexpParser.parse (a sequence of (word, tag) pairs)
    @param node: label of the chunk node to collect (default 'NP')
    @param grammer: chunk grammar handed to RegexpParser
    @return: list of strings, one per phrase, with the words of the phrase
             joined by single spaces
    '''
    parser = RegexpParser(grammer)
    phrases = [
        phrase
        for sent in sentence
        for phrase in self.sub_leaves(parser.parse(sent), node)
    ]
    # Each phrase is a sequence of (word, tag) pairs; only the words are kept.
    return [' '.join(word for (word, tag) in phrase) for phrase in phrases]
def ProcessWoeds(self, arr):
    """Tag the tokens in ``arr`` and chunk them with a fixed grammar.

    Args:
        arr: sequence of word tokens, passed straight to ``pos_tag``.

    Returns:
        The tree produced by ``RegexpParser.parse`` for the grammar
        ``Chunk:{<RB.?>*<VB.?>*<NNP>}`` (optional adverbs and verbs followed
        by a proper noun).
    """
    tagged = pos_tag(arr)
    chunkGram = r"""Chunk:{<RB.?>*<VB.?>*<NNP>}"""
    chunkParser = RegexpParser(chunkGram)
    # The original had a print(chunked) after the return; it could never run.
    return chunkParser.parse(tagged)
def get_chunks(tagged_sentences):
    """Pair the first noun and first adjective of every noun/adjective chunk.

    Args:
        tagged_sentences: iterable of POS-tagged sentences (sequences of
            (word, tag) pairs) as accepted by ``RegexpParser.parse``.

    Returns:
        A list of ``"noun:adjective"`` strings, one per CHUNK1/CHUNK2 subtree
        that contains at least one NN/NNS/NNP leaf and at least one
        JJ/JJR/JJS leaf.
    """
    grammar = r"""
    CHUNK1: {<NN.*><.*>{0,3}<JJ.*>} # Any Noun terminated with Any Adjective
    CHUNK2: {<JJ.*><.*>{0,3}<NN.*>} # Nouns or Adjectives, terminated with Nouns
    """
    noun_tags = ('NN', 'NNS', 'NNP')
    adjective_tags = ('JJ', 'JJR', 'JJS')
    cp = RegexpParser(grammar)

    pairs = []
    for sent in tagged_sentences:
        tree = cp.parse(sent)
        for subtree in tree.subtrees(
                filter=lambda t: t.label() in ('CHUNK1', 'CHUNK2')):
            leaves = subtree.leaves()
            nouns = [word for word, tag in leaves if tag in noun_tags]
            adjectives = [word for word, tag in leaves if tag in adjective_tags]
            # The grammar matches any NN.*/JJ.* tag (e.g. NNPS), which is wider
            # than the tag lists above.  A chunk may therefore yield no noun or
            # no adjective; skip it instead of failing on [0] below.
            if not nouns or not adjectives:
                continue
            pairs.append(nouns[0] + ":" + adjectives[0])
    return pairs
class RegexpChunker(Chunker):
    """
    N-gram tagger / chunker that uses grammars to detect phrases.

    setupData: the grammar string
    """

    def __init__(self, setupData):
        super(RegexpChunker, self).__init__(setupData)
        self.chunker = RegexpParser(setupData)

    def tag(self, data):
        """Chunk ``data`` and return it as CoNLL-style IOB tags.

        ``data`` is first passed through ``self.fixer_function`` when that
        attribute is truthy (it is not set in this class; presumably the
        ``Chunker`` base class provides it -- confirm).

        Returns the result of ``tree2conlltags`` on the parsed tree, or
        ``None`` when parsing or conversion fails (best effort: the failure
        is logged, not raised).
        """
        if self.fixer_function:
            data = self.fixer_function(data)
        try:
            return tree2conlltags(self.chunker.parse(data))
        except Exception:
            # Keep the best-effort contract (callers get None) but leave a
            # trace instead of discarding the error silently.
            import logging
            logging.getLogger(__name__).warning(
                "RegexpChunker.tag failed; returning None", exc_info=True)
            return None
class RegexpChunker(Chunker):
    """
    N-gram tagger / chunker that uses grammars to detect phrases.

    setupData: the grammar string
    """

    def __init__(self, setupData):
        super(RegexpChunker, self).__init__(setupData)
        self.chunker = RegexpParser(setupData)

    def tag(self, data):
        """Chunk ``data``, print the parsed tree and return CoNLL-style IOB tags.

        ``data`` is first passed through ``self.fixer_function`` when that
        attribute is truthy (it is not set in this class; presumably the
        ``Chunker`` base class provides it -- confirm).

        Returns the result of ``tree2conlltags`` on the parsed tree, or
        ``None`` when parsing or conversion fails (best effort: the failure
        is logged, not raised).
        """
        if self.fixer_function:
            data = self.fixer_function(data)
        try:
            parsedTree = self.chunker.parse(data)
            # Single-argument call form prints the same on Python 2 and 3.
            print(parsedTree)
            return tree2conlltags(parsedTree)
        except Exception:
            # Keep the best-effort contract (callers get None) but leave a
            # trace instead of discarding the error silently.
            import logging
            logging.getLogger(__name__).warning(
                "RegexpChunker.tag failed; returning None", exc_info=True)
            return None
def _first_subtree_words(tree, label):
    """Return the words under the first subtree labelled ``label``, or None."""
    match = next(tree.subtrees(lambda t: t.label() == label), None)
    if match is None:
        return None
    return [leaf[0] for leaf in match.leaves()]


def parse_request(message):
    """Extract vendor, model, part name and a profanity flag from a request.

    The message is tokenized, every token is tagged by regular expression
    (VENDOR, MODEL, PREP, PROFANITY, PART_NAME) and the tagged tokens are
    chunked with a small cascaded grammar.

    Args:
        message: the raw request text.

    Returns:
        dict with keys ``'vendor'`` (first word of the first VENDOR chunk or
        None), ``'model'`` and ``'part_name'`` (space-joined words of the
        first MODEL / PART_NAME chunk or None) and ``'profanity'`` (True when
        at least one PROFANITY chunk exists).
    """
    tagPatterns = [
        # NOTE(review): 'skoda' is listed twice; harmless, left as found.
        (r'(honda|toyota|ford|kia|hyundai|audi|bmw|opel|mitsubishi|mazda|skoda|skoda|subaru)$', 'VENDOR'),
        (r'([a-zA-Z0-9]+)$', 'MODEL'),
        (r'(от|для)$', 'PREP'),
        (r'(нах|бля|твою мать)$', 'PROFANITY'),
        (r'([а-яА-Я]+)$', 'PART_NAME'),
    ]
    tagger = nltk.RegexpTagger(tagPatterns)
    taggedRequest = tagger.tag(nltk.word_tokenize(message))

    chunker = RegexpParser(r'''
        S: {<CAR> <PREP>? <PART_NAME>}
        MODEL: {<MODEL>+}
        VENDOR: {<VENDOR>}
        CAR: {<VENDOR> <MODEL>}
        PROFANITY: {<PROFANITY>+}
        PART_NAME: {<PART_NAME>+}
    ''')
    tree = chunker.parse(taggedRequest)

    vendor_words = _first_subtree_words(tree, 'VENDOR')
    model_words = _first_subtree_words(tree, 'MODEL')
    part_words = _first_subtree_words(tree, 'PART_NAME')

    parsed_request = {}
    parsed_request['vendor'] = vendor_words[0] if vendor_words else None
    parsed_request['model'] = (
        ' '.join(model_words) if model_words is not None else None)
    parsed_request['part_name'] = (
        ' '.join(part_words) if part_words is not None else None)
    parsed_request['profanity'] = (
        _first_subtree_words(tree, 'PROFANITY') is not None)
    return parsed_request
class NLTKChunker(PackProcessor):
    r"""Pack processor that chunks sentences with an NLTK ``RegexpParser``.

    The parser is built in :meth:`initialize` from the ``pattern`` config.
    """

    def __init__(self):
        super().__init__()
        self.chunker = None

    # pylint: disable=unused-argument
    def initialize(self, resources: Resources, configs: Config):
        super().initialize(resources, configs)
        self.chunker = RegexpParser(configs.pattern)

    @classmethod
    def default_configs(cls):
        r"""Return the parent's default configs extended with the chunker
        keys ``pattern``, ``token_component`` and ``sentence_component``.
        """
        chunker_defaults = {
            'pattern': 'NP: {<DT>?<JJ>*<NN>}',
            'token_component': None,
            'sentence_component': None
        }
        config = super().default_configs()
        config.update(chunker_defaults)
        return config

    def _process(self, input_pack: DataPack):
        """Create a ``Phrase`` for every chunk found in every sentence."""
        sentences = input_pack.get(
            Sentence, components=self.configs.sentence_component)
        for sentence in sentences:
            token_entries = list(
                input_pack.get(entry_type=Token,
                               range_annotation=sentence,
                               components=self.configs.token_component))
            tagged = [(token.text, token.pos) for token in token_entries]
            parsed = self.chunker.parse(tagged)

            # `cursor` is the index in token_entries of the first token
            # covered by the node currently being visited.
            cursor = 0
            for node in parsed:
                if not hasattr(node, 'label'):
                    # Plain (word, tag) pair, e.g. ('is', 'VBZ').
                    cursor += 1
                    continue
                # Chunk, e.g. Tree('NP', [('This', 'DT'), ('tool', 'NN')]).
                width = len(node)
                begin_pos = token_entries[cursor].span.begin
                end_pos = token_entries[cursor + width - 1].span.end
                phrase = Phrase(input_pack, begin_pos, end_pos)
                phrase.phrase_type = node.label()
                cursor += width
def generate_chunks(tagged_sent, expression=r'CHUNK: {(<adj>* <n.*>+ <prp>)? <adj>* <n.*>+}'):
    """Return the leaves of every CHUNK subtree found in ``tagged_sent``.

    Args:
        tagged_sent: sequence of (word, tag) pairs; may be empty.
        expression: chunk grammar handed to ``RegexpParser``.

    Returns:
        List with one entry per subtree labelled "CHUNK", each entry being
        that subtree's ``leaves()``.  An empty list is returned when a
        ValueError is raised while parsing or walking the tree.
    """
    parser = RegexpParser(expression)
    try:
        parsed = (Tree('S', []) if len(tagged_sent) == 0
                  else parser.parse(tagged_sent, trace=0))
        return [node.leaves()
                for node in parsed.subtrees()
                if node.label() == "CHUNK"]
    except ValueError:
        return []
def _chunker(self, tuple_sent):
    """Chunk base-phrases using chunking rules.

    The grammar is obtained from ``self._ChunkingRule(self._CHUNK_RULE_VXP_)``.

    Args:
        tuple_sent (list(tuple(str, str))): tagged sentence to chunk

    Returns:
        chunk_struct
            Tree('S', [Tree('CHUNK', [(str, str), (str, str)]], (str, str), ...): chunked sentence
    """
    chunker = RegexpParser(self._ChunkingRule(self._CHUNK_RULE_VXP_))
    return chunker.parse(tuple_sent)
def find_keywords(text):
    """
    Extracts keywords from text.

    A keyword is a KEYWORD chunk (runs of proper nouns, runs of nouns, or
    adjectives followed by nouns) whose text is also reported by
    ``find_collocations``.  Keywords that are a prefix of an already counted
    keyword are counted towards that keyword.

    Args:
        text: A text fragment.
    Returns:
        A list containing the extracted keywords, most frequent first.
    """
    grammar = r'''
        KEYWORD: {<NNP><NNP>+}
                 {<NN.*><NN.*>+}
                 {<JJ>+<NN>+}
    '''
    parser = RegexpParser(grammar)

    sentences = []
    words = []
    for sentence in sent_tokenize(text):
        tokens = word_tokenize(sentence)
        if not tokens:
            continue
        sentences.append(tokens)
        words.extend(tokens)

    collocations = find_collocations(words)

    keywords = []
    for sentence in sentences:
        tree = parser.parse(pos_tag(sentence))
        for node in _select_nodes(tree, ['KEYWORD']):
            word = ' '.join(p[0] for p in node)
            if word in collocations:
                keywords.append(word)

    # Longest keywords first, so that a shorter keyword is folded into the
    # longer one it is a prefix of.
    keywords.sort(key=lambda k: len(k.split()), reverse=True)

    instances = {}
    for k in keywords:
        # Literal prefix test.  The original ran re.match(k, existing), which
        # treats the keyword as a regular expression and misbehaves (or
        # raises) on keywords containing characters such as '+', '(' or '.'.
        key = next(
            (existing for existing in instances if existing.startswith(k)), k)
        instances[key] = instances.get(key, 0) + 1

    # sorted() plus a list comprehension work on Python 2 and 3 alike;
    # dict.items() has no .sort() on Python 3 and map() is lazy there.
    ranked = sorted(
        instances.items(), key=lambda item: int(item[1]), reverse=True)
    return [item[0] for item in ranked]
def rule_based_reqs_chunk(tagged_reqs, ids):
    """Collect the NP chunks of every tagged requirement.

    Args:
        tagged_reqs: sequence of POS-tagged requirements, each one parsed
            with a ``RegexpParser`` built from the module-level ``ruleset``.
        ids: not used by this function.

    Returns:
        ``(terms, term_index)``: ``terms`` holds one word list per NP chunk,
        with words tagged 'DT' or 'PRP$' dropped; ``term_index`` holds, at
        the same position, the index in ``tagged_reqs`` the chunk came from.
    """
    chunker = RegexpParser(ruleset)
    dropped_tags = ('DT', 'PRP$')
    terms, term_index = [], []
    for position, tagged_req in enumerate(tagged_reqs):
        for node in chunker.parse(tagged_req):
            # Top-level tuples are unchunked tokens; only NP subtrees count.
            if isinstance(node, tuple) or node.label() != 'NP':
                continue
            terms.append([tagged_word[0]
                          for tagged_word in node
                          if tagged_word[1] not in dropped_tags])
            term_index.append(position)
    return terms, term_index
class NLTKChunker(PackProcessor):
    r"""Pack processor that chunks sentences with an NLTK ``RegexpParser``.

    The parser is built in :meth:`initialize` from the ``pattern`` config.
    """

    def __init__(self):
        super().__init__()
        self.chunker = None
        self.token_component = None

    # pylint: disable=unused-argument
    def initialize(self, resource: Resources, configs: HParams):
        self.chunker = RegexpParser(configs.pattern)

    @staticmethod
    def default_configs():
        r"""Return the default config: a single ``pattern`` entry holding
        the chunk grammar.
        """
        return {
            'pattern': 'NP: {<DT>?<JJ>*<NN>}',
        }

    def _process(self, input_pack: DataPack):
        """Add a ``Phrase`` entry for every chunk found in every sentence."""
        for sentence in input_pack.get(Sentence):
            token_entries = list(
                input_pack.get(entry_type=Token,
                               range_annotation=sentence,
                               component=self.token_component))
            tagged = [(token.text, token.pos) for token in token_entries]
            parsed = self.chunker.parse(tagged)

            # `cursor` is the index in token_entries of the first token
            # covered by the node currently being visited.
            cursor = 0
            for node in parsed:
                if not hasattr(node, 'label'):
                    # Plain (word, tag) pair, e.g. ('is', 'VBZ').
                    cursor += 1
                    continue
                # Chunk, e.g. Tree('NP', [('This', 'DT'), ('tool', 'NN')]).
                width = len(node)
                begin_pos = token_entries[cursor].span.begin
                end_pos = token_entries[cursor + width - 1].span.end
                phrase = Phrase(input_pack, begin_pos, end_pos)
                phrase.set_fields(phrase_type=node.label())
                input_pack.add_or_get_entry(phrase)
                cursor += width
def parse(self):
    """
    Parse the tokenized text with our custom grammar to collect the word
    groups that contain an NE.

    The grammar is Parser.GRAMMAR_OWN_TAG when self.own_tag is truthy and
    Parser.GRAMMAR otherwise.  Every subtree whose label is not "S" is
    appended to self.tagged_nodes as [label, leaves]; the accumulated list
    is then printed.
    """
    grammar = Parser.GRAMMAR_OWN_TAG if self.own_tag else Parser.GRAMMAR
    tree = RegexpParser(grammar).parse(self.tokens)
    for subtree in tree.subtrees():
        if subtree.label() != "S":
            self.tagged_nodes.append([subtree.label(), subtree.leaves()])
    print(self.tagged_nodes)
def additionalExtractions(dep_triples, tagged_sentence, svo_triples):
    """Build extra triples from small noun phrases touched by SVO triples.

    Args:
        dep_triples: not used by this function.
        tagged_sentence: POS-tagged sentence to chunk.
        svo_triples: sequence of triples; elements 0 and 2 of each triple
            are looked up in every SmallNP chunk with ``tag_index``.

    Returns:
        None when ``svo_triples`` is empty/falsy; otherwise a list extended
        with ``chunk_triples(leaves, location)`` for every lookup whose
        location is not -1.
    """
    if not svo_triples:
        return None

    parsed = RegexpParser("SmallNP: {(<CD.*>|<JJ.*>)<NN.*>+}").parse(tagged_sentence)
    triple_array = []
    for subtree in parsed.subtrees():
        if subtree.label() != 'SmallNP':
            continue
        for triple in svo_triples:
            pos = subtree.leaves()
            # Element 0, then element 2 of the triple.
            for slot in (0, 2):
                location = tag_index(pos, triple[slot])
                if location != -1:
                    triple_array.extend(chunk_triples(pos, location))
    return triple_array
def preprocessing(self, desc):
    """Strip punctuation and adverbs from a description string.

    Steps, in order: commas become spaces; the characters
    ``!@#%():{}`[]'*&^`` are removed; "I/O" becomes "IO" and any remaining
    "/" becomes " and "; the text is tokenized, POS-tagged and every token
    chunked as RB (tags RB, RBS, RBR) is removed from the text.  The
    intermediate text and the chunk tree are printed.

    Args:
        desc: the description text.

    Returns:
        The cleaned description string.
    """
    desc = desc.replace(",", " ")
    for symbol in "!@#%():{}`[]'*&^":
        desc = desc.replace(symbol, "")
    print(desc)

    # "I/O" must be rewritten before the generic "/" -> " and " replacement.
    desc = desc.replace("I/O", "IO")
    desc = desc.replace("/", " and ")

    tokenized = nltk.word_tokenize(desc)
    posTag = nltk.pos_tag(tokenized)
    grammar = ''' RB: {<RB> | <RBS> | <RBR>}'''
    chunker = RegexpParser(grammar)
    chunked = chunker.parse(posTag)
    print(chunked)

    for n, piece in enumerate(chunked):
        # Only chunk subtrees render as "(RB ...": plain (word, tag) tuples
        # render as "('word', 'tag')".
        if not str(piece).startswith('(RB'):
            continue
        # First leaf of the chunk is a (word, tag) pair.
        removalWord = piece[0][0]
        if n == 0:
            # Leading adverb: drop it together with the space after it.
            desc = desc.replace(removalWord + " ", "")
        else:
            # The original tested `n<=len` against the builtin `len`, which
            # is always true on Python 2 and a TypeError on Python 3.
            desc = desc.replace(" " + removalWord, "")
    return desc
def exctract_ngrams(self, tagged_sent):
    '''
    Extract the ngrams of a previously tagged sentence using the chunk
    rules in CHUNK_RULE.

    Every item yielded by self.__leaves for the parsed tree, other than one
    equal to the whole sentence, is lower-cased, stemmed word by word and
    joined with spaces; it is kept when self.__evaluate_polarity_ngram
    accepts it.

    Keyword arguments:
    @param tagged_sent  the POS tagged sentence whose ngrams need to be
                        extracted
    @return             list of accepted ngram strings
    '''
    tree = RegexpParser(CHUNK_RULE).parse(tagged_sent)
    ngrams = []
    for item in self.__leaves(tree):
        if item == tagged_sent:
            continue
        stems = [self.__stemmer.stem(word.lower()) for (word, pos) in item]
        probable_ngram = ' '.join(stems)
        if self.__evaluate_polarity_ngram(probable_ngram):
            ngrams.append(probable_ngram)
    return ngrams
def extractPossibleTerms(root, fileids):
    """Collect candidate terms (adjective/noun runs) from a plaintext corpus.

    Args:
        root: corpus root directory passed to ``PlaintextCorpusReader``.
        fileids: file ids (or pattern) passed to ``PlaintextCorpusReader``.

    Returns:
        A set of strings: the space-joined words of every chunk found by
        the grammar ``NP: {<JJ>*<NNP>*<NN>*}``.

    The sentence count is printed first, then a progress counter every 100
    sentences.
    """
    def node_label(tree):
        # NLTK 3 replaced the `node` attribute with label(); accessing
        # `node` there raises.  Fall back to `node` for NLTK 2.
        return tree.label() if hasattr(tree, 'label') else tree.node

    # get corpus
    reader = PlaintextCorpusReader(root, fileids)
    # get chunker
    grammar = 'NP: {<JJ>*<NNP>*<NN>*}'
    chunker = RegexpParser(grammar)
    # get terms
    terms = set()
    print(len(reader.sents()))
    for i, sent in enumerate(reader.sents(), 1):
        if i % 100 == 0:
            print(i)
        tree = chunker.parse(pos_tag(sent))
        # exclude the sentence (root) node
        for t in tree.subtrees(lambda st: node_label(st) != 'S'):
            terms.add(' '.join([el[0] for el in t]))
    return terms
def chunking_noun(document):
    """Extract two- and three-word noun phrases from ``document``.

    Returns:
        ``(phrases, tagged, counts)`` where ``phrases`` is the list of
        PHRASE chunks (as space-joined strings) made of 2 or 3 words,
        ``tagged`` is the POS-tagged token list and ``counts`` is a dict
        mapping each POS tag to its number of occurrences.
    """
    tagged = nltk.pos_tag(word_tokenize(document))
    counts = dict(Counter(tag for WORD, tag in tagged))

    chunkGram = r""" PHRASE: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}"""
    chunked = RegexpParser(chunkGram).parse(tagged)

    phrases = [' '.join([x for x, y in tree.leaves()])
               for tree in chunked.subtrees()
               if tree.label() == 'PHRASE']
    # Keep phrases of more than one and at most three words.
    serch_keywords = [w for w in phrases if 1 < len(w.split(' ')) <= 3]
    return serch_keywords, tagged, counts
def get_noun_phrases(text_list, tagger):
    """Collect the noun phrases of every text in ``text_list``.

    Args:
        text_list: iterable of strings; each one is split on whitespace and
            tagged with ``tagger.tag``.
        tagger: object exposing ``tag(list_of_words)``.

    Returns:
        List of noun phrases, each one a list of words, in order of
        appearance.  A text whose parse raises ValueError contributes no
        phrases; phrases found in the other texts are kept.
    """
    tagged_texts = [tagger.tag(text.split()) for text in text_list]
    expression = r'NOUN_PHRASE: {(<adj>* <n.*>+ <prp>)? <adj>* <n.*>+}'
    chunkParser = RegexpParser(expression)

    noun_phrases = []
    for tagged_sent in tagged_texts:
        if len(tagged_sent) == 0:
            # Nothing to parse, hence no phrases.
            continue
        try:
            tree = chunkParser.parse(tagged_sent, trace=0)
            found = [[el[0] for el in subtree.leaves()]
                     for subtree in tree.subtrees()
                     if subtree.label() == "NOUN_PHRASE"]
        except ValueError:
            # The original reset the whole result list here, discarding the
            # phrases of every earlier text; only skip the failing one.
            continue
        noun_phrases.extend(found)
    return noun_phrases
def extract_candidate_phrases(document_obj, parts_of_speech_re=DEFAULT_RE):
    '''
    Extract the candidate phrases of every sentence of a document.

    :param document_obj: document from which you want to extract parts of the speech (candidate phrases)
    :param parts_of_speech_re: regular expression with parts of speech structure
    :return: dict, keys are the sentence id and values list of candidate phrases for that sentence
    '''
    # The grammar is the same for every sentence, so build the parser once
    # instead of once per sentence.
    regex_parser = RegexpParser(parts_of_speech_re)

    candidate_phrases = {}
    for sentence in document_obj.get_sentences():
        sentence_id = sentence.get_sentence_id()
        # list of (token, pos) tuples for this sentence
        token_pos_list = [(token_obj.get_token_str(), token_obj.get_token_pos())
                          for token_obj in sentence.get_tokens()]
        sentence_regex_tree = regex_parser.parse(token_pos_list)
        # subtrees whose label equals STAGE_MARKER are the candidates
        match_subtrees = sentence_regex_tree.subtrees(
            filter=lambda t: t.label() == STAGE_MARKER)
        candidate_phrases[sentence_id] = [
            ' '.join(leave_token_pos[0] for leave_token_pos in subtree.leaves())
            for subtree in match_subtrees
        ]
    return candidate_phrases
class TreeChunker(ContextChunker):
    """Chunker that labels every token with the chunk it falls into.

    The chunks come from an NLTK ``RegexpParser`` whose root label is the
    empty string, so tokens outside any chunk get the tag ``''``.
    """

    def __init__(self, patterns: str, loop: int = 1, trace: int = 0,
                 attribute: str = 'pos', apply_iob2: bool = True) -> None:
        """
        :param patterns: chunk grammar for ``RegexpParser``
        :param loop: forwarded to ``RegexpParser``
        :param trace: forwarded to ``RegexpParser``
        :param attribute: context attribute whose values are used as tags
        :param apply_iob2: prefix chunk tags with ``B-`` / ``I-`` when True
        """
        self.__attribute = attribute
        self.__regex_parser = RegexpParser(patterns, root_label='',
                                           loop=loop, trace=trace)
        self.__apply_iob2 = apply_iob2

    def tag(self, context: Context) -> List[str]:
        """Return one chunk tag per token of ``context``."""
        # Empty attribute values are replaced by the placeholder 'NULL'.
        attribute_values = [
            'NULL' if value == '' else value
            for value in context.get(self.__attribute)
        ]
        pairs = list(zip(context.get('tokens'), attribute_values))
        parsed = self.__regex_parser.parse(pairs)
        return self._traverse_tree(parsed)

    def _traverse_tree(self, tree, is_subtree: bool = False):
        """Flatten ``tree`` into a list with one tag per leaf.

        Leaves directly under ``tree`` get ``tree.label()``; when
        ``is_subtree`` is True and IOB2 is enabled the label is prefixed
        with ``B-`` for the first child and ``I-`` for the others.
        """
        labels = []
        for position, child in enumerate(tree):
            if isinstance(child, nltk.tree.Tree):
                labels.extend(self._traverse_tree(child, True))
                continue
            label = tree.label()
            if is_subtree:
                if not self.__apply_iob2:
                    prefix = ''
                elif position == 0:
                    prefix = 'B-'
                else:
                    prefix = 'I-'
                label = f'{prefix}{label}'
            labels.append(label)
        return labels
class PostPatternStrategy(Strategy):
    """
    There are cases where the phrases we want to detect depend on the words
    themselves rather than on their grammatical class.  We can also be more
    precise if we can consider several levels of the tree at once, for
    example phrases and words together inside one rule as a single token.
    This strategy looks at the tree in both height and width, so the
    grammars we write can be more precise and flexible.
    """

    def __init__(self, grammar="", loop=2):
        super(PostPatternStrategy, self).__init__()
        # `loop` must be passed by keyword: the second positional parameter
        # of RegexpParser is the root label, so passing it positionally set
        # the root label to `loop` and left the parser's loop count at its
        # default.
        self.postChunker = RegexpParser(grammar, loop=loop)
        self.grammar = grammar
        self.loop = loop

    def fix(self, feature):
        """Re-chunk ``feature`` with the post-pattern grammar.

        ``feature`` is a sequence of (word, pos, iob) triples.  The sentence
        is re-parsed using ``word_iob`` strings as tags, flattened back to
        CoNLL tags and cleaned with ``cleanIobs``.

        Returns the cleaned sentence, or ``feature`` unchanged when any step
        fails (best effort: the failure is logged, not raised).
        """
        cleanSentence = feature
        try:
            # '_' separates the levels packed into a single token.
            grammar_pattern_to_clean = r'_.*'
            clean_pattern = ''
            modified_chunk_pattern = r'.*_'
            words, post, iobs = zip(*feature)
            # The sentences to parse no longer use the POS tag, but the
            # word joined with its IOB tag.
            wiobs = tuple(w + "_" + iob for w, iob in zip(words, iobs))
            # list(): zip() is lazy on Python 3.
            sentence = list(zip(words, wiobs))
            tree = self.postChunker.parse(sentence)
            # Back from a tree to a list of tuples.
            loc_tags = tree2conlltags(flatten_deeptree(tree))
            cleanSentence = cleanIobs(words, post, loc_tags,
                                      grammar_pattern_to_clean,
                                      modified_chunk_pattern, clean_pattern)
        except Exception:
            # Keep the best-effort contract but leave a trace instead of
            # discarding the error silently.
            import logging
            logging.getLogger(__name__).warning(
                "PostPatternStrategy.fix failed; returning input unchanged",
                exc_info=True)
        return cleanSentence
def get_search_tags(a, verbose=False):
    """Chunk ``a`` into search tags and return ``extract_tags`` of the result.

    A search tag (STAG) is one adverb/verb/preposition/conjunction, followed
    by one adjective or determiner, followed by one or more nouns.

    Args:
        a: the text to analyse.
        verbose: when True, print a banner, the POS tags and the chunk tree.
    """
    if verbose:
        rule = '-' * 100
        print()
        print(rule)
        print("\tRunning `get_search_tags`...")
        print(rule)

    search_tag_parser = RegexpParser("STAG: {\
        (<RB>|<RBR>|<RBS>|<VB>|<VB[A-Z]>|<IN>|<CC>)\
        (<JJ>|<JJR>|<JJS>|<DT>)\
        (<NN>|<NNS>|<NNP>|<NNPS>)+\
    }")

    tokens = word_tokenize(a)
    pos_tags = pos_tag(tokens)
    if verbose:
        print("Part of Speech Tags:", pos_tags, '\n')

    data = search_tag_parser.parse(pos_tags)
    if verbose:
        print("Matched Search Tags:", data)

    return extract_tags(data)
def extract_candidate_keywords(document):
    """Return the candidate keyword phrases found in ``document``.

    The document is tokenized, POS-tagged and chunked with a PHRASE grammar
    (optional "adjectives + nouns + preposition" prefix followed by
    adjectives and nouns).  A phrase is kept when it is longer than three
    characters and has fewer than six space-separated words.
    """
    tagged = nltk.pos_tag(word_tokenize(document))

    chunkGram = r""" PHRASE: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+} """
    chunked = RegexpParser(chunkGram).parse(tagged)

    phrases = [' '.join([x for x, y in tree.leaves()])
               for tree in chunked.subtrees()
               if tree.label() == 'PHRASE']
    return [phrase for phrase in phrases
            if len(phrase) > 3 and len(phrase.split(' ')) < 6]
class PostPatternStrategy(Strategy):
    """
    There are cases where the phrases we want to detect depend on the words
    themselves rather than on their grammatical class.  We can also be more
    precise if we can consider several levels of the tree at once, for
    example phrases and words together inside one rule as a single token.
    This strategy looks at the tree in both height and width, so the
    grammars we write can be more precise and flexible.
    """

    def __init__(self, grammar="", loop=2):
        super(PostPatternStrategy, self).__init__()
        # `loop` must be passed by keyword: the second positional parameter
        # of RegexpParser is the root label, so passing it positionally set
        # the root label to `loop` and left the parser's loop count at its
        # default.
        self.postChunker = RegexpParser(grammar, loop=loop)
        self.grammar = grammar
        self.loop = loop

    def fix(self, feature):
        """Re-chunk ``feature`` with the post-pattern grammar.

        ``feature`` is a sequence of (word, pos, iob) triples.  The sentence
        is re-parsed using ``word_iob`` strings as tags, flattened back to
        CoNLL tags and cleaned with ``cleanIobs``.

        Returns the cleaned sentence, or ``feature`` unchanged when any step
        fails (best effort: the failure is logged, not raised).
        """
        cleanSentence = feature
        try:
            # '_' separates the levels packed into a single token.
            grammar_pattern_to_clean = r'_.*'
            clean_pattern = ''
            modified_chunk_pattern = r'.*_'
            words, post, iobs = zip(*feature)
            # The sentences to parse no longer use the POS tag, but the
            # word joined with its IOB tag.
            wiobs = tuple(w + "_" + iob for w, iob in zip(words, iobs))
            # list(): zip() is lazy on Python 3.
            sentence = list(zip(words, wiobs))
            tree = self.postChunker.parse(sentence)
            # Back from a tree to a list of tuples.
            loc_tags = tree2conlltags(flatten_deeptree(tree))
            cleanSentence = cleanIobs(words, post, loc_tags,
                                      grammar_pattern_to_clean,
                                      modified_chunk_pattern, clean_pattern)
        except Exception:
            # Keep the best-effort contract but leave a trace instead of
            # discarding the error silently.
            import logging
            logging.getLogger(__name__).warning(
                "PostPatternStrategy.fix failed; returning input unchanged",
                exc_info=True)
        return cleanSentence
test_data = data[4000:] print train_data[7] simple_sentence = 'the quick fox jumped over the lazy dog' from nltk.chunk import RegexpParser from pattern.en import tag tagged_simple_sent = tag(simple_sentence) print tagged_simple_sent chunk_grammar = """ NP: {<DT>?<JJ>*<NN.*>} """ rc = RegexpParser(chunk_grammar) c = rc.parse(tagged_simple_sent) print c chink_grammar = """ NP: {<.*>+} # chunk everything as NP }<VBD|IN>+{ """ rc = RegexpParser(chink_grammar) c = rc.parse(tagged_simple_sent) print c tagged_sentence = tag(sentence) print tagged_sentence grammar = """ NP: {<DT>?<JJ>?<NN.*>}
def GetPatternsTree(tagsList, pattern, patternName):
    """Chunk ``tagsList`` with ``pattern`` and extract the named phrases.

    Args:
        tagsList: tagged tokens handed to ``RegexpParser.parse``.
        pattern: chunk grammar for ``RegexpParser``.
        patternName: name forwarded to ``ExtractPhrases`` together with the
            parsed tree.

    Returns:
        Whatever ``ExtractPhrases(tree, patternName)`` returns.
    """
    tree = RegexpParser(pattern).parse(tagsList)
    return ExtractPhrases(tree, patternName)
def chunk(self, posTaggedQuote):
    '''Condense a POS-tagged quote in three chunking passes.

    Pass 1 chunks emotion phrases (EMP) in ``posTaggedQuote``; passes 2 and
    3 chunk preposition phrases (PRPH) in the result of the previous pass.
    After each pass every chunk is replaced by a placeholder pair
    ``('', label)``, plain (word, tag) pairs are kept, the outcome is stored
    in ``self.quoteItemCondensedList`` and ``self.printCondensed()`` is
    called.

    NOTE(review): the EMP pass appends to whatever
    ``self.quoteItemCondensedList`` already holds.  The original also
    assigned an unused local of the same name here, which suggests a reset
    was intended -- confirm.
    '''
    EMPChunker = RegexpParser(r"""
    EMP: #Emotion Phrase
    {<MD>(<VBP>|<VB>|<VBZ>|<VBD>)<JJ><,><CC>} #Modular, verb, anything, adjective, comma, conjunction
    {<MD>(<VBP>|<VB>|<VBZ>|<VBD>)<JJ><CC>} #Modular, verb, anything, adjective, conjunction
    {(<VBP>|<VB>|<VBZ>|<VBD>)<JJ><,><CC>} #Verb, anything, adjective, comma, conjunction
    {(<VBP>|<VB>|<VBZ>|<VBD>)<JJ><CC>} #Verb, anything, adjective, conjunction
    {(<VBP>|<VB>|<VBZ>|<VBD>)<JJ><CC>} #Verb, anything, adjective, conjunction
    {(<VBP>|<VB>|<VBZ>|<VBD>)<RB><JJ>} #Verb, adverb, adjective
    {<MD><RB>(<VBP>|<VB>|<VBZ>|<VBD>)<JJ>} #Modular, adverb, verb, adjective
    {<RB>(<VBP>|<VB>|<VBZ>|<VBD>)<JJ>} #Adverb, verb, adjective
    {<MD>(<VBP>|<VB>|<VBZ>|<VBD>)<JJ>} #Modular, verb, anything, adjective
    {(<VBP>|<VB>|<VBZ>|<VBD>)<TO>(<VBP>|<VB>|<VBZ>|<VBD>)<JJ>} #Verb, "to", verb anything, adjective
    {(<VBP>|<VB>|<VBZ>|<VBD>)<JJ>} #Verb, anything, adjective
    """)
    PRPHChunker = RegexpParser(r"""
    PRPH: #Preposition Phrase
    {<.*>*<PRP><.*>*<EMP>} #Anything, proposition, anything
    {<EMP><.*>*<PRP><.*>*} #Anything, proposition, anything
    }<EMP>{ #Chink at the EMP chunk, recursion!
    """)

    def condense(chunked, label):
        # Replace every chunk by a placeholder pair carrying its label and
        # keep ordinary (word, tag) pairs as they are.
        for piece in chunked:
            if isinstance(piece, tuple):
                self.quoteItemCondensedList.append(piece)
            else:
                self.quoteItemCondensedList.append(('', label))
        self.printCondensed()

    # Pass 1: the EMP chunk becomes the single token "EMP".
    condense(EMPChunker.parse(posTaggedQuote), 'EMP')

    # Passes 2 and 3 simulate the recursion: chunk PRPH over the condensed
    # output of the previous pass.
    for _ in range(2):
        previous = self.quoteItemCondensedList
        self.quoteItemCondensedList = []  # Clear the list to condense more
        condense(PRPHChunker.parse(previous), 'PRPH')
class Chunker:
    """Cascaded regular-expression chunker for date, phrase and clause labels.

    The grammar below is a sequence of stages applied in order by a single
    ``RegexpParser`` (``loop=1``).  Later stages refer to labels produced by
    earlier ones, so the order of the rules matters.
    """

    def __init__(self):
        # One "LABEL: rule" stage per line.  Text outside the braces is
        # context that must match but is not included in the chunk.
        grammar = r'''
            R-DATE: {<IN><CD><TO><CD>}
            R-DATE: {<IN><CD><IN><CD>}
            R-DATE: {<JJ><CD><CC><CD>}
            FULL-DATE: {<IN><CD><NNP><CD>}
            FULL-DATE: <VB.*>{<CD><NNP><CD>}
            MONTH-DATE: {(<IN|DT>)?<NNP><CD>}
            NP: {<JJR><IN><CD><NNS>}
            NP: {<IN><CD><NNS>}
            NP: {<CD><IN><DT><CD><NNS>(<JJ>)?}
            DM_DATE: {<IN><CD><NNP>}(<,>|<NN.*>)
            DATE: {<IN>(<DT>)?<CD>}
            DT-DATE: {<DT><CD>}
            POS-DATE: <POS>{<CD>}
            V-DATE: {<IN|CD><JJ><CD>}
            DATE: (<,>)?{<CD>}<,>
            N-DATE: (<,>)?{((<.*DATE><,>)+)?<CD><CC><CD>}
            NN-LST: {<NN.*>(<,><NN.*>)+(<,>)?<CC><NN.*>}
            NP: {(<RP|IN|NN.*|.*DT|RB|JJ.*|RB.*|POS|``|"|''|FW|POS-DATE|CD|TO|WRB>)*<NN.*>(<TO>(<DT>)?<NN.*>)?(<RB>)?(<IN>)?(<JJ|RB|CD|DT|POS>)*}
            NP: {<P-DATE><NP>}
            NP: {<NP><NP>}
            NP: {<NP><,><NP><,>}
            CC-NP: {<NP>(<CC><NP>)+}
            PP: {((<PDT>)?<DT>)?(<RB|IN|WRB|WDT|TO|JJ|PRP>)*<PRP.*>(<MD>)?}
            PP: {<WP|WRB>}
            PP: {<IN><WDT>(<DT|RBR>)*}
            PP: <,>{<DT><JJ>}
            NP: {<NP><PP><NP>}
            P-NP: {<PP><NP>(<,><NP><,>)?}
            C-PP: {(<CD><PP>|<PP><CD>)}
            CC-P-NP: {<P-NP|PP><CC><NP>}
            NP: {<NP><,>((<,|CC>)*<.*NP>)*<,>}
            VP: {<VB.*><IN><TO><DT><VB.*>}
            VP: {<VB.*><RP>}
            VP: {(<IN|TO|VB.*|.*DT|RB|JJ|EX|MD>)*<VB.*>(<JJ>)?(<RB>(<TO|JJ|>)?)?}
            VP: {<IN><DT><VB.*>(<RB><TO>)?}
            VP: {<RB|VB.*|MD|TO>*<VB.*><RB|VB.*|MD|TO>*}
            VP: {<VP><IN>}
            VP: {<IN><VP>(<RP>)?<TO>}
            VP: {((<DT>)?<IN>)?<WDT><VP>}
            VP: {<IN><DT-DATE><VP>}
            Y-DATE: <JJ>{<CD>}
            VP: {<JJ>}<Y-DATE>
            CC-VP: {<VP><NP><CC><VP><NP>}
            CC-NP: <VP>{<NP>(<,><NP>)*<CC><NP>}
            D-NP : <VP>{<.*DATE><.*NP>}
            CLAUSE-P: <,|CC>{<VP><P-NP>}(<,>|<CC>|<.*DATE>)
            CLAUSE-NS: <,>(<CC>)?{(<VP><.*NP>)+}<,>
            CLAUSE-NS: <CC>{(<VP><.*NP>)+}
            CLAUSE: {<NP>(<VP><.*NP>|<CC-VP>)+(.*P-NP)?}
            CLAUSE-P: {<PP|P-NP>(<VP><.*NP>|<CC-VP>)+}
            CLAUSE-P: <,>{<PP|P-NP><VP>}<,>
            CLAUSE-P: <,>{<PP|P-NP><VP><CLAUSE>}
            CLAUSE: <CC>{<NP><VP><CLAUSE-P>}
            CLAUSE-NS: <,>{<VP><.*NP>}
            CLAUSE-OSL: <CLAUSE-P><CC><,>{<NP>}<,>
            CLAUSE-OSR: <,>{<NP>}<CLAUSE-P>
            CLAUSE: {<NP><CLAUSE-P>}
            D-CLAUSE-P: {<CLAUSE-P><.*DATE>}
            D-CLAUSE-P: <,>{<DATE><CLAUSE-P>}<,>
            D-CLAUSE-P: <,>{<CLAUSE-P><,><VP><.*DATE>}
            D-CLAUSE: {<CLAUSE><.*DATE>}
            D-CLAUSE: {<.*DATE><,><CLAUSE>}<,>
            CLAUSE-NS: {<VP><.*NP>}
            D-CLAUSE-NS: {<CLAUSE-NS><.*DATE>}
            D-CLAUSE-NS: {<VP><NP><.*DATE>}<,>
            D-CLAUSE-NS: <CC>{<.*DATE>(<,>)?<CLAUSE-NS>}
            D-CLAUSE-P: {<P-NP><VP><.*DATE>}
            D-CLAUSE-M-P: {<.*DATE><,><CLAUSE-P>((<,|CC>)+<CLAUSE-P>)+}
            D-CLAUSE-M: {<.*DATE><,><CLAUSE-P>(<,>(<CC>)?<CLAUSE-NS>)+}
            D-CC-CLAUSE: {<.*DATE><CLAUSE><,><CC><CLAUSE>}
            D-CLAUSE: {<.*NP><.*VP><.*DATE>}
            D-CLAUSE: <,>{<.*DATE><.*CLAUSE.*>}
            D-CLAUSE-P: {<CLAUSE-P>(<,>)?(<.*NP>)?<.*DATE>}
            D-CLAUSE-P-L: <D-CLAUSE-P>(<,|CC>)+{<NP>(<,><NP>)*<.*DATE>}
            D-CLAUSE-P: {<.*DATE><,><CLAUSE-P>}
            D-CLAUSE-NS: <.*CLAUSE.*>(<,|CC>)*{<.*DATE>(<,>)?<CLAUSE-NS>}
            DD-CLAUSE: {<D-CLAUSE.*>(<,|CC>)+(<RB>)?<.*DATE>}
            D-CLAUSE-P: {<.*DATE><CLAUSE-P>}(<,>)?
            D-CLAUSE-P: (<,>)?{<CLAUSE-P><CC><D-CLAUSE-NS>}
            '''
        self.chunker = RegexpParser(grammar, loop=1)
        # Tokens to drop before tagging: all of string.punctuation except
        # ';', ':', '&' and ',', plus the two quote tokens added below.
        self.exclude = {s for s in string.punctuation if s not in [';', ':', '&', ',', ]}
        self.exclude.add('``')
        self.exclude.add("''")

    def prepare_sentence(self, s: list) -> list:
        """Drop excluded tokens, POS-tag the rest and return 3-tuples.

        ``s`` is a list of pairs whose first element is the word.  The
        second element is carried through unchanged as the third element of
        each result tuple (presumably a named-entity tag; the code below
        compares it with 'DATE' -- confirm against the caller).

        Returns a list of ``(word, pos_tag, second_element)`` tuples.
        """
        s = [n for n in s if n[0] not in self.exclude]
        txt = [w[0] for w in s]
        pos = nltk.pos_tag(txt)
        return [(w, ps, net) for (w, ps), (_, net) in zip(pos, s)]

    @staticmethod
    def tree_label_fix(tree: nltk.tree.Tree) -> nltk.tree.Tree:
        """Make the ``D-`` prefix of top-level clause labels match the leaves.

        Only direct children of ``tree`` whose label contains "CLAUSE" are
        touched.  A clause without a ``D-`` prefix gains one when any of its
        leaves has 'DATE' as third element; a clause with the prefix loses
        it when none of its leaves does.  The tree is modified in place and
        returned.

        NOTE(review): the source was flattened; the ``else`` is taken to
        pair with the "not already D-" test -- confirm against the original
        file.
        """
        for st in tree:
            if isinstance(st, nltk.tree.Tree):
                if bool(re.match(r'.*CLAUSE.*', st.label())):
                    if not bool(re.match('.*D-.*CLAUSE.*', st.label())):
                        leafs = st.leaves()
                        if any([n for n in leafs if n[2] == 'DATE']):
                            # The clause contains a date leaf: add the prefix.
                            new_lbl = 'D-' + st.label()
                            st.set_label(new_lbl)
                            st.label()
                    else:
                        leafs = st.leaves()
                        if not any([n for n in leafs if n[2] == 'DATE']):
                            # No date leaf: remove every 'D-' from the label.
                            oldlbl = st.label()
                            new_lbl = re.sub(r'D-', '', oldlbl)
                            st.set_label(new_lbl)
        return tree

    def generate_tree(self, s: list) -> nltk.tree.Tree:
        """Chunk the prepared sentence ``s`` and fix its clause labels."""
        # noinspection PyTypeChecker
        t1 = self.chunker.parse(s)
        return self.tree_label_fix(t1)
from nltk import Tree, RegexpChunkParser
from nltk.chunk import RegexpParser
from nltk.chunk.regexp import ChunkString, ChunkRule, ChinkRule

# Sample sentence as (word, POS tag) pairs.
s = [('the', 'DT'), ('book', 'NN'), ('has', 'VBZ'), ('many', 'JJ'), ('chapters', 'NNS')]

# Forward: let RegexpParser build the rules from a grammar string.  The NP
# stage first chunks "<DT><NN.*> ... <NN.*>" and then chinks the verbs out.
chunker = RegexpParser(r'''
NP:
{<DT><NN.*><.*>*<NN.*>}
}<VB.*>{''')
print(chunker.parse(s))

# Backward: perform the same two steps by hand on a ChunkString and print
# the intermediate state after each rule.
t = Tree('S', s)
cs = ChunkString(t)
print(cs)
ur = ChunkRule('<DT><NN.*><.*>*<NN.*>', 'chunk determiners and nouns')
ur.apply(cs)
print(cs)
ir = ChinkRule('<VB.*>', 'chink verbs')
ir.apply(cs)
print(cs)
print(cs.to_chunkstruct())
# cs.to_chunkstruct().draw()

# The same two rule objects can also be wrapped in a RegexpChunkParser.
chunker = RegexpChunkParser([ur, ir])
# Regex-based shallow parser. # The Tree structures used to represent parsed sentences in NLTK get converted to ChunkString objects here. # Create an object RegexpParser using chunking and chunking rules (classes ChunkRule and ChinkRule) smple_sntnc = 'The brown fox is quick and he is jumpling over the lazy dog' # Create POS tagged tokens from sample sentence tagged_sentence = tag(smple_sntnc) print(tagged_sentence) # Create the shallow parser grammar = """ NP: {<DT>?<JJ>?<NN.*>} ADJP: {<JJ>} ADVP: {<RB.*>} PP: {<IN>} VP: {<MD>?<VB.*>+} """ rc = RegexpParser(grammar) # Shallow parse the sample sentence c = rc.parse(tagged_sentence) print(c) # Evaluate parser performance on test data print(rc.evaluate(test_data))
from nltk import Tree, RegexpChunkParser
from nltk.chunk import RegexpParser
from nltk.chunk.regexp import ChunkString, ChunkRule, ChinkRule

# Sample sentence as (word, POS tag) pairs.
s = [('the', 'DT'), ('book', 'NN'), ('has', 'VBZ'), ('many', 'JJ'), ('chapters', 'NNS')]

# Forward: let RegexpParser build the rules from a grammar string.  The NP
# stage first chunks "<DT><NN.*> ... <NN.*>" and then chinks the verbs out.
chunker = RegexpParser(r'''
NP:
{<DT><NN.*><.*>*<NN.*>}
}<VB.*>{''' )
print(chunker.parse(s))

# Backward: perform the same two steps by hand on a ChunkString and print
# the intermediate state after each rule.
t = Tree('S', s)
cs = ChunkString(t)
print(cs)
ur = ChunkRule('<DT><NN.*><.*>*<NN.*>', 'chunk determiners and nouns')
ur.apply(cs)
print(cs)
ir = ChinkRule('<VB.*>', 'chink verbs')
ir.apply(cs)
print(cs)
print(cs.to_chunkstruct())
# cs.to_chunkstruct().draw()

# The same two rule objects can also be wrapped in a RegexpChunkParser.
chunker = RegexpChunkParser([ur, ir])
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
from nltk.chunk import RegexpParser

# Fetch the 'averaged_perceptron_tagger' resource through nltk.download.
nltk.download('averaged_perceptron_tagger')

phrase = "I love Ice Cream. I also like steak"

# Split into sentences, then into words, then POS-tag each sentence.
tokenized_sentences = sent_tokenize(phrase)
tokenized_phrases = [
    word_tokenize(sentence) for sentence in tokenized_sentences
]
tagged_words = [pos_tag(phrase) for phrase in tokenized_phrases]
print(tagged_words)

# Every single pronoun, noun or proper noun becomes its own NP chunk.
grammar = r"""
NP: {<PRP|NN|NNP>}
"""
parser = RegexpParser(grammar)
results = [parser.parse(sentence) for sentence in tagged_words]
print(results)

# Draw the tree of the first sentence (presumably opens a GUI window).
results[0].draw()
# Collect the lemma names of every synset of 'girl', printing each synset
# and lemma along the way.
synonyms = []
for syn in wn.synsets('girl'):
    print(syn)
    for lemma in syn.lemmas():
        # A lemma is the dictionary (base) form of a word, as opposed to
        # its various inflected forms.
        print(lemma)
        synonyms.append(lemma.name())
synonyms

# Collect the first antonym of every lemma that has one.
antonyms = []
for syn in wn.synsets("girl"):
    for l in syn.lemmas():
        opposites = l.antonyms()
        if opposites:
            antonyms.append(opposites[0].name())
antonyms

###chunking####
from nltk import pos_tag
# `tokens` is defined outside this fragment.
tags = pos_tag(tokens)
tags

from nltk.chunk import RegexpParser
# Optional determiner, any number of adjectives, then a noun.
grammar = "NP: {<DT>?<JJ>*<NN>}"
# The original built the chunker and parsed `tags` twice in a row with
# identical arguments; once is enough.
chunker = RegexpParser(grammar)
result = chunker.parse(tags)
result
#help(t) sentCount = 1 sentScore = [] #tuple with (Subj-Obj , Verb-P , ) totalS = [] print "Processing input..." print "Number of sentences to process: ", len(arr_pos) for q in ["", vp, prd, cls1, cls2]: grammer += q npc = RegexpParser(grammer) print "\n\n" for i in arr_pos: print "Reading sentence ", sentCount sentCount += 1 t = npc.parse(i) print t tmpVP = [] tmpNP = [] tmpPrd = [] tmpCls = [] x1 = "" for x in t: try: if x.node == "VP": #print x x1 = addVerbPhrase(x) tmpVP.append(x1) if x.node == "NP":
sentencas_treinadoras = mac_morpho.tagged_sents()[0:15000]

# Build the UnigramTagger on top of the default tagger and train it with
# the tagged mac_morpho sentences
etiq = UnigramTagger(sentencas_treinadoras, backoff=etiqPadrao)

coment = str(input("Entre com o texto: "))
if coment == "default":
    # Close the file as soon as it has been read.
    with open("default.txt", "r") as arquivo_padrao:
        coment = arquivo_padrao.read().replace("\n", " ")

# The text is converted into tokens
tokens = nltk.word_tokenize(coment.lower())

# Every token of the text is tagged
tags = etiq.tag(tokens)

# Build the regular-expression parser holding the patterns we look for.
# PADRAO7 is listed first, so that stage runs before the others.
analiseGramatical = RegexpParser(r"""
        PADRAO7: {<N><ADJ>}
        PADRAO1: {<ADJ><N>(<PREP>?<N>)*}
        PADRAO2: {<ADV><ADV>?<ADJ>(<N>(<PREP>?<N>)*)?}
        PADRAO3: {<N>(<PREP>?<N>)*(<ADJ>)<ADV><ADV>?}
        PADRAO4: {<N>(<PREP>?<N>)*<ADV>?<ADJ>+}
        PADRAO5: {<ADV><V>}
        PADRAO6: {<V><ADV>}
        """)

# The parser is then used to generate the pattern tree
arvore = analiseGramatical.parse(tags)

# Phrases of PADRAO1 .. PADRAO7, in that order.
x = [ExtractPhrases(arvore, "PADRAO" + str(numero)) for numero in range(1, 8)]
for aux, frases in enumerate(x):
    print("PADRAO 0" + str(aux + 1) + str(frases))
from nltk.tokenize import word_tokenize from nltk.tag import pos_tag from nltk.chunk import RegexpParser tokenized_data = word_tokenize(dataset) pos_tagging = pos_tag(tokenized_data) chunk_sequence = """ chunk: {<NNPS>+} {<NNP>+} {<NN>}""" chunk = RegexpParser(chunk_sequence) chunked_data = chunk.parse(pos_tagging) print(chunked_data) """## Named Entity Recognition - Also known as - Entity Identification - Entity Chunking - Entity Extraction - It is a subtask of information extraction that classify named entities into pre-defined categories such as names of persons, organizations, locations - Tesla: Organization, Elon Musk: Person ### Applications - classify the contents to news providers - Efficent search Algorithms - Content recommendation - Question and Answer systems - Automatic Forwarding
''.join(c for c in s if c not in string.punctuation) for s in sentence_token ] sentence_token = [s for s in sentence_token if s] print(sentence_token) #POS Tagging, Chunking and N-grams def extract_ngrams(data, num): n_grams = ngrams(word_tokenize(data), num) return [' '.join(grams) for grams in n_grams] for t in sentence_token: #POS_Tagging print(t) wordsList = word_tokenize(t) pos_tagged = pos_tag(wordsList) print("After POS-Tagging\n") print(pos_tagged) #Chunking chunker = RegexpParser(r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}""") output = chunker.parse(tagged) print("After chunking", '\n') print(output) #3-grams print("3 grams : ") print(extract_ngrams(t, 3))
# Filter out strings with an invalid tag taggedArticle = [sanitizeTags(unsanitizedList) for unsanitizedList in taggedArticleUnsanitized] # Chunk and calculate frequency frequency = {} paraNumber = -1 for para in taggedArticle: paraNumber += 1 if not len(para): # Ignore empty paragraphs continue # Extract all subtrees tagged with the right identifier for subtree in chunker.parse(para).subtrees( filter = lambda x: x.node == 'Nouns'): # Concatenate member strings leafString = ' '.join( [key.lower() for key, value in subtree.leaves()]) # Get the increment value increment = 1 if paraNumber == 0: increment = 3 # Title elif paraNumber == 1: increment = 2 # First paragraph # Increment the frequency of the current string if leafString in frequency: frequency[leafString] += increment