def NP_Chunker(sentence):
    """Chunk a POS-tagged sentence into noun phrases and display the trees.

    Two grammars are demonstrated:
      * ``{<DT|PP\\$>?<JJ>*<NN>}`` — optional determiner/possessive, any number
        of adjectives, then a singular noun; plus ``{<NNP>+}`` for runs of
        proper nouns.
      * ``{<DT>?<JJ.*>*<NN.*>+}`` — optional determiner, zero or more
        adjectives of any type (JJ/JJR/JJS), then one or more nouns of any
        type.

    Args:
        sentence: a POS-tagged sentence, i.e. a list of (word, tag) tuples.
    """
    grammar_exp = r"""
    NP: {<DT|PP\$>?<JJ>*<NN>}   # chunk determiner/possessive, adjectives and noun
        {<NNP>+}                # chunk sequences of proper nouns
    """
    chunkParser = RegexpParser(grammar_exp)
    r = chunkParser.parse(sentence)
    # draw() returns None, so the original `print(r.draw())` printed "None";
    # just open the tree window.
    r.draw()
    grammar = "NP: {<DT>?<JJ.*>*<NN.*>+}"
    cp = RegexpParser(grammar)
    result = cp.parse(sentence)
    print(result)
    result.draw()
def natural_language_parser(text):
    """Tokenize, POS-tag and chunk the contents of ``<text>.txt``.

    Prints a sample tokenized/tagged sentence and the most common NP and VP
    chunks (as computed by the project helpers ``np_chunk_counter`` /
    ``vp_chunk_counter``).

    Args:
        text (str): base name of a UTF-8 text file, without the ``.txt``
            suffix.
    """
    # Context manager guarantees the handle is closed (original leaked it).
    with open(f"{text}.txt", encoding='utf-8') as fh:
        text = fh.read().lower()
    # Split the text into individual sentences and then individual words.
    word_tokenized_text = word_sentence_tokenize(text)
    # Spot-check one tokenized sentence.
    print(word_tokenized_text[100])
    # Part-of-speech tag every sentence.
    pos_tagged_text = [pos_tag(word) for word in word_tokenized_text]
    # Spot-check one tagged sentence.
    print(pos_tagged_text[100])
    # NP: optional determiner, adjectives, then a noun.
    np_chunk_parser = RegexpParser("NP: {<DT>?<JJ>*<NN>}")
    # VP: an NP followed by any verb form and an optional adverb.
    vp_chunk_parser = RegexpParser("VP: {<DT>?<JJ>*<NN><VB.*><RB.?>?}")
    np_chunked_text = []
    vp_chunked_text = []
    for sentence in pos_tagged_text:
        np_chunked_text.append(np_chunk_parser.parse(sentence))
        vp_chunked_text.append(vp_chunk_parser.parse(sentence))
    # Report the most common chunks of each kind.
    most_common_np_chunks = np_chunk_counter(np_chunked_text)
    most_common_vp_chunks = vp_chunk_counter(vp_chunked_text)
    print(most_common_np_chunks)
    print(most_common_vp_chunks)
def getNounPhrases(sentences):
    """Extract noun phrases from a list of raw sentences.

    Each sentence is tokenized, POS-tagged, chunked into NBAR/NP groups and
    then re-chunked with a plain ``<NN>+`` grammar; the words of every NP
    subtree are joined into one phrase string.

    Args:
        sentences (list[str]): raw sentences.

    Returns:
        list[str]: one space-joined phrase per NP found.
    """
    # TODO(review): stopwords are not removed yet (see original note).
    nounPhrases = []
    words = [word_tokenize(sentence) for sentence in sentences]
    taggedWords = [pos_tag(word) for word in words]
    for sent in taggedWords:
        grammar = r"""
        NBAR: {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
        NP: {<NBAR>}
            {<NBAR><IN><NBAR>}"""
        parser = RegexpParser(grammar)
        sentence = parser.parse(sent)
        # Second pass over the chunked tree: keep only plain noun runs.
        grammar = "NP: {<NN>+}"
        parser = RegexpParser(grammar)
        classTree = parser.parse(sentence)
        for leaf in classTree:
            if isinstance(leaf, Tree) and leaf.label() == 'NP':
                # join() avoids the spurious leading space the old
                # string-concatenation loop produced.
                nounPhrases.append(' '.join(word for word, cls in leaf.leaves()))
    return nounPhrases
class Chunking:
    """Thin wrapper around nltk's RegexpParser built from grammar lines."""

    def __init__(self, formats: list):
        # Grammar rules are kept as one newline-separated text block.
        self.__text = "\n".join(formats)
        self.__parse: RegexpParser = RegexpParser(self.__text)

    def setParser(self, formats):
        """Replace the grammar; accepts a list of rules or one string."""
        if isinstance(formats, list):
            self.__text = "\n".join(formats)
        else:
            self.__text = formats
        self.__parse = RegexpParser(self.__text)

    def getParse(self) -> RegexpParser:
        return self.__parse

    def addChunking(self, s: str):
        """Append one grammar rule and rebuild the parser.

        Bug fix: the original did ``"\\n".join(s)`` on a *string*, which
        interleaves a newline between every character; we append the rule on
        its own line instead.
        """
        self.__text += "\n" + s
        self.__parse: RegexpParser = RegexpParser(self.__text)

    def parse(self, tokens) -> Tree:
        """Chunk a POS-tagged token list; returns the parse Tree."""
        return self.__parse.parse(tokens)

    def merge(self, tokens):
        """Parse and flatten: chunks become (text, label) pairs, other
        tokens pass through unchanged."""
        chunks = self.__parse.parse(tokens)
        ret = []
        for N in chunks:
            if isinstance(N, Tree):
                label = N.label()
                text = " ".join([T[0] for T in N.leaves()])
                ret.append((text, label))
            else:
                ret.append(N)
        return ret
def chunkingTweet(filtrd_tweet):
    """Chunk a POS-tagged tweet into adjective/verb/adverb/modal groups.

    Args:
        filtrd_tweet: POS-tagged token list (word, tag) for one tweet.

    Returns:
        list: the chunk subtrees (TREE.Tree instances) found in the parse.
    """
    trees = []
    grammar = """
    ADJN:{<JJ><N.*>*}
    VBN:{(<VB>|<VBG>|<VBD>|<VBN>|<VBP>|<VBZ>)(<NN>|<NNP>)}
    AVBN:{(<RB>|<RBR>|<RBS>)(<NN>|<NNP>)}
    MDVB:{<MD><.+>(<VB>|<VBG>|<VBD>|<VBN>|<VBP>)}
    """
    chunkParser = REPARSE(grammar)
    rootTree = chunkParser.parse(filtrd_tweet)
    rootTree.draw()  # NOTE: opens a blocking Tk window
    for tree in rootTree:
        if isinstance(tree, TREE.Tree):
            trees.append(tree)
    # (removed a redundant second parse whose result was discarded)
    return trees
def VBD_question(tagged):
    """Generate (question, answer) pairs from a POS-tagged sentence.

    Primary strategy: find the first VBZ/VBP verb, chunk verb+preposition
    fragments (VBDP) and turn each into a "... what?" question whose answer
    is the rest of the sentence. Fallback (any failure, e.g. no such verb):
    split the sentence at each preposition instead.

    Args:
        tagged: list of (word, tag) tuples.

    Returns:
        list[tuple[str, str]]: (question, answer) string pairs.
    """
    try:
        first_verb_index = next(i for i, pair in enumerate(tagged)
                                if (pair[1] == 'VBZ' or pair[1] == 'VBP'))
        subject_phrase = [pair[0] for pair in tagged[:first_verb_index + 1]]
        phrase_dict = {'VBDP': 'VBDP: {<VBD|VBP|VBN><RB|JJ>*<IN>}'}
        vbdp_fragments = []
        for i, (key, phrase) in enumerate(phrase_dict.items()):
            cp = RegexpParser(phrase)
            # Each later grammar (if any were added) re-parses the previous tree.
            if i == 0:
                result = cp.parse(tagged)
            else:
                result = cp.parse(result)
        for i, item in enumerate(result):
            if isinstance(item, nltk.Tree):
                fragment = [pair[0] for pair in item]
                # .label() replaces the removed NLTK-2 `.node` attribute,
                # which raised AttributeError on NLTK 3 and silently pushed
                # every sentence into the fallback branch below.
                if item.label() == 'VBDP':
                    vbdp_fragments.append((fragment, i))
        qa_list = []
        for vbdp, index in vbdp_fragments:
            question_list = subject_phrase + vbdp
            question_list.append('what?')
            # Join with spaces but glue punctuation to the previous token.
            question_string = ''.join([('' if c in string.punctuation else ' ') + c
                                       for c in question_list]).strip()
            sentence_remainder = result[index + 1:]
            sentence_remainder_treeless = []
            for tree_or_tuple in sentence_remainder:
                try:
                    tree_or_tuple.leaves()
                    for leaf in tree_or_tuple.leaves():
                        sentence_remainder_treeless.append(leaf)
                except AttributeError:
                    # Plain (word, tag) tuple, not a subtree.
                    sentence_remainder_treeless.append(tree_or_tuple)
            answer_list = [pair[0] for pair in sentence_remainder_treeless]
            answer_string = ''.join([('' if c in string.punctuation else ' ') + c
                                     for c in answer_list]).strip()
            qa_list.append((question_string, answer_string))
        return qa_list
    except Exception:
        # Deliberate broad fallback: if no verb was recognized above (or any
        # step failed), split the sentence on prepositions instead.
        prep_indices = [i for i, pair in enumerate(tagged) if pair[1] == 'IN']
        qa_list = []
        for prep_index in prep_indices:
            question_list = [pair[0] for pair in tagged[:prep_index + 1]]
            question_list.append('what?')
            question_string = ''.join([('' if c in string.punctuation else ' ') + c
                                       for c in question_list]).strip()
            answer_list = [pair[0] for pair in tagged[prep_index + 1:]]
            answer_string = ''.join([('' if c in string.punctuation else ' ') + c
                                     for c in answer_list]).strip()
            qa_list.append((question_string, answer_string))
        return qa_list
def recuperarEntidades(texto):
    """Extract entity phrases (ENTI/ENTIDACOMP/FECHA chunks) from raw text.

    Args:
        texto (str): raw input text; split into sentences and tagged with the
            project helper ``tagear``.

    Returns:
        list[str]: one space-joined phrase per chunk found, over ALL
        sentences (the original version accidentally reset the list and
        returned chunks from only the last sentence).
    """
    chunker = RegexpParser("""
    ENTI: {<NNP|NNPS>+<NNP|NNPS|NN|NNS>}   # Nouns and Adjectives, terminated with Nouns
          {<NN|NNS>+<NN|NNS><JJ>}
          {<NNP|NNPS><IN|DT><NNP|NNPS|NN|NNS>}
          {(<NN|NNS><JJ>)|<JJ><NN|NNS>}
          {<NNP|NNPS>}
    ENTIDACOMP: {<DT><NN|NNS><ENTI>}
                {<DT><NN|NNS><IN><ENTI>}
                {<ENTI>(<IN>|<IN><DT>)<ENTI|NN|NNS>}
                {<ENTI|ENTIDACOMP><JJ><IN><ENTI|ENTIDACOMP>}
                {<ENTI|ENTIDACOMP><IN><ENTI|ENTIDACOMP>}   # Above, connected with in/of/etc...
                {<ENTI|ENTIDACOMP><IN><ENTI|ENTIDACOMP><IN><ENTI|ENTIDACOMP>}
    ENTIDACOMP2: {<ENTI|ENTIDACOMP><IN><ENTI|ENTIDACOMP>}
    FECHA: {<LS|CD><IN><ENTI><DT><LS|CD>}
           {<LS|CD><IN><ENTI>}
           {<ENTI><DT><LS|CD>}
           {<ENTI><LS|CD>}
    """)
    Lista = []
    for sentence in sent_tokenize(texto):
        tags = tagear(sentence)
        parsed = chunker.parse(tags)
        for chunk in parsed:
            # hasattr(chunk, 'node') was NLTK-2 API and is always False on
            # NLTK 3, which made the function return an empty list; 'label'
            # identifies chunk subtrees on NLTK 3.
            if hasattr(chunk, 'label'):
                Lista.append(' '.join(c[0] for c in chunk.leaves()))
    return Lista
def nounphrase2(abstract):
    """Chunk an abstract into verb-led noun phrases and print diagnostics.

    NOTE(review): depends on a module-level DataFrame ``df`` with columns
    ``Abstract`` and ``true_label4`` — verify it exists at call time.
    """
    ab = abstract.lower()
    # Sentence-split, then word-tokenize each sentence.
    ab_words = [word_tokenize(s) for s in sent_tokenize(ab)]
    pos_tagged = []
    for sent in ab_words:
        pos_tagged.append(pos_tag(sent))
    # Active grammar: verb, optional determiner, adjectives, noun, optional adverb.
    chunk_grammar = "NP: {<VB.*><DT>?<JJ>*<NN><RB.?>?}"
    # Earlier grammar experiments kept for reference:
    #chunk_grammar = "NP: {<DT>?<JJ>*<NN>}"
    #chunk_grammar = "NP: {<DT>?<JJ>*<NN><VB.*><RB.?>?}"
    chunk_parser = RegexpParser(chunk_grammar)
    np_chunked = []
    for sentence in pos_tagged:
        np_chunked.append(chunk_parser.parse(sentence))
    most_com = np_chunk_counter(np_chunked)
    # Look up the true label of the row whose Abstract matches this input.
    truelab = lambda row: row.true_label4 if row.Abstract == abstract else None
    p = df.apply(truelab, axis=1)
    if p.dropna().values[0] == 1.0:
        print('\n')
        print(p.dropna())
        print(most_com)
        print('\n')
    # NOTE(review): `count` is never updated, so the function always falls
    # through to `return 2`. The commented prints suggest a chunk-counting
    # step was planned here — confirm intended behavior before relying on
    # the return value.
    count = 0
    # print(most_com)
    #print('\n')
    # print("count2",count)
    if count >= 2:
        return 1
    elif count == 1:
        return 0
    elif count <= 0:
        return 2
def findNounPhrases(toRead, category):
    """Append verb+noun chunks from `toRead` to `<category>NounPhrases.txt`.

    Grammar quantifiers: ``?`` = optional, ``*`` = zero or more, ``+`` = one
    or more. The patterns match a past/gerund/participle verb followed by at
    least one noun.

    Args:
        toRead: input text passed to the project helper ``tagWords``.
        category (str): output file prefix (whitespace-stripped).
    """
    words = tagWords(toRead)
    patterns = r"""
    NP: {<VBD><N.*>+}
        {<VBG><N.*>+}
        {<VBN><N.*>+}
    """
    chunker = RegexpParser(patterns)
    tree = chunker.parse(words)
    # `with` guarantees the file is closed even if writing fails.
    with open(category.strip() + 'NounPhrases.txt', 'a') as newFile:
        for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
            newFile.write("\n")
            newFile.write("\n")
            for leaf in subtree.leaves():
                newFile.write(str(leaf))
def GetVerbDetNounPhrase(sentence):
    """Extract verb + optional determiner + noun phrases from a sentence.

    Args:
        sentence (str): raw input sentence.

    Returns:
        str: the matched DNP phrases concatenated together ('' if none).
    """
    output = ''
    # Bug fix: the old pattern '<(VB |VBP)>' contained a literal space inside
    # the tag ("VB ") and could never match; '<VB|VBP>' is the intended
    # alternation of base-form and present-tense verbs.
    grammar = 'DNP: {<VB|VBP><DT>?<NN>}'
    cp = RegexpParser(grammar)
    # Tokenize the input and tag parts of speech.
    pos = pos_tag(word_tokenize(sentence))
    result = cp.parse(pos)
    # Pull the words under every DNP node of the parse tree.
    for tree in result.subtrees():
        if tree.label() == 'DNP':
            name_match = ' '.join([x for x, y in tree.leaves()])
            output = output + name_match
    return output
def pos_tagging(result):
    """Tag tf-idf entries with pymorphy POS and propagate highlights.

    For every entry a POS tag is derived from its normal form; adjacent
    collocations (adj+noun, verb+noun, ...) are chunked, and if either word
    of a chunk is highlighted, both become highlighted.

    NOTE(review): assumes each dict in `result` has keys "normal_form" and
    "highlight", and that `morph` is a module-level pymorphy analyzer —
    confirm against the caller.
    """
    pos_tagged_words = []
    for tf_idf_info in result:
        # First pymorphy parse wins; POS may be None for unknown tokens.
        tf_idf_info["pos"] = morph.parse(tf_idf_info["normal_form"])[0].tag.POS
        if tf_idf_info["pos"] is not None:
            pos_tagged_words.append(
                (tf_idf_info["normal_form"], tf_idf_info["pos"]))
    # ToDo: Add reg exps for numeric
    patterns = """
    many adj+noun:{<ADJF>+<NOUN>}
    noun+many adj:{<NOUN><ADJF>+}
    verb + noun:{<INFN><NOUN>+}
    verb + verb:{<INFN><INFN>}
    prep + verb/noun:{<PRCL>(<INFN>|<NOUN>)}
    verb + prep + verb?:{<INFN><PRCL><INFN>?}
    conj + verb/verb + conj:{(<INFN><CONJ>)|(<CONJ><INFN>)?}
    """
    chunker = RegexpParser(patterns)
    tree = chunker.parse(pos_tagged_words)
    for subtree in tree.subtrees():
        # Skip the root sentence node; only collocation chunks matter.
        if subtree._label == "S":
            continue
        # highlight all words in collocation if one of them already was highlighted
        # TODO: Iterate through all elements of subtree (it might be > 2)
        # NOTE(review): subtree[1] raises IndexError for one-word chunks —
        # the last grammar rule's trailing '?' can produce those; verify.
        term1, term2 = subtree[0][0], subtree[1][0]
        tf_idf_info1, tf_idf_info2 = next(x for x in result if x["normal_form"] == term1), \
            next(x for x in result if x["normal_form"] == term2)
        if tf_idf_info1["highlight"] or tf_idf_info2["highlight"]:
            tf_idf_info1["highlight"], tf_idf_info2["highlight"] = True, True
def tokenise_subjects(subject):
    """Split a comma-separated subject string into lemmatised key phrases.

    Each comma-separated part is tokenized, tagged and chunked with a
    ``<JJ>*<NN.?>*<VBG>*`` grammar; chunk words are lemmatised with WordNet
    (nouns/adjectives/verbs/adverbs only) and deduplicated.

    Args:
        subject (str): comma-separated subjects; '' or None yields [].

    Returns:
        list[str]: unique lemmatised phrases (order not guaranteed).
    """
    if subject == '' or subject is None:
        return []
    split_subjects = []
    phrase_pattern = 'CHUNK:{<JJ>*<NN.?>*<VBG>*}'
    phrase_chunker = RegexpParser(phrase_pattern)
    # Hoisted out of the loops: the lemmatizer and the POS-prefix -> WordNet
    # tag map were previously rebuilt for every single word.
    lemmatizer = WordNetLemmatizer()
    wn_tag_map = {'n': wn.NOUN, 'j': wn.ADJ, 'v': wn.VERB, 'r': wn.ADV}
    for s in subject.split(','):
        tokens = word_tokenize(s.strip().lower())
        tags = pos_tag(tokens)
        phrases = [
            ' '.join([leaf[0] for leaf in c.leaves()])
            for c in phrase_chunker.parse(tags)
            if hasattr(c, 'label') and c.label() == 'CHUNK'
        ]
        for phrase in phrases:
            phrase_tags = pos_tag(word_tokenize(phrase))
            lemmatised_phrase = []
            for pto, pta in phrase_tags:
                wn_tag = wn_tag_map.get(pta[0].lower(), None)
                if wn_tag is None:
                    continue  # drop words that WordNet cannot lemmatise
                lemmatised_phrase.append(lemmatizer.lemmatize(pto, wn_tag))
            if len(lemmatised_phrase) > 0:
                split_subjects.append(' '.join(lemmatised_phrase))
    return list(set(split_subjects))
def process_command(command):
    """Parse a spoken arithmetic command and dispatch to add/sub/mul/div.

    The command is tagged and chunked into verbs (Tasks) and numerals
    (Numbers); ``traverse`` is expected to populate the module-level
    ``tasks`` and ``numbers`` collections as a side effect.

    NOTE(review): relies on module-level ``traverse``, ``tasks``,
    ``numbers``, ``possible_tasks``, ``speak`` and the message constants —
    confirm they are defined before this runs.
    """
    try:
        words = word_tokenize(command)
        tagged = pos_tag(words)
        # Tasks = any verb form; Numbers = cardinal numerals.
        chunkGram = r"""
        Tasks: {<VB.?>}
        Numbers:{<CD>}
        """
        chunkParser = RegexpParser(chunkGram)
        chunked = chunkParser.parse(tagged)
        traverse(chunked)  # fills the global `tasks` / `numbers`
        #chunked.draw()
        # Exactly two operands are required.
        if (len(numbers) > 2):
            speak(to_many_numbers)
            return
        elif (len(numbers) < 2):
            speak(to_less_numbers)
            return
        # No recognized operation verb at all.
        if (possible_tasks.isdisjoint(tasks)):
            speak(unknown_task)
            return
        if ('add' in tasks):
            add(numbers[0], numbers[1])
        elif ('subtract' in tasks):
            sub(numbers[0], numbers[1])
        elif ('multiply' in tasks):
            mul(numbers[0], numbers[1])
        elif ('divide' in tasks):
            div(numbers[0], numbers[1])
    except Exception as e:
        print(str(e))
def extract_entities(self, text, grammar=None, lang=None):
    """Extract entities from text.

    Args:
        text (str): input text.
        grammar (str|None): optional RegexpParser grammar; falls back to
            nltk's `ne_chunk` named-entity chunker when None.
        lang (str|None): world language code; auto-detected when None.

    Returns:
        tuple: (entities list, resolved nltk language name).
    """
    entities = []
    if lang is None:
        lang = WORLD_2_NLTK[self.detect_language(text)]
    else:
        # Unknown codes fall back to the instance default language.
        lang = WORLD_2_NLTK.get(lang, self._lang)
    if lang == 'japanese':
        # Japanese needs its own tokenizer/chunker pipeline.
        return JAParser().extract_entities(text), lang
    pos_sentences = [
        pos_tag(self.word_tokenize(sentence, lang=lang))
        for sentence in self.sent_tokenize(text, lang=lang)
    ]
    if grammar is not None:
        chunker = RegexpParser(grammar)
        for pos_sentence in pos_sentences:
            tree = chunker.parse(pos_sentence)
            self.logger.debug(tree)
            # extend() instead of `entities = entities + ...` (which copied
            # the whole list on every sentence).
            entities.extend(self._select_entities(tree))
    else:
        for pos_sentence in pos_sentences:
            tree = ne_chunk(pos_sentence, binary=False)
            self.logger.debug(tree)
            entities.extend(self._select_entities(tree))
    return entities, lang
class RegexChunker:
    """Wrapper around RegexpParser for chunking POS-tagged sentences."""

    def __init__(self, chunk_grammar=None):
        # Backward compatible: with no argument the module-level `grammar`
        # is used, as before; callers may now supply their own grammar.
        self.grammar = grammar if chunk_grammar is None else chunk_grammar
        self.parser = RegexpParser(self.grammar)

    def parse(self, tagged_sent):
        """Chunk one POS-tagged sentence; returns an nltk Tree."""
        return self.parser.parse(tagged_sent)
def extraxt_semantic_chuncks(self, pos_tags):
    """
    Extract chunks of text from the paper taking advantage of the parts of
    speech previously extracted. It uses the module-level GRAMMAR and keeps
    only subtrees labelled DBW_CONCEPT; each chunk's words are cleaned of
    punctuation/symbol characters and lowercased.

    Returns:
        chunks (list): list of all chunks of text
    """
    grammar_parser = RegexpParser(GRAMMAR)
    chunks = list()
    pos_tags_with_grammar = grammar_parser.parse(pos_tags)
    #print(pos_tags_with_grammar)
    for node in pos_tags_with_grammar:
        if isinstance(node, tree.Tree) and node.label(
        ) == 'DBW_CONCEPT':  # if matches our grammar
            chunk = ''
            for leaf in node.leaves():
                concept_chunk = leaf[0]
                # Strip symbol/quote/dash characters that survive PDF text
                # extraction.
                concept_chunk = re.sub(
                    '[\=\,\…\’\'\+\-\–\“\”\"\/\‘\[\]\®\™\%]', ' ',
                    concept_chunk)
                # Drop a single leading or trailing period.
                concept_chunk = re.sub('\.$|^\.', '', concept_chunk)
                concept_chunk = concept_chunk.lower().strip()
                chunk += ' ' + concept_chunk
            # Collapse repeated periods and whitespace runs.
            chunk = re.sub('\.+', '.', chunk)
            chunk = re.sub('\s+', ' ', chunk)
            chunks.append(chunk)
    return chunks
def identify_map_verbs_with_negation(nlp, sentences):
    """Map verb groups (with any preceding negation adverb) per sentence.

    Errors happen when `sentences` is a list type; hint: extract the string
    from the list first.

    NOTE(review): the trailing ``.pop()`` returns only the verb list of the
    LAST sentence — confirm that is intended (callers may pass one sentence).
    """
    sentences = sent_tokenize(sentences)
    # GR: optional adverbs, one or more verb/adjective forms, optional
    # preposition/particle (to catch phrasal verbs).
    grammar = r"""
    GR : {<RB>*<VB|VBN|JJ|VBG|VBZ|VBP|VBD>+<IN|RP>*}
    """
    # GR : {<RB>*<VB|VBN|JJ|VBG|VBZ|VBP>+<IN|RP>*}
    map_part_verb_and_negation = []
    for sent in sentences:
        words = word_tokenize(sent)
        tagged = pos_tag(words)
        cp = RegexpParser(grammar)
        t = cp.parse(tagged)
        # t.draw()
        negate = ''
        verb = ''
        verbs = []
        for s in t.subtrees():
            is_phrasal = False  # NOTE(review): set but never used
            if s.label() == "GR":
                for token in s.leaves():
                    # Skip auxiliaries; they are not the content verb.
                    if token[0] == 'is' or token[0] == 'are' or token[
                            0] == 'does' or token[0] == 'do':
                        continue
                    elif token[1] == 'RB':
                        negate = token[0]  # remember the negation adverb
                    elif token[0] != "=":
                        verb = verb + " " + token[0]
                # Normalize to the phrasal verb / verb part via spaCy helper.
                verb = InputsOutputsStateFinder.phrasal_verb_verifier_or_verb_part_extractor(
                    nlp, verb)
                verbs.append([negate, verb])
        map_part_verb_and_negation.append(verbs)
    return map_part_verb_and_negation.pop()
def word_combination(pos_tagged):
    """Chunk a POS-tagged sentence into noun runs (NU) and verb runs (VE).

    Args:
        pos_tagged: list of (word, tag) tuples (Penn Treebank tags).

    Returns:
        tuple: (noun-chunk Tree, verb-chunk Tree).
    """
    noun_chunker = RegexpParser(r"""
    NU: {<NN.*>+}
    """)
    verb_chunker = RegexpParser(r"""
    VE: {<VB.*>+}
    """)
    return noun_chunker.parse(pos_tagged), verb_chunker.parse(pos_tagged)
def GetNounPhrase(sentence):
    """Extract all noun phrases from a sentence.

    Args:
        sentence (str): raw input sentence.

    Returns:
        str: newline-joined noun phrases (e.g. 'abc\\nghi\\nmno'), or '' when
        none match.
    """
    # Optional determiner, adjectives of any kind, then one or more nouns.
    grammar = 'NP: {<DT>?<JJ>*<NN.*>+}'
    cp = RegexpParser(grammar)
    # Tokenize the input and tag parts of speech.
    pos = pos_tag(word_tokenize(sentence))
    result = cp.parse(pos)
    # Bug fix: the old loop *overwrote* `output` on each NP subtree, so only
    # the last phrase was returned; the original inline comment
    # ('abc\nghi\nmno') shows newline-joined accumulation was intended.
    phrases = [
        ' '.join(item[0] for item in subtree.leaves())
        for subtree in result.subtrees(filter=lambda t: t.label() == 'NP')
    ]
    return '\n'.join(phrases)
def GetName(sentence):
    """Extract a person's name from a greeting-style sentence.

    Greeting filler words are stripped first, then the remaining tokens are
    tagged and runs of proper nouns (or plain nouns, since the tagger is
    sometimes inaccurate) are collected as the NAME.

    Args:
        sentence (str): raw input such as "Hi, my name is Jane Doe".

    Returns:
        str: the extracted name words joined by single spaces.
    """
    grammar = 'NAME: {<NNP>*|<NN?>*}'
    parser = RegexpParser(grammar)
    common_words = {'hi', 'name', 'hello', 'thank', 'you', 'i', 'am', 'oh',
                    'hey', 'sure', 'yes', 'named', 'known'}
    # Drop greeting filler so the tagger sees mostly the name itself.
    tokens = [t for t in word_tokenize(sentence)
              if t.lower() not in common_words]
    parsed = parser.parse(pos_tag(tokens))
    # Collect the words of every NAME subtree.
    pieces = []
    for node in parsed.subtrees():
        if node.label() == 'NAME':
            pieces.append(' '.join(word for word, _tag in node.leaves()))
    joined = ''.join(' ' + p for p in pieces)
    # Collapse the double spaces produced by empty NAME matches.
    return joined.replace("  ", " ").strip()
def get_noun_counter(text) -> collections.Counter:
    """Count (stemmed) nouns in `text` and collect compound noun phrases.

    NOTE(review): chunk contents are recovered by parsing the *string repr*
    of each nltk Tree ('(mychunk word/TAG ...)'), which is fragile — words
    containing '/' or spaces would break the split; confirm inputs.

    Returns:
        tuple: (Counter of stemmed nouns longer than 1 char,
                list of compound noun strings).
    """
    text = text.split()
    tokens_tag = pos_tag(text)
    # Adjectives followed by nouns (tags like NN, NNS, NNP, NNPS).
    patterns = """mychunk:{<JJ.?>*<NN.?.?>*}"""
    chunker = RegexpParser(patterns)
    output = chunker.parse(tokens_tag)
    noun_list = []
    compound_noun_list = []
    for n in output:
        if isinstance(n, nltk.tree.Tree):
            # Parse "word/TAG" pairs out of the tree's string repr;
            # [1:] skips the "(mychunk" prefix token.
            n = str(n)
            part_of_speech = [el.split('/')[1] for el in n.split()[1:]]
            # Keep only chunks that contain at least one noun tag.
            if any([el.find('NN') > -1 for el in part_of_speech]):
                # Stem plural nouns (NNS/NNPS); keep others verbatim.
                noun = [
                    stemmer.stem(el.split('/')[0])
                    if el.split('/')[1] == 'NNS' or el.split('/')[1] == 'NNPS'
                    else el.split('/')[0] for el in n.split()[1:]
                ]
                compound_noun_list.append(''.join([f'{n} ' for n in noun])[:-1])
                noun_list.extend(noun)
    # Drop one-character tokens (punctuation, stray letters).
    noun_list = [noun for noun in noun_list if len(noun) > 1]
    return collections.Counter(noun_list), compound_noun_list
def run(posTaggedTokenListList, pos1, pos2):
    """Find adjacent word pairs whose tags match (pos1, pos2).

    Builds a chunk grammar from the two POS tag regexes, parses every tagged
    sentence, and renders each matched pair as "word1 word2" lines preceded
    by a summary header.

    Args:
        posTaggedTokenListList: list of POS-tagged sentences.
        pos1, pos2: tag names resolved through getPosTagRegex().

    Returns:
        str: header with the total count followed by one pair per line.
    """
    # Build the word-pair chunk rule from the two tag regexes.
    regex = ('pattern: {<%s><%s>}'
             % (getPosTagRegex(pos1), getPosTagRegex(pos2)))
    parser = RegexpParser(regex)
    body = ''
    total = 0
    for taggedSentence in posTaggedTokenListList:
        for subtree in parser.parse(taggedSentence):
            if isinstance(subtree, tree.Tree):
                # subtree holds exactly two (word, tag) leaves by construction.
                body += (subtree[0][0] + " ")
                body += (subtree[1][0] + "\r\n")
                total += 1
    header = ("total: %d\r\npos1: %s, pos2: %s\r\n\r\n" % (total, pos1, pos2))
    return header + body
def output():
    """Flask view: extract keyword-like words from posted text.

    Tokenizes the submitted paragraph, keeps unique words, POS-tags them and
    chinks away everything that is not a noun/adjective/verb/adverb, then
    renders the surviving words into output.html.
    """
    if request.method == 'POST':
        result = request.form['inputdata']
        result = result.lower()
        # Tokenize the paragraph for part-of-speech tagging.
        words = word_tokenize(result)
        allwords = []
        for word in words:
            if word not in allwords:  # keep first occurrence only
                allwords.append(word)
        ##lm = WordNetLemmatizer()
        rootwords = []
        ##for word in allwords:
        ##    rootwords.append(lm.lemmatize(word))
        # Tag all parts of speech.
        taggedwords = pos_tag(allwords)
        # Chunk everything, then chink out (}{) all nouns, adjectives, verbs
        # and adverbs so they fall OUTSIDE the Junk*1234 chunks.
        chunkString = """Junk*1234: {<.*>+}
        }<NN*|JJ|VB.|RB>+{"""
        chunkParse = RegexpParser(chunkString)
        chunkedwords = chunkParse.parse(taggedwords)
        impwords = []
        # Convert chunked-words tree to a list of line reprs.
        for words in chunkedwords:
            impwords.append(str(words))
        keyvalue = []
        for word in impwords:
            # Lines without the Junk label are the kept (favorable) words;
            # strip the leading '(' and the '/TAG' suffix.
            if ('Junk*1234' not in word):
                keyvalue.append(word[1:].split(",")[0])
        return render_template("output.html", keyvalues=keyvalue, result=result)
def extract_NP(posTagged):
    """Extract noun phrases (with adjectives, possessives and PP chains).

    The grammar builds up from adverb-modified adjectives (ADJ), through
    comma/conjunction-joined adjective lists, to determiner/possessive-
    prefixed noun groups optionally extended by prepositional phrases.

    Args:
        posTagged: list of (word, tag) tuples.

    Returns:
        list[str]: one space-joined string per NP subtree.
    """
    grammar = r"""
    ADJ: {<RB.*>? <JJ.* | VBG>}
    ADJLIST: {<ADJ> (<CC>? <,>? <ADJ>)*}
    ADJNOUN: {<ADJLIST>? <NN.*>+}
    PREFIXEDNOUN: {<DT|PRP\$>? (<ADJNOUN> <POS>)* <ADJNOUN>}
    PP: {<IN><PREFIXEDNOUN>}
    NP: {<PREFIXEDNOUN> (<PP>)*}
        {<PRP>}
    """
    parsed = RegexpParser(grammar).parse(posTagged)
    return [
        ' '.join(child[0] for child in np.leaves())
        for np in parsed.subtrees(filter=lambda t: t.label() == 'NP')
    ]
def post_text_process(text):
    """Prepare text for deeper analysis: lemmatize, de-stopword, chunk.

    Pipeline: lowercase + tokenize -> WordNet lemmatization (playing->play)
    -> stopword removal -> POS tagging -> named-entity chunking and a simple
    DT/JJ/NN noun-phrase chunking.

    Args:
        text (str): raw input text.

    Returns:
        tuple: (NP chunking Tree, named-entity chunk Tree).
    """
    tokens = word_tokenize(text.lower())
    # Normalize each token to its lemma.
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(tok) for tok in tokens]
    # Remove stopwords (plus the period token).
    stop_set = set(stopwords.words('english'))
    stop_set.add('.')
    kept = [w for w in lemmas if w not in stop_set]
    # Named-entity recognition over the tagged tokens.
    tagged = nltk.pos_tag(kept)
    entities = ne_chunk(tagged)
    # Simple noun-phrase chunking: optional determiner, adjectives, noun.
    np_parser = RegexpParser("NP: {<DT>?<JJ>*<NN>}")
    np_tree = np_parser.parse(tagged)
    return np_tree, entities
def data_gathering_iterator(file_path, morph, grammar=COMPLEX_GRAMMAR):
    """Yield adjective+noun combinations extracted from each line of a file.

    Each yielded item is a list of (adjective, noun) tuples, both reduced to
    their normal form. (Original docstring, translated: "On every iteration
    returns the list of adj+noun combinations obtained from one line. List
    element is a tuple (adjective, noun) in normal form.")

    Args:
        file_path: path to a UTF-8 data file.
        morph: morphology analyzer used for normalization.
        grammar: chunk grammar; CHUNK subtrees are harvested.

    NOTE: this function targets Python 2 / NLTK 2 (`line.decode`,
    `subtree.node`) and is kept that way; only the file-handle leak is fixed.
    """
    chunk_parser = RegexpParser(grammar)
    # `with` closes the file even if a consumer abandons the generator.
    with open(file_path, "r") as f:
        for line in f:
            try:
                line = line.decode('utf-8')
            except UnicodeDecodeError:
                continue  # skip undecodable lines rather than abort
            line = line.strip()
            # Split each line into sentences.
            for sentence in sent_tokenize(line):
                sentence = sentence.strip()
                if sentence:
                    tokens = word_tokenize(sentence)
                    tagged_tokens = pos_tag(tokens)
                    tree = chunk_parser.parse(tagged_tokens)
                    for subtree in tree.subtrees():
                        if subtree.node == u"CHUNK":
                            adj_noun_list = get_adj_noun_list_from_chunk(subtree)
                            yield normilize_adj_noun_list(adj_noun_list, morph)
def convert_to_noun(sen):
    """Collect NN/JJ words from the first sentence's NP chunks.

    NOTE(review): only the FIRST leaf of each subtree is inspected
    (`node.leaves()[0]`), so later words in a multi-word chunk are reached
    only via the recursion over raw (word, tag) nodes — confirm this is the
    intended selection rule.
    """
    sen = ie_preprocess(sen)  # project helper: tokenize + tag sentences
    grammar = r"""
    NP: {<DT|PP\$>?<JJ>*<NN>}   # chunk determiner/possessive, adjectives and noun
        {<NNP>+}                # chunk sequences of proper nouns
    """
    cp = RegexpParser(grammar)
    # Only the first sentence is parsed.
    res = cp.parse(sen[0])
    print(res)
    ROOT = 'ROOT'
    tree = res
    output = []  # unique NN/JJ words, in discovery order

    def getNodes(parent):
        # Depth-first walk: Tree nodes are NP chunks, tuples are (word, tag).
        for node in parent:
            if type(node) is Tree:
                print("Label:", node.label())
                print("Leaves:", node.leaves())
                if node.leaves()[0][1] in ("NN", "JJ"):
                    if node.leaves()[0][0] not in output:
                        output.append(node.leaves()[0][0])
                        print(node.leaves()[0][0])
                getNodes(node)
            else:
                print("Word:", node)
                if node[1] in ("NN", "JJ"):
                    if node[0] not in output:
                        output.append(node[0])

    getNodes(tree)
    print(output)
    return " ".join(output)
def on_get(self, req, resp, id):
    """Falcon GET handler: chunk the title of the article with the given id.

    With a non-empty id, loads the article from Mongo (ArticleModel), tags
    its title and returns the chunk tree's string form as JSON; with an
    empty id, returns a placeholder list.
    """
    print(id)
    if (len(id) > 0):
        arts_obj = ArticleModel.objects(_id=ObjectId(id))
        art = arts_obj[0]  # NOTE(review): raises IndexError for unknown ids
        print(art['href'])
        title = art['title']
        toks = word_tokenize(title)
        # Body text: sentence-split, tokenize, tag (printed for debugging).
        sent = sent_tokenize(art['txt'])
        sent = [word_tokenize(xt) for xt in sent]
        sent = [pos_tag(xt) for xt in sent]
        print(sent)
        tag = pos_tag(toks)
        # Only `patterns` is used below; `grammar` is kept for reference.
        grammar = "NP: {<DT>?<JJ>*<NN>}"
        patterns = """mychunk:{<NN.?>*<VBD.?>*<JJ.?>*<CC>?}"""
        cp = RegexpParser(patterns)
        rslt = cp.parse(tag)
        print(rslt)
        resp.json = {'rslt': str(rslt)}
    else:
        #resp.status = falcon.HTTP_200
        #arts = []
        #arts_obj = ArticleModel.objects().all_fields()
        #for art in arts_obj:
        #print(art.to_json())
        #    arts.append(art.to_json())
        callnames = ['tst']
        resp.json = {'rslt': json.dumps(callnames)}
def drawNamedEntityTree(self, text):
    """Tokenize and tag `text`, chunk PESSOA (person) runs, and draw the tree.

    Args:
        text (str): raw input text; tagging is done by self.tagWords.
    """
    tokens = tokenizer.tokenize(text)
    tagged = self.tagWords(tokens)
    # ENT groups consecutive tokens tagged PESSOA by the custom tagger.
    parser = RegexpParser("ENT: {<PESSOA>*}")
    parser.parse(tagged).draw()
def on_get(self, req, resp, id):
    """Falcon GET handler: collect proper-noun names from article titles.

    Every article title is tagged and chunked for runs of NNP tags; chunk
    words are concatenated into candidate names, deduplicated, and returned
    as JSON.
    """
    nnp1 = []
    nnp2 = []  # NOTE(review): populated only via the commented-out call
    arts_obj = ArticleModel.objects().all_fields()
    for art in arts_obj:
        #print(art.to_json())
        #arts.append(art.to_json())
        titl = word_tokenize(art['title'])
        #sent = sent_tokenize(art['txt'])
        #words = [word_tokenize(xt) for xt in sent]
        #tags = [pos_tag(xt) for xt in words]
        # One-or-more vs two-or-more proper-noun runs.
        nnp1pat = """nnp1: {<NNP.?>{1,}}"""
        nnp2pat = """nnp2: {<NNP.?>{2,}}"""
        tag = pos_tag(titl)
        cp = RegexpParser(nnp1pat)
        rslt1 = cp.parse(tag)
        cp = RegexpParser(nnp2pat)
        rslt2 = cp.parse(tag)

        def chkname(rslt, nnp):
            # Walk top-level nodes; chunk subtrees expose their (word, tag)
            # children by index.
            for mc in rslt:
                if 'NNP' in mc[0]:
                    name = ""
                    for wd in mc:
                        #print(wd[0])
                        # A single-character word aborts this candidate.
                        if len(wd[0]) == 1:
                            print(mc)
                            return
                        else:
                            name = name + wd[0] + ' '
                    name = name.strip()
                    print(name)
                    #print(mc.leaves, len(mc), mc)
                    nnp.append(name)
                #if mc[:3] == "myc":
                #    print(mc)

        #chkname(rslt2, nnp2)
        chkname(rslt1, nnp1)
        #print(rslt)
        #out = nltk.chunk.ne_chunk(sent)
    #nnp2 = list(dict.fromkeys(nnp2))
    # Deduplicate while preserving insertion order.
    nnp1 = list(dict.fromkeys(nnp1))
    #print(nnp2)
    resp.json = {'rslt': nnp1}
def recuperarEntidadesEs(self, texto):
    """Extract Spanish entity chunks plus their stopword-filtered context.

    Args:
        texto (str): raw Spanish text.

    Returns:
        list: for each chunk, [chunk leaves, sentence text without Spanish
        stopwords].
    """
    chunker = RegexpParser("""
    ENTI: {<NNP|NNPS>+<NNP|NNPS|NN|NNS>}
          {<NN|NNS>+<NN|NNS><JJ>}
          {<NNP|NNPS><IN|DT><NNP|NNPS|NN|NNS>}
          {<NN|NNS><JJ>|<JJ><NN|NNS>}
          {<NNP|NNPS>}
    ENTIDACOMP: {<NN|NNS><ENTI>}
                {<NN|NNS><IN><ENTI>}
                {<ENTI>(<IN>|<IN><DT>)<ENTI|NN|NNS>}
                {<ENTI|ENTIDACOMP><JJ><IN><ENTI|ENTIDACOMP>}
                {<ENTI|ENTIDACOMP><IN><ENTI|ENTIDACOMP>}
                {<ENTI|ENTIDACOMP><IN><ENTI|ENTIDACOMP><IN><ENTI|ENTIDACOMP>}
    ENTIDACOMP2: {<ENTI|ENTIDACOMP><IN><ENTI|ENTIDACOMP>}
    FECHA: {<LS|CD><IN><ENTI><DT><LS|CD>}
           {<LS|CD><IN><ENTI>}
           {<ENTI><DT><LS|CD>}
           {<ENTI><LS|CD>}
    """)
    ObjTag = Tokenizar()
    Lista2 = []
    # Hoisted: the Spanish stopword list was previously re-read for every
    # single word of every sentence.
    spanish_stops = set(nltk.corpus.stopwords.words('spanish'))
    for sentence in sent_tokenize(texto):
        tags = ObjTag.tagear(sentence)
        tagsentX = word_tokenize(sentence)
        filtered_words = ' '.join(w for w in tagsentX
                                  if w not in spanish_stops)
        parsed = chunker.parse(tags)
        for chunk in parsed:
            # hasattr(chunk, 'node') was NLTK-2 API and is always False on
            # NLTK 3 (so nothing was ever returned); 'label' identifies
            # chunk subtrees on NLTK 3.
            if hasattr(chunk, 'label'):
                Lista2.append([chunk.leaves(), filtered_words])
    return Lista2
def syntax_parsing(pos_tagged):
    """Chunk every tagged sentence into noun phrases and verb phrases.

    NP grammar: optional determiner, adjectives, then a noun.
    VP grammar: an NP followed by any verb form (<VB.*> covers VB/VBD/VBN...)
    and an optional adverb (<RB.?> covers RB/RBR/RBS).

    Args:
        pos_tagged: iterable of POS-tagged sentences.

    Returns:
        tuple: (list of NP-chunked Trees, list of VP-chunked Trees).
    """
    np_parser = RegexpParser('NP: {<DT>?<JJ>*<NN>}')
    vp_parser = RegexpParser('VP: {<DT>?<JJ>*<NN><VB.*><RB.?>?}')
    np_chunked_text = [np_parser.parse(s) for s in pos_tagged]
    vp_chunked_text = [vp_parser.parse(s) for s in pos_tagged]
    return np_chunked_text, vp_chunked_text
def get_continuous_chunks(tokenized_text):
    """Chunk a tokenized Portuguese text and extract the chunks.

    NOTE(review): as the original author notes, this regex is NOT working —
    it mixes literal spaces into tag patterns ("\\w + ") and does not follow
    RegexpParser's <TAG> syntax; the correct pattern for the Portuguese
    tagset needs to be determined before this function is trusted.
    """
    # this regex is not working, change to another later
    NP = "(?:(?:\w+ ART)?(?:\w+ ADJ) *)?\w + (?:N[NP] | PRN)"
    chunker = RegexpParser(NP)
    # Project tagger for Portuguese; as_list=False returns tagged tuples.
    tagged_text = PortugueseTextualProcessing.postag(tokenized_text, as_list=False)
    chunked = chunker.parse(tagged_text)
    return PortugueseTextualProcessing().extract_chunks(chunked)
def add_chunk_data(pos_data):
    """Chunk each tagged sentence and collect its SN phrases.

    Args:
        pos_data: mapping of sentence_id -> POS-tagged token list
            (Python 2 dict; iterated with iteritems()).

    Returns:
        dict: sentence_id -> {'chunks': [phrase, ...]} where each phrase is
        the space-joined words of one SN subtree.
    """
    chunker = RegexpParser(CHUNKER_GRAMMAR)
    chunks = {}
    for sentence_id, tagged in pos_data.iteritems():
        parsed = chunker.parse(tagged)
        phrases = [
            ' '.join(word for word, pos in subtree.leaves())
            for subtree in parsed.subtrees(lambda parsed: parsed.label() == 'SN')
        ]
        chunks[sentence_id] = {'chunks': phrases}
    return chunks
def tagtosem(sent):
    """Group a universal-tagset sentence into coarse semantic chunks.

    Chunks: NP (noun phrase with determiner/modifiers), R (particle/adverb
    group), V (verb group), PNC (sentence-final period), C (adposition).

    Args:
        sent: list of (word, universal-tag) tuples.

    Returns:
        nltk.Tree: the chunked sentence.
    """
    grammar = '''
    NP: {<DET>? (<ADJ>|<ADV>)* <CONJ>* (<NOUN>|<NUM>|<X>|(<PRON> <PRT>))* <PRON>?}
    R: {(<PRT> <VERB>?)* <A..>* <PRON>?}
    V: {<VERB>*(<PRT>*|<VERB>)*}
    PNC:{<\.>}
    C: {<ADP>}
    '''
    return RegexpParser(grammar).parse(sent)
def extract_bow_from_raw_text(text_as_string):
    """Extracts bag-of-words from a raw text string.

    Pipeline: ASCII-fold the text, sentence/word tokenize, POS-tag, keep
    only adjective/noun tokens (SENT chunks), Snowball-stem them, then drop
    domain stopwords.

    NOTE: Python-2 code (`unicode`, list-returning `map`) — do not run under
    Python 3 without porting.

    Parameters
    ----------
    text (str): a text document given as a string

    Returns
    -------
    list : the list of the tokens extracted and filtered from the text
    """
    if (text_as_string == None):
        return []
    if (len(text_as_string) < 1):
        return []
    import nltk
    # Worker nodes need the nltk data path registered explicitly.
    if '/home/hadoop/nltk_data' not in nltk.data.path:
        nltk.data.path.append('/home/hadoop/nltk_data')
    # Normalize accented characters to their ASCII base form.
    nfkd_form = unicodedata.normalize('NFKD', unicode(text_as_string))
    text_input = nfkd_form.encode('ASCII', 'ignore')
    sent_tokens = sent_tokenize(text_input)
    tokens = map(word_tokenize, sent_tokens)
    sent_tags = map(pos_tag, tokens)
    # SENT: any single adjective or noun token.
    grammar = r"""
    SENT: {<(J|N).*>}  # chunk sequences of proper nouns
    """
    cp = RegexpParser(grammar)
    ret_tokens = list()
    stemmer_snowball = SnowballStemmer('english')
    for sent in sent_tags:
        tree = cp.parse(sent)
        for subtree in tree.subtrees():
            if subtree.label() == 'SENT':
                t_tokenlist = [tpos[0].lower() for tpos in subtree.leaves()]
                t_tokens_stemsnowball = map(stemmer_snowball.stem, t_tokenlist)
                #t_token = "-".join(t_tokens_stemsnowball)
                #ret_tokens.append(t_token)
                ret_tokens.extend(t_tokens_stemsnowball)
            #if subtree.label() == 'V2V': print(subtree)
    #tokens_lower = [map(string.lower, sent) for sent in tokens]
    # Domain-specific stopwords (book-review corpus) plus sklearn's English set.
    stop_words = {'book', 'author', 'read', "'", 'character', ''}.union(ENGLISH_STOP_WORDS)
    tokens = [token for token in ret_tokens if token not in stop_words]
    return (tokens)
def _word_combination(self, pos_tagged_sentence):
    """Chunk entities: a noun (any NN* tag) followed by cardinal numbers.

    Still experimental (entity-finding in testing).

    Args:
        pos_tagged_sentence: list of (word, tag) tuples.

    Returns:
        nltk.Tree: parse tree with EN chunk subtrees.
    """
    entity_grammar = r"""
    EN: {<NN.*><CD>+}
    """
    parser = RegexpParser(entity_grammar)
    return parser.parse(pos_tagged_sentence)
def parse(query_text, networks_json):
    """Two-stage parse of a query: word-level grammar, then tag-level grammar.

    Stage 1 chunks the raw tokens (duplicated into (w, w) pairs so the word
    grammar can match on the "tag" slot) and normalizes dates; stage 2
    POS-tags the surviving tokens, overlays domain tags from networks_json,
    and chunks with the tag grammar. Slots are then assigned and rendered
    into an interpreted sentence.

    NOTE: Python-2 code (print statements, `zip(...)[0]` subscripting).
    """
    query_text = preprocess(query_text)
    tokens = word_tokenize(query_text)
    # Word grammar matches on the second tuple slot, so mirror each word.
    double_tokens = [ (w, w) for w in tokens ]
    wg = word_grammar()
    w_cp = RegexpParser(compile_grammar(wg))
    word_result = w_cp.parse(double_tokens)
    word_result = convert_dates(word_result)
    # Recover the flat token list from the (possibly restructured) tree.
    new_tokens = list(zip(*(word_result.leaves()))[0])
    tagged = pos_tag(new_tokens)
    # Overlay network/domain-specific tags onto the POS tags.
    domain_tagged = tag_domains(tagged, networks_json)
    tg = tag_grammar()
    t_cp = RegexpParser(compile_grammar(tg))
    tagged_result = t_cp.parse(domain_tagged)
    slots = assign_slots(new_tokens, tagged_result, word_result)
    interpreted_input = make_sentence(slots)
    print 'tagged-result = ',tagged_result
    print 'word-result = ',word_result
    return {"parse":slots, "interpreted":interpreted_input}
def test_tag_pattern2re_pattern_quantifier(self): """Test for bug https://github.com/nltk/nltk/issues/1597 Ensures that curly bracket quantifiers can be used inside a chunk rule. This type of quantifier has been used for the supplementary example in http://www.nltk.org/book/ch07.html#exploring-text-corpora. """ sent = [('The', 'AT'), ('September-October', 'NP'), ('term', 'NN'), ('jury', 'NN'), ('had', 'HVD'), ('been', 'BEN'), ('charged', 'VBN'), ('by', 'IN'), ('Fulton', 'NP-TL'), ('Superior', 'JJ-TL'), ('Court', 'NN-TL'), ('Judge', 'NN-TL'), ('Durwood', 'NP'), ('Pye', 'NP'), ('to', 'TO'), ('investigate', 'VB'), ('reports', 'NNS'), ('of', 'IN'), ('possible', 'JJ'), ('``', '``'), ('irregularities', 'NNS'), ("''", "''"), ('in', 'IN'), ('the', 'AT'), ('hard-fought', 'JJ'), ('primary', 'NN'), ('which', 'WDT'), ('was', 'BEDZ'), ('won', 'VBN'), ('by', 'IN'), ('Mayor-nominate', 'NN-TL'), ('Ivan', 'NP'), ('Allen', 'NP'), ('Jr.', 'NP'), ('.', '.')] # source: brown corpus cp = RegexpParser('CHUNK: {<N.*>{4,}}') tree = cp.parse(sent) assert tree.pformat() == """(S
def filter_sentences_by_chunk(pos_data, tokens):
    """Keep sentences whose CHUNK subtrees contain a verb from `tokens`.

    Args:
        pos_data: mapping of sentence_id -> POS-tagged token list
            (Python 2 dict; iterated with iteritems()).
        tokens: collection of verb words to look for.

    Returns:
        dict: sentence_id -> reconstructed sentence text, for sentences
        where some CHUNK leaf has a VER* tag and a word in `tokens`.
    """
    chunker = RegexpParser(CHUNKER_GRAMMAR)
    filtered = {}
    for sentence_id, tagged in pos_data.iteritems():
        parsed = chunker.parse(tagged)
        matched = False
        if 'CHUNK' in [node.label() for node in parsed.subtrees()]:
            for subtree in parsed.subtrees(lambda parsed: parsed.label() == 'CHUNK'):
                for word, pos in subtree.leaves():
                    # VER* marks verbs in this tagset.
                    if pos.find('VER') != -1 and word in tokens:
                        matched = True
        if matched:
            filtered[sentence_id] = ' '.join(item[0] for item in tagged)
    return filtered
def word_combination(self, pos_tagged_sentence):
    """Chunk a POS-tagged sentence into entities (adjectives + nouns).

    The EN rule matches zero or more adjectives of any kind followed by one
    or more nouns of any kind (Penn Treebank tags).

    Args:
        pos_tagged_sentence: list of (word, tag) tuples.

    Returns:
        nltk.Tree: parse tree containing EN chunk subtrees.
    """
    entity_grammar = r"""
    EN: {<JJ.*>*<NN.*>+}
    """
    parser = RegexpParser(entity_grammar)
    return parser.parse(pos_tagged_sentence)
def grammar_selection(self, grammar=None):
    """
    Select candidates using nltk RegexpParser with a grammar defining
    noun phrases (NP).

    Args:
        grammar (str): grammar defining POS patterns of NPs.
    """
    # initialize default grammar if none provided
    if grammar is None:
        grammar = r"""
            NBAR: {<NN.*|JJ>*<NN.*>}
            NP: {<NBAR>}
                {<NBAR><IN><NBAR>}
        """
    # initialize chunker
    chunker = RegexpParser(grammar)
    # running character offset of the current sentence within the document,
    # maintained incrementally instead of re-summing all previous sentence
    # lengths for every sentence (was O(n^2))
    shift = 0
    # loop through the sentences
    for i, sentence in enumerate(self.sentences):
        # convert sentence as list of (offset, pos) tuples
        tuples = [(str(j), sentence.pos[j]) for j in range(sentence.length)]
        # parse sentence
        tree = chunker.parse(tuples)
        # find candidates
        for subtree in tree.subtrees():
            if subtree.label() == 'NP':
                leaves = subtree.leaves()
                # get the first and last offset of the current candidate
                first = int(leaves[0][0])
                last = int(leaves[-1][0])
                # add the NP to the candidate container
                self.add_candidate(words=sentence.words[first:last + 1],
                                   stems=sentence.stems[first:last + 1],
                                   pos=sentence.pos[first:last + 1],
                                   offset=shift + first,
                                   sentence_id=i)
        shift += sentence.length
def chunkingList(dataS, chunkgram):
    """Return the 'Chunk' phrases found in ``dataS``.

    The data is stringified, tokenized, POS-tagged and parsed with the
    ``chunkgram`` grammar; the words of every subtree labelled 'Chunk' are
    joined into a single string.

    :param dataS: arbitrary data; str(dataS)[1:] is tokenized.
    :param chunkgram: RegexpParser grammar defining a 'Chunk' rule.
    :return: list of chunk phrase strings.
    """
    # NOTE(review): [1:] drops the first character of str(dataS) — presumably
    # to strip a leading bracket/quote from a stringified container; confirm.
    words = word_tokenize(str(dataS)[1:])
    tagged = pos_tag(words)
    chunkParser = RegexpParser(chunkgram)
    chunked = chunkParser.parse(tagged)
    docs = []
    # Traverse the parse tree directly: re-wrapping it as Tree('s', chunked)
    # produced an identical traversal and was removed.
    for subtree in chunked.subtrees(filter=lambda t: t.label() == 'Chunk'):
        # Assemble the chunk into one line
        docs.append(" ".join(word for (word, _tag) in subtree.leaves()))
    return docs
def word_combination(pos_tagged_sentence, tag_set='ptb'):
    """Chunking of a part of speech tagged sentence based on specific grammar.

    :param pos_tagged_sentence: list of (word, tag) pairs.
    :param tag_set: 'ptb' for the Penn Treebank tagset, or 'universal' for
        the Universal tagset.
    :return: nltk Tree containing EN (entity) chunks.
    :raises SyntaxError: if ``tag_set`` is not a supported value.
    """
    if tag_set == 'ptb':
        # Entity grammar used for the Penn Tree Bank Tagset
        grammar = r"""
        EN: {<JJ.*>*<NN.*>+}
        """
    elif tag_set == 'universal':
        # Entity grammar used for the Universal Tagset
        grammar = r"""
        EN: {<ADJ>*<NOUN>+}
        """
    else:
        # SyntaxError is kept for backward compatibility with existing
        # callers, but now explains what went wrong instead of being bare.
        raise SyntaxError(
            "unsupported tag_set: %r (expected 'ptb' or 'universal')" % (tag_set,))
    cp = RegexpParser(grammar)
    result = cp.parse(pos_tagged_sentence)
    return result
def relationships_of(string):
    """Extract family relationships from a bio's leading parenthetical.

    Relationship data is stored in a parenthetical immediately after the end
    of the </font> tag in the bio, e.g. "(son of Joseph Patrick Kennedy, II,
    and great-nephew of Edward Moore Kennedy and John Fitzgerald Kennedy)".

    :return: list of {"relation": ..., "name": ...} dicts (empty when no
        parenthetical is found).
    """
    pattern = r"^\((.*?)\)"  # raw string avoids invalid-escape warnings
    match = re.search(pattern, string, re.I)
    relationships = []
    if match and len(match.groups()) > 0:
        # Normalize to ASCII with '?' replacements, but keep a text string
        # (the bare .encode(...) returned bytes on Python 3, breaking
        # re.split below).
        relationship_text = match.group(1).encode("ascii", "replace").decode("ascii")
        # since some relationships refer to multiple people -- great-nephew of
        # Edward Moore Kennedy AND John Fitzgerald Kennedy -- we need a
        # special grammar
        from nltk import tree, pos_tag, RegexpParser
        tokens = re.split("[ ,;]+|-(?![0-9])", relationship_text)
        pos = pos_tag(tokens)
        grammar = r"""
            NAME: {<NNP>+}
            NAMES: { <IN><NAME>(?:<CC><NAME>)* }
            RELATIONSHIP: { <JJ|NN|RB|VB|VBD|VBN|IN|PRP\$>+ }
            MATCH: { <RELATIONSHIP><NAMES> }
        """
        cp = RegexpParser(grammar)
        chunks = cp.parse(pos)
        # iterate through the Relationship/Names pairs.
        # .label() replaces the .node attribute removed in nltk 3 (consistent
        # with the other chunkers in this file).
        for n in chunks:
            if isinstance(n, tree.Tree) and n.label() == "MATCH":
                people = []
                relationship = None
                for piece in n:
                    if piece.label() == "RELATIONSHIP":
                        relationship = " ".join(x[0] for x in piece)
                    elif piece.label() == "NAMES":
                        for name in (x for x in piece if isinstance(x, tree.Tree)):
                            people.append(" ".join(x[0] for x in name))
                for person in people:
                    relationships.append({"relation": relationship,
                                          "name": person})
    return relationships
def parse_sent(self, pos_tagged_sentence, grammar=None):
    """Extract multi-word chunks from POS-tagged sentences.

    :param pos_tagged_sentence: iterable of POS-tagged sentences (each a
        list of (word, tag) pairs).
    :param grammar: optional single RegexpParser grammar. When None, every
        grammar in the module-level GRAMMARS list is applied. (Previously
        this argument was accepted but silently shadowed by the loop
        variable and therefore ignored.)
    :return: list of leaf lists ((word, tag) pairs) for chunks of >= 2 words.
    """
    from nltk.tree import Tree
    grammars = [grammar] if grammar else GRAMMARS
    parsed_tagged_sents = []
    for g in grammars:
        cp = RegexpParser(g)
        for sentence in pos_tagged_sentence:
            result = cp.parse(sentence)
            for node in result:
                # isinstance replaces the brittle str(type(node)) comparison
                if isinstance(node, Tree):
                    phrase = ' '.join(word for word, POS in node.leaves())
                    # keep only multi-word chunks
                    # NOTE(review): assumed the append was guarded by this
                    # length check, as in the original — confirm.
                    if len(phrase.split()) >= 2:
                        parsed_tagged_sents.append(node.leaves())
    return parsed_tagged_sents
class Chunker: def __init__(self): grammar = r""" NP: {<DT>?<JJ.*|CD>*<NN.*>+} NP: {<NP><of><NP>} # need to change tags of "of" to <of>!! NP: {<NP><in><NP>} # need to change tags of "of" to <of>!! """ self.parser = RegexpParser(grammar) def parse(self, sent): """ sent should be a list of tuples of word and tag """ for i, (word, pos) in enumerate(sent): if word == 'of' or word == 'in': sent[i] = (word, word) return self.parser.parse(sent) def print_chunks(self, tree, label): for node in tree: if type(node) == Tree and node.node == label: print node.leaves() def get_chunks(self, tree, label): """ return a list of ranges (tuples) marking the start and end index of the chunk """ offset = 0 chunks = [] for node in tree: if type(node) == Tree and node.node == label: phrase_size = len(node.leaves()) chunks.append((offset, offset + phrase_size - 1)) offset += phrase_size else: offset += 1 return chunks
def compare(sentence, grammar):
    """
    Compare sentence against a grammar rule to see if any matches are found

    Paramaters
    ----------
    sentence: str
        a single sentence for which matches are to be found
    grammar: str
        grammar rule in regexp format

    Returns
    -------
    matches: nltk.tree.Tree
        all matches with the grammar rule
    """
    # Apply the grammar rule to the tagged sentence
    parser = RegexpParser(grammar)
    parsed = parser.parse(sentence)
    # The rule's label is everything before the first colon
    target_label = grammar.split(':')[0]
    matches = [node for node in parsed
               if isinstance(node, nltk.tree.Tree)
               and node.label() == target_label]
    # Callers expect the literal string 'None' when nothing matched
    if not matches:
        matches.append('None')
    return matches
#term_tokenizer = PunktWordTokenizer() sys.stdin = codecs.getreader(locale.getpreferredencoding())(sys.stdin) sys.stdout = codecs.getwriter(locale.getpreferredencoding())(sys.stdout) sys.stderr = codecs.getwriter(locale.getpreferredencoding())(sys.stderr) def remove_trailing_period(token): return re.sub(r'\.$', '', token) for line in sys.stdin.readlines(): try: post_id, text = string.split(line.rstrip(), "\t") #print "post_id", post_id, "text", text for sentence in sent_tokenizer.tokenize(text): tokens = term_tokenizer.tokenize(sentence) tokens_without_trailing_periods = map(remove_trailing_period, tokens) # required for PunktWordTokenizer tagged = pos_tag(tokens_without_trailing_periods) #print "post_id",post_id,"sentence",sentence,"tagged",tagged if not len(tagged)==0: #print "tagged", tagged parse_tree = chunk_parser.parse(tagged) for subtree in parse_tree.subtrees(): if subtree.node == 'NP': phrase = subtree.leaves() phrase = " ".join([ term for (term,type) in phrase ]).strip() if len(phrase) > 1: # WordPunctTokenizer leaks in stuff like ',' and ' ' print "\t".join([post_id,phrase]) except: sys.stderr.write("error! on line ["+line+"]\n")
def parse_features(self, review):
    """Chunk a tagged review using this extractor's configured grammar."""
    parser = RegexpParser(self.grammar)
    parsed_review = parser.parse(review)
    return parsed_review
""") """ NBAR: {<NN.*|JJ>*<NN.*>} # Nouns and Adjectives, terminated with Nouns NP: {<NBAR><IN><NBAR>} # Above, connected with in/of/etc... """ #sentimentanalysis("su") print 'asadasdasdasdsadadsa' gs = goslate.Goslate() palabra=gs.translate('sido ', 'en') print palabra print 'asdasdsadasdadsadadsadsa' for sentence in sent_tokenize(texto): tags=tagear(sentence) #tags=tagear(traducir(word_tokenize(sentence))) #print tags parsed = chunker.parse(tags) print parsed """for chunk in parsed: #print chunk #if hasattr(chunk, 'node'): # print chunk.node if hasattr(chunk, 'node'): print chunk #entities.extend([chunk for chunk in chunks if hasattr(chunk, 'node')])"""
class Extraction:
    """This class is used to extract nouns, proper nouns, phrases from text"""

    def __init__(self, word_tokenize=None, sent_tokenize=None, pos_tag=None,
                 stop_words=None, punct=None, grammar=chunk_grammar_propernouns):
        # Each collaborator can be injected (e.g. for testing); defaults come
        # from NLTK's standard tokenizers/tagger/resources.
        self._word_tokenize = word_tokenize if word_tokenize else nltk.word_tokenize
        self._sent_tokenize = sent_tokenize if sent_tokenize else nltk.sent_tokenize
        self._pos_tag = pos_tag if pos_tag else nltk.pos_tag
        self._stop_words = stop_words if stop_words else set(nltk.corpus.stopwords.words('english'))
        self._punct = punct if punct else set(string.punctuation)
        self._chunk_grammar = grammar
        self._chunker = RegexpParser(self._chunk_grammar)

    def extract_chunks_sent(self, sent):
        """
        Extract chunk phrases from a sentence.

        :param sent: a sentence level text.
        :return: chunk phrases
        """
        tags = self._pos_tag(self._word_tokenize(sent))
        chunks = nltk.chunk.tree2conlltags(self._chunker.parse(tags))
        # join constituent chunk words into a single chunked phrase.
        # Group consecutive IOB triples by "inside a chunk" (tag != 'O');
        # indexing replaces the Python-2-only tuple-parameter lambda
        # `lambda (word, pos, chunk): ...`, which is a syntax error on py3.
        return [' '.join(word for word, pos, chunk in group)
                for key, group in itertools.groupby(chunks, lambda wpc: wpc[2] != 'O')
                if key]

    def extract_chunks_doc(self, text):
        """
        Extract chunk phrases from a document.

        :param text: a document level text
        :return: chunk phrases
        """
        sents = self._sent_tokenize(text)
        sents = [s for s in sents if s]
        return list(itertools.chain.from_iterable(map(self.extract_chunks_sent, sents)))

    def extract_words_sent(self, sent, good_tags=set(['NN', 'NNS'])):
        """
        Extract desired words from a sentence.

        :param sent: a sentence level text
        :param good_tags: desired word tags
        :return: unique words carrying one of the desired tags
        """
        tagged_words = self._pos_tag(self._word_tokenize(sent))
        # keep nouns that are not stopwords and not pure punctuation
        words = [word for word, tag in tagged_words
                 if tag in good_tags
                 and word.lower() not in self._stop_words
                 and not all(char in self._punct for char in word)]
        return list(set(words))

    def extract_words_doc(self, text, good_tags=set(['NN', 'NNS'])):
        """
        Extract desired words from a document.

        :param text: a document level text
        :param good_tags: desired word tags
        :return: unique words carrying one of the desired tags
        """
        sents = self._sent_tokenize(text)
        sents = [s for s in sents if s]
        func_extract = lambda x: self.extract_words_sent(x, good_tags)
        words = list(itertools.chain.from_iterable(map(func_extract, sents)))
        return list(set(words))
class NPExtractor(Persistent):
    """Noun-phrase term extractor backed by an NLTK regexp chunker."""
    implements(ITermExtractor)

    def __init__(self):
        """Wire up the tokenizer, POS tagger and the NP grammar."""
        self.filter = DefaultFilter()
        # Tokenizer and tagger are resolved as named Zope utilities.
        self.tokenizer = getUtility(ITokenizer, name="collective.classification.tokenizers.NLTKTokenizer")
        self.tagger = getUtility(IPOSTagger, name="collective.classification.taggers.PennTreebankTagger")
        self.tagger_metadata = {'type': 'Pen TreeBank', 'categories': []}
        self.np_grammar = r"""
            NP: {<JJ>*<NN>} # chunk determiners, adjectives and nouns
                {<NNP>+}    # chunk proper nouns
        """
        self.np_finder = RegexpParser(self.np_grammar)

    def _add(self, norm, terms):
        # Increment the occurrence count for `norm` in the `terms` dict.
        terms.setdefault(norm, 0)
        terms[norm] += 1

    @ram.cache(_extractor_cachekey)
    def extract(self, text):
        """Return (terms, np_terms): single-noun counts and noun-phrase counts."""
        tokens = self.tokenizer.tokenize(text)
        tagged_terms = self.tagger.tag(tokens)
        terms = {}
        np_terms = {}
        # Top-level Tree nodes are NP chunks; bare tuples are unchunked tokens.
        noun_phrases = [node
                        for node in self.np_finder.parse(tagged_terms)
                        if not isinstance(node, tuple)]
        for node in noun_phrases:
            coll_tag = tree2conlltags(node)
            if len(coll_tag) > 1:
                # Multi-word phrase: record the lowercased joined form.
                mterm = [term.lower() for (term, tag, temp) in coll_tag if len(term) > 1]
                mterm = ' '.join(mterm)
                self._add(mterm, np_terms)
            for (term, tag, temp) in coll_tag:
                # Count individual nouns too, singularizing plurals.
                if tag.startswith('N') and len(term) > 1:
                    if tag in ['NNS', 'NNPS']:
                        term = singularize(term)
                    self._add(term.lower(), terms)
        # Drop terms rejected by the frequency/stopword filter.
        # NOTE(review): Python 2 idiom — keys() returns a list there, so
        # deleting while iterating is safe; py3 would need list(terms).
        for term in terms.keys():
            if not self.filter(term, terms[term]):
                del terms[term]
        for term in np_terms.keys():
            if not self.filter(term, np_terms[term]):
                del np_terms[term]
        return (terms, np_terms)

    def setTagger(self, tagger, tagger_metadata={}):
        # Replace the tagger; record its metadata (or mark it unknown).
        # NOTE(review): mutable default is shared across calls, but it is
        # only read here, never mutated.
        self.tagger = tagger
        if not tagger_metadata:
            self.tagger_metadata['type'] = 'unknown'
        else:
            self.tagger_metadata = tagger_metadata
class CopyrightDetector(object):
    """
    Class to detect copyrights and authorship.
    """
    def __init__(self):
        # Deferred imports keep nltk off the module import path.
        from nltk import RegexpTagger
        from nltk import RegexpParser
        # `patterns`, `grammar` and COPYRIGHT_TRACE are module-level
        # definitions elsewhere in this file.
        self.tagger = RegexpTagger(patterns)
        self.chunker = RegexpParser(grammar, trace=COPYRIGHT_TRACE)

    @staticmethod
    def as_str(node):
        """
        Return a parse tree node as a space-normalized string.
        """
        node_string = ' '.join(k for k, _ in node.leaves())
        return u' '.join(node_string.split())

    def detect(self, numbered_lines):
        """
        Return a tuple (copyrights, authors, years, holders, start_line,
        end_line) detected in a sequence of (line_number, line) tuples.
        """
        from nltk.tree import Tree
        numbered_lines = list(numbered_lines)
        numbers = [n for n, _l in numbered_lines]
        start_line = min(numbers)
        end_line = max(numbers)
        # logger.debug('CopyrightDetector:detect:lines numbers: %(start_line)d->%(end_line)d' % locals())
        tokens = self.get_tokens(numbered_lines)
        # we accumulate detected items in these synchronized lists
        # this could be a single list of namedtuples
        # or a list of dicts instead
        copyrights, authors, years, holders = [], [], [], []
        if not tokens:
            return copyrights, authors, years, holders, None, None
        # OPTIMIZED: bind the append methods once, outside the tree walk
        copyrights_append = copyrights.append
        authors_append = authors.append
        years_append = years.append
        holders_append = holders.append
        # first, POS tag each token using token regexes
        tagged_text = self.tagger.tag(tokens)
        logger.debug('CopyrightDetector:tagged_text: ' + str(tagged_text))
        # then build a parse tree based on tagged tokens
        tree = self.chunker.parse(tagged_text)
        logger.debug('CopyrightDetector:parse tree: ' + str(tree))

        CopyrightDetector_as_str = CopyrightDetector.as_str

        def collect_year_and_holder(detected_copyright):
            """
            Walk the parse sub-tree starting with the `detected_copyright`
            node, collecting all years and holders.
            """
            for copyr in detected_copyright:
                if isinstance(copyr, Tree):
                    # logger.debug('n: ' + str(copyr))
                    node_text = CopyrightDetector_as_str(copyr)
                    copyr_label = copyr.label()
                    if 'YR-RANGE' in copyr_label:
                        years_append(refine_date(node_text))
                    elif 'NAME' == copyr_label or 'COMPANY' in copyr_label:
                        # FIXME : this would wreck things like 23andme
                        # where a company name contains numbers
                        holders_append(refine_author(node_text))
                        # logger.debug('CopyrightDetector: node_text: ' + node_text)
                    else:
                        # recurse into nested structures
                        collect_year_and_holder(copyr)

        # then walk the parse tree, collecting copyrights, years and authors
        for tree_node in tree:
            if isinstance(tree_node, Tree):
                node_text = CopyrightDetector_as_str(tree_node)
                tree_node_label = tree_node.label()
                if 'COPYRIGHT' in tree_node_label:
                    if node_text and node_text.strip():
                        refined = refine_copyright(node_text)
                        # NOTE(review): assumed years/holders are collected
                        # only for non-junk copyrights (indentation was
                        # ambiguous in the reviewed copy) — confirm.
                        if not is_junk(refined):
                            copyrights_append(refined)
                            collect_year_and_holder(tree_node)
                elif tree_node_label == 'AUTHOR':
                    authors_append(refine_author(node_text))
        return copyrights, authors, years, holders, start_line, end_line

    def get_tokens(self, numbered_lines):
        """
        Return an iterable of tokens from lines of text.
        """
        tokens = []
        tokens_append = tokens.append
        # simple tokenization: spaces and some punctuation
        splitter = re.compile('[\\t =;]+').split
        for _line_number, line in numbered_lines:
            line = line.strip()
            if line:
                line = prepare_text_line(line)
            if line:
                line = strip_markup(line)
            if line and line.strip():
                for tok in splitter(line):
                    # strip trailing quotes and ignore empties
                    tok = tok.strip("' ")
                    if not tok:
                        continue
                    # strip trailing colons: why?
                    tok = tok.rstrip(':').strip()
                    # strip leading @: : why?
                    tok = tok.lstrip('@').strip()
                    if tok and tok not in (':',):
                        tokens_append(tok)
        logger.debug('CopyrightDetector:tokens: ' + repr(tokens))
        return tokens
class GrammarExtractor(SentenceExtractor):
    """ Grammar-based extraction strategy: pick sentences that comply
        with a pre-defined grammar.
    """
    splitter = None
    parser = None

    # Grammars rely on POS labels, which are language-dependent
    grammars = {
        'en': r"""
                NOPH: {<PDT>?<DT|PP.*|>?<CD>?<JJ.*|VVN>*<N.+|FW>+<CC>?}
                CHUNK: {<NOPH>+<MD>?<V.+>+<IN|TO>?<NOPH>+}
              """,
        'it': r"""
                SN: {<PRO.*|DET.*|>?<ADJ>*<NUM>?<NOM|NPR>+<NUM>?<ADJ|VER:pper>*}
                CHUNK: {<SN><VER.*>+<SN>}
              """,
    }

    def setup_extractor(self):
        """Build the sentence splitter and the grammar parser for the
        configured language, and normalize lexical-unit tokens."""
        self.splitter = PunktSentenceSplitter(self.language)
        grammar = self.grammars.get(self.language)
        if grammar:
            self.parser = RegexpParser(grammar)
        else:
            raise ValueError(
                "Invalid or unsupported language: '%s'. Please use one of the currently supported ones: %s" % (
                    self.language, self.grammars.keys())
            )
        # Lowercase every match token once, up front. items() replaces the
        # Python-2-only iteritems(); reassigning values while iterating is
        # safe since the key set is unchanged.
        for lemma, match_tokens in self.lemma_to_token.items():
            self.lemma_to_token[lemma] = set([match.lower() for match in match_tokens])

    def extract_from_item(self, item):
        """Return (item, extracted) for grammar-matching sub-sentences that
        contain a known lexical-unit verb; None when nothing is extracted."""
        extracted = []
        url = item.get('url')
        if not url:
            logger.warning('skipping item without url')  # warn() is deprecated
            return
        document = item.get(self.document_key)
        if not document:
            return
        elif isinstance(document, list):
            document = '\n'.join(document)

        # Sentence splitting
        sentences = self.splitter.split(document)
        for sentence in sentences:
            tagged = [(token, pos) for token, pos, lemma in self.tagger.tag_one(sentence)]
            # Parsing via grammar
            parsed = self.parser.parse(tagged)
            # Loop over sub-sentences that match the grammar
            for grammar_match in parsed.subtrees(lambda t: t.label() == 'CHUNK'):
                logger.debug("Grammar match: '%s'" % grammar_match)
                # Look up the LU
                for token, pos in grammar_match.leaves():
                    # Restrict match to sub-sentence verbs only
                    if pos.startswith('V'):
                        for lemma, match_tokens in self.lemma_to_token.items():
                            if token.lower() in match_tokens:
                                # Return joined chunks only
                                # TODO test with full sentence as well
                                # TODO re-constitute original text (now join on space)
                                text = ' '.join([leaf[0] for leaf in grammar_match.leaves()])
                                logger.debug("Sentence token '%s' is in matches %s" % (token, match_tokens))
                                logger.debug("Extracted sentence: %s" % text)
                                extracted.append({
                                    'lu': lemma,
                                    'text': text,
                                    'tagged': tagged,
                                    'url': url,
                                })
        if extracted:
            logger.debug("%d sentences extracted. Removing the full text from the item ...", len(extracted))
            item.pop(self.document_key)
            return item, extracted
        else:
            logger.debug("No sentences extracted. Skipping the whole item ...")
def InfoExtractor(text): ### Regex Expressions ### ######################### regex_email = re.compile(r'([a-zA-Z0-9._-]+@[a-zA-z0-9._-]+\.[^\s]*)',re.IGNORECASE | re.UNICODE) regex_phone = re.compile(r'(\d+[\-\+\(]?\d+[\)\-\s]?\d+[\-\s]?\d+)', re.UNICODE) regex_DOB = re.compile(r'([0-3]?[0-9](?:\.|\/|\-|\s)[0-3]?(?:[0-9]|' + r'(?:Feb|Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|January' + r'|February|March|April|May|June|July|August|September|October|' + r'November|December))(?:\.|\/|\-|\s)(?:[0-9]{2})?[0-9]{2})',re.IGNORECASE | re.UNICODE) #regex_phone = re.compile(r'\s*(?:\+?(\d{1,3}))?([-. (]*(\d{3})[-. )]*)?((\d{3})[-. ]*(\d{2,4})(?:[-.x ]*(\d+))?)$',re.IGNORECASE | re.UNICODE) info = dict() regex = { 'email':regex_email, 'phone':regex_phone, 'DOB':regex_DOB } for exp in regex.keys(): info[exp] = regex[exp].findall(text) #Filtering phone numbers info['phone'] = [x for x in info['phone'] if len(x)>5] print text ### Sent Tokenize ### ###################### sent = sent_tokenize(text.decode("utf8")) print sent print ### Word Tokenize ### ##################### sent = [ word_tokenize(word) for word in sent ] #print words sent = [pos_tag(word) for word in sent] #print sent[0] #print sent #print sent[1] grammar = "NP: {<DT>?<JJ>*<NN>}" cp = RegexpParser(grammar) result = cp.parse(sent[0]) #print result #result.draw() #print sent ''' raw_tuples = sent[0].split('\n') for line in raw_tuples: try: key, value = line.split('\t') print key, value print except: pass ''' #return None return info
# (continuation of the example POS-tagged output shown above:)
# ('of', 'IN'),
# ('submarines', 'NNS'),
# ('.', '.')],
# .... ]

# Chunker matching noun groups optionally linked by prepositions.
matcheur = RegexpParser(
    """
truc: {<JJ.*>*<NN.*>+(<P|IN> <JJ.*>*<NN.*>+)*}
"""
)

# Parse every tagged sentence, collecting the recognized trees.
recog_trees = [matcheur.parse(s) for s in pos_sents]
del pos_sents  # tagged sentences are no longer needed; free the memory
t4 = datetime.now()

# Example input/output:
# [('We', 'PRP'), ('all', 'DT'), ('live', 'VBP'), ('in', 'IN'), ('a', 'DT'), ('yellow', 'JJ'), ('submarine', 'NN'), ('.', '.')]
# (S
#   We/PRP
#   all/DT
#   live/VBP
#   in/IN
#   a/DT
#   (truc yellow/JJ submarine/NN)
#   ./.)