def output():
    if request.method == 'POST':
        result = request.form['inputdata']
        result = result.lower()
        # convert the paragraph into tokens for parts-of-speech tagging
        words = word_tokenize(result)
        allwords = []
        for word in words:
            if word not in allwords:
                allwords.append(word)
        ##lm = WordNetLemmatizer()
        rootwords = []
        ##for word in allwords:
        ##    rootwords.append(lm.lemmatize(word))
        # tag all words with their parts of speech
        taggedwords = pos_tag(allwords)
        # chunk everything, then chink out the favorable parts of speech
        # (nouns, adjectives, verbs and adverbs) so they stay at the top level
        chunkString = """Junk*1234: {<.*>+}
                                    }<NN.*|JJ|VB.*|RB>+{"""
        chunkParse = RegexpParser(chunkString)
        chunkedwords = chunkParse.parse(taggedwords)
        impwords = []
        # convert the chunked-words tree into a list of favorable words
        for words in chunkedwords:
            impwords.append(str(words))
        keyvalue = []
        for word in impwords:
            if 'Junk*1234' not in word:
                keyvalue.append(word[1:].split(",")[0])
        return render_template("output.html", keyvalues=keyvalue, result=result)
def chunkingTweet(filtrd_tweet):
    trees = []
    # grammar = """
    #           ADJN:{<JJ><N.*>*}
    #           VBN:{(<VB>|<VBG>|<VBD>|<VBN>|<VBP>|<VBZ>)(<NN>|<NNP>)}
    #           AVBN:{(<RB>|<RBR>|<RBS>)(<NN>|<NNP>)}
    #           VBAVB:{(<VB>|<VBG>|<VBD>|<VBN>|<VBP>)(<RB>|<RBR>|<RBS>)}
    #           MDVB:{<MD><.+>(<VB>|<VBG>|<VBD>|<VBN>|<VBP>)}
    #           """
    grammar = """
              ADJN:{<JJ><N.*>*}
              VBN:{(<VB>|<VBG>|<VBD>|<VBN>|<VBP>|<VBZ>)(<NN>|<NNP>)}
              AVBN:{(<RB>|<RBR>|<RBS>)(<NN>|<NNP>)}
              MDVB:{<MD><.+>(<VB>|<VBG>|<VBD>|<VBN>|<VBP>)}
              """
    chunkParser = REPARSE(grammar)
    rootTree = chunkParser.parse(filtrd_tweet)
    rootTree.draw()
    for tree in rootTree:
        if isinstance(tree, TREE.Tree):
            trees.append(tree)
    return trees
def extraxt_semantic_chuncks(self, pos_tags):
    """
    Extract chunks of text from the paper, taking advantage of the parts of
    speech previously extracted. It uses the grammar defined in GRAMMAR.

    Returns:
        chunks (list): list of all chunks of text
    """
    grammar_parser = RegexpParser(GRAMMAR)

    chunks = list()
    pos_tags_with_grammar = grammar_parser.parse(pos_tags)
    #print(pos_tags_with_grammar)
    for node in pos_tags_with_grammar:
        if isinstance(node, tree.Tree) and node.label() == 'DBW_CONCEPT':
            # the node matches our grammar
            chunk = ''
            for leaf in node.leaves():
                concept_chunk = leaf[0]
                concept_chunk = re.sub(
                    '[\=\,\…\’\'\+\-\–\“\”\"\/\‘\[\]\®\™\%]', ' ',
                    concept_chunk)
                concept_chunk = re.sub('\.$|^\.', '', concept_chunk)
                concept_chunk = concept_chunk.lower().strip()
                chunk += ' ' + concept_chunk
            chunk = re.sub('\.+', '.', chunk)
            chunk = re.sub('\s+', ' ', chunk)
            chunks.append(chunk)
    return chunks
def extract_entities(self, text, grammar=None, lang=None):
    """ Extract entities from text """
    entities = []
    if lang is None:
        lang = WORLD_2_NLTK[self.detect_language(text)]
    else:
        if lang in WORLD_2_NLTK.keys():
            lang = WORLD_2_NLTK[lang]
        else:
            lang = self._lang
    if lang == 'japanese':
        return JAParser().extract_entities(text), lang
    pos_sentences = [
        pos_tag(self.word_tokenize(sentence, lang=lang))
        for sentence in self.sent_tokenize(text, lang=lang)
    ]
    if grammar is not None:
        chunker = RegexpParser(grammar)
        for pos_sentence in pos_sentences:
            tree = chunker.parse(pos_sentence)
            self.logger.debug(tree)
            entities = entities + self._select_entities(tree)
    else:
        for pos_sentence in pos_sentences:
            tree = ne_chunk(pos_sentence, binary=False)
            self.logger.debug(tree)
            entities = entities + self._select_entities(tree)
    return entities, lang
def findNounPhrases(toRead, category):
    words = tagWords(toRead)
    # create the output file
    newFile = open(category.strip() + 'NounPhrases.txt', 'a')
    # ?  means optional (include it if it exists)
    # *  means include it, however many times it appears
    # +  means at least one
    patterns = r"""
        NP: {<VBD><N.*>+}
            {<VBG><N.*>+}
            {<VBN><N.*>+}
        """
    chunker = RegexpParser(patterns)
    tree = chunker.parse(words)
    # subtree.leaves() returns a list
    for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
        newFile.write("\n")
        newFile.write("\n")
        for leaf in subtree.leaves():
            newFile.write(str(leaf))
    newFile.close()
def extract_NP(posTagged):
    grammar = r"""
        ADJ: {<RB.*>? <JJ.* | VBG>}
        ADJLIST: {<ADJ> (<CC>? <,>? <ADJ>)*}
        ADJNOUN: {<ADJLIST>? <NN.*>+}
        PREFIXEDNOUN: {<DT|PRP\$>? (<ADJNOUN> <POS>)* <ADJNOUN>}
        PP: {<IN><PREFIXEDNOUN>}
        NP: {<PREFIXEDNOUN> (<PP>)*}
            {<PRP>}
        """
    chunker = RegexpParser(grammar)
    ne = []
    chunk = chunker.parse(posTagged)
    for tree in chunk.subtrees(filter=lambda t: t.label() == 'NP'):
        ne.append(' '.join([child[0] for child in tree.leaves()]))
    return ne
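# A minimal usage sketch for extract_NP above. It assumes nltk plus its
# 'punkt' and 'averaged_perceptron_tagger' data are installed; the sample
# sentence is an illustrative placeholder, not taken from the original project.
from nltk import pos_tag, word_tokenize

tagged = pos_tag(word_tokenize("The quick brown fox jumped over the lazy dog."))
noun_phrases = extract_NP(tagged)
print(noun_phrases)  # e.g. ['The quick brown fox'] (exact output depends on the tagger)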
def GetNounPhrase(sentence):
    #print('GetNounPhrase is called')
    output = ''
    # Parse either Proper Noun Singular or Noun because RegexpParser is inaccurate at times
    grammar = 'NP: {<DT>?<JJ>*<NN.*>+}'
    # Create the parser object
    cp = RegexpParser(grammar)
    # Tokenize the input and get parts of speech
    pos = pos_tag(word_tokenize(sentence))
    result = cp.parse(pos)
    # for debugging
    #result.draw()
    #print(result)
    # Loop through the tree data structure and pull the values under the NP
    # node we created for the result
    for subtree in result.subtrees(filter=lambda t: t.label() == 'NP'):
        output = ' '.join(item[0] for item in subtree.leaves())  # 'abc\nghi\nmno'
    return output
def convert_to_noun(sen):
    sen = ie_preprocess(sen)
    grammar = r"""
        NP: {<DT|PP\$>?<JJ>*<NN>}   # chunk determiner/possessive, adjectives and noun
            {<NNP>+}                # chunk sequences of proper nouns
        """
    cp = RegexpParser(grammar)
    res = cp.parse(sen[0])
    print(res)
    ROOT = 'ROOT'
    tree = res
    output = []

    def getNodes(parent):
        for node in parent:
            if type(node) is Tree:
                print("Label:", node.label())
                print("Leaves:", node.leaves())
                if node.leaves()[0][1] in ("NN", "JJ"):
                    if node.leaves()[0][0] not in output:
                        output.append(node.leaves()[0][0])
                        print(node.leaves()[0][0])
                getNodes(node)
            else:
                print("Word:", node)
                if node[1] in ("NN", "JJ"):
                    if node[0] not in output:
                        output.append(node[0])

    getNodes(tree)
    print(output)
    return " ".join(output)
def identify_map_verbs_with_negation(nlp, sentences):
    """Errors happen when `sentences` is a list; hint: extract the string from the list."""
    sentences = sent_tokenize(sentences)
    grammar = r"""
        GR : {<RB>*<VB|VBN|JJ|VBG|VBZ|VBP|VBD>+<IN|RP>*}
        """
    # GR : {<RB>*<VB|VBN|JJ|VBG|VBZ|VBP>+<IN|RP>*}
    map_part_verb_and_negation = []
    for sent in sentences:
        # sample = sent.split('=')
        words = word_tokenize(sent)
        tagged = pos_tag(words)
        cp = RegexpParser(grammar)
        t = cp.parse(tagged)
        # t.draw()
        negate = ''
        verb = ''
        verbs = []
        for s in t.subtrees():
            is_phrasal = False
            if s.label() == "GR":
                for token in s.leaves():
                    if token[0] in ('is', 'are', 'does', 'do'):
                        continue
                    elif token[1] == 'RB':
                        negate = token[0]
                    elif token[0] != "=":
                        verb = verb + " " + token[0]
                verb = InputsOutputsStateFinder.phrasal_verb_verifier_or_verb_part_extractor(
                    nlp, verb)
                verbs.append([negate, verb])
        map_part_verb_and_negation.append(verbs)
    return map_part_verb_and_negation.pop()
def drawNamedEntityTree(self, text):
    tokenized_text = tokenizer.tokenize(text)
    tagged_text = self.tagWords(tokenized_text)
    grammar = "ENT: {<PESSOA>*}"
    cp = RegexpParser(grammar)
    res = cp.parse(tagged_text)
    res.draw()
def tokenise_subjects(subject):
    if subject == '' or subject is None:
        return []
    split_subjects = []
    phrase_pattern = 'CHUNK:{<JJ>*<NN.?>*<VBG>*}'
    phrase_chunker = RegexpParser(phrase_pattern)
    for s in subject.split(','):
        tokens = word_tokenize(s.strip().lower())
        tags = pos_tag(tokens)
        phrases = [
            ' '.join([leaf[0] for leaf in c.leaves()])
            for c in phrase_chunker.parse(tags)
            if hasattr(c, 'label') and c.label() == 'CHUNK'
        ]
        for phrase in phrases:
            phrase_tokens = word_tokenize(phrase)
            phrase_tags = pos_tag(phrase_tokens)
            lemmatised_phrase = []
            for pto, pta in phrase_tags:
                wn_tag = {
                    'n': wn.NOUN,
                    'j': wn.ADJ,
                    'v': wn.VERB,
                    'r': wn.ADV
                }.get(pta[0].lower(), None)
                if wn_tag is None:
                    continue
                lemmatised = WordNetLemmatizer().lemmatize(pto, wn_tag)
                lemmatised_phrase.append(lemmatised)
            if len(lemmatised_phrase) > 0:
                lemmatised_phrase = ' '.join(lemmatised_phrase)
                split_subjects.append(lemmatised_phrase)
    return list(set(split_subjects))
def pos_tagging(result):
    pos_tagged_words = []
    for tf_idf_info in result:
        tf_idf_info["pos"] = morph.parse(tf_idf_info["normal_form"])[0].tag.POS
        if tf_idf_info["pos"] is not None:
            pos_tagged_words.append(
                (tf_idf_info["normal_form"], tf_idf_info["pos"]))

    # ToDo: Add reg exps for numeric
    patterns = """
        many adj+noun:{<ADJF>+<NOUN>}
        noun+many adj:{<NOUN><ADJF>+}
        verb + noun:{<INFN><NOUN>+}
        verb + verb:{<INFN><INFN>}
        prep + verb/noun:{<PRCL>(<INFN>|<NOUN>)}
        verb + prep + verb?:{<INFN><PRCL><INFN>?}
        conj + verb/verb + conj:{(<INFN><CONJ>)|(<CONJ><INFN>)?}
        """
    chunker = RegexpParser(patterns)
    tree = chunker.parse(pos_tagged_words)
    for subtree in tree.subtrees():
        if subtree.label() == "S":
            continue
        # highlight all words in the collocation if one of them was already highlighted
        # TODO: Iterate through all elements of subtree (it might be > 2)
        term1, term2 = subtree[0][0], subtree[1][0]
        tf_idf_info1, tf_idf_info2 = next(x for x in result if x["normal_form"] == term1), \
                                     next(x for x in result if x["normal_form"] == term2)
        if tf_idf_info1["highlight"] or tf_idf_info2["highlight"]:
            tf_idf_info1["highlight"], tf_idf_info2["highlight"] = True, True
def nounphrase2(abstract):
    ab = abstract.lower()
    ab_words = [word_tokenize(s) for s in sent_tokenize(ab)]
    pos_tagged = []
    for sent in ab_words:
        pos_tagged.append(pos_tag(sent))
    chunk_grammar = "NP: {<VB.*><DT>?<JJ>*<NN><RB.?>?}"
    #chunk_grammar = "NP: {<DT>?<JJ>*<NN>}"
    #chunk_grammar = "NP: {<DT>?<JJ>*<NN><VB.*><RB.?>?}"
    chunk_parser = RegexpParser(chunk_grammar)
    np_chunked = []
    for sentence in pos_tagged:
        np_chunked.append(chunk_parser.parse(sentence))
    most_com = np_chunk_counter(np_chunked)
    truelab = lambda row: row.true_label4 if row.Abstract == abstract else None
    p = df.apply(truelab, axis=1)
    if p.dropna().values[0] == 1.0:
        print('\n')
        print(p.dropna())
        print(most_com)
        print('\n')
    count = 0  # note: count is never updated below, so the final branch always returns 2
    # print(most_com)
    # print('\n')
    # print("count2", count)
    if count >= 2:
        return 1
    elif count == 1:
        return 0
    elif count <= 0:
        return 2
def GetName(sentence):
    # Parse either Proper Noun Singular or Noun because RegexpParser is inaccurate at times
    grammar = 'NAME: {<NNP>*|<NN?>*}'
    # Create the parser object
    cp = RegexpParser(grammar)
    common_words = {'hi', 'name', 'hello', 'thank', 'you', 'i', 'am', 'oh',
                    'hey', 'sure', 'yes', 'named', 'known'}
    # Tokenize the input
    word_tokens = word_tokenize(sentence)
    # Eliminate the greeting words and get straight to discerning the name as NNP or NN
    word_tokens = [x for x in word_tokens if x.lower() not in common_words]
    # Obtain parts of speech for each token and run them through the parser
    pos = pos_tag(word_tokens)
    result = cp.parse(pos)
    # print statements for debugging
    #print(result)
    #result.draw()
    # Loop through the tree data structure and pull the actual name from every
    # subtree whose label is 'NAME'
    output = ""
    for tree in result.subtrees():
        if tree.label() == 'NAME':
            name_match = ' '.join([x for x, y in tree.leaves()])
            output = output + ' ' + name_match
    return output.replace("  ", " ").strip()
class Chunking:
    def __init__(self, formats: list):
        self.__text = "\n".join(formats)
        self.__parse: RegexpParser = RegexpParser(self.__text)

    def setParser(self, formats):
        if type(formats) is list:
            self.__text = "\n".join(formats)
        else:
            self.__text = formats
        self.__parse = RegexpParser(self.__text)

    def getParse(self) -> RegexpParser:
        return self.__parse

    def addChunking(self, s: str):
        self.__text += "\n" + s
        self.__parse: RegexpParser = RegexpParser(self.__text)

    def parse(self, tokens) -> Tree:
        return self.__parse.parse(tokens)

    def merge(self, tokens):
        chunks = self.__parse.parse(tokens)
        ret = []
        for N in chunks:
            if type(N) is Tree:
                label = N.label()
                text = " ".join([T[0] for T in N.leaves()])
                ret.append((text, label))
            else:
                ret.append(N)
        return ret
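# A minimal usage sketch for the Chunking helper above, assuming nltk's
# pos_tag/word_tokenize and their data are available. The grammar rules and
# the sentence are illustrative placeholders, not part of the original code.
from nltk import pos_tag, word_tokenize

chunking = Chunking(["NP: {<DT>?<JJ>*<NN.*>+}", "VP: {<VB.*><NP>?}"])
tokens = pos_tag(word_tokenize("The old dog chased a ball."))
print(chunking.parse(tokens))  # full nltk Tree
print(chunking.merge(tokens))  # roughly [('The old dog', 'NP'), ('chased a ball', 'VP'), ('.', '.')]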
def post_text_process(text):
    """
    Performs a post process on the text, preparing it for deep text analysis
    or other purposes.
    """
    # Split the text into word tokens
    tokenized_word = word_tokenize(text.lower())

    # Normalize words to their base form, e.g. playing -> play
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = []
    for word in tokenized_word:
        lemmatized_words.append(lemmatizer.lemmatize(word))

    # Remove stopwords from the set
    stopwords_set = set(stopwords.words('english'))
    stopwords_set.add('.')
    stopwords_text = [
        word for word in lemmatized_words if word not in stopwords_set
    ]

    # Named entity recognition
    tags = nltk.pos_tag(stopwords_text)
    chunk = ne_chunk(tags)

    chunking_rule = "NP: {<DT>?<JJ>*<NN>}"
    chunking_text_parsed = RegexpParser(chunking_rule)
    chunking_result = chunking_text_parsed.parse(tags)

    # Return the important information created by the post processing
    return chunking_result, chunk
def on_get(self, req, resp, id):
    print(id)
    if len(id) > 0:
        arts_obj = ArticleModel.objects(_id=ObjectId(id))
        art = arts_obj[0]
        print(art['href'])
        title = art['title']
        toks = word_tokenize(title)
        sent = sent_tokenize(art['txt'])
        sent = [word_tokenize(xt) for xt in sent]
        sent = [pos_tag(xt) for xt in sent]
        print(sent)
        tag = pos_tag(toks)
        grammar = "NP: {<DT>?<JJ>*<NN>}"
        patterns = """mychunk:{<NN.?>*<VBD.?>*<JJ.?>*<CC>?}"""
        cp = RegexpParser(patterns)
        rslt = cp.parse(tag)
        print(rslt)
        resp.json = {'rslt': str(rslt)}
    else:
        #resp.status = falcon.HTTP_200
        #arts = []
        #arts_obj = ArticleModel.objects().all_fields()
        #for art in arts_obj:
        #    #print(art.to_json())
        #    arts.append(art.to_json())
        callnames = ['tst']
        resp.json = {'rslt': json.dumps(callnames)}
def process_command(command):
    try:
        words = word_tokenize(command)
        tagged = pos_tag(words)
        chunkGram = r"""
            Tasks: {<VB.?>}
            Numbers: {<CD>}
            """
        chunkParser = RegexpParser(chunkGram)
        chunked = chunkParser.parse(tagged)
        traverse(chunked)
        #chunked.draw()
        if len(numbers) > 2:
            speak(to_many_numbers)
            return
        elif len(numbers) < 2:
            speak(to_less_numbers)
            return
        if possible_tasks.isdisjoint(tasks):
            speak(unknown_task)
            return
        if 'add' in tasks:
            add(numbers[0], numbers[1])
        elif 'subtract' in tasks:
            sub(numbers[0], numbers[1])
        elif 'multiply' in tasks:
            mul(numbers[0], numbers[1])
        elif 'divide' in tasks:
            div(numbers[0], numbers[1])
    except Exception as e:
        print(str(e))
def __init__(self, grammar_filename: Optional[str] = None) -> None:
    """Helper class to chunk part-of-speech tagged text using grammar regular expressions.

    The default grammar regular expressions file is defined by the `FILE_GRAMMAR` variable in
    `src.make_feedback_tool_data.text_chunking`.

    >>> from src.make_feedback_tool_data.text_chunking import FIlE_GRAMMAR
    >>> FIlE_GRAMMAR
    '.../govuk-corona-analysis/src/make_feedback_tool_data/grammar.txt'

    :param grammar_filename: Default: None. A path string to a file containing regular expression
        grammar patterns usable by the `grammar` argument of the nltk.chunk.regexp.RegexpParser
        class. For each grammar type, each pattern should be listed on a separate line, and in
        descending order of priority (highest first). If None, it will use the default regular
        expression file.

    """
    self.logger = logging.getLogger(__name__)

    # If `grammar_filename` is None, use the default file path
    if not grammar_filename:
        grammar_filename = FIlE_GRAMMAR

    # Load the regular expressions from `grammar_filename`
    self.grammar = self._load_grammar_from_file(grammar_filename)

    # Initialise a nltk.RegexpParser object using `self.grammar`
    self.logger.info("Initializing parser...")
    self.parser = RegexpParser(self.grammar)
def get_noun_counter(text) -> collections.Counter:
    text = text.split()
    tokens_tag = pos_tag(text)
    patterns = """mychunk:{<JJ.?>*<NN.?.?>*}"""
    chunker = RegexpParser(patterns)
    output = chunker.parse(tokens_tag)
    noun_list = []
    compound_noun_list = []
    for n in output:
        if isinstance(n, nltk.tree.Tree):
            n = str(n)
            part_of_speech = [el.split('/')[1] for el in n.split()[1:]]
            if any([el.find('NN') > -1 for el in part_of_speech]):
                noun = [
                    stemmer.stem(el.split('/')[0])
                    if el.split('/')[1] in ('NNS', 'NNPS')
                    else el.split('/')[0]
                    for el in n.split()[1:]
                ]
                compound_noun_list.append(' '.join(noun))
                noun_list.extend(noun)
    noun_list = [noun for noun in noun_list if len(noun) > 1]
    return collections.Counter(noun_list), compound_noun_list
def data_gathering_iterator(file_path, morph, grammar=COMPLEX_GRAMMAR):
    """On each iteration, yields the list of adjective + noun combinations
    extracted from one line. Each list element is a tuple (adjective, noun),
    with both words reduced to their normal form.

    :param file_path: path to the data file; the file must be UTF-8 encoded
    :param morph: morphology
    """
    chunk_parser = RegexpParser(grammar)
    f = open(file_path, "r")
    for line in f:
        try:
            line = line.decode('utf-8')
        except UnicodeDecodeError:
            continue
        line = line.strip()
        # split into sentences
        for sentence in sent_tokenize(line):
            sentence = sentence.strip()
            if sentence:
                tokens = word_tokenize(sentence)
                tagged_tokens = pos_tag(tokens)
                tree = chunk_parser.parse(tagged_tokens)
                for subtree in tree.subtrees():
                    if subtree.node == u"CHUNK":
                        adj_noun_list = get_adj_noun_list_from_chunk(subtree)
                        yield normilize_adj_noun_list(adj_noun_list, morph)
def run(posTaggedTokenListList, pos1, pos2):
    retVal = ''
    total = 0

    # define the word-pair regular expression
    regex = ('pattern: {<%s><%s>}' % (getPosTagRegex(pos1), getPosTagRegex(pos2)))

    # create the parser
    parser = RegexpParser(regex)

    # parse
    for posTaggedTokenList in posTaggedTokenListList:
        parsedTree = parser.parse(posTaggedTokenList)
        for subtree in parsedTree:
            if isinstance(subtree, tree.Tree):
                retVal += (subtree[0][0] + " ")
                retVal += (subtree[1][0] + "\r\n")
                total += 1

    retVal = (("total: %d\r\npos1: %s, pos2: %s\r\n\r\n" % (total, pos1, pos2)) + retVal)
    return retVal
def GetVerbDetNounPhrase(sentence):
    #print('GetNounPhrase is called')
    output = ''
    # Match a verb (VB or VBP), an optional determiner and a noun
    grammar = 'DNP: {<VB|VBP><DT>?<NN>}'
    # Create the parser object
    cp = RegexpParser(grammar)
    # Tokenize the input and get parts of speech
    pos = pos_tag(word_tokenize(sentence))
    result = cp.parse(pos)
    #result.draw()
    #print(result)
    # Loop through the tree data structure and pull the values under the DNP
    # node we created for the result
    for tree in result.subtrees():
        if tree.label() == 'DNP':
            name_match = ' '.join([x for x, y in tree.leaves()])
            output = output + name_match
    return output
def recuperarEntidadesEs(self,texto): chunker = RegexpParser(""" ENTI: {<NNP|NNPS>+<NNP|NNPS|NN|NNS>} {<NN|NNS>+<NN|NNS><JJ>} {<NNP|NNPS><IN|DT><NNP|NNPS|NN|NNS>} {<NN|NNS><JJ>|<JJ><NN|NNS>} {<NNP|NNPS>} ENTIDACOMP: {<NN|NNS><ENTI>} {<NN|NNS><IN><ENTI>} {<ENTI>(<IN>|<IN><DT>)<ENTI|NN|NNS>} {<ENTI|ENTIDACOMP><JJ><IN><ENTI|ENTIDACOMP>} {<ENTI|ENTIDACOMP><IN><ENTI|ENTIDACOMP>} {<ENTI|ENTIDACOMP><IN><ENTI|ENTIDACOMP><IN><ENTI|ENTIDACOMP>} ENTIDACOMP2: {<ENTI|ENTIDACOMP><IN><ENTI|ENTIDACOMP>} FECHA: {<LS|CD><IN><ENTI><DT><LS|CD>} {<LS|CD><IN><ENTI>} {<ENTI><DT><LS|CD>} {<ENTI><LS|CD>} """) '''chunker = RegexpParser(""" ENTI: {<NNP|NNPS>+<NNP|NNPS|NN|NNS>} {<NN|NNS>+<NN|NNS><JJ>} {<NNP|NNPS><IN|DT><NNP|NNPS|NN|NNS>} {<NN|NNS><JJ>|<JJ><NN|NNS>} {<NNP|NNPS>} ENTIDACOMP: {<DT><NN|NNS><ENTI>} {<DT><NN|NNS><IN><ENTI>} {<ENTI>(<IN>|<IN><DT>)<ENTI|NN|NNS>} {<ENTI|ENTIDACOMP><JJ><IN><ENTI|ENTIDACOMP>} {<ENTI|ENTIDACOMP><IN><ENTI|ENTIDACOMP>} {<ENTI|ENTIDACOMP><IN><ENTI|ENTIDACOMP><IN><ENTI|ENTIDACOMP>} ENTIDACOMP2: {<ENTI|ENTIDACOMP><IN><ENTI|ENTIDACOMP>} FECHA: {<LS|CD><IN><ENTI><DT><LS|CD>} {<LS|CD><IN><ENTI>} {<ENTI><DT><LS|CD>} {<ENTI><LS|CD>} """)''' ObjTag = Tokenizar() Lista = [] Lista2 = [] for sentence in sent_tokenize(texto): tags=ObjTag.tagear(sentence) tagsentX=word_tokenize(sentence) filtered_words = ' '.join(w for w in tagsentX if not w in nltk.corpus.stopwords.words('spanish')) parsed = chunker.parse(tags) for chunk in parsed: if hasattr(chunk, 'node'): Lista2.append([chunk.leaves(),filtered_words]) Lista.append (' '.join(c[0] for c in chunk.leaves())) return Lista2
def add_chunk_data(pos_data):
    chunker = RegexpParser(CHUNKER_GRAMMAR)
    chunks = {}
    for sentence_id, data in pos_data.iteritems():
        result = chunker.parse(data)
        chunks[sentence_id] = {}
        chunks[sentence_id]['chunks'] = [
            ' '.join([token for token, pos in t.leaves()])
            for t in result.subtrees(lambda result: result.label() == 'SN')
        ]
    return chunks
def get_continuous_chunks(tokenized_text):
    # this regex is not working, change to another later
    NP = "(?:(?:\w+ ART)?(?:\w+ ADJ) *)?\w + (?:N[NP] | PRN)"
    chunker = RegexpParser(NP)
    tagged_text = PortugueseTextualProcessing.postag(tokenized_text, as_list=False)
    chunked = chunker.parse(tagged_text)
    return PortugueseTextualProcessing().extract_chunks(chunked)
def VBD_question(tagged): """ Tries to leverage prepositions to generation questions. """ try: first_verb_index = next(i for i, pair in enumerate(tagged) if (pair[1] == 'VBZ' or pair[1] == 'VBP')) subject_phrase = [pair[0] for pair in tagged[:first_verb_index+1]] phrase_dict = {'VBDP': 'VBDP: {<VBD|VBP|VBN><RB|JJ>*<IN>}'} vbdp_fragments = [] for i, (key, phrase) in enumerate(phrase_dict.items()): cp = RegexpParser(phrase) if i==0: result = cp.parse(tagged) else: result = cp.parse(result) for i, item in enumerate(result): if type(item) is nltk.Tree: fragment = [pair[0] for pair in item] if item.node == 'VBDP': vbdp_fragments.append((fragment, i)) qa_list = [] for vbdp, index in vbdp_fragments: question_list = subject_phrase + vbdp question_list.append('what?') question_string = ''.join([('' if c in string.punctuation else ' ')+c for c in question_list]).strip() sentence_remainder = result[index+1:] sentence_remainder_treeless = [] for tree_or_tuple in sentence_remainder: try: tree_or_tuple.leaves() for leaf in tree_or_tuple.leaves(): sentence_remainder_treeless.append(leaf) except AttributeError: sentence_remainder_treeless.append(tree_or_tuple) answer_list = [pair[0] for pair in sentence_remainder_treeless] answer_string = ''.join([('' if c in string.punctuation else ' ')+c for c in answer_list]).strip() qa_list.append((question_string, answer_string)) return qa_list except: """ If not verb recognized above, simply split sentence based on prepositions. """ prep_indices = [i for i, pair in enumerate(tagged) if pair[1] == 'IN'] qa_list = [] for prep_index in prep_indices: question_list = [pair[0] for pair in tagged[:prep_index+1]] question_list.append('what?') question_string = ''.join([('' if c in string.punctuation else ' ')+c for c in question_list]).strip() answer_list = [pair[0] for pair in tagged[prep_index+1:]] answer_string = ''.join([('' if c in string.punctuation else ' ')+c for c in answer_list]).strip() qa_list.append((question_string, answer_string)) return qa_list
def tagtosem(sent):
    cp = RegexpParser('''
        NP:  {<DET>? (<ADJ>|<ADV>)* <CONJ>* (<NOUN>|<NUM>|<X>|(<PRON> <PRT>))* <PRON>?}
        R:   {(<PRT> <VERB>?)* <A..>* <PRON>?}
        V:   {<VERB>*(<PRT>*|<VERB>)*}
        PNC: {<\.>}
        C:   {<ADP>}
        ''')
    return cp.parse(sent)
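# A small usage sketch for tagtosem above. Its grammar uses the universal POS
# tagset, so the sentence must be tagged with tagset='universal' (this needs
# nltk's 'universal_tagset' mapping data). The sentence is illustrative only.
from nltk import pos_tag, word_tokenize

sent = pos_tag(word_tokenize("The cat sat on the mat."), tagset='universal')
print(tagtosem(sent))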
def extract_bow_from_raw_text(text_as_string):
    """Extracts bag-of-words from a raw text string.

    Parameters
    ----------
    text_as_string (str): a text document given as a string

    Returns
    -------
    list : the list of the tokens extracted and filtered from the text
    """
    if text_as_string is None:
        return []
    if len(text_as_string) < 1:
        return []

    import nltk
    if '/home/hadoop/nltk_data' not in nltk.data.path:
        nltk.data.path.append('/home/hadoop/nltk_data')

    nfkd_form = unicodedata.normalize('NFKD', unicode(text_as_string))
    text_input = nfkd_form.encode('ASCII', 'ignore')

    sent_tokens = sent_tokenize(text_input)
    tokens = map(word_tokenize, sent_tokens)
    sent_tags = map(pos_tag, tokens)

    grammar = r"""
        SENT: {<(J|N).*>}    # chunk adjectives and nouns
        """
    cp = RegexpParser(grammar)
    ret_tokens = list()
    stemmer_snowball = SnowballStemmer('english')

    for sent in sent_tags:
        tree = cp.parse(sent)
        for subtree in tree.subtrees():
            if subtree.label() == 'SENT':
                t_tokenlist = [tpos[0].lower() for tpos in subtree.leaves()]
                t_tokens_stemsnowball = map(stemmer_snowball.stem, t_tokenlist)
                #t_token = "-".join(t_tokens_stemsnowball)
                #ret_tokens.append(t_token)
                ret_tokens.extend(t_tokens_stemsnowball)
            #if subtree.label() == 'V2V': print(subtree)

    #tokens_lower = [map(string.lower, sent) for sent in tokens]
    stop_words = {'book', 'author', 'read', "'", 'character', ''}.union(ENGLISH_STOP_WORDS)
    tokens = [token for token in ret_tokens if token not in stop_words]
    return tokens
def get_chunks(tagged_sent):
    chunkgram = r"""VB-Phrase: {<DT><,>*<VB>}
                    VB-Phrase: {<RB><VB>}
                    VB-Phrase: {<UH><,>*<VB>}
                    VB-Phrase: {<UH><,><VBP>}
                    VB-Phrase: {<PRP><VB>}
                    VB-Phrase: {<NN.?>+<,>*<VB>}
                    Q-Tag: {<,><MD><RB>*<PRP><.>*}"""
    chunkparser = RegexpParser(chunkgram)
    return chunkparser.parse(tagged_sent)
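# A minimal usage sketch for get_chunks above, assuming nltk's default tagger;
# the sentence is an illustrative placeholder, and the chunks you get back
# depend on how the tagger labels it.
from nltk import pos_tag, word_tokenize

tagged = pos_tag(word_tokenize("John, open the window, will you?"))
tree = get_chunks(tagged)
for subtree in tree.subtrees(filter=lambda t: t.label() in ('VB-Phrase', 'Q-Tag')):
    print(subtree.label(), subtree.leaves())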
def pos_chunks():
    text = "learn php from guru99 and make study easy".split()
    print("After Split: ", text)
    tokens_tag = pos_tag(text)
    print("After Token: ", tokens_tag)
    patterns = """mychunk: {<NN.?>*<VBD.?>*<JJ.?>*<CC>?}"""
    chunker = RegexpParser(patterns)
    print("After Regex: ", chunker)
    output = chunker.parse(tokens_tag)
    print("After Chunking: ", output)
def findRelationshipUsingGrammer(phrase):
    phrase = tag(phrase)
    grammer = 'REL: {<RB><RBR><IN>|' \
              '<RB><JJ|JJR|JJS><IN>|' \
              '<JJ|JJR|JJS><IN>|' \
              '<JJ|JJR|JJS>|' \
              '<JJ|JJR|JJS><TO>}'
    parseTree = RegexpParser(grammer).parse(phrase)
    for i in parseTree.subtrees(filter=lambda x: x.label() == 'REL'):
        return ' '.join([k[0] for k in list(i)])
def getChunk(question):
    """ helper method to get NP chunks """
    qPOS = pos_tag(word_tokenize(question))
    t = ne_chunk(qPOS)
    Pattern = "NP:{<DT>?<JJ|PR.>*<NN|NNS>}"
    np_parser = RegexpParser(Pattern)
    T = np_parser.parse(t)
    return T
def _word_combination(self, pos_tagged_sentence):
    # Finding entities, still testing
    grammar = r"""
        EN: {<NN.*><CD>+}
        """
    cp = RegexpParser(grammar)
    result = cp.parse(pos_tagged_sentence)
    return result
def extract_verbphrase(tagged_sent):
    chunkgram = r"""VB-Phrase: {<UH><,>*<VB>}
                    VB-Phrase: {<UH><,><VBP>}
                    VB-Phrase: {<PRP><VB>}
                    VB-Phrase: {<NN.?>+<,>*<VB>}
                    VB-Phrase: {<DT><,>*<VB>}
                    VB-Phrase: {<RB><VB>}
                    Q-Tag: {<,><MD><RB>*<PRP><.>*}"""
    vbchunkparser = RegexpParser(chunkgram)
    return vbchunkparser.parse(tagged_sent)
def test_tag_pattern2re_pattern_quantifier(self):
    """Test for bug https://github.com/nltk/nltk/issues/1597

    Ensures that curly bracket quantifiers can be used inside a chunk rule.
    This type of quantifier has been used for the supplementary example in
    http://www.nltk.org/book/ch07.html#exploring-text-corpora.
    """
    sent = [('The', 'AT'), ('September-October', 'NP'), ('term', 'NN'),
            ('jury', 'NN'), ('had', 'HVD'), ('been', 'BEN'), ('charged', 'VBN'),
            ('by', 'IN'), ('Fulton', 'NP-TL'), ('Superior', 'JJ-TL'),
            ('Court', 'NN-TL'), ('Judge', 'NN-TL'), ('Durwood', 'NP'),
            ('Pye', 'NP'), ('to', 'TO'), ('investigate', 'VB'),
            ('reports', 'NNS'), ('of', 'IN'), ('possible', 'JJ'), ('``', '``'),
            ('irregularities', 'NNS'), ("''", "''"), ('in', 'IN'),
            ('the', 'AT'), ('hard-fought', 'JJ'), ('primary', 'NN'),
            ('which', 'WDT'), ('was', 'BEDZ'), ('won', 'VBN'), ('by', 'IN'),
            ('Mayor-nominate', 'NN-TL'), ('Ivan', 'NP'), ('Allen', 'NP'),
            ('Jr.', 'NP'), ('.', '.')]  # source: brown corpus
    cp = RegexpParser('CHUNK: {<N.*>{4,}}')
    tree = cp.parse(sent)
    assert tree.pformat() == """(S
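# A short illustration of the curly-bracket quantifier exercised by the test
# above (nltk issue #1597), in NLTK versions that include the fix: a chunk
# rule that only fires on runs of two or more proper nouns. The tagged
# sentence is an illustrative placeholder.
from nltk import RegexpParser

tagged = [('Yesterday', 'NN'), ('John', 'NNP'), ('Ronald', 'NNP'),
          ('Smith', 'NNP'), ('arrived', 'VBD'), ('.', '.')]
cp = RegexpParser('NAMES: {<NNP>{2,}}')
print(cp.parse(tagged))
# 'John', 'Ronald' and 'Smith' end up in a single NAMES chunk; 'Yesterday' does not.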
def filter_sentences_by_chunk(pos_data, tokens):
    chunker = RegexpParser(CHUNKER_GRAMMAR)
    filtered = {}
    for sentence_id, data in pos_data.iteritems():
        result = chunker.parse(data)
        good_one = False
        if 'CHUNK' in [s.label() for s in result.subtrees()]:
            for t in result.subtrees(lambda result: result.label() == 'CHUNK'):
                for token, pos in t.leaves():
                    if pos.find('VER') != -1 and token in tokens:
                        good_one = True
        if good_one:
            filtered[sentence_id] = ' '.join(item[0] for item in data)
    return filtered
def word_combination(self, pos_tagged_sentence):
    """Chunking of a part-of-speech tagged sentence based on a specific grammar."""
    # grammar = r"""
    #     EN:{(<JJ>*<NN.*>+<IN>)?<JJ>*<NN.*>+}
    #     """  # Previous one
    grammar = r"""
        EN: {<JJ.*>*<NN.*>+}
        """
    cp = RegexpParser(grammar)
    result = cp.parse(pos_tagged_sentence)
    return result
def recuperarEntidades(texto):
    chunker = RegexpParser("""
        ENTI: {<NNP|NNPS>+<NNP|NNPS|NN|NNS>}    # Nouns and adjectives, terminated with nouns
              {<NN|NNS>+<NN|NNS><JJ>}
              {<NNP|NNPS><IN|DT><NNP|NNPS|NN|NNS>}
              {(<NN|NNS><JJ>)|<JJ><NN|NNS>}
              {<NNP|NNPS>}
        ENTIDACOMP: {<DT><NN|NNS><ENTI>}
                    {<DT><NN|NNS><IN><ENTI>}
                    {<ENTI>(<IN>|<IN><DT>)<ENTI|NN|NNS>}
                    {<ENTI|ENTIDACOMP><JJ><IN><ENTI|ENTIDACOMP>}
                    {<ENTI|ENTIDACOMP><IN><ENTI|ENTIDACOMP>}    # Above, connected with in/of/etc.
                    {<ENTI|ENTIDACOMP><IN><ENTI|ENTIDACOMP><IN><ENTI|ENTIDACOMP>}
        ENTIDACOMP2: {<ENTI|ENTIDACOMP><IN><ENTI|ENTIDACOMP>}
        FECHA: {<LS|CD><IN><ENTI><DT><LS|CD>}
               {<LS|CD><IN><ENTI>}
               {<ENTI><DT><LS|CD>}
               {<ENTI><LS|CD>}
        """)
    Lista = []
    for sentence in sent_tokenize(texto):
        #print sentence
        tags = tagear(sentence)
        #tags = tagear(traducir(word_tokenize(sentence)))
        #print tags
        parsed = chunker.parse(tags)
        #print parsed
        for chunk in parsed:
            #print chunk
            #print chunk.node
            #print chunk.leaves()
            if hasattr(chunk, 'node'):
                Lista.append(' '.join(c[0] for c in chunk.leaves()))
    return Lista
def grammar_selection(self, grammar=None):
    """ Select candidates using nltk RegexpParser with a grammar defining
        noun phrases (NP).

        Args:
            grammar (str): grammar defining POS patterns of NPs.
    """
    # initialize the default grammar if none is provided
    if grammar is None:
        grammar = r"""
            NBAR: {<NN.*|JJ>*<NN.*>}
            NP: {<NBAR>}
                {<NBAR><IN><NBAR>}
            """

    # initialize the chunker
    chunker = RegexpParser(grammar)

    # loop through the sentences
    for i, sentence in enumerate(self.sentences):

        # compute the offset shift for the sentence
        shift = sum([s.length for s in self.sentences[0:i]])

        # convert the sentence to a list of (offset, pos) tuples
        tuples = [(str(j), sentence.pos[j]) for j in range(sentence.length)]

        # parse the sentence
        tree = chunker.parse(tuples)

        # find candidates
        for subtree in tree.subtrees():
            if subtree.label() == 'NP':
                leaves = subtree.leaves()

                # get the first and last offset of the current candidate
                first = int(leaves[0][0])
                last = int(leaves[-1][0])

                # add the NP to the candidate container
                self.add_candidate(words=sentence.words[first:last + 1],
                                   stems=sentence.stems[first:last + 1],
                                   pos=sentence.pos[first:last + 1],
                                   offset=shift + first,
                                   sentence_id=i)
def __init__(self):
    grammar = r"""
        NP: {<DT>?<JJ.*|CD>*<NN.*>+}
        NP: {<NP><of><NP>}   # need to change tags of "of" to <of>!!
        NP: {<NP><in><NP>}   # need to change tags of "in" to <in>!!
        """
    self.parser = RegexpParser(grammar)
def parse(query_text, networks_json):
    query_text = preprocess(query_text)
    tokens = word_tokenize(query_text)
    double_tokens = [(w, w) for w in tokens]
    wg = word_grammar()
    w_cp = RegexpParser(compile_grammar(wg))
    word_result = w_cp.parse(double_tokens)
    word_result = convert_dates(word_result)
    new_tokens = list(zip(*(word_result.leaves()))[0])
    tagged = pos_tag(new_tokens)
    domain_tagged = tag_domains(tagged, networks_json)
    tg = tag_grammar()
    t_cp = RegexpParser(compile_grammar(tg))
    tagged_result = t_cp.parse(domain_tagged)
    slots = assign_slots(new_tokens, tagged_result, word_result)
    interpreted_input = make_sentence(slots)
    print 'tagged-result = ', tagged_result
    print 'word-result = ', word_result
    return {"parse": slots, "interpreted": interpreted_input}
def __init__(self, word_tokenize=None, sent_tokenize=None, pos_tag=None,
             stop_words=None, punct=None, grammar=chunk_grammar_propernouns):
    self._word_tokenize = word_tokenize if word_tokenize else nltk.word_tokenize
    self._sent_tokenize = sent_tokenize if sent_tokenize else nltk.sent_tokenize
    self._pos_tag = pos_tag if pos_tag else nltk.pos_tag
    self._stop_words = stop_words if stop_words else set(nltk.corpus.stopwords.words('english'))
    self._punct = punct if punct else set(string.punctuation)
    self._chunk_grammar = grammar
    self._chunker = RegexpParser(self._chunk_grammar)
def chunkingList(dataS, chunkgram):
    """ Find the chunks in `dataS` that match the given grammar. """
    #data = str(dataS)
    words = word_tokenize(str(dataS)[1:])
    #print words
    ps = pos_tag(words)
    #print ps
    chunkParser = RegexpParser(chunkgram)
    chunked = chunkParser.parse(ps)
    #print chunked
    tree = Tree('s', chunked)
    docs = []
    for subtree in tree.subtrees(filter=lambda t: t.label() == 'Chunk'):
        # Assemble the chunk into one line and strip extra punctuation
        docs.append(" ".join([a for (a, b) in subtree.leaves()]))
    return docs
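# A minimal usage sketch for chunkingList above. Note that the function
# tokenizes str(dataS)[1:], i.e. it drops the first character of the input's
# string form (it appears to expect a list-like repr), and the grammar label
# must be 'Chunk' because that is what the subtree filter selects. The text
# and grammar here are illustrative only.
docs = chunkingList(
    "A smart phone with a great camera and long battery life.",
    r"""Chunk: {<JJ>*<NN.*>+}""",
)
print(docs)  # e.g. ['smart phone', 'great camera', 'long battery life'] (tagger-dependent)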
def word_combination(pos_tagged_sentence, tag_set='ptb'):
    """Chunking of a part-of-speech tagged sentence based on a specific grammar."""
    # grammar = r"""
    #     EN:{(<JJ>*<NN.*>+<IN>)?<JJ>*<NN.*>+}
    #     """
    if tag_set == 'ptb':
        # Entity grammar used for the Penn Treebank tagset
        grammar = r"""
            EN: {<JJ.*>*<NN.*>+}
            """
    elif tag_set == 'universal':
        # Entity grammar used for the Universal tagset
        grammar = r"""
            EN: {<ADJ>*<NOUN>+}
            """
    else:
        raise SyntaxError
    cp = RegexpParser(grammar)
    result = cp.parse(pos_tagged_sentence)
    return result
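# A minimal usage sketch for word_combination above, showing both supported
# tag sets. The sentence is an illustrative placeholder; the universal variant
# needs nltk's 'universal_tagset' mapping data.
from nltk import pos_tag, word_tokenize

ptb_tagged = pos_tag(word_tokenize("The neural network model improved accuracy."))
print(word_combination(ptb_tagged))                       # EN chunks over <JJ.*>*<NN.*>+

uni_tagged = pos_tag(word_tokenize("The neural network model improved accuracy."),
                     tagset='universal')
print(word_combination(uni_tagged, tag_set='universal'))  # EN chunks over <ADJ>*<NOUN>+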
def relationships_of(string): # relationship data is stored in a parenthetical immediately after the end of the </font> tag in the bio # e.g. "(son of Joseph Patrick Kennedy, II, and great-nephew of Edward Moore Kennedy and John Fitzgerald Kennedy)" pattern = "^\((.*?)\)" match = re.search(pattern, string, re.I) relationships = [] if match and len(match.groups()) > 0: relationship_text = match.group(1).encode("ascii", "replace") # since some relationships refer to multiple people--great-nephew of Edward Moore Kennedy AND John Fitzgerald Kennedy--we need a special grammar from nltk import tree, pos_tag, RegexpParser tokens = re.split("[ ,;]+|-(?![0-9])", relationship_text) pos = pos_tag(tokens) grammar = r""" NAME: {<NNP>+} NAMES: { <IN><NAME>(?:<CC><NAME>)* } RELATIONSHIP: { <JJ|NN|RB|VB|VBD|VBN|IN|PRP\$>+ } MATCH: { <RELATIONSHIP><NAMES> } """ cp = RegexpParser(grammar) chunks = cp.parse(pos) # iterate through the Relationship/Names pairs for n in chunks: if isinstance(n, tree.Tree) and n.node == "MATCH": people = [] relationship = None for piece in n: if piece.node == "RELATIONSHIP": relationship = " ".join([x[0] for x in piece]) elif piece.node == "NAMES": for name in [x for x in piece if isinstance(x, tree.Tree)]: people.append(" ".join([x[0] for x in name])) for person in people: relationships.append({ "relation": relationship, "name": person}) return relationships
def setup_extractor(self):
    self.splitter = PunktSentenceSplitter(self.language)

    grammar = self.grammars.get(self.language)
    if grammar:
        self.parser = RegexpParser(grammar)
    else:
        raise ValueError(
            "Invalid or unsupported language: '%s'. Please use one of the currently supported ones: %s" % (
                self.language, self.grammars.keys())
        )

    for lemma, match_tokens in self.lemma_to_token.iteritems():
        self.lemma_to_token[lemma] = set([match.lower() for match in match_tokens])
def parse_sent(self, pos_tagged_sentence, grammar=None):
    #wq = csv.writer(open('wiki2.csv', 'w'))
    parsed_tagged_sents = []
    for grammar in GRAMMARS:
        #if not grammar:
        #    grammar = r"""
        #        NP_CHUNK: {<VBP|VBG|VB|VB*|IN|JJ>*<NNP|NN>*<VBP|VBG|VB|IN|JJ>?}
        #        """
        parsedsent = []
        #parsed_tagged_sents = []
        cp = RegexpParser(grammar)
        for sentence in pos_tagged_sentence:
            result = cp.parse(sentence)
            #print result
            for node in result:
                if str(type(node)) == "<class 'nltk.tree.Tree'>":
                    #wq.writerow((grammar.strip(), sentence, node.leaves()))
                    temp = ' '.join(word for word, POS in node.leaves())
                    if len(temp.split()) >= 2:
                        parsedsent.append(temp)
                        parsed_tagged_sents.append(node.leaves())
                        #print grammar, node.leaves()
    return parsed_tagged_sents
def __init__(self):
    """ """
    self.filter = DefaultFilter()
    self.tokenizer = getUtility(ITokenizer,
                                name="collective.classification.tokenizers.NLTKTokenizer")
    self.tagger = getUtility(IPOSTagger,
                             name="collective.classification.taggers.PennTreebankTagger")
    self.tagger_metadata = {'type': 'Pen TreeBank', 'categories': []}
    self.np_grammar = r"""
        NP: {<JJ>*<NN>}   # chunk adjectives and nouns
            {<NNP>+}      # chunk sequences of proper nouns
        """
    self.np_finder = RegexpParser(self.np_grammar)
def compare(sentence, grammar):
    """
    Compare a sentence against a grammar rule to see if any matches are found.

    Parameters
    ----------
    sentence: list
        a single POS-tagged sentence, as a list of (word, tag) tuples
    grammar: str
        grammar rule in regexp format

    Returns
    -------
    matches: list of nltk.tree.Tree
        all matches with the grammar rule ('None' if there are no matches)
    """
    matches = []

    # Apply the grammar rule
    cp = RegexpParser(grammar)
    chunk = cp.parse(sentence)

    # Identify the label of the rule
    label = grammar.split(':')[0]

    for n in chunk:
        if isinstance(n, nltk.tree.Tree):
            if n.label() == label:
                matches.append(n)

    if matches == []:
        matches.append('None')

    return matches
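# A minimal usage sketch for compare above. Note that it expects an already
# POS-tagged sentence (a list of (word, tag) tuples), not a raw string; the
# sentence and grammar rule are illustrative placeholders.
from nltk import pos_tag, word_tokenize

tagged = pos_tag(word_tokenize("The striped bats were hanging on their feet."))
print(compare(tagged, 'NP: {<DT>?<JJ>*<NN.*>+}'))  # list of matching NP subtrees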
class Chunker:
    def __init__(self):
        grammar = r"""
            NP: {<DT>?<JJ.*|CD>*<NN.*>+}
            NP: {<NP><of><NP>}   # need to change tags of "of" to <of>!!
            NP: {<NP><in><NP>}   # need to change tags of "in" to <in>!!
            """
        self.parser = RegexpParser(grammar)

    def parse(self, sent):
        """ sent should be a list of tuples of word and tag """
        for i, (word, pos) in enumerate(sent):
            if word == 'of' or word == 'in':
                sent[i] = (word, word)
        return self.parser.parse(sent)

    def print_chunks(self, tree, label):
        for node in tree:
            if type(node) == Tree and node.node == label:
                print node.leaves()

    def get_chunks(self, tree, label):
        """ return a list of ranges (tuples) marking the start and end index
        of the chunk """
        offset = 0
        chunks = []
        for node in tree:
            if type(node) == Tree and node.node == label:
                phrase_size = len(node.leaves())
                chunks.append((offset, offset + phrase_size - 1))
                offset += phrase_size
            else:
                offset += 1
        return chunks
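# A minimal usage sketch for the Chunker class above, showing how parse()
# re-tags 'of'/'in' so the <of>/<in> rules can fire. The sentence is an
# illustrative placeholder and the exact grouping depends on the tagger.
from nltk import pos_tag, word_tokenize

chunker = Chunker()
sent = pos_tag(word_tokenize("The president of the United States arrived in Paris."))
print(chunker.parse(sent))
# 'of' and 'in' are re-tagged as ('of', 'of') / ('in', 'in') before chunking, so
# "The president of the United States" can be merged into a single NP by the later rules.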
class GrammarExtractor(SentenceExtractor): """ Grammar-based extraction strategy: pick sentences that comply with a pre-defined grammar. """ splitter = None parser = None # Grammars rely on POS labels, which are language-dependent grammars = { 'en': r""" NOPH: {<PDT>?<DT|PP.*|>?<CD>?<JJ.*|VVN>*<N.+|FW>+<CC>?} CHUNK: {<NOPH>+<MD>?<V.+>+<IN|TO>?<NOPH>+} """, 'it': r""" SN: {<PRO.*|DET.*|>?<ADJ>*<NUM>?<NOM|NPR>+<NUM>?<ADJ|VER:pper>*} CHUNK: {<SN><VER.*>+<SN>} """, } def setup_extractor(self): self.splitter = PunktSentenceSplitter(self.language) grammar = self.grammars.get(self.language) if grammar: self.parser = RegexpParser(grammar) else: raise ValueError( "Invalid or unsupported language: '%s'. Please use one of the currently supported ones: %s" % ( self.language, self.grammars.keys()) ) for lemma, match_tokens in self.lemma_to_token.iteritems(): self.lemma_to_token[lemma] = set([match.lower() for match in match_tokens]) def extract_from_item(self, item): extracted = [] url = item.get('url') if not url: logger.warn('skipping item without url') return document = item.get(self.document_key) if not document: return elif isinstance(document, list): document = '\n'.join(document) # Sentence splitting sentences = self.splitter.split(document) tokens = 0 for sentence in sentences: tagged = [(token, pos) for token, pos, lemma in self.tagger.tag_one(sentence)] # Parsing via grammar parsed = self.parser.parse(tagged) # Loop over sub-sentences that match the grammar for grammar_match in parsed.subtrees(lambda t: t.label() == 'CHUNK'): logger.debug("Grammar match: '%s'" % grammar_match) # Look up the LU for token, pos in grammar_match.leaves(): # Restrict match to sub-sentence verbs only if pos.startswith('V'): for lemma, match_tokens in self.lemma_to_token.iteritems(): if token.lower() in match_tokens: # Return joined chunks only # TODO test with full sentence as well # TODO re-constitute original text (now join on space) text = ' '.join([leaf[0] for leaf in grammar_match.leaves()]) logger.debug("Extracted sentence: '%s'" % text) logger.debug("Sentence token '%s' is in matches %s" % (token, match_tokens)) logger.debug("Extracted sentence: %s" % text) extracted.append({ 'lu': lemma, 'text': text, 'tagged': tagged, 'url': url, }) if extracted: logger.debug("%d sentences extracted. Removing the full text from the item ...", len(extracted)) item.pop(self.document_key) return item, extracted else: logger.debug("No sentences extracted. Skipping the whole item ...")
del sents
t3 = datetime.now()

# pos_sents =
# [[('In', 'IN'),
#   ('the', 'DT'),
#   ('land', 'NN'),
#   ('of', 'IN'),
#   ('submarines', 'NNS'),
#   ('.', '.')],
#  .... ]

matcheur = RegexpParser("""
    truc: {<JJ.*>*<NN.*>+(<P|IN> <JJ.*>*<NN.*>+)*}
    """)

# read/analyse: collect the expressions matching our search
recog_trees = []
for s in pos_sents:
    reconnu = matcheur.parse(s)
    recog_trees.append(reconnu)

del pos_sents
t4 = datetime.now()

# [('We', 'PRP'), ('all', 'DT'), ('live', 'VBP'), ('in', 'IN'), ('a', 'DT'), ('yellow', 'JJ'), ('submarine', 'NN'), ('.', '.')]
# (S
#   We/PRP
#   all/DT
class Extraction: """This class is used to extract nouns, proper nouns, phrases from text""" def __init__(self, word_tokenize=None, sent_tokenize=None, pos_tag=None, stop_words=None, punct=None, grammar=chunk_grammar_propernouns): self._word_tokenize = word_tokenize if word_tokenize else nltk.word_tokenize self._sent_tokenize = sent_tokenize if sent_tokenize else nltk.sent_tokenize self._pos_tag = pos_tag if pos_tag else nltk.pos_tag self._stop_words = stop_words if stop_words else set(nltk.corpus.stopwords.words('english')) self._punct = punct if punct else set(string.punctuation) self._chunk_grammar = grammar self._chunker = RegexpParser(self._chunk_grammar) def extract_chunks_sent(self, sent): """ Extract chunk phrases from a sentence. :param sent: a sentence level text. :return: chunk phrases """ tags = self._pos_tag(self._word_tokenize(sent)) chunks = nltk.chunk.tree2conlltags(self._chunker.parse(tags)) # join constituent chunk words into a single chunked phrase return [' '.join(word for word, pos, chunk in group) for key, group in itertools.groupby(chunks, lambda (word, pos, chunk): chunk != 'O') if key] def extract_chunks_doc(self, text): """ Extract chunk phrases from a document. :param text: a document level text :return: chunk phrases """ sents = self._sent_tokenize(text) sents = [s for s in sents if s] return list(itertools.chain.from_iterable(map(self.extract_chunks_sent, sents))) def extract_words_sent(self, sent, good_tags=set(['NN', 'NNS'])): """ Extract desired words from a sentence. :param sent: a sentence level text :param good_tags: desired word tags :return: words with desired word tags """ tagged_words = self._pos_tag(self._word_tokenize(sent)) words = [word for word, tag in tagged_words if tag in good_tags and word.lower() not in self._stop_words and not all(char in self._punct for char in word)] return list(set(words)) def extract_words_doc(self, text, good_tags=set(['NN', 'NNS'])): """ Extract desiredwords from document :param text: a document level text :param good_tags: desired word tags :return: words with desired word tags """ sents = self._sent_tokenize(text) sents = [s for s in sents if s] func_extract = lambda x: self.extract_words_sent(x, good_tags) words = list(itertools.chain.from_iterable(map(func_extract, sents))) return list(set(words))
def __init__(self):
    from nltk import RegexpTagger
    from nltk import RegexpParser
    self.tagger = RegexpTagger(patterns)
    self.chunker = RegexpParser(grammar, trace=COPYRIGHT_TRACE)
def InfoExtractor(text): ### Regex Expressions ### ######################### regex_email = re.compile(r'([a-zA-Z0-9._-]+@[a-zA-z0-9._-]+\.[^\s]*)',re.IGNORECASE | re.UNICODE) regex_phone = re.compile(r'(\d+[\-\+\(]?\d+[\)\-\s]?\d+[\-\s]?\d+)', re.UNICODE) regex_DOB = re.compile(r'([0-3]?[0-9](?:\.|\/|\-|\s)[0-3]?(?:[0-9]|' + r'(?:Feb|Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|January' + r'|February|March|April|May|June|July|August|September|October|' + r'November|December))(?:\.|\/|\-|\s)(?:[0-9]{2})?[0-9]{2})',re.IGNORECASE | re.UNICODE) #regex_phone = re.compile(r'\s*(?:\+?(\d{1,3}))?([-. (]*(\d{3})[-. )]*)?((\d{3})[-. ]*(\d{2,4})(?:[-.x ]*(\d+))?)$',re.IGNORECASE | re.UNICODE) info = dict() regex = { 'email':regex_email, 'phone':regex_phone, 'DOB':regex_DOB } for exp in regex.keys(): info[exp] = regex[exp].findall(text) #Filtering phone numbers info['phone'] = [x for x in info['phone'] if len(x)>5] print text ### Sent Tokenize ### ###################### sent = sent_tokenize(text.decode("utf8")) print sent print ### Word Tokenize ### ##################### sent = [ word_tokenize(word) for word in sent ] #print words sent = [pos_tag(word) for word in sent] #print sent[0] #print sent #print sent[1] grammar = "NP: {<DT>?<JJ>*<NN>}" cp = RegexpParser(grammar) result = cp.parse(sent[0]) #print result #result.draw() #print sent ''' raw_tuples = sent[0].split('\n') for line in raw_tuples: try: key, value = line.split('\t') print key, value print except: pass ''' #return None return info
return token def sentimentanalysis(texto): testimonial = TextBlob(texto) for zen in testimonial.words: print zen.translate(to="en") chunker = RegexpParser(""" ENTI: {<NNP|NNPS>+<NNP|NNPS|NN|NNS>} # Nouns and Adjectives, terminated with Nouns {<NNP|NNPS><IN><NNP|NNPS>} {<NNP|NNPS>} ENTIDACOMP: {<DT><NN|NNS><ENTI>} {<DT><NN|NNS><IN><ENTI>} {<ENTI><IN><ENTI>} {<ENTI|ENTIDACOMP><IN><ENTI|ENTIDACOMP>} # Above, connected with in/of/etc... {<ENTI|ENTIDACOMP><IN><ENTI|ENTIDACOMP><IN><ENTI|ENTIDACOMP>} ENTIDACOMP2: {<ENTI|ENTIDACOMP><IN><ENTI|ENTIDACOMP>} """) """ NBAR: {<NN.*|JJ>*<NN.*>} # Nouns and Adjectives, terminated with Nouns NP: {<NBAR><IN><NBAR>} # Above, connected with in/of/etc... """ #sentimentanalysis("su") print 'asadasdasdasdsadadsa'
class NPExtractor(Persistent): """ """ implements(ITermExtractor) def __init__(self): """ """ self.filter = DefaultFilter() self.tokenizer = getUtility(ITokenizer, name="collective.classification.tokenizers.NLTKTokenizer") self.tagger = getUtility(IPOSTagger, name="collective.classification.taggers.PennTreebankTagger") self.tagger_metadata = {'type':'Pen TreeBank','categories':[]} self.np_grammar = r""" NP: {<JJ>*<NN>} # chunk determiners, adjectives and nouns {<NNP>+} # chunk proper nouns """ self.np_finder = RegexpParser(self.np_grammar) def _add(self,norm, terms): terms.setdefault(norm, 0) terms[norm] += 1 @ram.cache(_extractor_cachekey) def extract(self,text): """ """ tokens = self.tokenizer.tokenize(text) tagged_terms = self.tagger.tag(tokens) terms = {} np_terms = {} noun_phrases = [ node for node in self.np_finder.parse(tagged_terms) if not isinstance(node,tuple)] for node in noun_phrases: coll_tag = tree2conlltags(node) if len(coll_tag) > 1: mterm = [ term.lower() for (term,tag,temp) in coll_tag if len(term)>1 ] mterm = ' '.join(mterm) self._add(mterm,np_terms) for (term,tag,temp) in coll_tag: if tag.startswith('N') and len(term)>1: if tag in ['NNS','NNPS']: term = singularize(term) self._add(term.lower(),terms) for term in terms.keys(): if not self.filter(term,terms[term]): del terms[term] for term in np_terms.keys(): if not self.filter(term,np_terms[term]): del np_terms[term] return (terms,np_terms) def setTagger(self,tagger,tagger_metadata={}): self.tagger = tagger if not tagger_metadata: self.tagger_metadata['type']='unknown' else: self.tagger_metadata = tagger_metadata
def parse_features(self, review):
    cp = RegexpParser(self.grammar)
    return cp.parse(review)