def extract_candidate_chunks(sents, grammar=GRAMMAR, tagged=False, **kwargs):
    """
    Extracts key chunks based on a grammar for a list of tokenized sentences.
    If the sentences are already tokenized and tagged, pass in: tagged=True
    """
    normalizer = Normalizer(**kwargs)
    chunker = RegexpParser(grammar)

    for sent in sents:
        # Tokenize and tag sentences if necessary
        if not tagged:
            sent = nltk.pos_tag(nltk.wordpunct_tokenize(sent))

        # Parse with the chunker if we have a tagged sentence
        if not sent:
            continue
        chunks = tree2conlltags(chunker.parse(sent))

        # Extract candidate phrases from our parsed chunks
        # (tokens outside any chunk carry the IOB tag 'O')
        chunks = [
            " ".join(word for word, pos, chunk in group).lower()
            for key, group in groupby(
                chunks, lambda term: term[-1] != 'O'
            )
            if key
        ]

        # Yield candidates that are not filtered by stopwords and punctuation.
        for chunk in normalizer.normalize(chunks):
            yield chunk
def extract_candidate_phrases(sents, grammar=GRAMMAR, tagged=False):
    # Create the chunker that uses our grammar
    chunker = RegexpParser(grammar)

    for sent in sents:
        # Tokenize and tag sentences if necessary
        if not tagged:
            sent = nltk.pos_tag(nltk.word_tokenize(sent))

        # Parse the sentence, converting the parse tree into a tagged sequence
        sent = normalize(sent)
        if not sent:
            continue
        chunks = tree2conlltags(chunker.parse(sent))

        # Extract phrases and rejoin them with space
        phrases = [
            " ".join(word for word, pos, chunk in group).lower()
            for key, group in groupby(
                chunks, lambda term: term[-1] != 'O'
            )
            if key
        ]

        for phrase in phrases:
            yield phrase
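A minimal usage sketch for the function above. GRAMMAR and normalize() are assumed to be defined elsewhere in the same module, so the grammar string passed here is only a representative value, not the module's actual constant.

# Hypothetical usage of extract_candidate_phrases; the grammar value below is an
# assumption standing in for the module-level GRAMMAR constant.
sample_sents = ["Supervised keyphrase extraction requires annotated training data."]
for phrase in extract_candidate_phrases(
        sample_sents, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'):
    print(phrase)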
def build_vocabulary(self):
    """
    Generate a list of candidate phrases from the documents, using POS tagging
    and chunking functionality of nltk.
    """
    stop_words = set(stopwords.words('english'))
    vocabulary = []
    for doc in self.documents:
        words = []
        candidates = []
        clean_doc = text_cleaner(doc)
        sentences = sent_tokenize(clean_doc)
        words.extend([word_tokenize(sentence) for sentence in sentences])
        tagged_words = pos_tag_sents(words)

        grammar = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
        chunker = RegexpParser(grammar)

        # split into a private function
        all_tag = chain.from_iterable(
            [tree2conlltags(chunker.parse(tag)) for tag in tagged_words])
        for key, group in groupby(all_tag, lambda tag: tag[2] != 'O'):
            candidate = ' '.join([word for (word, pos, chunk) in group])
            if key is True and candidate not in stop_words:
                candidates.append(candidate)
        vocabulary.append(candidates)

    vocabulary = list(chain(*vocabulary))
    vocabulary = list(np.unique(vocabulary))
    self.vocabulary = vocabulary
def generate_candidate(texts, method='phrase', remove_punctuation=True):
    """
    Generate word candidates from a given string

    Parameters
    ----------
    texts: str, input text string
    method: str, method to extract candidate words, either 'word' or 'phrase'

    Returns
    -------
    candidates: list, list of candidate words
    """
    words_ = list()
    candidates = list()

    # tokenize texts to list of sentences of words
    sentences = sent_tokenize(texts)
    for sentence in sentences:
        if remove_punctuation:
            sentence = punct_re.sub(' ', sentence)  # remove punctuation
            # sentence = re.sub(r'[^\w]', ' ', sentence)
        words = word_tokenize(sentence)
        words = list(map(lambda s: s.lower(), words))
        words_.append(words)
    tagged_words = pos_tag_sents(words_)  # POS tagging
    words_.clear()

    if method == 'word':
        tags = set(['JJ', 'JJR', 'JJS', 'NN', 'NNP', 'NNS', 'NNPS'])
        tagged_words = chain.from_iterable(tagged_words)
        for word, tag in tagged_words:
            if tag in tags and word.lower() not in stop_words:
                candidates.append(word)
    elif method == 'phrase':
        # grammar = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
        grammar = r'KT: {(<JJ><NN.*>)' \
                  r' | (<NN.*><NN.*>)' \
                  r' | (<NN.*><NN.*><NN.*>)' \
                  r' | (<JJ><JJ><NN.*>+)' \
                  r' | (<JJ><NN.*><NN.*>)' \
                  r' | (<NN.*><JJ><NN.*>)' \
                  r' | (<NN.*><IN><NN.*>)' \
                  r' | (<JJ><NN.*><IN><NN.*>)' \
                  r' | (<NN.*><IN><JJ><NN.*>)' \
                  r' | (<JJ><NN.*><IN><JJ><NN.*>) }'
        chunker = RegexpParser(grammar)
        all_tag = chain.from_iterable(
            [tree2conlltags(chunker.parse(tag)) for tag in tagged_words])
        for key, group in groupby(all_tag, lambda tag: tag[2] != 'O'):
            candidate = ' '.join([word for (word, pos, chunk) in group])
            if key is True and candidate not in stop_words:
                candidates.append(candidate)
    else:
        print("Use either 'word' or 'phrase' in method")
    return candidates
def buildchunkerlist(grammerlst, tagged):
    gtree = []
    for g in grammerlst:
        chunker = RegexpParser(g)
        OP = chunker.parse(tagged)
        if OP.height() >= 3:
            gtree.append(OP.subtrees(lambda t: t.height() == 2))
    return gtree
def parseRelatedFeature(sent, tagged):
    chunker = RegexpParser('''
        OP5: {<.*>+<NN>?<CD><.*>+<NN>?}
        ''')
    OP = chunker.parse(tagged)
    if OP.height() >= 3:
        for m in OP.subtrees(lambda t: t.height() == 2):
            for (word, tag) in m:
                if tag == "NN" and r3.match(word):
                    return True
class KeyPhraseGenerator():
    """ Extracts keyphrases from input list of strings. """

    def __init__(self, grammar=GRAMMAR, stopwords=STOPWORDS):
        self.chunker = RegexpParser(grammar)
        self.stopwords = stopwords

    def clean_text(self, txt):
        """ Removes emoji and urls from text. """
        cleaned = cleaner.remove_emojis(txt)
        cleaned = cleaner.remove_urls(cleaned)
        return cleaned

    def clean_tagged_text(self, tagged_text):
        """ Remove punctuation from tagged text. """
        punct_tagged = lambda word: all(
            unicat(char).startswith("P") and char != "," for char in word)
        cleaned = filter(lambda t: not punct_tagged(t[0]), tagged_text)
        return list(cleaned)

    def extract_keyphrases_single(self, txt):
        """ Yields keyphrases for one piece of text. """
        for sent in txt:
            sent = self.clean_tagged_text(sent)
            if not sent:
                continue
            chunks = tree2conlltags(self.chunker.parse(sent))
            phrases = [
                " ".join(word for word, pos, chunk in group).lower()
                for key, group in groupby(chunks, lambda term: term[-1] != "O")
                if key
            ]
            for phrase in phrases:
                if phrase.lower() not in self.stopwords and len(phrase) > 2:
                    yield phrase

    def extract_keyphrases(self, txt_list):
        """ Returns keyphrases for input list of strings. """
        key_docs = []
        for txt in txt_list:
            tagged_doc = []
            txt = self.clean_text(txt)
            for sent in nltk.sent_tokenize(txt):
                tagged_doc.append(nltk.pos_tag(nltk.word_tokenize(sent)))
            key_docs.append(list(self.extract_keyphrases_single(tagged_doc)))
        return key_docs
def getConcepts(text):
    grammar = """
        CONCEPT: {(<DT>)?(<JJ>)?<NN|NNS>+}
        """
    chunker = RegexpParser(grammar)
    taggedText = pos_tag(word_tokenize(text))
    textChunks = chunker.parse(taggedText)
    current_chunk = []
    for i in textChunks:
        if type(i) == Tree and i.label() == "CONCEPT":
            current_chunk.append(" ".join([token for token, pos in i.leaves()]))
    return current_chunk
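A quick, hypothetical call to getConcepts. The sample sentence is illustrative only, and the exact chunks returned depend on the NLTK tokenizer and tagger models installed.

# Hypothetical usage; output depends on the installed NLTK tagger models.
if __name__ == "__main__":
    print(getConcepts("The solar panel converts sunlight into electrical energy."))
    # e.g. ['The solar panel', 'sunlight', 'electrical energy'] (tagger-dependent)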
def vocab_gen(texts, bool_key):
    list_word = []
    vocabs = []
    word_write = ""
    phrase_write = ""
    pos_write = ""

    sentences = sent_tokenize(texts)
    sentence_write = "\n".join(sentences)
    for sentence in sentences:
        words = word_tokenize(sentence)
        words = list(map(lambda s: s.lower(), words))
        list_word.append(words)

    words_w_pos = pos_tag_sents(list_word)  # POS
    dumb = [j for sub in words_w_pos for j in sub]
    dumb = pos_tag_sents(dumb)
    dumb = [j for sub in dumb for j in sub]
    for i in dumb:
        pos_write += str(i)
        pos_write += "\n"

    # define grammar to pull out the phrases
    grammar = r'KT: ' \
              r'{' \
              r'(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+' \
              r'}'
    grammar = RegexpParser(grammar)
    all_tag = chain.from_iterable(
        [tree2conlltags(grammar.parse(tag)) for tag in words_w_pos])
    for key, group in groupby(all_tag, lambda tag: tag[2] != 'O'):
        vocabs_temp = ' '.join([word for (word, pos, chunk) in group])
        if bool_key == 'Phrase':
            if key is True and vocabs_temp not in stop_words and len(
                    vocabs_temp) > 2 and (' ' in vocabs_temp) == True:
                vocabs.append(vocabs_temp)
                phrase_write += vocabs_temp
                phrase_write += "\n"
        else:
            if key is True and vocabs_temp not in stop_words and len(
                    vocabs_temp) > 2 and (' ' in vocabs_temp) == False:
                vocabs.append(vocabs_temp)
                word_write += vocabs_temp
                word_write += "\n"

    update_file = open(vocabs_word_path, 'w')
    update_file.write(word_write)
    if bool_key == 'Phrase':
        update_file = open(vocabs_phrase_path, 'w')
        update_file.write(phrase_write)
    update_file = open(sentence_path, 'w')
    update_file.write(sentence_write)
    update_file = open(pos_path, 'w')
    update_file.write(pos_write)
    return vocabs
def extract_from_sentences(sentences, add_verbs=True, language="english"):
    """
    Processes Sentence objects to calculate contained Noun Phrases based on a
    given grammar and maps them to the sentences they occur in.

    :param sentences: A list of Sentence objects.
    :param add_verbs: Optional. Default: True. Whether or not verbs are to be
        added to the mapping.
    :param language: Optional. Default: English. The language of the sentences.
    :return: A dictionary mapping tokens to the sentence IDs of the sentences
        they appear in.
    """
    # produce the mapping of sentences to their contained (word, pos) tuples
    pos_dictionary = {}
    NP_GRAMMAR_COMPOUND = "NP: {<JJ.*>*(<N.*>|<JJ.*>)+((<IN>|<TO>)?<JJ.*>*(<N.*>|<JJ.*>)+)*((<CC>|,)<JJ.*>*(<N.*>|<JJ.*>)+((<IN>|<TO>)?<JJ.*>*(<N.*>|<JJ.*>)+)*)*}"
    for sentence in sentences:
        pos_dictionary[sentence.sentence_id] = [
            (token, tag) for token, tag in sentence.tokens.items()
        ]
    parser_cmp = RegexpParser(NP_GRAMMAR_COMPOUND)

    term2sentence_id = {}
    lemmatizer = WordNetLemmatizer()
    for sentence_id, pos_tagged_tokens in pos_dictionary.items():
        if add_verbs:
            # updating the inverse occurrence index with verbs
            for subject, tag in pos_tagged_tokens:
                # check if subject is tagged as a verb
                if tag.startswith("VB"):
                    verb = lemmatizer.lemmatize(subject, "v").lower()
                    if verb not in stopwords.words(language):
                        if verb not in term2sentence_id:
                            term2sentence_id[verb] = set()
                        term2sentence_id[verb].add(sentence_id)

        # trying to parse the sentence_id into a top-level chunk tree
        tree = parser_cmp.parse(pos_dictionary[sentence_id])

        # getting the top-level tree triples and decomposing the NPs
        cmp_triples, simple_trees = get_cooccurence([tree], ignore_stopwords=False, language=language)
        smp_triples, _ = get_cooccurence(simple_trees, ignore_stopwords=True, language=language)

        # updating the inverse occurrence index with NPs
        for subject, _, objecT in cmp_triples + smp_triples:
            if subject.lower() not in term2sentence_id:
                term2sentence_id[subject.lower()] = set()
            if objecT.lower() not in term2sentence_id:
                term2sentence_id[objecT.lower()] = set()
            term2sentence_id[subject.lower()].add(sentence_id)
            term2sentence_id[objecT.lower()].add(sentence_id)
    return term2sentence_id
class KeyphraseExtractor(BaseEstimator, TransformerMixin):
    """
    Wraps a PickledCorpusReader consisting of pos-tagged documents.
    """
    def __init__(self, grammar=GRAMMAR):
        self.grammar = grammar
        self.chunker = RegexpParser(self.grammar)

    def normalize(self, sent):
        """
        Removes punctuation from a tokenized/tagged sentence and
        lowercases words.
        """
        is_punct = lambda word: all(unicat(char).startswith('P') for char in word)
        sent = filter(lambda t: not is_punct(t[0]), sent)
        sent = map(lambda t: (t[0].lower(), t[1]), sent)
        return list(sent)

    def extract_keyphrases(self, document):
        """
        For a document, parse sentences using our chunker created by
        our grammar, converting the parse tree into a tagged sequence.
        Yields extracted phrases.
        """
        for sents in document:
            for sent in sents:
                sent = self.normalize(sent)
                if not sent:
                    continue
                chunks = tree2conlltags(self.chunker.parse(sent))
                phrases = [
                    " ".join(word for word, pos, chunk in group).lower()
                    for key, group in groupby(
                        chunks, lambda term: term[-1] != 'O'
                    )
                    if key
                ]
                for phrase in phrases:
                    yield phrase

    def fit(self, documents, y=None):
        return self

    def transform(self, documents):
        for document in documents:
            yield list(self.extract_keyphrases(document))
def get_tokens(text):
    word_list = []
    voc = []
    voc_write = ''

    sent = sent_tokenize(text)
    word_single = word_tokenize(text)

    # mode 'w' creates the file if it does not exist yet
    with open('token_log.txt', 'w', encoding='UTF8') as k:
        k.write(str(word_single))

    for i in sent:
        word = word_tokenize(i)
        words = list(map(lambda s: s.lower(), word))
        word_list.append(words)
    words_pos = pos_tag_sents(word_list)

    with open('pos_log.txt', 'w', encoding='UTF8') as f:
        f.write(str(words_pos))

    # note: Penn Treebank tags are upper-case, so the preposition tag is <IN>
    grammar = r'KT: ' \
              r'{' \
              r'(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+' \
              r'}'
    grammar = RegexpParser(grammar)
    tags = chain.from_iterable(
        [tree2conlltags(grammar.parse(tag)) for tag in words_pos])
    for key, group in groupby(tags, lambda tag: tag[2] != 'O'):
        voc_temp = ' '.join([word for (word, pos, chunk) in group])
        if key is True and voc_temp not in stopwords.words(
                'english') and voc_temp != 'https':
            voc.append(voc_temp)
            voc_write += voc_temp
            voc_write += '\n'

    with open('voc_log.txt', 'w', encoding='UTF8') as f:
        f.write(voc_write)
    return voc
def getInstances(text):
    grammar = """
        PRE: {<NNS|NNP|NN|NP|JJ|UH>+}
        MID: {<DT|IN|POS|FW|-|NP|NPS|NN|NNS>+}
        INSTANCE: {(<DT+>)?(<JJ+>)?<PRE>(<MID><PRE>)?}
        """
    chunker = RegexpParser(grammar)
    taggedText = pos_tag(word_tokenize(text))
    textChunks = chunker.parse(taggedText)
    current_chunk = []
    for i in textChunks:
        if type(i) == Tree and i.label() == "INSTANCE":
            current_chunk.append(" ".join([token for token, pos in i.leaves()]))
    return current_chunk
class KeyphraseExtractor(BaseEstimator, TransformerMixin):
    """
    Wraps a PickledCorpusReader consisting of pos-tagged documents.
    """
    def __init__(self, grammar=GRAMMAR):
        self.grammar = grammar
        self.chunker = RegexpParser(self.grammar)

    def normalize(self, sent):
        """
        Removes punctuation from a tokenized/tagged sentence and
        lowercases words.
        """
        is_punct = lambda word: all(unicat(char).startswith('P') for char in word)
        sent = filter(lambda t: not is_punct(t[0]), sent)
        sent = map(lambda t: (t[0].lower(), t[1]), sent)
        return list(sent)

    def extract_keyphrases(self, document):
        """
        For a document, parse sentences using our chunker created by
        our grammar, converting the parse tree into a tagged sequence.
        Yields extracted phrases.
        """
        for sents in document:
            for sent in sents:
                sent = self.normalize(sent)
                if not sent:
                    continue
                chunks = tree2conlltags(self.chunker.parse(sent))
                phrases = [
                    " ".join(word for word, pos, chunk in group).lower()
                    for key, group in groupby(
                        chunks, lambda term: term[-1] != 'O'
                    )
                    if key
                ]
                for phrase in phrases:
                    yield phrase

    def fit(self, documents, y=None):
        return self

    def transform(self, documents):
        for document in documents:
            yield list(self.extract_keyphrases(document))
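A hedged usage sketch for the transformer above. The nesting (corpus, then document, then paragraph, then pos-tagged sentence) mirrors what the pos-tagged corpus reader mentioned in the docstring would yield; the sample text and the module-level GRAMMAR constant the constructor relies on are assumptions.

# Hypothetical usage; GRAMMAR is assumed to exist at module level, and each
# document is a list of paragraphs, each paragraph a list of pos-tagged sentences.
import nltk

text = "Regular expression chunking extracts candidate keyphrases from tagged sentences."
paragraph = [nltk.pos_tag(nltk.word_tokenize(s)) for s in nltk.sent_tokenize(text)]
document = [paragraph]
corpus = [document]

extractor = KeyphraseExtractor()
for keyphrases in extractor.transform(corpus):
    print(keyphrases)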
def generate_candidate(texts, method='word', remove_punctuation=False):
    """
    Generate word candidates from a given string

    Parameters
    ----------
    texts: str, input text string
    method: str, method to extract candidate words, either 'word' or 'phrase'

    Returns
    -------
    candidates: list, list of candidate words
    """
    words_ = list()
    candidates = list()

    # tokenize texts to list of sentences of words
    sentences = sent_tokenize(texts)
    for sentence in sentences:
        if remove_punctuation:
            sentence = punct_re.sub(' ', sentence)  # remove punctuation
        words = word_tokenize(sentence)
        words = list(map(lambda s: s.lower(), words))
        words_.append(words)
    tagged_words = pos_tag_sents(words_)  # POS tagging

    if method == 'word':
        tags = set(['JJ', 'JJR', 'JJS', 'NN', 'NNP', 'NNS', 'NNPS'])
        tagged_words = chain.from_iterable(tagged_words)
        for word, tag in tagged_words:
            if tag in tags and word.lower() not in stop_words:
                candidates.append(word)
    elif method == 'phrase':
        grammar = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
        chunker = RegexpParser(grammar)
        all_tag = chain.from_iterable(
            [tree2conlltags(chunker.parse(tag)) for tag in tagged_words])
        for key, group in groupby(all_tag, lambda tag: tag[2] != 'O'):
            candidate = ' '.join([word for (word, pos, chunk) in group])
            if key is True and candidate not in stop_words:
                candidates.append(candidate)
    else:
        print("Use either 'word' or 'phrase' in method")
    return candidates
def create_phrase_vocabulary(raw_data):
    '''
    Extract a vocabulary of noun phrases. TfidfVectorizer only extracts plain
    n-grams automatically, so to use a different format or a custom vocabulary,
    the vocabulary must be built explicitly.
    '''
    # grammar to extract the noun phrase
    grammar = r'NP: {(<JJ.*>* <VBN>? <NN.*>+ <IN>)? <JJ.*>* <VBG>? <NN.*>+}'

    # set the punctuation and chunker
    punct = set(string.punctuation)
    chunker = RegexpParser(grammar)

    def lambda_unpack(f):
        # helper to unpack the (word, pos, chunk) tuple
        return lambda args: f(*args)

    # tokenize and create pos tags per sentence, then get its IOB tags
    postag_sents = pos_tag_sents(word_tokenize(sent) for sent in raw_data)
    noun_phrases = list(
        chain.from_iterable(
            tree2conlltags(chunker.parse(tagged_sent))
            for tagged_sent in postag_sents))

    # join B-NP and I-NP tags as one noun phrase, excluding O tags
    merged_nounphrase = [
        ' '.join(stemmer.stem(word) for word, pos, chunk in group).lower()
        for key, group in itertools.groupby(
            noun_phrases, lambda_unpack(lambda word, pos, chunk: chunk != 'O'))
        if key
    ]

    # filter out terms shorter than three characters and pure punctuation
    all_nounphrases = [
        cand for cand in merged_nounphrase
        if len(cand) > 2 and not all(char in punct for char in cand)
    ]

    # select distinct noun phrases
    vocabulary = list(set(all_nounphrases))
    return vocabulary
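A hedged sketch of feeding the resulting vocabulary into scikit-learn's TfidfVectorizer, which the docstring above alludes to. Because the vocabulary entries are stemmed, a matching analyzer is needed for terms to line up; the analyzer below and the sample document are illustrative assumptions, and stemmer/word_tokenize are assumed to come from the surrounding module.

# Illustrative only: the stemmed vocabulary must be paired with a stemming analyzer,
# otherwise TfidfVectorizer's default n-grams will not match the vocabulary entries.
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["Noun phrase extraction builds a custom vocabulary for term weighting."]
vocab = create_phrase_vocabulary(docs)

def stemmed_ngram_analyzer(text):
    # hypothetical analyzer: stem tokens the same way the vocabulary was built
    tokens = [stemmer.stem(tok).lower() for tok in word_tokenize(text)]
    # emit unigrams up to 4-grams so multi-word phrases can match
    return [' '.join(tokens[i:i + n])
            for n in range(1, 5)
            for i in range(len(tokens) - n + 1)]

vectorizer = TfidfVectorizer(vocabulary=vocab, analyzer=stemmed_ngram_analyzer)
X = vectorizer.fit_transform(docs)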
def get_cooccurence(chunk_trees, ignore_stopwords=True, language="english"):
    """
    Parses chunk trees and gets co-occurrence of terms.

    :param chunk_trees: Trees from the NLTK RegexpParser, generated over
        POS-tagged sentences using the provided grammar.
    :param ignore_stopwords: Optional. Default: True. Whether stopwords are to
        be ignored or not.
    :param language: Optional. Default: English. The language of the texts over
        which the chunk trees were generated.
    :return: A list of co-occurring tokens and simple parse trees generated over
        the leaves of the chunks of the provided ones.
    """
    triples = []
    simple_trees = []
    lemmatizer = WordNetLemmatizer()
    NP_GRAMMAR_SIMPLE = "NP: {<JJ.*>*(<N.*>|<JJ.*>)+}"
    parser_simple = RegexpParser(NP_GRAMMAR_SIMPLE)
    for t in chunk_trees:
        entities = []
        for chunk in t:
            if isinstance(chunk, Tree) and chunk.label() == 'NP':
                # getting a tree for later processing of triples from the
                # simple noun phrases (if present)
                simple_trees.append(parser_simple.parse(chunk.leaves()))
                words = []
                for word, tag in chunk:
                    if (ignore_stopwords and word in stopwords.words(language)) or \
                            (not any(char.isalnum() for char in word)):
                        # do not process stopwords for simple trees, do not
                        # process purely non-alphanumeric characters
                        continue
                    if tag.startswith('N'):
                        words.append(lemmatizer.lemmatize(word, 'n'))
                    elif tag.startswith('J'):
                        words.append(lemmatizer.lemmatize(word, 'a'))
                    else:
                        words.append(word)
                if len(words) > 0:
                    entities.append("_".join(words))
        for e1, e2 in combinations(entities, 2):
            triples.append((e1, "close to", e2))
            triples.append((e2, "close to", e1))
    return triples, simple_trees
def chunk_location_sent(pos_text, temp_text):
    list_of_locs = list()
    chunk_grammar = r"""
        LOC: {((<CD>?<NNP>+<CD>?)|(<CD>?<NN>+<CD>?))+}
        """
    chunker = RegexpParser(chunk_grammar)
    chunked_article = chunker.parse(pos_text)
    for subtree in chunked_article.subtrees():
        if subtree.label() == 'LOC':
            NNPs = ' '.join((tuples[0] for tuples in list(subtree)))
            list_of_locs.append(NNPs)
    return list_of_locs
def getNounPhrases(self):
    featureSet = []
    # Handbook of NLP - Multiword Expressions, Timothy Baldwin and Su Nam Kim
    grammar = r"""
        NBAR: {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
        NP: {<NBAR>}
            {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
        """
    chunker = RegexpParser(grammar)

    for sentence in self.sentences:
        tokens = word_tokenize(sentence)
        if len(tokens) == 0:
            continue
        tagged = pos_tag(tokens)
        tree = chunker.parse(tagged)
        terms = []
        leafCollection = []
        # NLTK 3 exposes the chunk label via Tree.label() (the old .node attribute is gone)
        for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
            leafCollection.append(subtree.leaves())
        for leaf in leafCollection:
            term = [w for w, t in leaf if len(w) > 2]
            phrase = ' '.join(term)
            terms.append(phrase)
        featureSet += terms

    self.convertToFeatureDist(featureSet)
    self.helperObject.saveAllFeaturesExtracted(featureSet)
def chunk_name_sent(pos_text, temp_text):
    list_of_names = list()
    chunk_grammar = r"""
        NAME: {<NNP>+}
        """
    chunker = RegexpParser(chunk_grammar)
    chunked_article = chunker.parse(pos_text)
    for subtree in chunked_article.subtrees():
        if subtree.label() == 'NAME':
            NNPs = ' '.join((tuples[0] for tuples in list(subtree)))
            list_of_names.append(NNPs)
    return list_of_names
def extract_words(nodetext, t2, doc, location):
    try:
        grammar = "NP: {<JJ>*<NN>+}"
        phrases = []
        final_phrases = []
        for sent in sent_tokenize(nodetext):
            doc.add_sentence(Sentence(location, sent))
            tag_list = t2.tag(word_tokenize(sent))
            parser = RegexpParser(grammar)
            result = parser.parse(tag_list)
            for phrase in result:
                # NLTK 3 exposes the chunk label via .label() (the old .node attribute is gone)
                if isinstance(phrase, NLTREE.Tree) and phrase.label() == "NP":
                    phrases.append("_".join([word for word, pos in phrase.leaves()]))
    except TypeError:
        return []

    for phrase in phrases:
        if any(c.isdigit() for c in phrase):
            continue
        elif '.' in phrase:
            continue
        else:
            final_phrases.append(phrase)
    return final_phrases
grammar = """NP:{<DT>?<JJ>*(<NN.*>)+} PR:{<PRP.*>} """ #grammar for tagging noun phrases and pronouns #DT - determiners eg: The, a, an, my #JJ - adjectives #NN.* - any type of noun #PRP - personal pronoun eg: He, she, I, We, they rp = RegexpParser(grammar) count = 0 for s in listOfTaggedSents: chunkedTree = ParentedTree.convert( rp.parse(s)) #tree of chunked parts of the sentence #ParentedTree is used to convert tagged words to tree structure neTree = ne_chunk(s) #tree with named entity tags #print (chunkedTree) #chunkedTree.draw() #neTree.draw() for n in chunkedTree: if isinstance(n, nltk.tree.Tree): if n.label() == 'NP': mostSigNoun = [ w for w in n if w[1] in ['NN', 'NNS', 'NNP', 'NNPS'] ] for ne in neTree: #ne contains nouns and pos if isinstance(ne, nltk.tree.Tree):
def apply_grammar(pos_words):
    grammar_parser = RegexpParser(GRAMMAR)
    return grammar_parser.parse(pos_words)
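A short, hypothetical call to apply_grammar. GRAMMAR is assumed to be a module-level chunk pattern, so the value mentioned in the comment is only a representative example, not the module's actual constant.

# Hypothetical usage; GRAMMAR is assumed to be defined in this module,
# e.g. something like r'NP: {<DT>?<JJ>*<NN.*>+}'.
import nltk

tagged = nltk.pos_tag(nltk.word_tokenize("The regular expression parser chunks tagged tokens."))
tree = apply_grammar(tagged)
tree.pprint()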
def tagChunk(self, taggedword, loops=2):
    ## Chunking
    cp = RegexpParser(self.grammar, loop=loops)
    print('tagged word')
    print(taggedword)
    return cp.parse(taggedword)
def regex_chunk(self, tagged, pattern):
    pr = RegexpParser(pattern)
    chunked = [pr.parse(sent) for sent in tagged]
    return chunked
        del words[i]
        i = i
        words_len = len(words)
    else:
        i = i + 1
        words_len = len(words)

words_only = words[1:]
i = 1
while (i < words_len):
    lmtzr.lemmatize(words[i])
    i = i + 1

pos_words = pos_tagger.tag(words_only)
parsed_out_pcfg = reg_parser.parse(pos_words)
pre_parsed_out = dependency_parser.parse(words_only)
dep = pre_parsed_out.__next__()
parsed_out = list(dep.triples())
Script_Word_Ct += len(pos_words)

i = 0
while (i < words_len - 1):
    tags = pos_words[i][1]
    if (i < len(pos_words) - 1 and tags == 'NP' and pos_words[i + 1][1] == 'PRP'):
        NP_PRP += 1
def summarizer(tex, reduce_per):

    def norm(word, pos='x'):
        # normalizes all words except proper nouns
        word = word.lower()
        if pos not in ['NNP', 'NNPS']:
            wnl = WordNetLemmatizer()
            word = wnl.lemmatize(word)
        return word

    sentList = sent_tokenize(tex)  # list of all tokenized sentences

    # dictionary key: sentence_number, value: all nouns in the sentence (normalised)
    sentNounDict = defaultdict(list)
    for s in sentList:
        for w, pos in pos_tag(word_tokenize(s)):
            if pos in ['NN', 'NNS', 'NNP', 'NNPS']:
                sentNounDict[sentList.index(s)].append(norm(w, pos))

    # dictionary key: (word, pos), value: all sentences it appears in (word normalised)
    wordSentDict = defaultdict(list)
    for s in sentList:
        for w, pos in pos_tag(word_tokenize(s)):
            wordSentDict[(norm(w, pos), pos)].append(sentList.index(s))

    # list of all nouns in the text
    listOfNouns = list(sorted(set([
        norm(w, pos)
        for s in sentList
        for w, pos in pos_tag(word_tokenize(s))
        if pos in ['NN', 'NNS', 'NNP', 'NNPS']
    ])))

    # list of sentences of tokenized words with pos tags - list[tuple(w, pos)]
    listOfTaggedSents = []
    for s in sentList:
        l = [(n, pos) for n, pos in pos_tag(word_tokenize(s))]
        listOfTaggedSents.append(l)

    mostSigNoun = []        # most recently encountered significant noun
    mostSigNounObject = []  # most recent significant noun which is not a person
    mostSigNounPerson = []  # most recent significant noun whose named entity is a person

    # key: (pronoun, sentence_num), value: list(list(tuple(noun, pos))) (noun not normalized)
    pronounNounDict = defaultdict(list)

    # grammar for tagging noun phrases and pronouns
    grammar = """NP:{<DT>?<JJ>*(<NN.*>)+}
                 PR:{<PRP.*>}
              """
    rp = RegexpParser(grammar)

    for s in listOfTaggedSents:
        begin = True
        chunkedTree = ParentedTree.convert(rp.parse(s))  # tree of chunked parts of the sentence
        neTree = ne_chunk(s)  # tree with named entity tags
        for n in chunkedTree:
            if isinstance(n, nltk.tree.Tree):
                if n.label() == 'NP':
                    if begin == True:
                        mostSigNoun = [
                            w for w in n if w[1] in ['NN', 'NNS', 'NNP', 'NNPS']
                        ]
                        for ne in neTree:
                            if isinstance(ne, nltk.tree.Tree):
                                if ne[0] in mostSigNoun:
                                    if ne.label() == 'PERSON':
                                        mostSigNounPerson = []
                                        mostSigNounPerson.append(ne[0])
                                    else:
                                        mostSigNounObject = []
                                        mostSigNounObject.append(ne[0])
                        begin = False
                if n.label() == 'PR':
                    pron = n[0][0].lower()
                    if pron in ['it', 'its']:  # for objects
                        if len(mostSigNounObject) > 0:
                            pronounNounDict[(pron, listOfTaggedSents.index(s))].append(mostSigNounObject)
                        else:  # if mostSigNounObject does not exist
                            pronounNounDict[(pron, listOfTaggedSents.index(s))].append(mostSigNoun)
                    else:
                        if len(mostSigNounPerson) > 0:
                            pronounNounDict[(pron, listOfTaggedSents.index(s))].append(mostSigNounPerson)
                        else:
                            pronounNounDict[(pron, listOfTaggedSents.index(s))].append(mostSigNoun)
                    begin = False
                    # adding the nouns corresponding to the pronouns to sentNounDict and wordSentDict
                    for v1 in pronounNounDict[(pron, listOfTaggedSents.index(s))]:
                        for v11 in v1:  # it is a list of lists
                            sentNounDict[listOfTaggedSents.index(s)].append(norm(v11[0], v11[1]))
                            wordSentDict[(norm(v11[0], v11[1]), v11[1])].append(listOfTaggedSents.index(s))

    # make the noun lists in sentNounDict unique
    for key, val in sentNounDict.items():
        val = list(set(val))
        sentNounDict[key] = val
    # the following code calculates the distance between two phrases:
    # key: (noun or noun(pronoun), sentence_num), value: position in the sentence from the beginning
    distance = defaultdict(int)
    for s in listOfTaggedSents:
        dist = 0
        chunkedTree = ParentedTree.convert(rp.parse(s))
        for n in chunkedTree:
            if isinstance(n, nltk.tree.Tree):
                if n.label() == 'NP':
                    tempNoun = [
                        w[0] for w in n if w[1] in ['NN', 'NNS', 'NNP', 'NNPS']
                    ]
                    for w in tempNoun:
                        distance[(norm(w), listOfTaggedSents.index(s))] = dist
                if n.label() == 'PR':
                    pron = n[0][0].lower()
                    tempNoun = pronounNounDict[(pron, listOfTaggedSents.index(s))]
                    for v1 in tempNoun:
                        for v11 in v1:
                            distance[(norm(v11[0], v11[1]), listOfTaggedSents.index(s))] = dist
                dist += 1

    # the following code assigns a relation factor between two nouns
    nounGraph = np.zeros((len(listOfNouns), len(listOfNouns)))
    for key, value in sentNounDict.items():
        for v1 in value:
            for v2 in value:
                d = 0
                if v2 != v1:
                    d = distance[v1, key] - distance[v2, key]
                nounGraph[listOfNouns.index(v1)][listOfNouns.index(v2)] += float(100 / (abs(d) + 1))

    # dict to hold noun priorities... key: noun (normalized), value: priority
    nounPriority = defaultdict(int)
    # dict to hold sentence priorities... key: sentence_num, value: priority
    sentencePriority = defaultdict(int)

    def calcNounPriority():
        # calculates the noun priority (sum of weights of all the edges attached
        # to this noun in the noun graph)
        total = 0
        i = 0
        for x in nounGraph:
            total = sum(x)
            nounPriority[listOfNouns[i]] = total
            i += 1

    def calcSentPriority():
        # calculates sentence priority (sum of priorities of all nouns in the sentence)
        for key, value in sentNounDict.items():
            total = 0
            for n in value:
                total += nounPriority[n]
            sentencePriority[key] = total

    calcNounPriority()
    calcSentPriority()

    reducingFactor = 0.9  # 10%
    summary = []  # list to hold the summary
    reduce_per = reduce_per / 100
    for i in range(int(len(sentencePriority) * reduce_per)):
        summary.append(max(sentencePriority.items(), key=lambda x: x[1]))
        j = summary[-1][0]
        for n in sentNounDict[j]:
            # reduce the priority of all nouns in the picked sentence
            nounPriority[n] *= reducingFactor
        del sentNounDict[j]
        del sentencePriority[j]  # remove the picked sentence
        calcSentPriority()  # recalculate sentence priority

    i = 1
    s_list = []
    for s in sorted(summary):
        s_list.append(sentList[s[0]])
        i += 1
    return s_list
def tagChunk(self, taggedword, loops=2):
    ## Chunking
    cp = RegexpParser(self.grammar, loop=loops)
    return cp.parse(taggedword)
class KeyphraseExtractor(BaseEstimator, TransformerMixin):
    """
    Extract adverbial and adjective phrases, and transform documents into lists
    of these keyphrases, with a total keyphrase lexicon limited by the nfeatures
    parameter and a document length limited/padded to doclen
    """
    def __init__(self, nfeatures=100000, doclen=60):
        self.grammar = r'KT: {(<RB.> <JJ.*>|<VB.*>|<RB.*>)|(<JJ> <NN.*>)}'
        # self.grammar = r'KT: {(<RB.*> <VB.>|<RB.>|<JJ.> <NN.*>)}'
        # self.grammar = r'KT: {<RB.>|<JJ.>}'
        self.chunker = RegexpParser(self.grammar)
        self.nfeatures = nfeatures
        self.doclen = doclen

    def normalize(self, sent):
        """
        Removes punctuation from a tokenized/tagged sentence and
        lowercases words.
        """
        is_punct = lambda word: all(unicat(c).startswith('P') for c in word)
        sent = filter(lambda t: not is_punct(t[0]), sent)
        sent = map(lambda t: (t[0].lower(), t[1]), sent)
        return list(sent)

    def extract_candidate_phrases(self, sents):
        """
        For a document, parse sentences using our chunker created by our
        grammar, converting the parse tree into a tagged sequence. Extract
        phrases, rejoin with a space, and yield the document represented as
        a list of its keyphrases.
        """
        for sent in sents:
            sent = self.normalize(sent)
            if not sent:
                continue
            chunks = tree2conlltags(self.chunker.parse(sent))
            phrases = [
                " ".join(word for word, pos, chunk in group).lower()
                for key, group in groupby(chunks, lambda term: term[-1] != 'O')
                if key
            ]
            for phrase in phrases:
                yield phrase

    def fit(self, documents, y=None):
        return self

    def get_lexicon(self, keydocs):
        """
        Build a lexicon of size nfeatures
        """
        keyphrases = [keyphrase for doc in keydocs for keyphrase in doc]
        fdist = FreqDist(keyphrases)
        counts = fdist.most_common(self.nfeatures)
        lexicon = [phrase for phrase, count in counts]
        return {phrase: idx + 1 for idx, phrase in enumerate(lexicon)}

    def clip(self, keydoc, lexicon):
        """
        Remove keyphrases from documents that aren't in the lexicon
        """
        return [
            lexicon[keyphrase] for keyphrase in keydoc
            if keyphrase in lexicon.keys()
        ]

    def transform(self, documents):
        docs = [list(self.extract_candidate_phrases(doc)) for doc in documents]
        lexicon = self.get_lexicon(docs)
        clipped = [list(self.clip(doc, lexicon)) for doc in docs]
        return sequence.pad_sequences(clipped, maxlen=self.doclen)