import nltk
from nltk.stem import WordNetLemmatizer
from stopwordList import getList


def extractNP(CONTENT):
    stopwords = getList()
    grammar = r"""
        NBAR:
            {<NN.*|JJ>*<NN.*>}      # nouns and adjectives, terminated with a noun

        NP:
            {<NBAR>}
            {<NBAR><IN><NBAR>}      # two NBARs joined by a preposition
    """
    chunker = nltk.RegexpParser(grammar)  # create a chunker with the parser
    # sentences = nltk.sent_tokenize(CONTENT)
    lemmaobj = WordNetLemmatizer()
    words = []
    paragraphs = [p for p in CONTENT.split('\n') if p]
    for para in paragraphs:
        sentences = [s for s in nltk.sent_tokenize(para) if s]
        for sentence in sentences:
            word = [w.lower() for w in nltk.word_tokenize(sentence)]
            taggedwords = nltk.pos_tag(word)
            tree = chunker.parse(taggedwords)
            temp = []
            for subtree in tree.subtrees():
                if subtree.label() == "NP":
                    for leaves in subtree.leaves():
                        w = leaves[0].lower()
                        if w not in stopwords:
                            w = lemmaobj.lemmatize(w)
                            temp.append(w)
                    if temp != []:
                        words.append(temp)
                        temp = []
    return words
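# Illustrative usage sketch (not part of the original module). It assumes the
# NLTK data packages "punkt", "averaged_perceptron_tagger" and "wordnet" have
# been downloaded, and that stopwordList.getList() is importable.
#
#   sample = "Keyword extraction finds the most relevant noun phrases in a document."
#   for phrase in extractNP(sample):
#       print(phrase)   # each phrase is a list of lemmatized, non-stopword tokens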
import operator
import sys
from textblob import TextBlob
from rake import RakeKeywordExtractor
from textblob.np_extractors import ConllExtractor
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import WordPunctTokenizer
from textblob.taggers import NLTKTagger
from stopwordList import getList
import codecs

## GLOBAL VARIABLES
top_fraction = 1
LEMMA_OBJ = WordNetLemmatizer()
tokenizer = WordPunctTokenizer()
nltk_tagger = NLTKTagger()
stopwords = getList()
COLL_OBJ = ConllExtractor()


def extractKeywords(phrase_list):
    RAKE_OBJ = RakeKeywordExtractor(set([]))
    word_scores = RAKE_OBJ._calculate_word_scores(phrase_list)
    phrase_scores = RAKE_OBJ._calculate_phrase_scores(phrase_list, word_scores)
    sorted_phrase_scores = sorted(phrase_scores.iteritems(),
                                  key=operator.itemgetter(1), reverse=True)
    n_phrases = len(sorted_phrase_scores)
    return [x[0] for x in sorted_phrase_scores[0:int(n_phrases)]]


def extractChunks(CONTENT):
def __init__(self, additional_stopwords):
    # self.stopwords = set(nltk.corpus.stopwords.words())
    self.stopwords = set(getList())
    self.stopwords = self.stopwords | additional_stopwords
    self.top_fraction = 4  # consider the top quarter of candidate keywords by score
def __init__(self):
    # self.stopwords = set(nltk.corpus.stopwords.words())
    self.stopwords = getList()
    self.top_fraction = 2  # consider the top half of candidate keywords by score
def __init__(self, additional_stopwords):
    # self.stopwords = set(nltk.corpus.stopwords.words())
    self.stopwords = set(getList())
    self.stopwords = self.stopwords | additional_stopwords
    self.top_fraction = 3  # consider the top third of candidate keywords by score
def __init__(self):
    # self.stopwords = set(nltk.corpus.stopwords.words())
    self.stopwords = getList()
    self.top_fraction = 1.5  # consider the top two-thirds of candidate keywords by score
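# Sketch of the scoring that the _calculate_word_scores / _calculate_phrase_scores
# calls used above presumably perform, following the standard RAKE formulation
# (Rose et al., 2010). The real implementation lives in the imported `rake`
# module; this is only an illustrative approximation.
def rake_word_scores_sketch(phrase_list):
    freq, degree = {}, {}
    for phrase in phrase_list:                  # each phrase is a list of words
        for word in phrase:
            freq[word] = freq.get(word, 0) + 1
            degree[word] = degree.get(word, 0) + (len(phrase) - 1)
    # word score = deg(w) / freq(w), where deg(w) counts the word itself plus
    # the words it co-occurs with inside candidate phrases
    return {w: (degree[w] + freq[w]) / float(freq[w]) for w in freq}


def rake_phrase_scores_sketch(phrase_list, word_scores):
    # a candidate phrase scores the sum of its member word scores
    return {" ".join(p): sum(word_scores[w] for w in p) for p in phrase_list}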
import operator
import sys
from textblob import TextBlob
from rake import RakeKeywordExtractor
from textblob.np_extractors import ConllExtractor
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import WordPunctTokenizer
from textblob.taggers import NLTKTagger
from stopwordList import getList
import codecs

## GLOBAL VARIABLES
top_fraction = 1
LEMMA_OBJ = WordNetLemmatizer()
tokenizer = WordPunctTokenizer()
nltk_tagger = NLTKTagger()
stopwords = getList()
COLL_OBJ = ConllExtractor()


def rake_extract(phrase_list):
    RAKE_OBJ = RakeKeywordExtractor(set([]))
    word_scores = RAKE_OBJ._calculate_word_scores(phrase_list)
    phrase_scores = RAKE_OBJ._calculate_phrase_scores(phrase_list, word_scores)
    sorted_phrase_scores = sorted(phrase_scores.iteritems(),
                                  key=operator.itemgetter(1), reverse=True)
    n_phrases = len(sorted_phrase_scores)
    return sorted_phrase_scores[0:int(n_phrases)]


# FILE = open(sys.argv[1], "r")
FILE = codecs.open(sys.argv[1], "r", "iso8859-15")
CONTENT = FILE.read()
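# Assumed end-to-end flow (illustrative only; the driver above stops after
# reading the input file): the noun-phrase lists produced by extractNP() would
# typically be passed to rake_extract() and the scored phrases printed, e.g.
#
#   phrase_list = extractNP(CONTENT)
#   for phrase, score in rake_extract(phrase_list):
#       print(phrase, score)
#
# Run as:  python <this_script>.py <input_file>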