Example #1
import nltk
from nltk.stem import WordNetLemmatizer
from stopwordList import getList


def extractNP(CONTENT):
    stopwords = getList()
    # Chunk grammar: an NBAR is a run of nouns/adjectives ending in a noun,
    # and an NP is either a bare NBAR or two NBARs joined by a preposition.
    grammar = r"""
        NBAR:
            {<NN.*|JJ>*<NN.*>}
        NP:
            {<NBAR>}
            {<NBAR><IN><NBAR>}
        """
    chunker = nltk.RegexpParser(grammar)  # create a chunker with the parser
    lemmaobj = WordNetLemmatizer()
    words = []
    paragraphs = [p for p in CONTENT.split('\n') if p]
    for para in paragraphs:
        sentences = [s for s in nltk.sent_tokenize(para) if s]
        for sentence in sentences:
            tokens = [w.lower() for w in nltk.word_tokenize(sentence)]
            taggedwords = nltk.pos_tag(tokens)
            tree = chunker.parse(taggedwords)
            temp = []
            for subtree in tree.subtrees():
                if subtree.label() == "NP":
                    for leaf in subtree.leaves():
                        w = leaf[0].lower()
                        if w not in stopwords:
                            w = lemmaobj.lemmatize(w)
                            temp.append(w)
                    # flush the tokens collected for this NP chunk
                    if temp:
                        words.append(temp)
                        temp = []
    return words
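A minimal usage sketch (the input text is illustrative only, and it assumes the NLTK punkt, POS-tagger, and WordNet data have already been downloaded):

sample = ("Keyword extraction pulls out informative noun phrases.\n"
          "It relies on part-of-speech patterns and a stopword list.")
for chunk in extractNP(sample):
    print(chunk)  # each item is a list of lemmatized, non-stopword tokens from one NP chunk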
import operator
import sys
from textblob import TextBlob
from rake import RakeKeywordExtractor
from textblob.np_extractors import ConllExtractor
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import WordPunctTokenizer
from textblob.taggers import NLTKTagger
from stopwordList import getList
import codecs
## GLOBAL VARIABLES
top_fraction = 1
LEMMA_OBJ = WordNetLemmatizer()
tokenizer = WordPunctTokenizer()
nltk_tagger = NLTKTagger()
stopwords = getList()
COLL_OBJ = ConllExtractor()


def extractKeywords(phrase_list):
    RAKE_OBJ = RakeKeywordExtractor(set([]))
    word_scores = RAKE_OBJ._calculate_word_scores(phrase_list)
    phrase_scores = RAKE_OBJ._calculate_phrase_scores(phrase_list, word_scores)
    sorted_phrase_scores = sorted(phrase_scores.items(),
                                  key=operator.itemgetter(1),
                                  reverse=True)
    n_phrases = len(sorted_phrase_scores)
    return [x[0] for x in sorted_phrase_scores[0:n_phrases]]
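A hedged usage sketch, assuming this RakeKeywordExtractor follows the common RAKE recipe in which a phrase list is a list of word lists (the same shape extractNP produces above); the sample phrases are illustrative only:

phrase_list = [["keyword", "extraction"], ["noun", "phrase", "chunking"], ["stopword", "list"]]
for phrase in extractKeywords(phrase_list):
    print(phrase)  # phrases come back ordered from highest to lowest RAKE score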


def extractChunks(CONTENT):
Example #4
def __init__(self, additional_stopwords):
    # self.stopwords = set(nltk.corpus.stopwords.words())
    self.stopwords = set(getList())
    self.stopwords = self.stopwords | additional_stopwords
    self.top_fraction = 4  # consider the top quarter of candidate keywords by score

def __init__(self):
    # self.stopwords = set(nltk.corpus.stopwords.words())
    self.stopwords = getList()
    self.top_fraction = 2  # consider the top half of candidate keywords by score
Example #6
def __init__(self, additional_stopwords):
    # self.stopwords = set(nltk.corpus.stopwords.words())
    self.stopwords = set(getList())
    self.stopwords = self.stopwords | additional_stopwords
    self.top_fraction = 3  # consider the top third of candidate keywords by score

def __init__(self):
    # self.stopwords = set(nltk.corpus.stopwords.words())
    self.stopwords = getList()
    self.top_fraction = 1.5  # consider the top two-thirds of candidate keywords by score
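These constructors only set top_fraction; in the classic RAKE implementation this code follows, the extractor keeps roughly the best 1/top_fraction of the ranked phrases. A sketch of that step, with the helper name _top_phrases being hypothetical:

def _top_phrases(self, sorted_phrase_scores):
    # Hypothetical helper: keep roughly the top 1/top_fraction of the ranked
    # (phrase, score) pairs, e.g. the top quarter when top_fraction == 4.
    n_phrases = len(sorted_phrase_scores)
    return sorted_phrase_scores[0:int(n_phrases / self.top_fraction)]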
import operator
import sys
from textblob import TextBlob
from rake import RakeKeywordExtractor
from textblob.np_extractors import ConllExtractor
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import WordPunctTokenizer
from textblob.taggers import NLTKTagger
from stopwordList import getList
import codecs
## GLOBAL VARIABLES 
top_fraction = 1
LEMMA_OBJ = WordNetLemmatizer()
tokenizer = WordPunctTokenizer()
nltk_tagger = NLTKTagger()
stopwords = getList()
COLL_OBJ = ConllExtractor()	

def rake_extract(phrase_list):
    RAKE_OBJ = RakeKeywordExtractor(set([]))
    word_scores = RAKE_OBJ._calculate_word_scores(phrase_list)
    phrase_scores = RAKE_OBJ._calculate_phrase_scores(phrase_list, word_scores)
    sorted_phrase_scores = sorted(phrase_scores.items(),
                                  key=operator.itemgetter(1),
                                  reverse=True)
    n_phrases = len(sorted_phrase_scores)
    return sorted_phrase_scores[0:n_phrases]

	

# FILE = open(sys.argv[1], "r")
FILE = codecs.open(sys.argv[1], "r", "iso8859-15")
CONTENT = FILE.read()
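The snippet stops after reading the file; a plausible continuation, sketched under the assumption that candidate phrases are built with an extractNP-style chunker as in Example #1 (build_phrase_list is a hypothetical stand-in):

def build_phrase_list(text):
    # Hypothetical stand-in for the project's chunking step: split the text into
    # candidate phrases, each represented as a list of lowercase words.
    return [s.lower().split() for s in text.split(".") if s.strip()]

phrase_list = build_phrase_list(CONTENT)
for phrase, score in rake_extract(phrase_list):
    print(phrase, score)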