コード例 #1
0
ファイル: stopwords.py プロジェクト: vogetihrsh/Internship
def completeRake(content):
	"""Two-pass RAKE keyword extraction (Python 2 snippet).

	Pass 1 extracts scored keywords from the raw text. Tokens that appear
	*adjacent to* keywords more often than *inside* them are promoted to
	extra stopwords. Pass 2 re-runs RAKE with those stopwords and prints
	each resulting keyword. Returns None.
	"""
	rakeObject = RakeKeywordExtractor(set([]))
	keywordList = rakeObject.extract(content, True)
	# Lemmatize the text so the frequency counts collapse inflected forms.
	words = nltk.word_tokenize(content)
	words = list(map(lambda x: lemma_obj.lemmatize(x), words))
	content = ' '.join(words)
	freq_dist = nltk.FreqDist(words)
	keyword_freq = getKeywordFrequency(keywordList)
	adjacency_freq = getAdjacencyFrequency(keywordList, content, words)

	# Most frequent tokens first.
	sortedFreqList = sorted(freq_dist.items(), key=lambda x: x[1], reverse=True)
	additional_stopwords = []
	for key in sortedFreqList:
		word = key[0]
		keyword_freq.setdefault(word, 0)
		adjacency_freq.setdefault(word, 0)
		# Seen beside keywords more often than inside them -> stopword.
		if adjacency_freq[word] > keyword_freq[word]:
			additional_stopwords.append(word)
	newRakeObject = RakeKeywordExtractor(set(additional_stopwords))
	# print(x) is equivalent to Py2 `print x` for a single argument and
	# keeps the snippet forward-compatible with Python 3.
	for keywords in newRakeObject.extract(content):
		print(keywords)
コード例 #2
0
def extractKeywords(phrase_list):
	"""Score every candidate phrase with RAKE and return the full list,
	ordered best-score first (list of (phrase, score) pairs)."""
	extractor = RakeKeywordExtractor(set([]))
	word_scores = extractor._calculate_word_scores(phrase_list)
	phrase_scores = extractor._calculate_phrase_scores(phrase_list, word_scores)
	ranked = sorted(phrase_scores.iteritems(), key=operator.itemgetter(1), reverse=True)
	# Slice covers the whole list, so this returns a copy of every phrase.
	return ranked[0:int(len(ranked))]
コード例 #3
0
def extractKeywords(phrase_list):
	"""Score candidate phrases with RAKE and return the better-scoring
	half, ordered best-score first (list of (phrase, score) pairs)."""
	extractor = RakeKeywordExtractor(set([]))
	word_scores = extractor._calculate_word_scores(phrase_list)
	phrase_scores = extractor._calculate_phrase_scores(phrase_list, word_scores)
	ranked = sorted(phrase_scores.iteritems(), key=operator.itemgetter(1), reverse=True)
	# Keep only the top half (integer floor division, as in Python 2 `/`).
	return ranked[0:int(len(ranked) / 2)]
コード例 #4
0
def main():
    """Evaluate RAKE keyword extraction over a local corpus (Python 2).

    For every .txt file in ./corpus: extract keywords, compare the top-N
    against the matching .key gold file, accumulate a confusion matrix,
    print the per-file F1, and show three word clouds (extracted keywords,
    expected keywords, full text), pausing for input between each.
    """
    # SET N AS THE NUMBER OF KEYWORDS TO EVALUATE FROM THE KEYWORDS LISTS
    N = 243
    base_dir =  os.path.dirname(os.path.realpath(__file__))
    stopwords_simple = os.path.join(base_dir,'stopwords_nltk.txt')
    rake = RakeKeywordExtractor(stopwords_simple)
    
    # Running totals for the corpus-wide confusion matrix.
    tp_total = 0
    fp_total = 0
    fn_total = 0    
    corpus = os.path.join(base_dir,'./corpus')
    txtfiles = [file for file in os.listdir(corpus) if file.endswith('.txt')]
    s=""
    for txtfile in txtfiles:
        try:        
            # Each <name>.txt has its gold-standard keywords in <name>.key.
            keyfile = os.path.join(corpus,txtfile).replace('.txt', '.key')
            content = open(os.path.join(corpus,txtfile), 'r').read().decode('utf-8')
        
            # Compare only the top-N keywords from each side.
            keywordsExtracted = set(rake.extract(content, incl_scores=False)[0:N])
            keywordsExpected = set(listfromfilelines(keyfile)[0:N])
    
            tp, fp, fn = confusionMatrix(keywordsExtracted, keywordsExpected);
            p, r, f1 = getF1(tp, fp, fn)

            tp_total += tp
            fp_total += fp
            fn_total += fn
      
            print "F1 for top " + str(N) + " keywords in " + txtfile + ":\t" + str(f1)
            # Word cloud 1: the keywords RAKE extracted.
            s="" 
            for x in keywordsExtracted:
                s = s + x +" "             
            wordcloud = WordCloud().generate(s)
            plt.imshow(wordcloud)
            plt.axis('off')
            plt.show()
            raw_input(">")
            # Word cloud 2: the expected (gold) keywords.
            s="" 
            for x in keywordsExpected:
                s = s + x +" "             
            wordcloud = WordCloud().generate(s)
            plt.imshow(wordcloud)
            plt.axis('off')
            plt.show()
            raw_input(">")
            # Word cloud 3: the raw article text.
            s=content
            wordcloud = WordCloud().generate(s)
            plt.imshow(wordcloud)
            plt.axis('off')
            plt.show()
            raw_input(">")

        except Exception,err:
            # NOTE(review): broad catch-and-print keeps the batch running
            # when one file fails, but it also hides real errors.
            print Exception,err
           
        #COMMENT NEXT LINES IN FOR DEBUGGING
        """print "Extracted Keywords:"
コード例 #5
0
def completeRake(content):
    """Two-pass RAKE: derive extra stopwords from a first extraction pass,
    then re-run RAKE with them and print the resulting keywords."""
    first_pass = RakeKeywordExtractor(set([]))
    keywordList = first_pass.extract(content, True)

    # Lemmatize the document so frequency counts collapse inflected forms.
    words = nltk.word_tokenize(content)
    content_lemmatized = ""
    words = [lemma_obj.lemmatize(token) for token in words]
    content_lemmatized = ' '.join(words)
    content = content_lemmatized

    freq = {}  # retained from the original snippet; not read below
    freq_dist = nltk.FreqDist(words)
    keyword_freq = getKeywordFrequency(keywordList)
    adjacency_freq = getAdjacencyFrequency(keywordList, content, words)

    # Highest-frequency tokens first.
    sortedFreqList = sorted(freq_dist.items(),
                            key=lambda item: item[1],
                            reverse=True)
    additional_stopwords = []
    for token, _count in sortedFreqList:
        keyword_freq.setdefault(token, 0)
        adjacency_freq.setdefault(token, 0)
        # Seen beside keywords more often than inside them -> stopword.
        if adjacency_freq[token] > keyword_freq[token]:
            additional_stopwords.append(token)
    additional_stopwords = set(additional_stopwords)

    newRakeObject = RakeKeywordExtractor(additional_stopwords)
    newKeywordList = newRakeObject.extract(content)
    for keywords in newKeywordList:
        print(keywords)
コード例 #6
0
    def keywords(self, text, tag_table):
        """Return the space-joined intersection of the tags listed in
        *tag_table* (CSV file, tag name in the first column) and the
        features found in *text* (RAKE phrases plus frequent words that
        occur inside one of those phrases).
        """
        # Read the known tag vocabulary; first CSV column is the tag name.
        # `with` guarantees the file is closed (the original leaked it).
        tags = []
        with open(tag_table, 'r') as tagtable:
            for line in tagtable:
                tags.append(line.split(',')[0])

        # RAKE phrases, normalised and hyphen-joined, form the base features.
        rake = RakeKeywordExtractor()
        features = [self.normalise(item[0]).replace(' ', '-')
                    for item in rake.test(text)]

        # Add every frequent word contained in some RAKE phrase. Matches are
        # collected separately so `features` is not mutated while it is being
        # iterated (the original appended during iteration, letting earlier
        # appended keywords match later ones).
        wordset = self.freqwords(text)
        matched = [keyword for keyword in wordset
                   if any(phrase.find(keyword) != -1 for phrase in features)]
        features.extend(matched)

        intersection = set(tags) & set(features)
        return ' '.join(list(intersection))
コード例 #7
0
ファイル: stopwords.py プロジェクト: vogetihrsh/Python_Files
	keyword_freq={}
	for i in range(0,NUM):
		keywords = keywordList[i][0].split(' ')
		length = len(keywords)
		for word in keywords:
			keyword_freq.setdefault(word,0)
			keyword_freq[word] = keyword_freq[word] + 1
	return keyword_freq	


#f = open(sys.argv[1],'r')
# Read the input file as Latin-9 text (Python 2 script).
f = codecs.open(sys.argv[1],'r',"iso8859-15")
content = f.read()	
# Drop non-ASCII characters, then strip numbers (integers and decimals).
content = content.encode('ascii','ignore')	
content = re.sub(r"[1-9][0-9]*\.?[0-9]*",'',content)
# First RAKE pass (with scores) over the cleaned text.
rakeObject  = RakeKeywordExtractor(set([]))
keywordList = rakeObject.extract(content,True)
# Lemmatize so the frequency counts collapse inflected forms.
words = nltk.word_tokenize(content)
content_lemmatized = ""	
words  = list(map(lambda x: lemma_obj.lemmatize(x),words))
content_lemmatized = ' '.join(words)
content = content_lemmatized
freq = {}
freq_dist  = nltk.FreqDist(words)
	
keyword_freq=getKeywordFrequency(keywordList)
adjacency_freq = getAdjacencyFrequency(keywordList,content,words)

# Tokens ordered most-frequent first; the loop below (truncated in this
# snippet) presumably promotes adjacency-heavy tokens to stopwords.
sortedFreqList = sorted(freq_dist.items(), key = lambda x: x[1],reverse=True)
additional_stopwords=[]
for key in sortedFreqList:
コード例 #8
0
    ).join(
    Portal,
    Peticion.folioPeticion == Portal.folioSAC).join(
    Temas, Peticion.tema_id == Temas.temaId
    ).join(
    Dependencia,
    Peticion.dependencia_id == Dependencia.dependenciaId
    )

# keyword -> number of petition descriptions it was extracted from
kw = {}

# topic name -> number of petitions filed under that topic
temas = {}

for pet in peticion_list:
    tema_pet = pet.Temas.nomTema
    if tema_pet in temas:
        temas[tema_pet] = temas[tema_pet] + 1
    else:
        temas[tema_pet] = 1
    # NOTE(review): a fresh extractor is built on every iteration; hoisting
    # it above the loop would behave the same.
    rake = RakeKeywordExtractor()
    kw_tmp = rake.extract(pet.Portal.descripcion)
    for word in list(kw_tmp):
      if word not in kw:
          kw[word] = 1
      else:
          kw[word] += 1
# Topics ordered by how many petitions they received, descending.
mylist = sorted(temas.items(), key=lambda x:x[1], reverse=True)

for item in mylist:
    # NOTE(review): `tema_keyword` is not defined anywhere in this snippet,
    # while `kw` is built above but never read — one of the two is likely a
    # mistake; confirm against the full script.
    print('"'+item[0]+'": {"peticiones_atendidas": '+str(item[1])+', "palabras_clave": "'+tema_keyword[item[0]]+'"},')
コード例 #9
0
    exit(-1)

# Command-line parameters for the tagging run.
working_dir = sys.argv[1]  # param
question_id = int(sys.argv[2])  # param
max_tags = int(sys.argv[3])  # param
must_delete_former_automatic_tags = int(sys.argv[4])  # param

download_dir="/tmp/nltk_data"
# resources download
nltk.download('stopwords', download_dir)
nltk.download('punkt', download_dir)
nltk.download('averaged_perceptron_tagger', download_dir)
nltk.download('wordnet', download_dir)
nltk.download('pickle', download_dir)

rake = RakeKeywordExtractor();

# SQL templates for reading/writing AUTOMATIC survey-answer tags.
# NOTE(review): these are filled with %-string formatting rather than
# cursor parameters; that is only safe if the interpolated values can
# never contain quotes — prefer parameterized queries.
TAG_TYPE = "AUTOMATIC"
queryTag = "SELECT ID FROM SurveyAnswerTag WHERE Type='AUTOMATIC' AND Value = '%s' "
queryInsertTag = "INSERT INTO SurveyAnswerTag (Created,LastEdited, Value, Type, CreatedByID) VALUES(NOW(), NOW(), '%s', 'AUTOMATIC', 0 )"
queryAnswers = "SELECT ID, Value FROM SurveyAnswer WHERE Value IS NOT NULL AND QuestionID = %d"
queryInsertAnswerTag = "INSERT INTO SurveyAnswer_Tags(SurveyAnswerID, SurveyAnswerTagID) VALUES(%d, %d)"
querySelectAnswerTag = "SELECT ID FROM SurveyAnswer_Tags WHERE SurveyAnswerID = %d AND SurveyAnswerTagID = %d"
queryDeleteFormerTags = "DELETE FROM SurveyAnswer_Tags WHERE SurveyAnswerID = %d AND SurveyAnswerTagID IN (SELECT ID FROM SurveyAnswerTag where Type = 'AUTOMATIC')"

# Database settings come from <working_dir>/db.ini.
config = DBConfig(working_dir+"/db.ini").read_db_config()

cursor = None

try:
    # Open database connection