def completeRake(content): rakeObject = RakeKeywordExtractor(set([])) keywordList = rakeObject.extract(content,True) words = nltk.word_tokenize(content) content_lemmatized = "" words = list(map(lambda x: lemma_obj.lemmatize(x),words)) content_lemmatized = ' '.join(words) content = content_lemmatized freq = {} freq_dist = nltk.FreqDist(words) keyword_freq=getKeywordFrequency(keywordList) adjacency_freq = getAdjacencyFrequency(keywordList,content,words) sortedFreqList = sorted(freq_dist.items(), key = lambda x: x[1],reverse=True) additional_stopwords=[] for key in sortedFreqList: keyword_freq.setdefault(key[0],0) adjacency_freq.setdefault(key[0],0) if(adjacency_freq[key[0]]>keyword_freq[key[0]]): additional_stopwords.append(key[0]) additional_stopwords = set(additional_stopwords) newRakeObject = RakeKeywordExtractor(additional_stopwords) newKeywordList = newRakeObject.extract(content) for keywords in newKeywordList: print keywords
def extractKeywords(phrase_list):
    """Score candidate phrases with RAKE and return ALL of them as
    (phrase, score) pairs, highest score first.

    NOTE(review): a second def with the same name appears later in this
    file and shadows this one at import time; the later version keeps
    only the top half of the list — confirm which one is intended.
    """
    RAKE_OBJ = RakeKeywordExtractor(set([]))
    word_scores = RAKE_OBJ._calculate_word_scores(phrase_list)
    phrase_scores = RAKE_OBJ._calculate_phrase_scores(phrase_list, word_scores)
    # The original sliced [0:int(n_phrases)], a no-op over the whole
    # list; return the sorted list directly.
    return sorted(phrase_scores.iteritems(),
                  key=operator.itemgetter(1), reverse=True)
def extractKeywords(phrase_list):
    """Score candidate phrases with RAKE and return the top half of
    them as (phrase, score) pairs, ordered best-first.
    """
    extractor = RakeKeywordExtractor(set([]))
    per_word = extractor._calculate_word_scores(phrase_list)
    per_phrase = extractor._calculate_phrase_scores(phrase_list, per_word)
    ranked = sorted(per_phrase.iteritems(),
                    key=operator.itemgetter(1), reverse=True)
    # Keep only the better-scoring half (floor division, as before).
    half = len(ranked) // 2
    return ranked[:half]
def main(): # SET N AS THE NUMBER OF KEYWORDS TO EVALUATE FROM THE KEYWORDS LISTS N = 243 base_dir = os.path.dirname(os.path.realpath(__file__)) stopwords_simple = os.path.join(base_dir,'stopwords_nltk.txt') rake = RakeKeywordExtractor(stopwords_simple) tp_total = 0 fp_total = 0 fn_total = 0 corpus = os.path.join(base_dir,'./corpus') txtfiles = [file for file in os.listdir(corpus) if file.endswith('.txt')] s="" for txtfile in txtfiles: try: keyfile = os.path.join(corpus,txtfile).replace('.txt', '.key') content = open(os.path.join(corpus,txtfile), 'r').read().decode('utf-8') keywordsExtracted = set(rake.extract(content, incl_scores=False)[0:N]) keywordsExpected = set(listfromfilelines(keyfile)[0:N]) tp, fp, fn = confusionMatrix(keywordsExtracted, keywordsExpected); p, r, f1 = getF1(tp, fp, fn) tp_total += tp fp_total += fp fn_total += fn print "F1 for top " + str(N) + " keywords in " + txtfile + ":\t" + str(f1) s="" for x in keywordsExtracted: s = s + x +" " wordcloud = WordCloud().generate(s) plt.imshow(wordcloud) plt.axis('off') plt.show() raw_input(">") s="" for x in keywordsExpected: s = s + x +" " wordcloud = WordCloud().generate(s) plt.imshow(wordcloud) plt.axis('off') plt.show() raw_input(">") s=content wordcloud = WordCloud().generate(s) plt.imshow(wordcloud) plt.axis('off') plt.show() raw_input(">") except Exception,err: print Exception,err #COMMENT NEXT LINES IN FOR DEBUGGING """print "Extracted Keywords:"
def completeRake(content): rakeObject = RakeKeywordExtractor(set([])) keywordList = rakeObject.extract(content, True) words = nltk.word_tokenize(content) content_lemmatized = "" words = list(map(lambda x: lemma_obj.lemmatize(x), words)) content_lemmatized = ' '.join(words) content = content_lemmatized freq = {} freq_dist = nltk.FreqDist(words) keyword_freq = getKeywordFrequency(keywordList) adjacency_freq = getAdjacencyFrequency(keywordList, content, words) sortedFreqList = sorted(freq_dist.items(), key=lambda x: x[1], reverse=True) additional_stopwords = [] for key in sortedFreqList: keyword_freq.setdefault(key[0], 0) adjacency_freq.setdefault(key[0], 0) if (adjacency_freq[key[0]] > keyword_freq[key[0]]): additional_stopwords.append(key[0]) additional_stopwords = set(additional_stopwords) newRakeObject = RakeKeywordExtractor(additional_stopwords) newKeywordList = newRakeObject.extract(content) for keywords in newKeywordList: print keywords
def keywords(self, text, tag_table):
    """Return a space-joined string of known tags found in *text*.

    Reads candidate tag names (first CSV column) from the file at
    *tag_table*, builds a feature set from RAKE phrases (normalised,
    spaces replaced by hyphens) plus frequent words that occur inside
    any phrase, and returns the intersection of tags and features.
    """
    tags = []
    # with-block closes the tag table deterministically (the original
    # left the handle open).
    with open(tag_table, 'r') as tagtable:
        for line in tagtable.readlines():
            tags.append(line.split(',')[0])
    features = []
    wordset = self.freqwords(text)
    rake = RakeKeywordExtractor()
    rakelist = rake.test(text)
    for item in rakelist:
        features.append(self.normalise(item[0]).replace(' ', '-'))
    # Promote a frequent word to a feature when it occurs inside a phrase.
    # NOTE(review): `features` grows while being iterated, so previously
    # promoted keywords are also scanned as phrases on later outer
    # iterations — behavior preserved as-is; confirm this is intended.
    for keyword in wordset:
        for phrase in features:
            if phrase.find(keyword) != -1:
                features.append(keyword)
                break
    intersection = set(tags) & set(features)
    return ' '.join(list(intersection))
keyword_freq={} for i in range(0,NUM): keywords = keywordList[i][0].split(' ') length = len(keywords) for word in keywords: keyword_freq.setdefault(word,0) keyword_freq[word] = keyword_freq[word] + 1 return keyword_freq #f = open(sys.argv[1],'r') f = codecs.open(sys.argv[1],'r',"iso8859-15") content = f.read() content = content.encode('ascii','ignore') content = re.sub(r"[1-9][0-9]*\.?[0-9]*",'',content) rakeObject = RakeKeywordExtractor(set([])) keywordList = rakeObject.extract(content,True) words = nltk.word_tokenize(content) content_lemmatized = "" words = list(map(lambda x: lemma_obj.lemmatize(x),words)) content_lemmatized = ' '.join(words) content = content_lemmatized freq = {} freq_dist = nltk.FreqDist(words) keyword_freq=getKeywordFrequency(keywordList) adjacency_freq = getAdjacencyFrequency(keywordList,content,words) sortedFreqList = sorted(freq_dist.items(), key = lambda x: x[1],reverse=True) additional_stopwords=[] for key in sortedFreqList:
).join( Portal, Peticion.folioPeticion == Portal.folioSAC).join( Temas, Peticion.tema_id == Temas.temaId ).join( Dependencia, Peticion.dependencia_id == Dependencia.dependenciaId ) kw = {} temas = {} for pet in peticion_list: tema_pet = pet.Temas.nomTema if tema_pet in temas: temas[tema_pet] = temas[tema_pet] + 1 else: temas[tema_pet] = 1 rake = RakeKeywordExtractor() kw_tmp = rake.extract(pet.Portal.descripcion) for word in list(kw_tmp): if word not in kw: kw[word] = 1 else: kw[word] += 1 mylist = sorted(temas.items(), key=lambda x:x[1], reverse=True) for item in mylist: print('"'+item[0]+'": {"peticiones_atendidas": '+str(item[1])+', "palabras_clave": "'+tema_keyword[item[0]]+'"},')
exit(-1) working_dir = sys.argv[1] # param question_id = int(sys.argv[2]) # param max_tags = int(sys.argv[3]) # param must_delete_former_automatic_tags = int(sys.argv[4]) # param download_dir="/tmp/nltk_data" # resources download nltk.download('stopwords', download_dir) nltk.download('punkt', download_dir) nltk.download('averaged_perceptron_tagger', download_dir) nltk.download('wordnet', download_dir) nltk.download('pickle', download_dir) rake = RakeKeywordExtractor(); TAG_TYPE = "AUTOMATIC" queryTag = "SELECT ID FROM SurveyAnswerTag WHERE Type='AUTOMATIC' AND Value = '%s' " queryInsertTag = "INSERT INTO SurveyAnswerTag (Created,LastEdited, Value, Type, CreatedByID) VALUES(NOW(), NOW(), '%s', 'AUTOMATIC', 0 )" queryAnswers = "SELECT ID, Value FROM SurveyAnswer WHERE Value IS NOT NULL AND QuestionID = %d" queryInsertAnswerTag = "INSERT INTO SurveyAnswer_Tags(SurveyAnswerID, SurveyAnswerTagID) VALUES(%d, %d)" querySelectAnswerTag = "SELECT ID FROM SurveyAnswer_Tags WHERE SurveyAnswerID = %d AND SurveyAnswerTagID = %d" queryDeleteFormerTags = "DELETE FROM SurveyAnswer_Tags WHERE SurveyAnswerID = %d AND SurveyAnswerTagID IN (SELECT ID FROM SurveyAnswerTag where Type = 'AUTOMATIC')" config = DBConfig(working_dir+"/db.ini").read_db_config() cursor = None try: # Open database connection