def test_run(self):
    """Check the element types produced by ``Rake.run``.

    Without scores, every returned item must be a string. With
    ``with_scores=True``, every returned item must unpack into a
    ``(score, phrase)`` pair made of a float and a string.
    """
    r = Rake()
    phrases = r.run(self.test_text)
    scored_phrases = r.run(self.test_text, with_scores=True)

    # assertIsInstance reports the offending value and its type on failure,
    # unlike comparing type objects with assertEqual.
    for phrase in phrases:
        self.assertIsInstance(phrase, str)

    for score, phrase in scored_phrases:
        self.assertIsInstance(score, float)
        self.assertIsInstance(phrase, str)
def extractKeywordsUsingRake(subject, num_char=3, num_words=3, num_freq=2):
    """Run RAKE keyword extraction over the Wikipedia page for *subject*.

    Args:
        subject: Page title handed to ``wikipedia.page`` to fetch the article.
        num_char: Number of characters in a keyword; passed to ``Rake.run``
            as its second positional argument.
        num_words: Up to how many words a keyword may contain; passed to
            ``Rake.run`` as its third positional argument.
        num_freq: Minimum frequency of a keyword; passed to ``Rake.run`` as
            its fourth positional argument.

    Returns:
        The result of ``Rake.run`` on the page content, unmodified.
    """
    rake = RAKE.Rake(RAKE.SmartStopList())

    # The wikipedia module extracts the page content for the given subject.
    extracted_text = wikipedia.page(subject).content

    return rake.run(extracted_text, num_char, num_words, num_freq)
try: desc = news['metadata']['description'].replace('<p>', '').replace( '</p>', '').replace('...', '').replace('\n', '').replace( '[…]', '').replace(' ', '').replace('”', '').replace( ' ', '').replace('’', '').replace('“', '').replace('’', '') df_ele = [ news['published_at'][:19], news['title'], desc, ". ".join([news['title'], desc]) ] print("-Title: %s" % df_ele[1]) print("-Description: %s" % df_ele[2]) title_kw = Rake.run(df_ele[1], maxWords=2) if len(title_kw) != 0: print("-Keywords from title: %s" % title_kw[0][0]) else: print("-Keywords from title: []") description_kw = Rake.run(df_ele[2], maxWords=2) if len(description_kw) != 0: print("-Keywords from description: %s" % description_kw[0][0]) else: print("-Keywords from description: []") title_description_kw = Rake.run(df_ele[3], maxWords=2) if len(title_description_kw) != 0: print("-Keywords from title & description: %s" % title_description_kw[0][0]) kw = title_description_kw[0][0] else:
# Distinct business identifiers present in the training frame.
uniqbiz = set(train.biz)

# One string per business: all of its review texts joined with single spaces.
# NOTE(review): entries follow the iteration order of ``uniqbiz`` (a set), so
# ``reviws`` / ``keywords`` are only positionally aligned with that order.
reviws = [" ".join(train.loc[train.biz == biz, "text"]) for biz in uniqbiz]

print(datetime.datetime.now())  # 1

# The extractor and the rejected-character list do not change between
# reviews, so build them once instead of on every iteration.
Rake = rk.Rake(rk.SmartStopList())
invalid = ['+', '=', '/', '@', '*', '_']

# For each business keep at most 10 single-word keywords of 15 characters or
# fewer that contain none of the characters in ``invalid``.
keywords = []
for rev in reviws:
    kwap = []
    for k in Rake.run(rev):
        if len(kwap) >= 10:
            # Cap reached; remaining candidates could never be accepted.
            break
        phrase = k[0]
        if " " in phrase or len(phrase) > 15:
            continue
        if any(ch in phrase for ch in invalid):
            continue
        kwap.append(phrase)
    keywords.append(kwap)