Beispiel #1
0
    def test_run(self):
        """Check the element types produced by ``Rake.run``.

        Called without ``with_scores`` every result must be exactly a ``str``;
        called with ``with_scores=True`` every result must unpack into a
        ``float`` followed by a ``str``.
        """
        extractor = Rake()

        # Run both variants up front so a failure in either call surfaces
        # before any type assertion is evaluated.
        plain_results = extractor.run(self.test_text)
        scored_results = extractor.run(self.test_text, with_scores=True)

        # Exact type identity on purpose: subclasses of str/float must not pass.
        for keyword in plain_results:
            self.assertIs(type(keyword), str)

        for weight, keyword in scored_results:
            self.assertIs(type(weight), float)
            self.assertIs(type(keyword), str)
Beispiel #2
0
def extractKeywordsUsingRake(subject, num_char=3, num_words=3, num_freq=2):
    """Extract RAKE keywords from the Wikipedia page for ``subject``.

    Args:
        subject: Page title handed to ``wikipedia.page``.
        num_char: Forwarded to ``Rake.run`` as its second positional argument
            (per the original comment: number of characters in a keyword).
        num_words: Forwarded as the third positional argument (per the
            original comment: up to how many words a keyword may contain).
        num_freq: Forwarded as the fourth positional argument (per the
            original comment: minimum frequency of a keyword).

    Returns:
        Whatever ``Rake.run`` returns for the page content.
    """
    # Lower-case local name so the instance is not mistaken for a class.
    rake = RAKE.Rake(RAKE.SmartStopList())
    # The wikipedia module extracts the page content for the given subject.
    extracted_text = wikipedia.page(subject).content
    return rake.run(extracted_text, num_char, num_words, num_freq)
Beispiel #3
0
try:
    desc = news['metadata']['description'].replace('<p>', '').replace(
        '</p>', '').replace('...', '').replace('\n', '').replace(
            '[&#8230;]',
            '').replace('&nbsp;', '').replace('&#8221;', '').replace(
                '&#160',
                '').replace('&#8217;',
                            '').replace('&#8220;',
                                        '').replace('&#8217;', '')
    df_ele = [
        news['published_at'][:19], news['title'], desc,
        ". ".join([news['title'], desc])
    ]
    print("-Title: %s" % df_ele[1])
    print("-Description: %s" % df_ele[2])
    title_kw = Rake.run(df_ele[1], maxWords=2)
    if len(title_kw) != 0:
        print("-Keywords from title: %s" % title_kw[0][0])
    else:
        print("-Keywords from title: []")
    description_kw = Rake.run(df_ele[2], maxWords=2)
    if len(description_kw) != 0:
        print("-Keywords from description: %s" % description_kw[0][0])
    else:
        print("-Keywords from description: []")
    title_description_kw = Rake.run(df_ele[3], maxWords=2)
    if len(title_description_kw) != 0:
        print("-Keywords from title & description: %s" %
              title_description_kw[0][0])
        kw = title_description_kw[0][0]
    else:
Beispiel #4
0
# Distinct business identifiers found in the training data.
uniqbiz = set(train.biz)

# One string per business: all of its review texts joined with single spaces,
# in the iteration order of ``uniqbiz``.
reviws = [
    " ".join(train.loc[train.biz == biz, "text"])
    for biz in uniqbiz
]

print(datetime.datetime.now())  # 1: timestamp before keyword extraction

# Built once: neither the extractor nor the reject list depends on the review.
Rake = rk.Rake(rk.SmartStopList())
invalid = ['+', '=', '/', '@', '*', '_']
MAX_KEYWORDS = 10      # keep at most this many keywords per business
MAX_KEYWORD_LEN = 15   # discard candidate keywords longer than this

# keywords[i] holds the accepted keywords for reviws[i], in the order
# ``Rake.run`` returned them.
keywords = []
for rev in reviws:
    kwap = []
    for k in Rake.run(rev):
        phrase = k[0]
        # Only single-word phrases of bounded length are accepted.
        if " " in phrase or len(phrase) > MAX_KEYWORD_LEN:
            continue
        # Skip phrases containing any rejected symbol.
        if any(symbol in phrase for symbol in invalid):
            continue
        kwap.append(phrase)
        # Stop scanning as soon as the cap is reached; the remaining
        # candidates could never be added.
        if len(kwap) == MAX_KEYWORDS:
            break
    keywords.append(kwap)