-
Notifications
You must be signed in to change notification settings - Fork 0
/
keywordr.py
29 lines (25 loc) · 830 Bytes
/
keywordr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
#! /usr/bin/env python3
import sys
from urllib.request import urlopen
from lxml import html
from rake_nltk import Rake
def get_phrases(text='', limit=5):
    """Return the top-ranked RAKE key phrases found in *text*.

    Args:
        text: The text to analyse. Either a single string or an iterable of
            string fragments, which are concatenated without a separator
            before extraction.
        limit: Maximum number of phrases to return. Defaults to 5; pass
            ``None`` to return every ranked phrase.

    Returns:
        A list of at most *limit* phrases, in the order produced by
        ``Rake.get_ranked_phrases()``. Fewer are returned when the text
        yields fewer phrases.

    Raises:
        ValueError: If *limit* is negative.
    """
    if limit is not None and limit < 0:
        raise ValueError('limit must be non-negative or None')
    rake = Rake()
    rake.extract_keywords_from_text(''.join(text))
    # Slicing already copes with lists shorter than the limit, so no explicit
    # length check is needed.
    return rake.get_ranked_phrases()[:limit]
# Download the page whose URL is the first command-line argument. The
# context manager closes the HTTP response once the body has been read.
with urlopen(sys.argv[1]) as response:
    body = response.read()
tree = html.fromstring(body)

# Optional <head> metadata. Each value keeps its '' default when the matching
# element is absent, and the lookups are independent of one another, so a
# missing <title> does not stop the description or keywords from being read.
title = ''
content = ''
keywords = ''
title_nodes = tree.xpath('/html/head/title/text()')
if title_nodes:
    title = title_nodes[0]
description_nodes = tree.xpath('/html/head/meta[@name="description"]')
if description_nodes:
    content = description_nodes[0].get('content')
keyword_nodes = tree.xpath('/html/head/meta[@name="keywords"]')
if keyword_nodes:
    keywords = keyword_nodes[0].get('content')

# Gather the direct text of paragraphs, code blocks and list items, dropping
# fragments that are empty or consist only of a stray '.' or ','.
fragments = (
    node.strip()
    for node in tree.xpath('//p/text() | //code/text() | //li/text()')
)
# Join with a space so the last word of one fragment is not fused with the
# first word of the next, which would create bogus tokens for RAKE.
text = ' '.join(
    fragment for fragment in fragments if fragment not in ('', '.', ',')
)
print(get_phrases(text))