-
Notifications
You must be signed in to change notification settings - Fork 0
/
relevant.py
executable file
·104 lines (79 loc) · 2.83 KB
/
relevant.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#!/usr/bin/env python2
import random
import re
def visible(element):
    """Return True if *element* is a user-visible text node.

    Filters out text that lives inside non-rendered tags (<style>,
    <script>, <head>, <title>, the document root) and HTML comment
    nodes, which BeautifulSoup stringifies as "<!--...-->".
    """
    if element.parent.name in ['style', 'script', '[document]', 'head',
                               'title']:
        return False
    # re.DOTALL so '.' also matches newlines: the original pattern let
    # multi-line HTML comments through as "visible" text.
    elif re.match('<!--.*-->', element.encode('utf-8'), re.DOTALL):
        return False
    return True
def siteWords(url):
    """Fetch *url*, parse it, and yield each whitespace-separated token
    of the page's visible text that is at least five word-characters or
    hyphens long (and nothing else).
    """
    from urllib import urlopen
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(urlopen(url).read(), "lxml")
    # Hoist the pattern out of the loop; same regex as before.
    word_pat = re.compile('^[\w-]{5,}$')
    for text_node in filter(visible, soup.findAll(text=True)):
        for token in text_node.split():
            if word_pat.match(token):
                yield token
def collectWords(urls):
    """Return the union of all words harvested from every page in *urls*
    (see siteWords for what counts as a word).
    """
    collected = set()
    for entry in urls:
        collected.update(siteWords(entry))
    return collected
class ApiError(Exception):
    """Raised when the search API replies with an error.

    The offending detail (e.g. an HTTP status message) is kept in
    ``value`` and echoed, repr'd, by ``str()``.
    """

    def __init__(self, value):
        self.value = value

    def __str__(self):
        return repr(self.value)
def getRelevantURLForWord(wd, api_key):
    """Search Bing for *wd* and pick one hit at random.

    Returns a ``(url, absolute_rank)`` tuple, or ``(None, None)`` when
    the search yields no results.  Raises ApiError on a non-200 reply.

    Bug fix: the original returned a bare ``None`` on an empty result
    set, which made the ``url, rank = getRelevantURLForWord(...)``
    unpacking in getRelevantURLs raise TypeError.
    """
    from bing_search_api import BingSearchAPI
    bing = BingSearchAPI(api_key)
    # '$skip': '10' requests results starting at offset 10, so the
    # reported rank is the in-page index plus that offset.
    params = {'$format': 'json', '$skip': '10'}
    result = bing.search_web(wd, payload=params)
    if result.status_code != 200:
        raise ApiError("Web search api error: {}".format(result.status_code))
    entries = result.json()['d']['results']
    if not entries:
        # No hits: return a tuple so callers can always unpack safely.
        return None, None
    rank = random.randint(0, len(entries) - 1)
    return entries[rank]['Url'], rank + 10
def getRelevantURLs(wds, n, api_key):
    """Collect up to *n* ``(url, rank)`` pairs, one per distinct host.

    For each word a single search hit is fetched via
    getRelevantURLForWord; a hit is kept only when it has a hostname
    that has not been seen before.  Stops early once *n* pairs are
    collected.
    """
    from urlparse import urlparse
    seen_hosts = set()
    found = []
    for word in wds:
        url, rank = getRelevantURLForWord(word, api_key)
        if url is None:
            continue
        host = urlparse(url).hostname
        if host is None or host in seen_hosts:
            continue
        seen_hosts.add(host)
        found.append((url, rank))
        if len(found) >= n:
            break
    return found
if (__name__ == '__main__'):
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('urls', metavar='<url>', type=str, nargs='+',
help='Entry point(s) for relevant word extraction.')
parser.add_argument('-n', '--nr', metavar='<n>', type=int, default=100,
help='Number of URLs to find.')
parser.add_argument('-k', '--apikey', metavar='<key>', type=str, nargs=1,
required=True, help='API key for search engine \
requests (currently Microsoft \
Bing Search API)')
args = parser.parse_args()
wds = collectWords(args.urls)
try:
urls = getRelevantURLs(wds, args.nr, args.apikey[0])
for url, rank in urls:
print "{} ({})".format(url, rank)
except ApiError as e:
print e.value