-
Notifications
You must be signed in to change notification settings - Fork 0
/
linkcollector.py
executable file
·97 lines (69 loc) · 2.64 KB
/
linkcollector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/usr/bin/env python
"""
Collects all installable looking links from PyPI
"""
import itertools
import json
import traceback
import urlparse
import xmlrpclib
import gevent
import gevent.queue
import lxml.html
import redis
import requests
from pkg_resources import safe_name
from setuptools.package_index import distros_for_url
# Number of concurrent gevent greenlets draining the work queue.
WORKERS = 100
# NOTE(review): this rebinds the imported ``redis`` module name to a client
# instance; everything below uses the client, not the module.
redis = redis.StrictRedis()
# One shared HTTP session for all workers.  TLS certificate verification is
# disabled -- presumably because many package hosts have broken certs; confirm
# this is intentional before reuse.
session = requests.session()
session.verify = False
def installable(project, url):
    """Return True if *url* looks like an installable distribution of *project*.

    Uses setuptools' ``distros_for_url`` to extract candidate distributions
    from the URL and compares each candidate's normalized name (``safe_name``
    + lowercase) against the normalized *project* name.
    """
    normalized = safe_name(project).lower()
    # any() short-circuits on the first match instead of materializing a
    # throwaway list just to test it for emptiness.
    return any(
        safe_name(dist.project_name).lower() == normalized
        for dist in distros_for_url(url)
    )
def process_links(project, url, spider):
    """Fetch *url* and harvest links relevant to *project*.

    When *spider* is true, rel="download"/rel="homepage" links that are not
    themselves installable are pushed back onto the Redis "queue" list for a
    later, non-spidering visit.  Every ``<a>`` whose href looks installable
    for *project* is recorded on the Redis "results" list.  The
    (project, url, spider) triple is added to the "seen" set so each URL is
    processed at most once per mode.
    """
    if redis.sismember("seen", json.dumps([project, url, spider])):
        print "Skipping %s; it has already been processed (For %s)" % (url.encode("utf-8"), project.encode("utf-8"))
    else:
        print "Processing %s for urls (For %s)" % (url.encode("utf-8"), project.encode("utf-8"))
        # Any HTTP error (or the 15s timeout) raises here; the caller
        # (worker) catches and logs it.
        resp = session.get(url, timeout=15)
        resp.raise_for_status()
        html = lxml.html.document_fromstring(resp.content)
        if spider:
            # Spider pass: enqueue external download/homepage pages for a
            # second, non-spidering visit (note the False flag below).
            for link in itertools.chain(html.find_rel_links("download"), html.find_rel_links("homepage")):
                try:
                    # Resolve relative hrefs against the page URL.
                    link.make_links_absolute(url)
                except ValueError:
                    # lxml raises ValueError on unresolvable links; skip them.
                    continue
                if "href" in link.attrib and not installable(project, link.attrib["href"]):
                    parsed = urlparse.urlparse(link.attrib["href"])
                    # Only follow plain web links -- no ftp/mailto/etc.
                    if parsed.scheme.lower() in ["http", "https"]:
                        redis.rpush("queue", json.dumps([project, link.attrib["href"], False]))
        # Process all links in html for installable items
        for link in html.xpath("//a"):
            try:
                link.make_links_absolute(url)
            except ValueError:
                continue
            if "href" in link.attrib and installable(project, link.attrib["href"]):
                redis.rpush("results", json.dumps([project, url, link.attrib["href"]]))
        # Mark as seen only after successful processing; an exception above
        # leaves the triple unmarked so a later run can retry it.
        redis.sadd("seen", json.dumps([project, url, spider]))
def worker():
    """Drain the shared Redis "queue" list until it is empty.

    Each queue entry is a JSON-encoded [project, url, spider] triple that is
    unpacked into ``process_links``.  Failures are logged and the worker
    moves on to the next item rather than dying.
    """
    while True:
        payload = redis.lpop("queue")
        if payload is None:
            # Queue exhausted -- let this greenlet finish.
            return
        try:
            args = json.loads(payload)
            process_links(*args)
        except Exception:
            # Best-effort crawl: report the failure and keep consuming.
            traceback.print_exc()
def main():
    """Seed the Redis queue with every PyPI project's /simple/ page, then
    run WORKERS concurrent greenlets to process it."""
    # Grab a list of projects from PyPI
    client = xmlrpclib.Server("http://pypi.python.org/pypi")
    # Add some urls to our queue
    for name in client.list_packages():
        simple_url = "https://pypi.python.org/simple/" + name + "/"
        # Third element True means "spider this page for further links".
        redis.rpush("queue", json.dumps([name, simple_url, True]))
    gevent.joinall([gevent.spawn(worker) for _ in xrange(WORKERS)])
if __name__ == "__main__":
main()