/
extractdatapypi.py
99 lines (76 loc) · 2.74 KB
/
extractdatapypi.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import pkg_resources
import re
import sys
import tarfile
import xmlrpclib
from cStringIO import StringIO
import requests
import fileinfo
def _extract_deps(content):
    """Extract dependency strings from an ``install_requires=[...]`` list.

    Scans *content* (the text of a setup script) for the first
    ``install_requires`` assignment that is followed by a bracketed list and
    returns the comma-separated entries of that list with quote characters
    and surrounding whitespace removed.

    This is a textual scan, not a Python parse: the list is considered to end
    at the first ``]``, and entries are split on every comma.

    :param content: source text of a setup script.
    :returns: list of requirement strings; empty when no list is found or
        the list has no entries.
    """
    # Raw string keeps the regex escapes intact.  Matching "anything except ]"
    # instead of ``\W`` plus ASCII alphanumerics means a requirement that
    # contains an underscore no longer defeats the whole match, and ``\s*``
    # also accepts ``install_requires = [...]``.
    found = re.search(r"install_requires\s*=\s*\[([^\]]*)\]", content)
    if found is None:
        return []

    deps = []
    for raw_item in found.group(1).split(","):
        # Drop both quote styles before deciding whether anything is left.
        item = raw_item.replace("'", "").replace('"', "").strip()
        if item:  # skips blanks produced by trailing commas or an empty list
            deps.append(item)
    return deps
def _extract_setup_content(package_file, name):
    """Return the contents of the top-most ``setup.py`` in a downloaded tar.

    Every non-directory archive member whose final path component is exactly
    ``setup.py`` is a candidate; the one with the shortest path is read.

    :param package_file: file-like object containing a tar archive.
    :param name: package name, used only in the diagnostic message.
    :returns: the contents of the chosen ``setup.py``, or ``''`` (after
        printing a diagnostic) when the archive has no such member.
    """
    tar_file = tarfile.open(fileobj=package_file)
    try:
        # Compare the basename rather than using a substring test so that
        # members such as ``setup.pyc`` or ``mysetup.py`` are not picked up.
        candidates = [
            member for member in tar_file.getmembers()
            if not member.isdir() and member.name.split('/')[-1] == 'setup.py'
        ]
        if not candidates:
            print("Too few candidates for setup.py in tar for package: %s" % (name,))
            return ''

        # Shortest path == closest to the archive root.
        setup_member = min(candidates, key=lambda member: len(member.name))
        return tar_file.extractfile(setup_member).read()
    finally:
        # Release the archive handle on every exit path.
        tar_file.close()
def get_metadata(client, name, release):
    """Collect dependency and index metadata for one release of a package.

    Asks *client* for the release's download records, downloads the tar
    archive with the shortest URL, reads its ``setup.py`` and parses the
    ``install_requires`` entries found there.

    :param client: object exposing ``release_urls(name, release)`` and
        ``release_data(name, release)`` (an XML-RPC proxy in this file).
    :param name: package name.
    :param release: release/version identifier passed through to *client*.
    :returns: ``None`` when no tar download is listed or the download does
        not answer with HTTP 200; otherwise a dict with keys ``requires``
        (list of ``(project_name, specs)`` tuples, empty if extraction or
        parsing failed), ``metadata`` (result of ``release_data`` or
        ``None`` if that call failed), ``url`` and ``size`` (length of the
        downloaded body).
    """
    # Use .get() so a record without 'filename' is skipped instead of
    # raising KeyError, and drop records whose URL is empty.
    urls = [
        d['url'] for d in client.release_urls(name, release)
        if d.get('url') and '.tar' in (d.get('filename') or '')
    ]
    if not urls:
        return None
    url = min(urls, key=len)

    req = requests.get(url)
    if req.status_code != 200:
        print("Could not download file %s. URL: %s" % (req.status_code, url))
        return None

    # Dependency extraction is best effort: any failure while reading the
    # archive or parsing a requirement is reported and yields an empty list.
    try:
        content = _extract_setup_content(StringIO(req.content), name)
        requirements = [
            pkg_resources.Requirement.parse(d)
            for d in _extract_deps(content)
            if '#' not in d  # entries carrying a comment cannot be parsed
        ]
        deps = [(dep.project_name, dep.specs) for dep in requirements]
    except Exception as inst:
        print("%s %s" % (inst, url))
        print("Unexpected error: %s" % (sys.exc_info()[0],))
        deps = []

    # Index metadata is optional as well; ``except Exception`` (rather than a
    # bare except) lets KeyboardInterrupt/SystemExit propagate.
    try:
        metadata = client.release_data(name, release)
    except Exception:
        metadata = None

    return {
        'requires': deps,
        'metadata': metadata,
        'url': url,
        'size': len(req.content),
    }
def get_package_data(name, index_url='https://pypi.org/pypi'):
    """Fetch data about a package from the index and save it via ``fileinfo``.

    Looks up the releases of *name*, takes the first one returned, merges in
    whatever ``get_metadata`` can collect for it and hands the resulting dict
    to ``fileinfo.save_data``.  Nothing is saved when the index reports no
    releases.

    :param name: package name to look up.
    :param index_url: XML-RPC endpoint of the package index.  Defaults to
        the HTTPS endpoint PyPI documents for its XML-RPC API, replacing the
        legacy plain-HTTP ``pypi.python.org`` address.
    :returns: ``None``.
    """
    client = xmlrpclib.ServerProxy(index_url)
    releases = client.package_releases(name)
    if not releases:
        return

    release = releases[0]
    package = {
        'name': name,
        'version': release,
    }

    # get_metadata returns None when nothing could be downloaded; in that
    # case only the name and version are stored.
    more_package_data = get_metadata(client, name, release)
    if more_package_data:
        package.update(more_package_data)

    fileinfo.save_data(name, package, fileinfo.PYPI_DATA_DIR)