# hdx-crawler.py
import ckanapi, json, sys, time
from urllib2 import Request, urlopen, URLError, HTTPError
from itertools import islice
from openpyxl import load_workbook
import hxl
from io import BytesIO
import datetime
import hashlib
import xlrd
# Time delay in seconds between datasets, to give HDX a break.
DELAY = 2

# Base URL for the CKAN instance.
CKAN_URL = "https://data.humdata.org"

# The tag index is seeded from an index file saved under working/.
# (The disabled alternative was an empty start: indexFile = {})
with open('working/index_3350.json') as json_data:
    indexFile = json.load(json_data)
def populateIndex(uniqueTags,sampleData,i,attributes,md5,name,url,package_id):
    """Merge one parsed resource into the module-level ``indexFile``.

    For every tag in ``uniqueTags``:
      * a tag not yet in the index gets a new entry holding this sample
        name, an empty attribute counter and this ``md5``;
      * a known tag whose ``md5s`` list lacks ``md5`` marks the attributes
        for counting and, while it holds fewer than 5 samples, records this
        sample name and ``md5``.

    If any tag marked the attributes for counting, every attribute listed in
    ``attributes`` (a mapping of tag -> list of attribute names) is counted
    under that tag's entry.  If any tag recorded this sample,
    ``working/sample_<i>.json`` is written with the sample data, ``name``,
    ``url`` and ``package_id``.

    NOTE(review): indentation was lost in this copy of the file; both
    trailing steps are assumed to run once, after the tag loop -- confirm.
    """
    sample_id = 'sample_' + str(i)
    count_attributes = False
    write_sample = False
    print(sampleData)

    for tag in uniqueTags:
        if tag not in indexFile:
            indexFile[tag] = {'samples': [sample_id], 'attributes': {}, 'md5s': [md5]}
            count_attributes = True
            write_sample = True
            continue

        entry = indexFile[tag]
        if md5 in entry['md5s']:
            # This md5 is already recorded for the tag.
            continue
        count_attributes = True
        if len(entry['samples']) < 5:
            entry['samples'].append(sample_id)
            entry['md5s'].append(md5)
            write_sample = True

    if count_attributes:
        for tag, tag_attributes in attributes.items():
            for attribute in tag_attributes:
                counts = indexFile[tag]['attributes']
                counts[attribute] = counts.get(attribute, 0) + 1

    if write_sample:
        sample = {'data': sampleData, 'name': name, 'url': url, 'package_id': package_id}
        with open('working/' + sample_id + '.json', 'w') as sample_file:
            json.dump(sample, sample_file)
def processHXLData(dataset):
    """Summarise a HXL dataset for indexing.

    ``dataset`` must expose ``headers``, ``tags``, ``display_tags`` and
    ``values`` (a sequence of rows).

    Returns ``False`` when the dataset has fewer than three data rows.
    Otherwise returns the list ``[uniqueTags, sample, atts, md5]`` where:
      * ``uniqueTags`` -- the non-None tags in first-seen order, no repeats;
      * ``sample``     -- ``[headers, display_tags, row0, row1, row2]``;
      * ``atts``       -- maps the part of each display tag before the first
                          ``+`` to the list of ``+``-separated parts after it
                          (only display tags containing a ``+`` appear);
      * ``md5``        -- hex MD5 of all non-None tags concatenated in
                          column order (repeats included).
    """
    rows = dataset.values
    if len(rows) <= 2:
        return False

    display_tags = dataset.display_tags
    sample = [dataset.headers, display_tags, rows[0], rows[1], rows[2]]

    present_tags = [tag for tag in dataset.tags if tag is not None]
    uniqueTags = []
    for tag in present_tags:
        if tag not in uniqueTags:
            uniqueTags.append(tag)

    # hashlib needs bytes: encode text explicitly so this works for unicode
    # tags and on Python 3.  Byte strings are hashed as-is.
    signature = ''.join(present_tags)
    if not isinstance(signature, bytes):
        signature = signature.encode('utf-8')
    md5 = hashlib.md5(signature).hexdigest()

    atts = {}
    for display_tag in display_tags:
        if not display_tag:
            # Column without a hashtag (None or empty): nothing to split.
            continue
        parts = display_tag.split('+')
        if len(parts) > 1:
            atts.setdefault(parts[0], []).extend(parts[1:])

    return [uniqueTags, sample, atts, md5]
def readCsv(csvLocation):
    """Download a CSV resource and summarise its HXL content.

    Returns whatever ``processHXLData`` returns for the parsed data, or
    ``False`` when the download fails or any error occurs while parsing.
    """
    try:
        content = urlopen(csvLocation)
    except URLError:
        print("CSV Failed to download")
        # Stop here: there is no response object to parse.
        return False

    try:
        print("File downloaded and attempting to read HXL")
        dataset = hxl.data(content).cache()
        output = processHXLData(dataset)
        print("HXL output")
        return output
    except Exception as e:
        # Best effort: a resource that cannot be parsed is reported and skipped.
        print(e)
        return False
    finally:
        # processHXLData has already pulled the rows it needs, so the
        # connection can be released.
        content.close()
def readXlsx(fileLocation):
    """Download an XLSX resource and summarise the HXL content of its active sheet.

    Date cells are rendered as ``MM/DD/YYYY`` strings and text cells are
    reduced to ASCII (non-ASCII characters dropped) before the rows are
    handed to ``hxl.data``.

    Returns whatever ``processHXLData`` returns, or ``False`` when the
    download fails, the workbook cannot be opened, or any error occurs
    while converting or parsing the sheet.
    """
    print("Trying to download XLSX")
    try:
        response = urlopen(fileLocation)
    except URLError:
        print("XLSX Failed to download")
        # Stop here: there is no workbook to read.
        return False

    try:
        print("Reading XLSX")
        try:
            payload = response.read()
        finally:
            response.close()
        wb = load_workbook(BytesIO(payload))
    except Exception:
        print("Error reading "+ str(fileLocation))
        return False

    try:
        sheet = wb.active
        rows_iter = sheet.iter_rows(min_col=1, min_row=1, max_col=sheet.max_column, max_row=sheet.max_row)
        dataset = []
        for row in rows_iter:
            converted = []
            for cell in row:
                value = cell.value
                if isinstance(value, datetime.date):
                    value = value.strftime('%m/%d/%Y')
                elif isinstance(value, basestring):  # Python 2 text types
                    value = value.encode('ascii', 'ignore')
                converted.append(value)
            dataset.append(converted)
        dataset = hxl.data(dataset).cache()
        output = processHXLData(dataset)
        print("HXL output")
        return output
    except Exception as e:
        # Best effort: a sheet that cannot be parsed is reported and skipped.
        print(e)
        return False
def readXls(fileLocation):
    """Download an XLS resource and summarise the HXL content of its first sheet.

    Text cells are reduced to ASCII (non-ASCII characters dropped); all
    other cell values are passed through as returned by ``cell_value``.

    Returns whatever ``processHXLData`` returns, or ``False`` when the
    download fails, the workbook cannot be opened, or any error occurs
    while converting or parsing the sheet.
    """
    print("Trying to download XLS")
    try:
        response = urlopen(fileLocation)
        try:
            payload = response.read()
        finally:
            response.close()
    except URLError:
        print("XLS Failed to download")
        # Stop here: there is no workbook to read.
        return False

    try:
        print("Reading XLS")
        wb = xlrd.open_workbook(file_contents=payload)
    except Exception as e:
        print(e)
        print("Error reading "+ str(fileLocation))
        return False

    try:
        xl_sheet = wb.sheet_by_index(0)
        dataset = []
        for row in range(xl_sheet.nrows):
            converted = []
            for col in range(xl_sheet.ncols):
                value = xl_sheet.cell_value(row, col)
                if isinstance(value, basestring):  # Python 2 text types
                    value = value.encode('ascii', 'ignore')
                # NOTE(review): unlike readXlsx, date cells are not
                # reformatted here (that conversion was disabled).
                converted.append(value)
            dataset.append(converted)
        dataset = hxl.data(dataset).cache()
        output = processHXLData(dataset)
        print("HXL output")
        return output
    except Exception as e:
        # Best effort: a sheet that cannot be parsed is reported and skipped.
        print(e)
        return False
# Search CKAN for datasets tagged "hxl"
def find_hxl_datasets(start, rows):
    """Run a package search filtered to ``tags:hxl`` and return its result.

    ``start`` and ``rows`` are forwarded unchanged to the search call on
    the module-level ``ckan`` connection.
    """
    search = ckan.action.package_search
    page = search(fq="tags:hxl", start=start, rows=rows)
    return page
# Connect to HDX
ckan = ckanapi.RemoteCKAN(CKAN_URL)

result_start_pos = 1000
result_page_size = 4000

# Two searches are issued, one from offset 0 and one from offset 1000,
# and their results are concatenated.
result = find_hxl_datasets(0, result_page_size)
packages = result["results"]
result2 = find_hxl_datasets(result_start_pos, result_page_size)
packages2 = result2["results"]
allpackages = packages + packages2

# Reader to use for each resource format that gets indexed.
_readers = {"CSV": readCsv, "XLSX": readXlsx, "XLS": readXls}

# Walk every dataset ("package") and each of its resources.
# i counts resources across all packages; it names the sample and index files.
# NOTE(review): indentation was lost in this copy of the file. The nesting
# below (checkpoint write and sleep only for processed resources, counter
# advanced for every resource) is a reconstruction -- confirm.
i = 0
for package in allpackages:
    print("Package: " + format(package["title"]))
    print(package)
    # A package can hold several resources (e.g. multiple CSV files).
    for resource in package["resources"]:
        # Only resources with a counter above 3350 are processed
        # (presumably resuming from the index_3350 file loaded at startup).
        if i > 3350:
            print("---------------------")
            print(" {}".format(resource["name"].encode('ascii', 'ignore')))
            print(" {}".format(resource["url"]))
            print(resource["format"])

            reader = _readers.get(resource["format"])
            if reader is not None:
                file_data = reader(resource["url"])
                if file_data is not False:
                    populateIndex(file_data[0], file_data[1], i, file_data[2], file_data[3],
                                  resource["name"], resource["url"], resource["package_id"])

            # Save the index on every tenth counter value.
            if i % 10 == 0:
                with open('working/index_' + str(i) + '.json', 'w') as index_file:
                    json.dump(indexFile, index_file)

            time.sleep(DELAY)  # give HDX a short rest
        i += 1
        print(i)