def process_relations(site_root, datasets, run_state="scrape",
                      processed_relations_path=temp_processed_relations,
                      datarelations_path=default_datarelations_path, delay=10):
    print("data relation")
    datarelations = []
    processed = set()
    if run_state == 'continue':
        datarelations = read_json(datarelations_path)
        # JSON stores lists, not sets, so rebuild the set on load
        processed = set(read_json(processed_relations_path))
    ckan_package_names = {x['fields']['datahub_name'] for x in datasets}
    try:
        for x in ckan_package_names:
            print(x)
            if x not in processed:
                related = related_list(site_root, x, delay)
                for d in datasets:
                    if d['fields']['datahub_name'] == x:
                        for a_source in related:
                            print("  " + x + " : " + a_source['title'])
                            # relation record shape:
                            #   how_data_was_processed : ""
                            #   source                 : [this.url, this.title]
                            #   derivative             : [dataset.url, dataset.name]
                            #   datahub_name           : dataset.datahub_name
                            a_relation = {"model": "data_connections.Scientist", "pk": None, 'fields': {}}
                            a_relation['fields']["how_data_was_processed"] = ""
                            a_relation['fields']["source"] = [a_source['url'], a_source['title']]
                            a_relation['fields']["derivative"] = [d['fields']['url'], d['fields']['name']]
                            a_relation['fields']["datahub_name"] = d['fields']['datahub_name']
                            datarelations.append(a_relation)
                        #end for a_source in related
                    #end for d in datasets
                processed.add(x)
        #end for x in ckan_package_names
        write_json(datarelations_path, datarelations)
        write_json(processed_relations_path, list(processed))
        print("len(datarelations) : " + str(len(datarelations)))
    # KeyboardInterrupt is not a subclass of Exception, so both are listed
    except (KeyboardInterrupt, Exception):
        # save partial progress so a 'continue' run can pick up where we stopped
        print("relations interrupted : " + str(sys.exc_info()))
        print("len(datarelations) : " + str(len(datarelations)))
        write_json(datarelations_path, datarelations)
        write_json(processed_relations_path, list(processed))
        import traceback
        traceback.print_exc()
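
# Standalone usage sketch (hypothetical invocation; read_json and the
# default_* paths are assumed to be defined earlier in this module):
#
#   datasets = read_json(default_datasets_path)
#   process_relations('http://datahub.io/', datasets, run_state='continue')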
def ckan_spider(site_root='http://datahub.io/', max_spam=default_max_spam,
                spam_digest=temp_spam_digest, run_state="scrape",
                saved_state=temp_saved_state,
                datasets_path=default_datasets_path,
                my_licenses_path=default_my_licenses_path,
                formats_path=default_formats_path,
                datacatalogs_path=default_datacatalogs_path,
                datarelations_path=default_datarelations_path,
                organizations_path=default_organizations_path,
                scientists_path=default_scientists_path,
                core_licenses_path=default_core_licenses_path,
                core_formats_path=default_core_formats_path):
    # test robots.txt
    # test licenses read, write
    # test datasets read, write
    # datacatalogs, scientists and organizations
    # test formats
    print("running spider")
    delay = robots_txt(site_root)
    print("crawl-delay: " + str(delay))
    if run_state == 'scrape':
        ckan_licenses = license_list(site_root, delay)
        #mylicenses = my_licenses(ckan_licenses, core_licenses_path, my_licenses_path)
    # read the package list
    ckan_package_list = package_list(site_root, delay)
    processed_packages = {}
    datacatalogs = []
    datasets = []
    formats = []
    organizations = []
    scientists = []
    if run_state == 'continue':
        # reload everything written by the previous run and drop finished packages
        datacatalogs = read_json(default_datacatalogs_path)
        datasets = read_json(default_datasets_path)
        formats = read_json(default_formats_path)
        organizations = read_json(default_organizations_path)
        scientists = read_json(default_scientists_path)
        processed_packages = read_json(temp_saved_state)
        ckan_licenses = read_json(temp_ckan_licenses)
        ckan_package_list = list(set(ckan_package_list) - set(processed_packages))
    # for each package in the package list, read package_show
    try:
        for package_name in ckan_package_list:
            if run_state == 'continue' and package_name in processed_packages:
                print(package_name + " : skipped")
                continue
            x = None
            while not x:
                print(package_name + " : processing")
                x = package_show(site_root, package_name, delay)
                if not x:
                    sleep(delay * 5)  # back off and retry on a failed fetch
            #processed_packages[package_name] = x['revision_timestamp']
            spam_score = add_spam_score(x, spam_digest)
            if spam_score >= max_spam or x['num_resources'] < 1:
                # spam, or nothing to download
                print(package_name + " : spam")
                continue
            license_title = ""
            license_url = ""
            if "license_title" in x:
                license_title = x["license_title"]
            if "license_url" in x:
                license_url = x["license_url"]
            # compare as ordered pairs so a title and a url cannot match each other
            if (license_title, license_url) not in [(lic["title"], lic["url"]) for lic in ckan_licenses]:
                ckan_licenses.append({"title": license_title, "url": license_url})
            if x["author"] not in [item['fields']['name'] for item in organizations]:
                a_organization = {"model": "data_connections.Organization", "pk": None, 'fields': {}}
                a_organization['fields']['name'] = x["author"]
                a_organization['fields']['url'] = ''
                organizations.append(a_organization)
            a_manager = []
            if x['maintainer'] and x['maintainer'].strip():
                # take the first maintainer and split it into first/last name
                temp = x['maintainer'].split(',')
                temp = temp[0].split(' and ')
                temp = temp[0].split(None, 1)
                a_scientist = {"model": "data_connections.Scientist", "pk": None, 'fields': {}}
                if len(temp) == 2:
                    a_scientist['fields']['firstname'] = temp[0]
                    a_scientist['fields']['lastname'] = temp[1]
                    a_scientist['fields']['profile_url'] = ""
                    a_manager = [temp[0], temp[1], ""]
                else:
                    a_scientist['fields']['firstname'] = temp[0]
                    a_scientist['fields']['lastname'] = x['maintainer']
                    a_scientist['fields']['profile_url'] = ""
                    a_manager = [temp[0], x['maintainer'], ""]
                # only record scientists we have not seen before
                if [a_scientist['fields']['firstname'], a_scientist['fields']['lastname']] not in \
                        [[s['fields']['firstname'], s['fields']['lastname']] for s in scientists]:
                    scientists.append(a_scientist)
            else:
                a_manager = None
            #end scientists
            if x['num_resources'] > 1:
                # several resources: wrap them in a datacatalog
                a_catalog = {"model": "data_connections.DataCatalog", "pk": None, "fields": {}}
                a_catalog['fields']['name'] = x['title']
                a_catalog['fields']['manager'] = a_manager
                a_catalog['fields']['managing_organization'] = x['author']
                # extra fields for datahub
                a_catalog['fields']['datahub_name'] = x['name']
                if 'spam_score' in x:
                    a_catalog['fields']['spam_score'] = x['spam_score']
                datacatalogs.append(a_catalog)
                #end datacatalogs
                # resources
                for a_resource in x['resources']:
                    # datasets
                    a_dataset = {"model": "data_connections.Dataset", "pk": None, "fields": {}}
                    a_dataset['fields']['description'] = unicode(a_resource['description']).strip() + ' ( ' + unicode(x['notes']).strip() + ' ) '
                    a_dataset['fields']['license'] = [x['license_title']]
                    a_dataset['fields']['date_last_edited'] = a_resource["last_modified"]
                    a_dataset['fields']['url'] = a_resource["url"]
                    if a_resource["format"] and len(a_resource["format"]) > 0:
                        a_dataset['fields']['data_format'] = [a_resource["format"]]
                    else:
                        a_dataset['fields']['data_format'] = [a_resource["mimetype"]]
                    a_dataset['fields']['date_published'] = a_resource["last_modified"]
                    a_dataset['fields']['manager'] = a_catalog['fields']['manager']
                    a_dataset['fields']['managing_organization'] = x['author']
                    a_dataset['fields']['data_catalog'] = [x['title']]
                    # catalog.title + name + format
                    a_dataset['fields']['name'] = unicode(x['title']) + " " + unicode(a_resource['name']) + " " + unicode(a_resource['format'])
                    # extra fields for datahub
                    a_dataset['fields']['datahub_name'] = x['name']
                    a_dataset['fields']['spam_score'] = spam_score
                    datasets.append(a_dataset)
                    # formats
                    if a_resource["format"] not in [item['fields']['name'] for item in formats]:
                        a_format = {"model": "data_connections.Format", "pk": None, 'fields': {}}
                        a_format['fields']['name'] = a_resource["format"]
                        a_format['fields']['url'] = ''
                        formats.append(a_format)
            #end if x['num_resources'] > 1
            elif x['num_resources'] == 1:
                # a lone resource becomes a dataset without a catalog wrapper
                a_dataset = {"model": "data_connections.Dataset", "pk": None, "fields": {}}
                a_resource = x['resources'][0]
                a_dataset['fields']['description'] = unicode(x['notes']).strip() + unicode(a_resource['description']).strip()
                a_dataset['fields']['license'] = [x['license_title']]
                a_dataset['fields']['date_last_edited'] = x["metadata_modified"]
                a_dataset['fields']['url'] = a_resource["url"]
                if a_resource["format"] and len(a_resource["format"]) > 0:
                    a_dataset['fields']['data_format'] = [a_resource["format"]]
                else:
                    a_dataset['fields']['data_format'] = [a_resource["mimetype"]]
                a_dataset['fields']['date_published'] = x["metadata_created"]
                a_dataset['fields']['manager'] = a_manager
                a_dataset['fields']['managing_organization'] = x['author']
                a_dataset['fields']['data_catalog'] = [x['title']]
                # catalog.title + name + format
                a_dataset['fields']['name'] = unicode(x['title']) + " " + unicode(a_resource['name']) + " " + unicode(a_resource['format'])
                # extra fields for datahub
                a_dataset['fields']['datahub_name'] = x['name']
                a_dataset['fields']['spam_score'] = spam_score
                datasets.append(a_dataset)
                #end datasets
                # formats
                if a_resource["format"] not in [item['fields']['name'] for item in formats]:
                    a_format = {"model": "data_connections.Format", "pk": None, 'fields': {}}
                    a_format['fields']['name'] = a_resource["format"]
                    a_format['fields']['url'] = ''
                    formats.append(a_format)
                #end formats
            #end elif x['num_resources'] == 1
            # mark this package as done so a 'continue' run can skip it
            processed_packages[package_name] = x['revision_timestamp']
        #end for package_name in ckan_package_list
    #end try
    except (KeyboardInterrupt, Exception):
        # dump partial state so the crawl can be resumed with run_state='continue'
        print("spider interrupted : " + str(sys.exc_info()))
        print("len(datasets) : " + str(len(datasets)))
        write_json(datacatalogs_path, datacatalogs)
        write_json(datasets_path, datasets)
        write_json(organizations_path, organizations)
        write_json(scientists_path, scientists)
        write_json(temp_ckan_licenses, ckan_licenses)
        core_formats = read_json(core_formats_path)
        core_format_names = [item['fields']['name'].lower() for item in core_formats]
        my_formats = []
        for a_format in formats:
            if a_format['fields']['name'].lower() not in core_format_names:
                a_format['pk'] = len(my_formats) + 1
                my_formats.append(a_format)
        write_json(formats_path, my_formats)
        write_json(temp_saved_state, processed_packages)
        import traceback
        traceback.print_exc()
    #end except
    mylicenses = my_licenses(license_convert(ckan_licenses), core_licenses_path, my_licenses_path)
    write_json(datacatalogs_path, datacatalogs)
    write_json(datasets_path, datasets)
    write_json(organizations_path, organizations)
    write_json(scientists_path, scientists)
    core_formats = read_json(core_formats_path)
    core_format_names = [item['fields']['name'].lower() for item in core_formats]
    # keep only formats that are not already in the core fixture, renumbering pks
    my_formats = []
    for a_format in formats:
        if a_format['fields']['name'].lower() not in core_format_names:
            a_format['pk'] = len(my_formats) + 1
            my_formats.append(a_format)
    write_json(formats_path, my_formats)
    write_json(temp_saved_state, processed_packages)
    process_relations(site_root, datasets, run_state, temp_processed_relations, datarelations_path, delay)
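
# Note: the records built above follow Django's serialization layout
# ({"model": ..., "pk": ..., "fields": {...}}), so the emitted JSON files can
# typically be loaded as Django fixtures, e.g.:
#
#   python manage.py loaddata formats.json datacatalogs.json datasets.json
#
# (file names are illustrative; the real paths come from the *_path defaults)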
    if debug_item == 'robots':
        ckan_test_robots(site_root)
    if debug_item == 'licenses':
        ckan_test_licenses(site_root, core_licenses_path, my_licenses_path)
    if debug_item == 'package_list':
        ckan_test_package_list(site_root, 10)
    if debug_item == 'package_read':
        ckan_test_package_read(site_root, temp_ckan_package_list, temp_ckan_datasets, 10)
    if debug_item == 'spam':
        ckan_test_spam_score(temp_ckan_datasets, temp_scored_datasets, spam_digest)
    if debug_item == 'convert':
        #ckan_test_convert_package('ckan_datasets.json')
        ckan_test_convert_package(temp_scored_datasets)
    if debug_item == 'relations' or debug_item == 'relations_scrape':
        datasets = read_json(datasets_path)
        process_relations(site_root, datasets, "scrape", temp_processed_relations, datarelations_path, 10)
    if debug_item == 'relations_continue':
        datasets = read_json(datasets_path)
        process_relations(site_root, datasets, "continue", temp_processed_relations, datarelations_path, 10)
elif sys.argv[1] == 'scrape' or sys.argv[1] == 'continue':
    #print("spider " + sys.argv[1])
    ckan_spider(site_root, max_spam, spam_digest, sys.argv[1], saved_state,
                datasets_path, my_licenses_path, formats_path, datacatalogs_path,
                datarelations_path, organizations_path, scientists_path,
                core_licenses_path, core_formats_path)
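
# Invocation sketch (assuming the sys.argv handling above this section):
#
#   python ckan_spider.py scrape      # full crawl from scratch
#   python ckan_spider.py continue    # resume from temp_saved_state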