def ckan_test_licenses(site_root, core_license_path, my_license_path, delay=10):
    """Smoke-test the license scrape -> convert -> filter pipeline.

    Fetches the license list from *site_root*, converts it with
    ``license_convert``, round-trips the result through
    ``ckan_licenses.json`` (which both proves the structure is
    JSON-serializable and leaves a debugging artifact on disk, matching
    the original behavior), then prints the result of ``my_licenses``.

    Parameters:
        site_root: base URL of the CKAN instance to query.
        core_license_path: path to the core-licenses JSON file consumed
            by ``my_licenses``.
        my_license_path: output path handed to ``my_licenses``.
        delay: crawl delay passed through to ``license_list``; defaults
            to 10, the value previously hard-coded here.
    """
    ckan_licenses = license_convert(license_list(site_root, delay))
    # Context managers replace the bare open/close pairs so the handles
    # are released even if json serialization raises.
    with open('ckan_licenses.json', 'w') as f:
        f.write(json.dumps(ckan_licenses, indent=4))
    with open('ckan_licenses.json') as f:
        ckan_licenses = json.loads(f.read())
    mylicenses = my_licenses(ckan_licenses, core_license_path, my_license_path)
    print(mylicenses)
def ckan_spider(site_root = 'http://datahub.io/',
                max_spam = default_max_spam,
                spam_digest = temp_spam_digest,
                run_state = "scrape",
                saved_state = temp_saved_state,
                datasets_path=default_datasets_path,
                my_licenses_path=default_my_licenses_path,
                formats_path=default_formats_path,
                datacatalogs_path=default_datacatalogs_path,
                datarelations_path=default_datarelations_path,
                organizations_path=default_organizations_path,
                scientists_path=default_scientists_path,
                core_licenses_path=default_core_licenses_path,
                core_formats_path=default_core_formats_path ):
    """Crawl a CKAN site and emit Django-fixture-style JSON files.

    For every package on the site this builds dict records shaped like
    Django fixtures (``{"model": ..., "pk": ..., "fields": {...}}``) for
    DataCatalog, Dataset, Format, Organization and Scientist, writes each
    collection to its ``*_path`` JSON file, and finally hands the datasets
    to ``process_relations``.

    run_state:
        'scrape'   -- start fresh; fetch the license list from the site.
        'continue' -- resume: reload all partial output files plus the
                      set of already-processed package names, and skip
                      those packages.
        NOTE(review): if run_state is any other value, ``ckan_licenses``
        is never assigned before first use below -- confirm only these
        two values are ever passed.

    On any exception (including KeyboardInterrupt) all partial results
    are checkpointed to disk so a later 'continue' run can resume.
    """
    # test robots.txt
    # test licenses read, write
    # test datasets read, write
    # datacatalogs, scientists and organizations
    # test formats
    print("running spider")
    # robots_txt() yields the crawl-delay used to pace every request.
    delay = robots_txt(site_root)
    print("crawl-delay: " + str(delay))
    if run_state == 'scrape':
        ckan_licenses = license_list(site_root, delay)
        #mylicenses = my_licenses(ckan_licenses, core_licenses_path, my_licenses_path)
    #read package list
    ckan_package_list = package_list(site_root, delay)
    # Accumulators; keyed/checked against each other throughout the loop.
    processed_packages = {}
    datacatalogs = []
    datasets = []
    formats = []
    organizations = []
    scientists = []
    if run_state == 'continue':
        # NOTE(review): these reload from the default_* module globals,
        # not from the *_path parameters -- confirm that is intended.
        datacatalogs = read_json(default_datacatalogs_path)
        datasets = read_json(default_datasets_path)
        formats = read_json(default_formats_path)
        organizations = read_json(default_organizations_path)
        scientists = read_json(default_scientists_path)
        processed_packages = read_json(temp_saved_state)
        ckan_licenses = read_json(temp_ckan_licenses)
        # Drop packages already handled in the previous run.
        ckan_package_list = list(set(ckan_package_list) - set(processed_packages))
    #for each package in package list
    #read package_show
    try:
        for package_name in ckan_package_list:
            if run_state == 'continue':
                if package_name in processed_packages:
                    print(package_name + " : skipped")
                    continue
            # Retry package_show until it returns something truthy,
            # backing off 5x the crawl delay between attempts.
            x = None
            while not x:
                print(package_name + " : processing")
                x = package_show(site_root, package_name, delay)
                if not x:
                    sleep(delay*5)
            #processed_packages[package_name] = x['revision_timestamp']
            spam_score = add_spam_score(x, spam_digest)
            if spam_score >= max_spam or x['num_resources'] < 1:
                #spam -- or no resources; either way skip the package
                print(package_name + " : spam")
                continue
            # --- licenses: record any title/url pair not seen yet ---
            license_title = ""
            license_url = ""
            if "license_title" in x:
                license_title = x["license_title"]
            if "license_url" in x:
                license_url = x["license_url"]
            # NOTE(review): membership test uses 2-element sets, which
            # collapse to 1 element when title == url -- confirm that
            # cannot cause false matches.
            if {license_title, license_url} not in [{license["title"], license["url"]} for license in ckan_licenses]:
                ckan_licenses.append({"title": license_title, "url": license_url})
            # --- organizations: dedup by fields.name ---
            if x["author"] not in [item['fields']['name'] for item in organizations]:
                a_organization = {"model": "data_connections.Organization", "pk": None, 'fields': {}}
                a_organization['fields']['name'] = x["author"]
                a_organization['fields']['url'] = ''
                organizations.append(a_organization)
            # --- scientists: parse the maintainer string into a name ---
            a_manager = []
            if x['maintainer'] and len(x['maintainer']) > 0:
                # Keep only the first maintainer: split on ',' then ' and ',
                # then split once on whitespace into (first, rest).
                temp = x['maintainer'].split(',')
                temp = temp[0].split(' and ')
                temp = temp[0].split(None, 1)
                a_scientist = {"model": "data_connections.Scientist", "pk": None, 'fields': {}}
                if len(temp) == 2:
                    a_scientist['fields']['firstname'] = temp[0]
                    a_scientist['fields']['lastname'] = temp[1]
                    a_scientist['fields']['profile_url'] = ""
                    a_manager = [temp[0], temp[1], ""]
                else:
                    # Single-token maintainer: fall back to the raw
                    # maintainer string as the lastname.
                    a_scientist['fields']['firstname'] = temp[0]
                    a_scientist['fields']['lastname'] = x['maintainer']
                    a_scientist['fields']['profile_url'] = ""
                    a_manager = [temp[0], x['maintainer'], ""]
                # added condition here: dedup scientists by (first, last)
                if [a_scientist['fields']['firstname'], a_scientist['fields']['lastname']] not in [[s['fields']['firstname'], s['fields']['lastname']] for s in scientists]:
                    scientists.append(a_scientist)
            else:
                a_manager = None
            #end scientists
            if x['num_resources'] > 1:
                #datacataglog -- multi-resource packages become a DataCatalog
                a_catalog = {"model": "data_connections.DataCatalog", "pk": None, "fields": {}}
                a_catalog['fields']['name'] = x['title']
                a_catalog['fields']['manager'] = a_manager
                a_catalog['fields']['managing_organization'] = x['author']
                #extra fields for datahub
                a_catalog['fields']['datahub_name'] = x['name']
                # presumably add_spam_score stored the score on x -- TODO confirm
                if 'spam_score' in x:
                    a_catalog['fields']['spam_score'] = x['spam_score']
                datacatalogs.append(a_catalog)
                #end datacatalogs
                #resources: one Dataset per resource in the catalog
                for a_resource in x['resources']:
                    #datasets
                    a_dataset = {"model": "data_connections.Dataset", "pk": None, "fields": {}}
                    # Resource description with the package notes appended
                    # in parentheses. (unicode() -- Python 2 API.)
                    a_dataset['fields']['description'] = unicode(a_resource['description']).strip() + ' ( ' + unicode(x['notes']).strip() + ' ) '
                    a_dataset['fields']['license'] = [x['license_title']]
                    a_dataset['fields']['date_last_edited'] = a_resource["last_modified"]
                    a_dataset['fields']['url'] = a_resource["url"]
                    # Prefer the explicit format; fall back to the mimetype.
                    if a_resource["format"] and len(a_resource["format"]) > 0:
                        a_dataset['fields']['data_format'] = [a_resource["format"]]
                    else:
                        a_dataset['fields']['data_format'] = [a_resource["mimetype"]]
                    a_dataset['fields']['date_published'] = a_resource["last_modified"]
                    a_dataset['fields']['manager'] = a_catalog['fields']['manager']
                    a_dataset['fields']['managing_organization'] = x['author']
                    a_dataset['fields']['data_catalog'] = [x['title']]
                    a_dataset['fields']['name'] = unicode(x['title']) + " " + unicode(a_resource['name']) + " " + unicode(a_resource['format'])
                    # catalog.title + name + format
                    #extra fields for datahub
                    a_dataset['fields']['datahub_name'] = x['name']
                    a_dataset['fields']['spam_score'] = spam_score
                    datasets.append(a_dataset)
                    #formats: dedup by fields.name (case-sensitive here)
                    if a_resource["format"] not in [item['fields']['name'] for item in formats]:
                        a_format = {"model": "data_connections.Format", "pk": None, 'fields': {}}
                        a_format['fields']['name'] = a_resource["format"]
                        a_format['fields']['url'] = ''
                        formats.append(a_format)
                #end if x['num_resources'] > 1
            elif x['num_resources'] == 1:
                #datasets -- single-resource package: Dataset only, no catalog
                a_dataset = {"model": "data_connections.Dataset", "pk": None, "fields": {}}
                a_resource = x['resources'][0]
                # NOTE: unlike the multi-resource branch, notes come first
                # and there are no parentheses; dates come from the package
                # metadata rather than the resource.
                a_dataset['fields']['description'] = unicode(x['notes']).strip() + unicode(a_resource['description']).strip()
                a_dataset['fields']['license'] = [x['license_title']]
                a_dataset['fields']['date_last_edited'] = x["metadata_modified"]
                a_dataset['fields']['url'] = a_resource["url"]
                if a_resource["format"] and len(a_resource["format"]) > 0:
                    a_dataset['fields']['data_format'] = [a_resource["format"]]
                else:
                    a_dataset['fields']['data_format'] = [a_resource["mimetype"]]
                a_dataset['fields']['date_published'] = x["metadata_created"]
                a_dataset['fields']['manager'] = a_manager
                a_dataset['fields']['managing_organization'] = x['author']
                a_dataset['fields']['data_catalog'] = [x['title']]
                a_dataset['fields']['name'] = unicode(x['title']) + " " + unicode(a_resource['name']) + " " + unicode(a_resource['format'])
                # catalog.title + name + format
                #extra fields for datahub
                a_dataset['fields']['datahub_name'] = x['name']
                a_dataset['fields']['spam_score'] = spam_score
                datasets.append(a_dataset)
                #end datasets
                #formats
                if a_resource["format"] not in [item['fields']['name'] for item in formats]:
                    a_format = {"model": "data_connections.Format", "pk": None, 'fields': {}}
                    a_format['fields']['name'] = a_resource["format"]
                    a_format['fields']['url'] = ''
                    formats.append(a_format)
                #end formats
            #end elif x['num_resources'] == 1
            #if appending to output json files
            # Mark this package done so 'continue' runs skip it.
            processed_packages[package_name] = x['revision_timestamp']
        #end for x in package_list : #write to datacatalogs.json
    #end try
    except (KeyboardInterrupt, Exception) as e:
        # Checkpoint everything gathered so far so a 'continue' run can
        # pick up where this one stopped.
        print("spider KeyboardInterrupt : " + str(sys.exc_info()))
        print("len(datasets) : " + str(len(datasets)))
        write_json(datacatalogs_path, datacatalogs)
        write_json(datasets_path, datasets)
        write_json(organizations_path, organizations)
        write_json(scientists_path, scientists)
        write_json(temp_ckan_licenses, ckan_licenses)
        # Keep only formats not already in the core format list, assigning
        # sequential 1-based pks.
        # NOTE(review): this dedup is case-sensitive while the success path
        # below lowercases -- confirm which is intended.
        core_formats = read_json(core_formats_path)
        core_format_names = [item['fields']['name'] for item in core_formats]
        my_formats = []
        for a_format in formats:
            if a_format['fields']['name'] not in core_format_names:
                a_format['pk'] = len(my_formats)+1
                my_formats.append(a_format)
        write_json(formats_path, my_formats)
        write_json(temp_saved_state, processed_packages)
        import traceback
        traceback.print_exc()
    #end except
    # Normal completion: derive the site-specific license file, then write
    # all collections to their configured output paths.
    mylicenses = my_licenses(license_convert(ckan_licenses), core_licenses_path, my_licenses_path)
    write_json(datacatalogs_path, datacatalogs)
    write_json(datasets_path, datasets)
    write_json(organizations_path, organizations)
    write_json(scientists_path, scientists)
    # Case-insensitive filter of formats against the core format list.
    f = open(core_formats_path)
    core_formats = json.loads(f.read())
    core_format_names = [item['fields']['name'].lower() for item in core_formats]
    f.close()
    my_formats = []
    for a_format in formats:
        if a_format['fields']['name'].lower() not in core_format_names:
            a_format['pk'] = len(my_formats)+1
            my_formats.append(a_format)
    write_json(formats_path, my_formats)
    write_json(temp_saved_state, processed_packages)
    # Finally derive dataset-to-dataset relations from the scraped datasets.
    process_relations(site_root, datasets, run_state, temp_processed_relations, datarelations_path, delay)