Beispiel #1
0
def _ckan_fixture(model):
    """Return an empty ``{"model", "pk", "fields"}`` record for ``model``."""
    return {"model": model, "pk": None, "fields": {}}


def _ckan_format_key(name):
    """Return the case-insensitive comparison key for a format name (None-safe)."""
    return (name or '').lower()


def _ckan_parse_maintainer(maintainer):
    """Turn a package ``maintainer`` string into a scientist record and manager key.

    Only the first entry of a comma- or " and "-separated list is used.  Its
    first whitespace-separated token becomes the first name and the rest the
    last name; when there is only one token the full maintainer string is
    stored as the last name.

    Returns a ``(scientist, manager)`` tuple.  Both are ``None`` when the
    maintainer is empty or yields no name token at all.
    """
    if not maintainer:
        return None, None
    first_person = maintainer.split(',')[0].split(' and ')[0]
    name_parts = first_person.split(None, 1)
    if not name_parts:
        # e.g. "   " or ", Someone": nothing usable before the first separator
        return None, None
    firstname = name_parts[0]
    lastname = name_parts[1] if len(name_parts) == 2 else maintainer
    scientist = _ckan_fixture("data_connections.Scientist")
    scientist['fields']['firstname'] = firstname
    scientist['fields']['lastname'] = lastname
    scientist['fields']['profile_url'] = ""
    return scientist, [firstname, lastname, ""]


def _ckan_dataset_entry(package, a_resource, license_title, manager, spam_score, in_catalog):
    """Build one ``data_connections.Dataset`` record for a package resource.

    ``in_catalog`` selects the multi-resource layout (description wraps the
    package notes in parentheses, dates come from the resource) versus the
    single-resource layout (notes + description, dates come from the package).
    """
    a_dataset = _ckan_fixture("data_connections.Dataset")
    fields = a_dataset['fields']
    resource_description = unicode(a_resource['description']).strip()
    notes = unicode(package['notes']).strip()
    if in_catalog:
        fields['description'] = resource_description + ' ( ' + notes + ' ) '
        fields['date_last_edited'] = a_resource["last_modified"]
        fields['date_published'] = a_resource["last_modified"]
    else:
        fields['description'] = notes + resource_description
        fields['date_last_edited'] = package["metadata_modified"]
        fields['date_published'] = package["metadata_created"]
    fields['license'] = [license_title]
    fields['url'] = a_resource["url"]
    # fall back to the mimetype when the resource declares no format
    if a_resource["format"]:
        fields['data_format'] = [a_resource["format"]]
    else:
        fields['data_format'] = [a_resource["mimetype"]]
    fields['manager'] = manager
    fields['managing_organization'] = package['author']
    fields['data_catalog'] = [package['title']]
    # catalog.title + name + format
    fields['name'] = (unicode(package['title']) + " " + unicode(a_resource['name'])
                      + " " + unicode(a_resource['format']))
    # extra fields for datahub
    fields['datahub_name'] = package['name']
    fields['spam_score'] = spam_score
    return a_dataset


def ckan_spider(site_root = 'http://datahub.io/', max_spam = default_max_spam, spam_digest = temp_spam_digest,
                run_state = "scrape", saved_state = temp_saved_state,
                datasets_path=default_datasets_path, my_licenses_path=default_my_licenses_path, formats_path=default_formats_path,
                datacatalogs_path=default_datacatalogs_path, datarelations_path=default_datarelations_path,
                organizations_path=default_organizations_path, scientists_path=default_scientists_path,
                core_licenses_path=default_core_licenses_path, core_formats_path=default_core_formats_path ):
    """Crawl the packages of a CKAN site and write them out as JSON records.

    For every package returned by ``package_list`` the package is fetched with
    ``package_show``, scored with ``add_spam_score`` and, unless it is spam or
    has no resources, converted into organization, scientist, data catalog,
    dataset and format records.  Packages with more than one resource produce
    a data catalog plus one dataset per resource; packages with exactly one
    resource produce a single dataset.

    Parameters:
        site_root: base URL of the CKAN site.
        max_spam: packages whose spam score is >= this value are skipped.
        spam_digest: passed through to ``add_spam_score``.
        run_state: ``'scrape'`` fetches the license list from the site;
            ``'continue'`` reloads the previously written output files and the
            saved state and only processes packages not yet recorded there.
            The value is also forwarded to ``process_relations``.
        saved_state: path of the JSON file mapping processed package names to
            their ``revision_timestamp``.
        *_path: output files for the respective record lists;
            ``core_licenses_path`` / ``core_formats_path`` are read to filter
            out entries that already exist in the core lists.

    If the crawl loop raises (including ``KeyboardInterrupt``) the error is
    printed, everything gathered so far is saved, and the function then still
    runs the final license/format/relations steps.
    """
    # test robots.txt
    # test licenses read, write
    # test datasets read, write
    # datacatalogs, scientists and organizations
    # test formats
    print("running spider")
    delay = robots_txt(site_root)
    print("crawl-delay: " + str(delay))

    # always bound, so an unexpected run_state cannot cause a NameError later
    ckan_licenses = []
    if run_state == 'scrape' :
        ckan_licenses = license_list(site_root,delay)

    #read package list
    ckan_package_list = package_list(site_root, delay)
    processed_packages = {}
    datacatalogs = []
    datasets = []
    formats = []
    organizations = []
    scientists = []

    if run_state == 'continue' :
        # resume from the same files this run will write to
        datacatalogs = read_json(datacatalogs_path)
        datasets = read_json(datasets_path)
        formats = read_json(formats_path)
        organizations = read_json(organizations_path)
        scientists = read_json(scientists_path)
        processed_packages = read_json(saved_state)
        ckan_licenses = read_json(temp_ckan_licenses)
        # already-processed packages are dropped here, so the loop below
        # needs no per-package "skipped" check
        ckan_package_list = list(set(ckan_package_list) - set(processed_packages))

    def save_progress():
        """Write every collected list, the license cache and the saved state."""
        write_json(datacatalogs_path, datacatalogs)
        write_json(datasets_path, datasets)
        write_json(organizations_path, organizations)
        write_json(scientists_path, scientists)
        # 'continue' reads this file, so keep it current on every save
        write_json(temp_ckan_licenses, ckan_licenses)
        core_format_names = set(_ckan_format_key(item['fields']['name'])
                                for item in read_json(core_formats_path))
        my_formats = []
        for a_format in formats :
            # only formats missing from the core list are written, numbered from 1
            if _ckan_format_key(a_format['fields']['name']) not in core_format_names :
                a_format['pk'] = len(my_formats)+1
                my_formats.append(a_format)
        write_json(formats_path, my_formats)
        write_json(saved_state, processed_packages)

    #for each package in package list
    #read package_show
    try:
        # lookup sets mirroring the output lists, so duplicate checks do not
        # rescan the lists for every package
        known_licenses = set(frozenset((lic["title"], lic["url"])) for lic in ckan_licenses)
        known_organizations = set(item['fields']['name'] for item in organizations)
        known_scientists = set((s['fields']['firstname'], s['fields']['lastname']) for s in scientists)
        known_formats = set(item['fields']['name'] for item in formats)

        for package_name in ckan_package_list :
            x = None
            # NOTE(review): retries without limit while package_show returns a falsy value
            while not x:
                print(package_name + " : processing")
                x = package_show(site_root, package_name, delay)
                if not x :
                    sleep(delay*5)
            spam_score = add_spam_score(x, spam_digest)
            if spam_score >= max_spam or x['num_resources'] < 1 :
                #spam
                print(package_name + " : spam")
                continue

            #licenses
            license_title = x.get("license_title", "")
            license_url = x.get("license_url", "")
            # title/url are compared as an unordered pair
            license_key = frozenset((license_title, license_url))
            if license_key not in known_licenses :
                known_licenses.add(license_key)
                ckan_licenses.append({"title":license_title,"url":license_url})

            #organizations
            author = x["author"]
            if author not in known_organizations :
                known_organizations.add(author)
                a_organization = _ckan_fixture("data_connections.Organization")
                a_organization['fields']['name'] = author
                a_organization['fields']['url'] = ''
                organizations.append(a_organization)

            #scientists
            a_scientist, a_manager = _ckan_parse_maintainer(x['maintainer'])
            if a_scientist is not None :
                scientist_key = (a_scientist['fields']['firstname'], a_scientist['fields']['lastname'])
                if scientist_key not in known_scientists :
                    known_scientists.add(scientist_key)
                    scientists.append(a_scientist)
            #end scientists

            in_catalog = x['num_resources'] > 1
            if in_catalog :
                #datacatalog
                a_catalog = _ckan_fixture("data_connections.DataCatalog")
                a_catalog['fields']['name'] = x['title']
                a_catalog['fields']['manager'] = a_manager
                a_catalog['fields']['managing_organization'] = author
                #extra fields for datahub
                a_catalog['fields']['datahub_name'] = x['name']
                if 'spam_score' in x :
                    a_catalog['fields']['spam_score'] = x['spam_score']
                datacatalogs.append(a_catalog)
                #end datacatalogs
                resources = x['resources']
            else :
                # exactly one resource announced; the slice tolerates an empty list
                resources = x['resources'][:1]

            for a_resource in resources :
                #datasets
                datasets.append(_ckan_dataset_entry(x, a_resource, license_title,
                                                    a_manager, spam_score, in_catalog))
                #formats
                format_name = a_resource["format"]
                if format_name not in known_formats :
                    known_formats.add(format_name)
                    a_format = _ckan_fixture("data_connections.Format")
                    a_format['fields']['name'] = format_name
                    a_format['fields']['url'] = ''
                    formats.append(a_format)
                #end formats

            #if appending to output json files
            processed_packages[package_name] = x['revision_timestamp']
        #end for package_name in ckan_package_list
    #end try
    except (KeyboardInterrupt, Exception):
        print("spider KeyboardInterrupt : "  +  str(sys.exc_info()))
        print("len(datasets) : " + str(len(datasets)))
        import traceback
        # report the crawl error first so it is visible even if saving fails
        traceback.print_exc()
        save_progress()
    #end except
    my_licenses(license_convert(ckan_licenses), core_licenses_path, my_licenses_path)
    save_progress()
    process_relations(site_root, datasets, run_state, temp_processed_relations, datarelations_path, delay)
Beispiel #2
0
def ckan_test_package_list(site_root, delay) :
    """Fetch the package list of ``site_root`` and dump it to ``ckan_package_list.json``.

    Parameters:
        site_root: passed to ``package_list`` as the site to query.
        delay: passed through to ``package_list``.

    The result is written, indented by 4 spaces, to ``ckan_package_list.json``
    in the current working directory.
    """
    ckan_package_list = package_list(site_root, delay)
    # the context manager closes the file even if serialization or the write fails
    with open('ckan_package_list.json','w') as f:
        f.write(json.dumps(ckan_package_list,indent=4))