Example #1
def process_relations(site_root, datasets, run_state = "scrape", processed_relations_path = temp_processed_relations,
                      datarelations_path = default_datarelations_path, delay = 10) :
    print("data relation")
    datarelations = []
    ckan_package_names = []
    processed = set()
    if run_state == 'continue' :    
        datarelations = read_json(datarelations_path)
        processed = read_json(processed_relations_path)
    ckan_package_names = { x['fields']['datahub_name'] for x in datasets } 
    try: 
        for x in ckan_package_names :
            print(x)
            if x not in processed :
                related = related_list(site_root, x, delay)
                for d in datasets :
                    if d['fields']['datahub_name'] == x :
                        for a_source in related :
                            print("    " + x + " : " + a_source['title'] )
                            a_relation = { "model":"data_connections.Scientist", "pk":None, 'fields':{} }
                            a_relation['fields']["how_data_was_processed"] = ""                            
                            a_relation['fields']["source"] = [ a_source['url'], a_source['title'] ] 
                            a_relation['fields']["derivative"] = [ d['fields']['url'], d['fields']['name'] ]
                            a_relation['fields']["datahub_name"] = d['fields']['datahub_name']
                            datarelations.append(a_relation)
                            #"how_data_was_processed" : ""
                            #"source" : [this.url, this.title]
                            #"derivative" : [dataset.url, dataset.name]
                            #"datahub_name" : dataset.datahub_name
                        #end for a_source in related
                #end for d in datasets             
                processed.add(x)
        #end for x in ckan_package_names                          
        write_json(datarelations_path, datarelations)
        write_json(processed_relations_path, list(processed))   # sets are not JSON-serializable
        print("len(datarelations) : " + str(len(datarelations)))
    except (KeyboardInterrupt, Exception):
        # Save partial progress on interrupt or error, then report the failure
        print("relations interrupted : " + str(sys.exc_info()))
        print("len(datarelations) : " + str(len(datarelations)))
        write_json(datarelations_path, datarelations)
        write_json(processed_relations_path, list(processed))
        import traceback
        traceback.print_exc()
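
Both examples rely on two small JSON helpers, read_json and write_json, that are not shown in these excerpts. A minimal sketch of the behaviour the code above assumes (the pretty-printing is an assumption, not necessarily the original's):

import json

def read_json(path):
    # Load and return the parsed contents of a JSON file.
    with open(path) as f:
        return json.load(f)

def write_json(path, data):
    # Serialize data to path; sets are not JSON-serializable, hence list(processed) above.
    with open(path, 'w') as f:
        json.dump(data, f, indent=2)
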
Example #2
def ckan_spider(site_root = 'http://datahub.io/', max_spam = default_max_spam, spam_digest = temp_spam_digest, 
                run_state = "scrape", saved_state = temp_saved_state,
                datasets_path=default_datasets_path, my_licenses_path=default_my_licenses_path, formats_path=default_formats_path,
                datacatalogs_path=default_datacatalogs_path, datarelations_path=default_datarelations_path,
                organizations_path=default_organizations_path, scientists_path=default_scientists_path,
                core_licenses_path=default_core_licenses_path, core_formats_path=default_core_formats_path ): 
    # test robots.txt
    # test licenses read, write
    # test datasets read, write
    # datacatalogs, scientists and organizations
    # test formats
    print("running spider")
    delay = robots_txt(site_root)
    print("crawl-delay: " + str(delay))
                                            
    if run_state == 'scrape' :
        ckan_licenses = license_list(site_root,delay)
        #mylicenses = my_licenses(ckan_licenses, core_licenses_path, my_licenses_path)
    
    #read package list
    ckan_package_list = package_list(site_root, delay)    
    processed_packages = {}   
    datacatalogs = []
    datasets = []
    formats = []
    organizations = []
    scientists = []
    
    if run_state == 'continue' :
        # Resume from the paths passed in rather than the module-level defaults
        datacatalogs = read_json(datacatalogs_path)
        datasets = read_json(datasets_path)
        formats = read_json(formats_path)
        organizations = read_json(organizations_path)
        scientists = read_json(scientists_path)
        processed_packages = read_json(saved_state)
        ckan_licenses = read_json(temp_ckan_licenses)
        ckan_package_list = list(set(ckan_package_list) - set(processed_packages))
        
    #for each package in package list
    #read package_show
    try:
                        
        for package_name in ckan_package_list :
            if run_state == 'continue' :
                if package_name in processed_packages :
                    print(package_name + " : skipped")
                    continue
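            # Retry package_show until it returns data, backing off to 5x the crawl delay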
            x = None       
            while not x:                          
                print(package_name + " : processing")
                x = package_show(site_root, package_name, delay)
                if not x :
                    sleep(delay*5)
            #processed_packages[package_name] = x['revision_timestamp']
            spam_score = add_spam_score(x, spam_digest)
            if spam_score >= max_spam or x['num_resources'] < 1 :       
                #spam
                print(package_name + " : spam")
                continue
            
            license_title = ""
            license_url = ""
            if "license_title" in x :
                license_title = x["license_title"]
            if "license_url" in x :
                license_url = x["license_url"]
            if (license_title, license_url) not in [ (license["title"], license["url"]) for license in ckan_licenses ] :
                ckan_licenses.append({ "title":license_title, "url":license_url })
            if x["author"] not in [ item['fields']['name'] for item in organizations ]  :   
                a_organization = { "model":"data_connections.Organization", "pk":None, 'fields':{} }
                a_organization['fields']['name'] = x["author"]              
                a_organization['fields']['url'] = ''
                organizations.append(a_organization)    
            a_manager = []
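            # Derive a Scientist record from the free-text 'maintainer' field: keep the
            # text before the first comma / " and ", then split it once on whitespace
            # into (firstname, rest-of-name).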
            if x['maintainer'] and len(x['maintainer']) > 0 :
                temp = x['maintainer'].split(',')
                temp = temp[0].split(' and ')
                temp = temp[0].split(None, 1)
                a_scientist = { "model":"data_connections.Scientist", "pk":None, 'fields':{} }
                if len(temp) == 2 :         
                    a_scientist['fields']['firstname'] = temp[0]           
                    a_scientist['fields']['lastname'] = temp[1]            
                    a_scientist['fields']['profile_url'] = ""            
                    a_manager = [ temp[0], temp[1], "" ]          
                else :
                    a_scientist['fields']['firstname'] = temp[0]           
                    a_scientist['fields']['lastname'] = x['maintainer']           
                    a_scientist['fields']['profile_url'] = ""                   
                    a_manager = [ temp[0], x['maintainer'], "" ]
                # Avoid duplicate scientists: only append if this name has not been seen
                if [a_scientist['fields']['firstname'], a_scientist['fields']['lastname']] not in [ [s['fields']['firstname'], s['fields']['lastname']] for s in scientists ] :
                    scientists.append(a_scientist)
            else :
                a_manager = None         
            #end scientists
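            # Packages with several resources become a DataCatalog plus one Dataset
            # per resource; a single-resource package becomes a lone Dataset.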
            if x['num_resources'] > 1 :
                #datacatalog
                a_catalog = {"model":"data_connections.DataCatalog","pk":None,"fields":{} }
                a_catalog['fields']['name'] = x['title']                    
                a_catalog['fields']['manager'] = a_manager         
                a_catalog['fields']['managing_organization'] = x['author']   
                #extra fields for datahub             
                a_catalog['fields']['datahub_name'] = x['name']              
                if 'spam_score' in x :                                                   
                    a_catalog['fields']['spam_score'] = x['spam_score']
                datacatalogs.append(a_catalog)
                #end datacatalogs
                #resources
                for a_resource in x['resources'] :
                    #datasets                    
                    a_dataset = { "model":"data_connections.Dataset", "pk":None, "fields":{} }
                    a_dataset['fields']['description'] = unicode(a_resource['description']).strip() + ' ( ' + unicode(x['notes']).strip() + ' ) '
                    a_dataset['fields']['license'] = [ x['license_title'] ]                         
                    a_dataset['fields']['date_last_edited'] = a_resource["last_modified"]
                    a_dataset['fields']['url'] = a_resource["url"]                
                    if a_resource["format"] and len(a_resource["format"]) > 0 :                           
                        a_dataset['fields']['data_format'] = [ a_resource["format"] ]
                    else :                                                                            
                        a_dataset['fields']['data_format'] = [ a_resource["mimetype"] ]                    
                    a_dataset['fields']['date_published'] = a_resource["last_modified"]
                    a_dataset['fields']['manager'] = a_catalog['fields']['manager']                            
                    a_dataset['fields']['managing_organization'] = x['author']       
                    a_dataset['fields']['data_catalog'] = [ x['title'] ]
                    a_dataset['fields']['name'] = unicode(x['title']) + " " + unicode(a_resource['name']) + " " + unicode(a_resource['format']) # catalog.title + name + format       
                    #extra fields for datahub             
                    a_dataset['fields']['datahub_name'] = x['name']             
                    a_dataset['fields']['spam_score'] = spam_score
                    datasets.append(a_dataset)
                    #formats
                    if a_resource["format"] not in [ item['fields']['name'] for item in formats ]  :
                        a_format = { "model":"data_connections.Format", "pk":None, 'fields':{} }
                        a_format['fields']['name'] = a_resource["format"]              
                        a_format['fields']['url'] = ''
                        formats.append(a_format)
            #end if x['num_resources'] > 1          
            elif x['num_resources'] == 1 :
                #datasets                       
                a_dataset = { "model":"data_connections.Dataset", "pk":None, "fields":{} }
                a_resource = x['resources'][0]
                a_dataset['fields']['description'] = unicode(x['notes']).strip() + ' ' + unicode(a_resource['description']).strip()
                a_dataset['fields']['license'] = [ x['license_title'] ]                         
                a_dataset['fields']['date_last_edited'] = x["metadata_modified"]
                a_dataset['fields']['url'] = a_resource["url"]                
                if a_resource["format"] and len(a_resource["format"]) > 0 :                           
                    a_dataset['fields']['data_format'] = [ a_resource["format"] ]
                else :                                                                            
                    a_dataset['fields']['data_format'] = [ a_resource["mimetype"] ]                    
                a_dataset['fields']['date_published'] = x["metadata_created"]
                a_dataset['fields']['manager'] = a_manager                                
                a_dataset['fields']['managing_organization'] = x['author']       
                a_dataset['fields']['data_catalog'] = [ x['title'] ]
                a_dataset['fields']['name'] = unicode(x['title']) + " " + unicode(a_resource['name']) + " " + unicode(a_resource['format']) # catalog.title + name + format       
                #extra fields for datahub             
                a_dataset['fields']['datahub_name'] = x['name']                                         
                a_dataset['fields']['spam_score'] = spam_score
                datasets.append(a_dataset)
                #end datasets
                #formats
                if a_resource["format"] not in [ item['fields']['name'] for item in formats ]  :
                    a_format = { "model":"data_connections.Format", "pk":None, 'fields':{} }
                    a_format['fields']['name'] = a_resource["format"]              
                    a_format['fields']['url'] = ''
                    formats.append(a_format)                
                #end formats
            #end elif x['num_resources'] == 1
            # Record this package as processed so a 'continue' run can skip it
            processed_packages[package_name] = x['revision_timestamp']
        #end for package_name in ckan_package_list
    #end try
    except (KeyboardInterrupt, Exception):
        # Save partial progress on interrupt or error before reporting it
        print("spider interrupted : " + str(sys.exc_info()))
        print("len(datasets) : " + str(len(datasets)))
        write_json(datacatalogs_path, datacatalogs)   
        write_json(datasets_path, datasets)                  
        write_json(organizations_path, organizations)
        write_json(scientists_path, scientists)              
        write_json(temp_ckan_licenses, ckan_licenses)         
        core_formats = read_json(core_formats_path)
        core_format_names = [ item['fields']['name'].lower() for item in core_formats ]
        my_formats = []
        for a_format in formats :
            if a_format['fields']['name'].lower() not in core_format_names :
                a_format['pk'] = len(my_formats)+1
                my_formats.append(a_format)
        write_json(formats_path, my_formats)
        write_json(saved_state, processed_packages)
        import traceback
        traceback.print_exc()
    #end except        
    mylicenses = my_licenses(license_convert(ckan_licenses), core_licenses_path, my_licenses_path)
    write_json(datacatalogs_path, datacatalogs)   
    write_json(datasets_path, datasets)                  
    write_json(organizations_path, organizations)
    write_json(scientists_path, scientists)                       
    core_formats = read_json(core_formats_path)
    core_format_names = [ item['fields']['name'].lower() for item in core_formats ]
    my_formats = []
    for a_format in formats :
        if a_format['fields']['name'].lower() not in core_format_names :
            a_format['pk'] = len(my_formats)+1
            my_formats.append(a_format)          
    write_json(formats_path, my_formats)
    write_json(saved_state, processed_packages)
    process_relations(site_root, datasets, run_state, temp_processed_relations, datarelations_path, delay)
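
The spider also depends on networking helpers that are not shown here: package_list, package_show and related_list wrap CKAN's Action API, and sleep comes from time. A hedged sketch of what package_show is assumed to do (the in-call sleep and the error handling are guesses, not the author's code):

import json
import time
try:                                    # Python 3
    from urllib.request import urlopen
    from urllib.parse import quote
except ImportError:                     # Python 2 (the excerpts use unicode())
    from urllib2 import urlopen
    from urllib import quote

def package_show(site_root, package_name, delay):
    # Sketch: fetch one package via the CKAN Action API, honouring the crawl
    # delay, and return the package dict -- or None so the caller can retry.
    time.sleep(delay)
    url = site_root.rstrip('/') + '/api/3/action/package_show?id=' + quote(package_name)
    try:
        payload = json.loads(urlopen(url).read().decode('utf-8'))
    except Exception:
        return None
    return payload['result'] if payload.get('success') else None
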
Example #3
            if debug_item == 'robots' :
                ckan_test_robots(site_root)     
            if debug_item == 'licenses' :
                ckan_test_licenses(site_root, core_licenses_path, my_licenses_path)
            if debug_item == 'package_list' :
                ckan_test_package_list(site_root, 10)
            if debug_item == 'package_read' :
                ckan_test_package_read(site_root, temp_ckan_package_list, temp_ckan_datasets, 10)
            if debug_item == 'spam' :
                ckan_test_spam_score(temp_ckan_datasets, temp_scored_datasets, spam_digest)          
            if debug_item == 'convert' :
                #ckan_test_convert_package('ckan_datasets.json')       
                ckan_test_convert_package(temp_scored_datasets)                                 
            if debug_item == 'relations' or debug_item == 'relations_scrape':
                datasets = read_json(datasets_path)     
                process_relations(site_root, datasets, "scrape", temp_processed_relations, datarelations_path, 10)       
            if debug_item == 'relations_continue':
                datasets = read_json(datasets_path)     
                process_relations(site_root, datasets, "continue", temp_processed_relations, datarelations_path, 10)
    elif sys.argv[1] == 'scrape' or sys.argv[1] == 'continue' :
        #print("spider " + sys.argv[1])
        ckan_spider(site_root, max_spam, spam_digest,
                    sys.argv[1], saved_state,
                    datasets_path, my_licenses_path, formats_path,
                    datacatalogs_path, datarelations_path,
                    organizations_path, scientists_path,
                    core_licenses_path, core_formats_path)
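
Example #3 is a fragment from the script's command-line dispatch; the enclosing structure is not part of these excerpts. A purely hypothetical frame it could sit in (the debug_item loop and argument layout are assumptions):

import sys

if __name__ == '__main__':
    if len(sys.argv) > 2 and sys.argv[1] == 'debug':
        for debug_item in sys.argv[2:]:   # assumption: one test name per argument
            pass                          # ... the `if debug_item == ...` chain above
    elif sys.argv[1] == 'scrape' or sys.argv[1] == 'continue':
        pass                              # ... the ckan_spider(...) call above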