Beispiel #1
0
 def convert_all_to_rdf(self, start_from=0):
     """Convert every stored resource to RDF, logging progress along the way.

     Iterates over the ids returned by ``self.get_files()``. For each id at
     position ``start_from`` or later, a progress message and the id are
     printed and appended to ``rdf_conversion.log`` in ``self.log_folder``.
     Bookkeeping entries (see ``skipped_entries`` below) are logged but not
     processed any further. For every other id a default wiki page is
     generated and created, then the resource is transformed to RDF; the
     transformation message and return code are written to
     ``<resource_id>.log`` in ``self.conversion_log_folder``.

     Args:
         start_from: zero-based position in the id list at which to begin.
             Earlier entries are skipped silently (nothing is printed or
             logged for them).
     """
     conversion_log = DatabasePlainFiles(self.conversion_log_folder)
     process_log = DatabasePlainFiles(self.log_folder)
     process_log_filename = "rdf_conversion.log"

     # Entries of the storage folder that are not resources.
     skipped_entries = (".analyzed", '.all-resources', '.broken_retrieved',
                        "files.tar.gz")

     all_ids = self.get_files()
     overall = len(all_ids)
     for num, resource_id in enumerate(all_ids):
         if num < start_from:
             continue

         # Single-argument print(...) behaves the same under the Python 2
         # print statement and the Python 3 print function.
         progress = "Converting resource to RDF " + str(num) + " out of " + str(overall)
         print(progress)
         print(str(resource_id))
         process_log.addDbaseRaw(process_log_filename, progress + "\n")
         process_log.addDbaseRaw(process_log_filename, str(resource_id) + "\n")

         # Non-resource entries are logged above but not converted.
         if resource_id in skipped_entries:
             continue

         # Init the resource
         resource = ckaninterface.Resource(resource_id)

         # Create the wiki page for the resource
         process_log.addDbaseRaw(process_log_filename,
                                 "creating wiki page for resource" + "\n")
         wiki_page = resource.generate_default_wiki_page()
         wiki_result = str(resource.create_wiki_page(wiki_page))
         # Terminate the entry with a newline like every other process-log
         # entry, so the next progress line does not run into it.
         process_log.addDbaseRaw(process_log_filename, wiki_result + "\n")

         # Transform the resource to RDF.
         # NOTE(review): the configuration name is spelled 'tranformation';
         # left unchanged - confirm it matches the stored configuration.
         sparqlify_message, returncode = resource.transform_to_rdf('default-tranformation-configuration')
         conversion_log.addDbaseRaw(resource_id + '.log', sparqlify_message + "\n" + str(returncode))
Beispiel #2
0
 def download_all_csv_resources(self):
     """Download every CSV resource listed by CKAN and log each result.

     Each entry of ``ckaninterface.CKAN_Application().get_csv_resource_list()``
     is wrapped in a ``ckaninterface.Resource`` and downloaded; the value
     returned by the download is appended to ``download_all_log.txt`` in
     ``self.log_folder``.

     TODO: inaccessible resources (e.g. 404 or 503) are not collected here,
     and no mimetype post-check of the downloaded files is performed.
     """
     db = DatabasePlainFiles(self.log_folder)
     download_all_log = "download_all_log.txt"
     ckan = ckaninterface.CKAN_Application()
     csv_resource_list = ckan.get_csv_resource_list()
     # Iterate the list directly so that the final entry is included;
     # a range(len(...) - 1) index loop stops one element short.
     for entry in csv_resource_list:
         resource = ckaninterface.Resource(entry)
         db.addDbaseRaw(download_all_log, resource._download())
Beispiel #3
0
 def download_n_random_csv(self, n):
     """Download ``n`` randomly selected CSV resources.

     Indices are drawn independently, so the same resource may be picked
     more than once. Each drawn index is appended (one per line) to
     ``random_csv.txt`` in ``self.log_folder`` before the download is
     attempted. Download failures are ignored so that one bad resource does
     not stop the remaining downloads.

     Args:
         n: number of random draws to perform.
     """
     import random

     db = DatabasePlainFiles(self.log_folder)
     random_csv_filename = "random_csv.txt"
     ckan = ckaninterface.CKAN_Application()
     csv_resource_list = ckan.get_csv_resource_list()
     resource_count = len(csv_resource_list)
     if resource_count == 0:
         # Nothing to sample from; drawing an index from an empty range
         # would raise ValueError.
         return
     for _ in range(n):
         # randrange(count) draws from 0 .. count - 1 inclusive.
         rand = random.randrange(resource_count)
         db.addDbaseRaw(random_csv_filename, str(rand) + "\n")
         resource = ckaninterface.Resource(csv_resource_list[rand])
         try:
             resource._download()
         except Exception:
             # Best effort: skip resources that fail to download. Catching
             # Exception (not a bare except) lets KeyboardInterrupt and
             # SystemExit still stop the run.
             pass