Example 1
 def get_stats(self):
     import pprint
     printer = pprint.PrettyPrinter(indent=4)
     db = DatabasePlainFiles('stats/')
     stats = db.loadDbase('stats17028')
     
     #tag cloud
     """
     tag_cloud = []
     for tag in stats['tags']:
         if stats['tags'][tag] > 15: #5 is okay
             for i in range(int(stats['tags'][tag] / 15)):
                 tag_cloud.append(tag)
         
     import json
     db.saveDbaseRaw('tag_cloud', json.dumps(tag_cloud))
     """
     
     #tags overall
     tag_usage = 0
     tag_count = 0
     for tag in stats['tags']:
         tag_usage = tag_usage + stats['tags'][tag]
         tag_count = tag_count + 1
     
     print(tag_usage)
     print(tag_count)
     
     
     #format statistics
     """
Example 2
 def process_download_all_log(self):
     db_logs = DatabasePlainFiles(self.log_folder)
     
     download_all_log = db_logs.loadDbaseRaw('download_all_log.txt')
     download_all_log = download_all_log.split('\n')
     resources_success = []
     resources_check = []
     resources_fail = []
     for line in download_all_log:
         if( re.match("^Could not download", line) ):
             resources_fail.append(line)
             continue
         
         if(line == ''):
             continue
         
         # assumed log layout: resource id in field 1, HTTP status code in field 3
         parts = line.split()
         resource_id = parts[1]
         status_code = int(parts[3])
         if(status_code == 200):
             resources_success.append({resource_id: status_code})
         else:
             resources_check.append({resource_id: status_code})
     
     print(len(resources_success))
     print(len(resources_check))
     import pprint
     pp = pprint.PrettyPrinter(indent=4)
     pp.pprint(resources_check)
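Examples 2, 3, and 9 repeat this same loop over download_all_log.txt. A sketch of a shared helper that returns all three lists at once, assuming the log layout the slicing above implies (resource id in field 1, HTTP status code in field 3); parse_download_log is a hypothetical name, and re is imported at module level as in the originals:

 def parse_download_log(self):
     db_logs = DatabasePlainFiles(self.log_folder)
     lines = db_logs.loadDbaseRaw('download_all_log.txt').split('\n')
     resources_success, resources_check, resources_fail = [], [], []
     for line in lines:
         if line == '':
             continue
         if re.match("^Could not download", line):
             resources_fail.append(line)          # keep the whole error line
             continue
         parts = line.split()
         resource_id, status_code = parts[1], int(parts[3])
         if status_code == 200:
             resources_success.append(resource_id)
         else:
             resources_check.append(resource_id)
     return resources_success, resources_check, resources_fail

Examples 3 and 9 could then start with resources_success, resources_check, resources_fail = self.parse_download_log().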
Example 3
 def delete_bad_response(self):
     db_logs = DatabasePlainFiles(self.log_folder)
     
     download_all_log = db_logs.loadDbaseRaw('download_all_log.txt')
     download_all_log = download_all_log.split('\n')
     resources_success = []
     resources_check = []
     resources_fail = []
     for line in download_all_log:
         if( re.match("^Could not download", line) ):
             resources_fail.append(line)
             continue
         
         if(line == ''):
             continue
         
         # assumed log layout: resource id in field 1, HTTP status code in field 3
         parts = line.split()
         resource_id = parts[1]
         status_code = int(parts[3])
         if(status_code == 200):
             resources_success.append(resource_id)
         else:
             resources_check.append(resource_id)
     
     for resource in resources_check:
         if(os.path.exists('files/'+resource)):
             os.remove('files/'+resource)
     print('resources clean-up complete!')
Example 4
 def get_failed_resources_ckan_urls(self):
     db_logs = DatabasePlainFiles(self.log_folder)
     resources_fail = db_logs.loadDbaseRaw('resources_fail.csv')
     resources_fail = resources_fail.split('\n')
     for line in resources_fail:
         resource_id = line.strip()
         if(resource_id == ''):
             # skip blank lines such as the trailing newline of the CSV
             continue
         resource = ckaninterface.Resource(resource_id)
         print(resource_id + ' ' + resource.ckan_url)
Example 5
 def delete_html_pages(self):
     db_logs = DatabasePlainFiles(self.log_folder)
     html_pages = db_logs.loadDbaseRaw('html_pages.txt')
     html_pages = html_pages.split('\n')
     for resource in html_pages:
         if(os.path.exists('files/'+resource) and resource != ''):
             os.remove('files/'+resource)
     
     print "clean-up complete!"
Example 6
 def download_all_csv_resources(self):
     """ Download csv resources
         if resource unaccessible (404 or 503) - add to the list
         post-processing
             - check mimetype of the file
             - if not csv - report
     """
     db = DatabasePlainFiles(self.log_folder)
     download_all_log = "download_all_log.txt"
     ckan = ckaninterface.CKAN_Application()
     csv_resource_list = ckan.get_csv_resource_list()
     # iterate over the whole list; the original range(len(...) - 1)
     # bound silently skipped the last resource
     for resource_id in csv_resource_list:
         resource = ckaninterface.Resource(resource_id)
         db.addDbaseRaw(download_all_log, resource._download())
Example 7
 def download_n_random_csv(self, n):
     db = DatabasePlainFiles(self.log_folder)
     random_csv_filename = "random_csv.txt"
     import random
     ckan = ckaninterface.CKAN_Application()
     csv_resource_list = ckan.get_csv_resource_list()
     csv_resource_list_max = len(csv_resource_list) - 1
     for i in range(n):
         # randint samples with replacement, so the same resource can be drawn twice
         rand = random.randint(0, csv_resource_list_max)
         db.addDbaseRaw(random_csv_filename, str(rand) + "\n")
         resource = ckaninterface.Resource(csv_resource_list[rand])
         try:
             resource._download()
         except Exception:
             # a bare except would also swallow KeyboardInterrupt;
             # ignore only ordinary download errors and move on
             pass
Example 8
 def choose_n_random(self, n=10):
     db = DatabasePlainFiles('files/.analyzed/')
     analyzed_ids = db.loadDbaseRaw('100_analyze_ids')
     analyzed_ids = analyzed_ids.split('\n')
     all_ids = self.get_files()
     csv_resource_list_max = len(all_ids) - 1
     
     output = []
     import random
     for i in range(n):
         # indices are drawn with replacement and already-analyzed ids are
         # skipped, so output may hold duplicates and fewer than n entries
         rand = random.randint(0, csv_resource_list_max)
         if(not all_ids[rand] in analyzed_ids):
             output.append(all_ids[rand])
     
     import pprint
     pp = pprint.PrettyPrinter(indent=4)
     pp.pprint(output)  # pprint prints directly; the original print pp.pprint(...) also printed None
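Examples 7 and 8 both draw indices with random.randint, which samples with replacement: the same resource can be picked twice, and Example 8 can return fewer than n ids. A sketch of a duplicate-free variant of Example 8 using random.sample (choose_n_random_unique is a hypothetical name):

 def choose_n_random_unique(self, n=10):
     import random
     db = DatabasePlainFiles('files/.analyzed/')
     analyzed_ids = db.loadDbaseRaw('100_analyze_ids').split('\n')
     # keep only ids that have not been analyzed yet, then sample
     # without replacement; cap n at the number of candidates
     candidates = [rid for rid in self.get_files() if rid not in analyzed_ids]
     return random.sample(candidates, min(n, len(candidates)))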
Example 9
 def check_good_response(self):
     db_logs = DatabasePlainFiles(self.log_folder)
     
     download_all_log = db_logs.loadDbaseRaw('download_all_log.txt')
     download_all_log = download_all_log.split('\n')
     resources_success = []
     resources_check = []
     resources_fail = []
     for line in download_all_log:
         if( re.match("^Could not download", line) ):
             resources_fail.append(line)
             continue
         
         if(line == ''):
             continue
         
         # assumed log layout: resource id in field 1, HTTP status code in field 3
         parts = line.split()
         resource_id = parts[1]
         status_code = int(parts[3])
         if(status_code == 200):
             resources_success.append(resource_id)
         else:
             resources_check.append(resource_id)
     
     bad_resources = []
     max_size_bytes = 1048576  # 1 MB
     for resource in resources_success:
         statinfo = os.stat('files/'+resource)
         if(statinfo.st_size > max_size_bytes):
             print(str(resource) + ' larger than 1 MB!')
             continue
         
         f = open('files/'+resource, 'rb')
         content = f.read()
         f.close()
         
         # re.search scans the whole file; the original re.match('.*html.*')
         # only checked the first line, because '.' does not match newlines
         if(re.search('html', content, flags=re.I)):
             print(str(resource) + ' html page!')
             bad_resources.append(resource)
         else:
             print(str(resource) + ' ok!')
Example 10
 def read_data_folder(self):
     import pickle
     #get data folder list
     data_folder = 'data/'
     file_list = os.listdir(data_folder)
     
     stats = {
         'maintainer': {},
         'isopen': {},
         'author': {},
         'version': {},
         'license_id': {},
         'type': {},
         'mimetype': {},
         'format': {},
         'resource_type': {},
         'tags': {},
         'groups': {},
         'license': {},
         'license_title': {},
         'geographic_coverage': {},
         'geographical_granularity': {},
         'temporal_coverage-from': {},
         'temporal_coverage-to': {},
         'temporal_granularity': {},
         'national_statistic': {},
         'precision': {},
         'series': {},
         'date_released': {},
         'categories': {}            
     }
     
     import pprint
     printer = pprint.PrettyPrinter(indent=4)
     db = DatabasePlainFiles('stats/')
     
     # resume from the checkpoint saved after package 14061; this replaces
     # the empty stats skeleton initialised above
     stats = db.loadDbase('stats14061')
     
     for num, filename in enumerate(file_list):
         print(num)
         # skip packages already folded into the checkpoint, plus the package index file
         if(num < 14061 or filename == "package_list"):
             continue
         f = open(data_folder + filename)
         package = pickle.load(f)
         f.close()
         self.add_to_stats(package['maintainer'], 'maintainer', stats)
         self.add_to_stats(package['isopen'], 'isopen', stats)
         self.add_to_stats(package['author'], 'author', stats)
         self.add_to_stats(package['version'], 'version', stats)
         self.add_to_stats(package['type'], 'type', stats)
         for resource in package['resources']:
             self.add_to_stats(resource['mimetype'], 'mimetype', stats)
             self.add_to_stats(resource['format'], 'format', stats)
             self.add_to_stats(resource['resource_type'], 'resource_type', stats)
         
         for tag in package['tags']:
             self.add_to_stats(tag, 'tags', stats)
         
         for group in package['groups']:
             self.add_to_stats(group, 'groups', stats)
         
         self.add_to_stats(package['license'], 'license', stats)
         self.add_to_stats(package['license_title'], 'license_title', stats)
         
         try:
             self.add_to_stats(package['extras']['geographic_coverage'], 'geographic_coverage', stats)
             self.add_to_stats(package['extras']['geographical_granularity'], 'geographical_granularity', stats)
             self.add_to_stats(package['extras']['temporal_coverage-from'], 'temporal_coverage-from', stats)
             self.add_to_stats(package['extras']['temporal_coverage-to'], 'temporal_coverage-to', stats)
             self.add_to_stats(package['extras']['temporal_granularity'], 'temporal_granularity', stats)
             self.add_to_stats(package['extras']['series'], 'series', stats)
             self.add_to_stats(package['extras']['precision'], 'precision', stats)
             self.add_to_stats(package['extras']['national_statistic'], 'national_statistic', stats)
             self.add_to_stats(package['extras']['date_released'], 'date_released', stats)
             self.add_to_stats(package['extras']['categories'], 'categories', stats)
         except Exception:
             # not every package carries all 'extras' fields; skip the missing ones
             pass
         
         # checkpoint after every package so a later run can resume here
         db.saveDbase('stats' + str(num), stats)
         
     #stats are saved to file incrementally inside the loop above
     print('script executed!')
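All the counting above goes through self.add_to_stats, which is not shown in these examples. Judging from how the stats dictionary is built and read back (e.g. stats['tags'][tag] in Example 1), it presumably increments a per-value counter; a minimal sketch under that assumption:

 def add_to_stats(self, value, key, stats):
     # count occurrences of each value under the given top-level key
     value = str(value)
     stats[key][value] = stats[key].get(value, 0) + 1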
Example 11
 def convert_all_to_rdf(self, start_from=0):
     conversion_log = DatabasePlainFiles(self.conversion_log_folder)
     process_log = DatabasePlainFiles(self.log_folder)
     process_log_filename = "rdf_conversion.log"
     all_ids = self.get_files()
     overall = len(all_ids)
     for num, resource_id in enumerate(all_ids):
         if(num < start_from):
             continue
         print "Converting resource to RDF " + str(num) + " out of " + str(overall)
         print str(resource_id)
         string = "Converting resource to RDF " + str(num) + " out of " + str(overall) + "\n"
         process_log.addDbaseRaw(process_log_filename, string)
         string = str(resource_id) + "\n"
         process_log.addDbaseRaw(process_log_filename, string)
         
         #Skip folder entries and the archive; they are not resource files
         if(resource_id in (".analyzed", ".all-resources", ".broken_retrieved", "files.tar.gz")):
             continue
         
         #Init the resource
         resource = ckaninterface.Resource(resource_id)
         
         #create wiki-page for resource
         string = "creating wiki page for resource" + "\n"
         process_log.addDbaseRaw(process_log_filename, string)
         wiki_page = resource.generate_default_wiki_page()
         string = str(resource.create_wiki_page(wiki_page))
         process_log.addDbaseRaw(process_log_filename, string)
         
         #transform resource to RDF
         sparqlify_message, returncode = resource.transform_to_rdf('default-tranformation-configuration')
         conversion_log.addDbaseRaw(resource_id + '.log', sparqlify_message + "\n" + str(returncode))
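Nearly every example leans on ckaninterface.Resource, whose definition is also not included. A rough stub of the interface these snippets assume; the method list and docstrings are inferred from usage, and the bodies are placeholders rather than the real implementation:

 class Resource(object):
     """Stub inferred from usage in the examples above; not the real class."""
     def __init__(self, resource_id):
         self.resource_id = resource_id
         self.ckan_url = None          # CKAN page URL (printed in Example 4)
 
     def _download(self):
         """Fetch the file; returns a log line recording the HTTP status (Example 6)."""
         raise NotImplementedError
 
     def generate_default_wiki_page(self):
         """Build the default wiki-page text for this resource (Example 11)."""
         raise NotImplementedError
 
     def create_wiki_page(self, wiki_page):
         """Publish the page; the result is logged in Example 11."""
         raise NotImplementedError
 
     def transform_to_rdf(self, configuration_name):
         """Run the Sparqlify conversion; returns (message, returncode) (Example 11)."""
         raise NotImplementedError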