Beispiel #1
0
 def process_download_all_log(self):
     """Summarise the HTTP status codes recorded in ``download_all_log.txt``.

     The log is loaded from ``self.log_folder`` through ``DatabasePlainFiles``
     and split into lines.  Lines that start with "Could not download" and
     lines that are empty or contain only whitespace are skipped.  Every
     other line is split on whitespace: the second field is taken as the
     resource id and the fourth field as the integer status code.

     Prints, in this order: the number of resources whose status was 200,
     the number of resources with any other status, and a pretty-printed
     list of the latter as ``{resource_id: status_code}`` dicts.

     Raises:
         IndexError: if a non-blank log line has fewer than four fields.
         ValueError: if the fourth field of a log line is not an integer.
     """
     import pprint

     db_logs = DatabasePlainFiles(self.log_folder)
     raw_log = db_logs.loadDbaseRaw('download_all_log.txt')

     resources_success = []
     resources_check = []
     for line in raw_log.split('\n'):
         fields = line.split()
         # A whitespace-only line (e.g. a lone '\r' left over from CRLF line
         # endings) has no fields; indexing it would raise IndexError.
         if not fields:
             continue
         if line.startswith("Could not download"):
             continue

         resource_id = fields[1]
         status_code = int(fields[3])
         if status_code == 200:
             resources_success.append({resource_id: status_code})
         else:
             resources_check.append({resource_id: status_code})

     # Single-argument print(...) behaves the same as the print statement.
     print(len(resources_success))
     print(len(resources_check))
     pp = pprint.PrettyPrinter(indent=4)
     pp.pprint(resources_check)
Beispiel #2
0
 def delete_bad_response(self):
     """Delete downloaded files whose logged HTTP status was not 200.

     ``download_all_log.txt`` is loaded from ``self.log_folder`` through
     ``DatabasePlainFiles``.  Lines that start with "Could not download" and
     lines that are empty or contain only whitespace are skipped.  For every
     other line the second whitespace-separated field is taken as the
     resource id and the fourth as the integer status code.

     For each resource whose status is not 200, the file ``files/<id>`` is
     removed if it exists.  A confirmation message is printed at the end.

     Raises:
         IndexError: if a non-blank log line has fewer than four fields.
         ValueError: if the fourth field of a log line is not an integer.
     """
     db_logs = DatabasePlainFiles(self.log_folder)
     raw_log = db_logs.loadDbaseRaw('download_all_log.txt')

     # Parse the whole log before deleting anything, so that a malformed
     # line aborts the run before any file has been removed.
     resources_check = []
     for line in raw_log.split('\n'):
         fields = line.split()
         # A whitespace-only line (e.g. a lone '\r' left over from CRLF line
         # endings) has no fields; indexing it would raise IndexError.
         if not fields:
             continue
         if line.startswith("Could not download"):
             continue

         resource_id = fields[1]
         status_code = int(fields[3])
         if status_code != 200:
             resources_check.append(resource_id)

     for resource in resources_check:
         path = 'files/' + resource
         if os.path.exists(path):
             os.remove(path)
     # Single-argument print(...) behaves the same as the print statement.
     print('resources clean-up complete!')
Beispiel #3
0
 def get_failed_resources_ckan_urls(self):
     """Print the CKAN URL for every resource id in ``resources_fail.csv``.

     The file is loaded from ``self.log_folder`` through
     ``DatabasePlainFiles`` and read one id per line; surrounding whitespace
     is stripped from each id.  For every id a ``ckaninterface.Resource`` is
     created and the line ``<resource_id> <ckan_url>`` is printed.

     Blank lines, including the empty entry produced by a trailing newline,
     are skipped so that no ``Resource`` is ever built from an empty id.
     """
     db_logs = DatabasePlainFiles(self.log_folder)
     raw_ids = db_logs.loadDbaseRaw('resources_fail.csv')
     for line in raw_ids.split('\n'):
         resource_id = line.strip()
         if not resource_id:
             continue
         resource = ckaninterface.Resource(resource_id)
         # Single-argument print(...) behaves the same as the print statement.
         print(resource_id + ' ' + resource.ckan_url)
Beispiel #4
0
 def delete_html_pages(self):
     """Remove every file under ``files/`` that is named in ``html_pages.txt``.

     The listing is loaded from ``self.log_folder`` through
     ``DatabasePlainFiles`` and read one name per line.  A name is removed
     only when ``files/<name>`` exists and the name is not empty.  A
     confirmation message is printed once all names have been processed.
     """
     listing = DatabasePlainFiles(self.log_folder).loadDbaseRaw('html_pages.txt')
     for name in listing.split('\n'):
         target = 'files/' + name
         if not os.path.exists(target):
             continue
         # An empty name resolves to the 'files/' directory itself; never
         # try to remove that.
         if name == '':
             continue
         os.remove(target)

     print("clean-up complete!")
Beispiel #5
0
 def choose_n_random(self, n=10):
     """Pretty-print up to ``n`` random ids that have not been analysed yet.

     The ids already analysed are read, one per line, from the file
     ``100_analyze_ids`` in ``files/.analyzed/``.  Candidates are the ids
     returned by ``self.get_files()`` that do not appear in that file.

     The selection is drawn without replacement, so the printed list has no
     repeated picks and contains exactly ``n`` ids whenever at least ``n``
     candidates exist; otherwise it contains all candidates.  An empty
     candidate list or a non-positive ``n`` prints an empty list.

     Args:
         n: maximum number of ids to pick.
     """
     import pprint
     import random

     db = DatabasePlainFiles('files/.analyzed/')
     # assumes ids are hashable strings, as they are compared against the
     # lines of a text file -- TODO confirm against get_files()
     analyzed_ids = set(db.loadDbaseRaw('100_analyze_ids').split('\n'))

     candidates = [file_id for file_id in self.get_files()
                   if file_id not in analyzed_ids]

     # Clamp the sample size: random.sample raises ValueError when asked for
     # more items than are available or for a negative count.
     sample_size = max(0, min(n, len(candidates)))
     output = random.sample(candidates, sample_size)

     # pprint() writes the list itself and returns None, so its result must
     # not be printed as well.
     pp = pprint.PrettyPrinter(indent=4)
     pp.pprint(output)
Beispiel #6
0
 def check_good_response(self):
     """Report which successfully downloaded files look like HTML pages.

     ``download_all_log.txt`` is loaded from ``self.log_folder`` through
     ``DatabasePlainFiles``.  Lines that start with "Could not download" and
     lines that are empty or contain only whitespace are skipped.  For every
     other line the second whitespace-separated field is taken as the
     resource id and the fourth as the integer status code; only resources
     with status 200 are inspected.

     For each such resource one line is printed:

     * ``<id> missing!`` if ``files/<id>`` does not exist,
     * ``<id> larger than 1Mb!`` if the file exceeds 1048576 bytes (the file
       is not read in that case),
     * ``<id> html page!`` if the case-insensitive pattern ``.*html.*``
       matches at the start of the file content,
     * ``<id> ok!`` otherwise.

     Raises:
         IndexError: if a non-blank log line has fewer than four fields.
         ValueError: if the fourth field of a log line is not an integer.
     """
     max_size_bytes = 1048576

     db_logs = DatabasePlainFiles(self.log_folder)
     raw_log = db_logs.loadDbaseRaw('download_all_log.txt')

     resources_success = []
     for line in raw_log.split('\n'):
         fields = line.split()
         # A whitespace-only line (e.g. a lone '\r' left over from CRLF line
         # endings) has no fields; indexing it would raise IndexError.
         if not fields:
             continue
         if line.startswith("Could not download"):
             continue

         resource_id = fields[1]
         status_code = int(fields[3])
         if status_code == 200:
             resources_success.append(resource_id)

     # NOTE(review): bad_resources is collected but never returned or
     # persisted by this method -- confirm whether callers need it.
     bad_resources = []
     for resource in resources_success:
         path = 'files/' + resource
         # A logged resource may no longer be on disk (for instance after a
         # clean-up run); report it instead of letting os.stat() raise.
         if not os.path.exists(path):
             print(str(resource) + ' missing!')
             continue

         if os.stat(path).st_size > max_size_bytes:
             print(str(resource) + ' larger than 1Mb!')
             continue

         # The with-block closes the handle even if read() raises.
         with open(path, 'rb') as handle:
             content = handle.read()

         # re.match anchors at the start and '.' does not cross newlines, so
         # this only detects "html" within the first line of the file.
         if re.match('.*html.*', content, flags=re.I):
             print(str(resource) + ' html page!')
             bad_resources.append(resource)
         else:
             print(str(resource) + ' ok!')