def process_download_all_log(self):
    """Parse download_all_log.txt and report per-resource HTTP status.

    Each log line is expected to look like
    ``<word> <resource_id> <word> <status_code> ...``; lines starting with
    "Could not download" are collected as hard failures.  Prints the count
    of 200 responses, the count of non-200 responses, and pretty-prints
    the non-200 entries.

    Returns:
        Tuple ``(resources_success, resources_check, resources_fail)``:
        the first two hold ``{resource_id: status_code}`` dicts, the last
        holds the raw failure lines.  (The original returned None; the
        return value is a backward-compatible addition.)
    """
    import pprint

    db_logs = DatabasePlainFiles(self.log_folder)
    log_lines = db_logs.loadDbaseRaw('download_all_log.txt').split('\n')

    resources_success = []
    resources_check = []
    resources_fail = []
    for line in log_lines:
        if re.match("^Could not download", line):
            resources_fail.append(line)
            continue
        if line == '':
            continue
        # Split once per line instead of twice.
        parts = line.split()
        resource_id = parts[1]
        status_code = int(parts[3])
        if status_code == 200:
            resources_success.append({resource_id: status_code})
        else:
            resources_check.append({resource_id: status_code})

    print(len(resources_success))
    print(len(resources_check))
    pprint.PrettyPrinter(indent=4).pprint(resources_check)
    return resources_success, resources_check, resources_fail
def delete_bad_response(self):
    """Delete downloaded files whose HTTP status was not 200.

    Re-parses download_all_log.txt (same line format as
    process_download_all_log) and removes ``files/<resource_id>`` for
    every resource whose recorded status code is not 200.
    """
    db_logs = DatabasePlainFiles(self.log_folder)
    log_lines = db_logs.loadDbaseRaw('download_all_log.txt').split('\n')

    resources_check = []
    for line in log_lines:
        # "Could not download" lines never produced a file; skip them.
        if re.match("^Could not download", line):
            continue
        if line == '':
            continue
        parts = line.split()
        resource_id = parts[1]
        status_code = int(parts[3])
        if status_code != 200:
            resources_check.append(resource_id)

    for resource in resources_check:
        # os.path.join instead of hard-coded '/' concatenation.
        path = os.path.join('files', resource)
        if os.path.exists(path):
            os.remove(path)
    print('resources clean-up complete!')
def get_failed_resources_ckan_urls(self):
    """Print ``<resource_id> <ckan_url>`` for every id in resources_fail.csv.

    Each non-empty line of the file is treated as one resource id.  Blank
    lines are skipped: the original passed the empty string produced by
    the trailing newline of split('\\n') straight into
    ``ckaninterface.Resource('')``.
    """
    db_logs = DatabasePlainFiles(self.log_folder)
    failed_lines = db_logs.loadDbaseRaw('resources_fail.csv').split('\n')
    for line in failed_lines:
        resource_id = line.strip()
        if not resource_id:
            continue  # guard against empty/whitespace-only lines
        resource = ckaninterface.Resource(resource_id)
        print(resource_id + ' ' + resource.ckan_url)
def delete_html_pages(self):
    """Remove every file under files/ whose name is listed in html_pages.txt."""
    db_logs = DatabasePlainFiles(self.log_folder)
    page_ids = db_logs.loadDbaseRaw('html_pages.txt').split('\n')
    for page_id in page_ids:
        target = 'files/' + page_id
        # Skip the empty name from the trailing newline; only remove
        # entries that actually exist on disk.
        if page_id != '' and os.path.exists(target):
            os.remove(target)
    print("clean-up complete!")
def choose_n_random(self, n=10):
    """Pick up to *n* random file ids that have not been analyzed yet.

    Draws n random indices with replacement (so duplicates are possible
    and fewer than n ids may survive the filter) and keeps those not
    listed in files/.analyzed/100_analyze_ids.

    Args:
        n: number of random draws to attempt (default 10).

    Returns:
        The list of selected ids.  (The original returned None; the
        return value is a backward-compatible addition.)
    """
    import pprint
    import random

    db = DatabasePlainFiles('files/.analyzed/')
    analyzed_ids = db.loadDbaseRaw('100_analyze_ids').split('\n')
    all_ids = self.get_files()
    last_index = len(all_ids) - 1

    output = []
    for _ in range(n):
        candidate = all_ids[random.randint(0, last_index)]
        if candidate not in analyzed_ids:
            output.append(candidate)

    # Bug fix: pp.pprint() prints and returns None, so the original
    # "print pp.pprint(output)" emitted a stray "None" line afterwards.
    pprint.PrettyPrinter(indent=4).pprint(output)
    return output
def check_good_response(self):
    """Scan successfully downloaded (HTTP 200) files for HTML error pages.

    Re-parses download_all_log.txt, then for each resource that answered
    200:
      * reports and skips files larger than 1 MiB,
      * flags files containing 'html' (case-insensitive) anywhere in
        their content as HTML pages.

    Returns:
        List of resource ids that look like HTML pages.  (The original
        returned None; the return value is a backward-compatible
        addition.)
    """
    db_logs = DatabasePlainFiles(self.log_folder)
    log_lines = db_logs.loadDbaseRaw('download_all_log.txt').split('\n')

    resources_success = []
    for line in log_lines:
        if re.match("^Could not download", line):
            continue
        if line == '':
            continue
        parts = line.split()
        if int(parts[3]) == 200:
            resources_success.append(parts[1])

    max_size_bytes = 1048576  # 1 MiB cap on files we are willing to read
    bad_resources = []
    for resource in resources_success:
        path = 'files/' + resource
        if os.stat(path).st_size > max_size_bytes:
            print(str(resource) + ' larger than 1Mb!')
            continue
        # Use a context manager so the handle is closed even on error.
        with open(path, 'rb') as handle:
            content = handle.read()
        # Bug fix: the original re.match('.*html.*', ...) only inspected
        # the first line ('.' does not cross newlines) and mixed a str
        # pattern with bytes content; search the whole payload instead.
        if re.search(b'html', content, flags=re.I):
            print(str(resource) + ' html page!')
            bad_resources.append(resource)
        else:
            print(str(resource) + ' ok!')
    return bad_resources