def main(source_path, dest_path, processes, source_names, start_from): if not os.path.exists(dest_path): print "°°° Creating missing destination root:", dest_path os.makedirs(dest_path) provider_names = csxjdb.get_all_provider_names(source_path) provider_names = NAME_TO_SOURCE_MODULE_MAPPING.keys() before = datetime.now() n_samples = 0 errors_by_source = dict() if processes > 1: import multiprocessing as mp p = mp.Pool(processes) results = p.map(reprocess_raw_html, [(name, source_path, dest_path, start_from) for name in provider_names if name in source_names]) else: results = list() for name in [_ for _ in provider_names if _ in source_names]: print "***", name results.append(reprocess_raw_html((name, source_path, dest_path, start_from))) n_samples = sum([x[0] for x in results]) errors_by_source = [x[1] for x in results] after = datetime.now() dt = after - before if n_samples: print u"Total time for {0} articles: {1} seconds".format(n_samples, dt.seconds) avg_time = float(dt.seconds) / n_samples print u"Avg time per articles: {0} seconds".format(avg_time) else: print u"No articles were processed" write_dict_to_file(errors_by_source, os.path.join(dest_path, os.path.pardir), os.path.basename(dest_path) + "_errors.json")
def list_errors(db_root, outfile, source_list): res = dict() all_errors = dict() if not source_list: source_names = get_all_provider_names(db_root) else: source_names = source_list.split(",") for source_name in source_names: provider_db = Provider(db_root, source_name) error_count = 0 all_errors[source_name] = dict() all_errors[source_name] = list() for date_string in provider_db.get_all_days(): errors_by_batch = provider_db.get_errors2_per_batch(date_string) for (batch_time, errors) in errors_by_batch: errors = it.chain(*errors) #errors = flatten_list(errors) errors = filter_identical_ErrorLogEntries(errors) error_count += len(errors) if errors: #print source_name, date_string, batch_time for e in errors: new_item = ((u"{0}/{1}".format(date_string, batch_time)), (e.url, e.title, e.stacktrace)) print u"+++ [{0}] {1} ({2})".format(new_item[0], new_item[1][1], new_item[1][0]) all_errors[source_name].append(new_item) source_parser = NAME_TO_SOURCE_MODULE_MAPPING[source_name] res[source_name] = error_count print "\n" * 4 for name, error_count in res.items(): print "{0}: Had {1} errors".format(name, error_count) print "{0}: Had {1} errors".format(name, len(all_errors[name])) with open(outfile, 'w') as f: json.dump(all_errors, f, indent=2)
import argparse

import csxj.db as csxjdb


if __name__ == '__main__':
    # Command-line entry point: wipe the cached metainfo (article and error
    # counts) of every provider found under the given json db root.
    arg_parser = argparse.ArgumentParser(
        description='Deletes the cached metainfo (article and error counts)')
    arg_parser.add_argument('--jsondb', type=str, dest='jsondb', required=True,
                            help='json db root directory')
    parsed_args = arg_parser.parse_args()

    db_root = parsed_args.jsondb
    for provider_name in csxjdb.get_all_provider_names(db_root):
        csxjdb.Provider(db_root, provider_name).remove_all_cached_metainfo()