def main(source_path, dest_path, processes, source_names, start_from):
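    """Reprocess the raw html of the selected sources from source_path into
    dest_path (optionally with a multiprocessing pool), report timing stats,
    and save the per-source errors next to dest_path as '<dest>_errors.json'."""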
    if not os.path.exists(dest_path):
        print "°°° Creating missing destination root:", dest_path
        os.makedirs(dest_path)
    # only the sources with a registered parser module can be reprocessed
    provider_names = NAME_TO_SOURCE_MODULE_MAPPING.keys()

    before = datetime.now()
    selected_names = [name for name in provider_names if name in source_names]

    if processes > 1:
        import multiprocessing as mp
        p = mp.Pool(processes)
        results = p.map(reprocess_raw_html, [(name, source_path, dest_path, start_from) for name in selected_names])
    else:
        results = list()
        for name in selected_names:
            print "***", name
            results.append(reprocess_raw_html((name, source_path, dest_path, start_from)))

    n_samples = sum(x[0] for x in results)
    # results come back in the same order as selected_names
    errors_by_source = dict(zip(selected_names, [x[1] for x in results]))

    after = datetime.now()
    dt = after - before

    if n_samples:
        print u"Total time for {0} articles: {1} seconds".format(n_samples, dt.seconds)
        avg_time = float(dt.seconds) / n_samples
        print u"Avg time per articles: {0} seconds".format(avg_time)
    else:
        print u"No articles were processed"

    write_dict_to_file(errors_by_source, os.path.join(dest_path, os.path.pardir), os.path.basename(dest_path) + "_errors.json")


def list_errors(db_root, outfile, source_list):
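    """Walk every day and batch of the selected providers under db_root,
    print a summary of the parsing errors found, and dump them all to
    outfile as json."""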
    res = dict()
    all_errors = dict()
    if not source_list:
        source_names = get_all_provider_names(db_root)
    else:
        source_names = source_list.split(",")

    for source_name in source_names:
        provider_db = Provider(db_root, source_name)
        error_count = 0
        all_errors[source_name] = list()
        for date_string in provider_db.get_all_days():
            errors_by_batch = provider_db.get_errors2_per_batch(date_string)

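            # each batch comes back as a (batch_time, nested error lists) pair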
            for (batch_time, errors) in errors_by_batch:
                # flatten the nested per-article error lists into one list
                errors = list(it.chain(*errors))

                errors = filter_identical_ErrorLogEntries(errors)
                error_count += len(errors)

                if errors:
                    #print source_name, date_string, batch_time
                    for e in errors:
                        new_item = ((u"{0}/{1}".format(date_string, batch_time)), (e.url, e.title, e.stacktrace))
                        print u"+++ [{0}] {1}   ({2})".format(new_item[0], new_item[1][1], new_item[1][0])
                        all_errors[source_name].append(new_item)

        res[source_name] = error_count

    print "\n" * 4
    for name, error_count in res.items():
        print "{0}: Had {1} errors".format(name, error_count)

    with open(outfile, 'w') as f:
        json.dump(all_errors, f, indent=2)


import argparse
import csxj.db as csxjdb

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Deletes the cached metainfo (article and error counts)')
    parser.add_argument('--jsondb', type=str, dest='jsondb', required=True, help='json db root directory')
    args = parser.parse_args()

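    # wipe the cached per-provider metainfo (article and error counts)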
    for source_name in csxjdb.get_all_provider_names(args.jsondb):
        p = csxjdb.Provider(args.jsondb, source_name)
        p.remove_all_cached_metainfo()