def reprocess_errors(db_root, sources):
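    """Walk every stored day and batch for each source and, whenever a batch
    has recorded errors, re-run extraction for that batch and save the
    reprocessed articles, deletions, raw data and remaining errors.

    Relies on the surrounding module's Provider, reprocess_batch_errors,
    save_reprocessed_data and update_errors_file helpers (assumed to be
    imported at module level, along with os).
    """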

    for source in sources:
        provider_db = Provider(db_root, source.SOURCE_NAME)

        for date_string in provider_db.get_all_days():
            errors_by_batch = provider_db.get_errors2_per_batch(date_string)
            for (time, errors) in errors_by_batch:
                if errors:
                    print source.SOURCE_NAME, date_string, time, "found {0} errors".format(len(errors))
                    batch_directory = os.path.join(db_root, source.SOURCE_NAME, date_string, time)
                    articles, deleted_articles, errors, raw_data = reprocess_batch_errors(source, date_string, time, errors)
                    save_reprocessed_data(batch_directory, articles, deleted_articles, raw_data)
                    update_errors_file(batch_directory, errors)

def try_download_queue(json_db):
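    """Try to download every article queued for the hard-coded list of source
    modules (rtlinfo, lesoir, lalibre, dhnet, lavenir) and report, per URL,
    how many links were extracted or that no article was found.

    Results are only printed, not stored; Provider and the source modules are
    assumed to be imported at module level.
    """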
    sources = [rtlinfo, lesoir, lalibre, dhnet, lavenir]
    for source in sources:
        p = Provider(json_db, source.SOURCE_NAME)
        batches_by_day = p.get_queued_batches_by_day()
        print source.SOURCE_NAME
        for day, batches in batches_by_day:
            print "\tDay:", day
            for batch, items in batches:
                print "\t\tBatch:", batch
                articles = items['articles']
                for title, url in articles:
                    print "\t\t\tDownloading {0}".format(url)
                    art, html = source.extract_article_data(url)
                    if art:
                        print "\t\t\t\t got {0} links".format(len(art.links))
                    else:
                        print "\t\t\t\t no article found"

def show_queue_info(json_db):
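    """Print, per source and per day, how many items are waiting in the
    download queue, then summarise each source with its total item count and
    the number of days that have queued batches.
    """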
    sources = [rtlinfo, lesoir, lalibre, dhnet, lavenir, sudinfo, sudpresse, rtbfinfo, levif, septsursept]
    res = dict()
    for source in sources:
        p = Provider(json_db, source.SOURCE_NAME)
        print source.SOURCE_NAME
        batches_by_day = p.get_queued_batches_by_day()
        total_item_count = 0

        for day, batches in batches_by_day:
            print "\tDay:", day
            queued_item_count = 0
            for batch, items in batches:
                queued_item_count += len(items['articles'])

            print "\t\t", queued_item_count, "items"
            total_item_count += queued_item_count
        res[source.SOURCE_NAME] = (total_item_count, len(batches_by_day))

    for name, (item_count, day_count) in res.items():
        print "{0}: {1} items for {2} days".format(name, item_count, day_count)
Example #4
def list_errors(db_root, outfile, source_list):
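    """Collect every logged error (url, title, stacktrace) per batch for the
    requested sources (a comma-separated source_list, or all providers when it
    is empty), print each error and a per-source count, and dump the collected
    errors to `outfile` as JSON.

    Assumes itertools is imported as `it`, plus json, Provider,
    get_all_provider_names and filter_identical_ErrorLogEntries at module
    level.
    """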
    res = dict()
    all_errors = dict()
    if not source_list:
        source_names = get_all_provider_names(db_root)
    else:
        source_names = source_list.split(",")

    for source_name in source_names:
        provider_db = Provider(db_root, source_name)
        error_count = 0
        all_errors[source_name] = list()
        for date_string in provider_db.get_all_days():
            errors_by_batch = provider_db.get_errors2_per_batch(date_string)

            for (batch_time, errors) in errors_by_batch:
                # each batch stores nested lists of errors; flatten them before de-duplicating
                errors = it.chain(*errors)
                #errors = flatten_list(errors)

                errors = filter_identical_ErrorLogEntries(errors)
                error_count += len(errors)

                if errors:
                    #print source_name, date_string, batch_time
                    for e in errors:
                        new_item = ((u"{0}/{1}".format(date_string, batch_time)), (e.url, e.title, e.stacktrace))
                        print u"+++ [{0}] {1}   ({2})".format(new_item[0], new_item[1][1], new_item[1][0])
                        all_errors[source_name].append(new_item)

        res[source_name] = error_count

    print "\n" * 4
    for name, error_count in res.items():
        print "{0}: Had {1} errors".format(name, error_count)
        print "{0}: Had {1} errors".format(name, len(all_errors[name]))

    with open(outfile, 'w') as f:
        json.dump(all_errors, f, indent=2)