def single_evaluation(extractor_cls, results, dataset_type, dataset_name):
    """Evaluate one extractor's stored results against a local dataset.

    NOTE(review): an identical ``single_evaluation`` is defined again
    directly below in this file; at import time that later definition
    shadows this one.
    """
    logger.info('started evaluating extractor %s', extractor_cls.NAME)
    results.set_extractor(extractor_cls.SLUG)
    storage = LocalResultStorage(dataset_name, extractor_cls)

    for document in LocalDatasetLoader(dataset_name):
        logger.debug('doc: %s', document.id)
        # Gold-standard ("clean") text for this document.
        relevant_text = from_document_factory(document, slug=dataset_type)
        try:
            raw_result = storage.fetch_result(document)
        except DataError:
            # Nothing stored for this doc/extractor pair -- skip it instead
            # of aborting the whole evaluation run.
            logger.info('no stored result for %s at %s extractor', document.id,
                        extractor_cls.NAME)
            continue
        else:
            retrieved_text = extractor_cls.formatted_result(raw_result)
            evaluation = TextOnlyEvaluator(retrieved=retrieved_text,
                                           relevant=relevant_text,
                                           id=document.id)
            results.add_result(evaluation.get_eval_results())
def single_evaluation(extractor_cls, results, dataset_type, dataset_name):
    """Run a text-only evaluation of one extractor over a local dataset.

    NOTE(review): this is a functional duplicate of the
    ``single_evaluation`` defined directly above; this later definition is
    the one that wins at import time. One of the two copies should be
    removed in a follow-up (deleting either is behavior-preserving).

    Args:
        extractor_cls: extractor class exposing ``NAME``, ``SLUG`` and
            ``formatted_result``.
        results: result sink with ``set_extractor`` and ``add_result``.
        dataset_type: slug passed to ``from_document_factory`` to build the
            gold-standard ("clean") representation of each document.
        dataset_name: name of the local dataset whose stored results are
            evaluated.
    """
    logger.info('started evaluating extractor %s', extractor_cls.NAME)
    results.set_extractor(extractor_cls.SLUG)
    storage = LocalResultStorage(dataset_name, extractor_cls)

    loader = LocalDatasetLoader(dataset_name)
    for doc in loader:
        logger.debug('doc: %s', doc.id)
        format_clean = from_document_factory(doc, slug=dataset_type)
        try:
            result_string = storage.fetch_result(doc)
        except DataError:
            # No stored result for this doc/extractor pair -- skip it rather
            # than failing the whole evaluation run.
            logger.info('no stored result for %s at %s extractor',
                        doc.id, extractor_cls.NAME)
            continue
        else:
            format_result = extractor_cls.formatted_result(result_string)
            evaluator = TextOnlyEvaluator(retrieved=format_result,
                                          relevant=format_clean,
                                          id=doc.id)
            results.add_result(evaluator.get_eval_results())
def local_extract(dataset_name, extractor_slug, timeout, retry_failed, skip_existing):
    """Run one extractor over a local dataset and store its results.

    ``retry_failed`` / ``skip_existing`` toggle whether the loader is told
    to reload previously failed docs or skip already-extracted ones for
    this extractor. ``timeout`` (if truthy) is slept between documents.
    """
    extractor = get_extractor_cls(extractor_slug)

    # Only forward the slug for the behaviors the caller enabled.
    loader = LocalDatasetLoader(
        dataset_name,
        load_failed=extractor_slug if retry_failed else None,
        skip_existing=extractor_slug if skip_existing else None,
    )
    storage = LocalResultStorage(dataset_name, extractor)

    logger.info('started extracting content from %s dataset using %s',
                dataset_name, extractor.NAME)
    for document in loader:
        storage.push_result(document)
        if timeout:
            time.sleep(timeout)

    storage.dump_summary()
    logger.info('finished with %s dataset', dataset_name)