def single_evaluation(extractor_cls, results, dataset_type, dataset_name):
    logger.info('started evaluating extractor %s', extractor_cls.NAME)
    results.set_extractor(extractor_cls.SLUG)

    storage = LocalResultStorage(dataset_name, extractor_cls)
    loader = LocalDatasetLoader(dataset_name)

    for doc in loader:
        logger.debug('doc: %s', doc.id)
        # gold-standard (cleaned) content for this document
        format_clean = from_document_factory(doc, slug=dataset_type)
        try:
            result_string = storage.fetch_result(doc)
        except DataError:
            logger.info('no stored result for %s at %s extractor',
                        doc.id, extractor_cls.NAME)
            continue
        else:
            # compare the extractor's output against the cleaned content
            format_result = extractor_cls.formatted_result(result_string)
            evaluator = TextOnlyEvaluator(retrieved=format_result,
                                          relevant=format_clean,
                                          id=doc.id)
            results.add_result(evaluator.get_eval_results())
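# Hedged sketch, not part of the original module: single_evaluation() expects a
# results object exposing set_extractor() and add_result(). The class below is a
# hypothetical minimal stand-in for whatever results container the project
# actually uses; its name and internals are assumptions for illustration only.
class EvalResultsCollector(object):

    def __init__(self):
        self._current = None        # slug of the extractor currently being evaluated
        self._per_extractor = {}    # slug -> list of per-document eval results

    def set_extractor(self, slug):
        # remember which extractor the following results belong to
        self._current = slug
        self._per_extractor.setdefault(slug, [])

    def add_result(self, eval_result):
        # store one document's evaluation result under the current extractor
        self._per_extractor[self._current].append(eval_result)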
def local_extract(dataset_name, extractor_slug, timeout, retry_failed, skip_existing):
    # init storage and loader
    ex = get_extractor_cls(extractor_slug)
    failed_slug = extractor_slug if retry_failed else None
    skip_slug = extractor_slug if skip_existing else None
    loader = LocalDatasetLoader(dataset_name,
                                load_failed=failed_slug,
                                skip_existing=skip_slug)
    storage = LocalResultStorage(dataset_name, ex)

    logger.info('started extracting content from %s dataset using %s',
                dataset_name, ex.NAME)
    for doc in loader:
        storage.push_result(doc)
        if timeout:
            # optional per-document pause between extraction requests
            time.sleep(timeout)

    storage.dump_summary()
    logger.info('finished with %s dataset', dataset_name)
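# Hedged usage sketch, not part of the original module: a minimal driver that
# runs an extraction pass and then evaluates the stored results. The dataset
# name 'cleaneval', the slug 'example-extractor' and the keyword values below
# are hypothetical placeholders; the real slugs and command-line wiring live
# elsewhere in the project.
if __name__ == '__main__':
    dataset = 'cleaneval'           # hypothetical dataset name
    slug = 'example-extractor'      # hypothetical extractor slug

    # extraction pass, pausing one second between documents
    local_extract(dataset, slug, timeout=1, retry_failed=False, skip_existing=True)

    # evaluation pass over the results stored above, using the hypothetical
    # collector defined earlier in this sketch
    results = EvalResultsCollector()
    single_evaluation(get_extractor_cls(slug), results, dataset, dataset)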