def _classify_paper(obj, eng):
    """Run the keyword classifier on the workflow object and store the result.

    Prefers a locally available PDF (``obj.extra_data["pdf"]``); otherwise
    falls back to classifying the record's titles and abstracts, which is
    flagged as "fast mode" in the stored output.
    """
    from invenio_classifier.errors import ClassifierException
    from invenio_classifier import (
        get_keywords_from_text,
        get_keywords_from_local_file,
    )

    # Classifier options come from the enclosing scope's configuration.
    classifier_kwargs = {
        'taxonomy_name': taxonomy,
        'output_mode': 'dict',
        'output_limit': output_limit,
        'spires': spires,
        'match_mode': match_mode,
        'no_cache': no_cache,
        'with_author_keywords': with_author_keywords,
        'rebuild_cache': rebuild_cache,
        'only_core_tags': only_core_tags,
        'extract_acronyms': extract_acronyms,
    }

    fast_mode = False
    try:
        # FIXME: May need to find another canonical way of getting PDF
        if "pdf" in obj.extra_data:
            result = get_keywords_from_local_file(
                obj.extra_data["pdf"], **classifier_kwargs
            )
        else:
            text_parts = [
                t.get('title', '') for t in obj.data.get('titles') or []
            ]
            text_parts += [
                a.get('value', '') for a in obj.data.get('abstracts') or []
            ]
            if not text_parts:
                obj.log.error("No classification done due to missing data.")
                return
            result = get_keywords_from_text(text_parts, **classifier_kwargs)
            fast_mode = True
    except ClassifierException as e:
        obj.log.exception(e)
        return

    result['complete_output'] = clean_instances_from_data(
        result.get("complete_output", {})
    )
    result["fast_mode"] = fast_mode

    # Only store non-empty classifier output.
    if any(result.get("complete_output", {}).values()):
        obj.extra_data['classifier_results'] = result
def test_composite_keywords(app, hep_taxonomy, pdf_with_composite_keywords):
    """Composite keywords are extracted from the fixture PDF."""
    with app.app_context():
        result = get_keywords_from_local_file(
            pdf_with_composite_keywords,
            taxonomy_name=hep_taxonomy,
            output_mode='dict',
        )
        complete = result.get('complete_output')
        composites = complete.get('composite_keywords', [])

        assert len(composites) == 20, complete
        expected = {
            'details': [64, 132],
            'keyword': 'electronics: noise',
            'number': 23,
        }
        assert expected in composites
def test_funny_author_keywords(app, demo_pdf_file_with_funny_author_kw_sep,
                               demo_taxonomy):
    """Author keywords joined with the '·' separator are still extracted."""
    with app.app_context():
        result = get_keywords_from_local_file(
            demo_pdf_file_with_funny_author_kw_sep,
            taxonomy_name=demo_taxonomy,
            output_mode="dict",
            with_author_keywords=True,
        )
        complete = result.get("complete_output")
        found = complete.get("author_keywords", [])

        assert len(found) == 4, complete
        assert {'author_keyword': 'Depth cameras'} in found
def _classify_paper(obj, eng):
    """Classify a paper using its PDF full text or, failing that, its
    titles and abstracts, and attach the results to ``obj.extra_data``.
    """
    from invenio_classifier import (
        get_keywords_from_local_file,
        get_keywords_from_text,
    )
    from invenio_classifier.errors import ClassifierException

    # Settings are taken from the surrounding scope's configuration.
    common = dict(
        taxonomy_name=taxonomy,
        output_mode='dict',
        output_limit=output_limit,
        spires=spires,
        match_mode=match_mode,
        no_cache=no_cache,
        with_author_keywords=with_author_keywords,
        rebuild_cache=rebuild_cache,
        only_core_tags=only_core_tags,
        extract_acronyms=extract_acronyms,
    )

    fast_mode = False
    try:
        # FIXME: May need to find another canonical way of getting PDF
        if "pdf" in obj.extra_data:
            result = get_keywords_from_local_file(
                obj.extra_data["pdf"], **common)
        else:
            pieces = []
            for title in obj.data.get('titles') or []:
                pieces.append(title.get('title', ''))
            for abstract in obj.data.get('abstracts') or []:
                pieces.append(abstract.get('value', ''))
            if not pieces:
                obj.log.error("No classification done due to missing data.")
                return
            result = get_keywords_from_text(pieces, **common)
            fast_mode = True
    except ClassifierException as e:
        obj.log.exception(e)
        return

    result['complete_output'] = clean_instances_from_data(
        result.get("complete_output", {}))
    result["fast_mode"] = fast_mode

    # Skip storing when the classifier produced no output at all.
    if any(result.get("complete_output", {}).values()):
        obj.extra_data['classifier_results'] = result
def test_author_keywords(app, demo_pdf_file_with_author_keywords,
                         demo_taxonomy):
    """Author keywords are extracted and matched against the taxonomy."""
    with app.app_context():
        result = get_keywords_from_local_file(
            demo_pdf_file_with_author_keywords,
            taxonomy_name=demo_taxonomy,
            output_mode="dict",
            with_author_keywords=True,
        )
        complete = result.get("complete_output")
        extracted = complete.get("author_keywords", [])

        assert len(extracted) == 4, complete
        expected = {
            'author_keyword': 'Dyson model',
            'matched_keywords': ['model'],
        }
        assert expected in extracted
# NOTE(review): the function name has a typo ("extration") — kept as-is so
# any external references to the test keep working.
def test_file_extration(app, demo_pdf_file, demo_taxonomy):
    """Single and core keywords are extracted from a PDF file."""
    with app.app_context():
        result = get_keywords_from_local_file(
            demo_pdf_file,
            taxonomy_name=demo_taxonomy,
            output_mode="dict",
        )
        complete = result.get("complete_output")

        singles = complete.get("single_keywords", [])
        assert len(singles) == 4
        assert {'keyword': "gauge field theory Yang-Mills",
                'number': 9} in singles

        cores = complete.get("core_keywords", [])
        assert len(cores) == 3
        assert {'keyword': "Yang-Mills", 'number': 12} in cores
def _classify_paper(obj, eng):
    """Classify via a temporary workflow PDF, falling back to record text.

    The temporary PDF produced by ``get_pdf_in_workflow`` is always removed,
    whether or not classification succeeds.
    """
    # Options come from the enclosing scope's configuration.
    settings = dict(
        taxonomy_name=taxonomy,
        output_mode='dict',
        output_limit=output_limit,
        spires=spires,
        match_mode=match_mode,
        no_cache=no_cache,
        with_author_keywords=with_author_keywords,
        rebuild_cache=rebuild_cache,
        only_core_tags=only_core_tags,
        extract_acronyms=extract_acronyms,
    )

    fast_mode = False
    tmp_pdf = get_pdf_in_workflow(obj)
    try:
        if tmp_pdf:
            result = get_keywords_from_local_file(tmp_pdf, **settings)
        else:
            text_chunks = [
                t.get('title', '') for t in obj.data.get('titles') or []
            ]
            text_chunks.extend(
                a.get('value', '') for a in obj.data.get('abstracts') or []
            )
            if not text_chunks:
                obj.log.error("No classification done due to missing data.")
                return
            result = get_keywords_from_text(text_chunks, **settings)
            fast_mode = True
    except ClassifierException as e:
        obj.log.exception(e)
        return
    finally:
        # Always clean up the temporary PDF copy.
        if tmp_pdf and os.path.exists(tmp_pdf):
            os.unlink(tmp_pdf)

    result['complete_output'] = clean_instances_from_data(
        result.get("complete_output", {}))
    result["fast_mode"] = fast_mode

    # Only persist non-empty classifier output.
    if any(result.get("complete_output", {}).values()):
        obj.extra_data['classifier_results'] = result
def test_file_extration(app, demo_pdf_file, demo_taxonomy):
    """Test extracting keywords from PDF.

    Checks the "Single keywords" and "Core keywords" sections of the
    classifier's complete output by key.
    """
    with app.app_context():
        out = get_keywords_from_local_file(
            demo_pdf_file,
            taxonomy_name=demo_taxonomy,
            output_mode="dict"
        )
        output = out.get("complete_output")

        # Default must be {} (not []): a list has no .keys(), so a missing
        # section would raise AttributeError instead of a clean assert failure.
        single_keywords = output.get("Single keywords", {}).keys()
        assert len(single_keywords) == 4
        assert "gauge field theory Yang-Mills" in single_keywords

        core_keywords = output.get("Core keywords", {}).keys()
        assert len(core_keywords) == 3
        assert "Yang-Mills" in core_keywords
def _classify_paper(obj, eng):
    """Store classifier results, preferring the record's full-text document.

    When no document is available, titles, subtitles, abstracts, and
    existing keywords are classified instead, and ``fulltext_used`` is set
    to ``False`` in the stored result.
    """
    from flask import current_app

    classifier_args = dict(
        taxonomy_name=taxonomy or current_app.config['HEP_ONTOLOGY_FILE'],
        output_mode='dict',
        output_limit=output_limit,
        spires=spires,
        match_mode=match_mode,
        no_cache=no_cache,
        with_author_keywords=with_author_keywords,
        rebuild_cache=rebuild_cache,
        only_core_tags=only_core_tags,
        extract_acronyms=extract_acronyms,
    )

    fulltext_used = True
    with get_document_in_workflow(obj) as tmp_document:
        try:
            if tmp_document:
                result = get_keywords_from_local_file(
                    tmp_document, **classifier_args)
            else:
                text = []
                for path in ('titles.title', 'titles.subtitle',
                             'abstracts.value', 'keywords.value'):
                    text.extend(get_value(obj.data, path, []))
                if not text:
                    obj.log.error(
                        "No classification done due to missing data.")
                    return
                result = get_keywords_from_text(text, **classifier_args)
                fulltext_used = False
        except ClassifierException as e:
            obj.log.exception(e)
            return

    result['complete_output'] = clean_instances_from_data(
        result.get("complete_output", {})
    )
    result["fulltext_used"] = fulltext_used

    # Keep only non-empty classifier output.
    if any(result.get("complete_output", {}).values()):
        obj.extra_data['classifier_results'] = result
def _classify_paper(obj, eng):
    """Attach classifier keyword results to the workflow object.

    A full-text document is preferred; otherwise title/subtitle/abstract/
    keyword text is used and ``fulltext_used`` is recorded as ``False``.
    """
    from flask import current_app

    opts = {
        'taxonomy_name': taxonomy or current_app.config['HEP_ONTOLOGY_FILE'],
        'output_mode': 'dict',
        'output_limit': output_limit,
        'spires': spires,
        'match_mode': match_mode,
        'no_cache': no_cache,
        'with_author_keywords': with_author_keywords,
        'rebuild_cache': rebuild_cache,
        'only_core_tags': only_core_tags,
        'extract_acronyms': extract_acronyms,
    }

    fulltext_used = True
    with get_document_in_workflow(obj) as tmp_document:
        try:
            if tmp_document:
                result = get_keywords_from_local_file(tmp_document, **opts)
            else:
                corpus = (
                    get_value(obj.data, 'titles.title', []) +
                    get_value(obj.data, 'titles.subtitle', []) +
                    get_value(obj.data, 'abstracts.value', []) +
                    get_value(obj.data, 'keywords.value', [])
                )
                if not corpus:
                    obj.log.error(
                        "No classification done due to missing data.")
                    return
                result = get_keywords_from_text(corpus, **opts)
                fulltext_used = False
        except ClassifierException as e:
            obj.log.exception(e)
            return

    result['complete_output'] = clean_instances_from_data(
        result.get("complete_output", {}))
    result["fulltext_used"] = fulltext_used

    # Don't store empty classifier output.
    if any(result.get("complete_output", {}).values()):
        obj.extra_data['classifier_results'] = result