def test_split_page_artid_unicode_dash():
    """A Unicode minus sign (U+2212) is recognized as a page-range separator."""
    result = utils.split_page_artid(u'45−47')
    assert result == ('45', '47', None)
def test_split_page_artid_long_page_start():
    """An alphanumeric page like ``B1234`` fills both page_start and artid."""
    result = utils.split_page_artid('B1234')
    assert result == ('B1234', None, 'B1234')
def test_split_page_artid_page_start():
    """A plain numeric page fills both page_start and artid."""
    result = utils.split_page_artid('451')
    assert result == ('451', None, '451')
def test_split_page_artid_artid():
    """A non-page-like identifier is classified as an article id only."""
    result = utils.split_page_artid('CONF546')
    assert result == (None, None, 'CONF546')
def test_split_page_artid_page_range():
    """An ASCII-hyphen range is split into page_start and page_end."""
    result = utils.split_page_artid('451-487')
    assert result == ('451', '487', None)
def publication_info(self, key, value):
    """Populate the ``publication_info`` key."""
    def _cnum(val):
        # ``w`` holds the conference number; normalize separators and case.
        raw = force_single_element(val.get('w', ''))
        return raw.replace('/', '-').upper()

    def _material(val):
        # Only values present in the schema enum are accepted.
        allowed = load_schema('elements/material')['enum']
        candidate = force_single_element(val.get('m', '')).lower()
        return candidate if candidate in allowed else None

    def _parent_isbn(val):
        raw = force_single_element(val.get('z', ''))
        return normalize_isbn(raw) if raw else None

    def _freetext(val):
        # A ``#DONE`` prefix marks freetext already processed elsewhere.
        raw = force_single_element(val.get('x', ''))
        return None if raw.startswith('#DONE') else raw

    def _record_ref(subfield, endpoint):
        # Subfields 0/1/2 carry record ids for literature/journals/conferences.
        recid = maybe_int(force_single_element(value.get(subfield)))
        return get_record_ref(recid, endpoint)

    page_start, page_end, artid = split_page_artid(value.get('c'))

    return {
        'artid': artid,
        'cnum': _cnum(value),
        'conf_acronym': force_single_element(value.get('q')),
        'conference_record': _record_ref('2', 'conferences'),
        'hidden': key.startswith('7731') or None,
        'journal_issue': force_single_element(value.get('n')),
        'journal_record': _record_ref('1', 'journals'),
        'journal_title': force_single_element(value.get('p')),
        'journal_volume': force_single_element(value.get('v')),
        'material': _material(value),
        'page_end': page_end,
        'page_start': page_start,
        'parent_isbn': _parent_isbn(value),
        'parent_record': _record_ref('0', 'literature'),
        'pubinfo_freetext': _freetext(value),
        'year': maybe_int(force_single_element(value.get('y'))),
    }
def extract_journal_info(obj, eng):
    """Extract the journal information from ``pubinfo_freetext``.

    Runs ``extract_journal_reference`` on the ``pubinfo_freetext`` key of each
    ``publication_info``, if it exists, and uses the extracted information
    to populate the other keys.

    Args:
        obj: a workflow object.
        eng: a workflow engine.

    Returns:
        None
    """
    if not obj.data.get('publication_info'):
        return

    for publication_info in obj.data['publication_info']:
        # Fix: the original wrapped this whole body in
        # ``try: ... except KeyError: pass``, which silently swallowed
        # KeyErrors raised anywhere inside (not just the expected missing
        # ``pubinfo_freetext`` key), hiding real bugs. Check explicitly.
        pubinfo_freetext = publication_info.get('pubinfo_freetext')
        if pubinfo_freetext is None:
            continue

        # ``kbs_path`` is only needed for the extraction call itself.
        with local_refextract_kbs_path() as kbs_path:
            extracted_publication_info = extract_journal_reference(
                pubinfo_freetext,
                override_kbs_files=kbs_path,
            )
        if not extracted_publication_info:
            continue

        if extracted_publication_info.get('title'):
            publication_info['journal_title'] = extracted_publication_info['title']

        if extracted_publication_info.get('volume'):
            publication_info['journal_volume'] = extracted_publication_info['volume']

        if extracted_publication_info.get('page'):
            page_start, page_end, artid = split_page_artid(
                extracted_publication_info['page'])
            if page_start:
                publication_info['page_start'] = page_start
            if page_end:
                publication_info['page_end'] = page_end
            if artid:
                publication_info['artid'] = artid

        if extracted_publication_info.get('year'):
            year = maybe_int(extracted_publication_info['year'])
            if year:
                publication_info['year'] = year

    obj.data['publication_info'] = convert_old_publication_info_to_new(
        obj.data['publication_info'])
def split_page_range_article_id(obj, formdata):
    """Split ``page_range_article_id`` into start page, end page and artid keys."""
    raw_range = formdata.get('page_range_article_id')
    if not raw_range:
        return formdata

    start, end, artid = split_page_artid(raw_range)
    formdata['start_page'] = start
    formdata['end_page'] = end
    formdata['artid'] = artid
    return formdata
def extract_journal_info(obj, eng):
    """Extract the journal information from ``pubinfo_freetext``.

    Runs ``extract_journal_reference`` on the ``pubinfo_freetext`` key of each
    ``publication_info``, if it exists, and uses the extracted information
    to populate the other keys.

    Args:
        obj: a workflow object.
        eng: a workflow engine.

    Returns:
        None
    """
    if not obj.data.get('publication_info'):
        return

    for publication_info in obj.data['publication_info']:
        # Fix: replaced the broad ``try: ... except KeyError: pass`` (which
        # silently swallowed KeyErrors from *anywhere* in the body, masking
        # real bugs) with an explicit check for the one key that may be
        # legitimately absent.
        pubinfo_freetext = publication_info.get('pubinfo_freetext')
        if pubinfo_freetext is None:
            continue

        with local_refextract_kbs_path() as kbs_path:
            extracted_publication_info = extract_journal_reference(
                pubinfo_freetext,
                override_kbs_files=kbs_path,
            )
        if not extracted_publication_info:
            continue

        if extracted_publication_info.get('title'):
            publication_info['journal_title'] = extracted_publication_info['title']
        if extracted_publication_info.get('volume'):
            publication_info['journal_volume'] = extracted_publication_info['volume']
        if extracted_publication_info.get('page'):
            page_start, page_end, artid = split_page_artid(
                extracted_publication_info['page'])
            if page_start:
                publication_info['page_start'] = page_start
            if page_end:
                publication_info['page_end'] = page_end
            if artid:
                publication_info['artid'] = artid
        if extracted_publication_info.get('year'):
            year = maybe_int(extracted_publication_info['year'])
            if year:
                publication_info['year'] = year

    obj.data['publication_info'] = convert_old_publication_info_to_new(
        obj.data['publication_info'])
def extract_journal_info(obj, eng):
    """Extract journal, volume etc. from any freetext publication info.

    Runs ``extract_journal_reference`` on each pubnote's
    ``pubinfo_freetext`` and copies the extracted fields into the pubnote.
    Pubnotes that end up with no truthy values are dropped.

    Args:
        obj: a workflow object; ``obj.data['publication_info']`` is updated
            in place with the rebuilt list.
        eng: a workflow engine (unused here, kept for the task signature).

    Returns:
        None
    """
    publication_info = get_value(obj.data, "publication_info")
    if not publication_info:
        return

    new_publication_info = []
    for pubnote in publication_info:
        if not pubnote:
            continue

        freetext = pubnote.get("pubinfo_freetext")
        if freetext:
            # Multiple freetext notes are joined into one sentence-like string.
            if isinstance(freetext, (list, tuple)):
                freetext = ". ".join(freetext)

            # Fix: removed the commented-out ``override_kbs_files`` dead code
            # that was left inside this call.
            extracted_publication_info = extract_journal_reference(freetext)

            if extracted_publication_info:
                if "volume" in extracted_publication_info:
                    pubnote["journal_volume"] = extracted_publication_info.get(
                        "volume"
                    )
                if "title" in extracted_publication_info:
                    pubnote["journal_title"] = extracted_publication_info.get(
                        "title"
                    )
                if "year" in extracted_publication_info:
                    year = maybe_int(extracted_publication_info.get('year'))
                    if year is not None:
                        pubnote['year'] = year
                if "page" in extracted_publication_info:
                    page_start, page_end, artid = split_page_artid(
                        extracted_publication_info.get("page"))
                    if page_start:
                        pubnote["page_start"] = page_start
                    if page_end:
                        pubnote["page_end"] = page_end
                    if artid:
                        pubnote["artid"] = artid

        # Fix: ``any(value for value in pubnote.values())`` was a redundant
        # generator wrapper; ``any(pubnote.values())`` is equivalent.
        if any(pubnote.values()):
            new_publication_info.append(pubnote)

    obj.data["publication_info"] = new_publication_info
def get_reference(self, ref_node):
    """Extract one reference.

    Args:
        ref_node(scrapy.selector.Selector): a selector on a single
            reference, i.e. ``<ref>``.

    Yields:
        dict: the parsed reference, as generated by
            :class:`inspire_schemas.api.ReferenceBuilder`, one per
            ``<mixed-citation>`` child.
    """
    for citation_node in ref_node.xpath('./mixed-citation'):
        builder = ReferenceBuilder()

        # Keep the raw JATS snippet alongside the structured fields.
        builder.add_raw_reference(
            ref_node.extract().strip(),
            source=self.builder.source,
            ref_format='JATS'
        )

        # (xpath, handler) pairs: each handler receives the first match, if any.
        fields = [
            (
                (
                    'self::node()[@publication-type="journal" '
                    'or @publication-type="eprint"]/source/text()'
                ),
                builder.set_journal_title,
            ),
            (
                'self::node()[@publication-type="book"]/source/text()',
                builder.add_parent_title,
            ),
            ('./publisher-name/text()', builder.set_publisher),
            ('./volume/text()', builder.set_journal_volume),
            ('./issue/text()', builder.set_journal_issue),
            ('./year/text()', builder.set_year),
            ('./pub-id[@pub-id-type="arxiv"]/text()', builder.add_uid),
            ('./pub-id[@pub-id-type="doi"]/text()', builder.add_uid),
            (
                'pub-id[@pub-id-type="other"]'
                '[contains(preceding-sibling::text(),"Report No")]/text()',
                builder.add_report_number
            ),
            ('./article-title/text()', builder.add_title),
            ('../label/text()', lambda x: builder.set_label(x.strip('[].')))
        ]
        for xpath, field_handler in fields:
            value = citation_node.xpath(xpath).extract_first()
            # Fix: removed a dead ``citation_node.xpath(xpath)`` statement
            # whose result was discarded.
            if value:
                field_handler(value)

        # Everything not captured by a structured field goes to misc.
        remainder = remove_tags(
            citation_node,
            strip='self::person-group'
                  '|self::pub-id'
                  '|self::article-title'
                  '|self::volume'
                  '|self::issue'
                  '|self::year'
                  '|self::label'
                  '|self::publisher-name'
                  '|self::source[../@publication-type!="proc"]'
                  '|self::object-id'
                  '|self::page-range'
                  '|self::issn'
        ).strip('"\';,. \t\n\r').replace('()', '')
        if remainder:
            builder.add_misc(remainder)

        for editor in self.get_reference_authors(citation_node, 'editor'):
            builder.add_author(editor, 'editor')
        for author in self.get_reference_authors(citation_node, 'author'):
            builder.add_author(author, 'author')

        page_range = citation_node.xpath('./page-range/text()').extract_first()
        if page_range:
            page_artid = split_page_artid(page_range)
            builder.set_page_artid(*page_artid)

        yield builder.obj
def build_literature(self, data):
    """Build a literature record from validated submission form ``data``.

    Maps the flat form fields onto a ``LiteratureBuilder`` and returns the
    resulting record dict. ``data`` is presumably the cleaned submission
    payload; only ``document_type`` and ``title`` are required keys here —
    everything else is read with ``.get`` and skipped when absent.

    Args:
        data (dict): submission form data.

    Returns:
        dict: the built record (``literature.record``).
    """
    literature = LiteratureBuilder()

    literature.add_document_type(data["document_type"])
    literature.add_arxiv_eprint(data.get("arxiv_id"), data.get("arxiv_categories"))
    literature.add_preprint_date(data.get("preprint_date"))
    literature.add_doi(data.get("doi"))

    # arXiv links are skipped: the arxiv_id above already covers them.
    pdf_link = data.get("pdf_link")
    additional_link = data.get("additional_link")
    if pdf_link and "arxiv.org" not in pdf_link:
        literature.add_url(data.get("pdf_link"))
    if additional_link and "arxiv.org" not in additional_link:
        literature.add_url(data.get("additional_link"))

    literature.add_title(data["title"], source="submitter")

    # English is the implicit default, so it is not recorded explicitly.
    language = data.get("language")
    literature.add_language(language if language != "en" else None)

    literature.add_inspire_categories(data.get("subjects"))

    for author in data.get("authors", []):
        record_author = literature.make_author(
            author.get("full_name"),
            record=author.get("record"),
            affiliations=[author.get("affiliation")],  # TODO: use `affiliation_record`
        )
        literature.add_author(record_author)

    literature.add_collaboration(data.get("collaboration"))
    literature.add_accelerator_experiment(
        data.get("experiment"),
        record=data.get("experiment_record"))  # TODO: source=submitter?
    literature.add_abstract(data.get("abstract"), source="submitter")

    for report_number in data.get("report_numbers", []):
        literature.add_report_number(report_number, source="submitter")

    # Explicit start/end page fields take precedence over the parsed range.
    page_start, page_end, artid = split_page_artid(data.get("page_range"))
    literature.add_publication_info(
        journal_title=data.get("journal_title"),
        journal_volume=data.get("volume"),
        journal_issue=data.get("issue"),
        journal_record=data.get("journal_record"),
        conference_record=data.get("conference_record"),
        artid=artid,
        page_start=data.get("start_page") or page_start,
        page_end=data.get("end_page") or page_end,
        year=data.get("year"),
        parent_record=data.get("parent_book_record"),
    )

    literature.add_book_series(data.get("series_title"))
    literature.add_book(
        date=data.get("publication_date"),
        publisher=data.get("publisher"),
        place=data.get("publication_place"),
    )

    literature.add_thesis(
        defense_date=data.get("defense_date"),
        degree_type=data.get("degree_type"),
        institution=data.get("institution"),
        date=data.get("submission_date"),
    )
    for supervisor in data.get("supervisors", []):
        record_supervisor = literature.make_author(
            supervisor.get("full_name"),
            affiliations=[supervisor.get("affiliation")],
            roles=["supervisor"],
        )
        literature.add_author(record_supervisor)

    literature.add_private_note(data.get("comments"), source="submitter")
    literature.add_private_note(data.get("proceedings_info"), source="submitter")
    # Free-form conference info is only kept when no conference record is linked.
    if data.get("conference_record") is None:
        literature.add_private_note(data.get("conference_info"), source="submitter")

    return literature.record