def formdata_to_model(obj, formdata):
    """Manipulate form data to match literature data model."""
    def _is_arxiv_url(url):
        return 'arxiv.org' in url

    form_fields = copy.deepcopy(formdata)
    filter_empty_elements(
        form_fields, ['authors', 'supervisors', 'report_numbers']
    )

    builder = LiteratureBuilder(source='submitter')

    for author in form_fields.get('authors', []):
        builder.add_author(builder.make_author(
            author['full_name'],
            affiliations=force_list(author['affiliation'])
            if author['affiliation'] else None,
            roles=['author'],
        ))

    for supervisor in form_fields.get('supervisors', []):
        builder.add_author(builder.make_author(
            supervisor['full_name'],
            affiliations=force_list(supervisor['affiliation'])
            if supervisor['affiliation'] else None,
            roles=['supervisor'],
        ))

    builder.add_title(title=form_fields.get('title'))

    document_type = 'conference paper' if form_fields.get('conf_name') \
        else form_fields.get('type_of_doc', [])
    if document_type == 'chapter':
        document_type = 'book chapter'
    builder.add_document_type(document_type=document_type)

    builder.add_abstract(
        abstract=form_fields.get('abstract'),
        source='arXiv' if form_fields.get('categories') else None,
    )

    if form_fields.get('arxiv_id') and form_fields.get('categories'):
        builder.add_arxiv_eprint(
            arxiv_id=form_fields.get('arxiv_id'),
            arxiv_categories=form_fields.get('categories').split(),
        )

    builder.add_doi(doi=form_fields.get('doi'))

    builder.add_inspire_categories(
        subject_terms=form_fields.get('subject_term'),
        source='user',
    )

    for key in ('extra_comments', 'nonpublic_note',
                'hidden_notes', 'conf_name'):
        builder.add_private_note(private_notes=form_fields.get(key))

    year = form_fields.get('year')
    try:
        year = int(year)
    except (TypeError, ValueError):
        year = None

    builder.add_preprint_date(
        preprint_date=form_fields.get('preprint_created'))

    if form_fields.get('type_of_doc') == 'thesis':
        builder.add_thesis(
            defense_date=form_fields.get('defense_date'),
            degree_type=form_fields.get('degree_type'),
            institution=form_fields.get('institution'),
            date=form_fields.get('thesis_date'),
        )

    if form_fields.get('type_of_doc') == 'chapter':
        if not form_fields.get('journal_title'):
            builder.add_book_series(title=form_fields.get('series_title'))

    if form_fields.get('type_of_doc') == 'book':
        if form_fields.get('journal_title'):
            form_fields['volume'] = form_fields.get('series_volume')
        else:
            builder.add_book_series(
                title=form_fields.get('series_title'),
                volume=form_fields.get('series_volume'),
            )
        builder.add_book(
            publisher=form_fields.get('publisher_name'),
            place=form_fields.get('publication_place'),
            date=form_fields.get('publication_date'),
        )

    builder.add_publication_info(
        year=year,
        cnum=form_fields.get('conference_id'),
        journal_issue=form_fields.get('issue'),
        journal_title=form_fields.get('journal_title'),
        journal_volume=form_fields.get('volume'),
        page_start=form_fields.get('start_page'),
        page_end=form_fields.get('end_page'),
        artid=form_fields.get('artid'),
        parent_record=form_fields.get('parent_book'),
    )

    builder.add_accelerator_experiments_legacy_name(
        legacy_name=form_fields.get('experiment'))

    language = form_fields.get('other_language') \
        if form_fields.get('language') == 'oth' \
        else form_fields.get('language')
    builder.add_language(language=language)

    if form_fields.get('title_translation'):
        builder.add_title_translation(
            title=form_fields['title_translation'],
            language='en',
        )

    builder.add_title(title=form_fields.get('title_arXiv'), source='arXiv')
    builder.add_title(
        title=form_fields.get('title_crossref'), source='crossref')

    builder.add_license(url=form_fields.get('license_url'))

    builder.add_public_note(public_note=form_fields.get('public_notes'))
    builder.add_public_note(
        public_note=form_fields.get('note'),
        source='arXiv' if form_fields.get('categories') else 'CrossRef',
    )

    form_url = form_fields.get('url')
    form_additional_url = form_fields.get('additional_url')
    if form_url and not _is_arxiv_url(form_url):
        obj.extra_data['submission_pdf'] = form_url
        if not form_additional_url:
            builder.add_url(url=form_url)

    if form_additional_url and not _is_arxiv_url(form_additional_url):
        builder.add_url(url=form_additional_url)

    for report_number in form_fields.get('report_numbers', []):
        builder.add_report_number(
            report_number=report_number.get('report_number'))

    builder.add_collaboration(collaboration=form_fields.get('collaboration'))

    builder.add_acquisition_source(
        datetime=datetime.datetime.utcnow().isoformat(),
        submission_number=obj.id,
        internal_uid=int(obj.id_user),
        email=form_fields.get('email'),
        orcid=form_fields.get('orcid'),
        method='submitter',
    )

    return builder.record
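# A minimal sketch of calling the function above, under stated assumptions:
# ``FakeWorkflowObject`` is a hypothetical stand-in for the workflow object
# (the function only relies on ``obj.id``, ``obj.id_user`` and
# ``obj.extra_data``), and every form value below is illustrative.
import collections

FakeWorkflowObject = collections.namedtuple(
    'FakeWorkflowObject', ['id', 'id_user', 'extra_data'])

thesis_formdata = {
    'type_of_doc': 'thesis',
    'title': 'Searches for new physics in dijet events',
    'authors': [{'full_name': 'Doe, Jane', 'affiliation': 'CERN'}],
    'supervisors': [{'full_name': 'Roe, Richard', 'affiliation': 'CERN'}],
    'degree_type': 'phd',
    'institution': 'CERN',
    'defense_date': '2017-01-01',
    'thesis_date': '2017-02-01',
    'subject_term': ['Experiment-HEP'],
    'email': 'jane.doe@example.org',
}

record = formdata_to_model(
    FakeWorkflowObject(id=1, id_user=42, extra_data={}), thesis_formdata)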
def formdata_to_model(obj, formdata):
    """Manipulate form data to match literature data model."""
    def _is_arxiv_url(url):
        return 'arxiv.org' in url

    form_fields = copy.deepcopy(formdata)
    filter_empty_elements(
        form_fields, ['authors', 'supervisors', 'report_numbers']
    )

    builder = LiteratureBuilder(source='submitter')

    for author in form_fields.get('authors', []):
        builder.add_author(builder.make_author(
            author['full_name'],
            affiliations=force_list(author['affiliation'])
            if author['affiliation'] else None,
            roles=['author'],
        ))

    for supervisor in form_fields.get('supervisors', []):
        builder.add_author(builder.make_author(
            supervisor['full_name'],
            affiliations=force_list(supervisor['affiliation'])
            if supervisor['affiliation'] else None,
            roles=['supervisor'],
        ))

    builder.add_title(title=form_fields.get('title'))

    document_type = 'conference paper' if form_fields.get('conf_name') \
        else form_fields.get('type_of_doc', [])
    builder.add_document_type(document_type=document_type)

    builder.add_abstract(
        abstract=form_fields.get('abstract'),
        source='arXiv' if form_fields.get('categories') else None,
    )

    if form_fields.get('arxiv_id') and form_fields.get('categories'):
        builder.add_arxiv_eprint(
            arxiv_id=form_fields.get('arxiv_id'),
            arxiv_categories=form_fields.get('categories').split(),
        )

    builder.add_doi(doi=form_fields.get('doi'))

    builder.add_inspire_categories(
        subject_terms=form_fields.get('subject_term'),
        source='user',
    )

    for key in ('extra_comments', 'nonpublic_note', 'hidden_notes',
                'conf_name', 'references'):
        builder.add_private_note(private_notes=form_fields.get(key))

    year = form_fields.get('year')
    try:
        year = int(year)
    except (TypeError, ValueError):
        year = None

    builder.add_publication_info(
        year=year,
        cnum=form_fields.get('conference_id'),
        journal_issue=form_fields.get('issue'),
        journal_title=form_fields.get('journal_title'),
        journal_volume=form_fields.get('volume'),
        page_start=form_fields.get('page_start'),
        page_end=form_fields.get('page_end'),
        artid=form_fields.get('artid'),
    )

    builder.add_preprint_date(
        preprint_date=form_fields.get('preprint_created')
    )

    if form_fields.get('type_of_doc') == 'thesis':
        builder.add_thesis(
            defense_date=form_fields.get('defense_date'),
            degree_type=form_fields.get('degree_type'),
            institution=form_fields.get('institution'),
            date=form_fields.get('thesis_date'),
        )

    builder.add_accelerator_experiments_legacy_name(
        legacy_name=form_fields.get('experiment')
    )

    language = form_fields.get('other_language') \
        if form_fields.get('language') == 'oth' \
        else form_fields.get('language')
    builder.add_language(language=language)

    builder.add_title_translation(title=form_fields.get('title_translation'))

    builder.add_title(title=form_fields.get('title_arXiv'), source='arXiv')
    builder.add_title(
        title=form_fields.get('title_crossref'), source='crossref')

    builder.add_license(url=form_fields.get('license_url'))

    builder.add_public_note(public_note=form_fields.get('public_notes'))
    builder.add_public_note(
        public_note=form_fields.get('note'),
        source='arXiv' if form_fields.get('categories') else 'CrossRef',
    )

    form_url = form_fields.get('url')
    form_additional_url = form_fields.get('additional_url')
    if form_url and not _is_arxiv_url(form_url):
        obj.extra_data['submission_pdf'] = form_url
        if not form_additional_url:
            builder.add_url(url=form_url)

    if form_additional_url and not _is_arxiv_url(form_additional_url):
        builder.add_url(url=form_additional_url)

    for report_number in form_fields.get('report_numbers', []):
        builder.add_report_number(
            report_number=report_number.get('report_number'))

    builder.add_collaboration(collaboration=form_fields.get('collaboration'))

    builder.add_acquisition_source(
        datetime=datetime.datetime.utcnow().isoformat(),
        submission_number=obj.id,
        internal_uid=int(obj.id_user),
        email=form_fields.get('email'),
        orcid=form_fields.get('orcid'),
        method='submitter',
    )

    builder.validate_record()

    return builder.record
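# This second variant of ``formdata_to_model`` differs from the first mainly
# in the page-number field names (``page_start``/``page_end`` instead of
# ``start_page``/``end_page``), the extra ``references`` private note, the
# dropped book/chapter handling, and the final ``validate_record()`` call.
# A hypothetical journal-article submission, reusing the
# ``FakeWorkflowObject`` stand-in from the previous sketch; the values are
# illustrative and may need tweaking to pass schema validation:
article_formdata = {
    'type_of_doc': 'article',
    'title': 'A toy title',
    'authors': [{'full_name': 'Doe, Jane', 'affiliation': 'CERN'}],
    'journal_title': 'Phys. Rev. D',
    'volume': '96',
    'page_start': '1',
    'page_end': '10',
    'year': '2017',
    'email': 'jane.doe@example.org',
}

record = formdata_to_model(
    FakeWorkflowObject(id=2, id_user=42, extra_data={}), article_formdata)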
class ArxivParser(object):
    """Parser for the arXiv format.

    It can be used directly by invoking the :func:`ArxivParser.parse` method,
    or be subclassed to customize its behavior.

    Args:
        arxiv_record (Union[str, scrapy.selector.Selector]): the record in
            arXiv format to parse.
        source (Optional[str]): if provided, sets the ``source`` everywhere in
            the record. Otherwise, the source is extracted from the arXiv
            metadata.
    """

    _l2t = LatexNodes2Text(
        latex_context=get_arxiv_latex_context_db(),
        math_mode="verbatim",
        strict_latex_spaces="based-on-source",
        keep_comments=True,
        keep_braced_groups=True,
        keep_braced_groups_minlen=2,
    )

    def __init__(self, arxiv_record, source=None):
        self.root = self.get_root_node(arxiv_record)
        if not source:
            source = 'arXiv'
        self.builder = LiteratureBuilder(source)

    def parse(self):
        """Extract an arXiv record into an Inspire HEP record.

        Returns:
            dict: the same record in the Inspire Literature schema.
        """
        self.builder.add_abstract(abstract=self.abstract, source=self.source)
        self.builder.add_title(title=self.title, source=self.source)
        for license in self.licenses:
            self.builder.add_license(**license)
        for author in self.authors:
            self.builder.add_author(author)
        self.builder.add_number_of_pages(self.number_of_pages)
        self.builder.add_publication_info(**self.publication_info)
        for collab in self.collaborations:
            self.builder.add_collaboration(collab)
        for doi in self.dois:
            self.builder.add_doi(**doi)
        self.builder.add_preprint_date(self.preprint_date)
        if self.public_note:
            self.builder.add_public_note(self.public_note, self.source)
        for rep_number in self.report_numbers:
            self.builder.add_report_number(rep_number, self.source)
        self.builder.add_arxiv_eprint(self.arxiv_eprint, self.arxiv_categories)
        self.builder.add_private_note(self.private_note)
        self.builder.add_document_type(self.document_type)
        normalized_categories = [
            classify_field(arxiv_cat) for arxiv_cat in self.arxiv_categories
        ]
        self.builder.add_inspire_categories(
            dedupe_list(normalized_categories), 'arxiv')

        return self.builder.record

    def _get_authors_and_collaborations(self, node):
        """Parse authors, affiliations and collaborations from the record node.

        Heuristics are used to detect collaborations. In case those are not
        reliable, a warning is returned for manual checking.

        Args:
            node (Selector): a selector on a record.

        Returns:
            tuple: a tuple of (authors, collaborations, warning)
        """
        author_selectors = node.xpath('.//authors//author')

        # take 'for the' out of the general phrases and don't use it in
        # affiliations
        collab_phrases = [
            'consortium',
            ' collab ',
            'collaboration',
            ' team',
            'group',
            ' on behalf of ',
            ' representing ',
        ]
        inst_phrases = ['institute', 'university', 'department', 'center']

        authors = []
        collaborations = []
        warning_tags = []
        some_affiliation_contains_collaboration = False

        authors_and_affiliations = (
            self._get_author_names_and_affiliations(author)
            for author in author_selectors
        )
        next_author_and_affiliations = (
            self._get_author_names_and_affiliations(author)
            for author in author_selectors
        )
        # advance the second generator by one element, so that zipping the
        # two pairs every author with the one that follows it
        next(next_author_and_affiliations)

        for (forenames, keyname, affiliations), \
                (next_forenames, next_keyname, _) in six.moves.zip_longest(
                    authors_and_affiliations, next_author_and_affiliations,
                    fillvalue=('end of author-list', '', None)):

            name_string = " %s %s " % (forenames, keyname)

            # collaborations in affiliation field? Cautious with 'for the' in
            # Inst names
            affiliations_with_collaborations = []
            affiliations_without_collaborations = []
            for aff in affiliations:
                affiliation_contains_collaboration = any(
                    phrase in aff.lower() for phrase in collab_phrases
                ) and not any(
                    phrase in aff.lower() for phrase in inst_phrases
                )
                if affiliation_contains_collaboration:
                    affiliations_with_collaborations.append(aff)
                    some_affiliation_contains_collaboration = True
                else:
                    affiliations_without_collaborations.append(aff)
            for aff in affiliations_with_collaborations:
                coll, author_name = coll_cleanforthe(aff)
                if coll and coll not in collaborations:
                    collaborations.append(coll)

            # Check if name is a collaboration, else append to authors
            collaboration_in_name = ' for the ' in name_string.lower() or any(
                phrase in name_string.lower() for phrase in collab_phrases
            )
            if collaboration_in_name:
                coll, author_name = coll_cleanforthe(name_string)
                if author_name:
                    surname, given_names = split_fullname(author_name)
                    authors.append({
                        'full_name': surname + ', ' + given_names,
                        'surname': surname,
                        'given_names': given_names,
                        'affiliations': [],
                    })
                if coll and coll not in collaborations:
                    collaborations.append(coll)
            elif name_string.strip() == ':':
                # DANGERZONE: this might not be correct - add a warning for
                # the cataloger
                warning_tags.append(' %s %s ' % (next_forenames, next_keyname))
                if not some_affiliation_contains_collaboration:
                    # everything up to now seems to be collaboration info
                    for author_info in authors:
                        name_string = " %s %s " % \
                            (author_info['given_names'],
                             author_info['surname'])
                        coll, author_name = coll_cleanforthe(name_string)
                        if coll and coll not in collaborations:
                            collaborations.append(coll)
                    authors = []
            else:
                authors.append({
                    'full_name': keyname + ', ' + forenames,
                    'surname': keyname,
                    'given_names': forenames,
                    'affiliations': affiliations_without_collaborations,
                })

        if warning_tags:
            warning = ('WARNING: Colon in authors before %s: '
                       'Check author list for collaboration names!'
                       % ', '.join(warning_tags))
        else:
            warning = ''

        return authors, collaborations, warning

    @staticmethod
    def _get_author_names_and_affiliations(author_node):
        forenames = u' '.join(
            author_node.xpath('.//forenames//text()').extract())
        keyname = u' '.join(author_node.xpath('.//keyname//text()').extract())
        affiliations = author_node.xpath('.//affiliation//text()').extract()

        return forenames, keyname, affiliations

    @property
    def preprint_date(self):
        preprint_date = self.root.xpath('.//created/text()').extract_first()

        return preprint_date

    @property
    def abstract(self):
        abstract = self.root.xpath('.//abstract/text()').extract_first()
        long_text_fixed = self.fix_long_text(abstract)
        return self.latex_to_unicode(long_text_fixed)

    @property
    def authors(self):
        authors, _, _ = self.authors_and_collaborations
        parsed_authors = [
            self.builder.make_author(
                full_name=auth["full_name"],
                raw_affiliations=auth["affiliations"],
            )
            for auth in authors
        ]

        return parsed_authors

    @property
    def collaborations(self):
        _, collaborations, _ = self.authors_and_collaborations

        return collaborations

    @property
    def dois(self):
        doi_values = self.root.xpath('.//doi/text()').extract()
        doi_values_splitted = chain.from_iterable(
            [re.split(RE_DOIS, doi) for doi in doi_values])
        dois = [
            {'doi': value, 'material': 'publication'}
            for value in doi_values_splitted
        ]

        return dois

    @property
    def licenses(self):
        licenses = self.root.xpath('.//license/text()').extract()
        return [
            {'url': license, 'material': self.material}
            for license in licenses
        ]

    @property
    def material(self):
        return 'preprint'

    @property
    def number_of_pages(self):
        comments = '; '.join(self.root.xpath('.//comments/text()').extract())

        found_pages = RE_PAGES.search(comments)
        if found_pages:
            pages = found_pages.group(1)
            return maybe_int(pages)

        return None

    @property
    def publication_info(self):
        publication_info = {
            'material': 'publication',
            'pubinfo_freetext': self.pubinfo_freetext,
        }

        return publication_info

    @property
    def pubinfo_freetext(self):
        return self.root.xpath('.//journal-ref/text()').extract_first()

    @property
    def title(self):
        long_text_fixed = self.fix_long_text(
            self.root.xpath('.//title/text()').extract_first())
        return self.latex_to_unicode(long_text_fixed)

    @staticmethod
    def fix_long_text(text):
        return re.sub(r'\s+', ' ', text).strip()

    @staticmethod
    def get_root_node(arxiv_record):
        """Get a selector on the root ``article`` node of the record.

        This can be overridden in case some preprocessing needs to be done on
        the XML.

        Args:
            arxiv_record (Union[str, scrapy.selector.Selector]): the record
                in arXiv format.

        Returns:
            scrapy.selector.Selector: a selector on the root ``<article>``
                node.
        """
        if isinstance(arxiv_record, six.string_types):
            root = get_node(arxiv_record)
        else:
            root = arxiv_record
        root.remove_namespaces()

        return root

    @property
    def public_note(self):
        comments = '; '.join(self.root.xpath('.//comments/text()').extract())

        return self.latex_to_unicode(comments)

    @property
    def private_note(self):
        _, _, warning = self.authors_and_collaborations

        return warning

    @property
    def report_numbers(self):
        report_numbers = self.root.xpath('.//report-no/text()').extract()
        rns = []
        for rn in report_numbers:
            rns.extend(rn.split(', '))

        return rns

    @property
    def arxiv_eprint(self):
        return self.root.xpath('.//id/text()').extract_first()

    @property
    def arxiv_categories(self):
        # default to an empty string so that a record without categories
        # yields an empty list after splitting
        categories = self.root.xpath(
            './/categories/text()').extract_first(default='')
        categories = categories.split()
        categories_without_old = [
            normalize_arxiv_category(arxiv_cat) for arxiv_cat in categories
        ]

        return dedupe_list(categories_without_old)

    @property
    def document_type(self):
        comments = '; '.join(self.root.xpath('.//comments/text()').extract())

        doctype = 'article'
        if RE_THESIS.search(comments):
            doctype = 'thesis'
        elif RE_CONFERENCE.search(comments):
            doctype = 'conference paper'

        return doctype

    @property
    def source(self):
        return 'arXiv'

    @property
    def authors_and_collaborations(self):
        if not hasattr(self, '_authors_and_collaborations'):
            self._authors_and_collaborations = \
                self._get_authors_and_collaborations(self.root)
        return self._authors_and_collaborations

    @classmethod
    def latex_to_unicode(cls, latex_string):
        try:
            # collapse double spaces occasionally left over by the conversion
            return cls._l2t.latex_to_text(latex_string).replace("  ", " ")
        except Exception:
            return latex_string
def hepcrawl_to_hep(crawler_record):
    """Convert a hepcrawl record to the HEP format.

    Args:
        crawler_record (dict): dictionary representing the hepcrawl formatted
            record.

    Returns:
        dict: the hep formatted record.
    """

    def _filter_affiliation(affiliations):
        return [
            affiliation.get('value')
            for affiliation in affiliations
            if affiliation.get('value')
        ]

    builder = LiteratureBuilder(
        source=crawler_record['acquisition_source']['source']
    )

    for author in crawler_record.get('authors', []):
        builder.add_author(builder.make_author(
            full_name=author['full_name'],
            raw_affiliations=_filter_affiliation(author['affiliations']),
        ))

    for title in crawler_record.get('titles', []):
        builder.add_title(
            title=title.get('title'),
            subtitle=title.get('subtitle'),
            source=title.get('source'),
        )

    for abstract in crawler_record.get('abstracts', []):
        builder.add_abstract(
            abstract=abstract.get('value'),
            source=abstract.get('source'),
        )

    for arxiv_eprint in crawler_record.get('arxiv_eprints', []):
        builder.add_arxiv_eprint(
            arxiv_id=arxiv_eprint.get('value'),
            arxiv_categories=arxiv_eprint.get('categories'),
        )

    for doi in crawler_record.get('dois', []):
        builder.add_doi(
            doi=doi.get('value'),
            material=doi.get('material'),
        )

    for private_note in crawler_record.get('private_notes', []):
        builder.add_private_note(
            private_notes=private_note
        )

    for public_note in crawler_record.get('public_notes', []):
        builder.add_public_note(
            public_note=public_note.get('value'),
            source=public_note.get('source'),
        )

    for license in crawler_record.get('license', []):
        builder.add_license(
            url=license.get('url'),
            license=license.get('license'),
            material=license.get('material'),
        )

    for collaboration in crawler_record.get('collaborations', []):
        builder.add_collaboration(
            collaboration=collaboration.get('value')
        )

    for imprint in crawler_record.get('imprints', []):
        builder.add_imprint_date(
            imprint_date=imprint.get('date')
        )

    for copyright in crawler_record.get('copyright', []):
        builder.add_copyright(
            holder=copyright.get('holder'),
            material=copyright.get('material'),
            statement=copyright.get('statement'),
        )

    builder.add_preprint_date(
        preprint_date=crawler_record.get('preprint_date')
    )

    acquisition_source = crawler_record.get('acquisition_source', {})
    builder.add_acquisition_source(
        method=acquisition_source['method'],
        date=acquisition_source['datetime'],
        source=acquisition_source['source'],
        submission_number=acquisition_source['submission_number'],
    )

    try:
        builder.add_number_of_pages(
            number_of_pages=int(crawler_record.get('page_nr', [])[0])
        )
    except (TypeError, ValueError, IndexError):
        pass

    publication_types = [
        'introductory',
        'lectures',
        'review',
        'manual',
    ]

    document_types = [
        'book',
        'note',
        'report',
        'proceedings',
        'thesis',
    ]

    added_doc_type = False

    for collection in crawler_record.get('collections', []):
        collection = collection['primary'].strip().lower()
        if collection == 'arxiv':
            continue  # ignored
        elif collection == 'citeable':
            builder.set_citeable(True)
        elif collection == 'core':
            builder.set_core(True)
        elif collection == 'noncore':
            builder.set_core(False)
        elif collection == 'published':
            builder.set_refereed(True)
        elif collection == 'withdrawn':
            builder.set_withdrawn(True)
        elif collection in publication_types:
            builder.add_publication_type(collection)
        elif collection == 'bookchapter':
            added_doc_type = True
            builder.add_document_type('book chapter')
        elif collection == 'conferencepaper':
            added_doc_type = True
            builder.add_document_type('conference paper')
        elif collection in document_types:
            added_doc_type = True
            builder.add_document_type(collection)

    if not added_doc_type:
        builder.add_document_type('article')

    _pub_info = crawler_record.get('publication_info', [{}])[0]
    builder.add_publication_info(
        year=_pub_info.get('year'),
        artid=_pub_info.get('artid'),
        page_end=_pub_info.get('page_end'),
        page_start=_pub_info.get('page_start'),
        journal_issue=_pub_info.get('journal_issue'),
        journal_title=_pub_info.get('journal_title'),
        journal_volume=_pub_info.get('journal_volume'),
        pubinfo_freetext=_pub_info.get('pubinfo_freetext'),
        material=_pub_info.get('pubinfo_material'),
    )

    for report_number in crawler_record.get('report_numbers', []):
        builder.add_report_number(
            report_number=report_number.get('value'),
            source=report_number.get('source'),
        )

    for url in crawler_record.get('urls', []):
        builder.add_url(url=url.get('value'))

    for document in crawler_record.get('documents', []):
        builder.add_document(
            description=document.get('description'),
            fulltext=document.get('fulltext'),
            hidden=document.get('hidden'),
            key=document['key'],
            material=document.get('material'),
            original_url=document.get('original_url'),
            url=document['url'],
        )

    return builder.record
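# A hypothetical minimal crawler record for the converter above. Note that
# ``acquisition_source`` is read with direct indexing, so its four keys are
# mandatory, while most other fields may be omitted:
crawler_record = {
    'acquisition_source': {
        'source': 'arXiv',
        'method': 'hepcrawl',
        'datetime': '2017-01-01T00:00:00',
        'submission_number': '1',
    },
    'titles': [{'title': 'A toy title', 'source': 'arXiv'}],
    'authors': [{'full_name': 'Doe, Jane', 'affiliations': []}],
}

hep_record = hepcrawl_to_hep(crawler_record)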
def crawler2hep(crawler_record):

    def _filter_affiliation(affiliations):
        return [
            affiliation.get('value')
            for affiliation in affiliations
            if affiliation.get('value')
        ]

    builder = LiteratureBuilder('hepcrawl')

    for author in crawler_record.get('authors', []):
        builder.add_author(builder.make_author(
            author['full_name'],
            affiliations=_filter_affiliation(author['affiliations']),
        ))

    for title in crawler_record.get('titles', []):
        builder.add_title(
            title=title.get('title'),
            source=title.get('source'),
        )

    for abstract in crawler_record.get('abstracts', []):
        builder.add_abstract(
            abstract=abstract.get('value'),
            source=abstract.get('source'),
        )

    for arxiv_eprint in crawler_record.get('arxiv_eprints', []):
        builder.add_arxiv_eprint(
            arxiv_id=arxiv_eprint.get('value'),
            arxiv_categories=arxiv_eprint.get('categories'),
        )

    for doi in crawler_record.get('dois', []):
        builder.add_doi(doi=doi.get('value'))

    for public_note in crawler_record.get('public_notes', []):
        builder.add_public_note(
            public_note=public_note.get('value'),
            source=public_note.get('source'),
        )

    for license in crawler_record.get('license', []):
        builder.add_license(
            url=license.get('url'),
            license=license.get('license'),
        )

    for collaboration in crawler_record.get('collaborations', []):
        builder.add_collaboration(collaboration=collaboration.get('value'))

    for imprint in crawler_record.get('imprints', []):
        builder.add_imprint_date(imprint_date=imprint.get('date'))

    for copyright in crawler_record.get('copyright', []):
        builder.add_copyright(
            holder=copyright.get('holder'),
            material=copyright.get('material'),
            statement=copyright.get('statement'),
        )

    builder.add_preprint_date(
        preprint_date=crawler_record.get('preprint_date'))

    acquisition_source = crawler_record.get('acquisition_source', {})
    builder.add_acquisition_source(
        method='hepcrawl',
        date=acquisition_source.get('date'),
        source=acquisition_source.get('source'),
        submission_number=acquisition_source.get('submission_number'),
    )

    try:
        builder.add_number_of_pages(
            number_of_pages=int(crawler_record.get('page_nr', [])[0]))
    except (TypeError, ValueError, IndexError):
        pass

    publication_types = [
        'introductory',
        'lectures',
        'review',
    ]

    special_collections = [
        'cdf-internal-note',
        'cdf-note',
        'cds',
        'd0-internal-note',
        'd0-preliminary-note',
        'h1-internal-note',
        'h1-preliminary-note',
        'halhidden',
        'hephidden',
        'hermes-internal-note',
        'larsoft-internal-note',
        'larsoft-note',
        'zeus-internal-note',
        'zeus-preliminary-note',
    ]

    document_types = [
        'book',
        'note',
        'report',
        'proceedings',
        'thesis',
    ]

    added_doc_type = False

    for collection in crawler_record.get('collections', []):
        collection = collection['primary'].strip().lower()
        if collection == 'arxiv':
            continue  # ignored
        elif collection == 'citeable':
            builder.set_citeable(True)
        elif collection == 'core':
            builder.set_core(True)
        elif collection == 'noncore':
            builder.set_core(False)
        elif collection == 'published':
            builder.set_refereed(True)
        elif collection == 'withdrawn':
            builder.set_withdrawn(True)
        elif collection in publication_types:
            builder.add_publication_type(collection)
        elif collection in special_collections:
            builder.add_special_collection(collection.upper())
        elif collection == 'bookchapter':
            added_doc_type = True
            builder.add_document_type('book chapter')
        elif collection == 'conferencepaper':
            added_doc_type = True
            builder.add_document_type('conference paper')
        elif collection in document_types:
            added_doc_type = True
            builder.add_document_type(collection)

    if not added_doc_type:
        builder.add_document_type('article')

    _pub_info = crawler_record.get('publication_info', [{}])[0]
    builder.add_publication_info(
        year=_pub_info.get('year'),
        artid=_pub_info.get('artid'),
        page_end=_pub_info.get('page_end'),
        page_start=_pub_info.get('page_start'),
        journal_issue=_pub_info.get('journal_issue'),
        journal_title=_pub_info.get('journal_title'),
        journal_volume=_pub_info.get('journal_volume'),
        pubinfo_freetext=_pub_info.get('pubinfo_freetext'),
    )

    for report_number in crawler_record.get('report_numbers', []):
        builder.add_report_number(
            report_number=report_number.get('value'),
            source=report_number.get('source'),
        )

    builder.validate_record()

    return builder.record
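# ``crawler2hep`` is the older spelling of the conversion above: the builder
# source and the acquisition method are hard-coded to 'hepcrawl', special
# collections are handled, and the record is validated before being returned,
# so a sketch like this one (with illustrative values) may raise if the
# result is not valid against the Literature schema:
hep_record = crawler2hep({
    'acquisition_source': {
        'source': 'arXiv',
        'date': '2017-01-01T00:00:00',
        'submission_number': '1',
    },
    'titles': [{'title': 'A toy title', 'source': 'arXiv'}],
    'authors': [{'full_name': 'Doe, Jane', 'affiliations': []}],
})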