def formdata_to_model(obj, formdata):
    """Manipulate form data to match the literature data model.

    Args:
        obj: workflow object; ``obj.extra_data``, ``obj.id`` and
            ``obj.id_user`` are read when recording the submission PDF and
            the acquisition source.
        formdata (dict): raw form data as submitted by the user.

    Returns:
        dict: the record mapped onto the Inspire Literature schema.
    """
    def _is_arxiv_url(url):
        # arXiv URLs are handled through the eprint field, not as plain URLs.
        return 'arxiv.org' in url

    # Work on a copy so the caller's form data is never mutated.
    form_fields = copy.deepcopy(formdata)
    filter_empty_elements(
        form_fields, ['authors', 'supervisors', 'report_numbers']
    )

    builder = LiteratureBuilder(source='submitter')

    for author in form_fields.get('authors', []):
        builder.add_author(builder.make_author(
            author['full_name'],
            affiliations=force_list(author['affiliation'])
            if author['affiliation'] else None,
            roles=['author']
        ))

    for supervisor in form_fields.get('supervisors', []):
        builder.add_author(builder.make_author(
            supervisor['full_name'],
            # BUG FIX: previously guarded on author['affiliation'] (the loop
            # variable of the authors loop above), which used the wrong
            # record and raised NameError when no authors were submitted.
            affiliations=force_list(supervisor['affiliation'])
            if supervisor['affiliation'] else None,
            roles=['supervisor']
        ))

    builder.add_title(title=form_fields.get('title'))

    # A conference name implies a conference paper regardless of the
    # submitted document type.
    document_type = 'conference paper' if form_fields.get('conf_name') \
        else form_fields.get('type_of_doc', [])
    builder.add_document_type(
        document_type=document_type
    )

    builder.add_abstract(
        abstract=form_fields.get('abstract'),
        # Categories are only present for arXiv submissions.
        source='arXiv' if form_fields.get('categories') else None
    )

    if form_fields.get('arxiv_id') and form_fields.get('categories'):
        builder.add_arxiv_eprint(
            arxiv_id=form_fields.get('arxiv_id'),
            arxiv_categories=form_fields.get('categories').split()
        )

    builder.add_doi(doi=form_fields.get('doi'))

    builder.add_inspire_categories(
        subject_terms=form_fields.get('subject_term'),
        source='user'
    )

    # All of these free-text fields end up as private (cataloger-only) notes.
    for key in ('extra_comments', 'nonpublic_note', 'hidden_notes',
                'conf_name', 'references'):
        builder.add_private_note(
            private_notes=form_fields.get(key)
        )

    year = form_fields.get('year')
    try:
        year = int(year)
    except (TypeError, ValueError):
        # Missing or non-numeric year: omit it from the publication info.
        year = None

    builder.add_publication_info(
        year=year,
        cnum=form_fields.get('conference_id'),
        journal_issue=form_fields.get('issue'),
        journal_title=form_fields.get('journal_title'),
        journal_volume=form_fields.get('volume'),
        page_start=form_fields.get('page_start'),
        page_end=form_fields.get('page_end'),
        artid=form_fields.get('artid')
    )

    builder.add_preprint_date(
        preprint_date=form_fields.get('preprint_created')
    )

    if form_fields.get('type_of_doc') == 'thesis':
        builder.add_thesis(
            defense_date=form_fields.get('defense_date'),
            degree_type=form_fields.get('degree_type'),
            institution=form_fields.get('institution'),
            date=form_fields.get('thesis_date')
        )

    builder.add_accelerator_experiments_legacy_name(
        legacy_name=form_fields.get('experiment')
    )

    # 'oth' means the language is spelled out in a separate free-text field.
    language = form_fields.get('other_language') \
        if form_fields.get('language') == 'oth' \
        else form_fields.get('language')
    builder.add_language(language=language)

    builder.add_title_translation(title=form_fields.get('title_translation'))

    builder.add_title(
        title=form_fields.get('title_arXiv'),
        source='arXiv'
    )

    builder.add_title(
        title=form_fields.get('title_crossref'),
        source='crossref'
    )

    builder.add_license(url=form_fields.get('license_url'))

    builder.add_public_note(public_note=form_fields.get('public_notes'))

    builder.add_public_note(
        public_note=form_fields.get('note'),
        source='arXiv' if form_fields.get('categories') else 'CrossRef'
    )

    form_url = form_fields.get('url')
    form_additional_url = form_fields.get('additional_url')
    if form_url and not _is_arxiv_url(form_url):
        # Remember the PDF location for later workflow steps; only expose it
        # as a record URL when no dedicated additional URL was given.
        obj.extra_data['submission_pdf'] = form_url
        if not form_additional_url:
            builder.add_url(url=form_url)
    if form_additional_url and not _is_arxiv_url(form_additional_url):
        builder.add_url(url=form_additional_url)

    # Plain loop instead of a side-effect list comprehension.
    for report_number in form_fields.get('report_numbers', []):
        builder.add_report_number(
            report_number=report_number.get('report_number')
        )

    builder.add_collaboration(collaboration=form_fields.get('collaboration'))

    builder.add_acquisition_source(
        datetime=datetime.datetime.utcnow().isoformat(),
        submission_number=obj.id,
        internal_uid=int(obj.id_user),
        email=form_fields.get('email'),
        orcid=form_fields.get('orcid'),
        method='submitter'
    )

    builder.validate_record()

    return builder.record
def formdata_to_model(obj, formdata):
    """Manipulate form data to match the literature data model.

    Args:
        obj: workflow object; ``obj.extra_data``, ``obj.id`` and
            ``obj.id_user`` are read when recording the submission PDF and
            the acquisition source.
        formdata (dict): raw form data as submitted by the user.

    Returns:
        dict: the record mapped onto the Inspire Literature schema.
    """
    def _is_arxiv_url(url):
        # arXiv URLs are handled through the eprint field, not as plain URLs.
        return 'arxiv.org' in url

    # Work on a copy so the caller's form data is never mutated.
    form_fields = copy.deepcopy(formdata)
    filter_empty_elements(
        form_fields, ['authors', 'supervisors', 'report_numbers'])

    builder = LiteratureBuilder(source='submitter')

    for author in form_fields.get('authors', []):
        builder.add_author(
            builder.make_author(
                author['full_name'],
                affiliations=force_list(author['affiliation'])
                if author['affiliation'] else None,
                roles=['author']))

    for supervisor in form_fields.get('supervisors', []):
        builder.add_author(
            builder.make_author(
                supervisor['full_name'],
                # BUG FIX: previously guarded on author['affiliation'] (the
                # loop variable of the authors loop above), which used the
                # wrong record and raised NameError when no authors were
                # submitted.
                affiliations=force_list(supervisor['affiliation'])
                if supervisor['affiliation'] else None,
                roles=['supervisor']))

    builder.add_title(title=form_fields.get('title'))

    # A conference name implies a conference paper regardless of the
    # submitted document type; the form's 'chapter' maps to the schema's
    # 'book chapter'.
    document_type = 'conference paper' if form_fields.get('conf_name') \
        else form_fields.get('type_of_doc', [])
    if document_type == 'chapter':
        document_type = 'book chapter'
    builder.add_document_type(document_type=document_type)

    builder.add_abstract(
        abstract=form_fields.get('abstract'),
        # Categories are only present for arXiv submissions.
        source='arXiv' if form_fields.get('categories') else None)

    if form_fields.get('arxiv_id') and form_fields.get('categories'):
        builder.add_arxiv_eprint(
            arxiv_id=form_fields.get('arxiv_id'),
            arxiv_categories=form_fields.get('categories').split())

    builder.add_doi(doi=form_fields.get('doi'))

    builder.add_inspire_categories(
        subject_terms=form_fields.get('subject_term'),
        source='user')

    # All of these free-text fields end up as private (cataloger-only) notes.
    for key in ('extra_comments', 'nonpublic_note',
                'hidden_notes', 'conf_name'):
        builder.add_private_note(private_notes=form_fields.get(key))

    year = form_fields.get('year')
    try:
        year = int(year)
    except (TypeError, ValueError):
        # Missing or non-numeric year: omit it from the publication info.
        year = None

    builder.add_preprint_date(
        preprint_date=form_fields.get('preprint_created'))

    if form_fields.get('type_of_doc') == 'thesis':
        builder.add_thesis(
            defense_date=form_fields.get('defense_date'),
            degree_type=form_fields.get('degree_type'),
            institution=form_fields.get('institution'),
            date=form_fields.get('thesis_date'))

    if form_fields.get('type_of_doc') == 'chapter':
        # Without a journal title, the series title describes the book series.
        if not form_fields.get('journal_title'):
            builder.add_book_series(title=form_fields.get('series_title'))

    if form_fields.get('type_of_doc') == 'book':
        if form_fields.get('journal_title'):
            # The series volume then plays the role of the journal volume.
            form_fields['volume'] = form_fields.get('series_volume')
        else:
            builder.add_book_series(
                title=form_fields.get('series_title'),
                volume=form_fields.get('series_volume'))
        builder.add_book(
            publisher=form_fields.get('publisher_name'),
            place=form_fields.get('publication_place'),
            date=form_fields.get('publication_date'))

    builder.add_publication_info(
        year=year,
        cnum=form_fields.get('conference_id'),
        journal_issue=form_fields.get('issue'),
        journal_title=form_fields.get('journal_title'),
        journal_volume=form_fields.get('volume'),
        page_start=form_fields.get('start_page'),
        page_end=form_fields.get('end_page'),
        artid=form_fields.get('artid'),
        parent_record=form_fields.get('parent_book'))

    builder.add_accelerator_experiments_legacy_name(
        legacy_name=form_fields.get('experiment'))

    # 'oth' means the language is spelled out in a separate free-text field.
    language = form_fields.get('other_language') \
        if form_fields.get('language') == 'oth' \
        else form_fields.get('language')
    builder.add_language(language=language)

    if form_fields.get('title_translation'):
        # Translations submitted through the form are always in English.
        builder.add_title_translation(
            title=form_fields['title_translation'],
            language='en',
        )

    builder.add_title(title=form_fields.get('title_arXiv'), source='arXiv')

    builder.add_title(
        title=form_fields.get('title_crossref'), source='crossref')

    builder.add_license(url=form_fields.get('license_url'))

    builder.add_public_note(public_note=form_fields.get('public_notes'))

    builder.add_public_note(
        public_note=form_fields.get('note'),
        source='arXiv' if form_fields.get('categories') else 'CrossRef')

    form_url = form_fields.get('url')
    form_additional_url = form_fields.get('additional_url')
    if form_url and not _is_arxiv_url(form_url):
        # Remember the PDF location for later workflow steps; only expose it
        # as a record URL when no dedicated additional URL was given.
        obj.extra_data['submission_pdf'] = form_url
        if not form_additional_url:
            builder.add_url(url=form_url)
    if form_additional_url and not _is_arxiv_url(form_additional_url):
        builder.add_url(url=form_additional_url)

    # Plain loop instead of a side-effect list comprehension.
    for report_number in form_fields.get('report_numbers', []):
        builder.add_report_number(
            report_number=report_number.get('report_number'))

    builder.add_collaboration(collaboration=form_fields.get('collaboration'))

    builder.add_acquisition_source(
        datetime=datetime.datetime.utcnow().isoformat(),
        submission_number=obj.id,
        internal_uid=int(obj.id_user),
        email=form_fields.get('email'),
        orcid=form_fields.get('orcid'),
        method='submitter')

    return builder.record
class ArxivParser(object):
    """Parser for the arXiv format.

    It can be used directly by invoking the :func:`ArxivParser.parse` method,
    or be subclassed to customize its behavior.

    Args:
        arxiv_record (Union[str, scrapy.selector.Selector]): the record in
            arXiv format to parse.
        source (Optional[str]): if provided, sets the ``source`` everywhere
            in the record. Otherwise, the source is extracted from the arXiv
            metadata.
    """

    # Shared LaTeX-to-text converter used by ``latex_to_unicode``; built once
    # at class-definition time with the arXiv-specific macro context.
    _l2t = LatexNodes2Text(
        latex_context=get_arxiv_latex_context_db(),
        math_mode="verbatim",
        strict_latex_spaces="based-on-source",
        keep_comments=True,
        keep_braced_groups=True,
        keep_braced_groups_minlen=2,
    )

    def __init__(self, arxiv_record, source=None):
        # ``root`` is a selector on the record's root node; all the
        # property extractors below xpath into it.
        self.root = self.get_root_node(arxiv_record)
        if not source:
            source = 'arXiv'
        self.builder = LiteratureBuilder(source)

    def parse(self):
        """Extract an arXiv record into an Inspire HEP record.

        Returns:
            dict: the same record in the Inspire Literature schema.
        """
        self.builder.add_abstract(abstract=self.abstract, source=self.source)
        self.builder.add_title(title=self.title, source=self.source)
        for license in self.licenses:
            self.builder.add_license(**license)
        for author in self.authors:
            self.builder.add_author(author)
        self.builder.add_number_of_pages(self.number_of_pages)
        self.builder.add_publication_info(**self.publication_info)
        for collab in self.collaborations:
            self.builder.add_collaboration(collab)
        for doi in self.dois:
            self.builder.add_doi(**doi)
        self.builder.add_preprint_date(self.preprint_date)
        if self.public_note:
            self.builder.add_public_note(self.public_note, self.source)
        for rep_number in self.report_numbers:
            self.builder.add_report_number(rep_number, self.source)
        self.builder.add_arxiv_eprint(self.arxiv_eprint,
                                      self.arxiv_categories)
        # Any warning produced by the collaboration heuristics is stored as a
        # private note for the catalogers.
        self.builder.add_private_note(self.private_note)
        self.builder.add_document_type(self.document_type)
        # Map arXiv categories onto Inspire categories, deduplicated.
        normalized_categories = [
            classify_field(arxiv_cat) for arxiv_cat in self.arxiv_categories
        ]
        self.builder.add_inspire_categories(
            dedupe_list(normalized_categories), 'arxiv')

        return self.builder.record

    def _get_authors_and_collaborations(self, node):
        """Parse authors, affiliations and collaborations from the record node.

        Heuristics are used to detect collaborations. In case those are not
        reliable, a warning is returned for manual checking.

        Args:
            node (Selector): a selector on a record

        Returns:
            tuple: a tuple of (authors, collaborations, warning)
        """
        author_selectors = node.xpath('.//authors//author')

        # take 'for the' out of the general phrases and dont use it in
        # affiliations
        collab_phrases = [
            'consortium', ' collab ', 'collaboration', ' team', 'group',
            ' on behalf of ', ' representing ',
        ]
        # Phrases that mark an affiliation as an institution, overriding a
        # collaboration-phrase match.
        inst_phrases = ['institute', 'university', 'department', 'center']

        authors = []
        collaborations = []
        warning_tags = []
        some_affiliation_contains_collaboration = False

        # Two staggered generators over the same selector list: the second
        # one is advanced by one element so each iteration also sees the
        # *next* author (needed for the colon-sentinel warning below).
        authors_and_affiliations = (
            self._get_author_names_and_affiliations(author)
            for author in author_selectors)
        next_author_and_affiliations = (
            self._get_author_names_and_affiliations(author)
            for author in author_selectors)
        next(next_author_and_affiliations)

        for (forenames, keyname, affiliations), \
            (next_forenames, next_keyname, _) in six.moves.zip_longest(
                authors_and_affiliations, next_author_and_affiliations,
                fillvalue=('end of author-list', '', None)):

            name_string = " %s %s " % (forenames, keyname)

            # collaborations in affiliation field? Cautious with 'for the'
            # in Inst names
            affiliations_with_collaborations = []
            affiliations_without_collaborations = []
            for aff in affiliations:
                affiliation_contains_collaboration = any(
                    phrase in aff.lower()
                    for phrase in collab_phrases) and not any(
                    phrase in aff.lower() for phrase in inst_phrases)
                if affiliation_contains_collaboration:
                    affiliations_with_collaborations.append(aff)
                    some_affiliation_contains_collaboration = True
                else:
                    affiliations_without_collaborations.append(aff)
            for aff in affiliations_with_collaborations:
                coll, author_name = coll_cleanforthe(aff)
                if coll and coll not in collaborations:
                    collaborations.append(coll)

            # Check if name is a collaboration, else append to authors
            collaboration_in_name = ' for the ' in name_string.lower() or any(
                phrase in name_string.lower() for phrase in collab_phrases)
            if collaboration_in_name:
                coll, author_name = coll_cleanforthe(name_string)
                if author_name:
                    # The name field mixed a person and a collaboration
                    # ("J. Doe for the X Collaboration"): keep the person.
                    surname, given_names = split_fullname(author_name)
                    authors.append({
                        'full_name': surname + ', ' + given_names,
                        'surname': surname,
                        'given_names': given_names,
                        'affiliations': [],
                    })
                if coll and coll not in collaborations:
                    collaborations.append(coll)
            elif name_string.strip() == ':':
                # DANGERZONE : this might not be correct - add a warning
                # for the cataloger
                warning_tags.append(' %s %s ' % (next_forenames,
                                                 next_keyname))
                if not some_affiliation_contains_collaboration:
                    # everything up to now seems to be collaboration info
                    for author_info in authors:
                        name_string = " %s %s " % \
                            (author_info['given_names'],
                             author_info['surname'])
                        coll, author_name = coll_cleanforthe(name_string)
                        if coll and coll not in collaborations:
                            collaborations.append(coll)
                    authors = []
            else:
                # Ordinary author: keep only non-collaboration affiliations.
                authors.append({
                    'full_name': keyname + ', ' + forenames,
                    'surname': keyname,
                    'given_names': forenames,
                    'affiliations': affiliations_without_collaborations
                })

        if warning_tags:
            warning = 'WARNING: Colon in authors before %s: Check author list for collaboration names!' % ', '.join(
                warning_tags)
        else:
            warning = ''

        return authors, collaborations, warning

    @staticmethod
    def _get_author_names_and_affiliations(author_node):
        # Join all text nodes, since names may be split across sub-elements.
        forenames = u' '.join(
            author_node.xpath('.//forenames//text()').extract())
        keyname = u' '.join(author_node.xpath('.//keyname//text()').extract())
        affiliations = author_node.xpath('.//affiliation//text()').extract()

        return forenames, keyname, affiliations

    @property
    def preprint_date(self):
        # The <created> element holds the submission date of the preprint.
        preprint_date = self.root.xpath('.//created/text()').extract_first()

        return preprint_date

    @property
    def abstract(self):
        abstract = self.root.xpath('.//abstract/text()').extract_first()
        long_text_fixed = self.fix_long_text(abstract)
        return self.latex_to_unicode(long_text_fixed)

    @property
    def authors(self):
        authors, _, _ = self.authors_and_collaborations
        parsed_authors = [
            self.builder.make_author(
                full_name=auth["full_name"],
                raw_affiliations=auth["affiliations"])
            for auth in authors
        ]

        return parsed_authors

    @property
    def collaborations(self):
        _, collaborations, _ = self.authors_and_collaborations

        return collaborations

    @property
    def dois(self):
        doi_values = self.root.xpath('.//doi/text()').extract()
        # A single <doi> element may contain several DOIs; split them apart.
        doi_values_splitted = chain.from_iterable(
            [re.split(RE_DOIS, doi) for doi in doi_values])
        dois = [{
            'doi': value,
            'material': 'publication'
        } for value in doi_values_splitted]

        return dois

    @property
    def licenses(self):
        licenses = self.root.xpath('.//license/text()').extract()
        return [{
            'url': license,
            'material': self.material
        } for license in licenses]

    @property
    def material(self):
        # arXiv records always describe the preprint version.
        return 'preprint'

    @property
    def number_of_pages(self):
        comments = '; '.join(self.root.xpath('.//comments/text()').extract())

        # Page counts are only available as free text in the comments field.
        found_pages = RE_PAGES.search(comments)
        if found_pages:
            pages = found_pages.group(1)
            return maybe_int(pages)

        return None

    @property
    def publication_info(self):
        publication_info = {
            'material': 'publication',
            'pubinfo_freetext': self.pubinfo_freetext,
        }

        return publication_info

    @property
    def pubinfo_freetext(self):
        return self.root.xpath('.//journal-ref/text()').extract_first()

    @property
    def title(self):
        long_text_fixed = self.fix_long_text(
            self.root.xpath('.//title/text()').extract_first())
        return self.latex_to_unicode(long_text_fixed)

    @staticmethod
    def fix_long_text(text):
        # Collapse any run of whitespace (incl. newlines) to a single space.
        return re.sub(r'\s+', ' ', text).strip()

    @staticmethod
    def get_root_node(arxiv_record):
        """Get a selector on the root ``article`` node of the record.

        This can be overridden in case some preprocessing needs to be done on
        the XML.

        Args:
            arxiv_record(Union[str, scrapy.selector.Selector]): the record in
                arXiv format.

        Returns:
            scrapy.selector.Selector: a selector on the root ``<article>``
                node.
        """
        if isinstance(arxiv_record, six.string_types):
            root = get_node(arxiv_record)
        else:
            root = arxiv_record
        root.remove_namespaces()

        return root

    @property
    def public_note(self):
        comments = '; '.join(self.root.xpath('.//comments/text()').extract())

        return self.latex_to_unicode(comments)

    @property
    def private_note(self):
        # The warning string produced by the collaboration heuristics ('' if
        # nothing suspicious was found).
        _, _, warning = self.authors_and_collaborations

        return warning

    @property
    def report_numbers(self):
        report_numbers = self.root.xpath('.//report-no/text()').extract()
        rns = []
        # A single element may list several comma-separated report numbers.
        for rn in report_numbers:
            rns.extend(rn.split(', '))

        return rns

    @property
    def arxiv_eprint(self):
        return self.root.xpath('.//id/text()').extract_first()

    @property
    def arxiv_categories(self):
        # NOTE(review): the '[]' default looks odd — after .split() it yields
        # ['[]'] rather than an empty list; confirm intended behavior when
        # the <categories> element is missing.
        categories = self.root.xpath('.//categories/text()').extract_first(
            default='[]')
        categories = categories.split()
        # Normalize away obsolete category names before deduplicating.
        categories_without_old = [
            normalize_arxiv_category(arxiv_cat) for arxiv_cat in categories
        ]

        return dedupe_list(categories_without_old)

    @property
    def document_type(self):
        comments = '; '.join(self.root.xpath('.//comments/text()').extract())

        # Default to 'article'; thesis detection takes precedence over
        # conference-paper detection.
        doctype = 'article'
        if RE_THESIS.search(comments):
            doctype = 'thesis'
        elif RE_CONFERENCE.search(comments):
            doctype = 'conference paper'
        return doctype

    @property
    def source(self):
        return 'arXiv'

    @property
    def authors_and_collaborations(self):
        # Lazily computed and memoized: the heuristic parse runs only once.
        if not hasattr(self, '_authors_and_collaborations'):
            self._authors_and_collaborations = \
                self._get_authors_and_collaborations(self.root)
        return self._authors_and_collaborations

    @classmethod
    def latex_to_unicode(cls, latex_string):
        try:
            # NOTE(review): this .replace looks like a no-op (both arguments
            # appear to be a single space) — probably meant to collapse a
            # double space or replace a non-breaking space; confirm against
            # the original source.
            return cls._l2t.latex_to_text(latex_string).replace(" ", " ")
        except Exception as e:
            # Best-effort: fall back to the raw LaTeX on any converter error.
            return latex_string