def formdata_to_model(obj, formdata):
    """Manipulate form data to match the literature data model.

    Args:
        obj: workflow object; ``obj.extra_data``, ``obj.id`` and
            ``obj.id_user`` are read when recording the submission PDF and
            the acquisition source.
        formdata (dict): raw form data as submitted by the user.

    Returns:
        dict: the record mapped onto the Inspire Literature schema.
    """
    def _is_arxiv_url(url):
        # arXiv URLs are handled through the eprint field, not as plain URLs.
        return 'arxiv.org' in url

    # Work on a copy so the caller's form data is never mutated.
    form_fields = copy.deepcopy(formdata)
    filter_empty_elements(
        form_fields, ['authors', 'supervisors', 'report_numbers']
    )

    builder = LiteratureBuilder(source='submitter')

    for author in form_fields.get('authors', []):
        builder.add_author(builder.make_author(
            author['full_name'],
            affiliations=force_list(author['affiliation'])
            if author['affiliation'] else None,
            roles=['author']
        ))

    for supervisor in form_fields.get('supervisors', []):
        builder.add_author(builder.make_author(
            supervisor['full_name'],
            # BUG FIX: previously guarded on author['affiliation'] (the loop
            # variable of the authors loop above), which used the wrong
            # record and raised NameError when no authors were submitted.
            affiliations=force_list(supervisor['affiliation'])
            if supervisor['affiliation'] else None,
            roles=['supervisor']
        ))

    builder.add_title(title=form_fields.get('title'))

    # A conference name implies a conference paper regardless of the
    # submitted document type.
    document_type = 'conference paper' if form_fields.get('conf_name') \
        else form_fields.get('type_of_doc', [])
    builder.add_document_type(
        document_type=document_type
    )

    builder.add_abstract(
        abstract=form_fields.get('abstract'),
        # Categories are only present for arXiv submissions.
        source='arXiv' if form_fields.get('categories') else None
    )

    if form_fields.get('arxiv_id') and form_fields.get('categories'):
        builder.add_arxiv_eprint(
            arxiv_id=form_fields.get('arxiv_id'),
            arxiv_categories=form_fields.get('categories').split()
        )

    builder.add_doi(doi=form_fields.get('doi'))

    builder.add_inspire_categories(
        subject_terms=form_fields.get('subject_term'),
        source='user'
    )

    # All of these free-text fields end up as private (cataloger-only) notes.
    for key in ('extra_comments', 'nonpublic_note', 'hidden_notes',
                'conf_name', 'references'):
        builder.add_private_note(
            private_notes=form_fields.get(key)
        )

    year = form_fields.get('year')
    try:
        year = int(year)
    except (TypeError, ValueError):
        # Missing or non-numeric year: omit it from the publication info.
        year = None

    builder.add_publication_info(
        year=year,
        cnum=form_fields.get('conference_id'),
        journal_issue=form_fields.get('issue'),
        journal_title=form_fields.get('journal_title'),
        journal_volume=form_fields.get('volume'),
        page_start=form_fields.get('page_start'),
        page_end=form_fields.get('page_end'),
        artid=form_fields.get('artid')
    )

    builder.add_preprint_date(
        preprint_date=form_fields.get('preprint_created')
    )

    if form_fields.get('type_of_doc') == 'thesis':
        builder.add_thesis(
            defense_date=form_fields.get('defense_date'),
            degree_type=form_fields.get('degree_type'),
            institution=form_fields.get('institution'),
            date=form_fields.get('thesis_date')
        )

    builder.add_accelerator_experiments_legacy_name(
        legacy_name=form_fields.get('experiment')
    )

    # 'oth' means the language is spelled out in a separate free-text field.
    language = form_fields.get('other_language') \
        if form_fields.get('language') == 'oth' \
        else form_fields.get('language')
    builder.add_language(language=language)

    builder.add_title_translation(title=form_fields.get('title_translation'))

    builder.add_title(
        title=form_fields.get('title_arXiv'),
        source='arXiv'
    )

    builder.add_title(
        title=form_fields.get('title_crossref'),
        source='crossref'
    )

    builder.add_license(url=form_fields.get('license_url'))

    builder.add_public_note(public_note=form_fields.get('public_notes'))

    builder.add_public_note(
        public_note=form_fields.get('note'),
        source='arXiv' if form_fields.get('categories') else 'CrossRef'
    )

    form_url = form_fields.get('url')
    form_additional_url = form_fields.get('additional_url')
    if form_url and not _is_arxiv_url(form_url):
        # Remember the PDF location for later workflow steps; only expose it
        # as a record URL when no dedicated additional URL was given.
        obj.extra_data['submission_pdf'] = form_url
        if not form_additional_url:
            builder.add_url(url=form_url)
    if form_additional_url and not _is_arxiv_url(form_additional_url):
        builder.add_url(url=form_additional_url)

    # Plain loop instead of a side-effect list comprehension.
    for report_number in form_fields.get('report_numbers', []):
        builder.add_report_number(
            report_number=report_number.get('report_number')
        )

    builder.add_collaboration(collaboration=form_fields.get('collaboration'))

    builder.add_acquisition_source(
        datetime=datetime.datetime.utcnow().isoformat(),
        submission_number=obj.id,
        internal_uid=int(obj.id_user),
        email=form_fields.get('email'),
        orcid=form_fields.get('orcid'),
        method='submitter'
    )

    builder.validate_record()

    return builder.record
def formdata_to_model(obj, formdata):
    """Manipulate form data to match the literature data model.

    Args:
        obj: workflow object; ``obj.extra_data``, ``obj.id`` and
            ``obj.id_user`` are read when recording the submission PDF and
            the acquisition source.
        formdata (dict): raw form data as submitted by the user.

    Returns:
        dict: the record mapped onto the Inspire Literature schema.
    """
    def _is_arxiv_url(url):
        # arXiv URLs are handled through the eprint field, not as plain URLs.
        return 'arxiv.org' in url

    # Work on a copy so the caller's form data is never mutated.
    form_fields = copy.deepcopy(formdata)
    filter_empty_elements(
        form_fields, ['authors', 'supervisors', 'report_numbers'])

    builder = LiteratureBuilder(source='submitter')

    for author in form_fields.get('authors', []):
        builder.add_author(
            builder.make_author(
                author['full_name'],
                affiliations=force_list(author['affiliation'])
                if author['affiliation'] else None,
                roles=['author']))

    for supervisor in form_fields.get('supervisors', []):
        builder.add_author(
            builder.make_author(
                supervisor['full_name'],
                # BUG FIX: previously guarded on author['affiliation'] (the
                # loop variable of the authors loop above), which used the
                # wrong record and raised NameError when no authors were
                # submitted.
                affiliations=force_list(supervisor['affiliation'])
                if supervisor['affiliation'] else None,
                roles=['supervisor']))

    builder.add_title(title=form_fields.get('title'))

    # A conference name implies a conference paper regardless of the
    # submitted document type; the form's 'chapter' maps to the schema's
    # 'book chapter'.
    document_type = 'conference paper' if form_fields.get('conf_name') \
        else form_fields.get('type_of_doc', [])
    if document_type == 'chapter':
        document_type = 'book chapter'
    builder.add_document_type(document_type=document_type)

    builder.add_abstract(
        abstract=form_fields.get('abstract'),
        # Categories are only present for arXiv submissions.
        source='arXiv' if form_fields.get('categories') else None)

    if form_fields.get('arxiv_id') and form_fields.get('categories'):
        builder.add_arxiv_eprint(
            arxiv_id=form_fields.get('arxiv_id'),
            arxiv_categories=form_fields.get('categories').split())

    builder.add_doi(doi=form_fields.get('doi'))

    builder.add_inspire_categories(
        subject_terms=form_fields.get('subject_term'),
        source='user')

    # All of these free-text fields end up as private (cataloger-only) notes.
    for key in ('extra_comments', 'nonpublic_note',
                'hidden_notes', 'conf_name'):
        builder.add_private_note(private_notes=form_fields.get(key))

    year = form_fields.get('year')
    try:
        year = int(year)
    except (TypeError, ValueError):
        # Missing or non-numeric year: omit it from the publication info.
        year = None

    builder.add_preprint_date(
        preprint_date=form_fields.get('preprint_created'))

    if form_fields.get('type_of_doc') == 'thesis':
        builder.add_thesis(
            defense_date=form_fields.get('defense_date'),
            degree_type=form_fields.get('degree_type'),
            institution=form_fields.get('institution'),
            date=form_fields.get('thesis_date'))

    if form_fields.get('type_of_doc') == 'chapter':
        # Without a journal title, the series title describes the book series.
        if not form_fields.get('journal_title'):
            builder.add_book_series(title=form_fields.get('series_title'))

    if form_fields.get('type_of_doc') == 'book':
        if form_fields.get('journal_title'):
            # The series volume then plays the role of the journal volume.
            form_fields['volume'] = form_fields.get('series_volume')
        else:
            builder.add_book_series(
                title=form_fields.get('series_title'),
                volume=form_fields.get('series_volume'))
        builder.add_book(
            publisher=form_fields.get('publisher_name'),
            place=form_fields.get('publication_place'),
            date=form_fields.get('publication_date'))

    builder.add_publication_info(
        year=year,
        cnum=form_fields.get('conference_id'),
        journal_issue=form_fields.get('issue'),
        journal_title=form_fields.get('journal_title'),
        journal_volume=form_fields.get('volume'),
        page_start=form_fields.get('start_page'),
        page_end=form_fields.get('end_page'),
        artid=form_fields.get('artid'),
        parent_record=form_fields.get('parent_book'))

    builder.add_accelerator_experiments_legacy_name(
        legacy_name=form_fields.get('experiment'))

    # 'oth' means the language is spelled out in a separate free-text field.
    language = form_fields.get('other_language') \
        if form_fields.get('language') == 'oth' \
        else form_fields.get('language')
    builder.add_language(language=language)

    if form_fields.get('title_translation'):
        # Translations submitted through the form are always in English.
        builder.add_title_translation(
            title=form_fields['title_translation'],
            language='en',
        )

    builder.add_title(title=form_fields.get('title_arXiv'), source='arXiv')

    builder.add_title(
        title=form_fields.get('title_crossref'), source='crossref')

    builder.add_license(url=form_fields.get('license_url'))

    builder.add_public_note(public_note=form_fields.get('public_notes'))

    builder.add_public_note(
        public_note=form_fields.get('note'),
        source='arXiv' if form_fields.get('categories') else 'CrossRef')

    form_url = form_fields.get('url')
    form_additional_url = form_fields.get('additional_url')
    if form_url and not _is_arxiv_url(form_url):
        # Remember the PDF location for later workflow steps; only expose it
        # as a record URL when no dedicated additional URL was given.
        obj.extra_data['submission_pdf'] = form_url
        if not form_additional_url:
            builder.add_url(url=form_url)
    if form_additional_url and not _is_arxiv_url(form_additional_url):
        builder.add_url(url=form_additional_url)

    # Plain loop instead of a side-effect list comprehension.
    for report_number in form_fields.get('report_numbers', []):
        builder.add_report_number(
            report_number=report_number.get('report_number'))

    builder.add_collaboration(collaboration=form_fields.get('collaboration'))

    builder.add_acquisition_source(
        datetime=datetime.datetime.utcnow().isoformat(),
        submission_number=obj.id,
        internal_uid=int(obj.id_user),
        email=form_fields.get('email'),
        orcid=form_fields.get('orcid'),
        method='submitter')

    return builder.record
class ArxivParser(object):
    """Parser for the arXiv format.

    It can be used directly by invoking the :func:`ArxivParser.parse` method,
    or be subclassed to customize its behavior.

    Args:
        arxiv_record (Union[str, scrapy.selector.Selector]): the record in
            arXiv format to parse.
        source (Optional[str]): if provided, sets the ``source`` everywhere
            in the record. Otherwise, the source is extracted from the arXiv
            metadata.
    """

    # Shared LaTeX-to-text converter used by ``latex_to_unicode``; built once
    # at class-definition time with the arXiv-specific macro context.
    _l2t = LatexNodes2Text(
        latex_context=get_arxiv_latex_context_db(),
        math_mode="verbatim",
        strict_latex_spaces="based-on-source",
        keep_comments=True,
        keep_braced_groups=True,
        keep_braced_groups_minlen=2,
    )

    def __init__(self, arxiv_record, source=None):
        # ``root`` is a selector on the record's root node; all the
        # property extractors below xpath into it.
        self.root = self.get_root_node(arxiv_record)
        if not source:
            source = 'arXiv'
        self.builder = LiteratureBuilder(source)

    def parse(self):
        """Extract an arXiv record into an Inspire HEP record.

        Returns:
            dict: the same record in the Inspire Literature schema.
        """
        self.builder.add_abstract(abstract=self.abstract, source=self.source)
        self.builder.add_title(title=self.title, source=self.source)
        for license in self.licenses:
            self.builder.add_license(**license)
        for author in self.authors:
            self.builder.add_author(author)
        self.builder.add_number_of_pages(self.number_of_pages)
        self.builder.add_publication_info(**self.publication_info)
        for collab in self.collaborations:
            self.builder.add_collaboration(collab)
        for doi in self.dois:
            self.builder.add_doi(**doi)
        self.builder.add_preprint_date(self.preprint_date)
        if self.public_note:
            self.builder.add_public_note(self.public_note, self.source)
        for rep_number in self.report_numbers:
            self.builder.add_report_number(rep_number, self.source)
        self.builder.add_arxiv_eprint(self.arxiv_eprint,
                                      self.arxiv_categories)
        # Any warning produced by the collaboration heuristics is stored as a
        # private note for the catalogers.
        self.builder.add_private_note(self.private_note)
        self.builder.add_document_type(self.document_type)
        # Map arXiv categories onto Inspire categories, deduplicated.
        normalized_categories = [
            classify_field(arxiv_cat) for arxiv_cat in self.arxiv_categories
        ]
        self.builder.add_inspire_categories(
            dedupe_list(normalized_categories), 'arxiv')

        return self.builder.record

    def _get_authors_and_collaborations(self, node):
        """Parse authors, affiliations and collaborations from the record node.

        Heuristics are used to detect collaborations. In case those are not
        reliable, a warning is returned for manual checking.

        Args:
            node (Selector): a selector on a record

        Returns:
            tuple: a tuple of (authors, collaborations, warning)
        """
        author_selectors = node.xpath('.//authors//author')

        # take 'for the' out of the general phrases and dont use it in
        # affiliations
        collab_phrases = [
            'consortium', ' collab ', 'collaboration', ' team', 'group',
            ' on behalf of ', ' representing ',
        ]
        # Phrases that mark an affiliation as an institution, overriding a
        # collaboration-phrase match.
        inst_phrases = ['institute', 'university', 'department', 'center']

        authors = []
        collaborations = []
        warning_tags = []
        some_affiliation_contains_collaboration = False

        # Two staggered generators over the same selector list: the second
        # one is advanced by one element so each iteration also sees the
        # *next* author (needed for the colon-sentinel warning below).
        authors_and_affiliations = (
            self._get_author_names_and_affiliations(author)
            for author in author_selectors)
        next_author_and_affiliations = (
            self._get_author_names_and_affiliations(author)
            for author in author_selectors)
        next(next_author_and_affiliations)

        for (forenames, keyname, affiliations), \
            (next_forenames, next_keyname, _) in six.moves.zip_longest(
                authors_and_affiliations, next_author_and_affiliations,
                fillvalue=('end of author-list', '', None)):

            name_string = " %s %s " % (forenames, keyname)

            # collaborations in affiliation field? Cautious with 'for the'
            # in Inst names
            affiliations_with_collaborations = []
            affiliations_without_collaborations = []
            for aff in affiliations:
                affiliation_contains_collaboration = any(
                    phrase in aff.lower()
                    for phrase in collab_phrases) and not any(
                    phrase in aff.lower() for phrase in inst_phrases)
                if affiliation_contains_collaboration:
                    affiliations_with_collaborations.append(aff)
                    some_affiliation_contains_collaboration = True
                else:
                    affiliations_without_collaborations.append(aff)
            for aff in affiliations_with_collaborations:
                coll, author_name = coll_cleanforthe(aff)
                if coll and coll not in collaborations:
                    collaborations.append(coll)

            # Check if name is a collaboration, else append to authors
            collaboration_in_name = ' for the ' in name_string.lower() or any(
                phrase in name_string.lower() for phrase in collab_phrases)
            if collaboration_in_name:
                coll, author_name = coll_cleanforthe(name_string)
                if author_name:
                    # The name field mixed a person and a collaboration
                    # ("J. Doe for the X Collaboration"): keep the person.
                    surname, given_names = split_fullname(author_name)
                    authors.append({
                        'full_name': surname + ', ' + given_names,
                        'surname': surname,
                        'given_names': given_names,
                        'affiliations': [],
                    })
                if coll and coll not in collaborations:
                    collaborations.append(coll)
            elif name_string.strip() == ':':
                # DANGERZONE : this might not be correct - add a warning
                # for the cataloger
                warning_tags.append(' %s %s ' % (next_forenames,
                                                 next_keyname))
                if not some_affiliation_contains_collaboration:
                    # everything up to now seems to be collaboration info
                    for author_info in authors:
                        name_string = " %s %s " % \
                            (author_info['given_names'],
                             author_info['surname'])
                        coll, author_name = coll_cleanforthe(name_string)
                        if coll and coll not in collaborations:
                            collaborations.append(coll)
                    authors = []
            else:
                # Ordinary author: keep only non-collaboration affiliations.
                authors.append({
                    'full_name': keyname + ', ' + forenames,
                    'surname': keyname,
                    'given_names': forenames,
                    'affiliations': affiliations_without_collaborations
                })

        if warning_tags:
            warning = 'WARNING: Colon in authors before %s: Check author list for collaboration names!' % ', '.join(
                warning_tags)
        else:
            warning = ''

        return authors, collaborations, warning

    @staticmethod
    def _get_author_names_and_affiliations(author_node):
        # Join all text nodes, since names may be split across sub-elements.
        forenames = u' '.join(
            author_node.xpath('.//forenames//text()').extract())
        keyname = u' '.join(author_node.xpath('.//keyname//text()').extract())
        affiliations = author_node.xpath('.//affiliation//text()').extract()

        return forenames, keyname, affiliations

    @property
    def preprint_date(self):
        # The <created> element holds the submission date of the preprint.
        preprint_date = self.root.xpath('.//created/text()').extract_first()

        return preprint_date

    @property
    def abstract(self):
        abstract = self.root.xpath('.//abstract/text()').extract_first()
        long_text_fixed = self.fix_long_text(abstract)
        return self.latex_to_unicode(long_text_fixed)

    @property
    def authors(self):
        authors, _, _ = self.authors_and_collaborations
        parsed_authors = [
            self.builder.make_author(
                full_name=auth["full_name"],
                raw_affiliations=auth["affiliations"])
            for auth in authors
        ]

        return parsed_authors

    @property
    def collaborations(self):
        _, collaborations, _ = self.authors_and_collaborations

        return collaborations

    @property
    def dois(self):
        doi_values = self.root.xpath('.//doi/text()').extract()
        # A single <doi> element may contain several DOIs; split them apart.
        doi_values_splitted = chain.from_iterable(
            [re.split(RE_DOIS, doi) for doi in doi_values])
        dois = [{
            'doi': value,
            'material': 'publication'
        } for value in doi_values_splitted]

        return dois

    @property
    def licenses(self):
        licenses = self.root.xpath('.//license/text()').extract()
        return [{
            'url': license,
            'material': self.material
        } for license in licenses]

    @property
    def material(self):
        # arXiv records always describe the preprint version.
        return 'preprint'

    @property
    def number_of_pages(self):
        comments = '; '.join(self.root.xpath('.//comments/text()').extract())

        # Page counts are only available as free text in the comments field.
        found_pages = RE_PAGES.search(comments)
        if found_pages:
            pages = found_pages.group(1)
            return maybe_int(pages)

        return None

    @property
    def publication_info(self):
        publication_info = {
            'material': 'publication',
            'pubinfo_freetext': self.pubinfo_freetext,
        }

        return publication_info

    @property
    def pubinfo_freetext(self):
        return self.root.xpath('.//journal-ref/text()').extract_first()

    @property
    def title(self):
        long_text_fixed = self.fix_long_text(
            self.root.xpath('.//title/text()').extract_first())
        return self.latex_to_unicode(long_text_fixed)

    @staticmethod
    def fix_long_text(text):
        # Collapse any run of whitespace (incl. newlines) to a single space.
        return re.sub(r'\s+', ' ', text).strip()

    @staticmethod
    def get_root_node(arxiv_record):
        """Get a selector on the root ``article`` node of the record.

        This can be overridden in case some preprocessing needs to be done on
        the XML.

        Args:
            arxiv_record(Union[str, scrapy.selector.Selector]): the record in
                arXiv format.

        Returns:
            scrapy.selector.Selector: a selector on the root ``<article>``
                node.
        """
        if isinstance(arxiv_record, six.string_types):
            root = get_node(arxiv_record)
        else:
            root = arxiv_record
        root.remove_namespaces()

        return root

    @property
    def public_note(self):
        comments = '; '.join(self.root.xpath('.//comments/text()').extract())

        return self.latex_to_unicode(comments)

    @property
    def private_note(self):
        # The warning string produced by the collaboration heuristics ('' if
        # nothing suspicious was found).
        _, _, warning = self.authors_and_collaborations

        return warning

    @property
    def report_numbers(self):
        report_numbers = self.root.xpath('.//report-no/text()').extract()
        rns = []
        # A single element may list several comma-separated report numbers.
        for rn in report_numbers:
            rns.extend(rn.split(', '))

        return rns

    @property
    def arxiv_eprint(self):
        return self.root.xpath('.//id/text()').extract_first()

    @property
    def arxiv_categories(self):
        # NOTE(review): the '[]' default looks odd — after .split() it yields
        # ['[]'] rather than an empty list; confirm intended behavior when
        # the <categories> element is missing.
        categories = self.root.xpath('.//categories/text()').extract_first(
            default='[]')
        categories = categories.split()
        # Normalize away obsolete category names before deduplicating.
        categories_without_old = [
            normalize_arxiv_category(arxiv_cat) for arxiv_cat in categories
        ]

        return dedupe_list(categories_without_old)

    @property
    def document_type(self):
        comments = '; '.join(self.root.xpath('.//comments/text()').extract())

        # Default to 'article'; thesis detection takes precedence over
        # conference-paper detection.
        doctype = 'article'
        if RE_THESIS.search(comments):
            doctype = 'thesis'
        elif RE_CONFERENCE.search(comments):
            doctype = 'conference paper'
        return doctype

    @property
    def source(self):
        return 'arXiv'

    @property
    def authors_and_collaborations(self):
        # Lazily computed and memoized: the heuristic parse runs only once.
        if not hasattr(self, '_authors_and_collaborations'):
            self._authors_and_collaborations = \
                self._get_authors_and_collaborations(self.root)
        return self._authors_and_collaborations

    @classmethod
    def latex_to_unicode(cls, latex_string):
        try:
            # NOTE(review): this .replace looks like a no-op (both arguments
            # appear to be a single space) — probably meant to collapse a
            # double space or replace a non-breaking space; confirm against
            # the original source.
            return cls._l2t.latex_to_text(latex_string).replace(" ", " ")
        except Exception as e:
            # Best-effort: fall back to the raw LaTeX on any converter error.
            return latex_string