Example #1
def refextract(obj, eng):
    """Extract references from various sources and add them to the workflow.

    Runs ``refextract`` on both the PDF attached to the workflow and the
    references provided by the submitter, if any, then chooses the one that
    generated the most references and attaches them to the workflow object.

    Args:
        obj: a workflow object.
        eng: a workflow engine.

    Returns:
        None

    """
    if 'references' in obj.data:
        extracted_raw_references = dedupe_list(
            extract_references_from_raw_refs(obj.data['references']))
        obj.log.info('Extracted %d references from raw refs.',
                     len(extracted_raw_references))
        obj.data['references'] = match_references_based_on_flag(
            extracted_raw_references)
        return

    matched_pdf_references, matched_text_references = [], []
    source = LiteratureReader(obj.data).source

    with get_document_in_workflow(obj) as tmp_document:
        if tmp_document:
            pdf_references = dedupe_list(
                extract_references_from_pdf(tmp_document, source))
            matched_pdf_references = match_references_based_on_flag(
                pdf_references)

    text = get_value(obj.extra_data, 'formdata.references')
    if text:
        text_references = dedupe_list(
            extract_references_from_text(text, source))
        matched_text_references = match_references_based_on_flag(
            text_references)

    if len(matched_pdf_references) == len(matched_text_references) == 0:
        obj.log.info('No references extracted.')
    elif len(matched_pdf_references) > len(matched_text_references):
        obj.log.info('Extracted %d references from PDF.',
                     len(matched_pdf_references))
        obj.data['references'] = matched_pdf_references
    elif len(matched_text_references) >= len(matched_pdf_references):
        obj.log.info('Extracted %d references from text.',
                     len(matched_text_references))
        obj.data['references'] = matched_text_references
Example #2
    def parse(self):
        """Extract an arXiv record into an Inspire HEP record.

        Returns:
            dict: the same record in the Inspire Literature schema.
        """
        self.builder.add_abstract(abstract=self.abstract, source=self.source)
        self.builder.add_title(title=self.title, source=self.source)
        for license in self.licenses:
            self.builder.add_license(**license)
        for author in self.authors:
            self.builder.add_author(author)
        self.builder.add_number_of_pages(self.number_of_pages)
        self.builder.add_publication_info(**self.publication_info)
        for collab in self.collaborations:
            self.builder.add_collaboration(collab)
        for doi in self.dois:
            self.builder.add_doi(**doi)
        self.builder.add_preprint_date(self.preprint_date)
        if self.public_note:
            self.builder.add_public_note(self.public_note, self.source)
        for rep_number in self.report_numbers:
            self.builder.add_report_number(rep_number, self.source)
        self.builder.add_arxiv_eprint(self.arxiv_eprint, self.arxiv_categories)
        self.builder.add_private_note(self.private_note)
        self.builder.add_document_type(self.document_type)
        normalized_categories = [
            classify_field(arxiv_cat) for arxiv_cat in self.arxiv_categories
        ]
        self.builder.add_inspire_categories(dedupe_list(normalized_categories),
                                            'arxiv')

        return self.builder.record
Example #3
def dedupe_all_lists(obj, exclude_keys=()):
    """Recursively remove duplucates from all lists.

    Args:
        obj: collection to deduplicate
        exclude_keys (Container[str]): key names to ignore for deduplication
    """
    squared_dedupe_len = 10
    if isinstance(obj, dict):
        new_obj = {}
        for key, value in obj.items():
            if key in exclude_keys:
                new_obj[key] = value
            else:
                new_obj[key] = dedupe_all_lists(
                    value, exclude_keys=exclude_keys)
        return new_obj
    elif isinstance(obj, (list, tuple, set)):
        new_elements = [
            dedupe_all_lists(v, exclude_keys=exclude_keys) for v in obj
        ]
        if len(new_elements) < squared_dedupe_len:
            new_obj = dedupe_list(new_elements)
        else:
            new_obj = dedupe_list_of_dicts(new_elements)
        return type(obj)(new_obj)
    else:
        return obj
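A quick usage sketch for the variant above, with made-up record data: lists are deduplicated recursively, except under the keys named in ``exclude_keys``.

record = {
    'titles': ['foo', 'foo', 'bar'],
    'raw_refs': {'values': ['x', 'x']},
}

# 'titles' is deduplicated, while everything under 'raw_refs' is kept as-is.
assert dedupe_all_lists(record, exclude_keys=('raw_refs',)) == {
    'titles': ['foo', 'bar'],
    'raw_refs': {'values': ['x', 'x']},
}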
Example #4
def refextract_url():
    """Run refextract on a URL."""
    if current_app.config.get("FEATURE_FLAG_ENABLE_REFEXTRACT_SERVICE"):
        headers = {
            "Content-Type": "application/json",
            "Accept": "application/json"
        }
        data = {
            "journal_kb_data": create_journal_dict(),
            "url": request.json["url"]
        }
        response = requests.post(
            f"{current_app.config['REFEXTRACT_SERVICE_URL']}/extract_references_from_url",
            headers=headers,
            data=orjson.dumps(data),
        )
        if response.status_code != 200:
            return jsonify({"message": "Cannot extract references"}), 500
        extracted_references = response.json()["extracted_references"]
    else:
        extracted_references = extract_references_from_url(
            request.json["url"],
            override_kbs_files={"journals": create_journal_dict()},
            reference_format="{title},{volume},{page}",
        )
    deduplicated_extracted_references = dedupe_list(extracted_references)
    references = map_refextract_to_schema(deduplicated_extracted_references)
    match_result = match_references(references)
    return jsonify(match_result.get("matched_references"))
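A minimal sketch of exercising this endpoint with Flask's test client. The URL rule (``/refextract/url``) is an assumption, since the route registration is not shown above.

# Hypothetical URL rule; the actual registration is not shown here.
client = current_app.test_client()
response = client.post(
    '/refextract/url',
    json={'url': 'https://arxiv.org/pdf/2001.00001'},
)
matched_references = response.get_json()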
Example #5
def test_dedupe_list():
    list_with_duplicates = ['foo', 'bar', 'foo']

    expected = ['foo', 'bar']
    result = dedupe_list(list_with_duplicates)

    assert expected == result
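For reference, a minimal order-preserving implementation consistent with this test (a sketch, not necessarily the library's exact code). The ``item not in`` membership check keeps it working for unhashable elements such as dicts, at quadratic cost, which is why ``dedupe_all_lists`` above switches to ``dedupe_list_of_dicts`` for longer lists.

def dedupe_list(list_with_duplicates):
    """Remove duplicates from a list, keeping the first occurrence."""
    deduped = []
    for item in list_with_duplicates:
        if item not in deduped:
            deduped.append(item)
    return deduped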
Example #6
def match_references_by_uuids(literature_uuids):
    record_json = type_coerce(RecordMetadata.json, JSONB)
    has_references = record_json.has_key("references")  # noqa: W601
    selected_uuids = RecordMetadata.id.in_(literature_uuids)
    not_deleted = or_(  # exclude deleted records in case some were deleted after the UUIDs were fetched by the caller
        not_(record_json.has_key("deleted")),  # noqa: W601
        not_(record_json["deleted"] == cast(True, JSONB)),
    )
    with_references_query = RecordMetadata.query.filter(
        selected_uuids, has_references, not_deleted
    )

    for record_metadata in with_references_query.all():
        references = record_metadata.json["references"]
        match_result = match_references(references)

        if not match_result["any_link_modified"]:
            continue

        literature = LiteratureRecord(record_metadata.json, model=record_metadata)
        literature["references"] = dedupe_list(match_result["matched_references"])
        literature.update(dict(literature))

        db.session.commit()
        added_recids = match_result["added_recids"]
        removed_recids = match_result["removed_recids"]
        LOGGER.info(
            "References are matched",
            uuid=record_metadata.id,
            recid=record_metadata.json["control_number"],
            added_recids=added_recids,
            added_recid_count=len(added_recids),
            removed_recids=removed_recids,
            removed_recid_count=len(removed_recids),
        )
Example #7
def fuzzy_match(obj, eng):
    """Return ``True`` if a similar record is found in the system.

    Uses a custom configuration for ``inspire-matcher`` to find records
    similar to the current workflow object's payload in the system.

    Also sets the ``matches.fuzzy`` property in ``extra_data`` to the list of
    briefs of the first 5 records that matched.

    Arguments:
        obj: a workflow object.
        eng: a workflow engine.

    Returns:
        bool: ``True`` if the workflow object has a duplicate in the system
        ``False`` otherwise.

    """
    if not current_app.config.get('FEATURE_FLAG_ENABLE_FUZZY_MATCHER'):
        return False

    fuzzy_match_config = current_app.config['FUZZY_MATCH']
    matches = dedupe_list(match(obj.data, fuzzy_match_config))
    record_ids = [_get_hep_record_brief(el['_source']) for el in matches]
    obj.extra_data.setdefault('matches', {})['fuzzy'] = record_ids[0:5]
    return bool(record_ids)
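``_get_hep_record_brief`` is not shown above; a plausible shape for it, assuming a brief consists of the control number and the title of the matched record:

def _get_hep_record_brief(hep_record):
    # Hypothetical sketch: keep just enough fields to display a match.
    return {
        'control_number': hep_record['control_number'],
        'title': get_value(hep_record, 'titles[0].title'),
    }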
Example #8
def article_exists(obj, eng):
    """Return ``True`` if the record is already present in the system.

    Uses the default configuration of the ``inspire-matcher`` to find
    duplicates of the current workflow object in the system.

    Also sets the ``record_matches`` property in ``extra_data`` to the list of
    control numbers that matched.

    Arguments:
        obj: a workflow object.
        eng: a workflow engine.

    Returns:
        bool: ``True`` if the workflow object has a duplicate in the system
        ``False`` otherwise.

    """
    matches = dedupe_list(match(obj.data))
    record_ids = [el['_source']['control_number'] for el in matches]
    if record_ids:
        obj.extra_data['record_matches'] = record_ids
        return True

    obj.extra_data['record_matches'] = []
    return False
Example #9
def already_pending_in_holdingpen_validator(property_name, value):
    """Check if there's a submission in the holdingpen with the same arXiv ID.
    """
    if property_name == 'arXiv ID':
        query_should = {
            'metadata.arxiv_eprints.value.raw': value,
        }
    elif property_name == 'DOI':
        query_should = {
            'metadata.dois.value.raw': value,
        }

    query = {
        "query": {
            "bool": {
                "filter": [
                    {
                        "term": {
                            "metadata.acquisition_source.source": "submitter"
                        },
                    },
                    {
                        "bool": {
                            "must_not": {
                                "term": {
                                    "_workflow.status": "COMPLETED"
                                }
                            }
                        }
                    }
                ],
                "must": [
                    {
                        "term": query_should,
                    }
                ]
            }
        },
        "_source": {
            "includes": [
                "_id"
            ]
        }
    }

    hits = es.search(
        index='holdingpen-hep',
        doc_type='hep',
        body=query,
    )['hits']['hits']

    matches = dedupe_list(hits)
    holdingpen_ids = [int(el['_id']) for el in matches]

    if holdingpen_ids:
        raise ValidationError(
            'There exists already a pending suggestion with the same %s '
            '"%s", it will be attended to shortly.'
            % (property_name, value)
        )
Example #10
def add_citation_counts(chunk_size=500, request_timeout=120):
    def _get_records_to_update_generator(citations_lookup):
        with click.progressbar(citations_lookup.iteritems()) as bar:
            for uuid, citation_count in bar:
                yield {
                    '_op_type': 'update',
                    '_index': index,
                    '_type': doc_type,
                    '_id': str(uuid),
                    'doc': {
                        'citation_count': citation_count
                    }
                }

    index, doc_type = schema_to_index('records/hep.json')
    citations_lookup = Counter()

    click.echo('Extracting all citations...')
    with click.progressbar(
            es_scan(es,
                    query={
                        '_source': 'references.recid',
                        'filter': {
                            'exists': {
                                'field': 'references.recid'
                            }
                        },
                        'size': LARGE_CHUNK_SIZE
                    },
                    scroll=u'2m',
                    index=index,
                    doc_type=doc_type)) as records:
        for record in records:
            unique_refs_ids = dedupe_list(
                list(
                    chain.from_iterable(
                        map(force_list,
                            get_value(record, '_source.references.recid')))))

            for unique_refs_id in unique_refs_ids:
                citations_lookup[unique_refs_id] += 1
    click.echo('... DONE.')

    click.echo('Mapping recids to UUIDs...')
    citations_lookup = _build_recid_to_uuid_map(citations_lookup)
    click.echo('... DONE.')

    click.echo('Adding citation numbers...')
    success, failed = es_bulk(
        es,
        _get_records_to_update_generator(citations_lookup),
        chunk_size=chunk_size,
        raise_on_exception=False,
        raise_on_error=False,
        request_timeout=request_timeout,
        stats_only=True,
    )
    click.echo(
        '... DONE: {} records updated with success. {} failures.'.format(
            success, failed))
Example #11
    def _get_affiliations_identifiers(value):
        t_values = (t_value.split(':')
                    for t_value in dedupe_list(force_list(value.get('t'))))

        return [{
            'schema': schema.upper(),
            'value': identifier
        } for schema, identifier in t_values]
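A hypothetical input for the helper above: each ``t`` subfield is a ``SCHEMA:identifier`` pair, and duplicates are dropped before splitting. Note that identifiers containing a ':' themselves would break the two-way unpacking.

value = {'t': ['GRID:grid.9132.9', 'RINGGOLD:12345', 'GRID:grid.9132.9']}
assert _get_affiliations_identifiers(value) == [
    {'schema': 'GRID', 'value': 'grid.9132.9'},
    {'schema': 'RINGGOLD', 'value': '12345'},
]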
Example #12
    def arxiv_categories(self):
        categories = self.root.xpath('.//categories/text()').extract_first(
            default='')
        categories = categories.split()
        categories_without_old = [
            normalize_arxiv_category(arxiv_cat) for arxiv_cat in categories
        ]

        return dedupe_list(categories_without_old)
Example #13
def extract_references_from_file(path,
                                 recid=None,
                                 reference_format=u"{title} {volume} ({year}) {page}",
                                 linker_callback=None,
                                 override_kbs_files=None):
    """Extract references from a local pdf file.

    The first parameter is the path to the file.
    It returns a list of parsed references.
    It raises ``FullTextNotAvailableError`` if the file does not exist, and
    ``UnknownDocumentTypeError`` if it is neither a PDF nor plain text.

    The standard reference format is: {title} {volume} ({year}) {page}.

    You can change it by passing ``reference_format``, e.g.:

    >>> extract_references_from_file(path, reference_format=u"{title},{volume},{page}")

    If you want to also link each reference to some other resource (like a record),
    you can provide a linker_callback function to be executed for every reference
    element found.

    To override KBs for journal names etc., use ``override_kbs_files``:

    >>> extract_references_from_file(path, override_kbs_files={'journals': 'my/path/to.kb'})

    """
    if not os.path.isfile(path):
        raise FullTextNotAvailableError(u"File not found: '{0}'".format(path))

    docbody = get_plaintext_document_body(path)
    reflines, dummy, dummy = extract_references_from_fulltext(docbody)
    if not reflines:
        docbody = get_plaintext_document_body(path, keep_layout=True)
        reflines, dummy, dummy = extract_references_from_fulltext(docbody)

    parsed_refs, stats = parse_references(
        reflines,
        recid=recid,
        reference_format=reference_format,
        linker_callback=linker_callback,
        override_kbs_files=override_kbs_files,
    )

    if magic.from_file(path, mime=True) == "application/pdf":
        extracted_texkeys_urls = extract_texkeys_and_urls_from_pdf(path)
        if len(extracted_texkeys_urls) == len(parsed_refs):
            parsed_refs_updated = []
            for ref, ref_texkey_urls in zip(parsed_refs, extracted_texkeys_urls):
                update_reference_with_urls(ref, ref_texkey_urls.get('urls', []))
                if ref.get('url'):
                    ref['url'] = dedupe_list(ref['url'])
                parsed_refs_updated.append(dict(ref, texkey=[ref_texkey_urls['texkey']]))

            return parsed_refs_updated
    return parsed_refs
Example #14
def refextract(obj, eng):
    """Extract references from various sources and add them to the workflow.

    Runs ``refextract`` on both the PDF attached to the workflow and the
    references provided by the submitter, if any, then chooses the one that
    generated the most references and attaches them to the workflow object.

    Args:
        obj: a workflow object.
        eng: a workflow engine.

    Returns:
        None

    """
    if 'references' in obj.data:
        extracted_raw_references = dedupe_list(extract_references_from_raw_refs(obj.data['references']))
        obj.log.info('Extracted %d references from raw refs.', len(extracted_raw_references))
        obj.data['references'] = match_references(extracted_raw_references)
        return

    matched_pdf_references, matched_text_references = [], []
    source = LiteratureReader(obj.data).source

    with get_document_in_workflow(obj) as tmp_document:
        if tmp_document:
            pdf_references = dedupe_list(extract_references_from_pdf(tmp_document, source))
            matched_pdf_references = match_references(pdf_references)

    text = get_value(obj.extra_data, 'formdata.references')
    if text:
        text_references = dedupe_list(extract_references_from_text(text, source))
        matched_text_references = match_references(text_references)

    if len(matched_pdf_references) == len(matched_text_references) == 0:
        obj.log.info('No references extracted.')
    elif len(matched_pdf_references) > len(matched_text_references):
        obj.log.info('Extracted %d references from PDF.', len(matched_pdf_references))
        obj.data['references'] = matched_pdf_references
    elif len(matched_text_references) >= len(matched_pdf_references):
        obj.log.info('Extracted %d references from text.', len(matched_text_references))
        obj.data['references'] = matched_text_references
Example #15
def references(self, key, value):
    """Populate the ``references`` key."""
    def _has_curator_flag(value):
        normalized_nine_values = [
            el.upper() for el in force_list(value.get('9'))
        ]
        return 'CURATOR' in normalized_nine_values

    def _is_curated(value):
        return force_single_element(
            value.get('z')) == '1' and _has_curator_flag(value)

    def _set_record(el):
        recid = maybe_int(el)
        record = get_record_ref(recid, 'literature')
        rb.set_record(record)

    rb = ReferenceBuilder()
    mapping = [
        ('0', _set_record),
        ('a', rb.add_uid),
        ('b', rb.add_uid),
        ('c', rb.add_collaboration),
        ('e', partial(rb.add_author, role='ed.')),
        ('h', rb.add_refextract_authors_str),
        ('i', rb.add_uid),
        ('k', rb.set_texkey),
        ('m', rb.add_misc),
        ('o', rb.set_label),
        ('p', rb.set_publisher),
        ('q', rb.add_parent_title),
        ('r', rb.add_report_number),
        ('s', rb.set_pubnote),
        ('t', rb.add_title),
        ('x', rb.add_raw_reference),
        ('y', rb.set_year),
    ]

    for field, method in mapping:
        for el in force_list(value.get(field)):
            if el:
                method(el)

    for el in dedupe_list(force_list(value.get('u'))):
        if el:
            rb.add_url(el)

    if _is_curated(value):
        rb.curate()

    if _has_curator_flag(value):
        rb.obj['legacy_curated'] = True

    return rb.obj
Example #16
    def _get_ids(value):
        def _is_jacow(j_value):
            return j_value.upper().startswith('JACOW-')

        def _is_orcid(j_value):
            return j_value.upper().startswith('ORCID:') and len(j_value) > 6

        def _is_naked_orcid(j_value):
            return ORCID.match(j_value)

        def _is_cern(j_value):
            return j_value.startswith('CCID-')

        result = []

        i_values = force_list(value.get('i'))
        for i_value in i_values:
            result.append({
                'schema': 'INSPIRE ID',
                'value': i_value,
            })

        j_values = force_list(value.get('j'))
        for j_value in j_values:
            if _is_jacow(j_value):
                result.append({
                    'schema': 'JACOW',
                    'value': 'JACoW-' + j_value[6:],
                })
            elif _is_orcid(j_value):
                result.append({
                    'schema': 'ORCID',
                    'value': j_value[6:].replace('.', ''),
                })
            elif _is_naked_orcid(j_value):
                result.append({
                    'schema': 'ORCID',
                    'value': j_value,
                })
            elif _is_cern(j_value):
                result.append({
                    'schema': 'CERN',
                    'value': 'CERN-' + j_value[5:],
                })

        w_values = force_list(value.get('w'))
        for w_value in w_values:
            result.append({
                'schema': 'INSPIRE BAI',
                'value': w_value,
            })

        return dedupe_list(result)
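The ``ORCID`` pattern used by ``_is_naked_orcid`` is not shown above; a plausible definition (an assumption, not the actual constant) would be:

import re

# Hypothetical pattern: four groups of four digits, with 'X' allowed as the
# final checksum character.
ORCID = re.compile(r'\d{4}-\d{4}-\d{4}-\d{3}[0-9X]', re.IGNORECASE)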
Example #17
def pending_in_holding_pen(obj, eng):
    """Return ``True`` if the record is already present in the Holding Pen.

    Uses a custom configuration of the ``inspire-matcher`` to find duplicates
    of the current workflow object in the Holding Pen.

    Also sets ``holdingpen_matches`` in ``extra_data`` to the list of ids that
    matched.

    Arguments:
        obj: a workflow object.
        eng: a workflow engine.

    Returns:
        bool: ``True`` if the workflow object has a duplicate in the Holding
        Pen, ``False`` otherwise.

    """
    config = {
        'algorithm': [
            {
                'queries': [
                    {
                        'path': 'arxiv_eprints.value',
                        'search_path': 'metadata.arxiv_eprints.value.raw',
                        'type': 'exact',
                    },
                    {
                        'path': 'dois.value',
                        'search_path': 'metadata.dois.value.raw',
                        'type': 'exact',
                    },
                ],
            },
        ],
        'doc_type': 'hep',
        'index': 'holdingpen-hep',
    }

    matches = dedupe_list(match(obj.data, config))
    holdingpen_ids = [
        int(el['_id']) for el in matches if int(el['_id']) != obj.id
    ]
    if holdingpen_ids:
        obj.extra_data['holdingpen_matches'] = holdingpen_ids
        return True

    return False
Example #18
def duplicated_validator(property_name, property_value):
    def _is_not_deleted(base_record, match_result):
        return not get_value(match_result, '_source.deleted', default=False)

    config = {
        'algorithm': [
            {
                'queries': [
                    {
                        'path': 'arxiv_id',
                        'search_path': 'arxiv_eprints.value.raw',
                        'type': 'exact',
                    },
                    {
                        'path': 'doi',
                        'search_path': 'dois.value.raw',
                        'type': 'exact',
                    },
                ],
                'validator': _is_not_deleted,
            },
        ],
        'doc_type': 'hep',
        'index': 'records-hep',
    }

    if property_name == 'arXiv ID':
        data = {
            'arxiv_id': property_value,
        }
    if property_name == 'DOI':
        data = {
            'doi': property_value,
        }

    matches = dedupe_list(match(data, config))
    matched_ids = [int(el['_source']['control_number']) for el in matches]
    if matched_ids:
        url = url_for(
            'invenio_records_ui.literature',
            pid_value=matched_ids[0],
        )
        raise ValidationError(
            'There exists already an item with the same %s. '
            '<a target="_blank" href="%s">See the record.</a>' %
            (property_name, url))
Example #19
def dedupe_all_lists(obj):
    """Recursively remove duplucates from all lists."""
    squared_dedupe_len = 10
    if isinstance(obj, dict):
        new_obj = {}
        for key, value in obj.items():
            new_obj[key] = dedupe_all_lists(value)
        return new_obj
    elif isinstance(obj, (list, tuple, set)):
        new_elements = [dedupe_all_lists(v) for v in obj]
        if len(new_elements) < squared_dedupe_len:
            new_obj = dedupe_list(new_elements)
        else:
            new_obj = dedupe_list_of_dicts(new_elements)
        return type(obj)(new_obj)
    else:
        return obj
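``dedupe_list_of_dicts`` is the faster path taken once a list reaches ``squared_dedupe_len`` elements, since ``dedupe_list`` compares every pair of elements. A sketch of a compatible linear-time implementation, assuming the elements are JSON-serializable:

import json

def dedupe_list_of_dicts(elements):
    """Remove duplicate dicts in O(n), keeping the first occurrence."""
    seen = set()
    deduped = []
    for element in elements:
        fingerprint = json.dumps(element, sort_keys=True)
        if fingerprint not in seen:
            seen.add(fingerprint)
            deduped.append(element)
    return deduped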
Example #20
    def _get_affiliations(value):
        result = []

        u_values = force_list(value.get('u'))
        z_values = force_list(value.get('z'))

        # XXX: we zip only when they have the same length, otherwise
        #      we might match a value with the wrong recid.
        if len(u_values) == len(z_values):
            for u_value, z_value in zip(u_values, z_values):
                result.append({
                    'record': get_record_ref(z_value, 'institutions'),
                    'value': u_value,
                })
        else:
            for u_value in u_values:
                result.append({'value': u_value})

        return dedupe_list(result)
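A hypothetical input for the helper above, with ``u`` holding affiliation names and ``z`` the corresponding institution recids (both made up). The values are zipped only because the lengths match; with mismatched lengths, only the bare affiliation values would be returned.

value = {'u': ['CERN', 'DESY'], 'z': ['902725', '902770']}
_get_affiliations(value)
# -> [{'record': <ref to institutions/902725>, 'value': 'CERN'},
#     {'record': <ref to institutions/902770>, 'value': 'DESY'}]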
Example #21
def match_reference_with_config(reference,
                                config,
                                previous_matched_recid=None):
    """Match a reference using inspire-matcher given the config.

    Args:
        reference (dict): the metadata of the reference.
        config (dict): the list of inspire-matcher configurations for queries.
        previous_matched_recid (int): the record id of the last matched
            reference from the list of references.

    Returns:
        dict: the matched reference.
    """
    # XXX: avoid this type casting.
    try:
        reference['reference']['publication_info']['year'] = str(
            reference['reference']['publication_info']['year'])
    except KeyError:
        pass

    matched_recids = [
        matched_record['_source']['control_number']
        for matched_record in match(reference, config)
    ]
    matched_recids = dedupe_list(matched_recids)

    same_as_previous = any(matched_recid == previous_matched_recid
                           for matched_recid in matched_recids)
    if len(matched_recids) == 1:
        _add_match_to_reference(reference, matched_recids[0], config['index'])
    elif same_as_previous:
        _add_match_to_reference(reference, previous_matched_recid,
                                config['index'])

    # XXX: avoid this type casting.
    try:
        reference['reference']['publication_info']['year'] = int(
            reference['reference']['publication_info']['year'])
    except KeyError:
        pass

    return reference
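``_add_match_to_reference`` is not shown here; a plausible sketch, assuming it attaches the matched record as a JSON reference and picks the endpoint from the index the match came from:

def _add_match_to_reference(reference, matched_recid, es_index):
    # Hypothetical mapping from ES index to record endpoint.
    endpoint = 'data' if es_index == 'records-data' else 'literature'
    reference['record'] = get_record_ref(matched_recid, endpoint)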
Example #22
def _pending_in_holding_pen(obj, validation_func):
    """Return the list of matching workflows in the holdingpen.

    Matches the holdingpen records by their ``arxiv_eprint``, their ``doi``,
    and by a custom validator function.

    Args:
        obj: a workflow object.
        validation_func: a function used to filter the matched records.

    Returns:
        list: the ids matching the current ``obj`` that satisfy
        ``validation_func``.

    """
    config = {
        'algorithm': [
            {
                'queries': [
                    {
                        'path': 'arxiv_eprints.value',
                        'search_path': 'metadata.arxiv_eprints.value.raw',
                        'type': 'exact',
                    },
                    {
                        'path': 'dois.value',
                        'search_path': 'metadata.dois.value.raw',
                        'type': 'exact',
                    },
                ],
                'validator': validation_func,
            },
        ],
        'doc_type': 'hep',
        'index': 'holdingpen-hep',
    }
    matches = dedupe_list(match(obj.data, config))
    return [int(el['_id']) for el in matches if int(el['_id']) != obj.id]
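A hypothetical ``validation_func`` to illustrate the contract: it receives the workflow payload and a single Elasticsearch hit, and returns whether the hit should count as pending.

def _not_completed(base_record, match_result):
    # Hypothetical filter: ignore workflows that already ran to completion.
    return get_value(match_result, '_source._workflow.status') != 'COMPLETED'

pending_ids = _pending_in_holding_pen(obj, _not_completed)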
Example #23
def exact_match(obj, eng):
    """Return ``True`` if the record is already present in the system.

    Uses the default configuration of the ``inspire-matcher`` to find
    duplicates of the current workflow object in the system.

    Also sets the ``matches.exact`` property in ``extra_data`` to the list of
    control numbers that matched.

    Arguments:
        obj: a workflow object.
        eng: a workflow engine.

    Returns:
        bool: ``True`` if the workflow object has a duplicate in the system
        ``False`` otherwise.

    """
    exact_match_config = current_app.config['EXACT_MATCH']
    matches = dedupe_list(match(obj.data, exact_match_config))
    record_ids = [el['_source']['control_number'] for el in matches]
    obj.extra_data.setdefault('matches', {})['exact'] = record_ids
    return bool(record_ids)
Example #24
    def _get_raw_affiliations(value):
        return dedupe_list([{
            'value': el
        } for el in force_list(value.get('v'))])
Example #25
def remove_duplicates_from_list(l):
    return dedupe_list(l)