Ejemplo n.º 1
0
def map_refextract_to_schema(extracted_references, source=None):
    """Build schema reference records from refextract output.

    A fresh ``ReferenceBuilder`` is created for every extracted reference.
    For each known refextract field, every truthy value found under that
    field is handed to the matching builder method. URLs collected by the
    builder are deduplicated before the built object is stored.

    Args:
        extracted_references: iterable of refextract results; each item
            must support ``.get(field)`` (presumably dicts -- confirm
            against the refextract output format).
        source: forwarded as the ``source`` keyword when registering the
            raw reference text.

    Returns:
        list: the ``obj`` of each builder, in input order.
    """
    records = []

    for extracted in extracted_references:
        builder = ReferenceBuilder()

        def add_raw_reference(raw_ref):
            # Raw references additionally carry the caller-supplied source.
            return builder.add_raw_reference(raw_ref, source=source)

        # Order matters: fields are fed to the builder in this sequence.
        handlers = (
            ('author', builder.add_refextract_authors_str),
            ('collaboration', builder.add_collaboration),
            ('doi', builder.add_uid),
            ('hdl', builder.add_uid),
            ('isbn', builder.add_uid),
            ('journal_reference', builder.set_pubnote),
            ('linemarker', builder.set_label),
            ('misc', builder.add_misc),
            ('publisher', builder.set_publisher),
            ('raw_ref', add_raw_reference),
            ('reportnumber', builder.add_report_number),
            ('texkey', builder.set_texkey),
            ('title', builder.add_title),
            ('url', builder.add_url),
            ('year', builder.set_year),
        )

        for key, handler in handlers:
            values = force_list(extracted.get(key))
            for value in values:
                if not value:
                    # Empty/falsy entries are never passed to the builder.
                    continue
                handler(value)

        if get_value(builder.obj, 'reference.urls'):
            unique_urls = dedupe_list_of_dicts(builder.obj['reference']['urls'])
            builder.obj['reference']['urls'] = unique_urls

        records.append(builder.obj)

    return records
Ejemplo n.º 2
0
def dedupe_all_lists(obj, exclude_keys=()):
    """Recursively remove duplicates from all lists.

    Dicts are rebuilt key by key. Lists, tuples and sets have their
    elements processed first, are then deduplicated, and are returned as
    the same container type as the input. Any other value is returned
    unchanged.

    Values stored under a key listed in ``exclude_keys`` are kept exactly
    as they are. The exclusion applies at every nesting depth, not only to
    the keys of the outermost dict.

    Args:
        obj: collection to deduplicate
        exclude_keys (Container[str]): key names to ignore for deduplication

    Returns:
        The deduplicated structure. Processed dicts and sequences are new
        objects; excluded values and non-container values are returned
        as-is.
    """
    # Sequences shorter than this go through ``dedupe_list``; longer ones go
    # through ``dedupe_list_of_dicts``.
    # NOTE(review): the name suggests ``dedupe_list`` is quadratic and only
    # acceptable for short inputs -- confirm against the helpers.
    squared_dedupe_len = 10
    if isinstance(obj, dict):
        new_obj = {}
        for key, value in obj.items():
            if key in exclude_keys:
                new_obj[key] = value
            else:
                # Forward exclude_keys so nested dicts honour it as well.
                new_obj[key] = dedupe_all_lists(
                    value, exclude_keys=exclude_keys)
        return new_obj
    elif isinstance(obj, (list, tuple, set)):
        new_elements = [
            dedupe_all_lists(v, exclude_keys=exclude_keys) for v in obj
        ]
        if len(new_elements) < squared_dedupe_len:
            new_obj = dedupe_list(new_elements)
        else:
            new_obj = dedupe_list_of_dicts(new_elements)
        return type(obj)(new_obj)
    else:
        return obj
Ejemplo n.º 3
0
    def references(self):
        """Collect the reference records for this Crossref record.

        Every entry found under the record's ``"reference"`` key is passed
        to ``self.get_reference``; the results are flattened into a single
        list, which is then deduplicated with ``dedupe_list_of_dicts``.

        Returns:
            List[dict]: an array of reference schema records, representing
                the references in the record
        """
        raw_references = force_list(self.record.get("reference"))

        collected = []
        for raw_reference in raw_references:
            # get_reference yields zero or more records per raw entry.
            collected.extend(self.get_reference(raw_reference))

        return dedupe_list_of_dicts(collected)
Ejemplo n.º 4
0
def dedupe_all_lists(obj):
    """Recursively remove duplicates from every list found in ``obj``.

    Dicts are rebuilt with each value processed recursively. Lists, tuples
    and sets have their elements processed first, are then deduplicated,
    and are returned as the same container type as the input. Any other
    value is returned unchanged.
    """
    # Sequences shorter than this go through ``dedupe_list``; longer ones go
    # through ``dedupe_list_of_dicts``.
    # NOTE(review): presumably because ``dedupe_list`` is quadratic --
    # confirm against the helpers.
    short_sequence_limit = 10

    if isinstance(obj, dict):
        return {key: dedupe_all_lists(value) for key, value in obj.items()}

    if not isinstance(obj, (list, tuple, set)):
        return obj

    cleaned = [dedupe_all_lists(item) for item in obj]
    if len(cleaned) >= short_sequence_limit:
        unique = dedupe_list_of_dicts(cleaned)
    else:
        unique = dedupe_list(cleaned)
    return type(obj)(unique)
Ejemplo n.º 5
0
def test_dedupe_list_of_dicts():
    """A repeated dict is dropped and first-occurrence order is kept."""
    first = {'a': 123, 'b': 1234}
    second = {'a': 3222, 'b': 1234}
    # The third entry is an equal but distinct dict object.
    with_duplicates = [first, second, dict(first)]

    deduped = dedupe_list_of_dicts(with_duplicates)

    assert [{'a': 123, 'b': 1234}, {'a': 3222, 'b': 1234}] == deduped
Ejemplo n.º 6
0
def map_refextract_to_schema(extracted_references, source=None):
    """Build schema reference records from refextract output.

    A fresh ``ReferenceBuilder`` is created for every extracted reference.
    For each known refextract field, every truthy value found under that
    field is handed to the matching builder method. URLs collected by the
    builder are deduplicated, then the built object is stored, followed by
    whatever ``pop_additional_pubnotes()`` returns for that builder.

    Args:
        extracted_references: iterable of refextract results; each item
            must support ``.get(field)`` (presumably dicts -- confirm
            against the refextract output format).
        source: forwarded as the ``source`` keyword when registering the
            raw reference text.

    Returns:
        list: for each input reference, the builder's ``obj`` followed by
        its additional pubnote entries, in input order.
    """
    records = []

    for extracted in extracted_references:
        builder = ReferenceBuilder()

        def add_raw_reference(raw_ref):
            # Raw references additionally carry the caller-supplied source.
            return builder.add_raw_reference(raw_ref, source=source)

        # Order matters: fields are fed to the builder in this sequence.
        handlers = (
            ("author", builder.add_refextract_authors_str),
            ("collaboration", builder.add_collaboration),
            ("doi", builder.add_uid),
            ("hdl", builder.add_uid),
            ("isbn", builder.add_uid),
            ("journal_reference", builder.set_pubnote),
            ("linemarker", builder.set_label),
            ("misc", builder.add_misc),
            ("publisher", builder.set_publisher),
            ("raw_ref", add_raw_reference),
            ("reportnumber", builder.add_report_number),
            ("texkey", builder.set_texkey),
            ("title", builder.add_title),
            ("url", builder.add_url),
            ("year", builder.set_year),
        )

        for key, handler in handlers:
            values = force_list(extracted.get(key))
            for value in values:
                if not value:
                    # Empty/falsy entries are never passed to the builder.
                    continue
                handler(value)

        if get_value(builder.obj, "reference.urls"):
            unique_urls = dedupe_list_of_dicts(builder.obj["reference"]["urls"])
            builder.obj["reference"]["urls"] = unique_urls

        records.append(builder.obj)
        records.extend(builder.pop_additional_pubnotes())

    return records
Ejemplo n.º 7
0
def map_refextract_to_schema(extracted_references, source=None):
    """Translate refextract results into schema reference records.

    Each extracted reference gets its own ``ReferenceBuilder``. The values
    stored under every supported refextract field are normalised with
    ``force_list`` and each truthy one is passed to the builder method
    registered for that field. Once all fields are consumed, the builder's
    URLs are deduplicated, and the built object plus the result of
    ``pop_additional_pubnotes()`` are added to the output.

    Args:
        extracted_references: iterable of refextract results; each item
            must support ``.get(field)`` (presumably dicts -- confirm
            against the refextract output format).
        source: forwarded as the ``source`` keyword when registering the
            raw reference text.

    Returns:
        list: for each input reference, the builder's ``obj`` followed by
        its additional pubnote entries, in input order.
    """
    output = []

    for raw in extracted_references:
        builder = ReferenceBuilder()

        def register_raw_reference(raw_ref):
            # The raw text is the only field that also records the source.
            return builder.add_raw_reference(raw_ref, source=source)

        # Fields are applied in this exact sequence.
        field_handlers = (
            ('author', builder.add_refextract_authors_str),
            ('collaboration', builder.add_collaboration),
            ('doi', builder.add_uid),
            ('hdl', builder.add_uid),
            ('isbn', builder.add_uid),
            ('journal_reference', builder.set_pubnote),
            ('linemarker', builder.set_label),
            ('misc', builder.add_misc),
            ('publisher', builder.set_publisher),
            ('raw_ref', register_raw_reference),
            ('reportnumber', builder.add_report_number),
            ('texkey', builder.set_texkey),
            ('title', builder.add_title),
            ('url', builder.add_url),
            ('year', builder.set_year),
        )

        for name, apply_value in field_handlers:
            candidates = force_list(raw.get(name))
            for candidate in candidates:
                if candidate:
                    apply_value(candidate)

        has_urls = get_value(builder.obj, 'reference.urls')
        if has_urls:
            deduped_urls = dedupe_list_of_dicts(builder.obj['reference']['urls'])
            builder.obj['reference']['urls'] = deduped_urls

        output.append(builder.obj)
        output.extend(builder.pop_additional_pubnotes())

    return output