Esempio n. 1
0
    def serialize_object(self, obj):
        """Dump one record through the schema and render it with simpledc.

        :param obj: Record instance
        :returns: The result of ``simpledc.tostring`` for the dumped record.
        """
        schema = self._schema_cls()
        dumped = schema.dump(obj)
        return simpledc.tostring(dumped)
Esempio n. 2
0
    def serialize(self, pid, record, links_factory=None):
        """Transform one record and render the result with simpledc.

        :param pid: Persistent identifier instance.
        :param record: Record instance.
        :param links_factory: Factory function for record links.
        :returns: The result of ``simpledc.tostring`` for the transformed
            record.
        """
        transformed = self.transform_record(pid, record, links_factory)
        return simpledc.tostring(transformed)
Esempio n. 3
0
    def serialize_object_list(self, obj_list):
        """Render every record found under ``obj_list["hits"]["hits"]``.

        Each record is dumped through the schema, rendered on its own with
        ``simpledc.tostring``, and the resulting documents are joined with
        newlines.  A missing ``"hits"`` level is treated as no records.

        :param obj_list: Mapping holding the records under ``hits.hits``
        """
        hits = obj_list.get("hits", {}).get("hits", [])
        dumped_records = self._schema_cls().dump(hits, many=True)
        # TODO: multiple records should be wrapped in a single root tag.
        documents = [simpledc.tostring(entry) for entry in dumped_records]
        return "\n".join(documents)
Esempio n. 4
0
    def serialize(self, pid, record, links_factory=None):
        """Render a single record, after transformation, via simpledc.

        :param pid: Persistent identifier instance.
        :param record: Record instance.
        :param links_factory: Factory function for record links.
        :returns: Output of ``simpledc.tostring`` for the transformed record.
        """
        dc_data = self.transform_record(pid, record, links_factory)
        rendered = simpledc.tostring(dc_data)
        return rendered
Esempio n. 5
0
def dictionary2xml(dictionary):
    """Create an XML string and a file name from a dictionary.

    Every value is wrapped in a one-element list before the mapping is
    handed to ``simpledc.tostring``.  The wrapping is done on a copy, so the
    caller's dictionary is left unchanged.

    :param dictionary: mapping of field names to single values; it must
        contain an ``'identifiers'`` entry holding a string, which becomes
        the stem of the file name.
    :return: tuple ``(xml, file_name)`` where ``file_name`` is the identifier
        followed by ``'.xml'``.
    :raises KeyError: if ``dictionary`` has no ``'identifiers'`` key.
    """
    file_name = dictionary['identifiers'] + '.xml'

    # Build a fresh mapping instead of rewriting the argument in place.
    wrapped = {key: [value] for key, value in dictionary.items()}
    xml = simpledc.tostring(wrapped)

    return xml, file_name
Esempio n. 6
0
def test_elements():
    """Check that simpledc maps every plural key to its singular element."""
    elements = [
        ('contributors', 'contributor'),
        ('coverage', 'coverage'),
        ('creators', 'creator'),
        ('dates', 'date'),
        ('descriptions', 'description'),
        ('formats', 'format'),
        ('identifiers', 'identifier'),
        ('languages', 'language'),
        ('publishers', 'publisher'),
        ('relations', 'relation'),
        ('rights', 'rights'),
        ('sources', 'source'),
        ('subjects', 'subject'),
        ('titles', 'title'),
        ('types', 'type'),
    ]
    child_path = '/oai_dc:dc/dc:{0}'
    any_path = '//dc:{0}'

    def select(tree, template, name):
        """Run the XPath built from *template* for element *name*."""
        return tree.xpath(template.format(name), namespaces=simpledc.ns)

    # Each element on its own.
    for plural, singular in elements:
        # Two values must come back as two elements, in the same order.
        populated = simpledc.dump_etree({plural: ['value 1', 'value 2']})
        found = select(populated, child_path, singular)
        assert len(found) == 2, singular
        assert found[0].text == 'value 1'
        assert found[1].text == 'value 2'

        # An empty list must not produce any element.
        empty = simpledc.dump_etree({plural: []})
        assert len(select(empty, any_path, singular)) == 0, singular

    # All elements in one document.
    data = {plural: ['test 1', 'test 2'] for plural, _ in elements}
    combined = simpledc.dump_etree(data)
    for _, singular in elements:
        found = select(combined, child_path, singular)
        assert len(found) == 2, singular
        assert found[0].text == 'test 1'
        assert found[1].text == 'test 2'

    # The string output must contain an opening tag for every element.
    xml = simpledc.tostring(data)
    for _, singular in elements:
        assert '<dc:{0}>'.format(singular) in xml
Esempio n. 7
0
def test_elements():
    """Exercise dump_etree and tostring for all simple DC elements."""
    elements = [
        ('contributors', 'contributor'),
        ('coverage', 'coverage'),
        ('creators', 'creator'),
        ('dates', 'date'),
        ('descriptions', 'description'),
        ('formats', 'format'),
        ('identifiers', 'identifier'),
        ('languages', 'language'),
        ('publishers', 'publisher'),
        ('relations', 'relation'),
        ('rights', 'rights'),
        ('sources', 'source'),
        ('subjects', 'subject'),
        ('titles', 'title'),
        ('types', 'type'),
    ]
    namespaces = simpledc.ns

    # One element at a time: first with two values, then with none.
    for plural, singular in elements:
        root_query = '/oai_dc:dc/dc:{0}'.format(singular)
        two_values = simpledc.dump_etree({plural: ['value 1', 'value 2']})
        matches = two_values.xpath(root_query, namespaces=namespaces)
        assert len(matches) == 2, singular
        first, second = matches
        assert first.text == 'value 1'
        assert second.text == 'value 2'

        no_values = simpledc.dump_etree({plural: []})
        leftovers = no_values.xpath(
            '//dc:{0}'.format(singular), namespaces=namespaces)
        assert len(leftovers) == 0, singular

    # Every element populated in the same document.
    data = {}
    for plural, _singular in elements:
        data[plural] = ['test 1', 'test 2']

    tree = simpledc.dump_etree(data)
    singulars = [singular for _plural, singular in elements]
    for singular in singulars:
        matches = tree.xpath(
            '/oai_dc:dc/dc:{0}'.format(singular), namespaces=namespaces)
        assert len(matches) == 2, singular
        first, second = matches
        assert first.text == 'test 1'
        assert second.text == 'test 2'

    # Serialized form must mention each singular tag.
    xml = simpledc.tostring(data)
    for singular in singulars:
        assert '<dc:{0}>'.format(singular) in xml
Esempio n. 8
0
    def serialize_search(self, pid_fetcher, search_result, links=None,
                         item_links_factory=None):
        """Render every hit of a search result and join them with newlines.

        :param pid_fetcher: Persistent identifier fetcher; called with the
            hit's ``_id`` and ``_source``.
        :param search_result: Elasticsearch search result.
        :param links: Dictionary of links to add to response (not used by
            this method).
        :param item_links_factory: Passed to ``transform_search_hit`` as
            ``links_factory`` for each hit.
        """
        def render(hit):
            """Transform a single hit and render it with simpledc."""
            pid = pid_fetcher(hit['_id'], hit['_source'])
            transformed = self.transform_search_hit(
                pid, hit, links_factory=item_links_factory)
            return simpledc.tostring(transformed)

        return "\n".join([render(hit) for hit in search_result['hits']['hits']])
Esempio n. 9
0
    def serialize_search(self, pid_fetcher, search_result, links=None,
                         item_links_factory=None):
        """Serialize each search hit separately, one document per hit.

        :param pid_fetcher: Persistent identifier fetcher.
        :param search_result: Elasticsearch search result.
        :param links: Dictionary of links to add to response (not read here).
        :param item_links_factory: Links factory forwarded to
            ``transform_search_hit`` for every hit.
        :returns: The rendered hits joined by newline characters.
        """
        hits = search_result['hits']['hits']
        documents = []
        for hit in hits:
            pid = pid_fetcher(hit['_id'], hit['_source'])
            dc_data = self.transform_search_hit(
                pid,
                hit,
                links_factory=item_links_factory,
            )
            documents.append(simpledc.tostring(dc_data))

        return "\n".join(documents)
Esempio n. 10
0
def generate_dublin_core(pubrecord):
    """
    Build a simple dublin core XML record from a publication record.

    Authors are listed before editors; the first person becomes the creator
    and any remaining ones become contributors.  The first line of the
    ``simpledc.tostring`` output (presumably the XML declaration) is dropped.

    :param pubrecord: mapping describing the publication; must contain
        ``pubrecord['publicationType']['text']``
    :return: dublin core XML record
    """
    authors = pubrecord.get('authorsList')
    editors = pubrecord.get('editorsList')
    if authors and editors:
        people = authors + editors
    else:
        # At most one of the two is non-empty here; None when neither is.
        people = authors or editors or None

    data = {
        "dates": [pubrecord.get('publicationYear')],
        "descriptions": [pubrecord.get('docAbstract')],
        "formats": ['application/pdf'],
        "identifiers": [pubrecord.get('doi')],
        "languages": ['en'],
        "publishers": [pubrecord.get('publisher')],
        "titles": [pubrecord.get('title')],
    }

    if people:
        data["creators"] = [people[0]]
        if len(people) >= 2:
            data["contributors"] = people[1:]

    # Publication type label -> dublin core type; anything else is 'text'.
    type_names = (
        ('Book chapter', 'chapter'),
        ('Book', 'book'),
        ('Article', 'article'),
        ('Report', 'reports'),
    )
    publication_type = pubrecord['publicationType']['text']
    data['types'] = [next(
        (dc_type for label, dc_type in type_names
         if publication_type == label),
        'text',
    )]

    lines = simpledc.tostring(data).splitlines()
    return '\n'.join(lines[1:])
Esempio n. 11
0
def generate_dublin_core(pubrecord):
    """
    Turn a publication record into a simple dublin core XML record.

    The creator is the first entry of the authors list followed by the
    editors list; every later entry is emitted as a contributor.  The
    returned text is the ``simpledc.tostring`` output without its first line.

    :param pubrecord: mapping with the publication fields; the key path
        ``['publicationType']['text']`` is required
    :return: dublin core XML record
    """
    authors = pubrecord.get('authorsList')
    editors = pubrecord.get('editorsList')
    if authors and editors:
        everyone = authors + editors
    elif authors:
        everyone = authors
    else:
        # Falls back to None when the editors list is missing or empty too.
        everyone = editors or None

    data = {
        "dates": [pubrecord.get('publicationYear')],
        "descriptions": [pubrecord.get('docAbstract')],
        "formats": ['application/pdf'],
        "identifiers": [pubrecord.get('doi')],
        "languages": ['en'],
        "publishers": [pubrecord.get('publisher')],
        "titles": [pubrecord.get('title')],
    }

    if everyone:
        creator, others = everyone[0], everyone[1:]
        data["creators"] = [creator]
        if len(everyone) >= 2:
            data["contributors"] = others

    kind = pubrecord['publicationType']['text']
    for label, dc_type in (
        ('Book chapter', 'chapter'),
        ('Book', 'book'),
        ('Article', 'article'),
        ('Report', 'reports'),
    ):
        if kind == label:
            data['types'] = [dc_type]
            break
    else:
        # Unrecognised publication types are recorded as plain text.
        data['types'] = ['text']

    rendered = simpledc.tostring(data)
    return '\n'.join(rendered.splitlines()[1:])
Esempio n. 12
0
# Make sure the output directory exists before any file is written.
os.makedirs("xml", exist_ok=True)
# df is the name we use to store a DataFrame;
# the rows of the csv file are read into it.
df = pd.read_csv('dc_sample.csv')

# What does the data look like? Do we need to do anything to it?
# How can we stop the code after looking at the data?

# Empty values are read in as NaN, which can be difficult to work with,
# so every empty value is replaced by an empty string.
df = df.fillna('')

# Let's look at ist0977.xml: what do we notice about the subject item?
# How might we address this?
# Let's look at all the rows that have a comma in subject.
# What do we learn?
# How might we address the larger issue?
# Could we fix the data in the file using python?

list_of_dicts = df.to_dict(orient='records')

for record in list_of_dicts:
    # Wrap every value of the row in a one-element list, in place,
    # before the row is handed to simpledc.
    for field in record:
        record[field] = [record[field]]

    xml = simpledc.tostring(record)
    fn = record['identifiers'][0]

    # One output file per row, named after the row's identifier.
    target = 'xml/' + fn + '.xml'
    with open(target, mode='w', encoding='utf8') as fp:
        fp.write(xml)