Ejemplo n.º 1
0
 def metadata(self):
     """Assemble the Dublin Core field map for this document and wrap it
     in a common.Metadata object (no XML element is attached)."""
     fields = {
         "title": self._title(),
         "creator": self._creators(),
         "subject": self._subject(),
         "description": self._description(),
         "publisher": self._publisher(),
         "date": self._date(),
         "type": [fetch_pubtype_from_vocabulary(self.data.get("type"))],
         "format": ["text/html"],
         "identifier": [
             self._context["url_for_html"](
                 acron=self.data["journal_acron"],
                 doc_id=self.data["doc_id"],
             ),
         ],
         "source": [],
         "language": self._language(),
         "relation": self._relation(),
         "rights": ["info:eu-repo/semantics/openAccess"],
     }
     return common.Metadata(None, fields)
Ejemplo n.º 2
0
    def __call__(self, element):
        """Extract the configured fields from *element* via XPath and
        return them wrapped in a common.Metadata object.

        Each entry of self._fields maps a field name to a
        (field_type, xpath_expression) pair; supported types are 'bytes',
        'bytesList', 'text' and 'textList'.

        Raises Error for an unknown field type.
        """
        field_map = {}  # renamed from "map" to avoid shadowing the builtin
        # One XPathEvaluator per element so every expression shares the
        # same namespace bindings.
        xpath_evaluator = etree.XPathEvaluator(element,
                                               namespaces=self._namespaces)
        evaluate = xpath_evaluator.evaluate
        for field_name, (field_type, expr) in self._fields.items():
            if field_type == 'bytes':
                value = str(evaluate(expr))
            elif field_type == 'bytesList':
                value = [str(item) for item in evaluate(expr)]
            elif field_type == 'text':
                # Coerce lxml.etree._ElementUnicodeResult objects into
                # plain unicode strings.
                value = unicode(evaluate(expr))
            elif field_type == 'textList':
                # Same coercion as above, element-wise.
                value = [unicode(v) for v in evaluate(expr)]
            else:
                # Parenthesized raise is valid on both Python 2 and 3;
                # the original "raise Error, msg" form is Python-2-only.
                raise Error("Unknown field type: %s" % field_type)
            field_map[field_name] = value
        return common.Metadata(element, field_map)
Ejemplo n.º 3
0
 def add_record(result, so, resource_sets):
     """Append an OAI (header, metadata, about) tuple for storage object
     *so* to *result*, if its datestamp falls within the requested range.

     Relies on from_, until and metadataPrefix from the enclosing scope.
     Raises CannotDisseminateFormatError for an unsupported prefix.
     """
     header = common.Header(so.identifier, so.modified, resource_sets, so.deleted)
     # Guard clause: silently skip records outside the requested window.
     if not datestampInRange(header, from_, until):
         return
     if metadataPrefix == 'metashare':
         map_ = MetashareMap().getMap(so.metadata)
     elif metadataPrefix == 'olac':
         map_ = OlacMap().getMap(so.metadata)
     elif metadataPrefix == 'cmdi':
         resource = so.resourceinfotype_model_set.all()[0]
         identifiers = resource.identificationInfo.identifier
         doi = identifiers[0] if identifiers else ""
         map_ = _add_elements_to_cmdi_metadata(
             CmdiMap().getMap(so.metadata),
             resource.pk,
             so.identifier,
             'https://doi.org/' + doi,
         )
     else:
         # Parenthesized raise is valid on both Python 2 and 3; the
         # original "raise X, msg" form is Python-2-only.
         raise CannotDisseminateFormatError(
             '%s metadata format is not supported' % metadataPrefix)
     result.append((header, common.Metadata(map_), None))
Ejemplo n.º 4
0
 def record_for_book(self, book, headers_only=False):
     """Return the OAI record for *book*: just the header when
     headers_only is True, otherwise a (header, metadata, about) tuple.

     *book* must be a Book or a Deleted instance; any other type raises
     TypeError (the original fell through and failed with a NameError on
     the unassigned "header").
     """
     meta = None
     about = None
     identifier = self.slug_to_identifier(book.slug)
     if isinstance(book, Book):
         header = common.Header(identifier, book.changed_at, [], False)
         if not headers_only:
             meta = common.Metadata(self.metadata(book))
     elif isinstance(book, Deleted):
         # Deleted books keep a header (with the deleted flag) and empty
         # metadata.
         header = common.Header(identifier, book.deleted_at, [], True)
         if not headers_only:
             meta = common.Metadata({})
     else:
         raise TypeError("Unsupported record type: %r" % type(book))
     if headers_only:
         return header
     return header, meta, about
Ejemplo n.º 5
0
    def __init__(self):
        """Populate self._data with one fake record per month of 2005."""
        records = []
        for month in range(1, 13):
            idx = month - 1
            # All records land on the first of the month at 12:30:00.
            stamp = datetime(2005, month, 1, 12, 30, 0)
            header = common.Header(str(idx), stamp, '', False)
            metadata = common.Metadata({'title': ['Title %s' % idx]})
            records.append((header, metadata, None))
        self._data = records
Ejemplo n.º 6
0
def createFakeData():
    """Return 100 fake (header, metadata, about) records whose datestamps
    are spread deterministically across the year 2004."""
    records = []
    for idx in range(100):
        stamp = datetime(2004,
                         idx % 12 + 1,   # month
                         idx % 28 + 1,   # day
                         idx % 24,       # hour
                         idx % 60,       # minute
                         idx % 60)       # second
        header = common.Header(str(idx), stamp, '', False)
        metadata = common.Metadata({'title': ['Title %s' % idx]})
        records.append((header, metadata, None))
    return records
Ejemplo n.º 7
0
    def _record_for_dataset(self, dataset, set_spec):
        '''Return an OAI (header, metadata, about) tuple for this dataset.

        Builds a Dublin-Core-style field map from the CKAN package dict,
        merges in dataset.extras, then list-wraps scalar values.
        NOTE(review): relies on Python 2 semantics (list + dict.items()
        concatenation below) — not portable to Python 3 as written.
        '''
        package = get_action('package_show')({}, {'id': dataset.id})

        # Coverage is the geographic names plus one "begin/end" temporal span.
        coverage = []
        temporal_begin = package.get('temporal_coverage_begin', '')
        temporal_end = package.get('temporal_coverage_end', '')

        geographic = package.get('geographic_coverage', '')
        if geographic:
            coverage.extend(geographic.split(','))
        if temporal_begin or temporal_end:
            coverage.append("%s/%s" % (temporal_begin, temporal_end))

        # Collect every PID, then the package id, then the site's read URL.
        # NOTE(review): the default {} for 'pids' iterates dict keys if a
        # dict is ever stored there; presumably 'pids' is a list of dicts —
        # verify against the harvester.
        pids = [
            pid.get('id') for pid in package.get('pids', {})
            if pid.get('id', False)
        ]
        pids.append(package.get('id'))
        pids.append(
            config.get('ckan.site_url') +
            url_for(controller="package", action='read', id=package['name']))

        meta = {#'title': self._get_json_content(package.get('title', None) or package.get('name')),
                'identifier': pids,
                'type': ['dataset'],
                'language': [l.strip() for l in package.get('language').split(",")] if package.get('language', None) else None,
                'description': self._get_json_content(package.get('notes')) if package.get('notes', None) else None,
                'subject': [tag.get('display_name') for tag in package['tags']] if package.get('tags', None) else None,
                'date': [dataset.metadata_created.strftime('%Y-%m-%d')] if dataset.metadata_created else None,
                'rights': [package['license_title']] if package.get('license_title', None) else None,
                'coverage': coverage if coverage else [], }

        # Python 2 only: dict.items() returns a list here.  Because
        # meta.items() comes second, meta values win over extras on
        # duplicate keys.
        iters = dataset.extras.items()
        meta = dict(iters + meta.items())
        metadata = {}
        # Fixes the bug on having a large dataset being scrambled to individual
        # letters: wrap every scalar in a one-element list so downstream
        # writers don't iterate strings character by character.
        for key, value in meta.items():
            if not isinstance(value, list):
                metadata[str(key)] = [value]
            else:
                metadata[str(key)] = value
        return (common.Header('', dataset.id, dataset.metadata_created,
                              set_spec,
                              False), common.Metadata('', metadata), None)
Ejemplo n.º 8
0
 def getRecord(self, metadataPrefix, identifier):
     """
     OAI-PMH verb, GetRecord. See
     http://www.openarchives.org/OAI/openarchivesprotocol.html#GetRecord

     Raises IdDoesNotExistError when no published StorageObject matches
     *identifier*, and CannotDisseminateFormatError for an unsupported
     metadataPrefix.
     """
     self.check_no_multivalue_arg()
     checkMetadataPrefix(metadataPrefix)
     try:
         so = StorageObject.objects.get(identifier=identifier,
                                        publication_status=PUBLISHED)
     except Exception:
         # Narrowed from a bare "except:" so KeyboardInterrupt/SystemExit
         # are no longer swallowed; any lookup failure is reported as a
         # missing identifier.  Parenthesized raise is valid on both
         # Python 2 and 3 (the original "raise X, msg" was Python-2-only).
         raise IdDoesNotExistError(
             'Resource [%s] does not exist' % identifier)
     resource_sets = _get_setSpecs(so)
     header = common.Header(identifier, so.modified, resource_sets, so.deleted)
     if metadataPrefix == 'metashare':
         map_ = MetashareMap().getMap(so.metadata)
     elif metadataPrefix == 'olac':
         map_ = OlacMap().getMap(so.metadata)
     elif metadataPrefix == 'cmdi':
         resource = so.resourceinfotype_model_set.all()[0]
         identifiers = resource.identificationInfo.identifier
         doi = identifiers[0] if identifiers else ""
         map_ = _add_elements_to_cmdi_metadata(
             CmdiMap().getMap(so.metadata),
             resource.pk,
             so.identifier,
             'https://doi.org/' + doi,
         )
     else:
         raise CannotDisseminateFormatError(
             '%s metadata format is not supported' % metadataPrefix)
     return header, common.Metadata(map_), None
Ejemplo n.º 9
0
    def __call__(self, element):
        """Deserialize the JSON payload stored in *element*'s text and
        return it wrapped in a common.Metadata object."""
        payload = json.loads(element.text)
        return common.Metadata(element, payload)
Ejemplo n.º 10
0
    def _record_for_dataset_datacite(self, dataset, set_spec):
        '''Return an OAI (header, metadata, about) tuple for this dataset
        in the DataCite format.
        '''
        package = get_action('package_show')({}, {'id': dataset.id})
        coverage = []
        temporal_begin = package.get('temporal_coverage_begin', '')
        temporal_end = package.get('temporal_coverage_end', '')
        geographic = package.get('geographic_coverage', '')
        if geographic:
            coverage.extend(geographic.split(','))
        if temporal_begin or temporal_end:
            coverage.append("%s/%s" % (temporal_begin, temporal_end))

        # Flatten the extras table ([{'key': ..., 'value': ...}, ...]) into
        # a plain dict.  The original iterated item.iteritems() only to
        # re-read item['key'] / item['value'] on every pass.
        extras = {}
        for item in package['extras']:
            extras[item['key']] = item['value']

        identifiers = self._set_id(package, extras)
        subj = ([tag.get('display_name') for tag in package['tags']]
                if package.get('tags', None) else None)
        # BUGFIX: guard against subj being None (no tags) before appending;
        # the original crashed with AttributeError in that case.  Matches
        # the guard used by the sibling implementation in this file.
        if subj is not None and 'Discipline' in extras:
            subj.append(extras['Discipline'])

        # Authors are a semicolon-separated string; used for both creator
        # and contributor.
        authors = (package['author'].split(";")
                   if 'author' in package else None)

        meta = {
            'identifier': identifiers[0],
            'identifierType': identifiers[1],
            'alternateIdentifier': identifiers[2],
            'alternateIdentifierType': identifiers[3],
            'creator': authors,
            'publisher': extras.get('Publisher'),
            'publicationYear': extras.get('PublicationYear'),
            'publicationTimestamp': extras.get('PublicationTimestamp'),
            'resourceType': extras.get('ResourceType'),
            'language': extras.get('Language'),
            'titles': package.get('title', None) or package.get('name'),
            'contributor': authors,
            'descriptions': (self._get_json_content(package.get('notes'))
                             if package.get('notes', None) else None),
            'subjects': subj,
            'rights': (extras['Rights'].replace(
                'info:eu-repo/semantics/openAccess', '')
                if 'Rights' in extras else None),
            'openAccess': extras.get('OpenAccess'),
            'coverage': coverage if coverage else None,
        }

        metadata = {}
        # Fixes the bug on having a large dataset being scrambled to
        # individual letters: wrap scalars in a one-element list so
        # downstream writers don't iterate strings character by character.
        for key, value in meta.items():
            if value and not isinstance(value, list):
                metadata[str(key)] = [value]
            else:
                metadata[str(key)] = value
        return (common.Header('', dataset.id, dataset.metadata_created,
                              set_spec, False),
                common.Metadata('', metadata), None)
Ejemplo n.º 11
0
def nrd_metadata_reader(xml):
    '''Read metadata in NRD schema

        This function takes NRD metadata as an lxml.etree.Element object,
        and returns the same metadata as a dictionary, with central TTA
        elements picked to format-independent keys.

        :param xml: RDF metadata as XML-encoded NRD
        :type xml: lxml.etree.Element instance
        :returns: a metadata dictionary
        :rtype: a hash from string to any value
        '''
    # Start from the generic RDF mapping, then overlay NRD-specific fields.
    result = rdf_reader(xml).getMap()

    def document_attrs(source, dest, result):
        '''Callback for copying document attributes'''
        copy_element(source + '/dct:title', dest + '/title', result)
        copy_element(source + '/dct:identifier', dest, result)
        copy_element(source + '/dct:creator', dest + '/creator.0/name', result)
        copy_element(source + '/nrd:creator', dest + '/creator', result,
                     person_attrs)
        copy_element(source + '/dct:description', dest + '/description',
                     result)

    def funding_attrs(source, dest, result):
        '''Callback for copying project attributes'''
        copy_element(source + '/rev:arpfo:funds.0/arpfo:grantNumber',
                     dest + '/fundingNumber', result)
        copy_element(source + '/rev:arpfo:funds.0/rev:arpfo:provides',
                     dest + '/funder', result, person_attrs)

    def file_attrs(source, dest, result):
        '''Callback for copying manifestation attributes'''
        copy_element(source + '/dcat:mediaType', dest + '/mimetype', result)
        copy_element(source + '/fp:checksum.0/fp:checksumValue.0',
                     dest + '/checksum.0', result)
        copy_element(source + '/fp:checksum.0/fp:generator.0',
                     dest + '/checksum.0/algorithm', result)
        copy_element(source + '/dcat:byteSize', dest + '/size', result)

    # (source path, destination key, optional per-subtree callback) triples;
    # the callback recurses into structured values (persons, documents,
    # funding, files).
    mapping = [
        (u'dataset', u'versionidentifier', None),
        (u'dataset/nrd:continuityIdentifier', u'continuityidentifier', None),
        (u'dataset/rev:foaf:primaryTopic.0/nrd:metadataIdentifier',
         u'metadata/identifier', None),
        (u'dataset/rev:foaf:primaryTopic.0/nrd:metadataModified',
         u'metadata/modified', None),
        (u'dataset/dct:title', u'title', None),
        (u'dataset/nrd:modified', u'modified', None),
        (u'dataset/nrd:rights', u'rights', None),
        (u'dataset/nrd:language', u'language', None),
        (u'dataset/nrd:owner', u'owner', person_attrs),
        (u'dataset/nrd:creator', u'creator', person_attrs),
        (u'dataset/nrd:distributor', u'distributor', person_attrs),
        (u'dataset/nrd:contributor', u'contributor', person_attrs),
        (u'dataset/nrd:subject', u'subject', None),  # fetch tags?
        (u'dataset/nrd:producerProject', u'project', funding_attrs),
        (u'dataset/dct:isPartOf', u'collection', document_attrs),
        (u'dataset/dct:requires', u'requires', None),
        (u'dataset/nrd:discipline', u'discipline', None),
        (u'dataset/nrd:temporal', u'temporalcoverage', None),
        (u'dataset/nrd:spatial', u'spatialcoverage', None),  # names?
        (u'dataset/nrd:manifestation', u'resource', file_attrs),
        (u'dataset/nrd:observationMatrix', u'variables', None),  # TODO
        (u'dataset/nrd:usedByPublication', u'publication', document_attrs),
        (u'dataset/dct:description', u'description', None),
    ]
    for source, dest, callback in mapping:
        copy_element(source, dest, result, callback)
    # Best-effort: parse the embedded rights XML to classify access.
    # 'licensed' exposes the license, 'contractual' the access URL.
    try:
        rights = lxml.etree.XML(result[u'rights'])
        rightsclass = rights.attrib['RIGHTSCATEGORY'].lower()
        result[u'rightsclass'] = rightsclass
        if rightsclass == 'licensed':
            result[u'license'] = rights[0].text
        if rightsclass == 'contractual':
            result[u'accessURL'] = rights[0].text
    except:
        # NOTE(review): bare except deliberately ignores missing/invalid
        # rights XML, but it also swallows unrelated errors — consider
        # narrowing to (KeyError, lxml.etree.XMLSyntaxError).
        pass
    return oc.Metadata(result)
Ejemplo n.º 12
0
 def __call__(self, metashare_elem):
     """Serialize the first child of *metashare_elem* to pretty-printed
     XML and return it wrapped in a common.Metadata object under the
     "raw_xml" key."""
     serialized = etree.tostring(metashare_elem[0], pretty_print=True)
     return common.Metadata({"raw_xml": serialized})
Ejemplo n.º 13
0
    def _record_for_dataset_b2f(self, dataset, set_spec):
        '''Return an OAI (header, metadata, about) tuple for this dataset
        in the B2FIND format.
        '''
        package = get_action('package_show')({}, {'id': dataset.id})

        # Flatten the extras table ([{'key': ..., 'value': ...}, ...]) into
        # a plain dict.  The original iterated item.iteritems() only to
        # re-read item['key'] / item['value'] on every pass.
        extras = {}
        for item in package['extras']:
            extras[item['key']] = item['value']

        identifiers = self._set_id(package, extras)
        keywords = ([tag.get('display_name') for tag in package['tags']]
                    if package.get('tags', None) else None)

        meta = {
            'community': package.get('group', None),
            'DOI': extras.get('DOI'),
            'PID': extras.get('PID'),
            'version': extras.get('Version'),
            'source': package.get('url', None),
            'relatedIdentifier': extras.get('RelatedIdentifier'),
            'creator': (package['author'].split(";")
                        if 'author' in package else None),
            'publisher': extras.get('Publisher'),
            'contact': extras.get('Contact'),
            'publicationYear': extras.get('PublicationYear'),
            'metadataAccess': extras.get('MetaDataAccess'),
            'resourceType': extras.get('ResourceType'),
            'language': extras.get('Language'),
            'titles': package.get('title', None) or package.get('name'),
            'contributor': extras.get('Contributor'),
            'descriptions': (self._get_json_content(package.get('notes'))
                             if package.get('notes', None) else None),
            'keywords': keywords,
            'disciplines': extras.get('Discipline'),
            'rights': (extras['Rights'].replace(
                'info:eu-repo/semantics/openAccess', '')
                if 'Rights' in extras else None),
            'openAccess': extras.get('OpenAccess'),
            'size': extras.get('Size'),
            'format': extras.get('Format'),
            'spatialCoverage': extras.get('SpatialCoverage'),
            'temporalCoverage': extras.get('TemporalCoverage'),
            'fundingReference': extras.get('FundingReference'),
        }

        metadata = {}
        # Fixes the bug on having a large dataset being scrambled to
        # individual letters: wrap scalars in a one-element list so
        # downstream writers don't iterate strings character by character.
        for key, value in meta.items():
            if value and not isinstance(value, list):
                metadata[str(key)] = [value]
            else:
                metadata[str(key)] = value
        return (common.Header('', dataset.id, dataset.metadata_created,
                              set_spec, False),
                common.Metadata('', metadata), None)
Ejemplo n.º 14
0
    def _record_for_dataset_datacite(self, dataset, set_spec):
        '''Return an OAI (header, metadata, about) tuple for this dataset
        in the DataCite format; extras values are split on ";" into lists.
        '''
        package = get_action('package_show')({}, {'id': dataset.id})

        # Flatten the extras table ([{'key': ..., 'value': ...}, ...]) into
        # a dict of lists: each value is split on ";".  The original
        # iterated item.iteritems() only to re-read item['key'] /
        # item['value'] on every pass.
        extras = {}
        for item in package['extras']:
            extras[item['key']] = item['value'].split(";")

        # Temporal coverage becomes a single "begin/end" range string.
        temporal_begin = extras.get('TemporalCoverage:BeginDate')
        temporal_end = extras.get('TemporalCoverage:EndDate')
        dates = []
        if temporal_begin or temporal_end:
            dates.append("%s/%s" % (
                temporal_begin[0] if temporal_begin else '',
                temporal_end[0] if temporal_end else ''))

        subj = ([tag.get('display_name') for tag in package['tags']]
                if package.get('tags', None) else None)
        if subj is not None and 'Discipline' in extras:
            subj.extend(extras['Discipline'])

        # Authors are a semicolon-separated string; None when absent/empty.
        author = package.get('author')
        authors = author.split(";") if author else None

        meta = {
            'DOI': extras.get('DOI'),
            'PID': extras.get('PID'),
            'version': extras.get('Version'),
            'source': package.get('url', None),
            'relatedIdentifier': extras.get('RelatedIdentifier'),
            # authors is either None or a non-empty list, so the original
            # "authors if authors else None" was redundant.
            'creator': authors,
            'publisher': extras.get('Publisher'),
            'publicationYear': extras.get('PublicationYear'),
            'publicationTimestamp': extras.get('PublicationTimestamp'),
            'resourceType': extras.get('ResourceType'),
            'language': extras.get('Language'),
            'titles': package.get('title', None) or package.get('name'),
            'contributor': extras.get('Contributor'),
            'descriptions': (self._get_json_content(package.get('notes'))
                             if package.get('notes', None) else None),
            'subjects': subj,
            'rights': extras.get('Rights'),
            'openAccess': extras.get('OpenAccess'),
            'size': extras.get('Size'),
            'format': extras.get('Format'),
            'fundingReference': extras.get('FundingReference'),
            'dates': dates if dates else None,
            'geoLocation': extras.get('SpatialCoverage'),
        }

        metadata = {}
        # Fixes the bug on having a large dataset being scrambled to
        # individual letters: wrap scalars in a one-element list so
        # downstream writers don't iterate strings character by character.
        for key, value in meta.items():
            if value and not isinstance(value, list):
                metadata[str(key)] = [value]
            else:
                metadata[str(key)] = value
        return (common.Header('', dataset.id, dataset.metadata_created,
                              set_spec, False),
                common.Metadata('', metadata), None)
Ejemplo n.º 15
0
    def __call__(self, metadata_element, nsprefix="nlmaa:"):
        map = {}

        #logging.debug("Parsing " + etree.tostring(metadata_element))

        article = self._find_element(metadata_element,
                                     "{0}article".format(nsprefix))

        #In the case of the bulk importer, the root element is Article
        if article is None:
            article = metadata_element

        # front
        front = self._find_element(article, "{0}front".format(nsprefix))

        # back
        back = self._find_element(article, "{0}back".format(nsprefix))

        # journal meta
        journal_meta = self._find_element(front,
                                          "{0}journal-meta".format(nsprefix))

        # article metadata
        article_meta = self._find_element(front,
                                          "{0}article-meta".format(nsprefix))

        if journal_meta is not None:
            try:
                map["journal"] = {}

                (self._set_map_with_element_text(
                    map["journal"], "name", journal_meta,
                    "{0}journal-title-group/{0}journal-title".format(nsprefix))
                 or self._set_map_with_element_text(
                     map["journal"], "name", journal_meta,
                     "{0}journal-title".format(nsprefix)))

                issns = journal_meta.findall("{0}issn".format(nsprefix),
                                             self._namespaces)
                if issns:
                    map["journal"]["identifier"] = []
                    for issn in issns:
                        map["journal"]["identifier"].append({
                            "type":
                            issn.get('pub-type'),
                            "id":
                            issn.text,
                            "canonical":
                            issn.get('pub-type') + ':' + issn.text
                        })

                self._set_map_with_element_text(
                    map["journal"], "publisher", journal_meta,
                    "{0}publisher/{0}publisher-name".format(nsprefix))
            except:
                logging.error("Could not extract journal metadata")
        else:
            logging.info("No journal metadata found for ")

        if article_meta is not None:
            try:
                #identifiers
                article_ids = article_meta.findall(
                    "{0}article-id".format(nsprefix), self._namespaces)
                if article_ids:
                    map["identifier"] = []
                    for article_id in article_ids:
                        map["identifier"].append({
                            "type":
                            article_id.get('pub-id-type'),
                            "id":
                            article_id.text,
                            "canonical":
                            article_id.get('pub-id-type') + ':' +
                            article_id.text
                        })

                        if article_id.get(
                                'pub-id-type'
                        ) == 'pmid' and article_id.text == '17242517':
                            print "FOUND THE record with missing citations"
                            logging.critical(
                                "FOUND THE record with missing citations")
                            logging.critical(etree.tostring(metadata_element))

            except:
                logging.error(
                    "Could not extract identifiers from article metadata")

            try:
                #title
                self._set_map_with_element_text(
                    map, "title", article_meta,
                    "{0}title-group/{0}article-title".format(nsprefix))
            except:
                logging.error("Could not extract title from article metadata")

            try:
                #pagination
                self._set_map_with_element_text(map, "volume", article_meta,
                                                "{0}volume".format(nsprefix))
                self._set_map_with_element_text(map, "issue", article_meta,
                                                "{0}issue".format(nsprefix))
                self._set_map_with_element_text(map, "firstpage", article_meta,
                                                "{0}fpage".format(nsprefix))
                self._set_map_with_element_text(map, "lastpage", article_meta,
                                                "{0}lpage".format(nsprefix))
                if "firstpage" in map:
                    if "lastpage" in map and (map["firstpage"] !=
                                              map["lastpage"]):
                        map["pages"] = map["firstpage"] + "-" + map["lastpage"]
                    else:
                        map["pages"] = map["firstpage"]
            except:
                logging.error(
                    "Could not extract pagination from article metadata")

            try:
                #publication date
                # why only use the pmc-release date? need to check with Mark
                pub_date = article_meta.find(
                    "{0}pub-date[@pub-type='pmc-release']".format(nsprefix),
                    self._namespaces)
                if pub_date is not None:
                    self._set_map_with_element_text(map, "year", pub_date,
                                                    "{0}year".format(nsprefix))
                    self._set_map_with_element_text(
                        map, "month", pub_date, "{0}month".format(nsprefix))
                    self._set_map_with_element_text(map, "day", pub_date,
                                                    "{0}day".format(nsprefix))
                else:
                    logging.info("No publication data for ")
            except:
                logging.error(
                    "Could not extract publication date from article metadata")

            try:
                #copyright
                self._set_map_with_element_text(
                    map, "copyright", article_meta,
                    "{0}permissions/{0}copyright-statement".format(nsprefix))
            except:
                logging.error(
                    "Could not extract copyright info from article metadata")

            try:
                #abstract
                self._set_map_with_element_xml(map, "abstract", article_meta,
                                               "{0}abstract".format(nsprefix))
            except:
                logging.error(
                    "Could not extract abstract from article metadata")

            try:
                #keywords
                keywords = article_meta.findall(
                    "{0}kwd_group/{0}kwd".format(nsprefix), self._namespaces)
                if keywords:
                    map["keyword"] = []
                    for keyword in keywords:
                        map["keyword"].append(keyword.text)
                else:
                    logging.info("No keywords for ")
            except:
                logging.error(
                    "Could not extract keywords from article metadata")

            try:
                #contributors
                contribs = article_meta.findall(
                    "{0}contrib-group/{0}contrib".format(nsprefix),
                    self._namespaces)
                if contribs:
                    map["author"] = []
                    map["editor"] = []
                    for contrib in contribs:
                        entity = {}
                        if contrib.get('corresp') == 'yes':
                            entity["corresponding"] = 'yes'
                        self._set_map_with_element_text(
                            entity, "lastname", contrib,
                            "{0}name/{0}surname".format(nsprefix))
                        self._set_map_with_element_text(
                            entity, "forenames", contrib,
                            "{0}name/{0}given-names".format(nsprefix)
                        )  #MW: Changed firstname to forenames. Discuss with Mark.
                        if "lastname" in entity and entity[
                                "lastname"] is not None and "forenames" in entity and entity[
                                    "forenames"] is not None:
                            entity["name"] = entity[
                                "lastname"] + ", " + entity["forenames"]
                        email = contrib.find(
                            "{0}address/{0}email".format(nsprefix),
                            self._namespaces)
                        if email is None:
                            email = contrib.find("{0}email".format(nsprefix),
                                                 self._namespaces)
                        if email is not None:
                            entity["identifier"] = {
                                "type": "email",
                                "id": email.text
                            }

                        xrefs = contrib.findall("{0}xref".format(nsprefix),
                                                self._namespaces)
                        affs = article_meta.findall(
                            "{0}aff".format(nsprefix), self._namespaces
                        )  #NOT ContribGroup - check with Mark
                        for xref in xrefs:
                            if xref.get('ref-type') == "aff":
                                rid = xref.get("rid")
                                for aff in affs:
                                    if aff.get("id") == rid:
                                        if "affiliation" not in entity:
                                            entity["affiliation"] = []
                                        for text in aff.itertext():
                                            entity["affiliation"].append(text)

                        if contrib.get("contrib-type") == "author":
                            map["author"].append(entity)
                        if contrib.get("contrib-type") == "editor":
                            map["editor"].append(entity)
                else:
                    logging.info("No contributors found for ")
            except:
                logging.error(
                    "Could not extract contributors from article metadata")
        else:
            logging.info("No article metadata found for ")

        if back is not None:
            acknowledgements = back.findall(
                "{0}ack/{0}sec/{0}p".format(nsprefix), self._namespaces)
            if acknowledgements:
                map["acknowledgement"] = []
                for acknowledgement in acknowledgements:
                    map["acknowledgement"].append(acknowledgement.text)
            else:
                logging.info("No acknowledgements found for ")

            conflicts = back.findall("{0}fn-group/{0}fn/{0}p".format(nsprefix),
                                     self._namespaces)
            if conflicts:
                map["conflict"] = []
                for conflict in conflicts:
                    map["conflict"].append(conflict.text)
            else:
                logging.info("No conflicts found for ")

            refs = back.findall("{0}ref-list/{0}ref".format(nsprefix),
                                self._namespaces)
            if refs:
                map["citation"] = []
                for ref in refs:
                    entity = {}
                    self._set_map_with_element_text(
                        entity, "label", ref, "{0}label".format(nsprefix))

                    #Three different ways to cite articles. Check with Mark.
                    citation = ref.find("{0}mixed-citation".format(nsprefix),
                                        self._namespaces)
                    if citation is None:
                        citation = ref.find(
                            "{0}element-citation".format(nsprefix),
                            self._namespaces)
                    if citation is None:
                        citation = ref.find("{0}citation".format(nsprefix),
                                            self._namespaces)

                    if citation is not None:
                        self._set_map_with_element_text(
                            entity, "title", citation,
                            "{0}article-title".format(nsprefix))
                        pub_ids = citation.findall(
                            "{0}pub-id".format(nsprefix), self._namespaces)
                        if pub_ids:
                            entity["identifier"] = []
                            for pub_id in pub_ids:
                                entity["identifier"].append({
                                    "type":
                                    pub_id.get('pub-id-type'),
                                    "id":
                                    pub_id.text,
                                    'canonical':
                                    pub_id.get('pub-id-type') + ':' +
                                    pub_id.text
                                })
                    # TODO: should this append happen even if the entity is empty? or bring into the above IF
                    map["citation"].append(entity)
                    # add code here to create a record for this citation if it does not already exist
            else:
                logging.info("No refs found for ")
        else:
            logging.info("No back metadata for ")

        #logging.debug("MAP: ")
        #logging.debug(map)

        return common.Metadata(map)
Ejemplo n.º 16
0
    def __call__(self, metadata_element):
        """Map an arXiv OAI metadata element onto a ``common.Metadata`` record.

        Collects identifiers (arXiv id and DOI, each with a ``canonical``
        prefixed form), the simple single-valued fields (title, license,
        abstract), journal-related fields, and the author list from the
        ``arXiv:arXiv`` element found under *metadata_element*.
        """
        record = {'identifier': [], 'journal': {}, 'author': []}

        arXiv = self._find_element(metadata_element, "arXiv:arXiv")

        # Identifiers: store the raw id plus a "type:id" canonical string.
        for id_type, raw_id in (
                ('arXiv', self._find_element_text(arXiv, "arXiv:id")),
                ('doi', self._find_element_text(arXiv, "arXiv:doi"))):
            if raw_id is not None:
                record["identifier"].append({
                    'type': id_type,
                    'id': raw_id,
                    'canonical': id_type + ':' + raw_id,
                })

        # Simple single-valued fields copied straight from the source element.
        self._set_map_with_element_text(record, "title", arXiv, "arXiv:title")
        self._set_map_with_element_text(record, "license", arXiv,
                                        "arXiv:license")
        self._set_map_with_element_text(record, "abstract", arXiv,
                                        "arXiv:abstract")

        # Journal-related fields live in a nested dict.
        journal = record['journal']
        self._set_map_with_element_text(journal, "reference", arXiv,
                                        "arXiv:journal-ref")
        self._set_map_with_element_text(journal, "comments", arXiv,
                                        "arXiv:comments")
        self._set_map_with_element_text(journal, "categories", arXiv,
                                        "arXiv:categories")

        # Authors: one entity dict per arXiv:author element.
        for author in self._find_elements(arXiv, "arXiv:authors/arXiv:author"):
            person = {}
            self._set_map_with_element_text(person, "lastname", author,
                                            "arXiv:keyname")
            # "forenames" (not "firstname") matches the arXiv schema element.
            self._set_map_with_element_text(person, "forenames", author,
                                            "arXiv:forenames")
            self._set_map_with_element_text(person, "suffix", author,
                                            "arXiv:suffix")
            surname = person.get("lastname")
            forenames = person.get("forenames")
            if surname is not None and forenames is not None:
                person["name"] = surname + ", " + forenames
            affiliations = self._find_elements(author, "arXiv:affiliation")
            if affiliations:
                person["affiliation"] = [aff.text for aff in affiliations]
            record["author"].append(person)

        return common.Metadata(record)
Ejemplo n.º 17
0
    def build_record(self, metadata):
        """Wrap *metadata* in an OAI-PMH ``common.Metadata`` payload and return it."""
        payload = common.Metadata(None, metadata)
        return payload
Ejemplo n.º 18
0
 def read(self):
     """Parse the metadata and return an ``oc.Metadata`` object.

     The returned map comes from ``xml_reader(self.xml).getMap()``, with an
     extra ``'unified'`` entry holding the unified dictionary produced by
     ``self._read()``.
     """
     unified_map = self._read()
     parsed = xml_reader(self.xml).getMap()
     parsed['unified'] = unified_map
     return oc.Metadata(self.xml, parsed)