コード例 #1
0
 def gather_stage(self, harvest_job):
     """Gather harvest objects, auto-detecting OAI-PMH vs. DDI sources.

     Probes the remote endpoint with an OAI-PMH Identify request; an XML
     syntax error is taken to mean the source serves DDI, otherwise
     OAI-PMH is assumed.  Gathering is then delegated to the chosen
     harvester and a pickled reference to it is stored in each harvest
     object's content.

     :param harvest_job: HarvestJob whose source URL is harvested
     :returns: list of HarvestObject ids, or None when the source could
         not be identified
     """
     url = harvest_job.source.url
     # Probe the endpoint to decide whether to use OAI-PMH or DDI.
     metadata_registry = MetadataRegistry()
     metadata_registry.registerReader('oai_dc', oai_dc_reader)
     client = oaipmh.client.Client(url, metadata_registry)
     try:
         client.identify()
     except XMLSyntaxError:
         # Response is not valid OAI-PMH XML: assume a DDI source.
         self.harvester = DDIHarvester()
     except urllib2.URLError:
         self._save_gather_error('Could not identify source!', harvest_job)
         return None
     # Use getattr so a missing attribute does not raise AttributeError
     # when identify() succeeded and no harvester has been chosen yet.
     if not getattr(self, 'harvester', None):
         self.harvester = OAIPMHHarvester()
     objs = self.harvester.gather_stage(harvest_job)
     ret = []
     for obj_id in objs:
         obj = HarvestObject.get(obj_id)
         # Renamed from ``dict`` to avoid shadowing the builtin.
         content = json.loads(obj.content)
         content['harv'] = jsonpickle.encode(self.harvester)
         obj.content = json.dumps(content)
         obj.save()
         ret.append(obj.id)
     return ret
コード例 #2
0
ファイル: oai.py プロジェクト: cameronneylon/oacensus
    def scrape(self):
        """Harvest OAI-PMH records and pickle them to the configured data file.

        NOTE(review): the ``raise`` on the first line makes everything
        below it unreachable dead code — it is an explicit
        "not implemented yet" marker left by the author.
        """
        raise Exception("not finished")
        registry = MetadataRegistry()
        registry.registerReader('oai_dc', oai_dc_reader)
        url = self.setting('pmh-endpoint')
        client = Client(url, registry)

        print "  OAI Repository", url
        print "  Available sets:"
        for s in client.listSets():
            print "   ", s

        oai_set = self.setting('set')
        oai_from = self.setting('from')
        oai_until = self.setting('until')

        kwargs = {}

        if oai_set:
            kwargs['set'] = oai_set

        # Date settings arrive as "YYYY-MM-DD" strings; convert to datetime.
        if oai_from is not None:
            date_args = [int(arg) for arg in oai_from.split("-")]
            kwargs['from_'] = datetime.datetime(*date_args)

        if oai_until is not None:
            date_args = [int(arg) for arg in oai_until.split("-")]
            kwargs['until'] = datetime.datetime(*date_args)

        records = [r for r in client.listRecords(metadataPrefix='oai_dc', **kwargs)]

        data_filepath = os.path.join(self.work_dir(), self.setting('data-file'))
        with open(data_filepath, 'wb') as f:
            print "  picking", len(records), "records"
            pickle.dump(records, f)
コード例 #3
0
def init(user):
	"""Build and return an OAI-PMH client for the given user's community.

	The endpoint is the module-level URL with the user name appended;
	a harvest of the community is logged as a side effect.
	"""
	endpoint = URL + user
	registry = MetadataRegistry()
	registry.registerReader('oai_dc', oai_dc_reader)
	oai_client = Client(endpoint, registry)
	logging.info('The community %s harvested', user)
	return oai_client
コード例 #4
0
    def __init__(self, oaisource, day_granularity=False, *args, **kwargs):
        """
        This sets up the paper source.

        :param oaisource: the OAISource to fetch from.
        :param day_granularity: should we use day-granular timestamps
            to fetch from the proxy or full timestamps (default: False,
            full timestamps)

        See the protocol reference for more information on timestamp
        granularity:
        https://www.openarchives.org/OAI/openarchivesprotocol.html
        """
        super(OaiPaperSource, self).__init__(*args, **kwargs)
        if not oaisource.endpoint:
            raise ValueError(
                'No OAI endpoint was configured for this OAI source.')

        # Register readers for both Dublin Core flavours we can translate.
        self.registry = MetadataRegistry()
        self.registry.registerReader('oai_dc', oai_dc_reader)
        self.registry.registerReader('base_dc', base_dc_reader)
        self.client = Client(oaisource.endpoint, self.registry)
        # Private pyoai attribute controlling request timestamp granularity.
        self.client._day_granularity = day_granularity
        # One translator per supported metadata format, keyed by prefix.
        self.translators = {
            'oai_dc': OAIDCTranslator(oaisource),
            'base_dc': BASEDCTranslator(oaisource),
        }
コード例 #5
0
 def __init__(self, url):
     """Create a ZORA OAI-PMH client and preload institute/type data."""
     md_registry = MetadataRegistry()
     md_registry.registerReader(ZoraAPI.METADATA_PREFIX, oai_dc_reader)
     self.client = Client(url, md_registry)
     # Caches filled by load_institutes_and_types().
     self.institutes = {}
     self.resource_types = []
     self.load_institutes_and_types()
コード例 #6
0
ファイル: oai.py プロジェクト: tarsbase/dissemin
    def __init__(self, endpoint, day_granularity=False, *args, **kwargs):
        """
        This sets up the paper source.

        :param endpoint: the address of the OAI-PMH endpoint
            to fetch from.
        :param day_granularity: should we use day-granular timestamps
            to fetch from the proxy or full timestamps (default: False,
            full timestamps)

        See the protocol reference for more information on timestamp
        granularity:
        https://www.openarchives.org/OAI/openarchivesprotocol.html
        """
        super(OaiPaperSource, self).__init__(*args, **kwargs)
        # Readers for every metadata format we can translate.
        self.registry = MetadataRegistry()
        self.registry.registerReader('oai_dc', oai_dc_reader)
        self.registry.registerReader('base_dc', base_dc_reader)
        self.registry.registerReader('citeproc', citeproc_reader)
        self.client = Client(endpoint, self.registry)
        # Private pyoai attribute controlling request timestamp granularity.
        self.client._day_granularity = day_granularity
        # Forward the API key to the proxy on every request, when configured.
        if settings.PROAIXY_API_KEY:
            self.client.extra_parameters = {
                'key': settings.PROAIXY_API_KEY}
        self.translators = {}
コード例 #7
0
ファイル: harvest.py プロジェクト: gsastry/philarchive
def harvest(url):
    """Harvest all oai_dc record identifiers from *url* and write them,
    one per line, to ``philarchive-2.txt`` next to this script.

    :param url: base URL of the OAI-PMH endpoint
    """
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)

    client = Client(url, registry)
    # Tolerate invalid XML characters in the repository's responses.
    client.ignoreBadCharacters(true_or_false=True)

    identifiers = []
    for header in client.listIdentifiers(metadataPrefix='oai_dc'):
        print(f"Found identifier {header.identifier()}")
        identifiers.append(header.identifier())

    print(f"Total number of identifiers: {len(identifiers)}")

    # Only keep the identifier string at the end of the URL.
    identifiers = [x.split('/')[-1] for x in identifiers]

    dirname = os.path.dirname(__file__)
    filename = os.path.join(dirname, 'philarchive-2.txt')

    with open(filename, 'w') as f:
        # Bug fix: this previously printed the literal text "(unknown)"
        # instead of interpolating the destination file name.
        print(f"Writing to {filename}")
        f.write('\n'.join(identifiers))
コード例 #8
0
ファイル: metadata.py プロジェクト: Kihara-tony/Bookstore
 def setUp(self):
     """Parse the sample DSpace METS file and read it through the registry."""
     self.registry = MetadataRegistry()
     self.registry.registerReader('mets', dspace_mets_reader)
     # The XML fixture lives next to this test module.
     self.element = etree.parse(
         os.path.join(os.path.dirname(__file__),
                      'dspace_mets.xml')).getroot()
     self.item = self.registry.readMetadata('mets', self.element)
コード例 #9
0
def arxiv_oai_scraper(subject, start, end, sleep_time=0):
    """Harvest arXiv oai_dc metadata for *subject* between *start* and *end*.

    :param subject: arXiv set spec to harvest (e.g. 'cs')
    :param start: earliest datestamp (passed as ``from_``)
    :param end: latest datestamp (passed as ``until``)
    :param sleep_time: seconds to pause between records (rate limiting)
    :returns: list of dicts with title/abstract/date/subject/url/authors
    """
    base_url = "http://export.arxiv.org/oai2"
    output = []

    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(base_url, registry)
    # Align request timestamps with the repository's granularity.
    client.updateGranularity()

    records = client.listRecords(metadataPrefix='oai_dc', set=str(subject),
                                 from_=start, until=end)

    for _, md, _ in records:
        # Some records (e.g. from 2010) have no metadata at all; skip them.
        if md is not None:
            output.append({
                "title": md["title"],
                "abstract": md["description"],
                "date": md["date"],
                "subject": md["subject"],
                "url": md["identifier"],
                "authors": md['creator'],
            })

        time.sleep(sleep_time)

    return output
コード例 #10
0
ファイル: utils.py プロジェクト: llcit/uh-dla-dev-py
    def list_oai_collections(self, community):
        """ Retrieve the header data for each record in the current community repo """

        try:
            registry = MetadataRegistry()
            registry.registerReader('oai_dc', oai_dc_reader)
            client = Client(community.repository.base_url, registry)
            records = client.listIdentifiers(
                metadataPrefix='oai_dc', set=community.identifier)
        # NOTE(review): bare except swallows every error (even
        # KeyboardInterrupt); the local assignment below is dead code
        # since the variable is discarded on return.
        except:
            community_collections = set()
            return


        """ Filter records to build list of collections in the community set """
        # Set specs starting with 'col' identify collections.
        community_collections = set()
        for i in records:
            for j in i.setSpec():
                if j[:3] == 'col':
                    community_collections.add(j)
    
        print len(community_collections)
        """ Build collection tuples (identifier, name) """
        for i in community_collections:
            # print i
            # print community_collections
            
            set_data = []
            set_data.append(i)  # Store identifier
            set_data.append('Collection: %s'%i)  # Store human readable name
            # print set_data
            self.collections.append(set_data)
コード例 #11
0
ファイル: openaire.py プロジェクト: LibrarPotter/zenodo
 def __init__(self, url):
     """Initialize client.

     Registers the 'oaf' metadata reader and delegates to the base
     OAI client constructor.

     :param url: OAI-PMH endpoint URL
     """
     registry = MetadataRegistry()
     registry.registerReader('oaf', self.oaf_reader)
     # ``__init__`` should not return a value; the superclass call
     # returns None anyway, so the former ``return`` was dropped.
     super(OpenAireClient, self).__init__(
         url, metadata_registry=registry
     )
コード例 #12
0
ファイル: autoinsert.py プロジェクト: rfurman/arxaliv
def insertAll(time, time2):
    """Harvest arXivRaw records in [time, time2] and insert each as a post.

    NOTE(review): the parameter ``time`` shadows any imported ``time``
    module and the local ``list`` shadows the builtin — both kept for
    compatibility.
    """
    registry = MetadataRegistry()
    registry.registerReader('arXivRaw', arXivRaw_reader)
    client = Client(URL, registry)
    # Align request timestamps with the repository's granularity.
    client.updateGranularity()
    list = client.listRecords(metadataPrefix='arXivRaw', from_=time, until=time2)
    errors = 0
    for a in list:
        #a = list.next()
        try:
            title = '\n'.join(a[1]['title'])
            # Normalise category names ('-' -> '_') and split into a list.
            sr2 = str(' '.join(a[1]['categories']).replace('-','_')).split(' ')
            abstract = '\n'.join(a[1]['abstract'])
            url = 'http://arxiv.org/abs/' + a[1]['id'][0]
            date = datetime.strptime(a[1]['created'][0], '%a, %d %b %Y %H:%M:%S %Z')
            authors = a[1]['authors'][0]# '; '.join(a[1]['keynames'])
            abstract = abstract + '\nBy: ' + authors + '\nIn: ' + ', '.join(sr2)
            print title
            print sr2
            print abstract
            print url
            print date
            print authors
            insert(title + ' (' + authors + ')', str("fullarxiv"), url, abstract, date=date, cross_srs=sr2)
        # NOTE(review): bare except hides all failures; only a count is kept.
        except:
            print 'ERROR'
            print a
            errors = errors+1
    print 'Completed with %s errors' % errors
コード例 #13
0
    def __init__(self,
                 url,
                 prefix=nsdl.LR_NSDL_PREFIX,
                 reader=None,
                 fields=None,
                 namespaces=None,
                 fieldMap=None):
        '''
        Constructor.

        :param url: OAI-PMH endpoint URL
        :param prefix: metadata prefix the reader is registered under
        :param reader: MetadataReader to use; built from *fields* and
            *namespaces* when omitted
        :param fields: reader field map (defaults to NSDL DC fields)
        :param namespaces: XML namespaces (defaults to NSDL DC namespaces)
        :param fieldMap: NSDL-to-LR field mapping (defaults to nsdl map)
        '''
        # ``is None`` is the idiomatic null test; ``== None`` can be
        # fooled by objects overriding __eq__.
        self._fields = nsdl.LR_NSDL_DC_FIELDS if fields is None else fields
        self._fieldMap = nsdl.NSDL_TO_LR_MAP if fieldMap is None else fieldMap
        self._namespaces = (nsdl.LR_NSDL_DC_NAMESPACES if namespaces is None
                            else namespaces)

        if reader is None:
            reader = MetadataReader(fields=self._fields,
                                    namespaces=self._namespaces)

        self._url = url
        self._registry = MetadataRegistry()
        self._prefix = prefix
        self._registry.registerReader(prefix, reader)
        self._client = Client(url, self._registry)
コード例 #14
0
 def _get_client_identifier(self, url, harvest_job=None):
     """Create an OAI client for *url* and try to identify the repository.

     :param url: OAI-PMH endpoint URL
     :param harvest_job: optional job used for recording gather errors
     :returns: (client, identify-response); the response is None on failure
     """
     registry = MetadataRegistry()
     registry.registerReader(self.metadata_prefix_value, oai_dc_reader)
     client = oaipmh.client.Client(url, registry)
     try:
         identifier = client.identify()
     except (urllib2.URLError, urllib2.HTTPError,):
         if harvest_job:
             self._save_gather_error(
                 'Could not gather from %s!' % harvest_job.source.url,
                 harvest_job)
         return client, None
     except socket.error:
         if harvest_job:
             errno, errstr = sys.exc_info()[:2]
             self._save_gather_error(
                 'Socket error OAI-PMH %s, details:\n%s' % (errno, errstr),
                 harvest_job)
         return client, None
     except ValueError:
         # We have no source URL when importing via UI.
         return client, None
     except Exception as e:
         # Guard against miscellaneous stuff. Probably plain bugs.
         log.debug(traceback.format_exc(e))
         return client, None
     return client, identifier
コード例 #15
0
ファイル: harvester.py プロジェクト: HarmdR/ckan-oaipmh-epos
 def _create_metadata_registry(self):
     """Build a MetadataRegistry with every reader this harvester needs."""
     # Map each supported metadata prefix to its reader.
     readers = (
         ('oai_dc', oai_dc_reader),
         ('oai_ddi', oai_ddi_reader),
         # TODO: Change back?
         ('dif', dif_reader2),
         # HDR
         ('datacite', datacite_reader),
     )
     registry = MetadataRegistry()
     for prefix, reader in readers:
         registry.registerReader(prefix, reader)
     return registry
コード例 #16
0
ファイル: OaiClient.py プロジェクト: JeanFred/BibRose
    def __init__(self, configuration_file):
        """Read the OAI configuration file and build the PMH client."""
        self.oai_config = ConfigParser.SafeConfigParser()
        self.oai_config.read(configuration_file)
        # Active configuration section (hard-coded).
        self.current_config = 'ToulouseBis'

        md_registry = MetadataRegistry()
        md_registry.registerReader('oai_dc', oai_dc_reader)
        endpoint = self._get_config_value('url')
        self.client = Client(endpoint, md_registry)
コード例 #17
0
    def __init__(self, configuration_file):
        """Constructor.

        Loads the OAI configuration file and builds a Client for the
        endpoint URL taken from the active configuration section.

        :param configuration_file: path to the INI-style config file
        """
        self.oai_config = ConfigParser.SafeConfigParser()
        self.oai_config.read(configuration_file)
        # Active configuration section (hard-coded).
        self.current_config = 'ToulouseBis'

        registry = MetadataRegistry()
        registry.registerReader('oai_dc', oai_dc_reader)
        self.client = Client(self._get_config_value('url'), registry)
コード例 #18
0
ファイル: harvester.py プロジェクト: ilrt/ckanext-oaipmh
    def gather_stage(self, harvest_job):
        '''
        The gather stage will recieve a HarvestJob object and will be
        responsible for:
            - gathering all the necessary objects to fetch on a later.
              stage (e.g. for a CSW server, perform a GetRecords request)
            - creating the necessary HarvestObjects in the database, specifying
              the guid and a reference to its source and job.
            - creating and storing any suitable HarvestGatherErrors that may
              occur.
            - returning a list with all the ids of the created HarvestObjects.

        :param harvest_job: HarvestJob object
        :returns: A list of HarvestObject ids
        '''
        self._set_config(harvest_job.source.config)
        sets = []
        harvest_objs = []
        registry = MetadataRegistry()
        registry.registerReader('oai_dc', oai_dc_reader)
        client = oaipmh.client.Client(harvest_job.source.url, registry)
        try:
            identifier = client.identify()
        except urllib2.URLError:
            self._save_gather_error('Could not gather anything from %s!' %
                                    harvest_job.source.url, harvest_job)
            return None
        # One CKAN group per repository, named after the repository.
        domain = identifier.repositoryName()
        group = Group.by_name(domain)
        if not group:
            group = Group(name=domain, description=domain)
        # Optional substring filter applied to set names.
        query = self.config['query'] if 'query' in self.config else ''
        try:
            # Loop variables renamed: the original used ``set`` (shadowing
            # the builtin) and clobbered ``identifier`` from identify().
            for set_spec in client.listSets():
                set_id, set_name, _ = set_spec
                if 'query' in self.config:
                    if query in set_name:
                        sets.append((set_id, set_name))
                else:
                    sets.append((set_id, set_name))
        except NoSetHierarchyError:
            # Repository has no sets: harvest everything as one default set.
            sets.append(('1', 'Default'))
            self._save_gather_error('Could not fetch sets!', harvest_job)

        for set_id, set_name in sets:
            harvest_obj = HarvestObject(job=harvest_job)
            harvest_obj.content = json.dumps({
                'set': set_id,
                'set_name': set_name,
                'domain': domain,
            })
            harvest_obj.save()
            harvest_objs.append(harvest_obj.id)
        model.repo.commit()
        return harvest_objs
コード例 #19
0
def test(request):
	"""Django view: identify the kulturarv.dk repository and show its name."""
	URL = 'http://www.kulturarv.dk/ffrepox/OAIHandler'
	registry = MetadataRegistry()
	registry.registerReader('oai_dc', oai_dc_reader)
	client = Client(URL, registry)
	identifyResponse = client.identify()

	# Debug output: attributes of the Identify response object.
	print dir(identifyResponse)
	#for record in client.listRecords(metadataPrefix='oai_dc'):
	#	result += record
	return HttpResponse(identifyResponse.repositoryName())
コード例 #20
0
def _registerReader(metadata_format):
    """
    """
    #TODO, check namespaces
    if metadata_format in ("metashare", "cmdi", "olac"):
        metadata_registry = MetadataRegistry()
        metadata_registry.registerReader(metadata_format, Reader())
        return metadata_registry
    else:
        raise NotImplementedError("The %s metadata format is " \
                                  "currently not supported." % metadata_format)
コード例 #21
0
 def test_get_record(self):
     """GetRecord for an id from ListIdentifiers validates against the OAI schema."""
     metadata_reg = MetadataRegistry()
     metadata_reg.registerReader('oai_dc', oai_dc_reader)
     client = Client(config.get('ckan.site_url') + self.base_url, metadata_reg)
     res = self._oai_get_method_and_validate('?verb=ListIdentifiers&metadataPrefix=oai_dc&set=roger')
     # Feed the canned ListIdentifiers response back through urllib2.
     urllib2.urlopen = mock.Mock(return_value=StringIO(res))
     ids = client.listIdentifiers(metadataPrefix='oai_dc')
     offset = self.base_url + '?verb=GetRecord&identifier=%s&metadataPrefix=oai_dc' % ids.next().identifier()
     res = self.app.get(offset)
     self.assert_(oaischema.validate(etree.fromstring(res.body)))
     self.assert_("abraham" in res.body)
コード例 #22
0
ファイル: utils.py プロジェクト: llcit/uh-dla-dev-py
    def harvest_oai_collection_records(self, collection):
        """Fetch all oai_dc records for *collection* from its repository.

        :param collection: collection model with an ``identifier`` and a
            ``community.repository.base_url``
        :returns: iterator of records, or None when harvesting fails
        """
        records = []
        try:
            registry = MetadataRegistry()
            registry.registerReader('oai_dc', oai_dc_reader)
            client = Client(collection.community.repository.base_url, registry)
            records = client.listRecords(
                metadataPrefix='oai_dc', set=collection.identifier)
        except Exception:
            # Narrowed from a bare ``except:`` so that system-exiting
            # exceptions (KeyboardInterrupt, SystemExit) still propagate.
            # NOTE(review): failures are silently swallowed and None is
            # returned; callers must handle both return shapes.
            return

        return records
コード例 #23
0
ファイル: forms.py プロジェクト: llcit/llt
    def clean(self):
        """Validate that base_url points at a live OAI-PMH repository.

        Also stores the repository's self-reported name in
        ``cleaned_data['name']`` for use when the model is saved.

        :raises ValidationError: when the endpoint cannot be identified
        """
        cleaned_data = super(CreateRepositoryForm, self).clean()
        try:
            registry = MetadataRegistry()
            registry.registerReader('oai_dc', oai_dc_reader)
            client = Client(cleaned_data.get('base_url'), registry)
            server = client.identify()
            # set the repository name apply to model instance when saved.
            cleaned_data['name'] = server.repositoryName()
        # NOTE(review): bare except maps every failure (bad URL, network
        # error, malformed response) to a single validation error.
        except:
            raise ValidationError('Repository base url is invalid.')

        return cleaned_data
コード例 #24
0
 def test_resumption_identifiers(self):
     """ListIdentifiers via the batching server yields only truthy headers."""
     metadata_reg = MetadataRegistry()
     metadata_reg.registerReader('oai_dc', oai_dc_reader)
     # Restore the real urlopen in case an earlier test mocked it.
     urllib2.urlopen = realopen
     client = CKANServer()
     metadata_registry = metadata.MetadataRegistry()
     metadata_registry.registerReader('oai_dc', oai_dc_reader)
     metadata_registry.registerWriter('oai_dc', oai_dc_writer)
     serv = BatchingServer(client, metadata_registry=metadata_registry)
     # Talk to the in-process server through a client facade.
     client = ServerClient(serv, metadata_reg)
     recs = client.listIdentifiers(metadataPrefix='oai_dc')
     for rec in recs:
         self.assert_(rec)
コード例 #25
0
ファイル: forms.py プロジェクト: llcit/uh-dla-dev-py
    def clean(self):
        """Validate base_url by identifying the OAI repository; store its name."""
        cleaned_data = super(CreateRepositoryForm, self).clean()
        try:
            md_registry = MetadataRegistry()
            md_registry.registerReader('oai_dc', oai_dc_reader)
            oai_client = Client(cleaned_data.get('base_url'), md_registry)
            identify_response = oai_client.identify()
            # The repository name is applied to the model instance on save.
            cleaned_data['name'] = identify_response.repositoryName()
        except:
            raise ValidationError('Repository base url is invalid.')

        return cleaned_data
コード例 #26
0
ファイル: transformer.py プロジェクト: kraenhansen/datafest
def get_client(url, transforms):
    """Build an OAI client whose reader is derived from *transforms*.

    NOTE(review): the first advertised metadata format is overwritten
    with a hard-coded 'fbb' entry before namespaces are computed.

    :returns: (client, metadata prefix) tuple
    """
    transforms = fix_transforms(transforms)
    registry = MetadataRegistry()
    c = Client(url, registry)
    metadata = c.listMetadataFormats()
    metadata[0] = [
        'fbb', 'http://www.kulturarv.dk/fbb/fbb.xsd', 'http://www.kulturarv.dk/fbb']
    namespaces = dict((x[0], x[2]) for x in metadata)
    # One 'textList' reader field per transform path.
    fields = dict((transform['field'], ('textList', transform['path']))
                  for transform in transforms)
    namespace = metadata[0][0]
    print namespaces,fields
    registry.registerReader(namespace, MetadataReader(fields=fields, namespaces=namespaces))
    return c, namespace
コード例 #27
0
def index_documents(main_url, database_name, url, reader, prefix, format):
    """Harvest records from *url* and push them to *main_url* in batches.

    :param main_url: sync endpoint receiving the formatted documents
    :param database_name: target database name for sync_files
    :param url: OAI-PMH endpoint to harvest from
    :param reader: metadata reader registered under *prefix*
    :param prefix: metadata prefix to request
    :param format: callable(record, identifier) -> document or None
        (name shadows the builtin; kept for interface compatibility)
    """
    registry = MetadataRegistry()
    registry.registerReader(prefix, reader)
    client = Client(url, registry)
    return_stuff = []
    for record in client.listRecords(metadataPrefix=prefix):
        r = record[1]
        value = format(r, record[0].identifier())
        # ``is not None`` instead of ``!= None``: identity is the idiomatic
        # null check and cannot be fooled by __eq__ overrides.
        if value is not None:
            return_stuff.append(value)
        # Flush in batches of 10000 to bound memory use.
        if len(return_stuff) >= 10000:
            sync_files(main_url, database_name, return_stuff)
            return_stuff = []
    sync_files(main_url, database_name, return_stuff)
コード例 #28
0
ファイル: base_spider.py プロジェクト: dissemin/croawl
 def read_base_records(self):
     """Yield crawl requests for BASE records whose OA status is unsure."""
     registry = MetadataRegistry()
     registry.registerReader('base_dc', base_dc_reader)
     client = Client('http://doai.io/oai', registry)
     for header, record, _ in client.listRecords(metadataPrefix='base_dc'):
         # Only process records for which BASE was unsure ('2' flag absent).
         oa_flags = record['oa']
         if '2' not in oa_flags:
             continue
         # Every identifier link is a candidate splash URL.
         for link in record['identifier']:
             meta = {
                 'base_oa': ''.join(oa_flags),
                 'splash_url': link,
                 'from_identifier': header.identifier(),
             }
             yield self.filter_url(link, meta, looking_for='any')
コード例 #29
0
    def setUp(self):
        """Load two fixture books and build an XML tree server over the catalogue."""
        super(BookMetadataTest, self).setUp()
        xml = path.join(path.dirname(__file__), 'files/lubie-kiedy-kobieta.xml')
        self.book = models.Book.from_xml_file(xml)

        xml = path.join(path.dirname(__file__), 'files/antygona.xml')
        self.book2 = models.Book.from_xml_file(xml)

        mr = MetadataRegistry()
        self.catalogue = Catalogue(mr)

        # Writer (not reader): this side serves OAI-PMH output.
        mr.registerWriter('oai_dc', oai_dc_writer)
        nsmap = {'oai_dc': NS_OAIDC, 'dc': NS_DC, 'xsi': NS_XSI}
        self.xml = XMLTreeServer(self.catalogue, mr, nsmap)
コード例 #30
0
ファイル: util.py プロジェクト: science/LearningRegistry
def index_documents(main_url, database_name, url, reader, prefix, format):
    """Harvest records from *url* and push them to *main_url* in batches.

    :param main_url: sync endpoint receiving the formatted documents
    :param database_name: target database name for sync_files
    :param url: OAI-PMH endpoint to harvest from
    :param reader: metadata reader registered under *prefix*
    :param prefix: metadata prefix to request
    :param format: callable(record, identifier) -> document or None
        (name shadows the builtin; kept for interface compatibility)
    """
    registry = MetadataRegistry()
    registry.registerReader(prefix, reader)
    client = Client(url, registry)
    return_stuff = []
    for record in client.listRecords(metadataPrefix=prefix):
        r = record[1]
        value = format(r, record[0].identifier())
        # ``is not None`` instead of ``!= None``: identity is the idiomatic
        # null check and cannot be fooled by __eq__ overrides.
        if value is not None:
            return_stuff.append(value)
        # Flush in batches of 10000 to bound memory use.
        if len(return_stuff) >= 10000:
            sync_files(main_url, database_name, return_stuff)
            return_stuff = []
    sync_files(main_url, database_name, return_stuff)
コード例 #31
0
ファイル: oaipmhapi.py プロジェクト: learner9753/wolnelektury
    def setUp(self):
        """Load two fixture books and build an XML tree server over the catalogue."""
        super(BookMetadataTest, self).setUp()
        xml = path.join(path.dirname(__file__),
                        'files/lubie-kiedy-kobieta.xml')
        self.book = models.Book.from_xml_file(xml)

        xml = path.join(path.dirname(__file__), 'files/antygona.xml')
        self.book2 = models.Book.from_xml_file(xml)

        mr = MetadataRegistry()
        self.catalogue = Catalogue(mr)

        # Writer (not reader): this side serves OAI-PMH output.
        mr.registerWriter('oai_dc', oai_dc_writer)
        nsmap = {'oai_dc': NS_OAIDC, 'dc': NS_DC, 'xsi': NS_XSI}
        self.xml = XMLTreeServer(self.catalogue, mr, nsmap)
コード例 #32
0
    def update(self, from_date=None):
        """Harvest records since *from_date*, yielding ids of those added."""
        self._log.info('Harvesting oai server: %s' % self._url)
        # Identity reader: keep the raw XML element as the "metadata".
        registry = MetadataRegistry()
        registry.registerReader(self._prefix, lambda el: el)

        client = Client(self._url, registry)
        try:
            record_iter = client.listRecords(
                metadataPrefix=self._prefix, from_=from_date)
            for header, element, about in record_iter:
                if self._process_record(header, element):
                    yield self._get_id(header)
        except NoRecordsMatchError:
            # An empty result set is a normal, non-error outcome.
            pass

        super(OAIBasedContentProvider, self).update()
コード例 #33
0
 def __init__(self, dbName):
     """Initialise OAI-PMH server state from the named Cheshire3 database.

     Determines the earliest record datestamp by scanning the
     rec.lastModificationDate index, then copies repository-level
     settings from the database's protocolMap.

     :param dbName: key into the module-level ``configs``/``dbs`` maps
     :raises ConfigFileException: when the required index map is missing
     """
     global configs, dbs, session
     self.protocolMap = configs[dbName]
     self.db = dbs[dbName]
     session.database = self.db.id
     # get some generally useful stuff now
     self.baseURL = self.protocolMap.baseURL
     # get earliest datestamp in database
     q = cqlparse('rec.lastModificationDate > "%s"' % (str(datetime.datetime.utcfromtimestamp(0)))) # get UTC of the epoch as query term
     try:
         tl = self.db.scan(session, q, 1)
     except SRWDiagnostics.Diagnostic16:
         raise ConfigFileException('Index map for rec.lastModificationDate required in protocolMap: %s' % self.db.get_path(session, 'protocolMap').id)
     else:
         try:
             datestamp = tl[0][0]
         except IndexError:
             #something went wrong :( - use the epoch
             self.earliestDatestamp = datetime.datetime.utcfromtimestamp(0)
         else:
             # Index may store ISO 'T'-separated or space-separated stamps.
             try:
                 self.earliestDatestamp = datetime.datetime.strptime(datestamp, '%Y-%m-%dT%H:%M:%S')
             except ValueError:
                 self.earliestDatestamp = datetime.datetime.strptime(datestamp, '%Y-%m-%d %H:%M:%S')
     
     self.repositoryName = self.protocolMap.title
     self.protocolVersion = self.protocolMap.version
     self.adminEmails = self.protocolMap.contacts
     self.deletedRecord = "no"    # Cheshire3 does not support deletions at this time
     self.granularity = "YYYY-MM-DDThh:mm:ssZ" # finest level of granularity
     self.compression = []        # Cheshire3 does not support compressions at this time
     self.metadataRegistry = OaiMetadataRegistry()
コード例 #34
0
 def __init__(self, url, prefix=nsdl.LR_NSDL_PREFIX, reader=None, fields=None, namespaces=None, fieldMap=None):
     '''
     Constructor.

     :param url: OAI-PMH endpoint URL
     :param prefix: metadata prefix the reader is registered under
     :param reader: MetadataReader; built from fields/namespaces if omitted
     :param fields: reader field map (defaults to NSDL DC fields)
     :param namespaces: XML namespaces (defaults to NSDL DC namespaces)
     :param fieldMap: NSDL-to-LR field mapping (defaults to nsdl map)
     '''
     # ``is None`` is the idiomatic null test; ``== None`` can be fooled
     # by objects overriding __eq__.
     self._fields = nsdl.LR_NSDL_DC_FIELDS if fields is None else fields
     self._fieldMap = nsdl.NSDL_TO_LR_MAP if fieldMap is None else fieldMap
     self._namespaces = (nsdl.LR_NSDL_DC_NAMESPACES if namespaces is None
                         else namespaces)

     if reader is None:
         reader = MetadataReader(fields=self._fields,
                                 namespaces=self._namespaces)

     self._url = url
     self._registry = MetadataRegistry()
     self._prefix = prefix
     self._registry.registerReader(prefix, reader)
     self._client = Client(url, self._registry)
コード例 #35
0
ファイル: oai.py プロジェクト: Phyks/dissemin
    def __init__(self, oaisource, day_granularity=False, *args, **kwargs):
        """
        This sets up the paper source.

        :param oaisource: the OAISource to fetch from.
        :param day_granularity: should we use day-granular timestamps
            to fetch from the proxy or full timestamps (default: False,
            full timestamps)

        See the protocol reference for more information on timestamp
        granularity:
        https://www.openarchives.org/OAI/openarchivesprotocol.html
        """
        super(OaiPaperSource, self).__init__(*args, **kwargs)
        if not oaisource.endpoint:
            raise ValueError('No OAI endpoint was configured for this OAI source.')

        # Register readers for both Dublin Core flavours we can translate.
        self.registry = MetadataRegistry()
        self.registry.registerReader('oai_dc', oai_dc_reader)
        self.registry.registerReader('base_dc', base_dc_reader)
        self.client = Client(oaisource.endpoint, self.registry)
        # Private pyoai attribute controlling request timestamp granularity.
        self.client._day_granularity = day_granularity
        # One translator per supported metadata format, keyed by prefix.
        self.translators = {
            'oai_dc': OAIDCTranslator(oaisource),
            'base_dc': BASEDCTranslator(oaisource),
        }
コード例 #36
0
def scrape(start=START, end=END, set=SET_THESIS, type='Thesis'):
    """
    Create an OAI-PMH client, gather metadata and output it.

    NOTE(review): parameters ``set`` and ``type`` shadow builtins; kept
    for interface compatibility.  Results accumulate in the module-level
    RECORDS list and are pickled when ``options.store`` is set.
    """    
    total = num = 0
    msg = "Fetching records between " + str(start) + " and " + str(end)
    sys.stderr.write(msg + "\n")

    #
    # Set up metadata readers
    #
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    registry.registerReader('qdc', qdc_reader)
    # registry.registerReader('rdf', rdf_reader)   # no reader yet
    # registry.registerReader('ore', ore_reader)   # no reader yet
    # registry.registerReader('mets', mets_reader) # no reader yet

    client = Client(URL, registry)
    records = client.listRecords(metadataPrefix='qdc',
                                 from_=start, until=end, set=set)
    for (h, m, a) in records:
        print h, m, a
        # 'o' marks a record with no metadata payload.
        if not m:
            sys.stderr.write("o")
            continue
        total = total + 1
        
        handle = m.getField('identifier')
        if not handle:
            sys.stderr.write("Record without a handle.\n")
            continue

        # One dict per record: its handle plus every qdc field.
        r = dict({ 'handle' : handle[0] })
        for key in qdc_reader._fields.keys():
           r[key] = m.getField(key)
        RECORDS.append(r)

        sys.stderr.write('.')
        sys.stderr.flush()
        num = num + 1
    msg = "\nCollected " + str(num) + " records, out of " + str(total)
    sys.stderr.write('\n' + msg + '\n');

    if options.store:
        pickle.dump(RECORDS, open(options.store, "wb"))
コード例 #37
0
ファイル: oai.py プロジェクト: Alchemy-Meister/OAI-PMH
    def update(self, from_date=None):
        """Harvest records modified since *from_date*; yield ids of added ones.

        :param from_date: optional lower bound passed as OAI ``from``
        """
        self._log.info('Harvesting oai server: %s' % self._url)
        # Identity reader: keep the raw XML element as the "metadata".
        registry = MetadataRegistry()
        registry.registerReader(self._prefix, lambda el: el)

        client = Client(self._url, registry)
        try:
            for header, element, about in client.listRecords(
                metadataPrefix = self._prefix,
                from_ = from_date):
                added = self._process_record(header, element)
                if added:
                    yield self._get_id(header)
        except NoRecordsMatchError:
            # No matching records is a normal, non-error outcome.
            pass

        super(OAIBasedContentProvider, self).update()
コード例 #38
0
ファイル: transformer.py プロジェクト: kraenhansen/datafest
def get_client(url, transforms):
    """Build an OAI client whose reader is derived from *transforms*.

    NOTE(review): the first advertised metadata format is overwritten
    with a hard-coded 'fbb' entry before namespaces are computed.

    :returns: (client, metadata prefix) tuple
    """
    transforms = fix_transforms(transforms)
    registry = MetadataRegistry()
    c = Client(url, registry)
    metadata = c.listMetadataFormats()
    metadata[0] = [
        'fbb', 'http://www.kulturarv.dk/fbb/fbb.xsd',
        'http://www.kulturarv.dk/fbb'
    ]
    namespaces = dict((x[0], x[2]) for x in metadata)
    # One 'textList' reader field per transform path.
    fields = dict((transform['field'], ('textList', transform['path']))
                  for transform in transforms)
    namespace = metadata[0][0]
    print namespaces, fields
    registry.registerReader(
        namespace, MetadataReader(fields=fields, namespaces=namespaces))
    return c, namespace
コード例 #39
0
ファイル: harvester.py プロジェクト: mknezevic/ckanext-oaipmh
    def _get_client_identifier(self, url, harvest_job=None):
        """Create an OAI client for *url* and try to identify the repository.

        Registers a reader for every configured metadata format (plus the
        default prefix), then calls Identify.

        :param url: OAI-PMH endpoint URL
        :param harvest_job: optional job used for recording gather errors
        :returns: (client, identify-response); the response is None on failure
        """
        registry = MetadataRegistry()

        if 'metadata_formats' in self.config:
            for mdp in self.config['metadata_formats']:
                registry.registerReader(mdp, kata_oai_dc_reader)
            # Always make sure the default prefix is readable too.
            if self.metadata_prefix_value not in self.config['metadata_formats']:
                registry.registerReader(self.metadata_prefix_value, kata_oai_dc_reader)
        else: registry.registerReader(self.metadata_prefix_value, kata_oai_dc_reader)
        
        client = oaipmh.client.Client(url, registry)
        try:
            identifier = client.identify()
            client.updateGranularity() #quickfix: to set corrent datetime granularity, updateGranularity has to be called 
        except (urllib2.URLError, urllib2.HTTPError) as err:
            log.debug("Error occurred: {0}".format(err))
            if harvest_job:
                self._save_gather_error('Could not gather from %s!' % harvest_job.source.url, harvest_job)
            return client, None
        except socket.error:
            if harvest_job:
                errno, errstr = sys.exc_info()[:2]
                self._save_gather_error('Socket error OAI-PMH %s, details:\n%s' % (errno, errstr), harvest_job)
            return client, None
        except ValueError:
            # We have no source URL when importing via UI.
            return client, None
        except Exception as e:
            # Guard against miscellaneous stuff. Probably plain bugs.
            log.debug(traceback.format_exc(e))
            return client, None
        return client, identifier
コード例 #40
0
ファイル: openbeelden_uploader.py プロジェクト: xqt/toollabs
def processItems():
    """Harvest every oai_oi record from Open Beelden and pass each one
    to processItem()."""
    # All fields share the oai_oi:oi/oi:<name>/text() XPath shape.
    names = [
        'title', 'alternative', 'creator', 'subject', 'description',
        'abstract', 'publisher', 'contributor', 'date', 'type',
        'medium', 'identifier', 'source', 'language', 'references',
        'spatial', 'attributionName', 'attributionURL', 'license',
    ]
    fields = dict((name, ('textList', 'oai_oi:oi/oi:%s/text()' % name))
                  for name in names)
    # 'extent' is read from the feed's (misspelled) <oi:extend> element.
    fields['extent'] = ('textList', 'oai_oi:oi/oi:extend/text()')

    oai_oi_reader = MetadataReader(
        fields=fields,
        namespaces={
            'oi': 'http://www.openbeelden.nl/oai/',
            'oai_oi': 'http://www.openarchives.org/OAI/2.0/oai_dc/',
            'dc': 'http://purl.org/dc/elements/1.1/',
            'dcterms': 'http://purl.org/dc/terms',
        })

    url = u'http://www.openbeelden.nl/feeds/oai/'

    registry = MetadataRegistry()
    registry.registerReader('oai_oi', oai_oi_reader)
    client = Client(url, registry)

    for record in client.listRecords(metadataPrefix='oai_oi'):
        processItem(record)
コード例 #41
0
ファイル: metadata.py プロジェクト: taneliselin/oai-harvest
 def writeMetadata(self, metadata_prefix, element, metadata):
     """Serialize ``metadata`` into ``element``, falling back to
     ``self.defaultWriter`` when no writer is registered for the prefix."""
     try:
         return MetadataRegistry.writeMetadata(
             self, metadata_prefix, element, metadata)
     except KeyError as unregistered_prefix:
         try:
             return self.defaultWriter(element, metadata)
         except AttributeError:
             # No default writer configured: surface the original KeyError.
             raise unregistered_prefix
コード例 #42
0
ファイル: metadata.py プロジェクト: taneliselin/oai-harvest
 def readMetadata(self, metadata_prefix, element):
     """Parse ``element``, falling back to ``self.defaultReader`` when no
     reader is registered for the prefix."""
     try:
         return MetadataRegistry.readMetadata(
             self, metadata_prefix, element)
     except KeyError as unregistered_prefix:
         try:
             return self.defaultReader(element)
         except AttributeError:
             # No default reader configured: surface the original KeyError.
             raise unregistered_prefix
コード例 #43
0
def processItems():
    """Iterate the Open Beelden OAI-PMH feed and process every record."""
    # (field name, XPath leaf) pairs; note that 'extent' is harvested from
    # the repository's misspelled <oi:extend> element.
    field_leaves = [
        ('title', 'title'), ('alternative', 'alternative'),
        ('creator', 'creator'), ('subject', 'subject'),
        ('description', 'description'), ('abstract', 'abstract'),
        ('publisher', 'publisher'), ('contributor', 'contributor'),
        ('date', 'date'), ('type', 'type'), ('extent', 'extend'),
        ('medium', 'medium'), ('identifier', 'identifier'),
        ('source', 'source'), ('language', 'language'),
        ('references', 'references'), ('spatial', 'spatial'),
        ('attributionName', 'attributionName'),
        ('attributionURL', 'attributionURL'), ('license', 'license'),
    ]
    oai_oi_reader = MetadataReader(
        fields=dict((name, ('textList', 'oai_oi:oi/oi:%s/text()' % leaf))
                    for name, leaf in field_leaves),
        namespaces={
            'oi': 'http://www.openbeelden.nl/oai/',
            'oai_oi': 'http://www.openarchives.org/OAI/2.0/oai_dc/',
            'dc': 'http://purl.org/dc/elements/1.1/',
            'dcterms': 'http://purl.org/dc/terms',
        })

    url = u'http://www.openbeelden.nl/feeds/oai/'

    registry = MetadataRegistry()
    registry.registerReader('oai_oi', oai_oi_reader)
    client = Client(url, registry)

    for record in client.listRecords(metadataPrefix='oai_oi'):
        processItem(record)
コード例 #44
0
ファイル: metadata.py プロジェクト: bloomonkey/oai-harvest
 def readMetadata(self, metadata_prefix, element):
     """Read metadata for ``metadata_prefix``; unknown prefixes are handed
     to ``self.defaultReader`` when one exists."""
     try:
         return MetadataRegistry.readMetadata(self, metadata_prefix, element)
     except KeyError as key_error:
         try:
             fallback = self.defaultReader
         except AttributeError:
             # No fallback reader: re-raise the original lookup failure.
             raise key_error
         return fallback(element)
コード例 #45
0
    def iter_items(self, partition):
        """Yield crawlable items from ``partition``, an OAI-PMH endpoint.

        Each yielded tuple is (url, meta-dict, "html", priority, html-body)
        where the HTML is synthesized from the record's title/description.
        """
        registry = MetadataRegistry()
        registry.registerReader('oai_dc', oai_dc_reader)
        client = Client(partition, registry)

        for header, metadata, _ in client.listRecords(metadataPrefix='oai_dc'):
            if header.isDeleted():
                continue

            meta = metadata.getMap()

            # TODO: there are much validation and heuristics to be done here!

            # First Dublin Core identifier is assumed to be the item URL.
            url0 = (meta.get("identifier") or [None])[0]
            if not url0:
                continue

            title0 = (meta.get("title") or [""])[0].encode("utf-8")
            desc0 = (meta.get("description") or [""])[0].encode("utf-8")

            # TODO: validate that the url0 is not on another domain?!
            yield url0, {}, "html", 2, """
                <html><head><title>%s</title></head><body>%s</body></html>
            """ % (title0, desc0)
コード例 #46
0
ファイル: metadata.py プロジェクト: bloomonkey/oai-harvest
 def writeMetadata(self, metadata_prefix, element, metadata):
     """Write metadata for ``metadata_prefix``; unknown prefixes are handed
     to ``self.defaultWriter`` when one exists."""
     try:
         return MetadataRegistry.writeMetadata(self, metadata_prefix,
                                               element, metadata)
     except KeyError as key_error:
         try:
             fallback = self.defaultWriter
         except AttributeError:
             # No fallback writer: re-raise the original lookup failure.
             raise key_error
         return fallback(element, metadata)
コード例 #47
0
    def import_stage(self, harvest_object):
        """
        The import stage will receive a HarvestObject object and will be
        responsible for:
            - performing any necessary action with the fetched object (e.g
              create a CKAN package).
              Note: if this stage creates or updates a package, a reference
              to the package must be added to the HarvestObject.
              Additionally, the HarvestObject must be flagged as current.
            - creating the HarvestObject - Package relation (if necessary)
            - creating and storing any suitable HarvestObjectErrors that may
              occur.
            - returning True if everything went as expected, False otherwise.

        :param harvest_object: HarvestObject object
        :returns: True if everything went right, False if errors were found
        """
        # Do common tasks and then call different methods depending on what
        # kind of info the harvest object contains.
        self._set_config(harvest_object.job.source.config)
        ident = json.loads(harvest_object.content)
        registry = MetadataRegistry()
        registry.registerReader(self.metadata_prefix_value, oai_dc_reader)
        client = oaipmh.client.Client(harvest_object.job.source.url, registry)
        domain = ident['domain']
        group = Group.get(domain)  # Checked in gather_stage so exists.
        try:
            fetch_type = ident['fetch_type']
            if fetch_type == 'record':
                return self._fetch_import_record(
                    harvest_object, ident, client, group)
            elif fetch_type == 'set':
                return self._fetch_import_set(
                    harvest_object, ident, client, group)
            # This should not happen...
            log.error('Unknown fetch type: %s' % fetch_type)
        except Exception:
            # Guard against miscellaneous stuff. Probably plain bugs.
            # Also very rare exceptions we haven't seen yet.
            # BUG FIX: traceback.format_exc() takes a traceback limit, not an
            # exception object; passing the exception raises TypeError on Py3.
            log.debug(traceback.format_exc())
        return False
コード例 #48
0
	def setupOAIPMHConnection(self):
		oai_oi_reader = MetadataReader(
		    fields={
		    'title':       ('textList', 'oai_oi:oi/oi:title/text()'),
		    'alternative':       ('textList', 'oai_oi:oi/oi:alternative/text()'),
		    'creator':     ('textList', 'oai_oi:oi/oi:creator/text()'),
		    'subject':     ('textList', 'oai_oi:oi/oi:subject/text()'),
		    'description': ('textList', 'oai_oi:oi/oi:description/text()'),
		    'abstract': ('textList', 'oai_oi:oi/oi:abstract/text()'),
		    'publisher':   ('textList', 'oai_oi:oi/oi:publisher/text()'),
		    'contributor': ('textList', 'oai_oi:oi/oi:contributor/text()'),
		    'date':        ('textList', 'oai_oi:oi/oi:date/text()'),
		    'type':        ('textList', 'oai_oi:oi/oi:type/text()'),
		    'extent':        ('textList', 'oai_oi:oi/oi:extent/text()'),
		    'medium':        ('textList', 'oai_oi:oi/oi:medium/text()'),
		    'identifier':  ('textList', 'oai_oi:oi/oi:identifier/text()'),
		    'source':      ('textList', 'oai_oi:oi/oi:source/text()'),
		    'language':    ('textList', 'oai_oi:oi/oi:language/text()'),
		    'references':    ('textList', 'oai_oi:oi/oi:references/text()'),
		    'spatial':    ('textList', 'oai_oi:oi/oi:spatial/text()'),
		    'attributionName':    ('textList', 'oai_oi:oi/oi:attributionName/text()'),
		    'attributionURL':    ('textList', 'oai_oi:oi/oi:attributionURL/text()'),
		    'license':      ('textList', 'oai_oi:oi/oi:license/text()')
		    },

		    namespaces={
		    	'oai_oi': 'http://www.openbeelden.nl/feeds/oai/', #'http://www.openarchives.org/OAI/2.0/oai_oi/',
		    	'oi': 'http://www.openbeelden.nl/oai/'
		    }
		)

		URL = 'http://www.openbeelden.nl/feeds/oai/'

		#Initialize the OAI client
		self.registry = MetadataRegistry()
		self.registry.registerReader('oai_oi', oai_oi_reader)
		self.client = Client(URL, self.registry)

		#Test if the connection to the OAI-PMH provider works
		x = self.client.updateGranularity()
		x = self.client.identify()
		print 'identity %s' % x.repositoryName()
		print 'identity %s' % x.protocolVersion()
		print 'identity %s' % x.baseURL()

		"""
		for s in client.listSets():
			print s
		"""

		#initialize the OpenSKOSHandler
		self.openSKOSHandler = OpenSKOSHandler()
コード例 #49
0
    def __init__(self, session, configs, dbs, dbName):
        """Collect repository-level OAI-PMH properties for database dbName."""
        self.session = session
        self.protocolMap = configs[dbName]
        self.db = dbs[dbName]
        session.database = self.db.id
        # Get some generally useful stuff now
        self.baseURL = self.protocolMap.baseURL
        # Earliest datestamp in the database: scan for anything modified
        # after the UTC epoch.
        epoch = datetime.datetime.utcfromtimestamp(0)
        query = cqlparse('rec.lastModificationDate > "%s"' % (str(epoch)))
        try:
            term_list = self.db.scan(session, query, 1)
        except SRWDiagnostics.Diagnostic16:
            raise ConfigFileException(
                'Index map for rec.lastModificationDate required '
                'in protocolMap: %s'
                '' % self.db.get_path(session, 'protocolMap').id)
        else:
            try:
                stamp = term_list[0][0]
            except IndexError:
                # Something went wrong :( - fall back to the epoch
                self.earliestDatestamp = epoch
            else:
                # The index may store either the ISO "T" form or a
                # space-separated datetime; accept both.
                try:
                    self.earliestDatestamp = datetime.datetime.strptime(
                        stamp, '%Y-%m-%dT%H:%M:%S')
                except ValueError:
                    self.earliestDatestamp = datetime.datetime.strptime(
                        stamp, '%Y-%m-%d %H:%M:%S')

        self.repositoryName = self.protocolMap.title
        self.protocolVersion = self.protocolMap.version
        self.adminEmails = self.protocolMap.contacts
        # Check for deletion support
        recordStore = self.db.get_path(session, 'recordStore')
        supports_deletions = recordStore.get_setting(session, 'storeDeletions')
        # Cheshire3 cannot guarantee that deletions will persist
        self.deletedRecord = "transient" if supports_deletions else "no"
        # Finest level of granularity
        self.granularity = "YYYY-MM-DDThh:mm:ssZ"
        # Cheshire3 does not support compressions at this time
        self.compression = []
        self.metadataRegistry = OaiMetadataRegistry()
コード例 #50
0
ファイル: repository.py プロジェクト: Kihara-tony/Bookstore
    def __init__(self, url=None, **kwargs):
        """Configure the repository connection.

        Recognised keyword arguments: ``base_url``, ``oai_path``,
        ``oai_enabled``, ``sword_enabled``, ``metadata_registry`` and any
        ``sword_*``-prefixed options (forwarded to SwordService with the
        prefix stripped).  Remaining kwargs are passed to the OAI Client.

        :param url: deprecated; use base_url and oai_path instead.
        :raises ValueError: when no base_url can be determined.
        """
        self.base_url = kwargs.pop('base_url', None)
        self.oai_path = kwargs.pop('oai_path', None)

        self.oai_enabled = bool(kwargs.pop('oai_enabled', True))
        self.sword_enabled = bool(kwargs.pop('sword_enabled', False))

        if url is not None:
            warn(
                'The url paramater will not be supported in version 3, '
                'use base_url and oai_path instead', DeprecationWarning)

            # Derive whichever of base_url / oai_path is missing from the
            # legacy url argument.
            if (self.base_url and url.startswith(self.base_url)
                    and self.oai_path is None):
                self.oai_path = url.replace(self.base_url, '', 1).lstrip('/')
            elif not self.base_url:
                if self.oai_path is None:
                    self.oai_path = 'dspace-oai/request'
                if url.endswith(self.oai_path):
                    self.base_url = url[:-(len(self.oai_path) + 1)]

        if self.base_url is None:
            raise ValueError('base_url argument must be specified')

        if 'metadata_registry' not in kwargs:
            kwargs['metadata_registry'] = MetadataRegistry()
            kwargs['metadata_registry'].registerReader('mets',
                                                       dspace_mets_reader)

        if self.sword_enabled:
            skwargs = {'base_url': self.base_url}

            # BUG FIX: iterate over a snapshot of the keys — kwargs is
            # mutated by pop() inside the loop, which raises RuntimeError
            # ("dictionary changed size during iteration") on Python 3.
            for key in list(kwargs.keys()):
                if key.startswith('sword_'):
                    skwargs[key[6:]] = kwargs.pop(key)

            self.sword = SwordService(**skwargs)

        if self.oai_enabled:
            self.oai = Client('/'.join((
                self.base_url,
                self.oai_path,
            )), **kwargs)

        self.identifier_base = self._extractIdentifierBase(self.base_url)
コード例 #51
0
def scrape(start=START, end=END, set=SET_THESIS, type='Thesis'):
    """
    Create an OAI-PMH client, gather metadata and output it.

    """
    total = num = 0
    msg = "Fetching records between " + str(start) + " and " + str(end)
    sys.stderr.write(msg + "\n")

    #
    # Set up metadata readers
    #
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    registry.registerReader('qdc', qdc_reader)
    # registry.registerReader('rdf', rdf_reader)   # no reader yet
    # registry.registerReader('ore', ore_reader)   # no reader yet
    # registry.registerReader('mets', mets_reader) # no reader yet

    client = Client(URL, registry)
    records = client.listRecords(metadataPrefix='qdc',
                                 from_=start,
                                 until=end,
                                 set=set)
    for (h, m, a) in records:
        print h, m, a
        if not m:
            sys.stderr.write("o")
            continue
        total = total + 1

        handle = m.getField('identifier')
        if not handle:
            sys.stderr.write("Record without a handle.\n")
            continue

        r = dict({'handle': handle[0]})
        for key in qdc_reader._fields.keys():
            r[key] = m.getField(key)
        RECORDS.append(r)

        sys.stderr.write('.')
        sys.stderr.flush()
        num = num + 1
    msg = "\nCollected " + str(num) + " records, out of " + str(total)
    sys.stderr.write('\n' + msg + '\n')

    if options.store:
        pickle.dump(RECORDS, open(options.store, "wb"))
コード例 #52
0
    def __init__(self, dbName):
        """Collect repository-level OAI-PMH properties for database dbName."""
        global configs, dbs, session
        self.protocolMap = configs[dbName]
        self.db = dbs[dbName]
        session.database = self.db.id
        # get some generally useful stuff now
        self.baseURL = self.protocolMap.baseURL
        # Earliest datestamp: look for anything modified after the UTC epoch.
        epoch = datetime.datetime.utcfromtimestamp(0)
        query = cqlparse('rec.lastModificationDate > "%s"' % (str(epoch)))
        try:
            matches = self.db.scan(session, query, 1)
        except SRWDiagnostics.Diagnostic16:
            raise ConfigFileException(
                'Index map for rec.lastModificationDate required in protocolMap: %s'
                % self.db.get_path(session, 'protocolMap').id)
        else:
            try:
                stamp = matches[0][0]
            except IndexError:
                # something went wrong :( - use the epoch
                self.earliestDatestamp = epoch
            else:
                # Accept both ISO "T" and space-separated datetime forms.
                try:
                    self.earliestDatestamp = datetime.datetime.strptime(
                        stamp, '%Y-%m-%dT%H:%M:%S')
                except ValueError:
                    self.earliestDatestamp = datetime.datetime.strptime(
                        stamp, '%Y-%m-%d %H:%M:%S')

        self.repositoryName = self.protocolMap.title
        self.protocolVersion = self.protocolMap.version
        self.adminEmails = self.protocolMap.contacts
        # Cheshire3 does not support deletions at this time
        self.deletedRecord = "no"
        # finest level of granularity
        self.granularity = "YYYY-MM-DDThh:mm:ssZ"
        # Cheshire3 does not support compressions at this time
        self.compression = []
        self.metadataRegistry = OaiMetadataRegistry()
コード例 #53
0
 def _initialise_client(self, url):
     """Return an OAI-PMH Client for ``url`` able to parse oai_dc and ore."""
     reader_registry = MetadataRegistry()
     reader_registry.registerReader('oai_dc', oai_dc_reader)
     reader_registry.registerReader('ore', oai_ore_reader)
     logging.info('Initialising OAI client with URL [%s]', url)
     return Client(url, reader_registry)
コード例 #54
0
def indexCollection(URL, url_base, metadata_prefix, collection, action):
    """Harvest one OAI-PMH collection and (re)index it into Elasticsearch.

    :param URL: OAI-PMH endpoint to harvest from.
    :param url_base: site base URL used to build each record's IIIF URL.
    :param metadata_prefix: metadata prefix to request (e.g. 'oai_dc').
    :param collection: OAI set spec naming the collection to harvest.
    :param action: when equal to 'reindex', the index is deleted and
        recreated with an explicit mapping before the bulk load.
    :returns: the string "success".
    """
    # Pull data from the OAI endpoint.
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(URL, registry, force_http_get=True)

    harvested_data = []
    for record in client.listRecords(metadataPrefix=metadata_prefix,
                                     set=collection):
        if not record[0].isDeleted():
            fields = record[1].getMap()
            # Split the semicolon-delimited subject string into a list and
            # rename the field to 'subjects' for the index mapping.
            if fields['subject']:
                fields['subjects'] = fields['subject'][0].split(';')
                del fields['subject']
            fields['set'] = record[0].setSpec()
            # OAI identifiers look like oai:<domain>:<id>; keep the id part.
            identifier = record[0].identifier().split(':')[2]
            fields[
                'image_url_base'] = url_base + '/digital/iiif/' + identifier + '/'
            harvested_data.append(fields)

    # BUG FIX: the original used "action is 'reindex'", an identity test that
    # only succeeds when CPython happens to intern both strings; equality is
    # the correct comparison (and "is" with a literal warns on Python 3.8+).
    if action == 'reindex':
        # Ignore 400/404 so a missing index does not abort the reindex.
        es.indices.delete(index='digital_collection_recs', ignore=[400, 404])

        mapping = {
            "mappings": {
                "_doc": {
                    "properties": {
                        "title": {
                            "type": "text"
                        },
                        "creator": {
                            "type": "text"
                        },
                        "subjects": {
                            "type": "text"
                        },
                        "description": {
                            "type": "text"
                        },
                        "publisher": {
                            "type": "text"
                        },
                        "contributor": {
                            "type": "text"
                        },
                        "date": {
                            "type": "text"
                        },
                        "type": {
                            "type": "text",
                            "fielddata": "true"
                        },
                        "format": {
                            "type": "text",
                            "fielddata": "true"
                        },
                        "identifier": {
                            "type": "text"
                        },
                        "source": {
                            "type": "text"
                        },
                        "language": {
                            "type": "text",
                            "fielddata": "true"
                        },
                        "relation": {
                            "type": "text"
                        },
                        "coverage": {
                            "type": "text"
                        },
                        "rights": {
                            "type": "text"
                        },
                        "set": {
                            "type": "text",
                            "fielddata": "true"
                        },
                        "image_url_base": {
                            "type": "text"
                        }
                    }
                }
            }
        }
        es.indices.create(index='digital_collection_recs', body=mapping)

    helpers.bulk(es,
                 harvested_data,
                 index='digital_collection_recs',
                 doc_type='_doc')

    return "success"
コード例 #55
0
ファイル: views.py プロジェクト: barszczmm/wolnelektury
            'type',
            'format',
            'identifier',
            'source',
            'language',
            'relation',
            'coverage',
            'rights',
    ]:
        for value in map.get(name, []):
            e = SubElement(element, nsdc(name), nsmap=nsmap)
            e.text = value

    for name in ['hasPart', 'isPartOf']:
        for value in map.get(name, []):
            e = SubElement(element, nsdcterms(name), nsmap=nsmap)
            e.text = value


# Register the writers used to serialize OAI-PMH responses: plain Dublin
# Core ('oai_dc') and Qualified Dublin Core ('qdc').
metadata_registry = MetadataRegistry()
metadata_registry.registerWriter('oai_dc', fbc_oai_dc_writer)
metadata_registry.registerWriter('qdc', qdc_writer)

# Module-level OAI-PMH server instance shared by all incoming requests.
server = ServerBase(Catalogue(metadata_registry), metadata_registry,
                    {'topxsi': NS_XSI})


def oaipmh(request):
    """Django view: delegate the OAI-PMH query string to the shared server
    and return its XML response."""
    xml_response = server.handleRequest(request.GET)
    return HttpResponse(xml_response, content_type='application/xml')
コード例 #56
0
def harvest(metadata_set, dest_folder, log_file, content_type,
            from_date, until_date):
    """Harvest EFG records for one OAI-PMH set and save them as XML files.

    Records are downloaded from the D4Science EFG endpoint, kept only when
    tagged with the "IMediaCities" keyword (and, optionally, when their item
    content type matches ``content_type``), and each surviving record is
    written to ``dest_folder`` as ``<set>_<sourceID>_<timestamp>.xml``.
    A JSON report of the run is written to ``log_file``.

    NOTE(review): relies on module-level helpers imported elsewhere in this
    file (log, Client, MetadataRegistry, oai_dc_reader, NoRecordsMatchError,
    tag, parse_date, etree, codecs) — confirm their origin before reuse.
    """

    #############################
    # ### FILESYSTEM CHECKS ### #
    #############################
    try:
        if not os.path.isdir(dest_folder):
            os.makedirs(dest_folder)
        # Verify write permission inside the folder:
    except BaseException as e:
        log.error(str(e))
        log.exit("Unable to create destination folder: %s" % dest_folder)

    # Probe writability by creating and removing a scratch directory.
    try:
        test_path = os.path.join(dest_folder, '__test_permissions__')
        os.makedirs(test_path)
        os.rmdir(test_path)
    except BaseException as e:
        log.error(str(e))
        log.exit("Unable to use destination folder: %s" % dest_folder)

    # Ensure the report file can be opened for writing before any work.
    try:
        log_handle = open(log_file, 'a+')
        log_handle.close()
    except BaseException as e:
        log.error(str(e))
        log.exit("Unable to create log_file: %s" % log_file)

    #################################
    # ### OAI-PMH CONFIGURATION ### #
    #################################
    URL = 'https://node0-d-efg.d4science.org/efg/mvc/oai/oai.do'
    metadata_prefix = 'efg'

    ###################################
    # ### OPEN OAI-PMH CONNECTION ### #
    ###################################
    registry = MetadataRegistry()
    registry.registerReader(metadata_prefix, oai_dc_reader)

    #print ("URL=" + str(URL))

    client = Client(URL, registry)

    ####################################
    # ### CHECK IF THIS SET EXISTS ### #
    ####################################
    set_found = False
    for s in client.listSets():
        if metadata_set == s[0]:
            set_found = True

    if not set_found:
        log.exit("Unable to find this set: %s" % metadata_set)

    #############################
    # ### RETRIEVE METADATA ### #
    #############################

    if from_date is not None:
        from_date = parse_date(from_date)
        if from_date is None:
            log.exit("Unable to convert from date")

    if until_date is not None:
        until_date = parse_date(until_date)
        if until_date is None:
            log.exit("Unable to convert until date")

    # Per-run counters and lists of problem records for the final report.
    report_data = {
        'downloaded': 0,
        'filtered': 0,
        'saved': 0,
        'saved_files': [],
        'missing_sourceid': [],
        'wrong_content_type': []
    }
    # Millisecond timestamp shared by every filename saved in this run.
    timestamp = int(1000 * time.time())
    log.info("Retrieving records for %s..." % metadata_set)
    try:
        records = client.listRecords(
            metadataPrefix=metadata_prefix,
            set=metadata_set,
            from_=from_date,
            until=until_date)
    except NoRecordsMatchError as e:
        log.exit(e)

    log.info("Records retrieved, extracting...")
    try:

        for record in records:
            element = record[1].element()
            # Obtained eTree is based on namespaced XML
            # Read: 19.7.1.6. Parsing XML with Namespaces
            # https://docs.python.org/2/library/xml.etree.elementtree.html

            # find(match)
            # Finds the first subelement matching match.
            #   match may be a tag name or path.
            #   Returns an element instance or None.

            # findall(match)
            # Finds all matching subelements, by tag name or path.
            #   Returns a list containing all matching elements
            #   in document order.

            report_data['downloaded'] += 1

            # Progress indicator: a dot every 100 records, a summary line
            # every 5000.
            if report_data['downloaded'] % 100 == 0:
                print('.', end='', flush=True)

                if report_data['downloaded'] % 5000 == 0:
                    print(
                        ' %s downloaded - %s saved' % (
                            report_data['downloaded'],
                            report_data['saved']
                        ), flush=True)

            efgEntity = element.find(tag("efgEntity"))
            if efgEntity is None:
                # log.warning("efgEntity not found, skipping record")
                continue
            # A record is either an audiovisual creation or a non-AV one;
            # both carry the manifestation/source/keywords/title we need.
            avcreation = efgEntity.find(tag("avcreation"))
            nonavcreation = efgEntity.find(tag("nonavcreation"))

            if avcreation is not None:
                manifestation = avcreation.find(tag("avManifestation"))
                recordSource = avcreation.find(tag("recordSource"))
                keywords = avcreation.findall(tag("keywords"))
                title_el = avcreation.find(tag("identifyingTitle"))
                title = (title_el.text
                         if title_el is not None
                         else "Unknown title")
            elif nonavcreation is not None:
                manifestation = nonavcreation.find(tag("nonAVManifestation"))
                recordSource = nonavcreation.find(tag("recordSource"))
                keywords = nonavcreation.findall(tag("keywords"))
                title_el = nonavcreation.find(tag("title"))
                title = (title_el.find(tag("text")).text
                         if title_el is not None
                         else "Unknown title")
            else:
                title = "Unknown title"
                # log.warning("(non)avcreation not found, skipping record")
                continue

            # Keep only records tagged with the IMediaCities keyword.
            filter_keyword = "IMediaCities"
            is_good = False
            for keyword in keywords:
                term = keyword.find(tag("term"))
                if term.text == filter_keyword:
                    is_good = True
                    break

            if not is_good:
                continue

            report_data['filtered'] += 1

            if manifestation is None:
                report_data['missing_sourceid'].append(title)
                # log.warning("avManifestation not found, skipping record")
                continue

            # Optional filter on the manifestation item's content type.
            if content_type is not None:
                content_type = content_type.lower()

                item = manifestation.find(tag("item"))
                if item is None:
                    # missing <item> => type cannot be found
                    report_data['wrong_content_type'].append(title)
                    continue

                item_type = item.find(tag("type"))
                if item_type is None:
                    # missing <type>
                    report_data['wrong_content_type'].append(title)
                    continue

                if item_type.text.lower() != content_type:
                    # wrong type
                    report_data['wrong_content_type'].append(title)
                    continue



            # NOTE: the sourceID must be taken from the recordSource that
            #       sits under avcreation/nonavcreation and NOT under
            #       avManifestation/nonAVManifestation

            #recordSource = manifestation.find(tag("recordSource"))
            if recordSource is None:
                report_data['missing_sourceid'].append(title)
                # log.warning("recordSource not found, skipping record")
                continue

            sourceID = recordSource.find(tag("sourceID"))
            if sourceID is None:
                report_data['missing_sourceid'].append(title)
                # log.warning("sourceID not found, skipping record")
                continue

            content = etree.tostring(efgEntity, pretty_print=False)

            # id_text = urllib.parse.quote_plus(sourceID.text.strip())
            # replace non alpha-numeric characters with a dash
            id_text = re.sub(r'[\W_]+', '-', sourceID.text.strip())
            # end of cinzia's changes

            filename = "%s_%s_%s.xml" % (
                metadata_set,
                id_text,
                timestamp
            )
            filepath = os.path.join(dest_folder, filename)
            # with open(filepath, 'wb') as f:
            with codecs.open(filepath, 'wb', "utf-8") as f:
                f.write(content.decode('utf-8'))
            # OLD
            #with codecs.open(filepath, 'wb', "utf-8") as f:
            #    f.write(html.unescape(content.decode('utf-8')))

            report_data['saved'] += 1
            report_data['saved_files'].append(filename)

    except NoRecordsMatchError as e:
        log.warning("No more records after filtering?")
        log.warning(e)

        # ###################
        # Write report file
        # ###################

        # the procedure writes a report file containing the results
        #     of the harvesting:
        # the list of records that do not contain the record ID
        #     (by writing the content of the element title)

    with open(log_file, 'w+') as f:
        json.dump(report_data, f)

    f.close()

    # Just to close previous dot line
    print("")

    log.info("""

%s records from set [%s] downloaded
open log file [%s] for details
""" % (report_data['saved'], metadata_set, log_file)
    )
def get_names(dataname):
    """Harvest creator/contributor names from the JHN OAI-PMH endpoint.

    Lists every record of the OAI set *dataname* (metadata prefix ``edm``)
    from ``https://data.jhn.ngo/oai`` and collects the first ``dc:creator``
    and first ``dc:contributor`` of each record, paired with the record's
    ``rdf:about`` identifier.

    :param dataname: name of the OAI set to harvest
        (e.g. ``'AIUJE1_MARC21'``).
    :return: list of ``[name, objectId]`` pairs, one entry per
        creator/contributor found.

    NOTE(review): requires network access to the JHN endpoint; any
    ``oaipmh`` error raised by ``listRecords`` propagates to the caller.
    """
    record_prefix = "rdf:RDF/edm:ProvidedCHO"
    # Modify/add XPath mappings to get other fields and other objects
    # (agent, place etc.)

    edm_reader = MetadataReader(
        fields={
            'title': ('textList', record_prefix + '/dc:title/text()'),
            'creator': ('textList', record_prefix + '/dc:creator/text()'),
            'subject': ('textList', record_prefix + '/dc:subject/text()'),
            'description':
            ('textList', record_prefix + '/dc:description/text()'),
            'publisher': ('textList', record_prefix + '/dc:publisher/text()'),
            'contributor':
            ('textList', record_prefix + '/dc:contributor/text()'),
            'date': ('textList', record_prefix + '/dc:date/text()'),
            'type': ('textList', record_prefix + '/dc:type/text()'),
            'format': ('textList', record_prefix + '/dc:format/text()'),
            'identifier':
            ('textList', record_prefix + '/dc:identifier/text()'),
            'source': ('textList', record_prefix + '/dc:source/text()'),
            'language': ('textList', record_prefix + '/dc:language/text()'),
            'relation': ('textList', record_prefix + '/dc:relation/text()'),
            'coverage': ('textList', record_prefix + '/dc:coverage/text()'),
            'rights': ('textList', record_prefix + '/dc:rights/text()'),
            'spatial': ('textList', record_prefix + '/dc:spatial/text()'),
            'objectId': ('textList', record_prefix + '/@rdf:about'),
        },
        namespaces={
            'oai_dc': 'http://www.openarchives.org/OAI/2.0/oai_dc/',
            'dc': 'http://purl.org/dc/elements/1.1/',
            'dcterms': 'http://purl.org/dc/terms/',
            'dct': 'http://purl.org/dc/terms/',
            'edm': 'http://www.europeana.eu/schemas/edm/',
            'foaf': 'http://xmlns.com/foaf/0.1/',
            'owl': 'http://www.w3.org/2002/07/owl#',
            'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
            'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
            'skos': 'http://www.w3.org/2004/02/skos/core#',
            'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
            'ore': 'http://www.openarchives.org/ore/terms/'
        })

    names = []
    identifier = []

    # BUG FIX: the harvesting loop used to sit under an
    # ``if __name__ == "__main__":`` guard *inside* this function, so the
    # function returned an empty list whenever the module was imported.
    # The guard is removed: callers always get the harvested data.
    URL = 'https://data.jhn.ngo/oai'

    registry = MetadataRegistry()
    registry.registerReader('edm', edm_reader)
    client = Client(URL, registry)
    # To harvest a specific dataset, "set" selects it (e.g. 'AIUJE1_MARC21').
    for record in client.listRecords(metadataPrefix='edm', set=dataname):
        # record is (header, metadata, about); getMap() yields the
        # field-name -> textList mapping declared in edm_reader.
        output = record[1].getMap()

        if output['creator']:
            names.append([output['creator'][0]])
            identifier.append(
                [output['creator'][0], output['objectId'][0]])

        if output['contributor']:
            names.append([output['contributor'][0]])
            identifier.append(
                [output['contributor'][0], output['objectId'][0]])

    # Debug ``print(names)`` removed: dumping the full name list to stdout
    # was leftover development output.
    return identifier
コード例 #58
0
ファイル: harvester.py プロジェクト: Dolpic/ckanext-oaipmh
 def _create_metadata_registry(self):
     """Return a MetadataRegistry able to parse oai_dc and oai_ddi records."""
     readers = (
         ('oai_dc', oai_dc_reader),
         ('oai_ddi', oai_ddi_reader),
     )
     registry = MetadataRegistry()
     for prefix, reader in readers:
         registry.registerReader(prefix, reader)
     return registry