Example No. 1
    def list_oai_collections(self, community):
        """ Retrieve the header data for each record in the current community repo """

        try:
            registry = MetadataRegistry()
            registry.registerReader('oai_dc', oai_dc_reader)
            client = Client(community.repository.base_url, registry)
            records = client.listIdentifiers(
                metadataPrefix='oai_dc', set=community.identifier)
        except:
            return


        """ Filter records to build list of collections in the community set """
        community_collections = set()
        for i in records:
            for j in i.setSpec():
                if j[:3] == 'col':
                    community_collections.add(j)
    
        print len(community_collections)
        """ Build collection tuples (identifier, name) """
        for i in community_collections:
            # print i
            # print community_collections
            
            set_data = []
            set_data.append(i)  # Store identifier
            set_data.append('Collection: %s'%i)  # Store human readable name
            # print set_data
            self.collections.append(set_data)
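
Every example below follows the same pyoai pattern: build a MetadataRegistry, register a reader under a metadata prefix, construct a Client for the endpoint, then call one of the protocol verbs. A minimal self-contained sketch of that pattern (the endpoint URL is only a placeholder):

from oaipmh.client import Client
from oaipmh.metadata import MetadataRegistry, oai_dc_reader

registry = MetadataRegistry()
registry.registerReader('oai_dc', oai_dc_reader)
client = Client('https://example.org/oai', registry)  # placeholder endpoint
for header in client.listIdentifiers(metadataPrefix='oai_dc'):
    print(header.identifier())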
Example No. 2
def insertAll(time, time2):
    registry = MetadataRegistry()
    registry.registerReader('arXivRaw', arXivRaw_reader)
    client = Client(URL, registry)
    client.updateGranularity()
    records = client.listRecords(metadataPrefix='arXivRaw', from_=time, until=time2)
    errors = 0
    for a in records:
        try:
            title = '\n'.join(a[1]['title'])
            sr2 = str(' '.join(a[1]['categories']).replace('-','_')).split(' ')
            abstract = '\n'.join(a[1]['abstract'])
            url = 'http://arxiv.org/abs/' + a[1]['id'][0]
            date = datetime.strptime(a[1]['created'][0], '%a, %d %b %Y %H:%M:%S %Z')
            authors = a[1]['authors'][0]# '; '.join(a[1]['keynames'])
            abstract = abstract + '\nBy: ' + authors + '\nIn: ' + ', '.join(sr2)
            print title
            print sr2
            print abstract
            print url
            print date
            print authors
            insert(title + ' (' + authors + ')', str("fullarxiv"), url, abstract, date=date, cross_srs=sr2)
        except:
            print 'ERROR'
            print a
            errors = errors+1
    print 'Completed with %s errors' % errors
Example No. 3
def arxiv_oai_scraper(subject, start, end, sleep_time=0):

    base_url = "http://export.arxiv.org/oai2"
    output = list()

    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(base_url, registry)
    client.updateGranularity()

    records = client.listRecords(metadataPrefix='oai_dc', set="{}".format(subject), from_=start, until=end)

    for _, md, _ in records:

        # print md.getField("title")
        # checks for the case in 2010 when there is no title for something
        if md is not None:

            txt_dict = {"title": md["title"],
                    "abstract": md["description"],
                    "date": md["date"],
                    "subject": md["subject"],
                    "url": md["identifier"],
                    "authors": md['creator']}

            output.append(txt_dict)

        time.sleep(sleep_time)

    return output
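
A hypothetical call to the scraper above, harvesting one month of the "cs" set (the dates are placeholders):

from datetime import datetime

docs = arxiv_oai_scraper("cs", datetime(2016, 1, 1), datetime(2016, 2, 1), sleep_time=1)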
Example No. 4
    def __init__(
        self, base_url, metadata_registry=None, applicationContext=None, dayGranularity=False, credentials=None
    ):
        Client.__init__(self, base_url, metadata_registry, credentials)
        SwissbibPreImportProcessor.__init__(self, applicationContext)

        self._day_granularity = dayGranularity
        # self.writeContext = writeContext

        # what is different here compared to Aleph!
        if self.context.getConfiguration().getIteratorOAIStructure() is not None:
            self.pIterSingleRecord = re.compile(
                self.context.getConfiguration().getIteratorOAIStructure(), re.UNICODE | re.DOTALL | re.IGNORECASE
            )
        else:
            self.pIterSingleRecord = re.compile("<record>.*?</record>", re.UNICODE | re.DOTALL | re.IGNORECASE)
        # GH: 16.10.2015 this works for Nebis because we are looking for the outer 'shell' of all <record>...</record> not qualified with additional namespaces.
        # we can use this for deleted as well as for full records. Compare example in exampleContentStructures/alma/deletedAndUpdatedRecords.xml
        # with Aleph this isn't as easy...
        # self.pIterSingleRecordNebis = re.compile('<record>.*?</record>',re.UNICODE | re.DOTALL | re.IGNORECASE)

        self.pResumptionToken = re.compile(
            "<resumptionToken.*?>(.{1,}?)</resumptionToken>", re.UNICODE | re.DOTALL | re.IGNORECASE
        )
        self.harvestingErrorPattern = re.compile(
            "(<error.*?>.*?</error>|<html>.*?HTTP.*?Status.*?4\d\d)", re.UNICODE | re.DOTALL | re.IGNORECASE
        )
Example No. 5
    def scrape(self):
        raise Exception("not finished")
        registry = MetadataRegistry()
        registry.registerReader('oai_dc', oai_dc_reader)
        url = self.setting('pmh-endpoint')
        client = Client(url, registry)

        print "  OAI Repository", url
        print "  Available sets:"
        for s in client.listSets():
            print "   ", s

        oai_set = self.setting('set')
        oai_from = self.setting('from')
        oai_until = self.setting('until')

        kwargs = {}

        if oai_set:
            kwargs['set'] = oai_set

        if oai_from is not None:
            date_args = [int(arg) for arg in oai_from.split("-")]
            kwargs['from_'] = datetime.datetime(*date_args)

        if oai_until is not None:
            date_args = [int(arg) for arg in oai_until.split("-")]
            kwargs['until'] = datetime.datetime(*date_args)

        records = [r for r in client.listRecords(metadataPrefix='oai_dc', **kwargs)]

        data_filepath = os.path.join(self.work_dir(), self.setting('data-file'))
        with open(data_filepath, 'wb') as f:
            print "  pickling", len(records), "records"
            pickle.dump(records, f)
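
The date handling above splits an ISO-style "YYYY-MM-DD" setting and unpacks the parts into datetime.datetime. The same idea in isolation, with a placeholder date string:

import datetime

date_args = [int(arg) for arg in "2015-03-01".split("-")]
assert datetime.datetime(*date_args) == datetime.datetime(2015, 3, 1)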
Example No. 6
 def getClient(self):
     if self.format == 'citeproc':
         return FakeOaiClientForCrossref()
     registry = MetadataRegistry()
     client = Client(self.url, registry)
     client.get_method = self.get_method
     client._day_granularity = self.day_granularity
     return client
Example No. 7
def list_sets(target):
    if target is not None:
        client = Client(target['url'], registry)
        setspecs = client.listSets()
        results = []
        if setspecs is not None:
            for setspec in setspecs:
                results.append(convert_setspec(setspec))
        return results
Example No. 8
def list_metadata_formats(target, identifier):
    if target is not None:
        client = Client(target['url'], registry)
        metadata_formats = client.listMetadataFormats(identifier=identifier)
        results = []
        if metadata_formats is not None:
            for metadata_format in metadata_formats:
                results.append(convert_metadata_formats(metadata_format))
        return results
Example No. 9
def list_identifiers(target, date_from, date_until, setspec):
    if target is not None:
        client = Client(target['url'], registry)
        headers = client.listIdentifiers(metadataPrefix=target['metadata_prefix'], from_=date_from, until=date_until, set=setspec)
        results = []
        if headers is not None:
            for header in headers:
                results.append(convert_header(header))
        return results
Example No. 10
 def _listRecords(self, baseUrl, metadataPrefix="oai_dc", **kwargs):
     # Generator to yield records from baseUrl in the given metadataPrefix
     # Add metadataPrefix to args
     kwargs['metadataPrefix'] = metadataPrefix
     client = Client(baseUrl, metadata_registry)
     # Check server timestamp granularity support
     client.updateGranularity()
     for record in client.listRecords(**kwargs):
         yield record
Example No. 11
def list_records(target, date_from, date_until, setspec):
    logging.debug("list_records")
    if target is not None:
        client = Client(target['url'], registry)
        # todo : clean this, find simplified cases
        if date_from is not None and date_until is not None and setspec is not None:
            records = client.listRecords(metadataPrefix=target['metadata_prefix'], from_=date_from, until=date_until, set=setspec)
        elif date_from is not None and date_until is not None and setspec is None:
            records = client.listRecords(metadataPrefix=target['metadata_prefix'], from_=date_from, until=date_until)
        elif date_from is not None and date_until is None and setspec is not None:
            records = client.listRecords(metadataPrefix=target['metadata_prefix'], from_=date_from, set=setspec)
        elif date_from is None and date_until is not None and setspec is not None:
            records = client.listRecords(metadataPrefix=target['metadata_prefix'], until=date_until, set=setspec)
        elif date_from is not None and date_until is None and setspec is None:
            records = client.listRecords(metadataPrefix=target['metadata_prefix'], from_=date_from)
        elif date_from is None and date_until is not None and setspec is None:
            records = client.listRecords(metadataPrefix=target['metadata_prefix'], until=date_until)
        elif date_from is None and date_until is None and setspec is not None:
            records = client.listRecords(metadataPrefix=target['metadata_prefix'], set=setspec)
        elif date_from is None and date_until is None and setspec is None:
            records = client.listRecords(metadataPrefix=target['metadata_prefix'])

        if records is not None:
            for record in records:
                yield convert_record(record, target['metadata_prefix'], target['title'])
Example No. 12
def list_records(target, date_from, date_until, setspec):
    if target is not None:
        client = Client(target['url'], registry)
        # todo : clean this, find simplified cases
        if date_from is not None and date_until is not None and setspec is not None:
            records = client.listRecords(metadataPrefix=target['metadata_prefix'], from_=date_from, until=date_until, set=setspec)
        elif date_from is not None and date_until is not None and setspec is None:
            records = client.listRecords(metadataPrefix=target['metadata_prefix'], from_=date_from, until=date_until)
        elif date_from is not None and date_until is None and setspec is not None:
            records = client.listRecords(metadataPrefix=target['metadata_prefix'], from_=date_from, set=setspec)
        elif date_from is None and date_until is not None and setspec is not None:
            records = client.listRecords(metadataPrefix=target['metadata_prefix'], until=date_until, set=setspec)
        elif date_from is not None and date_until is None and setspec is None:
            records = client.listRecords(metadataPrefix=target['metadata_prefix'], from_=date_from)
        elif date_from is None and date_until is not None and setspec is None:
            records = client.listRecords(metadataPrefix=target['metadata_prefix'], until=date_until)
        elif date_from is None and date_until is None and setspec is not None:
            records = client.listRecords(metadataPrefix=target['metadata_prefix'], set=setspec)
        elif date_from is None and date_until is None and setspec is None:
            records = client.listRecords(metadataPrefix=target['metadata_prefix'])

        results = []
        if records is not None:
            for record in records:
                results.append(convert_record(record, target['metadata_prefix'], target['title']))
        return results
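
Both variants above carry a "todo: clean this" comment; the eight branches collapse into one call if the keyword arguments are built incrementally. A minimal sketch (the function name is hypothetical; it assumes the same `registry` global and `target` dict as the examples above):

def list_records_simplified(target, date_from=None, date_until=None, setspec=None):
    # Hypothetical simplification: build kwargs once instead of branching.
    if target is None:
        return []
    client = Client(target['url'], registry)
    kwargs = {'metadataPrefix': target['metadata_prefix']}
    if date_from is not None:
        kwargs['from_'] = date_from
    if date_until is not None:
        kwargs['until'] = date_until
    if setspec is not None:
        kwargs['set'] = setspec
    return [convert_record(record, target['metadata_prefix'], target['title'])
            for record in client.listRecords(**kwargs)]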
Example No. 13
def pull_data(source):
    list_of_records = []
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    # Get list of public experiments at the source
    client = Client(source
                    + "/apps/oaipmh/?verb=ListRecords&metadataPrefix=oai_dc", registry)
    try:
        exps_date = []
        exps_metadata = []
        for (header, meta, extra) in client.listRecords(metadataPrefix='oai_dc'):
            exps_date.append(str(header.datestamp()))
            exps_metadata.append(meta)
            logger.debug('Date=%s' % header.datestamp())

    except AttributeError as e:
        msg = "Error reading experiment %s" % e
        logger.error(msg)
        raise OAIPMHError(msg)
    except error.NoRecordsMatchError as e:
        msg = "no public records found on source %s" % e
        logger.warn(msg)
        return

    exp_counter = 0
    for exp_metadata in exps_metadata:
        user_id = exp_metadata.getField('creator')[0]
        user_profile = json.loads(_get_user(source, user_id))
        data_tobe_indexed = dict(user_profile)
        data_tobe_indexed['user_id'] = user_id

        exp_id = exp_metadata.getField('identifier')[0]
        description = exp_metadata.getField('description')[0]
        title = exp_metadata.getField('title')[0]
        if settings.EXPERIMENT_PATH[0] == '/':
            settings.EXPERIMENT_PATH = settings.EXPERIMENT_PATH[1:]
        experiment_url = os.path.join(source,
                                      settings.EXPERIMENT_PATH % exp_id)

        data_tobe_indexed['experiment_id'] = exp_id
        data_tobe_indexed['experiment_title'] = title
        data_tobe_indexed['experiment_description'] = description
        data_tobe_indexed['experiment_url'] = experiment_url
        data_tobe_indexed['id'] = experiment_url
        data_tobe_indexed['experiment_date'] = exps_date[exp_counter]
        exp_counter += 1
        for k, v in data_tobe_indexed.items():
            logger.debug('%s = %s' % (k, v))
        logger.debug('')
        list_of_records.append(json.dumps(data_tobe_indexed))

    return list_of_records
Example No. 14
def test(request):
	URL = 'http://www.kulturarv.dk/ffrepox/OAIHandler'
	registry = MetadataRegistry()
	registry.registerReader('oai_dc', oai_dc_reader)
	client = Client(URL, registry)
	identifyResponse = client.identify()

	print dir(identifyResponse)
	#for record in client.listRecords(metadataPrefix='oai_dc'):
	#	result += record
	return HttpResponse(identifyResponse.repositoryName())
Example No. 15
    def harvest_oai_collection_records(self, collection):
        records = []
        try:
            registry = MetadataRegistry()
            registry.registerReader('oai_dc', oai_dc_reader)
            client = Client(collection.community.repository.base_url, registry)
            records = client.listRecords(
                metadataPrefix='oai_dc', set=collection.identifier)
        except:
            return records

        return records
Example No. 16
    def clean(self):
        cleaned_data = super(CreateRepositoryForm, self).clean()
        try:
            registry = MetadataRegistry()
            registry.registerReader('oai_dc', oai_dc_reader)
            client = Client(cleaned_data.get('base_url'), registry)
            server = client.identify()
            # set the repository name apply to model instance when saved.
            cleaned_data['name'] = server.repositoryName()
        except:
            raise ValidationError('Repository base url is invalid.')

        return cleaned_data
Example No. 17
def get_client(url, transforms):
    transforms = fix_transforms(transforms)
    registry = MetadataRegistry()
    c = Client(url, registry)
    metadata = c.listMetadataFormats()
    metadata[0] = [
        'fbb', 'http://www.kulturarv.dk/fbb/fbb.xsd', 'http://www.kulturarv.dk/fbb']
    namespaces = dict((x[0], x[2]) for x in metadata)
    fields = dict((transform['field'], ('textList', transform['path']))
                  for transform in transforms)
    namespace = metadata[0][0]
    print namespaces, fields
    registry.registerReader(namespace, MetadataReader(fields=fields, namespaces=namespaces))
    return c, namespace
Example No. 18
def index_documents(main_url, database_name, url, reader, prefix, format):
    registry = MetadataRegistry()
    registry.registerReader(prefix, reader)
    client = Client(url, registry)
    return_stuff = []
    for record in client.listRecords(metadataPrefix=prefix):
        r = record[1]
        value = format(r,record[0].identifier())
        if value is not None:
            return_stuff.append(value)
        if len(return_stuff) >= 10000:
            sync_files(main_url, database_name, return_stuff)     
            return_stuff = []
    sync_files(main_url, database_name, return_stuff)                 
Example No. 19
 def read_base_records(self):
     registry = MetadataRegistry()
     registry.registerReader('base_dc', base_dc_reader)
     client = Client('http://doai.io/oai', registry)
     for header, record, _ in client.listRecords(metadataPrefix='base_dc'):
         # only process records for which base was unsure
         if '2' not in record['oa']:
             continue
         # extract splash_url
         for link in record['identifier']:
             metadata = {'base_oa':''.join(record['oa']),
                     'splash_url':link,
                     'from_identifier':header.identifier()}
             yield self.filter_url(link,metadata, looking_for='any')
Example No. 20
    def __init__(self, oaisource, day_granularity=False, *args, **kwargs):
        """
        This sets up the paper source.

        :param oaisource: the OAISource to fetch from.
        :param day_granularity: should we use day-granular timestamps
            to fetch from the proxy or full timestamps (default: False,
            full timestamps)

        See the protocol reference for more information on timestamp
        granularity:
        https://www.openarchives.org/OAI/openarchivesprotocol.html
        """
        super(OaiPaperSource, self).__init__(*args, **kwargs)
        if not oaisource.endpoint:
            raise ValueError('No OAI endpoint was configured for this OAI source.')

        self.registry = MetadataRegistry()
        self.registry.registerReader('oai_dc', oai_dc_reader)
        self.registry.registerReader('base_dc', base_dc_reader)
        self.client = Client(oaisource.endpoint, self.registry)
        self.client._day_granularity = day_granularity
        self.translators = {
            'oai_dc': OAIDCTranslator(oaisource),
            'base_dc': BASEDCTranslator(oaisource),
        }
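
Example No. 20 sets the private `_day_granularity` flag directly from a constructor argument; several other examples instead call `client.updateGranularity()`, which reads the granularity the server advertises in its Identify response. A minimal sketch, with a placeholder endpoint:

registry = MetadataRegistry()
registry.registerReader('oai_dc', oai_dc_reader)
client = Client('https://example.org/oai', registry)  # placeholder endpoint
client.updateGranularity()  # negotiate day vs. full-timestamp granularity with the server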
Example No. 21
 def __init__(self, url, prefix=nsdl.LR_NSDL_PREFIX, reader=None, fields=None, namespaces=None, fieldMap=None):
     '''
     Constructor
     '''
     
     if fields is None:
         self._fields = nsdl.LR_NSDL_DC_FIELDS
     else:
         self._fields = fields

     if fieldMap is None:
         self._fieldMap = nsdl.NSDL_TO_LR_MAP
     else:
         self._fieldMap = fieldMap

     if namespaces is None:
         self._namespaces = nsdl.LR_NSDL_DC_NAMESPACES
     else:
         self._namespaces = namespaces

     if reader is None:
         reader = MetadataReader(fields=self._fields, namespaces=self._namespaces)
     
     self._url = url
     self._registry = MetadataRegistry()
     self._prefix = prefix
     self._registry.registerReader(prefix, reader)
     self._client = Client(url, self._registry)
Example No. 22
 def _init_clients(self):
     try:
         self._c = OaipmhClient(self.store.state['base_oai_url']) #, metadata_registry = dumbMetadataRegistry)
         self.identify()
     except OSError:
         logger.error("Cannot make OAIPMH client")
         raise Exception("Cannot make OAIPMH client")
Example No. 23
def scrape(start=START, end=END, set=SET_THESIS, type='Thesis'):
    """
    Create an OAI-PMH client, gather metadata and output it.

    """    
    total = num = 0
    msg = "Fetching records between " + str(start) + " and " + str(end)
    sys.stderr.write(msg + "\n")

    #
    # Set up metadata readers
    #
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    registry.registerReader('qdc', qdc_reader)
    # registry.registerReader('rdf', rdf_reader)   # no reader yet
    # registry.registerReader('ore', ore_reader)   # no reader yet
    # registry.registerReader('mets', mets_reader) # no reader yet

    client = Client(URL, registry)
    records = client.listRecords(metadataPrefix='qdc',
                                 from_=start, until=end, set=set)
    for (h, m, a) in records:
        print h, m, a
        if not m:
            sys.stderr.write("o")
            continue
        total = total + 1
        
        handle = m.getField('identifier')
        if not handle:
            sys.stderr.write("Record without a handle.\n")
            continue

        r = dict({ 'handle' : handle[0] })
        for key in qdc_reader._fields.keys():
           r[key] = m.getField(key)
        RECORDS.append(r)

        sys.stderr.write('.')
        sys.stderr.flush()
        num = num + 1
    msg = "\nCollected " + str(num) + " records, out of " + str(total)
    sys.stderr.write('\n' + msg + '\n')

    if options.store:
        pickle.dump(RECORDS, open(options.store, "wb"))
Example No. 24
    def update(self, from_date=None):
        self._log.info('Harvesting oai server: %s' % self._url)
        registry = MetadataRegistry()
        registry.registerReader(self._prefix, lambda el: el)

        client = Client(self._url, registry)
        try:
            for header, element, about in client.listRecords(
                metadataPrefix = self._prefix,
                from_ = from_date):
                added = self._process_record(header, element)
                if added:
                    yield self._get_id(header)
        except NoRecordsMatchError:
            pass

        super(OAIBasedContentProvider, self).update()
Example No. 25
def processItems():
    oai_oi_reader = MetadataReader(
        fields={
        'title':       ('textList', 'oai_oi:oi/oi:title/text()'),
        'alternative': ('textList', 'oai_oi:oi/oi:alternative/text()'),
        'creator':     ('textList', 'oai_oi:oi/oi:creator/text()'),
        'subject':     ('textList', 'oai_oi:oi/oi:subject/text()'),
        'description': ('textList', 'oai_oi:oi/oi:description/text()'),
        'abstract':       ('textList', 'oai_oi:oi/oi:abstract/text()'),
        'publisher':   ('textList', 'oai_oi:oi/oi:publisher/text()'),
        'contributor': ('textList', 'oai_oi:oi/oi:contributor/text()'),
        'date':        ('textList', 'oai_oi:oi/oi:date/text()'),
        'type':        ('textList', 'oai_oi:oi/oi:type/text()'),
        'extent':      ('textList', 'oai_oi:oi/oi:extent/text()'),
        'medium':       ('textList', 'oai_oi:oi/oi:medium/text()'),
        'identifier':  ('textList', 'oai_oi:oi/oi:identifier/text()'),
        'source':      ('textList', 'oai_oi:oi/oi:source/text()'),
        'language':    ('textList', 'oai_oi:oi/oi:language/text()'),
        'references':  ('textList', 'oai_oi:oi/oi:references/text()'),
        'spatial':  ('textList', 'oai_oi:oi/oi:spatial/text()'),
        'attributionName':       ('textList', 'oai_oi:oi/oi:attributionName/text()'),
        'attributionURL':       ('textList', 'oai_oi:oi/oi:attributionURL/text()'),
        'license':       ('textList', 'oai_oi:oi/oi:license/text()'),
        # Not included in this feed:
        #'rights':      ('textList', 'oai_oi:oi/oi:rights/text()'),
        #'relation':    ('textList', 'oai_oi:oi/oi:relation/text()'),
        #'coverage':    ('textList', 'oai_oi:oi/oi:coverage/text()'),
        #'format':      ('textList', 'oai_oi:oi/oi:format/text()'),        
        },
        namespaces={
            'oi' : 'http://www.openbeelden.nl/oai/',
            'oai_oi' : 'http://www.openarchives.org/OAI/2.0/oai_dc/',
            'dc' : 'http://purl.org/dc/elements/1.1/',
            'dcterms' : 'http://purl.org/dc/terms',
        }
    )
    url = u'http://www.openbeelden.nl/feeds/oai/'

    registry = MetadataRegistry()
    registry.registerReader('oai_oi', oai_oi_reader)
    client = Client(url, registry)
    
    for record in client.listRecords(metadataPrefix='oai_oi'):
        processItem(record)
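
MetadataReader pairs each field name with a result type ('textList') and an XPath expression evaluated against the record XML. A stripped-down sketch of the same reader with only two fields (namespaces copied from the example above):

minimal_oi_reader = MetadataReader(
    fields={
        'title':   ('textList', 'oai_oi:oi/oi:title/text()'),
        'creator': ('textList', 'oai_oi:oi/oi:creator/text()'),
    },
    namespaces={
        'oi': 'http://www.openbeelden.nl/oai/',
        'oai_oi': 'http://www.openarchives.org/OAI/2.0/oai_dc/',
    },
)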
Example No. 26
    def iter_items(self, partition):
        """ Partition is an OAI-PMH endpoint """

        # source = "oai:%s" % partition

        registry = MetadataRegistry()
        registry.registerReader('oai_dc', oai_dc_reader)
        client = Client(partition, registry)

        for record in client.listRecords(metadataPrefix='oai_dc'):
            header, metadata, _ = record

            if header.isDeleted():
                continue

            # _id = header.identifier()
            # date = header.datestamp()

            meta = metadata.getMap()

            # TODO: there are much validation and heuristics to be done here!

            # format0 = (meta.get("format") or [None])[0]
            # if not format0:
            #     continue

            # if format0 not in ("application/pdf", ):
            #     continue

            url0 = (meta.get("identifier") or [None])[0]

            if not url0:
                continue

            title0 = (meta.get("title") or [""])[0].encode("utf-8")
            desc0 = (meta.get("description") or [""])[0].encode("utf-8")

            # TODO: validate that the url0 is not on another domain?!
            yield url0, {}, "html", 2, """
                <html><head><title>%s</title></head><body>%s</body></html>
            """ % (title0, desc0)
Example No. 27
	def setupOAIPMHConnection(self):
		oai_oi_reader = MetadataReader(
		    fields={
		    'title':       ('textList', 'oai_oi:oi/oi:title/text()'),
		    'alternative':       ('textList', 'oai_oi:oi/oi:alternative/text()'),
		    'creator':     ('textList', 'oai_oi:oi/oi:creator/text()'),
		    'subject':     ('textList', 'oai_oi:oi/oi:subject/text()'),
		    'description': ('textList', 'oai_oi:oi/oi:description/text()'),
		    'abstract': ('textList', 'oai_oi:oi/oi:abstract/text()'),
		    'publisher':   ('textList', 'oai_oi:oi/oi:publisher/text()'),
		    'contributor': ('textList', 'oai_oi:oi/oi:contributor/text()'),
		    'date':        ('textList', 'oai_oi:oi/oi:date/text()'),
		    'type':        ('textList', 'oai_oi:oi/oi:type/text()'),
		    'extent':        ('textList', 'oai_oi:oi/oi:extent/text()'),
		    'medium':        ('textList', 'oai_oi:oi/oi:medium/text()'),
		    'identifier':  ('textList', 'oai_oi:oi/oi:identifier/text()'),
		    'source':      ('textList', 'oai_oi:oi/oi:source/text()'),
		    'language':    ('textList', 'oai_oi:oi/oi:language/text()'),
		    'references':    ('textList', 'oai_oi:oi/oi:references/text()'),
		    'spatial':    ('textList', 'oai_oi:oi/oi:spatial/text()'),
		    'attributionName':    ('textList', 'oai_oi:oi/oi:attributionName/text()'),
		    'attributionURL':    ('textList', 'oai_oi:oi/oi:attributionURL/text()'),
		    'license':      ('textList', 'oai_oi:oi/oi:license/text()')
		    },

		    namespaces={
		    	'oai_oi': 'http://www.openbeelden.nl/feeds/oai/', #'http://www.openarchives.org/OAI/2.0/oai_oi/',
		    	'oi': 'http://www.openbeelden.nl/oai/'
		    }
		)

		URL = 'http://www.openbeelden.nl/feeds/oai/'

		#Initialize the OAI client
		self.registry = MetadataRegistry()
		self.registry.registerReader('oai_oi', oai_oi_reader)
		self.client = Client(URL, self.registry)

		#Test if the connection to the OAI-PMH provider works
		x = self.client.updateGranularity()
		x = self.client.identify()
		print 'identity %s' % x.repositoryName()
		print 'identity %s' % x.protocolVersion()
		print 'identity %s' % x.baseURL()

		"""
		for s in client.listSets():
			print s
		"""

		#initialize the OpenSKOSHandler
		self.openSKOSHandler = OpenSKOSHandler()
Example No. 28
def acquire_and_publish_documents(oai_url, publish_url, reader, prefix, pwd):
    registry = MetadataRegistry()
    registry.registerReader(prefix, reader)
    client = Client(oai_url, registry)
    documents = []
    count = 0
    for record in client.listRecords(metadataPrefix=prefix):
        header = record[0]
        metadata = record[1]
        rawMetadata = urllib2.urlopen("{0}?verb=GetRecord&metadataPrefix={1}&identifier={2}".format(oai_url, prefix, header.identifier())).read()

        # re-format Jorum id
        identifier = header.identifier()
        identifier = identifier.replace("oai:dspace.jorum.ac.uk:", "")
        uri = "http://dspace.jorum.ac.uk/xmlui/handle/" + identifier
        print(uri)

        # create keys from dc.subject terms
        fo = StringIO.StringIO(rawMetadata)
        tree = parse(fo)  # can only parse files or file objects
        keys = []
        for elem in tree.getiterator():
            # print("tag  " + str(elem.tag))
            # print("text " + str(elem.text))
            if elem.tag == "{http://purl.org/dc/elements/1.1/}subject":
                keys.append(elem.text)
        fo.close()
        print(keys)
        print("\n")
        value = convert_to_envelope(metadata, rawMetadata, uri, keys)
        # print(value)
        # print(dir(header))
        if value is not None:
            documents.append(value)
            count += 1
            if (count % 10 == 0) or (count == 3):
                publish_documents(publish_url, documents, pwd)
                documents = []
    publish_documents(publish_url, documents, pwd)
Example No. 29
    def list_oai_community_sets(self, repository):

        try:
            registry = MetadataRegistry()
            registry.registerReader('oai_dc', oai_dc_reader)
            client = Client(repository.base_url, registry)
            sets = client.listSets()
        except:
            return

        """ Filter records to build list of community sets """
        self.communities = []
        for i in sets:
            set_id = i[0]
            set_name = i[1]
            """ Build collection tuples (id, human readable name) """
            if set_id[:3] == 'com':
                set_data = []
                set_data.append(set_id)
                set_data.append(set_name)
                self.communities.append(set_data)

        self.communities = sorted(
            self.communities, key=lambda community: community[1])
Example No. 30
 def _listRecords(self, baseUrl, metadataPrefix="oai_dc", **kwargs):
     # Generator to yield records from baseUrl in the given metadataPrefix
     # Add metadataPrefix to args
     kwargs['metadataPrefix'] = metadataPrefix
     client = Client(baseUrl, self._mdRegistry)
     # Check that baseUrl actually represents an OAI-PMH target
     try:
         client.identify()
     except IndexError:
         raise NotOAIPMHBaseURLException(
             "{0} does not appear to be an OAI-PMH compatible base URL"
             "".format(baseUrl)
         )
     # Check server timestamp granularity support
     client.updateGranularity()
     for record in client.listRecords(**kwargs):
         yield record
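
The identify() probe above doubles as a cheap validity check for a base URL. A hypothetical standalone helper built on the same idea (the function name is illustrative; it assumes a registry is already configured):

def is_oaipmh_endpoint(base_url, registry):
    # Hypothetical helper: True if the URL answers an OAI-PMH Identify request.
    client = Client(base_url, registry)
    try:
        client.identify()
    except Exception:
        # pyoai surfaces non-OAI-PMH responses as parse errors (e.g. IndexError)
        return False
    return True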
Example No. 31
 def getClient(self):
     registry = MetadataRegistry()
     client = Client(self.url, registry)
     client.get_method = self.get_method
     client._day_granularity = self.day_granularity
     return client
Example No. 32
def _client(base_URL, registry=None):
    """
    instantiate client
    """
    client = Client(base_URL, registry)
    return client
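
A usage sketch for the factory above, assuming the Dublin Core reader is imported and using a placeholder URL:

registry = MetadataRegistry()
registry.registerReader('oai_dc', oai_dc_reader)
client = _client('https://example.org/oai', registry)  # placeholder endpoint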
Example No. 33
        return element


# Define the metadata readers in the registry

from oaipmh import metadata

registry = metadata.MetadataRegistry()
registry.registerReader('marc21', XMLReader())

#### OAI-PMH Client processing

from oaipmh.client import Client
from lxml import etree

oai = Client('http://snape.mzk.cz/OAI-script', registry)

#recs = oai.listRecords(metadataPrefix='marc21', set='MZK03')

#rec = recs.next()
#for rec in recs:

rec = oai.getRecord(identifier='MZK03-907223', metadataPrefix='marc21')

if rec:
    print rec[0].identifier()
    r = rec[1]  # Get XML tree for record
    print etree.tostring(r, pretty_print=True)

    if r:
        xpath_evaluator = etree.XPathEvaluator(
Example No. 34
def transfer_experiment(source):
    """
    Pull public experiments from source into current mytardis.
    """

    #TODO: Cleanup error messages
    #TODO: does not transfer licences as they are not part of the METS format.
    #NOTE: As this is a pull we trust the data from the other tardis
    # Check identity of the feed
    from oaipmh.client import Client
    from oaipmh import error
    from oaipmh.metadata import MetadataRegistry, oai_dc_reader

    from django.core.cache import cache
    from django.utils.hashcompat import md5_constructor as md5

    # The cache key consists of the task name and the MD5 digest
    # of the feed URL.
    cache_key = md5("token").hexdigest()
    lock_id = "%s-lock-%s" % ("consume_experiment", cache_key)
    LOCK_EXPIRE = 60 * 5
    # cache.add fails if the key already exists
    acquire_lock = lambda: cache.add(lock_id, "true", LOCK_EXPIRE)
    # memcache delete is very slow, but we have to use it to take
    # advantage of using add() for atomic locking
    release_lock = lambda: cache.delete(lock_id)

    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    source_url = "%s/apps/oaipmh/?verb=Identify" % source

    client = Client(source_url, registry)
    try:
        identify = client.identify()
    except AttributeError as e:
        msg = "Error reading repos identity: %s:%s" % (source, e)
        logger.error(msg)
        raise ReposReadError(msg)
    except error.ErrorBase as e:
        msg = "OAIPMH error: %s" % e
        logger.error(msg)
        raise OAIPMHError(msg)
    except URLError as e:
        logger.error(e)
        raise
    repos = identify.baseURL()
    import urlparse
    repos_url = urlparse.urlparse(repos)
    dest_name = "%s://%s" % (repos_url.scheme, repos_url.netloc)
    if dest_name != source:
        msg = "Source directory reports incorrect name: %s" % dest_name
        logger.error(msg)
        raise BadAccessError(msg)
    # Get list of public experiments at sources
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(
        source + "/apps/oaipmh/?verb=ListRecords&metadataPrefix=oai_dc",
        registry)
    try:
        exps_metadata = [
            meta for (header, meta,
                      extra) in client.listRecords(metadataPrefix='oai_dc')
        ]
    except AttributeError as e:
        msg = "Error reading experiment %s" % e
        logger.error(msg)
        raise OAIPMHError(msg)
    except error.NoRecordsMatchError as e:
        msg = "no public records found on source %s" % e
        logger.warn(msg)
        return

    local_ids = []
    for exp_metadata in exps_metadata:
        exp_id = exp_metadata.getField('identifier')[0]
        user = exp_metadata.getField('creator')[0]

        found_user = _get_or_create_user(source, user)

        #make sure experiment is publicish
        try:
            xmldata = getURL("%s/apps/reposproducer/expstate/%s/" %
                             (source, exp_id))
        except HTTPError as e:
            msg = "cannot get public state of experiment %s" % exp_id
            logger.error(msg)
            raise BadAccessError(msg)
        try:
            exp_state = json.loads(xmldata)
        except ValueError as e:
            msg = "cannot parse public state of experiment %s" % exp_id
            logger.error(msg)
            raise BadAccessError(msg)
        if exp_state not in [
                Experiment.PUBLIC_ACCESS_FULL,
                Experiment.PUBLIC_ACCESS_METADATA
        ]:
            msg = 'cannot ingest private experiment %s.' % exp_id
            logger.error(msg)
            raise BadAccessError(msg)

        # Get the usernames of isOwner django_user ACLs for the experiment
        try:
            xmldata = getURL("%s/apps/reposproducer/acls/%s/" %
                             (source, exp_id))

        except HTTPError as e:
            msg = "Cannot get acl list of experiment %s" % exp_id
            logger.error(msg)
            raise ReposReadError(msg)
        try:
            acls = json.loads(xmldata)
        except ValueError as e:
            msg = "cannot parse acl list of experiment %s" % exp_id
            logger.error(msg)
            raise BadAccessError(msg)
        owners = []
        for acl in acls:
            if acl['pluginId'] == 'django_user' and acl['isOwner']:
                user = _get_or_create_user(source, acl['entityId'])
                owners.append(user.username)
            else:
                # FIXME: skips all other types of acl for now
                pass

        # Get the METS for the experiment
        metsxml = ""
        try:
            metsxml = getURL("%s/experiment/metsexport/%s/?force_http_urls" %
                             (source, exp_id))
            #metsxml = getURL("%s/experiment/metsexport/%s/"
            #% (source, exp_id))

        except HTTPError as e:
            msg = "cannot get METS for experiment %s" % exp_id
            logger.error(msg)
            raise ReposReadError(msg)

        # load schema and parametername for experiment keys
        try:
            key_schema = Schema.objects.get(namespace=settings.KEY_NAMESPACE)
        except Schema.DoesNotExist as e:
            msg = "No ExperimentKeyService Schema found"
            logger.error(msg)
            raise BadAccessError(msg)

        try:
            key_name = ParameterName.objects.get(name=settings.KEY_NAME)
        except ParameterName.DoesNotExist as e:
            msg = "No ExperimentKeyService ParameterName found"
            logger.error(msg)
            raise BadAccessError(msg)

        try:
            xmldata = getURL("%s/apps/reposproducer/key/%s/" %
                             (source, exp_id))
        except HTTPError as e:
            msg = "cannot get key of experiment %s" % exp_id
            logger.error(msg)
            raise BadAccessError(msg)
        if not xmldata:
            logger.warn(
                "Unable to retrieve experiment %s key.  Will try again later" %
                exp_id)
            return

        try:
            key_value = json.loads(xmldata)
        except ValueError as e:
            msg = "cannot parse key list of experiment %s" % exp_id
            logger.error(msg)
            raise BadAccessError(msg)
        if not key_value:
            logger.warn(
                "Unable to retrieve experiment %s key value.  Will try again later"
                % exp_id)
            return

        logger.debug("retrieved key %s from experiment %s" %
                     (key_value, exp_id))
        exps = Experiment.objects.all()

        got_lock = True
        if not acquire_lock():
            logger.warning("another worker has access to consume experiment")
            return

        duplicate_exp = 0
        for exp in exps:
            #logger.warn("exp = %s" % exp.id)
            params = ExperimentParameter.objects.filter(
                name=key_name,
                parameterset__schema=key_schema,
                parameterset__experiment=exp)
            #logger.warn("params.count() = %s" % params.count())
            if params.count() >= 1:
                key = params[0].string_value
                if key == key_value:
                    duplicate_exp = exp.id
                    #logger.warn("found duplicate for %s" % duplicate_exp)
                    break

        if duplicate_exp:
            logger.warn(
                "Found duplicate experiment from %s exp %s to exp %s" %
                (source, exp_id, duplicate_exp))
            if got_lock:
                release_lock()
            return

        # TODO: Need some way of updating an existing experiment.  Problem is
        # that the copy will have a different id from the original, so we need a
        # unique identifier to allow matching

        # We have now pulled everything we need from the producer and are ready to
        # create the experiment.

        # Make placeholder experiment and ready metadata
        e = Experiment(
            title='Placeholder Title',
            approved=True,
            created_by=found_user,
            public_access=exp_state,
            locked=False  # so experiment can then be altered.
        )
        e.save()

        # store the key
        #eps, was_created = ExperimentParameterSet.objects.\
        #    get_or_create(experiment=e, schema=key_schema)
        #if was_created:
        #    logger.warn("was created")
        #ep, was_created = ExperimentParameter.objects.get_or_create(parameterset=eps,
        #    name=key_name,
        #    string_value=key_value)
        #if was_created:
        #    logger.warn("was created again")
        #ep.save()

        if got_lock:
            release_lock()

        local_id = e.id
        filename = path.join(e.get_or_create_directory(), 'mets_upload.xml')
        f = open(filename, 'wb+')
        f.write(metsxml)
        f.close()

        # Ingest this experiment META data and isOwner ACLS
        eid = None
        try:
            eid, sync_path = _registerExperimentDocument(filename=filename,
                                                         created_by=found_user,
                                                         expid=local_id,
                                                         owners=owners)
            logger.info('=== processing experiment %s: DONE' % local_id)
        except:
            # FIXME: what errors can mets return?
            msg = '=== processing experiment %s: FAILED!' \
                % local_id
            logger.error(msg)
            raise MetsParseError(msg)

        # FIXME: if METS parse fails then we should go back and delete the placeholder experiment

        exp = Experiment.objects.get(id=eid)

        # so that tardis does not copy the data
        for datafile in exp.get_datafiles():
            datafile.stay_remote = True
            datafile.save()

        #import nose.tools
        #nose.tools.set_trace()
        # FIXME: reverse lookup of URLs seem quite slow.
        # TODO: put this information into specific metadata schema attached to experiment
        exp.description += get_audit_message(source, exp_id)
        exp.save()

        local_ids.append(local_id)
    return local_ids
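
transfer_experiment guards against concurrent ingestion with cache.add(), which is atomic: the call fails when the key already exists, so only one worker can take the lock. The same pattern in isolation (key name and timeout are placeholders):

from django.core.cache import cache

LOCK_EXPIRE = 60 * 5  # seconds

def run_exclusively(lock_id, work):
    # cache.add fails if the key already exists, making it usable as a lock
    if not cache.add(lock_id, "true", LOCK_EXPIRE):
        return False  # another worker holds the lock
    try:
        work()
    finally:
        # delete is slow in memcached but frees the lock for the next worker
        cache.delete(lock_id)
    return True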
Example No. 35
def run(event, context):
    #pull data from OAI endpoint
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    url_base = 'http://contentdm.marinlibrary.org'
    URL = url_base + '/oai/oai.php'
    client = Client(URL, registry, force_http_get=True)

    harvested_data = []
    for record in client.listRecords(metadataPrefix='oai_dc'):
        if not record[0].isDeleted():
            fields = record[1].getMap()
            fields['subjects'] = fields['subject'][0].split(';')
            del fields['subject']
            fields['set'] = record[0].setSpec()
            identifier = record[0].identifier().split(':')[2]
            fields[
                'image_url_base'] = url_base + '/digital/iiif/' + identifier + '/'
            harvested_data.append(fields)

    es.indices.delete(index='digital_collection_recs', ignore=[400, 404])

    mapping = {
        "mappings": {
            "_doc": {
                "properties": {
                    "title": {
                        "type": "text"
                    },
                    "creator": {
                        "type": "text"
                    },
                    "subjects": {
                        "type": "text"
                    },
                    "description": {
                        "type": "text"
                    },
                    "publisher": {
                        "type": "text"
                    },
                    "contributor": {
                        "type": "text"
                    },
                    "date": {
                        "type": "text"
                    },
                    "type": {
                        "type": "text",
                        "fielddata": "true"
                    },
                    "format": {
                        "type": "text",
                        "fielddata": "true"
                    },
                    "identifier": {
                        "type": "text"
                    },
                    "source": {
                        "type": "text"
                    },
                    "language": {
                        "type": "text",
                        "fielddata": "true"
                    },
                    "relation": {
                        "type": "text"
                    },
                    "coverage": {
                        "type": "text"
                    },
                    "rights": {
                        "type": "text"
                    },
                    "set": {
                        "type": "text",
                        "fielddata": "true"
                    },
                    "image_url_base": {
                        "type": "text"
                    }
                }
            }
        }
    }
    es.indices.create(index='digital_collection_recs', body=mapping)

    helpers.bulk(es,
                 harvested_data,
                 index='digital_collection_recs',
                 doc_type='_doc')

    return "success"
Example No. 36
class NSDLDCImport(object):
    '''
    Class exports the required fields from the UCAR OAI-PMH data repository using NSDL_DC.
    '''
    def __init__(self,
                 url,
                 prefix=nsdl.LR_NSDL_PREFIX,
                 reader=None,
                 fields=None,
                 namespaces=None,
                 fieldMap=None):
        '''
        Constructor
        '''

        if fields is None:
            self._fields = nsdl.LR_NSDL_DC_FIELDS
        else:
            self._fields = fields

        if fieldMap is None:
            self._fieldMap = nsdl.NSDL_TO_LR_MAP
        else:
            self._fieldMap = fieldMap

        if namespaces is None:
            self._namespaces = nsdl.LR_NSDL_DC_NAMESPACES
        else:
            self._namespaces = namespaces

        if reader is None:
            reader = MetadataReader(fields=self._fields,
                                    namespaces=self._namespaces)

        self._url = url
        self._registry = MetadataRegistry()
        self._prefix = prefix
        self._registry.registerReader(prefix, reader)
        self._client = Client(url, self._registry)

    def _format(self, doc):
        value = {}
        # merge all the fields
        for (fieldname, fieldconfig) in self._fieldMap.items():
            if fieldconfig["type"] == "const" and "const" in fieldconfig:
                value[fieldname] = fieldconfig["const"]
            elif fieldconfig["type"] == "[string]" and len(
                    fieldconfig["fields"]) > 0:
                value[fieldname] = []
                for field in fieldconfig["fields"]:
                    value[fieldname].extend(doc.getField(field))
            elif fieldconfig["type"] == "string" and len(
                    fieldconfig["fields"]) > 0:
                value[fieldname] = ""
                for field in fieldconfig["fields"]:
                    value[fieldname] += ", ".join(doc.getField(field))
            elif fieldconfig["type"] == "boolean" and len(
                    fieldconfig["fields"]) > 0:
                value[fieldname] = True
                for field in fieldconfig["fields"]:
                    value[fieldname] &= bool(doc.getField(field))
        return value

    def fetch_documents(self, range=10000):

        return_stuff = []
        for record in self._client.listRecords(metadataPrefix=self._prefix):
            r = record[1]
            value = self._format(r)
            if value != None:
                return_stuff.append(value)
            if len(return_stuff) >= range:
                yield return_stuff
                return_stuff = []
        # yield any remainder that did not fill a whole batch
        if return_stuff:
            yield return_stuff
Example No. 37
    def run(self):
        # Check that ElasticSearch is alive
        self.check_index()

        # If the user specified the --REBUILD flag, recreate the index
        if self.options['rebuild']:
            self.rebuild_index()

        # Connect to the repository
        registry = MetadataRegistry()
        registry.registerReader(self.settings["metadata_format"],
                                self.settings["metadata_reader"])

        client = Client(self.settings["uri"], registry)
        identity = client.identify()

        print "Connected to repository: %s" % identity.repositoryName()

        # got to update granularity or we barf with:
        # oaipmh.error.BadArgumentError: Max granularity is YYYY-MM-DD:2003-04-10T00:00:00Z
        client.updateGranularity()

        # Initialise some variables
        batcher = Batch.Batch()
        total_records = 0
        start = time.time()

        # Now do the synchronisation

        # If the user specified an identifier, then synchronise this record
        if (self.options['identifier'] is not None):
            total_records += self.synchronise_record(
                client, batcher, self.options['identifier'])
        else:
            # Else, synchronise using the date-range provided by the user, or failing that,
            # the date-range based on the last sync

            # Get the synchronisation config record
            synchronisation_config = self.get_synchronisation_config()

            if self.options["from_date"] is not None:
                # If the user specified a from-date argument, use it
                from_date = self.options[
                    "from_date"]  # already a date (not a datetime)
            elif synchronisation_config is not None and "to_date" in synchronisation_config:
                # Else read the last synchronised to_date from the config, and add on a day
                from_date = dateutil.parser.parse(
                    synchronisation_config["to_date"]).date() + timedelta(
                        days=1)
            else:
                # Else use the default_from_date in the config
                from_date = dateutil.parser.parse(
                    self.settings['default_from_date']).date()

            if self.options["to_date"] is not None:
                to_date = self.options[
                    "to_date"]  # already a date (not a datetime)
            else:
                to_date = (date.today() - timedelta(days=1))

            # Force the from_date to use time 00:00:00
            from_date = datetime.combine(
                from_date, _time(hour=0, minute=0, second=0, microsecond=0))

            # Force the to_date to use time 23:59:59
            to_date = datetime.combine(
                to_date, _time(hour=23, minute=59, second=59, microsecond=0))

            print "Synchronising from %s - %s" % (from_date, to_date)

            while from_date < to_date:
                next_date = datetime.combine(
                    from_date.date() +
                    timedelta(days=(self.settings['delta_days'] - 1)),
                    _time(hour=23, minute=59, second=59, microsecond=0))
                number_of_records = self.synchronise_period(
                    client, batcher, from_date, next_date)
                batcher.clear()  #Store the records in elasticsearch
                self.put_synchronisation_config(from_date, next_date,
                                                number_of_records)
                from_date += timedelta(days=(self.settings['delta_days']))
                total_records += number_of_records

                # Pause so as not to get banned.
                to = 20
                print "Sleeping for %i seconds so as not to get banned." % to
                time.sleep(to)

        # Store the records in the index
        batcher.clear()

        # Print out some statistics
        time_spent = time.time() - start
        print 'Total time spent: %d seconds' % (time_spent)

        if time_spent > 0.001:  # careful as its not an integer
            print 'Total records synchronised: %i records (%d records/second)' % (
                total_records, (total_records / time_spent))
        else:
            print 'Total records synchronised: %i records' % (total_records)
        return total_records
Example No. 38
#aka oaijson
import sys

from oaipmh.client import Client
from oaipmh.metadata import MetadataRegistry, oai_dc_reader

import simplejson as json

import couchdb

server = couchdb.Server()
db = server['dcat']

URL = 'http://cardinalscholar.bsu.edu/cgi/oai2'

registry = MetadataRegistry()
registry.registerReader('oai_dc', oai_dc_reader)
client = Client(URL, registry)

records = client.listRecords(metadataPrefix='oai_dc')
i = 0
for hdr, metadata, _ in records:
    i = i + 1
    print hdr.identifier()
    print hdr.datestamp()
    map = metadata.getMap()
    map.update({'cdmcollection': 'cardinalscholar'})
    db.save(map)
    print 'saved ' + str(i)


Example No. 39
def get_bool(prompt):
    while True:
        try:
            return {"true": True, "false": False}[input(prompt).lower()]
        except KeyError:
            print("Invalid input please enter True or False!")


# init

registry = MetadataRegistry()
registry.registerReader('marc21', MarcXML)
client = Client(URL, registry)

start = valid_date(start_date)
stop = valid_date(stop_date)

# main

while start < stop:
    from_date = start
    start = start + timedelta(days=1)  # increase days one by one
    until_date = start
    try:
        records = client.listRecords(metadataPrefix='marc21',
                                     set='SKC',
                                     from_=from_date,
                                     until=until_date)
Example No. 40
from oaipmh.client import Client
from oaipmh.metadata import MetadataRegistry, oai_dc_reader
from PdfParser.PdfParser import Preprocessing

import json
import bz2
from numpy import record
import pandas as pd
baseurl = 'http://export.arxiv.org/oai2?'
corpuspath = '/Users/goksukara/Desktop/Projects/EclipseWorkspace/Specilization/PhytonCode/Data/corpus.csv'
if __name__ == "__main__":

    url = baseurl
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(url, registry)
    record = client.listSets()
    for word in record:
        print(word)
    #Write to file
    #with bz2.BZ2File('out.json', 'wb') as outfile:

for record in client.listRecords(metadataPrefix='oai_dc', set='cs'):
    header, metadata, _ = record
    doc = {}
    #Extract identifier
    #doc["id"] = header.identifier()
    #Extract title and other metadata
    doc["title"] = "\n".join(metadata["title"])
    doc["abstract"] = "\n".join(metadata["description"])
    #doc["authors"] = metadata["creator"]
Example No. 41
class OaiPaperSource(PaperSource):  # TODO: this should not inherit from PaperSource
    """
    A paper source that fetches records from the OAI-PMH proxy
    (typically: proaixy).

    It uses the ListRecord verb to fetch records from the OAI-PMH
    source. Each record is then converted to a :class:`BarePaper`
    by an :class:`OaiTranslator` that handles the format
    the metadata is served in.
    """

    def __init__(self, endpoint, day_granularity=False, *args, **kwargs):
        """
        This sets up the paper source.

        :param endpoint: the address of the OAI-PMH endpoint
            to fetch from.
        :param day_granularity: should we use day-granular timestamps
            to fetch from the proxy or full timestamps (default: False,
            full timestamps)

        See the protocol reference for more information on timestamp
        granularity:
        https://www.openarchives.org/OAI/openarchivesprotocol.html
        """
        super(OaiPaperSource, self).__init__(*args, **kwargs)
        self.registry = MetadataRegistry()
        self.registry.registerReader('oai_dc', oai_dc_reader)
        self.registry.registerReader('base_dc', base_dc_reader)
        self.registry.registerReader('citeproc', citeproc_reader)
        self.client = Client(endpoint, self.registry)
        self.client._day_granularity = day_granularity
        if settings.PROAIXY_API_KEY:
            self.client.extra_parameters = {
                'key': settings.PROAIXY_API_KEY}
        self.translators = {}

    # Translator management

    def add_translator(self, translator):
        """
        Adds the given translator to the paper source,
        so that we know how to translate papers in the given format.

        The paper source cannot hold more than one translator
        per OAI format (it decides what translator to use
        solely based on the format) so if there is already a translator
        for that format, it will be overridden.
        """
        self.translators[translator.format()] = translator

    # Record ingestion

    def ingest(self, from_date=None, metadataPrefix='any',
               resumptionToken=None):
        """
        Main method to fill Dissemin with papers!

        :param from_date: only fetch papers modified after that date in
                          the proxy (useful for incremental fetching)
        :param metadataPrefix: restrict the ingest for this metadata
                          format
        :param resumptionToken: optional token to resume a previous harvest
        """
        args = {'metadataPrefix': metadataPrefix}
        if from_date:
            args['from_'] = from_date
        if resumptionToken:
            args['resumptionToken'] = resumptionToken
        records = self.client.listRecords(**args)
        self.process_records(records)

    def create_paper_by_identifier(self, identifier, metadataPrefix):
        """
        Queries the OAI-PMH proxy for a single paper.

        :param identifier: the OAI identifier to fetch
        :param metadataPrefix: the format to use (a translator
                    has to be registered for that format, otherwise
                    we return None with a warning message)
        :returns: a Paper or None
        """
        record = self.client.getRecord(
                    metadataPrefix=metadataPrefix,
                    identifier=identifier)
        return self.process_record(record[0], record[1]._map)

    # Record search utilities

    def listRecords_or_empty(self, source, *args, **kwargs):
        """
        pyoai raises :class:`NoRecordsMatchError` when no records match;
        we would rather get an empty list in that case.
        """
        try:
            return source.listRecords(*args, **kwargs)
        except NoRecordsMatchError:
            return []

    def process_record(self, header, metadata):
        """
        Saves the record given by the header and metadata (as returned by
        pyoai) into a Paper, or None if anything failed.
        """
        translator = self.translators.get(header.format())
        if translator is None:
            print("Warning: unknown metadata format %s, skipping" %
                  header.format())
            return

        paper = translator.translate(header, metadata)
        if paper is not None:
            try:
                with transaction.atomic():
                    saved = Paper.from_bare(paper)
                return saved
            except ValueError as e:
                print "Ignoring invalid paper:"
                print header.identifier()
                print e

    def process_records(self, listRecords):
        """
        Save as :class:`Paper` all the records contained in this list
        """
        # check that we have at least one translator, otherwise
        # it's not really worth trying…
        if not self.translators:
            raise ValueError("No OAI translators have been set up: " +
                             "We cannot save any record.")

        last_report = datetime.now()
        processed_since_report = 0

        for record in listRecords:
            header = record[0]
            metadata = record[1]._map

            self.process_record(header, metadata)

            # rate reporting
            processed_since_report += 1
            if processed_since_report >= 1000:
                td = datetime.now() - last_report
                rate = 'infty'
                if td.seconds:
                    rate = str(processed_since_report / td.seconds)
                print("current rate: %s records/s" % rate)
                processed_since_report = 0
                last_report = datetime.now()
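
# A minimal usage sketch for the class above. The endpoint URL and the
# OaiDcTranslator class are illustrative assumptions, not part of the source;
# any OaiTranslator whose format() returns 'oai_dc' would do.

from datetime import datetime

source = OaiPaperSource('https://proaixy.example.org/oai')
source.add_translator(OaiDcTranslator())  # hypothetical oai_dc translator
source.ingest(from_date=datetime(2017, 1, 1), metadataPrefix='oai_dc')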
Ejemplo n.º 42
0
import sys

from oaipmh.client import Client
from oaipmh.metadata import MetadataRegistry, oai_dc_reader

URL = sys.argv[1]
METADATA_PREFIX = sys.argv[2]
if len(sys.argv) == 4:
    SETSPEC = sys.argv[3]
else:
    SETSPEC = None

registry = MetadataRegistry()
registry.registerReader('oai_dc', oai_dc_reader)
registry.registerReader(METADATA_PREFIX, oai_dc_reader)

client = Client(URL, registry)

record_count = 0
deleted_count = 0

if SETSPEC:
    records = client.listRecords(metadataPrefix=METADATA_PREFIX, set=SETSPEC)
else:
    records = client.listRecords(metadataPrefix=METADATA_PREFIX)

for record in records:
    record_count += 1
    delinfo = ''
    if record[0].isDeleted():
        deleted_count += 1
        delinfo = ' (deleted)'
    # The original snippet is cut off here; a plausible use of delinfo:
    print(record[0].identifier() + delinfo)

print('%d records, %d deleted' % (record_count, deleted_count))
Ejemplo n.º 43
0
def get_names(dataname):

    record_prefix = "rdf:RDF/edm:ProvidedCHO"

    edm_reader = MetadataReader(
        fields={
            'objectId': ('textList', record_prefix + '/@rdf:about'),
            'spatial': ('textList', record_prefix + '/dcterms:spatial/text()'),
        },
        namespaces={
            'oai_dc': 'http://www.openarchives.org/OAI/2.0/oai_dc/',
            'dc': 'http://purl.org/dc/elements/1.1/',
            'dcterms': 'http://purl.org/dc/terms/',
            'dct': 'http://purl.org/dc/terms/',
            'edm': 'http://www.europeana.eu/schemas/edm/',
            'foaf': 'http://xmlns.com/foaf/0.1/',
            'owl': 'http://www.w3.org/2002/07/owl#',
            'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
            'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
            'skos': 'http://www.w3.org/2004/02/skos/core#',
            'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
            'ore': 'http://www.openarchives.org/ore/terms/'
        })


    dictnames = {}
    identifier = []


    if __name__ == "__main__":

        URL = 'https://data.jhn.ngo/oai'

        registry = MetadataRegistry()
        registry.registerReader('edm', edm_reader)
        client = Client(URL, registry)

        k = 0

        for record in client.listRecords(metadataPrefix='edm', set=dataname):

            output = record[1].getMap()

            k = k + 1
            print(k)

            # Link every spatial value longer than three characters to the object id.
            for spatial in output['spatial']:
                if len(spatial) > 3:
                    if [spatial, output['objectId'][0]] not in identifier:
                        identifier.append([spatial, output['objectId'][0]])
                    dictnames.setdefault(spatial, []).append(output['objectId'][0])

    # print(identifier)


    return dictnames
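
# Usage sketch (hypothetical set name, borrowed from the comment in the next
# example). Note that the harvesting block above sits inside an
# `if __name__ == "__main__":` guard, so calling get_names() from an importing
# module returns an empty dict.

places = get_names('AIUJE1_MARC21')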
Ejemplo n.º 44
0
def get_names(dataname):

    record_prefix = "rdf:RDF/edm:ProvidedCHO"
    # Modify/add XPath mappings to get other fields and other objects (agent, place, etc.)

    edm_reader = MetadataReader(
        fields={
            'title': ('textList', record_prefix + '/dc:title/text()'),
            'creator': ('textList', record_prefix + '/dc:creator/text()'),
            'subject': ('textList', record_prefix + '/dc:subject/text()'),
            'description':
            ('textList', record_prefix + '/dc:description/text()'),
            'publisher': ('textList', record_prefix + '/dc:publisher/text()'),
            'contributor':
            ('textList', record_prefix + '/dc:contributor/text()'),
            'date': ('textList', record_prefix + '/dc:date/text()'),
            'type': ('textList', record_prefix + '/dc:type/text()'),
            'format': ('textList', record_prefix + '/dc:format/text()'),
            'identifier':
            ('textList', record_prefix + '/dc:identifier/text()'),
            'source': ('textList', record_prefix + '/dc:source/text()'),
            'language': ('textList', record_prefix + '/dc:language/text()'),
            'relation': ('textList', record_prefix + '/dc:relation/text()'),
            'coverage': ('textList', record_prefix + '/dc:coverage/text()'),
            'rights': ('textList', record_prefix + '/dc:rights/text()'),
            'spatial': ('textList', record_prefix + '/dc:spatial/text()'),
            'objectId': ('textList', record_prefix + '/@rdf:about'),
        },
        namespaces={
            'oai_dc': 'http://www.openarchives.org/OAI/2.0/oai_dc/',
            'dc': 'http://purl.org/dc/elements/1.1/',
            'dcterms': 'http://purl.org/dc/terms/',
            'dct': 'http://purl.org/dc/terms/',
            'edm': 'http://www.europeana.eu/schemas/edm/',
            'foaf': 'http://xmlns.com/foaf/0.1/',
            'owl': 'http://www.w3.org/2002/07/owl#',
            'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
            'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
            'skos': 'http://www.w3.org/2004/02/skos/core#',
            'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
            'ore': 'http://www.openarchives.org/ore/terms/'
        })

    names = []
    identifier = []

    if __name__ == "__main__":

        URL = 'https://data.jhn.ngo/oai'

        registry = MetadataRegistry()
        registry.registerReader('edm', edm_reader)
        client = Client(URL, registry)
        # To harvest specific dataset, use "set" parameter: set='AIUJE1_MARC21'

        k = 0

        for record in client.listRecords(metadataPrefix='edm', set=dataname):

            output = record[1].getMap()

            k = k + 1
            print(k)

            # Collect every creator and contributor name together with the object id.
            for name in output['creator'] + output['contributor']:
                names.append([name])
                identifier.append([name, output['objectId'][0]])

    print(names)

    return identifier
Ejemplo n.º 45
0
def add_provider(cxn, args):
    """Add a new provider to the registry database.
    
    Process ``args`` to add a new provider to the registry database. Return 0
    for success, 1 for failure (error message should be logged).
    
    ``cxn`` => instance of ``sqlite3.Connection``
    ``args`` => instance of ``argparse.Namespace``
    """
    global logger, MAX_NAME_LENGTH
    addlogger = logger.getChild('add')
    # Validate name
    if len(args.name) > MAX_NAME_LENGTH:
        addlogger.critical('Short name for new provider must be no more than '
                           '{0} characters long'.format(MAX_NAME_LENGTH))
        return 1
    elif args.name.startswith(('http://', 'https://')) or args.name == 'all':
        addlogger.critical('Short name for new provider may not be "all" nor '
                           'may it begin "http://" or "https://"')
        return 1
    # Try to create row now to avoid unnecessary validation if duplicate
    try:
        cxn.execute(
            "INSERT INTO providers(name, lastHarvest) values "
            "(?, ?)", (args.name, datetime.fromtimestamp(0)))
    except sqlite3.IntegrityError:
        addlogger.critical('Unable to add provider "{0}"; '
                           'provider with this name already exists'
                           ''.format(args.name))
        return 1
    else:
        addlogger.info('Adding provider "{0}"'.format(args.name))
    # Get any missing information
    # Base URL
    if args.url is None:
        args.url = raw_input('Base URL: '.ljust(20))
        if not args.url:
            addlogger.critical('Base URL for new provider not supplied')
            return 1
    # Set up an OAI-PMH client for validating providers
    md_registry = MetadataRegistry()
    md_registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(args.url, md_registry)
    # Validate Base URL by fetching Identify
    try:
        client.identify()
    except (XMLSyntaxError, HTTPError):
        addlogger.critical('Base URL for new provider does not return a valid '
                           'response to an `Identify` request')
        return 1
    # Destination
    if args.dest is None:
        args.dest = raw_input('Destination directory: '.ljust(20))
        if args.dest:
            # Expand user dir
            args.dest = os.path.expanduser(args.dest)
        else:
            addlogger.info('Destination for data for new provider not supplied;'
                           ' using default `pwd`: {0}'.format(os.getcwd()))
            args.dest = os.getcwd()
    # metadataPrefix
    # Check that selected metadataPrefix is available from provider
    # Fetch list of available formats
    mdps = dict(
        (mdpinfo[0], mdpinfo[1:]) for mdpinfo in client.listMetadataFormats())
    while args.metadataPrefix not in mdps:
        print "Available metadataPrefix values:"
        # List available formats
        for mdp in mdps:
            print mdp, '-', mdps[mdp][1]
        args.metadataPrefix = raw_input('metadataPrefix [oai_dc]:'.ljust(20))
        if not args.metadataPrefix:
            addlogger.info('metadataPrefix for new provider not supplied; '
                           'using default: oai_dc')
            args.metadataPrefix = 'oai_dc'
    cxn.execute(
        "UPDATE providers SET "
        "url=?, "
        "destination=?, "
        "metadataPrefix=? "
        "WHERE name=?", (args.url, args.dest, args.metadataPrefix, args.name))
    addlogger.info('URL for next harvest: {0}?verb=ListRecords'
                   '&metadataPrefix={1}'
                   '&from={2:%Y-%m-%dT%H:%M:%SZ%z}'
                   ''.format(args.url, args.metadataPrefix,
                             datetime.fromtimestamp(0)))
    # All done, commit database
    cxn.commit()
    return 0
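
# A hedged usage sketch. The providers table schema and the Namespace fields
# are inferred from the queries above, not taken from the original project;
# the module-level `logger` and MAX_NAME_LENGTH globals must already be set up.

import sqlite3
from argparse import Namespace

cxn = sqlite3.connect('registry.db')
cxn.execute("CREATE TABLE IF NOT EXISTS providers ("
            "name TEXT UNIQUE, url TEXT, destination TEXT, "
            "metadataPrefix TEXT, lastHarvest TIMESTAMP)")
args = Namespace(name='arxiv', url='http://export.arxiv.org/oai2',
                 dest='/tmp/arxiv', metadataPrefix='oai_dc')
status = add_provider(cxn, args)  # 0 on success, 1 on failure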
Ejemplo n.º 46
0
def identifiy(target):
    if target is not None:
        client = Client(target['url'], registry)
        identify = client.identify()
        return convert_identifiy(identify)
Ejemplo n.º 47
0
    return handler.records[0]

marcxml_reader = MARCXMLReader()
registry = metadata.MetadataRegistry()
registry.registerReader('marc21', marcxml_reader)

g = Graph()
g.namespace_manager.bind('skos', SKOS)
g.namespace_manager.bind('cn', CN)
g.namespace_manager.bind('dc', DC)
g.namespace_manager.bind('dct', DCT)
g.namespace_manager.bind('rdaa', RDAA)
g.namespace_manager.bind('rdac', RDAC)


oai = Client('https://fennica.linneanet.fi/cgi-bin/oai-pmh-fennica-asteri-aut.cgi', registry)

#recs = oai.listRecords(metadataPrefix='marc21', set='corporateNames', from_=datetime(2016,2,10))
recs = oai.listRecords(metadataPrefix='marc21', set='corporateNames')

lang_cache = {}
if os.path.exists(LANG_CACHE_FILE):
  lcf = codecs.open(LANG_CACHE_FILE, 'r', 'utf-8')
  for line in lcf:
    lang, text = line.rstrip("\r\n").split("\t")
    if lang == '': lang = None
    lang_cache[text] = lang
  lcf.close()  

label_to_uri = {}
Ejemplo n.º 48
0
		return handler.records[0]

marcxml_reader = MARCXMLReader()

# Defining of metadata Readers in the Registry

from oaipmh import metadata
from pprint import pprint

registry = metadata.MetadataRegistry()
registry.registerReader('oai_dc', metadata.oai_dc_reader)
registry.registerReader('marc21', marcxml_reader)


#### OAI-PMH Client processing 

oai = Client('http://snape.mzk.cz/OAI-script', registry)

id = oai.identify()
print id.repositoryName()
print id.adminEmails()
print id.baseURL()

formats = oai.listMetadataFormats()
pprint(formats)

# 'marc21'

sets = oai.listSets()
for s in sets:
	print s
Ejemplo n.º 49
0
        'abstract': ('textList', 'oai_dc:dc/dc:description/text()'),
        'date': ('textList', 'oai_dc:dc/dc:date/text()'),
    },
    namespaces={
        'oai_dc': 'http://www.openarchives.org/OAI/2.0/oai_dc/',
        'dc': 'http://purl.org/dc/elements/1.1/'
    })

# And create a registry for parsing the oai info, linked to the reader
registry = MetadataRegistry()
registry.registerReader('oai_dc', oai_dc_reader)

# arXiv OAI url we will query
URL = "http://export.arxiv.org/oai2"
# Create OAI client; now we're all set for listing some records
client = Client(URL, registry)

# Open files for writing
titlef = open(title_file, 'w')
#abstractf = open(abstr_file, 'w')

# Keep track of run-time and number of papers
start_time = time.time()
count = 0

# Harvest
for record in client.listRecords(metadataPrefix='oai_dc', set=section):
    try:
        # Extract the title
        title = record[1].getField('title')[0]
        # Extract the abstract
Ejemplo n.º 50
0
def oai_metadata(oai_endpoint):
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(oai_endpoint, registry)
    return make_graphs(client.listRecords(metadataPrefix='oai_dc'))
Ejemplo n.º 51
0
def oaiSpider(subject="hep-ex", section="physics", start=None, end=None, sleep_time=0):
    '''
    Pull articles using the Open Archives Initiative protocol
    
    subject    - String defining the subset of the main section
    section    - String defining the main section (typically physics or nothing)
    start      - A datetime.datetime object restricting the starting date of returned articles
    end        - A datetime.datetime object restricting the ending date of the returned articles
    sleep_time - A number specifying how many seconds to wait between the record queries
    
    Examples

       oaiSpider("hep-ex", "physics")
       ==> returns all HEP experiment articles
       
       oaiSpider("cs", "", datetime(2011,06,24))
       ==> returns all computer science articles submitted after June 24th, 2011
       
       oaiSpider("hep-ph", "physics", None, datetime(2011,06, 24))
       ==> returns all HEP phenomenology articles submitted before June 24th, 2011

    Returns a list of dictionaries containing the article metadata
    '''

    from oaipmh.client import Client
    from oaipmh.metadata import MetadataRegistry, oai_dc_reader

    base_url = "http://export.arxiv.org/oai2"
    output = []

    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(base_url, registry)
    client.updateGranularity()

    if section is None:
        section = ""
    if len(section) > 0 and section[-1] != ":":
        section += ":"

    # sets = client.listSets()
    # for entry in sets:
    #     print entry
    
    # NOTE: the oaipmh module has proven unreliable here; it sometimes
    # raises errors when the from_ or until keys are supplied.
    records = client.listRecords(metadataPrefix='oai_dc'
                                 , set='%s%s' % (section, subject)
                                 , from_=start
                                 #, from_=datestamp
                                 , until=end
                                 )
    
    counter = 0
    
    for (header, metadata, aux) in records:
        
        print counter

        # for key in  metadata._map.keys():
        #     print key, metadata[key]

        output.append({"title"    : cleanText(metadata["title"][0]),
                       "abstract" : cleanText(metadata["description"][0]),
                       "date"     : convertDate(max(metadata["date"])),
                       "subject"  : subject,
                       "url"      : metadata["identifier"][0],
                       "authors"  : "; ".join( metadata['creator']),
                       })

        print output[-1]
        counter += 1
        
        # break
        # if counter > 15:
        #     break
        time.sleep(sleep_time)

    return output
Ejemplo n.º 52
0
"""Retrieve arXiv abstracts with the OAI-PMH protocol
The parser is messed up: it returns empty lists for the metadata tags;
Do not use it"""

from oaipmh.client import Client
from oaipmh.metadata import MetadataRegistry, oai_dc_reader

URL = 'http://export.arxiv.org/oai2'

registry = MetadataRegistry()
registry.registerReader('arXiv', oai_dc_reader)
client = Client(URL, registry)

counter = 0
for record in client.listRecords(metadataPrefix='arXiv', set='physics:hep-ex'):
    if counter == 150:
        break
    # print record[0].datestamp()
    # print record[0].identifier()
    # print record[1].getField(name='title')
    print(vars(record[1]))
    counter += 1

# for a_set in client.listSets():
#     # if counter == 50:
#     #     break
#     print a_set
#     counter += 1
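
# A minimal corrected sketch, assuming the standard oai_dc prefix is acceptable:
# registering a reader that actually matches the metadataPrefix is what fixes
# the empty fields.

from oaipmh.client import Client
from oaipmh.metadata import MetadataRegistry, oai_dc_reader

registry = MetadataRegistry()
registry.registerReader('oai_dc', oai_dc_reader)  # reader matches the prefix below
client = Client('http://export.arxiv.org/oai2', registry)

for counter, record in enumerate(client.listRecords(metadataPrefix='oai_dc',
                                                     set='physics:hep-ex')):
    if counter == 5:
        break
    print(record[1].getField('title'))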