Example #1
 def _listRecords(self, baseUrl, metadataPrefix="oai_dc", **kwargs):
     # Generator to yield records from baseUrl in the given metadataPrefix
     # Add metadataPrefix to args
     kwargs['metadataPrefix'] = metadataPrefix
     client = Client(baseUrl, self._mdRegistry)
     # Check that baseUrl actually represents an OAI-PMH target
     try:
         client.identify()
     except IndexError:
         raise NotOAIPMHBaseURLException(
             "{0} does not appear to be an OAI-PMH compatible base URL"
             "".format(baseUrl))
     # Check server timestamp granularity support
     client.updateGranularity()
     for record in client.listRecords(**kwargs):
         yield record
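A hedged usage sketch for the generator above, assuming an instance (here called harvester, a hypothetical name) of the class that defines _listRecords and NotOAIPMHBaseURLException:

try:
    for header, metadata, about in harvester._listRecords(
            'http://export.arxiv.org/oai2', metadataPrefix='oai_dc'):
        print header.identifier()
except NotOAIPMHBaseURLException:
    pass  # the baseUrl was not an OAI-PMH endpoint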
Example #2
 def _listRecords(self, baseUrl, metadataPrefix="oai_dc", **kwargs):
     # Generator to yield records from baseUrl in the given metadataPrefix
     # Add metadataPrefix to args
     kwargs['metadataPrefix'] = metadataPrefix
     client = Client(baseUrl, self._mdRegistry)
     # Check that baseUrl actually represents an OAI-PMH target
     try:
         client.identify()
     except IndexError:
         raise NotOAIPMHBaseURLException(
             "{0} does not appear to be an OAI-PMH compatible base URL"
             "".format(baseUrl)
         )
     # Check server timestamp granularity support
     client.updateGranularity()
     for record in client.listRecords(**kwargs):
         yield record
Example #3
def test(request):
	URL = 'http://www.kulturarv.dk/ffrepox/OAIHandler'
	registry = MetadataRegistry()
	registry.registerReader('oai_dc', oai_dc_reader)
	client = Client(URL, registry)
	identifyResponse = client.identify()

	print dir(identifyResponse)
	#for record in client.listRecords(metadataPrefix='oai_dc'):
	#	result += record
	return HttpResponse(identifyResponse.repositoryName())
Example #4
 def _listRecords(self, baseUrl, metadataPrefix="oai_dc", **kwargs):
     # Generator to yield records from baseUrl in the given metadataPrefix
     # Add metadataPrefix to args
     kwargs['metadataPrefix'] = metadataPrefix
     client = Client(baseUrl, self._mdRegistry)
     # Check that baseUrl actually represents an OAI-PMH target
     try:
         client.identify()
     except IndexError:
         raise NotOAIPMHBaseURLException(
             "{0} does not appear to be an OAI-PMH compatible base URL"
             "".format(baseUrl))
     # Check server timestamp granularity support
     client.updateGranularity()
     for record in client.listRecords(**kwargs):
         # Unit test hotfix
         header, metadata, about = record
         # Fix pyoai returning a "b'...'" string for py3k
         if isinstance(metadata, str) and metadata.startswith("b'"):
             metadata = ast.literal_eval(metadata).decode("utf-8")
         yield (header, metadata, about)
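Why the ast.literal_eval branch above works: under Python 3 the metadata can arrive as the textual form of a bytes literal, and literal_eval turns that text back into real bytes. A standalone sketch:

import ast

s = "b'<dc:title>Example</dc:title>'"  # a str that merely looks like a bytes literal
raw = ast.literal_eval(s)              # evaluates the literal -> b'<dc:title>...'
text = raw.decode("utf-8")             # decode to a real text string
assert text == '<dc:title>Example</dc:title>'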
Example #5
 def _listRecords(self, baseUrl, metadataPrefix="oai_dc", **kwargs):
     # Generator to yield records from baseUrl in the given metadataPrefix
     # Add metadataPrefix to args
     kwargs['metadataPrefix'] = metadataPrefix
     client = Client(baseUrl, self._mdRegistry)
     # Check that baseUrl actually represents an OAI-PMH target
     try:
         client.identify()
     except IndexError:
         raise NotOAIPMHBaseURLException(
             "{0} does not appear to be an OAI-PMH compatible base URL"
             "".format(baseUrl)
         )
     # Check server timestamp granularity support
     client.updateGranularity()
     for record in client.listRecords(**kwargs):
         # Unit test hotfix
         header, metadata, about = record
         # Fix pyoai returning a "b'...'" string for py3k
         if isinstance(metadata, str) and metadata.startswith("b'"):
             metadata = ast.literal_eval(metadata).decode("utf-8")
         yield (header, metadata, about)
Example #6
    def clean(self):
        cleaned_data = super(CreateRepositoryForm, self).clean()
        try:
            registry = MetadataRegistry()
            registry.registerReader('oai_dc', oai_dc_reader)
            client = Client(cleaned_data.get('base_url'), registry)
            server = client.identify()
            # set the repository name to apply to the model instance when saved.
            cleaned_data['name'] = server.repositoryName()
        except Exception:
            raise ValidationError('Repository base url is invalid.')

        return cleaned_data
Example #7
File: forms.py Project: llcit/llt
    def clean(self):
        cleaned_data = super(CreateRepositoryForm, self).clean()
        try:
            registry = MetadataRegistry()
            registry.registerReader('oai_dc', oai_dc_reader)
            client = Client(cleaned_data.get('base_url'), registry)
            server = client.identify()
            # set the repository name to apply to the model instance when saved.
            cleaned_data['name'] = server.repositoryName()
        except Exception:
            raise ValidationError('Repository base url is invalid.')

        return cleaned_data
Example #8
    def checkProvider(self, url):
        """
         Check OAI-PMH provider. A valid Identify response is considered
         provider online; an exception is considered provider offline.
         """

        try:
            client = Client(url)
            ident = client.identify()
            self.log.debug("Service at: " + url + " is responding")
            self.log.debug("RepositoryName is: " + ident.repositoryName())
            self.log.debug("BaseURL is: " + ident.baseURL())
            return True

        except Exception as e:
            self.log.error("Problem with server at: " + url + "\n")
            #,exc_info=True)
            return False
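A hedged usage sketch, where monitor stands in for an instance of the class defining checkProvider above (a hypothetical name):

if monitor.checkProvider('http://export.arxiv.org/oai2'):
    print 'provider is online'
else:
    print 'provider is offline'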
Example #9
    'oai_oi': 'http://www.openbeelden.nl/feeds/oai/', #'http://www.openarchives.org/OAI/2.0/oai_oi/',
    'oi': 'http://www.openbeelden.nl/oai/'}
    )

#URL = 'http://www.openbeelden.nl/oip-test/feeds/oai/'
URL = 'http://www.openbeelden.nl/feeds/oai/'
#URL = 'http://oai.tuxic.nl/oai/'

#Initialize the OAI client
registry = MetadataRegistry()
registry.registerReader('oai_oi', oai_oi_reader)
client = Client(URL, registry)
x = client.updateGranularity()

#Check that the OAI service can be identified properly
x = client.identify()
print 'identity %s' % x.repositoryName()
print 'identity %s' % x.protocolVersion()
print 'identity %s' % x.baseURL()

OUTPUT_DIR = '/Users/jblom/temp'


print 'Firing up the openSKOSHandler'
osh = OpenSKOSHandler()

def processOpenbeelden():
	i=0
	iarecs = []
	#for y in client.listRecords(metadataPrefix='oai_oi', from_=parse('2011-01-01'), until=parse('2011-11-01')):
	extent = None
Example #10
URL = "http://citeseerx.ist.psu.edu/oai2"

registry = MetadataRegistry()
registry.registerReader('oai_dc', oai_dc_reader)

client = Client(URL, registry)
client.updateGranularity()

store = Store()

if len(sys.argv) > 1:
    start = datetime.strptime(sys.argv[1], '%Y-%m-%d') #2011-10-27, for instance
elif store.last():
    start = store.last()
else:
    start = client.identify().earliestDatestamp()

#try this and see if it works; if it does resumption tokens right, this should work fine.


chunk = timedelta(days=1)
oneday = timedelta(days=1)

#TODO: clearly they don't do this whole "ordered" thing. Grab records by month or year or something instead of all at once.
#TODO: luckily, once we've done a full slurp, we only need to remember when the last full slurp was and start since then. But if interrupted, we need to start back from where the last *full* slurp was, due to the ordering problem.

#TODO: structure this better, with the try effectively moved much further above. Really, move a lot more into functions
try:
    current = start #TODO: make a nice little generator so I can use a for loop
    while current <= datetime.now():
        print >>sys.stderr, "fetching records @", now(), "starting with", current.strftime('%Y-%m-%d')
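The example is cut off here. A minimal sketch, under stated assumptions, of the chunked fetch the comments above describe (store.add is a hypothetical Store method; NoRecordsMatchError comes from oaipmh.error):

from oaipmh.error import NoRecordsMatchError

current = start
while current <= datetime.now():
    try:
        for header, metadata, about in client.listRecords(
                metadataPrefix='oai_dc', from_=current, until=current + chunk):
            store.add(header, metadata)  # hypothetical Store API
    except NoRecordsMatchError:
        pass  # empty window; move on
    current += oneday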
Example #11
    def run(self):
        # Check that ElasticSearch is alive
        self.check_index()

        # If the user specified the --REBUILD flag, recreate the index
        if self.options['rebuild']:
            self.rebuild_index()

        # Connect to the repository
        registry = MetadataRegistry()
        registry.registerReader(self.settings["metadata_format"], self.settings["metadata_reader"])

        client = Client(self.settings["uri"], registry)
        identity = client.identify()

        print "Connected to repository: %s" % identity.repositoryName()

        # got to update granularity or we barf with: 
        # oaipmh.error.BadArgumentError: Max granularity is YYYY-MM-DD:2003-04-10T00:00:00Z
        client.updateGranularity()

        # Initialise some variables
        batcher = Batch.Batch()
        total_records = 0
        start = time.time()
        
        # Now do the synchronisation
        
        # If the user specified an identifier, then synchronise this record
        if (self.options['identifier'] is not None):
            total_records += self.synchronise_record(client, batcher, self.options['identifier'])
        else:
            # Else, synchronise using the date-range provided by the user, or failing that, 
            # the date-range based on the last sync

            # Get the synchronisation config record
            synchronisation_config = self.get_synchronisation_config()

            
            if self.options["from_date"] is not None:
                # If the user specified a from-date argument, use it
                from_date = self.options["from_date"] # already a date (not a datetime)
            elif synchronisation_config is not None and "to_date" in synchronisation_config:
                # Else read the last synchronised to_date from the config, and add on a day
                from_date = dateutil.parser.parse(synchronisation_config["to_date"]).date() + timedelta(days=1)
            else:
                # Else use the default_from_date in the config
                from_date = dateutil.parser.parse(self.settings['default_from_date']).date()

            if self.options["to_date"] is not None:
                to_date = self.options["to_date"] # already a date (not a datetime)
            else:
                to_date = (date.today() - timedelta(days=1))
            
            # Force the from_date to use time 00:00:00
            from_date = datetime.combine(from_date, _time(hour=0, minute=0, second=0, microsecond=0))

            # Force the to_date to use time 23:59:59
            to_date = datetime.combine(to_date, _time(hour=23, minute=59, second=59, microsecond=0))


            print "Synchronising from %s - %s" % (from_date, to_date)

            while from_date < to_date:
                next_date = datetime.combine(from_date.date() + timedelta(days=(self.settings['delta_days'] - 1)), _time(hour=23, minute=59, second=59, microsecond=0))
                number_of_records = self.synchronise_period(client, batcher, from_date, next_date)
                batcher.clear() #Store the records in elasticsearch
                self.put_synchronisation_config(from_date, next_date, number_of_records)
                from_date += timedelta(days=(self.settings['delta_days']))
                total_records += number_of_records

                # Pause so as not to get banned.
                to = 20
                print "Sleeping for %i seconds so as not to get banned." % to
                time.sleep(to)

            
        # Store the records in the index
        batcher.clear()
        
        # Print out some statistics
        time_spent = time.time() - start
        print 'Total time spent: %d seconds' % (time_spent)

        if time_spent > 0.001: # careful as it's not an integer
            print 'Total records synchronised: %i records (%d records/second)' % (total_records, (total_records/time_spent))
        else:
            print 'Total records synchronised: %i records' % (total_records)
        return total_records

        sys.exit()
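The granularity comment above deserves a standalone note: pyoai's updateGranularity() reads the repository's Identify response so that from_/until arguments are serialised at a precision the server accepts. A minimal sketch against a hypothetical endpoint:

from datetime import datetime
from oaipmh.client import Client
from oaipmh.metadata import MetadataRegistry, oai_dc_reader

registry = MetadataRegistry()
registry.registerReader('oai_dc', oai_dc_reader)
client = Client('http://example.org/oai', registry)  # hypothetical endpoint

client.updateGranularity()  # avoids BadArgumentError on day-granularity servers
records = client.listRecords(metadataPrefix='oai_dc',
                             from_=datetime(2003, 4, 10))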
Example #12
def add_provider(cxn, args):
    """Add a new provider to the registry database.
    
    Process ``args`` to add a new provider to the registry database. Return 0
    for success, 1 for failure (error message should be logged).
    
    ``cxn`` => instance of ``sqlite3.Connection``
    ``args`` => instance of ``argparse.Namespace``
    """
    global logger, MAX_NAME_LENGTH
    addlogger = logger.getChild('add')
    # Validate name
    if len(args.name) > MAX_NAME_LENGTH:
        addlogger.critical('Short name for new provider must be no more than '
                           '{0} characters long'.format(MAX_NAME_LENGTH))
        return 1
    elif args.name.startswith(('http://', 'https://')) or args.name == 'all':
        addlogger.critical('Short name for new provider may not be "all" nor '
                           'may it begin "http://" or "https://"')
        return 1
    # Try to create row now to avoid unnecessary validation if duplicate
    try:
        cxn.execute("INSERT INTO providers(name, lastHarvest) values "
                         "(?, ?)",
                         (args.name, datetime.fromtimestamp(0))
        )
    except sqlite3.IntegrityError:
        addlogger.critical('Unable to add provider "{0}"; '
                           'provider with this name already exists'
                           ''.format(args.name)
                           )
        return 1
    else:
        addlogger.info('Adding provider "{0}"'.format(args.name))
    # Get any missing information
    # Base URL
    if args.url is None:
        args.url = raw_input('Base URL:'.ljust(20))
        if not args.url:
            addlogger.critical('Base URL for new provider not supplied')
            return 1
    # Set up an OAI-PMH client for validating providers
    md_registry = MetadataRegistry()
    md_registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(args.url, md_registry)
    # Validate Base URL by fetching Identify
    try:
        client.identify()
    except (XMLSyntaxError, HTTPError):
        addlogger.critical('Base URL for new provider does not return a valid '
                           'response to an `Identify` request')
        return 1
    # Destination
    if args.dest is None:
        args.dest = raw_input('Destination directory: '.ljust(20))
        if args.dest:
            # Expand user dir
            args.dest = os.path.expanduser(args.dest)
        else:
            addlogger.info('Destination for data for new provider not supplied;'
                           ' using default `pwd`: {0}'.format(os.getcwd())
                           )
            args.dest = os.getcwd()
    # metadataPrefix
    # Check that selected metadataPrefix is available from provider
    # Fetch list of available formats
    mdps = dict((mdpinfo[0], mdpinfo[1:])
                    for mdpinfo in
                    client.listMetadataFormats())
    while args.metadataPrefix not in mdps:
        print "Available metadataPrefix values:"
        # List available formats
        for mdp in mdps:
            print mdp, '-', mdps[mdp][1]
        args.metadataPrefix = raw_input('metadataPrefix [oai_dc]:'.ljust(20))
        if not args.metadataPrefix:
            addlogger.info('metadataPrefix for new provider not supplied. '
                           'using default: oai_dc')
            args.metadataPrefix = 'oai_dc'
    cxn.execute("UPDATE providers SET "
                     "url=?, "
                     "destination=?, "
                     "metadataPrefix=? "
                     "WHERE name=?",
                     (args.url,
                      args.dest,
                      args.metadataPrefix,
                      args.name
                      )
    )
    addlogger.info('URL for next harvest: {0}?verb=ListRecords'
                   '&metadataPrefix={1}'
                   '&from={2:%Y-%m-%dT%H:%M:%SZ%z}'
                   ''.format(args.url,
                             args.metadataPrefix,
                             datetime.fromtimestamp(0)
                             )
                   )
    # All done, commit database
    cxn.commit()
    return 0
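A hedged sketch of driving add_provider directly, assuming the defining module has configured its logger and MAX_NAME_LENGTH globals (the values here are illustrative):

import sqlite3
from argparse import Namespace

# Assumes the providers table already exists in the database file.
cxn = sqlite3.connect('registry.db')
args = Namespace(name='arxiv',
                 url='http://export.arxiv.org/oai2',
                 dest='/tmp/arxiv',
                 metadataPrefix='oai_dc')
status = add_provider(cxn, args)  # 0 on success, 1 on failure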
Example #13
class Repository(object):
    """ Repository handles interaction with the various interfaces provided by 
    the dspace repository. """
    def __init__(self, url=None, **kwargs):
        self.base_url = kwargs.pop('base_url', None)
        self.oai_path = kwargs.pop('oai_path', None)

        self.oai_enabled = bool(kwargs.pop('oai_enabled', True))
        self.sword_enabled = bool(kwargs.pop('sword_enabled', False))

        if url is not None:
            warn(
                'The url parameter will not be supported in version 3, '
                'use base_url and oai_path instead', DeprecationWarning)

            if (self.base_url and url.startswith(self.base_url)
                    and self.oai_path is None):
                self.oai_path = url.replace(self.base_url, '', 1).lstrip('/')
            elif not self.base_url:
                if self.oai_path is None:
                    self.oai_path = 'dspace-oai/request'
                if url.endswith(self.oai_path):
                    self.base_url = url[:-(len(self.oai_path) + 1)]

        if self.base_url is None:
            raise ValueError('base_url argument must be specified')

        if 'metadata_registry' not in kwargs:
            kwargs['metadata_registry'] = MetadataRegistry()
            kwargs['metadata_registry'].registerReader('mets',
                                                       dspace_mets_reader)

        if self.sword_enabled:
            skwargs = {'base_url': self.base_url}

            for key in kwargs.keys():
                if key.startswith('sword_'):
                    skwargs[key[6:]] = kwargs.pop(key)

            self.sword = SwordService(**skwargs)

        if self.oai_enabled:
            self.oai = Client('/'.join((
                self.base_url,
                self.oai_path,
            )), **kwargs)

        self.identifier_base = self._extractIdentifierBase(self.base_url)

    def _extractIdentifierBase(self, url):
        """ From a given URL, extract the OAI identifier base (hostname) """
        return urlparse(url).hostname

    def _extractSet(self, handle):
        """ Determine the OAI set from a collection handle """
        if not isinstance(handle, basestring):
            raise ValueError('Collection handles must be strings')
        return 'hdl_' + handle.replace('/', '_').replace(':', '_')

    def getName(self):
        """ Get the configured name of the repository """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        return self.oai.identify().repositoryName()

    def getCollections(self):
        """ Get a list of the collections in the repository """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        return map(lambda c: c[0:2], self.oai.listSets())

    def getItemHandles(self, collection=None, **kw):
        """ Get item handles from the OAI-PMH interface """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        for item in self.getItemIdentifiers(collection=collection, **kw):
            yield item.identifier().split(':', 2)[2]

    def getItemIdentifiers(self, collection=None, **kw):
        """ Get item identifiers from the OAI-PMH interface """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        kw.setdefault('metadataPrefix', 'mets')

        if collection:
            kw['set'] = self._extractSet(collection)

        return self.oai.listIdentifiers(**kw)

    def getItems(self, collection=None, **kw):
        """ Get full items from the OAI-PMH interface """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        kw.setdefault('metadataPrefix', 'mets')

        if collection:
            kw['set'] = self._extractSet(collection)

        return self.oai.listRecords(**kw)

    def getItem(self, handle=None, identifier=None, **kwargs):
        """ Get a single item from the OAI-PMH interface either by handle or 
        identifier """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        kwargs.setdefault('metadataPrefix', 'mets')

        if handle is None and identifier is None:
            raise ValueError('Either handle or identifier must be provided')

        if handle is not None:
            if identifier is not None:
                raise ValueError('Either a handle or identifier must be '
                                 'provided, not both')

            identifier = 'oai:%s:%s' % (
                self.identifier_base,
                handle,
            )

        return self.oai.getRecord(identifier=identifier, **kwargs)

    def getOAIItemIdentifier(self, handle):
        return 'oai:%s:%s' % (self._extractIdentifierBase(
            self.base_url), handle)

    def getSwordCollections(self):
        pass

    def getSwordCollection(self, args):
        pass
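A short hedged usage sketch for the class above (the base URL is hypothetical; oai_path is passed explicitly because the default is only applied on the deprecated url path):

repo = Repository(base_url='http://demo.dspace.org',
                  oai_path='dspace-oai/request')
print repo.getName()
for set_spec, set_name in repo.getCollections():
    print set_spec, '-', set_name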
Example #14
def identifiy(target):
    if target is not None:
        client = Client(target['url'], registry)
        identify = client.identify()
        return convert_identifiy(identify)
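This snippet relies on a module-level registry that isn't shown; a minimal sketch of the setup it assumes (convert_identifiy is the snippet's own helper and remains unshown):

from oaipmh.client import Client
from oaipmh.metadata import MetadataRegistry, oai_dc_reader

registry = MetadataRegistry()
registry.registerReader('oai_dc', oai_dc_reader)

info = identifiy({'url': 'http://export.arxiv.org/oai2'})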
Example #15
def add_provider(cxn, args):
    """Add a new provider to the registry database.
    
    Process ``args`` to add a new provider to the registry database. Return 0
    for success, 1 for failure (error message should be logged).
    
    ``cxn`` => instance of ``sqlite3.Connection``
    ``args`` => instance of ``argparse.Namespace``
    """
    global logger, MAX_NAME_LENGTH
    addlogger = logger.getChild('add')
    # Validate name
    if len(args.name) > MAX_NAME_LENGTH:
        addlogger.critical('Short name for new provider must be no more than '
                           '{0} characters long'.format(MAX_NAME_LENGTH))
        return 1
    elif args.name.startswith(('http://', 'https://')) or args.name == 'all':
        addlogger.critical('Short name for new provider may not be "all" nor '
                           'may it begin "http://" or "https://"')
        return 1
    # Try to create row now to avoid unnecessary validation if duplicate
    try:
        cxn.execute(
            "INSERT INTO providers(name, lastHarvest) values "
            "(?, ?)", (args.name, datetime.fromtimestamp(0)))
    except sqlite3.IntegrityError:
        addlogger.critical('Unable to add provider "{0}"; '
                           'provider with this name already exists'
                           ''.format(args.name))
        return 1
    else:
        addlogger.info('Adding provider "{0}"'.format(args.name))
    # Get any missing information
    # Base URL
    if args.url is None:
        args.url = raw_input('Base URL:'.ljust(20))
        if not args.url:
            addlogger.critical('Base URL for new provider not supplied')
            return 1
    # Set up an OAI-PMH client for validating providers
    md_registry = MetadataRegistry()
    md_registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(args.url, md_registry)
    # Validate Base URL by fetching Identify
    try:
        client.identify()
    except (XMLSyntaxError, HTTPError):
        addlogger.critical('Base URL for new provider does not return a valid '
                           'response to an `Identify` request')
        return 1
    # Destination
    if args.dest is None:
        args.dest = raw_input('Destination directory: '.ljust(20))
        if args.dest:
            # Expand user dir
            args.dest = os.path.expanduser(args.dest)
        else:
            addlogger.info('Destination for data for new provider not supplied;'
                           ' using default `pwd`: {0}'.format(os.getcwd()))
            args.dest = os.getcwd()
    # metadataPrefix
    # Check that selected metadataPrefix is available from provider
    # Fetch list of available formats
    mdps = dict(
        (mdpinfo[0], mdpinfo[1:]) for mdpinfo in client.listMetadataFormats())
    while args.metadataPrefix not in mdps:
        print "Available metadataPrefix values:"
        # List available formats
        for mdp in mdps:
            print mdp, '-', mdps[mdp][1]
        args.metadataPrefix = raw_input('metadataPrefix [oai_dc]:'.ljust(20))
        if not args.metadataPrefix:
            addlogger.info('metadataPrefix for new provider not supplied. '
                           'using default: oai_dc')
            args.metadataPrefix = 'oai_dc'
    cxn.execute(
        "UPDATE providers SET "
        "url=?, "
        "destination=?, "
        "metadataPrefix=? "
        "WHERE name=?", (args.url, args.dest, args.metadataPrefix, args.name))
    addlogger.info('URL for next harvest: {0}?verb=ListRecords'
                   '&metadataPrefix={1}'
                   '&from={2:%Y-%m-%dT%H:%M:%SZ%z}'
                   ''.format(args.url, args.metadataPrefix,
                             datetime.fromtimestamp(0)))
    # All done, commit database
    cxn.commit()
    return 0
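Both versions of add_provider imply a providers table along these lines; a hedged reconstruction (the column types are assumptions inferred from the INSERT/UPDATE statements):

import sqlite3

cxn = sqlite3.connect('registry.db')
cxn.execute("""CREATE TABLE IF NOT EXISTS providers (
                   name           TEXT PRIMARY KEY,  -- uniqueness backs the IntegrityError branch
                   lastHarvest    TIMESTAMP,
                   url            TEXT,
                   destination    TEXT,
                   metadataPrefix TEXT
               )""")
cxn.commit()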
Example #16
    def retrieval(self, repository):
        self.logger.info(u'Trying to retrieve url {0}'.format(repository[1]).encode(ENCODE))

        registry = MetadataRegistry()
        registry.registerReader(METADATA, oai_dc_reader)

        try:
            client = Client(repository[1], registry)

            self.logger.info(SEPARATOR)
            self.logger.info(u'Connection established successfully...')

            # identify info
            identify = client.identify()
            repository_name = identify.repositoryName()
            repository_name_normalized = re.sub(re.compile(FILE_ESCAPE_CHARS), '', repository_name).strip() \
                .replace(' ', '_').lower()
            base_url = identify.baseURL().encode(ENCODE)
            protocol_version = identify.protocolVersion().encode(ENCODE)
            granularity = identify.granularity().encode(ENCODE)
            compression = identify.compression()
            deleted_record = identify.deletedRecord().encode(ENCODE)

            metadata = {'repository_name': repository_name,
                        'base_url': base_url,
                        'latest_url': repository[1],
                        'protocol_version': protocol_version,
                        'granularity': granularity,
                        'compression': str(compression).strip('[]'),
                        'deleted_record': deleted_record}

            self.logger.info(u'Repository name: {0}'.format(repository_name))
            self.logger.info(u'URL connected: {0}'.format(repository[1]))
            self.logger.info(u'Base URL: {0}'.format(base_url))
            self.logger.info(u'Protocol version: {0}'.format(protocol_version))
            self.logger.info(u'Granularity: {0}'.format(granularity))
            self.logger.info(u'Compression: {0}'.format(compression))
            self.logger.info(u'Deleted record: {0}'.format(deleted_record))

            records_count = 0
            deleted_count = 0
            records_list = list()
            parsed_records_list = list()

            # we're not interested in all sets, so we must iterate over the ones we have and want to crawl
            if repository[2] is not None:
                self.logger.info(u'Fetching set {0}...'.format(repository[2]))
                records_list = client.listRecords(metadataPrefix=METADATA, set=repository[2])
            else:
                records_list = client.listRecords(metadataPrefix=METADATA)
            if records_list is not None:
                for record in records_list:
                    records_count += 1
                    if record[0].isDeleted():
                        deleted_count += 1
                    if record[1] is not None:
                        parsed_records_list.append(tostring(record[1].element()))
                self.logger.info(
                    u'Retrieved {0} records from set {1} where {2} were deleted'.format(records_count, repository[2],
                                                                                        deleted_count))
            if not exists(''.join(['files/', repository_name_normalized, '/'])):
                self.logger.info('Creating storage folder for {0}...'.format(repository_name))
                makedirs(''.join(['files/', repository_name_normalized, '/']))

            self.logger.info(u'Creating storage files...')
            meta_file = open(''.join(['files/', repository_name_normalized, '/metadata.xml']), 'w')
            metadata[repository[2] + '_records_number'] = records_count
            metadata[repository[2] + '_deleted_number'] = deleted_count
            meta_file.write(tostring(dict_to_xml('metadata', metadata)))
            meta_file.close()

            record_file = open(''.join(
                ['files/', repository_name_normalized, '/', repository_name_normalized, '_', repository[2], '.xml']),
                'w')
            record_file.write(''.join(parsed_records_list))
            record_file.close()

        except NoRecordsMatchError, nrme:
            self.logger.error(u'{0} on repository {1}'.format(nrme.message, repository_name))

            # add url to unvisited_url and ask retrieval to try to crawl them again
            if nrme.message == 'No matches for the query':
                self.unvisited_repository.append(repository)
Example #17
class OpenBeeldenDataLoader(DataLoader):

	def __init__(self):
		self.ES_INDEX = 'et_openbeelden'
		self.ES_DOC_TYPE = 'mediaresource'
		self.es_local = Elasticsearch(host=LTV_ES_SETTINGS['host'], port=LTV_ES_SETTINGS['port'])

	def loadMediaResourceData(self, resourceUri, clientIP, loadAnnotations):
		mediaResource = MediaResource(resourceUri)

		#load the annotations (only named entities in this case)
		mediaResource = self.__getAllAnnotationsOfResource(mediaResource)

		#fetch the video metadata
		mediaResource = self.__getAllVideoMetadata(mediaResource, clientIP)

		#transform the mediaresource object to JSON and return it
		resp = simplejson.dumps(mediaResource, default=lambda obj: obj.__dict__)
		return resp

	def loadMediaResources(self, provider):#ignores provider
		return self.loadOpenBeeldenItemsFromES(0, [])


	def loadOpenBeeldenItemsFromES(self, offset, videos):
		query = {
			"query": {
				"match_all": {}
			},
  			"fields": [],
  			"from": offset,
			"size": 300
		}
		resp = self.es_local.search(index=self.ES_INDEX, doc_type=self.ES_DOC_TYPE, body=query, timeout="10s")
		if resp and len(resp['hits']['hits']) > 0:
			print len(resp['hits']['hits'])
			vids = []
			for hit in resp['hits']['hits']:
				vid = self.es_local.get(index=self.ES_INDEX, doc_type=self.ES_DOC_TYPE, id=hit['_id'])
				vids.append(vid['_source'])
			for vd in vids:
				video = {
					'id' : vd['id'].replace(':', '_'),
					'title' : '; '.join(vd['title']),
					'date' : '; '.join(vd['date']),
					'locator' : self.__getMediumByExtension(vd['medium'], 'mp4'),
					'thumbUrl' : self.__getMediumByExtension(vd['medium'], 'png'),
					'thumbBaseUrl' : ''
				}
				videos.append(video)
			self.loadOpenBeeldenItemsFromES(offset + 300, videos)
		return {'videos' : videos}


	def __getMediumByExtension(self, mediums, extension):
		poster = None
		for m in mediums:
			if m.find('.%s' % extension) != -1:
				poster = m
				break
		return poster

	def __getAllAnnotationsOfResource(self, mediaResource):
		nes = []
		"""
		nes.append(NamedEntity(
			label,
			entityType=LinkedTVDataUtils.getNEType(DCType, RDFType, OWLSameAs),
			subTypes=LinkedTVDataUtils.getDCTypes(DCType),
			disambiguationURL=OWLSameAs,
			start=start,
			end=end,
			annotationURI=annotationURI,
			relevance=r,
			confidence=c
			)
		)
		"""
		mediaResource.setNamedEntities(nes)

		return mediaResource

	def __getAllVideoMetadata(self, mediaResource, clientIP):
		print mediaResource.getId()
		vd = self.es_local.get(index=self.ES_INDEX, doc_type=self.ES_DOC_TYPE, id=mediaResource.getId().replace('_', ':'))
		if vd:
			vd = vd['_source']
			mediaResource.setVideoMetadata(vd)

			mediaResource.setPlayoutUrl(self.__getMediumByExtension(vd['medium'], 'mp4'))

			#set the video metadata in the mediaresource
			mediaResource.setTitle('; '.join(vd['title']))
			mediaResource.setDate('; '.join(vd['date']))
			mediaResource.setThumbBaseUrl(None)
			mediaResource.setSrtUrl(None)
			mediaResource.setSubtitles(None)

		return mediaResource

	def setupOAIPMHConnection(self):
		oai_oi_reader = MetadataReader(
		    fields={
		    'title':       ('textList', 'oai_oi:oi/oi:title/text()'),
		    'alternative':       ('textList', 'oai_oi:oi/oi:alternative/text()'),
		    'creator':     ('textList', 'oai_oi:oi/oi:creator/text()'),
		    'subject':     ('textList', 'oai_oi:oi/oi:subject/text()'),
		    'description': ('textList', 'oai_oi:oi/oi:description/text()'),
		    'abstract': ('textList', 'oai_oi:oi/oi:abstract/text()'),
		    'publisher':   ('textList', 'oai_oi:oi/oi:publisher/text()'),
		    'contributor': ('textList', 'oai_oi:oi/oi:contributor/text()'),
		    'date':        ('textList', 'oai_oi:oi/oi:date/text()'),
		    'type':        ('textList', 'oai_oi:oi/oi:type/text()'),
		    'extent':        ('textList', 'oai_oi:oi/oi:extent/text()'),
		    'medium':        ('textList', 'oai_oi:oi/oi:medium/text()'),
		    'identifier':  ('textList', 'oai_oi:oi/oi:identifier/text()'),
		    'source':      ('textList', 'oai_oi:oi/oi:source/text()'),
		    'language':    ('textList', 'oai_oi:oi/oi:language/text()'),
		    'references':    ('textList', 'oai_oi:oi/oi:references/text()'),
		    'spatial':    ('textList', 'oai_oi:oi/oi:spatial/text()'),
		    'attributionName':    ('textList', 'oai_oi:oi/oi:attributionName/text()'),
		    'attributionURL':    ('textList', 'oai_oi:oi/oi:attributionURL/text()'),
		    'license':      ('textList', 'oai_oi:oi/oi:license/text()')
		    },

		    namespaces={
		    	'oai_oi': 'http://www.openbeelden.nl/feeds/oai/', #'http://www.openarchives.org/OAI/2.0/oai_oi/',
		    	'oi': 'http://www.openbeelden.nl/oai/'
		    }
		)

		URL = 'http://www.openbeelden.nl/feeds/oai/'

		#Initialize the OAI client
		self.registry = MetadataRegistry()
		self.registry.registerReader('oai_oi', oai_oi_reader)
		self.client = Client(URL, self.registry)

		#Test if the connection to the OAI-PMH provider works
		x = self.client.updateGranularity()
		x = self.client.identify()
		print 'identity %s' % x.repositoryName()
		print 'identity %s' % x.protocolVersion()
		print 'identity %s' % x.baseURL()

		"""
		for s in client.listSets():
			print s
		"""

		#initialize the OpenSKOSHandler
		self.openSKOSHandler = OpenSKOSHandler()

	def reindex(self, provider=None):
		self.setupOAIPMHConnection()
		i = 0
		extent = None
		item = None
		identifier = None
		for rec in self.client.listRecords(metadataPrefix=u'oai_oi', set=u'beeldengeluid'):#stichting_natuurbeelden, beeldengeluid
			header, metadata, about = rec

			extent = metadata.getField('extent')[0]
			item = {
				'id' : header.identifier(),
				'identifier' : self.getFieldData(metadata, 'identifier'),
				'title' : self.getFieldData(metadata, 'title'),
				'alternative' : self.getFieldData(metadata, 'alternative'),
				'creator' : self.getFieldData(metadata, 'creator'),
				'subject' : self.getFieldData(metadata, 'subject'),
				'description' : self.getFieldData(metadata, 'description'),
				'abstract' : self.getFieldData(metadata, 'abstract'),
				'publisher' : self.getFieldData(metadata, 'publisher'),
				'contributor' : self.getFieldData(metadata, 'contributor'),
				'date' : self.getFieldData(metadata, 'date'),
				'date2' : header.datestamp(),
				'type' : self.getFieldData(metadata, 'type'),
				'extent' : extent,
				'medium' : self.getFieldData(metadata, 'medium'),
				'source' : self.getFieldData(metadata, 'source'),
				'language' : self.getFieldData(metadata, 'language'),
				'references' : self.getFieldData(metadata, 'references'),
				'spatial' : self.getFieldData(metadata, 'spatial'),
				'attributionName' : self.getFieldData(metadata, 'attributionName'),
				'attributionURL' : self.getFieldData(metadata, 'attributionURL'),
				'license' : self.getFieldData(metadata, 'license'),
				'durationSecs' : self.getExtentInSeconds(extent)
			}
			self.es_local.index(index=self.ES_INDEX, doc_type=self.ES_DOC_TYPE, id=header.identifier(), body=item)

		print 'Done'
		return True

	def getGTAATermsBySubjects(self, subject, spatial):
		"""Get the GTAA terms related to the subject"""
		gtaaTerms = self.getGTAATermsBasedOnSubjectAndLocation(subject, spatial)

		"""If there is no identifier, try to fetch the taakID from iMMix ES"""
		if identifier == '' and source != '':
			print 'No taakID!'
			taakID = self.getTaakIDBasedOnSource(source)
			if taakID:
				print 'assigning taakID to the identifier'
				identifier = taakID
		return gtaaTerms

	def getFieldData(self, metadata, fn):
		#return '; '.join(metadata.getField(fn))
		return metadata.getField(fn)

	def getExtentInSeconds(self, ext):
		secs = 0
		if ext and ext.find('PT') != -1:
			ext = ext[2:len(ext)]
			if ext.find('H') != -1:
				secs = int(ext[0:ext.find('H')]) * 3600
				ext = ext[ext.find('H') + 1:len(ext)]
			if ext.find('M') != -1:
				secs += int(ext[0:ext.find('M')]) * 60
				ext = ext[ext.find('M') + 1:len(ext)]
			if ext.find('S') != -1:
				secs += int(ext[0:ext.find('S')])
		return secs
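	# For example: getExtentInSeconds('PT1H2M3S') -> 3600 + 120 + 3 = 3723,
	# and getExtentInSeconds('PT45S') -> 45.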

	def secsToTimeString(self, secs):
		h = m = 0
		while secs - 3600 >= 0:
			h += 1
			secs -= 3600
		while secs - 60 >= 0:
			m += 1
			secs -= 60
		return '%d:%d:%d' % (h, m, secs)
	#Run the main function

	def getGTAATermsBasedOnSubjectAndLocation(self, subject, spatial):
		subs = None
		locs = None
		os_res = None
		gtaaExact = []
		gtaaFuzzy = []

		"""First add GTAA terms based on the subject(s)"""
		if subject:
			subs = subject.split(';')
			for s in subs:
				 os_res = self.openSKOSHandler.autoCompleteTable(s)
				 if os_res:
					 if len(os_res) == 1:
						gtaaExact.append('%s,%s' % (os_res[0]['label'], os_res[0]['value']))
					 elif len(os_res) > 1:
						for r in os_res:
							gtaaFuzzy.append('%s,%s' % (r['label'], r['value']))

		"""Append the GTAA terms based on the location(s)"""
		if spatial:
			locs = spatial.split(';')
			for l in locs:
				 os_res = self.openSKOSHandler.autoCompleteTable(l, 'http://data.beeldengeluid.nl/gtaa/GeografischeNamen')
				 if os_res:
					 if len(os_res) == 1:
						gtaaExact.append('%s,%s' % (os_res[0]['label'], os_res[0]['value']))
					 elif len(os_res) > 1:
						for r in os_res:
							gtaaFuzzy.append('%s,%s' % (r['label'], r['value']))

		return (gtaaExact, gtaaFuzzy)

	def getImmixMetadataBasedOnDrager(self, drager):
		global tot
		query = {"query":{"bool":{"must":[{"query_string":{"default_field":"positie.dragernummer","query":"\"%s\"" % drager}}],"must_not":[],"should":[]}}}
		#print query
		resp = es_local.search(index="search_expressie", doc_type="searchable_expressie", body=query, timeout="10s")
		#print resp
		if resp and resp['hits']['total'] == 1:
			for hit in resp['hits']['hits']:
				return hit
		elif resp and resp['hits']['total'] > 1:
			print 'more than one hit...'
			print resp
		return None

	def getTaakIDBasedOnSource(self, source):
		dragernrs = str(source).split('; ')
		drager = None

		"""Get the drager from the source (sometimes there are two, but most of the times they are the same)"""
		if len(dragernrs) == 2:
			if dragernrs[0] != dragernrs[1]:
				print dragernrs
				print '>>>>>>>>>> There are two dragers...'
			else:
				drager = dragernrs[0]
		else:
			drager = dragernrs[0]

		"""Try to find the taakID related to the drager"""
		if drager:
			md = self.getImmixMetadataBasedOnDrager(drager)
			if md:
				taakID = md['_source']['expressie']['niveau']['taakID']
				if taakID:
					print 'Found a taakID: %s\t%s' % (drager, taakID)
					return taakID
		return None
Example #18
    def run(self):
        # Check that ElasticSearch is alive
        self.check_index()

        # If the user specified the --REBUILD flag, recreate the index
        if self.options['rebuild']:
            self.rebuild_index()

        # Connect to the repository
        registry = MetadataRegistry()
        registry.registerReader(self.settings["metadata_format"],
                                self.settings["metadata_reader"])

        client = Client(self.settings["uri"], registry)
        identity = client.identify()

        print "Connected to repository: %s" % identity.repositoryName()

        # got to update granularity or we barf with:
        # oaipmh.error.BadArgumentError: Max granularity is YYYY-MM-DD:2003-04-10T00:00:00Z
        client.updateGranularity()

        # Initialise some variables
        batcher = Batch.Batch()
        total_records = 0
        start = time.time()

        # Now do the synchronisation

        # If the user specified an identifier, then synchronise this record
        if (self.options['identifier'] is not None):
            total_records += self.synchronise_record(
                client, batcher, self.options['identifier'])
        else:
            # Else, synchronise using the date-range provided by the user, or failing that,
            # the date-range based on the last sync

            # Get the synchronisation config record
            synchronisation_config = self.get_synchronisation_config()

            if self.options["from_date"] is not None:
                # If the user specified a from-date argument, use it
                from_date = self.options[
                    "from_date"]  # already a date (not a datetime)
            elif synchronisation_config is not None and "to_date" in synchronisation_config:
                # Else read the last synchronised to_date from the config, and add on a day
                from_date = dateutil.parser.parse(
                    synchronisation_config["to_date"]).date() + timedelta(
                        days=1)
            else:
                # Else use the default_from_date in the config
                from_date = dateutil.parser.parse(
                    self.settings['default_from_date']).date()

            if self.options["to_date"] is not None:
                to_date = self.options[
                    "to_date"]  # already a date (not a datetime)
            else:
                to_date = (date.today() - timedelta(days=1))

            # Force the from_date to use time 00:00:00
            from_date = datetime.combine(
                from_date, _time(hour=0, minute=0, second=0, microsecond=0))

            # Force the to_date to use time 23:59:59
            to_date = datetime.combine(
                to_date, _time(hour=23, minute=59, second=59, microsecond=0))

            print "Synchronising from %s - %s" % (from_date, to_date)

            while from_date < to_date:
                next_date = datetime.combine(
                    from_date.date() +
                    timedelta(days=(self.settings['delta_days'] - 1)),
                    _time(hour=23, minute=59, second=59, microsecond=0))
                number_of_records = self.synchronise_period(
                    client, batcher, from_date, next_date)
                batcher.clear()  #Store the records in elasticsearch
                self.put_synchronisation_config(from_date, next_date,
                                                number_of_records)
                from_date += timedelta(days=(self.settings['delta_days']))
                total_records += number_of_records

                # Pause so as not to get banned.
                to = 20
                print "Sleeping for %i seconds so as not to get banned." % to
                time.sleep(to)

        # Store the records in the index
        batcher.clear()

        # Print out some statistics
        time_spent = time.time() - start
        print 'Total time spent: %d seconds' % (time_spent)

        if time_spent > 0.001:  # careful as it's not an integer
            print 'Total records synchronised: %i records (%d records/second)' % (
                total_records, (total_records / time_spent))
        else:
            print 'Total records synchronised: %i records' % (total_records)
        return total_records

        sys.exit()
Example #19
def transfer_experiment(source):
    """
    Pull public experiments from source into current mytardis.
    """

    #TODO: Cleanup error messages
    #TODO: does not transfer licences as they are not part of the METS format.
    #NOTE: As this is a pull we trust the data from the other tardis
    # Check identity of the feed
    from oaipmh.client import Client
    from oaipmh import error
    from oaipmh.metadata import MetadataRegistry, oai_dc_reader

    from django.core.cache import cache
    from django.utils.hashcompat import md5_constructor as md5

    # The cache key consists of the task name and the MD5 digest
    # of the feed URL.
    cache_key = md5("token").hexdigest()
    lock_id = "%s-lock-%s" % ("consume_experiment", cache_key)
    LOCK_EXPIRE = 60 * 5
    # cache.add fails if the key already exists
    acquire_lock = lambda: cache.add(lock_id, "true", LOCK_EXPIRE)
    # memcache delete is very slow, but we have to use it to take
    # advantage of using add() for atomic locking
    release_lock = lambda: cache.delete(lock_id)

    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    source_url = "%s/apps/oaipmh/?verb=Identify" % source

    client = Client(source_url, registry)
    try:
        identify = client.identify()
    except AttributeError as e:
        msg = "Error reading repos identity: %s:%s" % (source, e)
        logger.error(msg)
        raise ReposReadError(msg)
    except error.ErrorBase as e:
        msg = "OAIPMH error: %s" % e
        logger.error(msg)
        raise OAIPMHError(msg)
    except URLError as e:
        logger.error(e)
        raise
    repos = identify.baseURL()
    import urlparse
    repos_url = urlparse.urlparse(repos)
    dest_name = "%s://%s" % (repos_url.scheme, repos_url.netloc)
    if dest_name != source:
        msg = "Source directory reports incorrect name: %s" % dest_name
        logger.error(msg)
        raise BadAccessError(msg)
    # Get list of public experiments at sources
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(
        source + "/apps/oaipmh/?verb=ListRecords&metadataPrefix=oai_dc",
        registry)
    try:
        exps_metadata = [
            meta for (header, meta,
                      extra) in client.listRecords(metadataPrefix='oai_dc')
        ]
    except AttributeError as e:
        msg = "Error reading experiment %s" % e
        logger.error(msg)
        raise OAIPMHError(msg)
    except error.NoRecordsMatchError as e:
        msg = "no public records found on source %s" % e
        logger.warn(msg)
        return

    local_ids = []
    for exp_metadata in exps_metadata:
        exp_id = exp_metadata.getField('identifier')[0]
        user = exp_metadata.getField('creator')[0]

        found_user = _get_or_create_user(source, user)

        #make sure experiment is publicish
        try:
            xmldata = getURL("%s/apps/reposproducer/expstate/%s/" %
                             (source, exp_id))
        except HTTPError as e:
            msg = "cannot get public state of experiment %s" % exp_id
            logger.error(msg)
            raise BadAccessError(msg)
        try:
            exp_state = json.loads(xmldata)
        except ValueError as e:
            msg = "cannot parse public state of experiment %s" % exp_id
            logger.error(msg)
            raise BadAccessError(msg)
        if exp_state not in [
                Experiment.PUBLIC_ACCESS_FULL,
                Experiment.PUBLIC_ACCESS_METADATA
        ]:
            msg = 'cannot ingest private experiment %s.' % exp_id
            logger.error(msg)
            raise BadAccessError(msg)

        # Get the usernames of isOwner django_user ACLs for the experiment
        try:
            xmldata = getURL("%s/apps/reposproducer/acls/%s/" %
                             (source, exp_id))

        except HTTPError as e:
            msg = "Cannot get acl list of experiment %s" % exp_id
            logger.error(msg)
            raise ReposReadError(msg)
        try:
            acls = json.loads(xmldata)
        except ValueError as e:
            msg = "cannot parse acl list of experiment %s" % exp_id
            logger.error(msg)
            raise BadAccessError(msg)
        owners = []
        for acl in acls:
            if acl['pluginId'] == 'django_user' and acl['isOwner']:
                user = _get_or_create_user(source, acl['entityId'])
                owners.append(user.username)
            else:
                # FIXME: skips all other types of acl for now
                pass

        # Get the METS for the experiment
        metsxml = ""
        try:
            metsxml = getURL("%s/experiment/metsexport/%s/?force_http_urls" %
                             (source, exp_id))
            #metsxml = getURL("%s/experiment/metsexport/%s/"
            #% (source, exp_id))

        except HTTPError as e:
            msg = "cannot get METS for experiment %s" % exp_id
            logger.error(msg)
            raise ReposReadError(msg)

        # load schema and parametername for experiment keys
        try:
            key_schema = Schema.objects.get(namespace=settings.KEY_NAMESPACE)
        except Schema.DoesNotExist as e:
            msg = "No ExperimentKeyService Schema found"
            logger.error(msg)
            raise BadAccessError(msg)

        try:
            key_name = ParameterName.objects.get(name=settings.KEY_NAME)
        except ParameterName.DoesNotExist as e:
            msg = "No ExperimentKeyService ParameterName found"
            logger.error(msg)
            raise BadAccessError(msg)

        try:
            xmldata = getURL("%s/apps/reposproducer/key/%s/" %
                             (source, exp_id))
        except HTTPError as e:
            msg = "cannot get key of experiment %s" % exp_id
            logger.error(msg)
            raise BadAccessError(msg)
        if not xmldata:
            logger.warn(
                "Unable to retrieve experiment %s key.  Will try again later" %
                exp_id)
            return

        try:
            key_value = json.loads(xmldata)
        except ValueError as e:
            msg = "cannot parse key list of experiment %s" % exp_id
            logger.error(msg)
            raise BadAccessError(msg)
        if not key_value:
            logger.warn(
                "Unable to retrieve experiment %s key value.  Will try again later"
                % exp_id)
            return

        logger.debug("retrieved key %s from experiment %s" %
                     (key_value, exp_id))
        exps = Experiment.objects.all()

        got_lock = True
        if not acquire_lock():
            logger.warning("another worker has access to consume experiment")
            return

        duplicate_exp = 0
        for exp in exps:
            #logger.warn("exp = %s" % exp.id)
            params = ExperimentParameter.objects.filter(
                name=key_name,
                parameterset__schema=key_schema,
                parameterset__experiment=exp)
            #logger.warn("params.count() = %s" % params.count())
            if params.count() >= 1:
                key = params[0].string_value
                if key == key_value:
                    duplicate_exp = exp.id
                    #logger.warn("found duplicate for %s" % duplicate_exp)
                    break

        if duplicate_exp:
            logger.warn(
                "Found duplicate experiment form %s exp %s to  exp %s" %
                (source, exp_id, duplicate_exp))
            if got_lock:
                release_lock()
            return

        # TODO: Need some way of updating an existing experiment.  Problem is
        # that the copy will have a different id from the original, so we need a
        # unique identifier to allow matching

        # We have now pulled everything we need from the producer and are ready
        # to create the experiment.

        # Make placeholder experiment and ready metadata
        e = Experiment(
            title='Placeholder Title',
            approved=True,
            created_by=found_user,
            public_access=exp_state,
            locked=False  # so experiment can then be altered.
        )
        e.save()

        # store the key
        #eps, was_created = ExperimentParameterSet.objects.\
        #    get_or_create(experiment=e, schema=key_schema)
        #if was_created:
        #    logger.warn("was created")
        #ep, was_created = ExperimentParameter.objects.get_or_create(parameterset=eps,
        #    name=key_name,
        #    string_value=key_value)
        #if was_created:
        #    logger.warn("was created again")
        #ep.save()

        if got_lock:
            release_lock()

        local_id = e.id
        filename = path.join(e.get_or_create_directory(), 'mets_upload.xml')
        f = open(filename, 'wb+')
        f.write(metsxml)
        f.close()

        # Ingest this experiment META data and isOwner ACLS
        eid = None
        try:
            eid, sync_path = _registerExperimentDocument(filename=filename,
                                                         created_by=found_user,
                                                         expid=local_id,
                                                         owners=owners)
            logger.info('=== processing experiment %s: DONE' % local_id)
        except:
            # FIXME: what errors can mets return?
            msg = '=== processing experiment %s: FAILED!' \
                % local_id
            logger.error(msg)
            raise MetsParseError(msg)

        # FIXME: if METS parse fails then we should go back and delete the placeholder experiment

        exp = Experiment.objects.get(id=eid)

        # so that tardis does not copy the data
        for datafile in exp.get_datafiles():
            datafile.stay_remote = True
            datafile.save()

        #import nose.tools
        #nose.tools.set_trace()
        # FIXME: reverse lookup of URLs seem quite slow.
        # TODO: put this information into specific metadata schema attached to experiment
        exp.description += get_audit_message(source, exp_id)
        exp.save()

        local_ids.append(local_id)
    return local_ids
Example #20
def identifiy(target):
    if target is not None:
        client = Client(target['url'], registry)
        identify = client.identify()
        return convert_identifiy(identify)
Example #21
marcxml_reader = MARCXMLReader()

# Define the metadata readers in the registry

from oaipmh import metadata

registry = metadata.MetadataRegistry()
registry.registerReader('oai_dc', metadata.oai_dc_reader)
registry.registerReader('marc21', marcxml_reader)


#### OAI-PMH Client processing 

oai = Client('http://snape.mzk.cz/OAI-script', registry)

id = oai.identify()
print id.repositoryName()
print id.adminEmails()
print id.baseURL()

formats = oai.listMetadataFormats()
pprint(formats)

# 'marc21'

sets = oai.listSets()
for s in sets:
	print s

# 'MZK03'
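A short hedged continuation, assuming the client and registry above: iterate MARC21 records, restricted to the 'MZK03' set named in the comment (this loop is a sketch, not part of the original):

for header, metadata, about in oai.listRecords(metadataPrefix='marc21', set='MZK03'):
	print header.identifier()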
Example #22
def transfer_experiment(source):
    """
    Pull public experiments from source into current mytardis.
    """

    #TODO: Cleanup error messages
    #TODO: does not transfer licences as they are not part of the METS format.
    #NOTE: As this is a pull we trust the data from the other tardis
    # Check identity of the feed
    from oaipmh.client import Client
    from oaipmh import error
    from oaipmh.metadata import MetadataRegistry, oai_dc_reader

    from django.core.cache import cache
    from django.utils.hashcompat import md5_constructor as md5

    # The cache key consists of the task name and the MD5 digest
    # of the feed URL.
    cache_key = md5("token").hexdigest()
    lock_id = "%s-lock-%s" % ("consume_experiment", cache_key)
    LOCK_EXPIRE = 60 * 5
    # cache.add fails if the key already exists
    acquire_lock = lambda: cache.add(lock_id, "true", LOCK_EXPIRE)
    # memcache delete is very slow, but we have to use it to take
    # advantage of using add() for atomic locking
    release_lock = lambda: cache.delete(lock_id)

    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    source_url = "%s/apps/oaipmh/?verb=Identify" % source

    client = Client(source_url, registry)
    try:
        identify = client.identify()
    except AttributeError as e:
        msg = "Error reading repos identity: %s:%s" % (source, e)
        logger.error(msg)
        raise ReposReadError(msg)
    except error.ErrorBase as e:
        msg = "OAIPMH error: %s" % e
        logger.error(msg)
        raise OAIPMHError(msg)
    except URLError as e:
        logger.error(e)
        raise
    repos = identify.baseURL()
    import urlparse
    repos_url = urlparse.urlparse(repos)
    dest_name = "%s://%s" % (repos_url.scheme, repos_url.netloc)
    if dest_name != source:
        msg = "Source directory reports incorrect name: %s" % dest_name
        logger.error(msg)
        raise BadAccessError(msg)
    # Get the list of public experiments at the source
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(source
        + "/apps/oaipmh/?verb=ListRecords&metadataPrefix=oai_dc", registry)
    try:
        exps_metadata = [meta
            for (header, meta, extra)
            in client.listRecords(metadataPrefix='oai_dc')]
    except AttributeError as e:
        msg = "Error reading experiment %s" % e
        logger.error(msg)
        raise OAIPMHError(msg)
    except error.NoRecordsMatchError as e:
        msg = "no public records found on source %s" % e
        logger.warn(msg)
        return

    local_ids = []
    for exp_metadata in exps_metadata:
        exp_id = exp_metadata.getField('identifier')[0]
        user = exp_metadata.getField('creator')[0]

        found_user = _get_or_create_user(source, user)

        # make sure the experiment is public (full or metadata-only access)
        try:
            xmldata = getURL("%s/apps/reposproducer/expstate/%s/"
                             % (source, exp_id))
        except HTTPError as e:
            msg = "cannot get public state of experiment %s" % exp_id
            logger.error(msg)
            raise BadAccessError(msg)
        try:
            exp_state = json.loads(xmldata)
        except ValueError as e:
            msg = "cannot parse public state of experiment %s" % exp_id
            logger.error(msg)
            raise BadAccessError(msg)
        if exp_state not in [Experiment.PUBLIC_ACCESS_FULL,
                             Experiment.PUBLIC_ACCESS_METADATA]:
            msg = 'cannot ingest private experiment %s' % exp_id
            logger.error(msg)
            raise BadAccessError(msg)

        # Get the usernames of isOwner django_user ACLs for the experiment
        try:
            xmldata = getURL("%s/apps/reposproducer/acls/%s/"
                             % (source, exp_id))
        except HTTPError as e:
            msg = "Cannot get acl list of experiment %s" % exp_id
            logger.error(msg)
            raise ReposReadError(msg)
        try:
            acls = json.loads(xmldata)
        except ValueError as e:
            msg = "cannot parse acl list of experiment %s" % exp_id
            logger.error(msg)
            raise BadAccessError(msg)
        owners = []
        for acl in acls:
            if acl['pluginId'] == 'django_user' and acl['isOwner']:
                user = _get_or_create_user(source, acl['entityId'])
                owners.append(user.username)
            else:
                # FIXME: skips all other types of acl for now
                pass

        # Get the METS for the experiment
        metsxml = ""
        try:
            metsxml = getURL("%s/experiment/metsexport/%s/?force_http_urls"
                             % (source, exp_id))
            #metsxml = getURL("%s/experiment/metsexport/%s/"
            #                 % (source, exp_id))

        except HTTPError as e:
            msg = "cannot get METS for experiment %s" % exp_id
            logger.error(msg)
            raise ReposReadError(msg)

        # load schema and parametername for experiment keys
        try:
            key_schema = Schema.objects.get(namespace=settings.KEY_NAMESPACE)
        except Schema.DoesNotExist as e:
            msg = "No ExperimentKeyService Schema found"
            logger.error(msg)
            raise BadAccessError(msg)

        try:
            key_name = ParameterName.objects.get(name=settings.KEY_NAME)
        except ParameterName.DoesNotExist as e:
            msg = "No ExperimentKeyService ParameterName found"
            logger.error(msg)
            raise BadAccessError(msg)

        try:
            xmldata = getURL("%s/apps/reposproducer/key/%s/"
                             % (source, exp_id))
        except HTTPError as e:
            msg = "cannot get key of experiment %s" % exp_id
            logger.error(msg)
            raise BadAccessError(msg)
        if not xmldata:
            logger.warn("Unable to retrieve experiment %s key.  Will try again later" % exp_id)
            return

        try:
            key_value = json.loads(xmldata)
        except ValueError as e:
            msg = "cannot parse key list of experiment %s" % exp_id
            logger.error(msg)
            raise BadAccessError(msg)
        if not key_value:
            logger.warn("Unable to retrieve experiment %s key value.  Will try again later" % exp_id)
            return

        logger.debug("retrieved key %s from experiment %s" % (key_value, exp_id))
        exps = Experiment.objects.all()

        got_lock = True
        if not acquire_lock():
            logger.warning("another worker has access to consume experiment")
            return

        duplicate_exp = 0
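        # Scan existing local experiments for a stored key parameter matching
        # the key fetched from the producer; a match means this experiment
        # was already ingested.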
        for exp in exps:
            #logger.warn("exp = %s" % exp.id)
            params = ExperimentParameter.objects.filter(name=key_name,
                                    parameterset__schema=key_schema,
                                    parameterset__experiment=exp)
            #logger.warn("params.count() = %s" % params.count())
            if params.count() >= 1:
                key = params[0].string_value
                if key == key_value:
                    duplicate_exp = exp.id
                    #logger.warn("found duplicate for %s" % duplicate_exp)
                    break

        if duplicate_exp:
            logger.warn("Found duplicate experiment form %s exp %s to  exp %s"
                % (source, exp_id, duplicate_exp))
            if got_lock:
                release_lock()
            return

        # TODO: Need some way of updating an existing experiment.  Problem is
        # that the copy will have a different id from the original, so we need
        # a unique identifier to allow matching

        # We have now pulled everything we need from the producer and are
        # ready to create the experiment.

        # Make placeholder experiment and ready metadata
        e = Experiment(
            title='Placeholder Title',
            approved=True,
            created_by=found_user,
            public_access=exp_state,
            locked=False  # so experiment can then be altered.
            )
        e.save()

        # store the key
        #eps, was_created = ExperimentParameterSet.objects.\
        #    get_or_create(experiment=e, schema=key_schema)
        #if was_created:
        #    logger.warn("was created")
        #ep, was_created = ExperimentParameter.objects.get_or_create(parameterset=eps,
        #    name=key_name,
        #    string_value=key_value)
        #if was_created:
        #    logger.warn("was created again")
        #ep.save()

        if got_lock:
            release_lock()

        local_id = e.id
        filename = path.join(e.get_or_create_directory(),
                             'mets_upload.xml')
        f = open(filename, 'wb+')
        f.write(metsxml)
        f.close()

        # Ingest this experiment's metadata and isOwner ACLs
        eid = None
        try:
            eid, sync_path = _registerExperimentDocument(filename=filename,
                                               created_by=found_user,
                                               expid=local_id,
                                               owners=owners)
            logger.info('=== processing experiment %s: DONE' % local_id)
        except Exception:
            # FIXME: what errors can mets return?
            msg = '=== processing experiment %s: FAILED!' \
                % local_id
            logger.error(msg)
            raise MetsParseError(msg)

        # FIXME: if METS parse fails then we should go back and delete the placeholder experiment

        exp = Experiment.objects.get(id=eid)

        # so that tardis does not copy the data
        for datafile in exp.get_datafiles():
            datafile.stay_remote = True
            datafile.save()

        #import nose.tools
        #nose.tools.set_trace()
        # FIXME: reverse lookup of URLs seems quite slow.
        # TODO: put this information into specific metadata schema attached to experiment
        exp.description += get_audit_message(source, exp_id)
        exp.save()

        local_ids.append(local_id)
    return local_ids
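For completeness, a hypothetical invocation of transfer_experiment; the source URL below is illustrative only:

# Hypothetical usage; the URL is illustrative, not a real producer.
local_ids = transfer_experiment('http://remote-mytardis.example.org')
if local_ids:
    logger.info('pulled experiments with local ids: %s' % local_ids)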