Example #1
def import_resources(import_folder):
    """
    Imports resources from the given folder.
    """
    # Check that SOLR is running, or else all resources will stay at status INTERNAL:
    from metashare.repository import verify_at_startup
    verify_at_startup()  # may raise Exception, which we don't want to catch.

    # Disable verbose debug output for the import process...
    settings.DEBUG = False
    os.environ['DISABLE_INDEXING_DURING_IMPORT'] = 'True'

    from metashare.repository.supermodel import OBJECT_XML_CACHE

    # Clean cache before starting the import process.
    OBJECT_XML_CACHE.clear()

    # iterate over storage folder content
    from django.core import serializers
    from metashare.storage.models import MASTER, ALLOWED_ARCHIVE_EXTENSIONS
    from metashare.repository.models import resourceInfoType_model

    imported_resources = []
    erroneous_descriptors = []

    storage_path = os.path.join(import_folder, STORAGE_FOLDER)
    for folder_name in os.listdir(storage_path):
        folder_path = "{}/{}/".format(storage_path, folder_name)
        if os.path.isdir(folder_path):
            try:
                print "importing from folder: '{0}'".format(folder_name)
                # import storage object
                so_filename = os.path.join(folder_path, STORAGE)
                so_in = open(so_filename, "rb")
                for obj in serializers.deserialize("xml", so_in):
                    print "importing storage object"
                    # storage.xml only contains a single storage object
                    storage_obj = obj.object
                    # this storage object is NOT saved!
                    # we only copy the relevant attributes from this storage
                    # object to the one at the resource!
                so_in.close()
                # import resource object
                ro_filename = os.path.join(folder_path, RESOURCE)
                ro_in = open(ro_filename, "rb")
                for obj in serializers.deserialize("xml", ro_in):
                    print "importing resource object"
                    # resource.xml only contains a single resource object
                    res_obj = obj
                    # the deserialized object contains the ManyToMany attributes
                    # in m2m_data
                ro_in.close()
                # import resource from metadata.xml
                res_filename = os.path.join(folder_path, METADATA)
                # read metadata.xml via a context manager so the file handle
                # is closed even if the import below raises:
                with open(res_filename, 'rb') as temp_file:
                    xml_string = temp_file.read()
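                # import_from_string() returns a tuple: element 0 is the newly
                # created resource (falsy on failure) and, when present,
                # element 2 carries an error message (see the checks below).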
                result = resourceInfoType_model.import_from_string(
                    xml_string, copy_status=MASTER)
                if not result[0]:
                    msg = u''
                    if len(result) > 2:
                        msg = u'{}'.format(result[2])
                    raise Exception(msg)
                res = result[0]
                # update imported resource with imported resource object
                # and storage object
                _update_resource(res, res_obj, storage_obj)
                # copy possible binaries archives
                for archive_name in [
                        ARCHIVE_TPL.format(_ext)
                        for _ext in ALLOWED_ARCHIVE_EXTENSIONS
                ]:
                    archive_filename = os.path.join(folder_path, archive_name)
                    if os.path.isfile(archive_filename):
                        print "copying archive"
                        res_storage_path = '{0}/{1}/'.format(
                            settings.STORAGE_PATH,
                            res.storage_object.identifier)
                        shutil.copy(
                            archive_filename,
                            os.path.join(res_storage_path, archive_name))
                        # there can be at most one binary
                        break
                imported_resources.append(res)
            except Exception as problem:
                from django import db
                if isinstance(problem, db.utils.DatabaseError):
                    # reset database connection (required for PostgreSQL)
                    db.close_connection()
                erroneous_descriptors.append((folder_name, problem))

    print "Done.  Successfully imported {0} resources into the database, " \
      "errors occurred in {1} cases.".format(
      len(imported_resources), len(erroneous_descriptors))
    if len(erroneous_descriptors) > 0:
        print "The following resources could not be imported:"
        for descriptor, exception in erroneous_descriptors:
            print "\t{}: {}".format(descriptor, exception)

    # Be nice and cleanup cache...
    _cache_size = sum([len(x) for x in OBJECT_XML_CACHE.values()])
    OBJECT_XML_CACHE.clear()
    print "Cleared OBJECT_XML_CACHE ({} bytes)".format(_cache_size)

    from django.core.management import call_command
    call_command('rebuild_index', interactive=False)
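A minimal driver for this function might look as follows. This is only a sketch: the settings module name and the dump path are assumptions, import_resources must already be importable, and the folder passed in has to contain the STORAGE_FOLDER subfolder the function iterates over.

import os

# Sketch only -- 'metashare.settings' is the assumed settings module name.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'metashare.settings')

import_folder = '/path/to/exported/dump'  # hypothetical dump location
import_resources(import_folder)
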
Example #3
                    print 'missing local json, skipping "{}"'.format(folder_name)
                    continue
                # get copy status from storage-local.json
                _copy_status = 'm'
                with open(_storage_local_path, 'rb') as _in:
                    json_string = _in.read()
                    _dict = loads(json_string)
                    if _dict['copy_status']:
                        _copy_status = _dict['copy_status']
                resource = restore_from_folder(folder_name, copy_status=_copy_status)
                successful_restored += [resource]
            # pylint: disable-msg=W0703
            except Exception as problem:
                erroneous_restored += [(folder_name, problem)]

    print "Done.  Successfully restored {0} files into the database, errors " \
      "occurred in {1} cases.".format(len(successful_restored), len(erroneous_restored))
    if len(erroneous_restored) > 0:
        print "The following resources could not be restored:"
        for descriptor, exception in erroneous_restored:
            print "{}: {}".format(descriptor, exception)
    
    # Be nice and cleanup cache...
    _cache_size = sum([len(x) for x in OBJECT_XML_CACHE.values()])
    OBJECT_XML_CACHE.clear()
    print "Cleared OBJECT_XML_CACHE ({} bytes)".format(_cache_size)
    
    from django.core.management import call_command
    call_command('rebuild_index', interactive=False)
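Note that _dict['copy_status'] raises a KeyError when the key is missing from storage-local.json; the surrounding try/except then records the folder as a failed restore. A more defensive variant could read the value with a fallback. The helper below is a sketch, not part of the project:

from json import loads

def _read_copy_status(path, default='m'):
    # Hypothetical helper: tolerate a missing or falsy 'copy_status' key
    # instead of letting a KeyError abort the restore of this folder.
    with open(path, 'rb') as _in:
        _dict = loads(_in.read())
    return _dict.get('copy_status') or default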

Example #4
        erroneous_imports += failure
        temp_file.close()
    
    print "Done.  Successfully imported {0} files into the database, errors " \
      "occurred in {1} cases.".format(len(successful_imports), len(erroneous_imports))
    if len(erroneous_imports) > 0:
        print "The following files could not be imported:"
        for descriptor, exception in erroneous_imports:
            if isinstance(exception.args, basestring):
                print "\t{}: {}".format(descriptor, ' '.join(exception.args))
            else:
                print "\t{}: {}".format(descriptor, exception.args)
    
    # Salvatore:
    # This is useful for tracking where the resource is stored.
    # It is used by some scripts for testing purposes
    if id_filename is not None:
        with open(id_filename, 'w') as id_file:
            for resource in successful_imports:
                id_file.write('--->RESOURCE_ID:{0};STORAGE_IDENTIFIER:{1}\n'
                              .format(resource.id, resource.storage_object.identifier))

    # Be nice and cleanup cache...
    _cache_size = sum([len(x) for x in OBJECT_XML_CACHE.values()])
    OBJECT_XML_CACHE.clear()
    print "Cleared OBJECT_XML_CACHE ({} bytes)".format(_cache_size)
    
    from haystack.management.commands import rebuild_index
    rebuild_index.Command().handle(interactive=False)
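
Unlike Examples #1 and #3, this example instantiates the Haystack command class directly rather than going through Django's management framework. Both spellings trigger the same index rebuild; the call_command form used in the other examples is the more common one:

from django.core.management import call_command
call_command('rebuild_index', interactive=False)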