Example #1
            l = 0
            for host in lines:
                if (options.host == host.split()[0]):
                    options.auth = host.split()[1]
                    break
        else:
            logger.critical(
                "\033[1m [CRITICAL] " +
                "For CKAN database delete mode valid URL of CKAN instance (option --host) and API key (--auth or read from ~/.netrc) must be given"
                + "\033[0;0m")
            sys.exit(-1)

        CKAN = CKAN_CLIENT(options.host, options.auth)
        ## UP = UPLOADER(CKAN, OUT, options.outdir,options.fromdate)
        UP = Uploader(CKAN, options.ckan_check, HandleClient, cred, OUT,
                      options.outdir, options.fromdate, options.host)
    if (options.identifier):
        list = [options.identifier]
        listtext = 'given by option -i (%d id\'s)' % len(list)
    elif (options.list):
        f = open(options.list, 'r')
        list = f.readlines()
        f.close()
        listtext = 'got from file %s (%d id\'s)' % (options.list, len(list))
    elif (options.community):
        ##UP.purge_group(options.community)
        UP.get_packages(options.community)
        ##HEW??? UP.get_group_list(options.community)
        print "--- Start get community list from CKAN---\n"
        list = UP.package_list.keys()
        ##clist = UP.get_packages(options.community).keys()
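Example #1 reads a whitespace-separated "host api-key" list (e.g. built from ~/.netrc) and picks the key for the requested CKAN host. A minimal standalone sketch of that lookup, with a hypothetical helper name and made-up sample data (not part of the original code):

# hypothetical helper, assuming "<host> <api-key>" per line as in the snippet above
def lookup_api_key(lines, host):
    for line in lines:
        fields = line.split()
        if len(fields) >= 2 and fields[0] == host:
            return fields[1]  # API key of the matching host
    return None  # no entry for this host

# usage (made-up values):
sample = ["b2find.example.org 12345-abcde", "other.host 99999-zzzzz"]
print(lookup_api_key(sample, "b2find.example.org"))  # -> 12345-abcde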
Example #2
            l = 0
            for host in lines:
                if (options.host == host.split()[0]):
                    options.auth = host.split()[1]
                    break
        else:
            logger.critical(
                "\033[1m [CRITICAL] " +
                "For CKAN database delete mode a valid URL of the CKAN instance (option --host) and an API key (--auth, or read from ~/.netrc) must be given" + "\033[0;0m"
            )
            sys.exit(-1)

        CKAN = CKAN_CLIENT(options.host,options.auth)
        ## UP = UPLOADER(CKAN, OUT, options.outdir,options.fromdate)
        UP = Uploader(CKAN,options.ckan_check,HandleClient,cred,OUT,options.outdir,options.fromdate,options.host)
    if (options.identifier):
        list = [options.identifier]
        listtext = 'given by option -i (%d id\'s)' % len(list)
    elif (options.list):
        f = open(options.list, 'r')
        list = f.readlines()
        f.close()
        listtext = 'got from file %s (%d id\'s)' % (options.list, len(list))
    elif (options.community):
        ##UP.purge_group(options.community)
        UP.get_packages(options.community)
        ##HEW??? UP.get_group_list(options.community)
        print("--- Start getting community list from CKAN ---\n")
        list = UP.package_list.keys()
        ##clist = UP.get_packages(options.community).keys()
def process_delete(OUT, dir, options):
    print("###JM# Don't use this function. It is not up to date.")
    return False
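    # NOTE: everything below this early return is unreachable; it is kept only for reference.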

    # create CKAN object
    CKAN = CKAN_CLIENT(options.iphost, options.auth)
    UP = Uploader(CKAN, OUT, options.outdir)

    ##HEW-D-ec credentials,ec = None,None

    # create credentials
    try:
        cred = b2handle.clientcredentials.PIDClientCredentials.load_from_JSON(
            'credentials_11098')
    except Exception as err:
        logging.critical(
            "[CRITICAL] %s Could not create credentials from credstore %s" %
            (err, options.handle_check))
        p.print_help()
        sys.exit(-1)
    else:
        logging.debug(
            "Create handle client instance to add uuid to handle server")
        # instantiate the handle client used below (assumes EUDATHandleClient is imported)
        client = EUDATHandleClient.instantiate_with_credentials(cred)

    for delete_file in glob.glob(dir + '/*.del'):
        community, mdprefix = os.path.splitext(
            os.path.basename(delete_file))[0].split('-')

        logging.info('\n## Deleting datasets from community "%s" ##' %
                     (community))

        # get packages from the group in CKAN:
        UP.get_packages(community)

        # open the delete file and loop over its lines:
        file_content = ''
        try:
            f = open(delete_file, 'r')
            file_content = f.read()
            f.close()
        except IOError:
            logging.critical("Cannot read data from '{0}'".format(delete_file))
            f.close()
        else:
            # rename the file in a crash backup file:
            os.rename(delete_file, delete_file + '.crash-backup')

        results = {'count': 0, 'ecount': 0, 'tcount': 0, 'time': 0}
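        # (count = successful deletions, ecount = failed deletions,
        #  tcount = records processed, time = elapsed seconds)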

        # use a try-except block to guarantee that no deleted metadata information is lost:
        try:
            logging.info('    |   | %-4s | %-50s | %-50s |\n    |%s|' %
                         ('#', 'oai identifier', 'CKAN identifier', "-" * 116))

            deletestart = time.time()

            for line in file_content.split('\n'):
                # ignore empty lines:
                if not line:
                    continue

                results['tcount'] += 1
                subset, identifier = line.split('\t')

                # dataset name uniquely generated from oai identifier
                uid = uuid.uuid5(uuid.NAMESPACE_DNS,
                                 identifier.encode('ascii', 'replace'))
                ds = str(uid)
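                # uuid5 is deterministic: the same OAI identifier always yields the
                # same dataset name (e.g. a hypothetical 'oai:repo.example.org:42'
                # maps to one fixed UUID string on every run)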

                # output:
                logging.info('    | d | %-4d | %-50s | %-50s |' %
                             (results['tcount'], identifier, ds))

                ### CHECK STATUS OF DATASET IN CKAN AND PID:
                # status of data set
                dsstatus = "unknown"

                # check against handle server
                handlestatus = "unknown"
                ##HEW-D-ec???  pid = credentials.prefix + "/eudat-jmd_" + ds
                pid = "11098/eudat-jmd_" + ds_id
                pidRecord["CHECKSUM"] = client.get_value_from_handle(
                    pid, "CHECKSUM")

                if (pidRecord["CHECKSUM"] is None):
                    logging.debug(
                        "        |-> Can not access pid %s to get checksum" %
                        (pid))
                    handlestatus = "new"
                else:
                    logging.debug("        |-> pid %s exists" % (pid))
                    handlestatus = "exist"

                # check against CKAN database
                ckanstatus = 'unknown'
                ckanstatus = UP.check_dataset(ds, None)

                delete = 0
                # depending on handle status delete record from B2FIND
                if (handlestatus == "new"
                        and ckanstatus == "new"):  # no action required
                    logging.info('        |-> %s' % ('No deletion required'))
                else:
                    delete = UP.delete(ds, ckanstatus)
                    if (delete == 1):
                        logging.info('        |-> %s' %
                                     ('Deletion was successful'))
                        results['count'] += 1

                        # delete handle (to keep the symmetry between handle and B2FIND server)
                        if (handlestatus == "exist"):
                            logging.info(
                                "        |-> Delete handle %s with checksum %s"
                                % (pid, pidRecord["CHECKSUM"]))
                            try:
                                client.delete_handle(pid)
                            except GenericHandleError as err:
                                logging.error('[ERROR] Unexpected Error: %s' %
                                              err)
                            except Exception:
                                logging.error('[ERROR] Unexpected Error:')

                        else:
                            logging.info(
                                "        |-> No action (deletion) required for handle %s"
                                % pid)
                    else:
                        logging.info('        |-> %s' % ('Deletion failed'))
                        results['ecount'] += 1
        except Exception:
            logging.error('[ERROR] Unexpected Error')
            logging.error('You find the ids of the deleted metadata in "%s"' %
                          (delete_file + '.crash-backup'))
            raise
        else:
            # all worked fine you can remove the crash-backup file:
            os.remove(delete_file + '.crash-backup')

        deletetime = time.time() - deletestart
        results['time'] = deletetime

        # save stats:
        OUT.save_stats(community + '-' + mdprefix, subset, 'd', results)
def process(options, pstat, OUT):
    ## process (options,pstat,OUT) - function
    # Starts processing as specified in pstat['tbd'] and
    #  according to the request list given by the options
    #
    # Parameters:
    # -----------
    # 1. options (parsed command line options)
    # 2. pstat   (process status dict)
    # 3. OUT     (output object, e.g. for saving statistics)
    #

    # set list of request lists for single or multi mode:
    mode = None
    procOptions = [
        'community', 'source', 'verb', 'mdprefix', 'mdsubset',
        'target_mdschema'
    ]
    if (options.source):
        mode = 'single'
        mandParams = ['community', 'verb',
                      'mdprefix']  # mandatory processing params
        for param in mandParams:
            if not getattr(options, param):
                logger.critical(
                    "Processing parameter %s is required in single mode" %
                    param)
                sys.exit(-1)
        reqlist = [[
            options.community, options.source, options.verb, options.mdprefix,
            options.mdsubset, options.ckan_check, options.handle_check,
            options.target_mdschema
        ]]
    elif (options.list):
        mode = 'multi'
        logger.debug(' |- Joblist:  \t%s' % options.list)
        reqlist = parse_list_file(options)

    logger.debug(' |- Requestlist:  \t%s' % reqlist)

    ## check job request (processing) options
    logger.debug('|- Command line options')
    for opt in procOptions:
        if hasattr(options, opt):
            logger.debug(' |- %s:\t%s' % (opt.upper(), getattr(options, opt)))

    ## HARVESTING mode:
    if (pstat['status']['h'] == 'tbd'):
        logger.info('\n|- Harvesting started : %s' %
                    time.strftime("%Y-%m-%d %H:%M:%S"))
        HV = Harvester(OUT, pstat, options.outdir, options.fromdate)
        process_harvest(HV, reqlist)

    ## MAPPING - Mode:
    if (pstat['status']['m'] == 'tbd'):
        logger.info('\n|- Mapping started : %s' %
                    time.strftime("%Y-%m-%d %H:%M:%S"))
        MP = Mapper(OUT, options.outdir, options.fromdate)
        process_map(MP, reqlist)

    ## VALIDATING - Mode:
    if (pstat['status']['v'] == 'tbd'):
        logger.info(' |- Validating started : %s' %
                    time.strftime("%Y-%m-%d %H:%M:%S"))
        MP = Mapper(OUT, options.outdir, options.fromdate)
        process_validate(MP, reqlist)

    ## OAI-CONVERTING - Mode:
    if (pstat['status']['o'] == 'tbd'):
        logger.info('\n|- OAI-Converting started : %s' %
                    time.strftime("%Y-%m-%d %H:%M:%S"))
        MP = Mapper(OUT, options.outdir, options.fromdate)
        process_oaiconvert(MP, reqlist)

    ## UPLOADING - Mode:
    if (pstat['status']['u'] == 'tbd'):
        logger.info('\n|- Uploading started : %s' %
                    time.strftime("%Y-%m-%d %H:%M:%S"))
        # create CKAN object
        CKAN = CKAN_CLIENT(options.iphost, options.auth)
        # create credentials and handle client if required
        if (options.handle_check):
            try:
                cred = PIDClientCredentials.load_from_JSON('credentials_11098')
            except Exception as err:
                logger.critical(
                    "%s : Could not create credentials from credstore %s" %
                    (err, options.handle_check))
                ##p.print_help()
                sys.exit(-1)
            else:
                logger.debug("Create EUDATHandleClient instance")
                HandleClient = EUDATHandleClient.instantiate_with_credentials(
                    cred)
        else:
            cred = None
            HandleClient = None

        UP = Uploader(CKAN, options.ckan_check, HandleClient, cred, OUT,
                      options.outdir, options.fromdate, options.iphost)
        logger.info(' |- Host:  \t%s' % CKAN.ip_host)
        process_upload(UP, reqlist)

    ## DELETING - Mode:
    if (pstat['status']['d'] == 'tbd'):
        # start the process deleting:
        logger.info('\n|- Deleting started : %s' %
                    time.strftime("%Y-%m-%d %H:%M:%S"))

        if mode == 'multi':
            dir = options.outdir + '/delete'
            if os.path.exists(dir):
                process_delete(OUT, dir, options)
            else:
                logger.error(
                    '[ERROR] The directory "%s" does not exist! No files to delete were found!'
                    % (dir))
        else:
            logger.critical(
                "[CRITICAL] Deleting mode is only supported in 'multi' mode with an explicit delete list given!"
            )
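The process() function above dispatches on pstat['status'] flags ('tbd' marks a stage that is still to be done) and on whether a single source (-s) or a job list (-l) was given. A minimal sketch of the pstat shape these checks imply, inferred from the code above (values other than 'tbd' are placeholders):

# illustrative only: keys mirror the pstat['status'][...] checks in process()
# ('h' harvest, 'm' map, 'v' validate, 'o' oai-convert, 'u' upload, 'd' delete)
pstat = {
    'status': {
        'h': 'tbd',    # harvesting will run
        'm': 'tbd',    # mapping will run
        'v': 'skip',   # any value other than 'tbd' skips the stage
        'o': 'skip',
        'u': 'skip',
        'd': 'skip',
    }
}
# process(options, pstat, OUT) would then execute only the harvesting and mapping stages.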
Example #5
def process_delete(OUT, dir, options):
    print ("###JM# Don't use this function. It is not up to date.")
    return False

    # create CKAN object                       
    CKAN = CKAN_CLIENT(options.iphost,options.auth)
    UP = Uploader(CKAN,OUT,options.outdir)
    
    ##HEW-D-ec credentials,ec = None,None

    # create credentials
    try:
        cred = b2handle.clientcredentials.PIDClientCredentials.load_from_JSON('credentials_11098')
    except Exception as err:
        logging.critical("[CRITICAL] %s Could not create credentials from credstore %s" % (err, options.handle_check))
        p.print_help()
        sys.exit(-1)
    else:
        logging.debug("Create handle client instance to add uuid to handle server")
        # instantiate the handle client used below (assumes EUDATHandleClient is imported)
        client = EUDATHandleClient.instantiate_with_credentials(cred)

    for delete_file in glob.glob(dir+'/*.del'):
        community, mdprefix = os.path.splitext(os.path.basename(delete_file))[0].split('-')
        
        logging.info('\n## Deleting datasets from community "%s" ##' % (community))
        
        # get packages from the group in CKAN:
        UP.get_packages(community)
        
        # open the delete file and loop over its lines:
        file_content = ''
        try:
            f = open(delete_file, 'r')
            file_content = f.read()
            f.close()
        except IOError:
            logging.critical("Cannot read data from '{0}'".format(delete_file))
            f.close()
        else:
            # rename the file in a crash backup file:
            os.rename(delete_file,delete_file+'.crash-backup')
        
        results = {
            'count':0,
            'ecount':0,
            'tcount':0,
            'time':0
        }

        # use a try-except block to guarantee that no deleted metadata information is lost:
        try:
            logging.info('    |   | %-4s | %-50s | %-50s |\n    |%s|' % ('#','oai identifier','CKAN identifier',"-" * 116))
            
            deletestart = time.time()
     
            for line in file_content.split('\n'):
                # ignore empty lines:
                if not line:
                    continue
                   
                results['tcount'] += 1
                subset, identifier = line.split('\t')
         
                # dataset name uniquely generated from oai identifier
                uid = uuid.uuid5(uuid.NAMESPACE_DNS, identifier.encode('ascii','replace'))
                ds = str(uid)

                # output:
                logging.info('    | d | %-4d | %-50s | %-50s |' % (results['tcount'],identifier,ds))

                ### CHECK STATUS OF DATASET IN CKAN AND PID:
                # status of data set
                dsstatus="unknown"
         
                # check against handle server
                handlestatus="unknown"
                ##HEW-D-ec???  pid = credentials.prefix + "/eudat-jmd_" + ds
                pid = "11098/eudat-jmd_" + ds_id
                pidRecord["CHECKSUM"] = client.get_value_from_handle(pid, "CHECKSUM")

                if (pidRecord["CHECKSUM"] is None):
                  logging.debug("        |-> Can not access pid %s to get checksum" % (pid))
                  handlestatus="new"
                else:
                  logging.debug("        |-> pid %s exists" % (pid))
                  handlestatus="exist"

                # check against CKAN database
                ckanstatus = 'unknown'                  
                ckanstatus=UP.check_dataset(ds,None)

                delete = 0
                # depending on handle status delete record from B2FIND
                if ( handlestatus == "new" and ckanstatus == "new") : # no action required
                    logging.info('        |-> %s' % ('No deletion required'))
                else:
                    delete = UP.delete(ds,ckanstatus)
                    if (delete == 1):
                        logging.info('        |-> %s' % ('Deletion was successful'))
                        results['count'] +=  1
                        
                        # delete handle (to keep the symmetry between handle and B2FIND server)
                        if (handlestatus == "exist"):
                           logging.info("        |-> Delete handle %s with checksum %s" % (pid,pidRecord["CHECKSUM"]))
                           try:
                               client.delete_handle(pid)
                           except GenericHandleError as err:
                               logging.error('[ERROR] Unexpected Error: %s' % err)
                           except Exception:
                               logging.error('[ERROR] Unexpected Error:')

                        else:
                           logging.info("        |-> No action (deletion) required for handle %s" % pid)
                    else:
                        logging.info('        |-> %s' % ('Deletion failed'))
                        results['ecount'] += 1
        except Exception:
            logging.error('[ERROR] Unexpected Error')
            logging.error('You find the ids of the deleted metadata in "%s"' % (delete_file+'.crash-backup'))
            raise
        else:
            # all worked fine you can remove the crash-backup file:
            os.remove(delete_file+'.crash-backup')
            
        deletetime=time.time()-deletestart
        results['time'] = deletetime
        
        # save stats:
        OUT.save_stats(community+'-'+mdprefix,subset,'d',results)
Example #6
def process(options, pstat, OUT):
    ## process (options,pstat,OUT) - function
    # Starts processing as specified in pstat['tbd'] and
    #  according to the request list given by the options
    #
    # Parameters:
    # -----------
    # 1. options (parsed command line options)
    # 2. pstat   (process status dict)
    # 3. OUT     (output object, e.g. for saving statistics)
    #

    # set single or multi mode:
    mode = None
    procOptions = [
        'community', 'source', 'verb', 'mdprefix', 'mdsubset',
        'target_mdschema'
    ]
    if (options.source):
        mode = 'single'
        ##HEW Not used in training
        options.target_mdschema = None
        mandParams = ['community', 'verb',
                      'mdprefix']  # mandatory processing params
        for param in mandParams:
            if not getattr(options, param):
                logger.critical(
                    "Processing parameter %s is required in single mode" %
                    param)
                sys.exit(-1)
        reqlist = [[
            options.community, options.source, options.verb, options.mdprefix,
            options.mdsubset, options.ckan_check, options.handle_check,
            options.target_mdschema
        ]]
    elif (options.list):
        if (pstat['status']['g'] == 'tbd'):
            logger.critical(
                "  Processing parameter [ --source | -s SOURCE ] is required in generation mode"
            )
            sys.exit(-1)
        mode = 'multi'
        logger.debug(' |- Joblist:  \t%s' % options.list)
        ## HEW set options.target_mdschema to NONE for Training

        ## options.target_mdschema=None
        reqlist = parse_list_file(options)

    ## check job request (processing) options
    for opt in procOptions:
        if hasattr(options, opt):
            logger.debug(' |- %s:\t%s' % (opt.upper(), getattr(options, opt)))

    ## GENERATION mode:
    if (pstat['status']['g'] == 'tbd'):
        GEN = Generator(pstat, options.outdir)
        process_generate(GEN, reqlist)

    ## HARVESTING mode:
    if (pstat['status']['h'] == 'tbd'):
        ### print('\n|- Harvesting started : %s' % time.strftime("%Y-%m-%d %H:%M:%S"))
        HV = Harvester(pstat, options.outdir, options.fromdate)
        process_harvest(HV, reqlist)

    ## MAPPING - Mode:
    if (pstat['status']['m'] == 'tbd'):
        print('\n|- Mapping started : %s' % time.strftime("%Y-%m-%d %H:%M:%S"))
        MP = Mapper(OUT, options.outdir, options.fromdate)
        process_map(MP, reqlist)

    ## VALIDATOR - Mode:
    if (pstat['status']['v'] == 'tbd'):
        print('\n|- Validating started : %s' %
              time.strftime("%Y-%m-%d %H:%M:%S"))
        MP = Mapper(OUT, options.outdir, options.fromdate)
        process_validate(MP, reqlist)

    ## UPLOADING - Mode:
    if (pstat['status']['u'] == 'tbd'):
        # create CKAN object
        CKAN = CKAN_CLIENT(options.iphost, options.auth)

        # create credentials and handle client if required
        if (options.handle_check):
            try:
                cred = PIDClientCredentials.load_from_JSON('credentials_11098')
            except Exception as err:
                logger.critical(
                    "%s : Could not create credentials from credstore %s" %
                    (err, options.handle_check))
                ##p.print_help()
                sys.exit(-1)
            else:
                logger.debug("Create EUDATHandleClient instance")
                HandleClient = EUDATHandleClient.instantiate_with_credentials(
                    cred)
        else:
            cred = None
            HandleClient = None

        UP = Uploader(CKAN, options.ckan_check, HandleClient, cred, OUT,
                      options.outdir, options.fromdate, options.iphost,
                      options.ckan_organization)
        logger.info(' |- Host:  \t%s' % CKAN.ip_host)
        process_upload(UP, reqlist)