Example #1
def main():

    pstat = {
        'status': {},
        'text': {},
        'short': {},
    }
    now = time.strftime("%Y-%m-%d %H:%M:%S")
    jid = os.getpid()
    ckanlistrequests = ['package_list', 'group_list', 'tag_list']

    ## Get options and arguments
    args = get_args(ckanlistrequests)

    # Output instance
    OUT = Output(pstat, now, jid, args)
    logger = OUT.setup_custom_logger('root', args.verbose)

    ## Settings for CKAN client and API
    ckanapi3 = 'http://' + args.ckan + '/api/3'
    if PY2:
        ckan = ckanclient.CkanClient(ckanapi3)
    else:
        auth = '12345'
        ckan = CKAN_CLIENT(args.ckan, auth)

    ckan_limit = 500000

    start = time.time()

    if args.request.endswith('list'):
        try:
            if args.request == 'community_list':
                action = 'group_list'
            else:
                action = args.request
            if PY2:
                answer = ckan.action(action, rows=ckan_limit)
            else:
                answer = ckan.action(action)
        except ckanclient.CkanApiError as e:
            print('\t\tError %s. Supported list requests are %s.' %
                  (e, ckanlistrequests))
            sys.exit(1)
        ## print '|- The list of %ss :\n\t%s' % (args.request.split('_')[0],'\n\t'.join(answer).encode('utf8'))
        if PY2:
            print('\n\t%s' % '\n\t'.join(answer).encode('utf8'))
        else:
            print('\n\t%s' % '\n\t'.join(answer))
        sys.exit(0)

    # create CKAN search pattern :
    ckan_pattern = ''
    sand = ''
    pattern = ' '.join(args.pattern)

    if (args.community):
        ckan_pattern += "groups:%s" % args.community
        sand = " AND "
    if (args.pattern):
        ckan_pattern += sand + pattern

    print(' | - Search\n\t|- in\t%s\n\t|- for\t%s\n' %
          (args.ckan, ckan_pattern))

    if args.request == 'package_search':
        if PY2:
            answer = ckan.action('package_search',
                                 q=ckan_pattern,
                                 rows=ckan_limit)
        else:
            answer = ckan.action('package_search', {"q": ckan_pattern})
    for key, value in answer.items():
        logger.warning('answer has key %s' % key)
    if PY2:
        tcount = answer['count']
    else:
        tcount = answer['result']['count']
    print(' | - Results:\n\t|- %d records found in %d sec' %
          (tcount, time.time() - start))

    # Read in B2FIND metadata schema and fields
    schemafile = '%s/mapfiles/b2find_schema.json' % (os.getcwd())
    with open(schemafile, 'r') as f:
        b2findfields = json.loads(f.read(), object_pairs_hook=OrderedDict)

    if tcount > 0 and args.keys is not None:
        if len(args.keys) == 0:
            akeys = []
        else:
            if args.keys[0] == 'B2FIND.*':
                akeys = sorted(b2findfields.keys())
            else:
                akeys = args.keys

        suppid = b2findfields.keys()

        fh = io.open(args.output, "w", encoding='utf8')
        record = {}

        totlist = []
        count = {}
        count['id'] = 0
        statc = {}
        for outt in list(akeys):  # iterate over a copy; akeys may be modified below
            if outt not in suppid:
                print(' [WARNING] Not supported key %s is removed' % outt)
                akeys.remove(outt)
            else:
                count[outt] = 0
                statc[outt] = Counter()

        printfacets = ''
        if (len(akeys) > 0):
            printfacets = "and related facets %s " % ", ".join(akeys)

            print('\t|- IDs %sare written to %s ...' %
                  (printfacets, args.output))

        counter = 0
        cstart = 0
        oldperc = 0
        start2 = time.time()

        while (cstart < tcount):
            if (cstart > 0):
                if PY2:
                    answer = ckan.action('package_search',
                                         q=ckan_pattern,
                                         rows=ckan_limit,
                                         start=cstart)
                else:
                    answer = ckan.action('package_search', {
                        "q": ckan_pattern,
                        "rows": ckan_limit,
                        "start": cstart
                    })
            if PY2:
                if len(answer['results']) == 0:
                    break
            #HEW-D else:
            ##HEW-D    if len(answer['result']['results']) == 0 :
            ##HEW-D        break

            # loop over found records
            if PY2:
                results = answer['results']
            else:
                results = answer['result']['results']
            for ds in results:  #### answer['results']:
                counter += 1
                logger.debug('    | %-4d | %-40s |' % (counter, ds['name']))
                perc = int(counter * 100 / tcount)
                bartags = perc / 5
                if perc % 10 == 0 and perc != oldperc:
                    oldperc = perc
                    print('\r\t[%-20s] %5d (%3d%%) in %d sec' %
                          ('=' * int(bartags), counter, perc,
                           time.time() - start2))
                    sys.stdout.flush()

                record['id'] = '%s' % (ds['name'])
                outline = record['id']

                # loop over facets
                for facet in akeys:
                    ##HEW-T print 'facet : %s' % facet
                    ckanFacet = b2findfields[facet]["ckanName"]
                    if ckanFacet in ds:  ## CKAN default field
                        if facet == 'Group':
                            record[facet] = ds[ckanFacet][0]['display_name']
                        else:
                            record[facet] = ds[ckanFacet]
                    else:  ## CKAN extra field
                        ##HEW-T print 'ds extras %s' % ds['extras']
                        efacet = [e for e in ds['extras'] if e['key'] == facet]
                        if efacet:
                            ##HEW-T print 'rrrr %s effff %s' % (record[facet],efacet[0]['value'])
                            record[facet] = efacet[0]['value']
                        else:
                            record[facet] = 'N/A'
                    if record[facet] is None:
                        record[facet] = 'None'
                        statc[facet][record[facet]] += 1
                    else:
                        if not isinstance(record[facet], list):
                            words = record[facet].split(';')
                        else:
                            words = record[facet]
                        for word in words:
                            if isinstance(word, dict): word = word['name']
                            statc[facet][word] += 1
                    if not (record[facet] == 'N/A' or record[facet]
                            == 'Not Stated') and len(record[facet]) > 0:
                        count[facet] += 1
                    outline += '\t | %-30s' % record[facet][:30]
                fh.write(outline + '\n')
            cstart += len(results)
            logger.warning('%d records done, %d in total' % (cstart, tcount))
        fh.close()

        if len(akeys) > 0:
            statfh = io.open('stat_' + args.output, "w", encoding='utf8')
            ##print "\n|- Statistics :\n\t| %-16s | %-10s | %6s |\n\t%s " % ('Facet','Occurence','%',"-" * 50)
            print('|- Statistics written to file %s' % ('stat_' + args.output))

            statline = unicode("")
            for outt in akeys:
                statline += "| %-16s\n\t| %-15s | %-6d | %3d |\n" % (
                    outt, '-Total-', count[outt],
                    int(count[outt] * 100 / tcount))
                for word in statc[outt].most_common(10):
                    statline += '\t| %-15s | %-6d | %3d |\n' % (
                        word[0][:100], word[1], int(word[1] * 100 / tcount))

            statfh.write(statline)

            statfh.close()
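The CKAN_CLIENT class itself is not part of this excerpt. As a rough sketch only (not the project's actual implementation), a wrapper that supports the dict-style calls used above, e.g. ckan.action('package_search', {"q": ckan_pattern}), might simply POST to CKAN's action API and return the decoded JSON response, so that answer['result']['count'] and answer['result']['results'] are available as accessed in the code:

import json
try:  # Python 3
    from urllib.request import Request, urlopen
except ImportError:  # Python 2
    from urllib2 import Request, urlopen


class CKAN_CLIENT(object):
    """Illustrative CKAN action-API wrapper; names mirror the usage above."""

    def __init__(self, ip_host, api_key=None):
        self.ip_host = ip_host
        self.api_key = api_key

    def action(self, name, data=None):
        # e.g. http://<host>/api/3/action/package_search
        url = 'http://%s/api/3/action/%s' % (self.ip_host, name)
        headers = {'Content-Type': 'application/json'}
        if self.api_key:
            headers['Authorization'] = self.api_key
        request = Request(url, json.dumps(data or {}).encode('utf-8'), headers)
        # Return the full response dict, i.e. answer['result']['count']
        # and answer['result']['results'] as used in the examples.
        return json.loads(urlopen(request).read().decode('utf-8'))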
Example #2
            lines = f.read().splitlines()
            f.close()

            l = 0
            for host in lines:
                if (options.host == host.split()[0]):
                    options.auth = host.split()[1]
                    break
        else:
            logger.critical(
                "\033[1m [CRITICAL] " +
                "For CKAN database delete mode, a valid URL of the CKAN instance (option --host) and an API key (--auth, or read from ~/.netrc) must be given"
                + "\033[0;0m")
            sys.exit(-1)

        CKAN = CKAN_CLIENT(options.host, options.auth)
        ## UP = UPLOADER(CKAN, OUT, options.outdir,options.fromdate)
        UP = Uploader(CKAN, options.ckan_check, HandleClient, cred, OUT,
                      options.outdir, options.fromdate, options.host)
    if (options.identifier):
        list = [options.identifier]
        listtext = 'given by option -i (%d id\'s)' % len(list)
    elif (options.list):
        f = open(options.list, 'r')
        list = f.readlines()
        f.close()
        listtext = 'got from file %s (%d id\'s)' % (options.list, len(list))
    elif (options.community):
        ##UP.purge_group(options.community)
        UP.get_packages(options.community)
        ##HEW??? UP.get_group_list(options.community)
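The snippet above looks up the API key for --host by splitting a netrc-like file by hand, and the error message also mentions ~/.netrc. A possible alternative (a sketch only, assuming the key is stored in the password field of the host's ~/.netrc entry) is the standard library's netrc parser:

import netrc

def lookup_api_key(host):
    # Return the API key for 'host' from ~/.netrc, or None if not found.
    try:
        auth = netrc.netrc().authenticators(host)  # (login, account, password) or None
    except (IOError, netrc.NetrcParseError):
        return None
    return auth[2] if auth else None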
Example #3
def process_delete(OUT, dir, options):
    print("###JM# Don't use this function. It is not up to date.")
    return False

    # create CKAN object
    CKAN = CKAN_CLIENT(options.iphost, options.auth)
    UP = Uploader(CKAN, OUT, options.outdir)

    ##HEW-D-ec credentials,ec = None,None

    # create credentials
    try:
        cred = b2handle.clientcredentials.PIDClientCredentials.load_from_JSON(
            'credentials_11098')
    except Exception as err:
        logging.critical(
            "[CRITICAL] %s : Could not create credentials from credstore %s" %
            (err, options.handle_check))
        p.print_help()
        sys.exit(-1)
    else:
        logging.debug(
            "Create handle client instance to add uuid to handle server")
        # needed below by client.get_value_from_handle() and client.delete_handle();
        # assumes EUDATHandleClient is imported as in the upload example further down
        client = EUDATHandleClient.instantiate_with_credentials(cred)

    for delete_file in glob.glob(dir + '/*.del'):
        community, mdprefix = os.path.splitext(
            os.path.basename(delete_file))[0].split('-')

        logging.info('\n## Deleting datasets from community "%s" ##' %
                     (community))

        # get packages from the group in CKAN:
        UP.get_packages(community)

        # open the delete file and loop over its lines:
        file_content = ''
        try:
            f = open(delete_file, 'r')
            file_content = f.read()
            f.close()
        except IOError:
            logging.critical("Cannot read data from '{0}'".format(delete_file))
            f.close()
        else:
            # rename the file in a crash backup file:
            os.rename(delete_file, delete_file + '.crash-backup')

        results = {'count': 0, 'ecount': 0, 'tcount': 0, 'time': 0}

        # use a try-except block to guarantee that no deleted metadata information will be lost:
        try:
            logging.info('    |   | %-4s | %-50s | %-50s |\n    |%s|' %
                         ('#', 'oai identifier', 'CKAN identifier', "-" * 116))

            deletestart = time.time()

            for line in file_content.split('\n'):
                # ignore empty lines:
                if not line:
                    continue

                results['tcount'] += 1
                subset, identifier = line.split('\t')

                # dataset name uniquely generated from oai identifier
                uid = uuid.uuid5(uuid.NAMESPACE_DNS,
                                 identifier.encode('ascii', 'replace'))
                ds = str(uid)

                # output:
                logging.info('    | d | %-4d | %-50s | %-50s |' %
                             (results['tcount'], identifier, ds))

                ### CHECK STATUS OF DATASET IN CKAN AND PID:
                # status of data set
                dsstatus = "unknown"

                # check against handle server
                handlestatus = "unknown"
                ##HEW-D-ec???  pid = credentials.prefix + "/eudat-jmd_" + ds
                pid = "11098/eudat-jmd_" + ds
                pidRecord = {}
                pidRecord["CHECKSUM"] = client.get_value_from_handle(
                    pid, "CHECKSUM")

                if (pidRecord["CHECKSUM"] == None):
                    logging.debug(
                        "        |-> Can not access pid %s to get checksum" %
                        (pid))
                    handlestatus = "new"
                else:
                    logging.debug("        |-> pid %s exists" % (pid))
                    handlestatus = "exist"

                # check against CKAN database
                ckanstatus = 'unknown'
                ckanstatus = UP.check_dataset(ds, None)

                delete = 0
                # depending on handle status delete record from B2FIND
                if (handlestatus == "new"
                        and ckanstatus == "new"):  # no action required
                    logging.info('        |-> %s' % ('No deletion required'))
                else:
                    delete = UP.delete(ds, ckanstatus)
                    if (delete == 1):
                        logging.info('        |-> %s' %
                                     ('Deletion was successful'))
                        results['count'] += 1

                        # delete handle (to keep the symmetry between handle and B2FIND server)
                        if (handlestatus == "exist"):
                            logging.info(
                                "        |-> Delete handle %s with checksum %s"
                                % (pid, pidRecord["CHECKSUM"]))
                            try:
                                client.delete_handle(pid)
                            except GenericHandleError as err:
                                logging.error('[ERROR] Unexpected Error: %s' %
                                              err)
                            except Exception:
                                logging.error('[ERROR] Unexpected Error:')

                        else:
                            logging.info(
                                "        |-> No action (deletion) required for handle %s"
                                % pid)
                    else:
                        logging.info('        |-> %s' % ('Deletion failed'))
                        results['ecount'] += 1
        except Exception:
            logging.error('[ERROR] Unexpected Error')
            logging.error('You find the ids of the deleted metadata in "%s"' %
                          (delete_file + '.crash-backup'))
            raise
        else:
            # all worked fine you can remove the crash-backup file:
            os.remove(delete_file + '.crash-backup')

        deletetime = time.time() - deletestart
        results['time'] = deletetime

        # save stats:
        OUT.save_stats(community + '-' + mdprefix, subset, 'd', results)
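For reference, the dataset name checked and deleted above is derived deterministically from the OAI identifier via UUIDv5, and the handle PID is built from that name. A standalone illustration (the identifier value is made up):

import uuid

oai_identifier = 'oai:example.org:record/123'  # hypothetical identifier
ds = str(uuid.uuid5(uuid.NAMESPACE_DNS, oai_identifier))
pid = '11098/eudat-jmd_' + ds  # same prefix as used in process_delete()
print(ds, pid)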
Example #4
def main():

    pstat = {
        'status' : {},
        'text' : {},
        'short' : {},
     }
    now = time.strftime("%Y-%m-%d %H:%M:%S")
    jid = os.getpid()
    ckanlistrequests=['package_list','group_list','tag_list']

    ## Get options and arguments
    args = get_args(ckanlistrequests)

    # Output instance
    OUT = Output(pstat,now,jid,args)
    logger = OUT.setup_custom_logger('root',args.verbose)
    
    ## Settings for CKAN client and API
    ckanapi3='http://'+args.iphost+'/api/3'
    auth='12345'
    CKAN = CKAN_CLIENT(args.iphost,auth)

    ckan_limit=500000

    start=time.time()

    if args.request.endswith('list'):
        try:
            if args.request == 'community_list' :
                action='group_list'
            else:
                action=args.request
            if PY2 :
                answer = CKAN.action(action, rows=ckan_limit)
            else:
                answer = CKAN.action(action)
        except ckanclient.CkanApiError as e :
            print('\t\tError %s. Supported list requests are %s.' % (e,ckanlistrequests))
            sys.exit(1)

        print('\n\t%s' % '\n\t'.join(answer).encode('utf8'))
        sys.exit(0)

    # create CKAN search pattern :
    ckan_pattern = ''
    sand=''
    pattern=' '.join(args.pattern)

    if (args.community):
        ckan_pattern += "groups:%s" % args.community
        sand=" AND "
    if (args.pattern):
        ckan_pattern += sand + pattern   

    print(' | - Search\n\t|- in\t%s\n\t|- for\t%s\n' % (args.iphost,ckan_pattern))

    if args.request == 'package_search' :
        if PY2:
            answer = CKAN.action('package_search', {"q":ckan_pattern}) ##HEW-D? , rows=ckan_limit)
        else:
            answer = CKAN.action('package_search',{"q":ckan_pattern})
    for key, value in answer.items() :
        logger.warning('answer has key %s' % key)
    if PY2 :
        tcount=answer['result']['count'] ### ['count']
    else:
        tcount=answer['result']['count']
    print(' | - Results:\n\t|- %d records found in %d sec' % (tcount,time.time()-start))

    # Read in B2FIND metadata schema and fields
    schemafile =  '%s/mapfiles/b2find_schema.json' % (os.getcwd())
    with open(schemafile, 'r') as f:
        b2findfields=json.loads(f.read(), object_pairs_hook=OrderedDict)   


    if tcount>0 and args.keys is not None :
        if len(args.keys) == 0 :
            akeys=[]
        else:
            if args.keys[0] == 'B2FIND.*' :
                akeys=sorted(b2findfields.keys())
            else:
                akeys=args.keys

        suppid=b2findfields.keys()

        fh = io.open(args.output, "w", encoding='utf8')
        record={} 
  
        totlist=[]
        count={}
        count['id']=0
        statc={}
        for outt in list(akeys):  # iterate over a copy; akeys may be modified below
                if outt not in suppid :
                    print(' [WARNING] Not supported key %s is removed' % outt)
                    akeys.remove(outt)
                else:
                    count[outt]=0
                    statc[outt] = Counter()

        printfacets=''
        if (len(akeys) > 0):
            printfacets="and related facets %s " % ", ".join(akeys)

            print('\t|- IDs %sare written to %s ...' % (printfacets,args.output))

        counter=0
        cstart=0
        oldperc=0
        start2=time.time()

        while (cstart < tcount) :
            if (cstart > 0):
                if PY2 :
                    answer = CKAN.action('package_search', {"q":ckan_pattern,"rows":ckan_limit,"start":cstart}) ##HEW-D q=ckan_pattern, rows=ckan_limit, start=cstart)
                else:
                    answer = CKAN.action('package_search',{"q":ckan_pattern,"rows":ckan_limit,"start":cstart})
            if PY2 :
                if len(answer['result']['results']) == 0 :
                    break
        
            # loop over found records
            if PY2:
                results= answer['result']['results'] ### ['results']
            else:
                results= answer['result']['results']
            for ds in results : #### answer['results']:
                    counter +=1
                    logger.debug('    | %-4d | %-40s |' % (counter,ds['name']))
                    perc=int(counter*100/tcount)
                    bartags=perc/5
                    if perc%10 == 0 and perc != oldperc :
                        oldperc=perc
                        print('\r\t[%-20s] %5d (%3d%%) in %d sec' % ('='*int(bartags), counter, perc, time.time()-start2 ))
                        sys.stdout.flush()
        
                    
                    record['id']  = '%s' % (ds['name'])
                    outline='%s\n' % record['id']
        
                    # loop over facets
                    for facet in akeys:
                        ckanFacet=b2findfields[facet]["ckanName"]
                        if ckanFacet in ds: ## CKAN default field
                            if facet == 'Group':
                                record[facet]  = ds[ckanFacet][0]['display_name']
                            else:
                                record[facet]  = ds[ckanFacet]
                        else: ## CKAN extra field
                            efacet=[e for e in ds['extras'] if e['key'] == facet]
                            if efacet:
                                record[facet]  = efacet[0]['value']
                            else:
                                record[facet]  = 'N/A'
                        if record[facet] is None :
                            record[facet]='None'
                            statc[facet][record[facet]]+=1
                        else:
                            if not isinstance(record[facet],list):
                                words=record[facet].split(';')
                            else:
                                words=record[facet]
                            for word in words:
                                if isinstance(word,dict): word=word['name']
                                statc[facet][word]+=1
                        if not ( record[facet] == 'N/A' or record[facet] == 'Not Stated') and len(record[facet])>0 : 
                            count[facet]+=1
                        if PY2 :
                            fh.writelines((record[facet]+'\n').decode('utf-8'))
                        else :
                            fh.writelines((record[facet]+'\n'))

            cstart+=len(results)
            logger.warning('%d records done, %d in total' % (cstart,tcount))
        fh.close()
        
        if len(akeys) > 0 :
                statfh = io.open('stat_'+args.output, "w", encoding='utf8')
                print('|- Statistics written to file %s' % ('stat_'+args.output))
        
                statline=""
                for outt in akeys:
                    statline+= "| %-16s\n\t| %-15s | %-6d | %3d |\n" % (outt,'-Total-',count[outt],int(count[outt]*100/tcount))
                    for word in statc[outt].most_common(10):
                        statline+= '\t| %-15s | %-6d | %3d |\n' % (word[0][:100], word[1], int(word[1]*100/tcount))
        
                if PY2 :
                    statfh.write(statline.decode('utf-8'))
                else :
                    statfh.write(statline)
        
                statfh.close()
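The paging logic above (re-querying 'package_search' with an increasing start offset until count records have been fetched) can be isolated into a small generator. This is a sketch assuming a CKAN_CLIENT-style object as used in these examples; note that CKAN instances commonly cap the rows parameter, which is why paging is needed even with a large ckan_limit:

def iter_search_results(ckan, query, rows=1000):
    # Yield all datasets matching 'query', page by page.
    start = 0
    total = None
    while total is None or start < total:
        answer = ckan.action('package_search',
                             {"q": query, "rows": rows, "start": start})
        result = answer['result']
        total = result['count']
        if not result['results']:
            break
        for ds in result['results']:
            yield ds
        start += len(result['results'])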
Example #5
def process(options, pstat, OUT):
    ## process (options, pstat, OUT) - function
    # Starts processing as specified in pstat['tbd'] and
    # according to the request list given by the options
    #
    # Parameters:
    # -----------
    # 1. options (OptionsParser object)
    # 2. pstat   (process status dict)
    # 3. OUT     (Output instance)
    #

    # set list of request lists for single or multi mode:
    mode = None
    reqlist = []  # default in case neither --source nor --list is given
    procOptions = [
        'community', 'source', 'verb', 'mdprefix', 'mdsubset',
        'target_mdschema'
    ]
    if (options.source):
        mode = 'single'
        mandParams = ['community', 'verb',
                      'mdprefix']  # mandatory processing params
        for param in mandParams:
            if not getattr(options, param):
                logger.critical(
                    "Processing parameter %s is required in single mode" %
                    param)
                sys.exit(-1)
        reqlist = [[
            options.community, options.source, options.verb, options.mdprefix,
            options.mdsubset, options.ckan_check, options.handle_check,
            options.target_mdschema
        ]]
    elif (options.list):
        mode = 'multi'
        logger.debug(' |- Joblist:  \t%s' % options.list)
        reqlist = parse_list_file(options)

    logger.debug(' |- Requestlist:  \t%s' % reqlist)

    ## check job request (processing) options
    logger.debug('|- Command line options')
    for opt in procOptions:
        if hasattr(options, opt):
            logger.debug(' |- %s:\t%s' % (opt.upper(), getattr(options, opt)))

    ## HARVESTING mode:
    if (pstat['status']['h'] == 'tbd'):
        logger.info('\n|- Harvesting started : %s' %
                    time.strftime("%Y-%m-%d %H:%M:%S"))
        HV = Harvester(OUT, pstat, options.outdir, options.fromdate)
        process_harvest(HV, reqlist)

    ## MAPPING - Mode:
    if (pstat['status']['m'] == 'tbd'):
        logger.info('\n|- Mapping started : %s' %
                    time.strftime("%Y-%m-%d %H:%M:%S"))
        MP = Mapper(OUT, options.outdir, options.fromdate)
        process_map(MP, reqlist)

    ## VALIDATING - Mode:
    if (pstat['status']['v'] == 'tbd'):
        logger.info(' |- Validating started : %s' %
                    time.strftime("%Y-%m-%d %H:%M:%S"))
        MP = Mapper(OUT, options.outdir, options.fromdate)
        process_validate(MP, reqlist)

    ## OAI-CONVERTING - Mode:
    if (pstat['status']['o'] == 'tbd'):
        logger.info('\n|- OAI-Converting started : %s' %
                    time.strftime("%Y-%m-%d %H:%M:%S"))
        MP = Mapper(OUT, options.outdir, options.fromdate)
        process_oaiconvert(MP, reqlist)

    ## UPLOADING - Mode:
    if (pstat['status']['u'] == 'tbd'):
        logger.info('\n|- Uploading started : %s' %
                    time.strftime("%Y-%m-%d %H:%M:%S"))
        # create CKAN object
        CKAN = CKAN_CLIENT(options.iphost, options.auth)
        # create credentials and handle client if required
        if (options.handle_check):
            try:
                cred = PIDClientCredentials.load_from_JSON('credentials_11098')
            except Exception as err:
                logger.critical(
                    "%s : Could not create credentials from credstore %s" %
                    (err, options.handle_check))
                ##p.print_help()
                sys.exit(-1)
            else:
                logger.debug("Create EUDATHandleClient instance")
                HandleClient = EUDATHandleClient.instantiate_with_credentials(
                    cred)
        else:
            cred = None
            HandleClient = None

        UP = Uploader(CKAN, options.ckan_check, HandleClient, cred, OUT,
                      options.outdir, options.fromdate, options.iphost)
        logger.info(' |- Host:  \t%s' % CKAN.ip_host)
        process_upload(UP, reqlist)

    ## DELETING - Mode:
    if (pstat['status']['d'] == 'tbd'):
        # start the process deleting:
        logger.info('\n|- Deleting started : %s' %
                    time.strftime("%Y-%m-%d %H:%M:%S"))

        if mode == 'multi':
            dir = options.outdir + '/delete'
            if os.path.exists(dir):
                process_delete(OUT, dir, options)
            else:
                logger.error(
                    '[ERROR] The directory "%s" does not exist! No files to delete were found!'
                    % (dir))
        else:
            logger.critical(
                "[CRITICAL] Deleting mode is only supported in 'multi' mode, with an explicit delete list given!"
            )
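process() runs a stage whenever pstat['status'][letter] equals 'tbd'. How the real main() fills pstat is not shown in this excerpt, but as an illustration a caller could enable, say, harvesting and mapping like this:

pstat = {'status': {}, 'text': {}, 'short': {}}
for letter in ('h', 'm', 'v', 'o', 'u', 'd'):  # the modes checked in process()
    pstat['status'][letter] = 'no'
pstat['status']['h'] = 'tbd'  # harvest
pstat['status']['m'] = 'tbd'  # map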