def main():
    pstat = {
        'status': {},
        'text': {},
        'short': {},
    }
    now = time.strftime("%Y-%m-%d %H:%M:%S")
    jid = os.getpid()
    ckanlistrequests = ['package_list', 'group_list', 'tag_list']

    ## Get options and arguments
    args = get_args(ckanlistrequests)

    # Output instance
    OUT = Output(pstat, now, jid, args)
    logger = OUT.setup_custom_logger('root', args.verbose)

    ## Settings for CKAN client and API
    ckanapi3 = 'http://' + args.ckan + '/api/3'
    if PY2:
        ckan = ckanclient.CkanClient(ckanapi3)
    else:
        auth = '12345'
        ckan = CKAN_CLIENT(args.ckan, auth)
    ckan_limit = 500000

    start = time.time()

    if args.request.endswith('list'):
        try:
            if args.request == 'community_list':
                action = 'group_list'
            else:
                action = args.request
            if PY2:
                answer = ckan.action(action, rows=ckan_limit)
            else:
                answer = ckan.action(action)
        except ckanclient.CkanApiError as e:
            print('\t\tError %s Supported list requests are %s.' % (e, ckanlistrequests))
            sys.exit(1)
        print('\n\t%s' % '\n\t'.join(answer).encode('utf8'))
        sys.exit(0)

    # create CKAN search pattern :
    ckan_pattern = ''
    sand = ''
    pattern = ' '.join(args.pattern)

    if (args.community):
        ckan_pattern += "groups:%s" % args.community
        sand = " AND "
    if (args.pattern):
        ckan_pattern += sand + pattern

    print(' | - Search\n\t|- in\t%s\n\t|- for\t%s\n' % (args.ckan, ckan_pattern))

    if args.request == 'package_search':
        if PY2:
            answer = ckan.action('package_search', q=ckan_pattern, rows=ckan_limit)
        else:
            answer = ckan.action('package_search', {"q": ckan_pattern})
    for key, value in answer.items():
        logger.warning('answer has key %s' % key)

    if PY2:
        tcount = answer['count']
    else:
        tcount = answer['result']['count']
    print(' | - Results:\n\t|- %d records found in %d sec' % (tcount, time.time() - start))

    # Read in B2FIND metadata schema and fields
    schemafile = '%s/mapfiles/b2find_schema.json' % (os.getcwd())
    with open(schemafile, 'r') as f:
        b2findfields = json.loads(f.read(), object_pairs_hook=OrderedDict)

    if tcount > 0 and args.keys is not None:
        if len(args.keys) == 0:
            akeys = []
        else:
            if args.keys[0] == 'B2FIND.*':
                # use all B2FIND fields (schema order is kept by the OrderedDict)
                akeys = list(b2findfields.keys())
            else:
                akeys = args.keys

        suppid = b2findfields.keys()

        fh = io.open(args.output, "w", encoding='utf8')
        record = {}
        totlist = []
        count = {}
        count['id'] = 0
        statc = {}
        # iterate over a copy so unsupported keys can be removed safely
        for outt in list(akeys):
            if outt not in suppid:
                print(' [WARNING] Not supported key %s is removed' % outt)
                akeys.remove(outt)
            else:
                count[outt] = 0
                statc[outt] = Counter()

        printfacets = ''
        if (len(akeys) > 0):
            printfacets = "and related facets %s " % ", ".join(akeys)

        print('\t|- IDs %sare written to %s ...' % (printfacets, args.output))

        counter = 0
        cstart = 0
        oldperc = 0
        start2 = time.time()

        # paginated package_search loop (a factored-out sketch follows after this function)
        while (cstart < tcount):
            if (cstart > 0):
                if PY2:
                    answer = ckan.action('package_search', q=ckan_pattern,
                                         rows=ckan_limit, start=cstart)
                else:
                    answer = ckan.action('package_search', {
                        "q": ckan_pattern,
                        "rows": ckan_limit,
                        "start": cstart
                    })
            if PY2:
                if len(answer['results']) == 0:
                    break
            else:
                if len(answer['result']['results']) == 0:
                    break

            # loop over found records
            if PY2:
                results = answer['results']
            else:
                results = answer['result']['results']
            for ds in results:
                counter += 1
                logger.debug(' | %-4d | %-40s |' % (counter, ds['name']))
                perc = int(counter * 100 / tcount)
                bartags = perc / 5
                if perc % 10 == 0 and perc != oldperc:
                    oldperc = perc
                    print('\r\t[%-20s] %5d (%3d%%) in %d sec' % ('=' * int(bartags), counter, perc, time.time() - start2))
                    sys.stdout.flush()

                record['id'] = '%s' % (ds['name'])
                outline = record['id']

                # loop over facets
                for facet in akeys:
                    ckanFacet = b2findfields[facet]["ckanName"]
                    if ckanFacet in ds:  ## CKAN default field
                        if facet == 'Group':
                            record[facet] = ds[ckanFacet][0]['display_name']
                        else:
                            record[facet] = ds[ckanFacet]
                    else:  ## CKAN extra field
                        efacet = [e for e in ds['extras'] if e['key'] == facet]
                        if efacet:
                            record[facet] = efacet[0]['value']
                        else:
                            record[facet] = 'N/A'
                    if record[facet] is None:
                        record[facet] = 'None'
                        statc[facet][record[facet]] += 1
                    else:
                        if not isinstance(record[facet], list):
                            words = record[facet].split(';')
                        else:
                            words = record[facet]
                        for word in words:
                            if isinstance(word, dict):
                                word = word['name']
                            statc[facet][word] += 1
                    if not (record[facet] == 'N/A' or record[facet] == 'Not Stated') and len(record[facet]) > 0:
                        count[facet] += 1
                    outline += '\t | %-30s' % record[facet][:30]
                fh.write(outline + '\n')

            cstart += len(results)
            logger.warning('%d records done, %d in total' % (cstart, tcount))

        fh.close()

        if len(akeys) > 0:
            statfh = io.open('stat_' + args.output, "w", encoding='utf8')
            print('|- Statistics written to file %s' % ('stat_' + args.output))

            statline = u""
            for outt in akeys:
                statline += "| %-16s\n\t| %-15s | %-6d | %3d |\n" % (outt, '-Total-', count[outt], int(count[outt] * 100 / tcount))
                for word in statc[outt].most_common(10):
                    statline += '\t| %-15s | %-6d | %3d |\n' % (word[0][:100], word[1], int(word[1] * 100 / tcount))

            statfh.write(statline)
            statfh.close()
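
# Sketch (not part of the original script): the pagination pattern used in
# main() above, factored into a generator. It relies only on the
# CKAN_CLIENT.action() call signature and the answer['result'] structure shown
# above; the function name and the default page size are assumptions.
def iter_search_results(client, query, page_size=1000):
    """Yield every dataset matching 'query', fetching 'page_size' records per request."""
    start = 0
    while True:
        answer = client.action('package_search',
                               {"q": query, "rows": page_size, "start": start})
        results = answer['result']['results']
        if not results:
            break
        for ds in results:
            yield ds
        start += len(results)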
lines = f.read().splitlines()
f.close()
l = 0
# look up the API key for the requested host (expected line layout: see the note after this block)
for host in lines:
    if (options.host == host.split()[0]):
        options.auth = host.split()[1]
        break
else:
    logger.critical(
        "\033[1m [CRITICAL] " +
        "For CKAN database delete mode a valid URL of the CKAN instance (option --host) and an API key (--auth, or read from ~/.netrc) must be given" +
        "\033[0;0m")
    sys.exit(-1)

CKAN = CKAN_CLIENT(options.host, options.auth)
UP = Uploader(CKAN, options.ckan_check, HandleClient, cred, OUT,
              options.outdir, options.fromdate, options.host)

if (options.identifier):
    list = [options.identifier]
    listtext = 'given by option -i (%d id\'s)' % len(list)
elif (options.list):
    f = open(options.list, 'r')
    list = f.readlines()
    f.close()
    listtext = 'got from file %s (%d id\'s)' % (options.list, len(list))
elif (options.community):
    UP.get_packages(options.community)
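
# Note (assumption, based only on the parsing above): the credential file is
# expected to hold one whitespace-separated "<host> <api-key>" pair per line, e.g.
#
#   b2find.example.org   xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
#
# The host and key values shown here are hypothetical placeholders.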
def process_delete(OUT, dir, options):
    print("###JM# Don't use this function. It is not up to date.")
    return False

    # create CKAN object
    CKAN = CKAN_CLIENT(options.iphost, options.auth)
    UP = Uploader(CKAN, OUT, options.outdir)

    # create credentials
    try:
        cred = b2handle.clientcredentials.PIDClientCredentials.load_from_JSON(
            'credentials_11098')
    except Exception as err:
        logging.critical(
            "[CRITICAL] %s : Could not create credentials from credstore %s"
            % (err, options.handle_check))
        sys.exit(-1)
    else:
        logging.debug(
            "Create handle client instance to add uuid to handle server")
        client = EUDATHandleClient.instantiate_with_credentials(cred)

    for delete_file in glob.glob(dir + '/*.del'):
        community, mdprefix = os.path.splitext(
            os.path.basename(delete_file))[0].split('-')

        logging.info('\n## Deleting datasets from community "%s" ##' % (community))

        # get packages from the group in CKAN:
        UP.get_packages(community)

        # open the delete file and loop over its lines:
        file_content = ''
        try:
            f = open(delete_file, 'r')
            file_content = f.read()
            f.close()
        except IOError:
            logging.critical("Cannot read data from '{0}'".format(delete_file))
            f.close()
        else:
            # rename the file to a crash-backup file:
            os.rename(delete_file, delete_file + '.crash-backup')

        results = {'count': 0, 'ecount': 0, 'tcount': 0, 'time': 0}

        # use a try-except-else environment to guarantee that no deleted
        # metadata information is lost:
        try:
            logging.info(' | | %-4s | %-50s | %-50s |\n |%s|'
                         % ('#', 'oai identifier', 'CKAN identifier', "-" * 116))

            deletestart = time.time()
            for line in file_content.split('\n'):
                # ignore empty lines:
                if not line:
                    continue

                results['tcount'] += 1
                subset, identifier = line.split('\t')

                # dataset name uniquely generated from the OAI identifier
                # (a standalone sketch of this mapping follows after this function)
                uid = uuid.uuid5(uuid.NAMESPACE_DNS,
                                 identifier.encode('ascii', 'replace'))
                ds = str(uid)

                # output:
                logging.info(' | d | %-4d | %-50s | %-50s |'
                             % (results['tcount'], identifier, ds))

                ### CHECK STATUS OF DATASET IN CKAN AND PID:
                # status of data set
                dsstatus = "unknown"

                # check against the handle server
                handlestatus = "unknown"
                pid = "11098/eudat-jmd_" + ds
                pidRecord = {}
                pidRecord["CHECKSUM"] = client.get_value_from_handle(pid, "CHECKSUM")

                if pidRecord["CHECKSUM"] is None:
                    logging.debug(" |-> Can not access pid %s to get checksum" % (pid))
                    handlestatus = "new"
                else:
                    logging.debug(" |-> pid %s exists" % (pid))
                    handlestatus = "exist"

                # check against the CKAN database
                ckanstatus = 'unknown'
                ckanstatus = UP.check_dataset(ds, None)

                delete = 0
                # depending on the handle status, delete the record from B2FIND
                if (handlestatus == "new" and ckanstatus == "new"):  # no action required
                    logging.info(' |-> %s' % ('No deletion required'))
                else:
                    delete = UP.delete(ds, ckanstatus)
                    if (delete == 1):
                        logging.info(' |-> %s' % ('Deletion was successful'))
                        results['count'] += 1

                        # delete the handle (to keep the symmetry between handle and B2FIND server)
                        if (handlestatus == "exist"):
                            logging.info(" |-> Delete handle %s with checksum %s"
                                         % (pid, pidRecord["CHECKSUM"]))
                            try:
                                client.delete_handle(pid)
                            except GenericHandleError as err:
                                logging.error('[ERROR] Unexpected Error: %s' % err)
                            except Exception:
                                logging.error('[ERROR] Unexpected Error:')
                        else:
                            logging.info(" |-> No action (deletion) required for handle %s" % pid)
                    else:
                        logging.info(' |-> %s' % ('Deletion failed'))
                        results['ecount'] += 1
        except Exception:
            logging.error('[ERROR] Unexpected Error')
            logging.error('You find the ids of the deleted metadata in "%s"'
                          % (delete_file + '.crash-backup'))
            raise
        else:
            # all worked fine, so the crash-backup file can be removed:
            os.remove(delete_file + '.crash-backup')

        deletetime = time.time() - deletestart
        results['time'] = deletetime

        # save stats:
        OUT.save_stats(community + '-' + mdprefix, subset, 'd', results)
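
# Sketch (not part of the original workflow): the deterministic mapping from an
# OAI identifier to the CKAN dataset name used in process_delete() above. The
# identifier value is hypothetical; uuid5 always yields the same name for the
# same identifier, which is what keeps the handle server, CKAN and the delete
# list in sync.
def oai_to_dataset_name(identifier):
    """Return the uuid5-based CKAN dataset name for an OAI identifier."""
    return str(uuid.uuid5(uuid.NAMESPACE_DNS, identifier))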
def main():
    pstat = {
        'status': {},
        'text': {},
        'short': {},
    }
    now = time.strftime("%Y-%m-%d %H:%M:%S")
    jid = os.getpid()
    ckanlistrequests = ['package_list', 'group_list', 'tag_list']

    ## Get options and arguments
    args = get_args(ckanlistrequests)

    # Output instance
    OUT = Output(pstat, now, jid, args)
    logger = OUT.setup_custom_logger('root', args.verbose)

    ## Settings for CKAN client and API
    ckanapi3 = 'http://' + args.iphost + '/api/3'
    auth = '12345'
    CKAN = CKAN_CLIENT(args.iphost, auth)
    ckan_limit = 500000

    start = time.time()

    if args.request.endswith('list'):
        try:
            if args.request == 'community_list':
                action = 'group_list'
            else:
                action = args.request
            if PY2:
                answer = CKAN.action(action, rows=ckan_limit)
            else:
                answer = CKAN.action(action)
        except ckanclient.CkanApiError as e:
            print('\t\tError %s Supported list requests are %s.' % (e, ckanlistrequests))
            sys.exit(1)
        print('\n\t%s' % '\n\t'.join(answer).encode('utf8'))
        sys.exit(0)

    # create CKAN search pattern :
    ckan_pattern = ''
    sand = ''
    pattern = ' '.join(args.pattern)

    if (args.community):
        ckan_pattern += "groups:%s" % args.community
        sand = " AND "
    if (args.pattern):
        ckan_pattern += sand + pattern

    print(' | - Search\n\t|- in\t%s\n\t|- for\t%s\n' % (args.iphost, ckan_pattern))

    if args.request == 'package_search':
        # the same call works for Python 2 and 3 here
        answer = CKAN.action('package_search', {"q": ckan_pattern})
    for key, value in answer.items():
        logger.warning('answer has key %s' % key)

    tcount = answer['result']['count']
    print(' | - Results:\n\t|- %d records found in %d sec' % (tcount, time.time() - start))

    # Read in B2FIND metadata schema and fields
    schemafile = '%s/mapfiles/b2find_schema.json' % (os.getcwd())
    with open(schemafile, 'r') as f:
        b2findfields = json.loads(f.read(), object_pairs_hook=OrderedDict)

    if tcount > 0 and args.keys is not None:
        if len(args.keys) == 0:
            akeys = []
        else:
            if args.keys[0] == 'B2FIND.*':
                # use all B2FIND fields (schema order is kept by the OrderedDict)
                akeys = list(b2findfields.keys())
            else:
                akeys = args.keys

        suppid = b2findfields.keys()

        fh = io.open(args.output, "w", encoding='utf8')
        record = {}
        totlist = []
        count = {}
        count['id'] = 0
        statc = {}
        # iterate over a copy so unsupported keys can be removed safely
        for outt in list(akeys):
            if outt not in suppid:
                print(' [WARNING] Not supported key %s is removed' % outt)
                akeys.remove(outt)
            else:
                count[outt] = 0
                statc[outt] = Counter()

        printfacets = ''
        if (len(akeys) > 0):
            printfacets = "and related facets %s " % ", ".join(akeys)

        print('\t|- IDs %sare written to %s ...' % (printfacets, args.output))

        counter = 0
        cstart = 0
        oldperc = 0
        start2 = time.time()

        # paginated package_search loop
        while (cstart < tcount):
            if (cstart > 0):
                answer = CKAN.action('package_search',
                                     {"q": ckan_pattern, "rows": ckan_limit, "start": cstart})
            results = answer['result']['results']
            if len(results) == 0:
                break

            # loop over found records
            for ds in results:
                counter += 1
                logger.debug(' | %-4d | %-40s |' % (counter, ds['name']))
                perc = int(counter * 100 / tcount)
                bartags = perc / 5
                if perc % 10 == 0 and perc != oldperc:
                    oldperc = perc
                    print('\r\t[%-20s] %5d (%3d%%) in %d sec' % ('=' * int(bartags), counter, perc, time.time() - start2))
                    sys.stdout.flush()

                record['id'] = '%s' % (ds['name'])
                outline = '%s\n' % record['id']

                # loop over facets
                for facet in akeys:
                    ckanFacet = b2findfields[facet]["ckanName"]
                    if ckanFacet in ds:  ## CKAN default field
                        if facet == 'Group':
                            record[facet] = ds[ckanFacet][0]['display_name']
                        else:
                            record[facet] = ds[ckanFacet]
                    else:  ## CKAN extra field
                        efacet = [e for e in ds['extras'] if e['key'] == facet]
                        if efacet:
                            record[facet] = efacet[0]['value']
                        else:
                            record[facet] = 'N/A'
                    if record[facet] is None:
                        record[facet] = 'None'
                        statc[facet][record[facet]] += 1
                    else:
                        if not isinstance(record[facet], list):
                            words = record[facet].split(';')
                        else:
                            words = record[facet]
                        for word in words:
                            if isinstance(word, dict):
                                word = word['name']
                            statc[facet][word] += 1
                    if not (record[facet] == 'N/A' or record[facet] == 'Not Stated') and len(record[facet]) > 0:
                        count[facet] += 1
                    if PY2:
                        fh.writelines((record[facet] + '\n').decode('utf-8'))
                    else:
                        fh.writelines((record[facet] + '\n'))

            cstart += len(results)
            logger.warning('%d records done, %d in total' % (cstart, tcount))

        fh.close()

        if len(akeys) > 0:
            statfh = io.open('stat_' + args.output, "w", encoding='utf8')
            print('|- Statistics written to file %s' % ('stat_' + args.output))

            statline = ""
            for outt in akeys:
                statline += "| %-16s\n\t| %-15s | %-6d | %3d |\n" % (outt, '-Total-', count[outt], int(count[outt] * 100 / tcount))
                for word in statc[outt].most_common(10):
                    statline += '\t| %-15s | %-6d | %3d |\n' % (word[0][:100], word[1], int(word[1] * 100 / tcount))

            if PY2:
                statfh.write(statline.decode('utf-8'))
            else:
                statfh.write(statline)
            statfh.close()

def process(options, pstat, OUT):
    ## process (options, pstat, OUT) - function
    # Starts processing as specified in pstat['tbd'] and
    # according to the request list given by the options.
    #
    # Parameters:
    # -----------
    # 1. options (OptionParser object)
    # 2. pstat (process status dict)
    #

    # set the list of request lists for single or multi mode:
    mode = None
    procOptions = [
        'community', 'source', 'verb', 'mdprefix', 'mdsubset', 'target_mdschema'
    ]
    if (options.source):
        mode = 'single'
        mandParams = ['community', 'verb', 'mdprefix']  # mandatory processing params
        for param in mandParams:
            if not getattr(options, param):
                logger.critical("Processing parameter %s is required in single mode" % param)
                sys.exit(-1)
        # one request entry per job (an example entry is sketched after this function)
        reqlist = [[
            options.community, options.source, options.verb, options.mdprefix,
            options.mdsubset, options.ckan_check, options.handle_check,
            options.target_mdschema
        ]]
    elif (options.list):
        mode = 'multi'
        logger.debug(' |- Joblist: \t%s' % options.list)
        reqlist = parse_list_file(options)
        logger.debug(' |- Requestlist: \t%s' % reqlist)

    ## check job request (processing) options
    logger.debug('|- Command line options')
    for opt in procOptions:
        if hasattr(options, opt):
            logger.debug(' |- %s:\t%s' % (opt.upper(), getattr(options, opt)))

    ## HARVESTING mode:
    if (pstat['status']['h'] == 'tbd'):
        logger.info('\n|- Harvesting started : %s' % time.strftime("%Y-%m-%d %H:%M:%S"))
        HV = Harvester(OUT, pstat, options.outdir, options.fromdate)
        process_harvest(HV, reqlist)

    ## MAPPING mode:
    if (pstat['status']['m'] == 'tbd'):
        logger.info('\n|- Mapping started : %s' % time.strftime("%Y-%m-%d %H:%M:%S"))
        MP = Mapper(OUT, options.outdir, options.fromdate)
        process_map(MP, reqlist)

    ## VALIDATING mode:
    if (pstat['status']['v'] == 'tbd'):
        logger.info(' |- Validating started : %s' % time.strftime("%Y-%m-%d %H:%M:%S"))
        MP = Mapper(OUT, options.outdir, options.fromdate)
        process_validate(MP, reqlist)

    ## OAI-CONVERTING mode:
    if (pstat['status']['o'] == 'tbd'):
        logger.info('\n|- OAI-Converting started : %s' % time.strftime("%Y-%m-%d %H:%M:%S"))
        MP = Mapper(OUT, options.outdir, options.fromdate)
        process_oaiconvert(MP, reqlist)

    ## UPLOADING mode:
    if (pstat['status']['u'] == 'tbd'):
        logger.info('\n|- Uploading started : %s' % time.strftime("%Y-%m-%d %H:%M:%S"))
        # create CKAN object
        CKAN = CKAN_CLIENT(options.iphost, options.auth)

        # create credentials and handle client if required
        if (options.handle_check):
            try:
                cred = PIDClientCredentials.load_from_JSON('credentials_11098')
            except Exception as err:
                logger.critical("%s : Could not create credentials from credstore %s"
                                % (err, options.handle_check))
                sys.exit(-1)
            else:
                logger.debug("Create EUDATHandleClient instance")
                HandleClient = EUDATHandleClient.instantiate_with_credentials(cred)
        else:
            cred = None
            HandleClient = None

        UP = Uploader(CKAN, options.ckan_check, HandleClient, cred, OUT,
                      options.outdir, options.fromdate, options.iphost)
        logger.info(' |- Host: \t%s' % CKAN.ip_host)
        process_upload(UP, reqlist)

    ## DELETING mode:
    if (pstat['status']['d'] == 'tbd'):
        # start the deleting process:
        logger.info('\n|- Deleting started : %s' % time.strftime("%Y-%m-%d %H:%M:%S"))
        if mode == 'multi':
            dir = options.outdir + '/delete'
            if os.path.exists(dir):
                process_delete(OUT, dir, options)
            else:
                logger.error('[ERROR] The directory "%s" does not exist! No files for deletion were found!' % (dir))
        else:
            logger.critical("[CRITICAL] Deleting mode is only supported in 'multi' mode and with an explicitly given deleting script!")
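
# Sketch (assumption, for illustration only): the shape of one request entry in
# 'reqlist' as built by process() in single mode. All field values below are
# hypothetical placeholders.
EXAMPLE_REQUEST = [
    'mycommunity',                       # community
    'https://example.org/oai/provider',  # source (harvesting endpoint)
    'ListRecords',                       # verb
    'oai_dc',                            # mdprefix
    None,                                # mdsubset
    False,                               # ckan_check
    False,                               # handle_check
    None,                                # target_mdschema
]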