def __init__(self, args):

    # read args, route verb to verb handler
    self.verb_routes = {
        'GetRecord': self._GetRecord,
        'Identify': self._Identify,
        'ListIdentifiers': self._ListIdentifiers,
        'ListMetadataFormats': self._ListMetadataFormats,
        'ListRecords': self._ListRecords,
        'ListSets': self._ListSets
    }

    # debug
    logger.debug(args)

    self.args = args.copy()
    self.request_timestamp = datetime.datetime.now()
    self.request_timestamp_string = self.request_timestamp.strftime('%Y-%m-%dT%H:%M:%SZ')
    self.record_nodes = []

    # published dataframe slice parameters
    self.start = 0
    self.chunk_size = settings.OAI_RESPONSE_SIZE
    self.publish_set_id = None
    if 'set' in self.args.keys():
        self.publish_set_id = self.args['set']
    else:
        self.publish_set_id = None

    # get instance of Published model
    self.published = models.PublishedRecords()

    # begin scaffolding
    self.scaffold()
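# Illustrative sketch only (not part of the original class): how the verb_routes mapping
# built in __init__ is presumably consumed. The method name _route_verb and its error
# handling are hypothetical, shown just to make the OAI-PMH dispatch pattern explicit.
def _route_verb(self):

    # 'verb' is expected among the request args copied in __init__
    verb = self.args.get('verb')

    if verb in self.verb_routes:
        # call the matched handler, e.g. self._ListRecords()
        self.verb_routes[verb]()
    else:
        # assumed error path for a missing or unsupported verb
        logger.debug('OAI verb not recognized: %s', verb)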
def job_publish(ct_id):

    # get CombineBackgroundTask (ct)
    try:
        ct = models.CombineBackgroundTask.objects.get(pk=int(ct_id))
        LOGGER.info('using %s', ct)

        # get CombineJob
        cjob = models.CombineJob.get_combine_job(int(ct.task_params['job_id']))

        # publish job
        publish_results = cjob.job.publish(
            publish_set_id=ct.task_params['publish_set_id'])

        # remove from published subsets
        cjob.job.remove_from_published_precounts()

        # add publish_set_id to published subsets if present, and remove precount
        # TODO: re-evaluate subset hierarchy -- if the Org or Record Group exists in any
        # published subset, re-evaluate that list of Jobs
        for published_subset in ct.task_params['in_published_subsets']:
            LOGGER.debug('adding publish_set_id to Published Subset: %s', published_subset)
            pr = models.PublishedRecords(subset=published_subset)
            pr.add_publish_set_id_to_subset(
                publish_set_id=ct.task_params['publish_set_id'])

        # save export output to Combine Task output
        ct.refresh_from_db()
        ct.task_output_json = json.dumps({
            'job_id': ct.task_params['job_id'],
            'publish_results': publish_results
        })
        ct.save()

    except Exception as e:

        LOGGER.info(str(e))

        # attempt to capture error and return for task
        ct.task_output_json = json.dumps({'error': str(e)})
        ct.save()
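# Illustrative only: based on the keys read above, ct.task_params for a publish task is
# assumed to look roughly like the following (all values hypothetical):
#
#   {
#       'job_id': 42,
#       'publish_set_id': 'my_publish_set',
#       'in_published_subsets': ['subset_a', 'subset_b']
#   }
#
# job_publish() publishes the Job under 'publish_set_id', then registers that
# publish_set_id with each named published subset.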
def export_documents(ct_id):

    '''
    - submit livy job and poll until complete
        - use livy session from cjob (works, but awkward way to get this)
    - add wrapper element to file parts
    - rename file parts
    - tar/zip together
    '''

    # get CombineBackgroundTask
    ct = models.CombineBackgroundTask.objects.get(pk=int(ct_id))
    logger.info('using %s' % ct)

    # generate spark code
    output_path = '/tmp/%s' % str(uuid.uuid4())

    # handle single Job
    if 'job_id' in ct.task_params.keys():

        # get CombineJob
        cjob = models.CombineJob.get_combine_job(int(ct.task_params['job_id']))

        # set archive filename of loose XML files
        archive_filename_root = 'j_%s_documents' % cjob.job.id

        # build job_dictionary
        job_dict = {'j%s' % cjob.job.id: [cjob.job.id]}
        logger.info(job_dict)

    # handle published records
    if 'published' in ct.task_params.keys():

        # set archive filename of loose XML files
        archive_filename_root = 'published_documents'

        # get anonymous CombineJob
        cjob = models.CombineJob()

        # get published records to determine sets
        pr = models.PublishedRecords(subset=ct.task_params['subset'])

        # init job dictionary
        job_dict = {}

        # handle published jobs with publish set ids
        for publish_id, jobs in pr.sets.items():
            job_dict[publish_id] = [job.id for job in jobs]

        # handle "loose" Jobs
        job_dict['no_publish_set_id'] = [job.id for job in pr.published_jobs.filter(publish_set_id='')]

        # debug
        logger.info(job_dict)

    # update task params
    ct.refresh_from_db()
    ct.update_task_params({
        'output_path': output_path,
        'archive_filename_root': archive_filename_root,
        'job_dict': job_dict
    })

    # prepare spark code
    spark_code = "import math,uuid\nfrom console import *\nexport_records_as_xml(spark, %d)" % (int(ct_id))
    logger.info(spark_code)

    try:

        # check for livy session
        _check_livy_session()

        # submit to livy
        logger.info('submitting code to Spark')
        submit = models.LivyClient().submit_job(cjob.livy_session.session_id, {'code': spark_code})

        # poll until complete
        logger.info('polling for Spark job to complete...')
        results = polling.poll(
            lambda: models.LivyClient().job_status(submit.headers['Location']).json(),
            check_success=spark_job_done,
            step=5,
            poll_forever=True)
        logger.info(results)

        # handle s3 bucket
        if ct.task_params.get('s3_export', False):

            if ct.task_params.get('s3_export_type') == 'archive':

                logger.debug('writing archive file to S3')

                # create single archive file
                ct = _create_export_documents_archive(ct)

                # upload to s3
                s3 = boto3.resource('s3',
                    aws_access_key_id=settings.AWS_ACCESS_KEY_ID,
                    aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY)
                s3.Object(ct.task_params['s3_bucket'], ct.task_params['s3_key'])\
                    .put(Body=open(ct.task_params['export_output_archive'], 'rb'))

                # delete all traces from local output
                shutil.rmtree(ct.task_params['output_path'])

            elif ct.task_params.get('s3_export_type') == 'spark_df':
                logger.debug('s3 export type was spark_df, nothing to cleanup or do')

            # save export output to Combine Task output
            ct.refresh_from_db()
            ct.task_output_json = json.dumps({
                's3_export_type': ct.task_params['s3_export_type'],
                'export_output': 's3://%s/%s' % (ct.task_params['s3_bucket'], ct.task_params['s3_key'].lstrip('/')),
            })
            ct.save()
            logger.info(ct.task_output_json)

        # handle local filesystem
        else:

            # create single archive file
            ct = _create_export_documents_archive(ct)

            # save export output to Combine Task output
            ct.refresh_from_db()
            ct.task_output_json = json.dumps({
                'export_output': ct.task_params['export_output_archive'],
                'name': ct.task_params['export_output_archive'].split('/')[-1],
                'content_type': ct.task_params['content_type'],
                'export_dir': "/".join(ct.task_params['export_output_archive'].split('/')[:-1])
            })
            ct.save()
            logger.info(ct.task_output_json)

    except Exception as e:

        logger.info(str(e))

        # attempt to capture error and return for task
        ct.task_output_json = json.dumps({'error': str(e)})
        ct.save()
def export_mapped_fields(ct_id):

    # get CombineBackgroundTask (ct)
    ct = models.CombineBackgroundTask.objects.get(pk=int(ct_id))

    try:

        # JSON export
        if ct.task_params['mapped_fields_export_type'] == 'json':

            # handle single Job
            if 'job_id' in ct.task_params.keys():

                # get CombineJob
                cjob = models.CombineJob.get_combine_job(int(ct.task_params['job_id']))

                # set output filename
                output_path = '/tmp/%s' % uuid.uuid4().hex
                os.mkdir(output_path)
                export_output = '%s/job_%s_mapped_fields.json' % (output_path, cjob.job.id)

                # build command list
                cmd = [
                    "elasticdump",
                    "--input=http://%s:9200/j%s" % (settings.ES_HOST, cjob.job.id),
                    "--output=%s" % export_output,
                    "--type=data",
                    "--sourceOnly",
                    "--ignore-errors",
                    "--noRefresh"
                ]

            # handle published records
            if 'published' in ct.task_params.keys():

                # set output filename
                output_path = '/tmp/%s' % uuid.uuid4().hex
                os.mkdir(output_path)
                export_output = '%s/published_mapped_fields.json' % (output_path)

                # get list of Job ES indices to export
                pr = models.PublishedRecords(subset=ct.task_params['subset'])
                es_list = ','.join(['j%s' % job.id for job in pr.published_jobs])

                # build command list
                cmd = [
                    "elasticdump",
                    "--input=http://%s:9200/%s" % (settings.ES_HOST, es_list),
                    "--output=%s" % export_output,
                    "--type=data",
                    "--sourceOnly",
                    "--ignore-errors",
                    "--noRefresh"
                ]

            # if fields provided, limit
            if ct.task_params['mapped_field_include']:
                logger.info('specific fields selected, adding to elasticdump command:')
                searchBody = {
                    "_source": ct.task_params['mapped_field_include']
                }
                cmd.append("--searchBody='%s'" % json.dumps(searchBody))

        # CSV export
        if ct.task_params['mapped_fields_export_type'] == 'csv':

            # handle single Job
            if 'job_id' in ct.task_params.keys():

                # get CombineJob
                cjob = models.CombineJob.get_combine_job(int(ct.task_params['job_id']))

                # set output filename
                output_path = '/tmp/%s' % uuid.uuid4().hex
                os.mkdir(output_path)
                export_output = '%s/job_%s_mapped_fields.csv' % (output_path, cjob.job.id)

                # build command list
                cmd = [
                    "es2csv",
                    "-u http://%s:9200" % settings.ES_HOST,
                    "-q '*'",
                    "-i 'j%s'" % cjob.job.id,
                    "-D 'record'",
                    "-o '%s'" % export_output
                ]

            # handle published records
            if 'published' in ct.task_params.keys():

                # set output filename
                output_path = '/tmp/%s' % uuid.uuid4().hex
                os.mkdir(output_path)
                export_output = '%s/published_mapped_fields.csv' % (output_path)

                # get list of Job ES indices to export
                pr = models.PublishedRecords(subset=ct.task_params['subset'])
                es_list = ','.join(['j%s' % job.id for job in pr.published_jobs])

                # build command list
                cmd = [
                    "es2csv",
                    "-u http://%s:9200" % settings.ES_HOST,
                    "-q '*'",
                    "-i '%s'" % es_list,
                    "-D 'record'",
                    "-o '%s'" % export_output
                ]

            # handle kibana style
            if ct.task_params['kibana_style']:
                cmd.append('-k')
                cmd.append("-kd '|'")

            # if fields provided, limit
            if ct.task_params['mapped_field_include']:
                logger.info('specific fields selected, adding to es2csv command:')
                cmd.append('-f ' + " ".join(["'%s'" % field for field in ct.task_params['mapped_field_include']]))

        # execute compiled command
        logger.info(cmd)
        os.system(" ".join(cmd))

        # handle compression
        if ct.task_params['archive_type'] == 'none':
            logger.info('uncompressed csv file requested, continuing')

        # zip
        elif ct.task_params['archive_type'] == 'zip':

            logger.info('creating compressed zip archive')
            content_type = 'application/zip'

            # establish output archive file
            export_output_archive = '%s/%s.zip' % (output_path, export_output.split('/')[-1])

            with zipfile.ZipFile(export_output_archive, 'w', zipfile.ZIP_DEFLATED) as zip:
                zip.write(export_output, export_output.split('/')[-1])

            # set export output to archive file
            export_output = export_output_archive

        # tar.gz
        elif ct.task_params['archive_type'] == 'targz':

            logger.info('creating compressed tar archive')
            content_type = 'application/gzip'

            # establish output archive file
            export_output_archive = '%s/%s.tar.gz' % (output_path, export_output.split('/')[-1])

            with tarfile.open(export_output_archive, 'w:gz') as tar:
                tar.add(export_output, arcname=export_output.split('/')[-1])

            # set export output to archive file
            export_output = export_output_archive

        # handle s3 bucket
        if ct.task_params.get('s3_export', False):

            logger.debug('writing archive file to S3')

            # upload to s3
            s3 = boto3.resource('s3',
                aws_access_key_id=settings.AWS_ACCESS_KEY_ID,
                aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY)
            s3.Object(ct.task_params['s3_bucket'], ct.task_params['s3_key'])\
                .put(Body=open(export_output, 'rb'))

            # delete all traces from local output
            shutil.rmtree(output_path)

            # save export output to Combine Task output
            ct.refresh_from_db()
            ct.task_output_json = json.dumps({
                's3_export_type': ct.task_params['s3_export_type'],
                'export_output': 's3://%s/%s' % (ct.task_params['s3_bucket'], ct.task_params['s3_key'].lstrip('/')),
            })
            ct.save()
            logger.info(ct.task_output_json)

        # handle local filesystem
        else:

            # save export output to Combine Task output
            ct.refresh_from_db()
            ct.task_output_json = json.dumps({
                'export_output': export_output,
                'name': export_output.split('/')[-1],
                'export_dir': "/".join(export_output.split('/')[:-1])
            })
            ct.save()

    except Exception as e:

        logger.info(str(e))

        # attempt to capture error and return for task
        ct.task_output_json = json.dumps({'error': str(e)})
        ct.save()
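# Illustrative only: for a single-Job CSV export, the cmd list assembled above joins into
# a shell command along these lines (job id, host, and tmp path are hypothetical):
#
#   es2csv -u http://localhost:9200 -q '*' -i 'j42' -D 'record' \
#       -o '/tmp/<uuid>/job_42_mapped_fields.csv'
#
# with "-k -kd '|'" appended when kibana_style is set, and '-f' field arguments appended
# when 'mapped_field_include' limits the exported fields.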