def spark_function(spark, **kwargs):

    '''
    Publishes records in Combine, preparing them for OAI server output.

    Args:
        spark (pyspark.sql.session.SparkSession): provided by pyspark context
        kwargs:
            job_id (int): Job ID
            input_job_id (int): ID of input Job whose records will be published

    Returns:
        None
        - creates symlinks on disk from the input job's avro files to the published directory
        - copies records in DB from input job to new published job
        - copies documents in ES from input job index to new published job index
    '''

    # refresh Django DB connection
    refresh_django_db_connection()

    # get job
    job = Job.objects.get(pk=int(kwargs['job_id']))

    # start job_track instance, marking job start
    job_track = JobTrack(job_id=job.id)
    job_track.save()

    # read output from input job, filtering by job_id, grabbing Combine Record schema fields
    input_job = Job.objects.get(pk=int(kwargs['input_job_id']))
    bounds = get_job_db_bounds(input_job)
    sqldf = spark.read.jdbc(
        settings.COMBINE_DATABASE['jdbc_url'],
        'core_record',
        properties=settings.COMBINE_DATABASE,
        column='id',
        lowerBound=bounds['lowerBound'],
        upperBound=bounds['upperBound'],
        numPartitions=settings.JDBC_NUMPARTITIONS)
    records = sqldf.filter(sqldf.job_id == int(kwargs['input_job_id']))

    # repartition
    records = records.repartition(settings.SPARK_REPARTITION)

    # get rows with document content
    records = records[records['document'] != '']

    # update job column, overwriting job_id from input job
    job_id = job.id
    job_id_udf = udf(lambda record_id: job_id, IntegerType())
    records = records.withColumn('job_id', job_id_udf(records.record_id))

    # write job output to avro
    records.select(CombineRecordSchema().field_names).write.format(
        "com.databricks.spark.avro").save(job.job_output)

    # confirm published directory exists
    published_dir = '%s/published' % (
        settings.BINARY_STORAGE.split('file://')[-1].rstrip('/'))
    if not os.path.exists(published_dir):
        os.mkdir(published_dir)

    # symlink avro files from job output into published directory
    job_output_dir = job.job_output.split('file://')[-1]
    avros = [f for f in os.listdir(job_output_dir) if f.endswith('.avro')]
    for avro in avros:
        os.symlink(os.path.join(job_output_dir, avro),
                   os.path.join(published_dir, avro))

    # index records to DB and index to ElasticSearch
    db_records = save_records(
        spark=spark,
        kwargs=kwargs,
        job=job,
        records_df=records,
        write_avro=False,
        index_records=False)

    # copy index from input job to new Publish job
    index_to_job_index = ESIndex.copy_es_index(
        source_index='j%s' % input_job.id,
        target_index='j%s' % job.id,
        wait_for_completion=False)

    # copy index from new Publish job to /published index
    # NOTE: because these are back-to-back reindexes, and requests were timing out,
    # wait on the task from the previous reindex before starting the next one
    es_handle_temp = Elasticsearch(hosts=[settings.ES_HOST])
    retry = 1
    while retry <= 100:

        # get task
        task = es_handle_temp.tasks.get(index_to_job_index['task'])

        # if task complete, index job index to published index
        if task['completed']:
            index_to_published_index = ESIndex.copy_es_index(
                source_index='j%s' % job.id,
                target_index='published',
                wait_for_completion=False,
                add_copied_from=job_id  # do not use Job instance here, only pass the id
            )
            # break from retry loop
            break

        else:
            print("indexing to /published, waiting on task node %s, retry: %s/100" %
                  (task['task']['node'], retry))

            # bump retries, sleep, and continue
            retry += 1
            time.sleep(3)
            continue

    # get PublishedRecords handle
    pr = PublishedRecords()

    # set records from job as published
    pr.set_published_field(job_id)

    # update uniqueness of all published records
    pr.update_published_uniqueness()

    # finally, update finish_timestamp of job_track instance
    job_track.finish_timestamp = datetime.datetime.now()
    job_track.save()
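# A note on get_job_db_bounds(), used above and in the transform functions
# below: it is defined elsewhere in Combine. A minimal sketch of its assumed
# shape, mirroring the min/max record id logic the merge function below
# performs inline (the _sketch suffix marks this as illustrative, not the
# actual implementation):

def get_job_db_bounds_sketch(job):

    '''
    Returns min and max core_record ids for a job, suitable as
    lowerBound / upperBound for a partitioned spark.read.jdbc
    '''

    records = job.get_records().order_by('id')
    return {
        'lowerBound': records.first().id,
        'upperBound': records.last().id
    }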
def spark_function(spark, sc, write_avro=True, **kwargs):

    '''
    Merges records from multiple input jobs, selects non-null, and writes to avro files.

    Args:
        spark (pyspark.sql.session.SparkSession): provided by pyspark context
        sc (pyspark.SparkContext): provided by pyspark context
        write_avro (bool): whether to write merged records to avro files
        kwargs:
            job_id (int): Job ID
            input_jobs_ids (str): string-encoded list of input Job IDs
            index_mapper (str): class name from core.spark.es, extending BaseMapper
            validation_scenarios (list): list of Validation Scenario IDs

    Returns:
        None
        - merges records from previous jobs, writes new aggregated records to avro files on disk
        - indexes records into DB
        - maps / flattens records and indexes to ES
    '''

    # refresh Django DB connection
    refresh_django_db_connection()

    # get job
    job = Job.objects.get(pk=int(kwargs['job_id']))

    # start job_track instance, marking job start
    job_track = JobTrack(job_id=job.id)
    job_track.save()

    # rehydrate list of input jobs
    input_jobs_ids = ast.literal_eval(kwargs['input_jobs_ids'])

    # get total range of ids from input jobs to help partition jdbc reader
    records_ids = []
    for input_job_id in input_jobs_ids:
        input_job_temp = Job.objects.get(pk=int(input_job_id))
        records = input_job_temp.get_records().order_by('id')
        start_id = records.first().id
        end_id = records.last().id
        records_ids += [start_id, end_id]
    records_ids.sort()

    # get list of DataFrames from input jobs
    sqldf = spark.read.jdbc(
        settings.COMBINE_DATABASE['jdbc_url'],
        'core_record',
        properties=settings.COMBINE_DATABASE,
        column='id',
        lowerBound=records_ids[0],
        upperBound=records_ids[-1],
        numPartitions=settings.JDBC_NUMPARTITIONS)
    input_jobs_dfs = []
    for input_job_id in input_jobs_ids:

        # filter DB read by input job id
        job_df = sqldf.filter(sqldf.job_id == int(input_job_id))
        input_jobs_dfs.append(job_df)

    # create aggregate rdd of frames
    agg_rdd = sc.union([df.rdd for df in input_jobs_dfs])
    agg_df = spark.createDataFrame(agg_rdd, schema=input_jobs_dfs[0].schema)

    # repartition
    agg_df = agg_df.repartition(settings.SPARK_REPARTITION)

    # update job column, overwriting job_id from input jobs in merge
    job_id = job.id
    job_id_udf = udf(lambda record_id: job_id, IntegerType())
    agg_df = agg_df.withColumn('job_id', job_id_udf(agg_df.record_id))

    # if Analysis Job, do not write avro
    if job.job_type == 'AnalysisJob':
        write_avro = False

    # index records to DB and index to ElasticSearch
    db_records = save_records(
        spark=spark,
        kwargs=kwargs,
        job=job,
        records_df=agg_df,
        write_avro=write_avro)

    # run record validation scenarios if requested, using db_records from save_records() output
    vs = ValidationScenarioSpark(
        spark=spark,
        job=job,
        records_df=db_records,
        validation_scenarios=ast.literal_eval(kwargs['validation_scenarios']))
    vs.run_record_validation_scenarios()

    # finally, update finish_timestamp of job_track instance
    job_track.finish_timestamp = datetime.datetime.now()
    job_track.save()
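# A note on the merge step above: unioning at the RDD level and rebuilding the
# DataFrame pins the schema explicitly. On Spark 2.0+, an equivalent sketch
# stays at the DataFrame level, assuming all input frames share one schema
# (use unionAll() instead on Spark 1.x):

from functools import reduce
from pyspark.sql import DataFrame

def union_job_dataframes_sketch(input_jobs_dfs):

    '''
    Illustrative alternative: merge a list of same-schema DataFrames
    without the RDD round-trip
    '''

    return reduce(DataFrame.union, input_jobs_dfs)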
def spark_function(spark, **kwargs):

    '''
    Transforms records based on a Transformation Scenario.

    Args:
        spark (pyspark.sql.session.SparkSession): provided by pyspark context
        kwargs:
            job_id (int): Job ID
            input_job_id (int): ID of input Job whose records will be transformed
            transformation_id (str): id of Transformation Scenario
            index_mapper (str): class name from core.spark.es, extending BaseMapper
            validation_scenarios (list): list of Validation Scenario IDs

    Returns:
        None
        - transforms records via XSL, writes new records to avro files on disk
        - indexes records into DB
        - maps / flattens records and indexes to ES
    '''

    # refresh Django DB connection
    refresh_django_db_connection()

    # get job
    job = Job.objects.get(pk=int(kwargs['job_id']))

    # start job_track instance, marking job start
    job_track = JobTrack(job_id=job.id)
    job_track.save()

    # read output from input job, filtering by job_id, grabbing Combine Record schema fields
    input_job = Job.objects.get(pk=int(kwargs['input_job_id']))
    bounds = get_job_db_bounds(input_job)
    sqldf = spark.read.jdbc(
        settings.COMBINE_DATABASE['jdbc_url'],
        'core_record',
        properties=settings.COMBINE_DATABASE,
        column='id',
        lowerBound=bounds['lowerBound'],
        upperBound=bounds['upperBound'],
        numPartitions=settings.JDBC_NUMPARTITIONS)
    records = sqldf.filter(sqldf.job_id == int(kwargs['input_job_id']))

    # repartition
    records = records.repartition(settings.SPARK_REPARTITION)

    # get transformation
    transformation = Transformation.objects.get(
        pk=int(kwargs['transformation_id']))

    # if xslt type transformation
    if transformation.transformation_type == 'xslt':
        records_trans = TransformSpark.transform_xslt(
            spark, kwargs, job, transformation, records)

    # if python type transformation
    elif transformation.transformation_type == 'python':
        records_trans = TransformSpark.transform_python(
            spark, kwargs, job, transformation, records)

    # fail loudly on unrecognized types, rather than hitting a NameError
    # on records_trans below
    else:
        raise Exception('unrecognized transformation type: %s' %
                        transformation.transformation_type)

    # convert back to DataFrame
    records_trans = records_trans.toDF()

    # index records to DB and index to ElasticSearch
    db_records = save_records(
        spark=spark,
        kwargs=kwargs,
        job=job,
        records_df=records_trans)

    # run record validation scenarios if requested, using db_records from save_records() output
    vs = ValidationScenarioSpark(
        spark=spark,
        job=job,
        records_df=db_records,
        validation_scenarios=ast.literal_eval(kwargs['validation_scenarios']))
    vs.run_record_validation_scenarios()

    # finally, update finish_timestamp of job_track instance
    job_track.finish_timestamp = datetime.datetime.now()
    job_track.save()
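# A note on the constant job_id UDF pattern used throughout this section:
# Spark's built-in lit() stamps a constant column without a Python UDF, and
# so avoids per-row Python serialization. A minimal sketch of the equivalent:

from pyspark.sql.functions import lit

def stamp_job_id_sketch(df, job_id):

    '''
    Illustrative alternative to the job_id UDF:
    overwrite the job_id column with a constant literal
    '''

    return df.withColumn('job_id', lit(job_id))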
def spark_function(spark, **kwargs):

    '''
    Harvests static XML records provided by user.

    Expected input structure:
        /foo/bar  <-- self.static_payload
            baz1.xml  <-- record at self.xpath_query within file
            baz2.xml
            baz3.xml

    As a harvest type job, unlike other jobs, this introduces various fields
    to the Record for the first time:
        - record_id
        - job_id
        - oai_set
        - publish_set_id
        - unique (TBD)

    Args:
        spark (pyspark.sql.session.SparkSession): provided by pyspark context
        kwargs:
            job_id (int): Job ID
            static_payload (str): path of static payload on disk
            # TODO: add other kwargs here from static job
            index_mapper (str): class name from core.spark.es, extending BaseMapper
            validation_scenarios (list): list of Validation Scenario IDs

    Returns:
        None:
        - opens and parses static files from payload
        - indexes records into DB
        - maps / flattens records and indexes to ES
    '''

    # refresh Django DB connection
    refresh_django_db_connection()

    # get job
    job = Job.objects.get(pk=int(kwargs['job_id']))

    # start job_track instance, marking job start
    job_track = JobTrack(job_id=job.id)
    job_track.save()

    # read directory of static files
    static_rdd = spark.sparkContext.wholeTextFiles(
        'file://%s' % kwargs['static_payload'],
        minPartitions=settings.SPARK_REPARTITION)

    # parse namespaces
    def get_namespaces(xml_node):
        nsmap = {}
        for ns in xml_node.xpath('//namespace::*'):
            if ns[0]:
                nsmap[ns[0]] = ns[1]
        return nsmap

    def get_metadata_udf(job_id, row, kwargs):

        # get doc string
        doc_string = row[1]

        try:

            # parse with lxml
            xml_root = etree.fromstring(doc_string.encode('utf-8'))

            # get namespaces
            nsmap = get_namespaces(xml_root)

            # get metadata root
            if kwargs['xpath_document_root'] != '':
                meta_root = xml_root.xpath(kwargs['xpath_document_root'],
                                           namespaces=nsmap)
            else:
                meta_root = xml_root.xpath('/*', namespaces=nsmap)
            if len(meta_root) == 1:
                meta_root = meta_root[0]
            elif len(meta_root) > 1:
                raise Exception(
                    'multiple elements found for metadata root xpath: %s' %
                    kwargs['xpath_document_root'])
            elif len(meta_root) == 0:
                raise Exception(
                    'no elements found for metadata root xpath: %s' %
                    kwargs['xpath_document_root'])

            # get unique identifier
            if kwargs['xpath_record_id'] != '':
                record_id = meta_root.xpath(kwargs['xpath_record_id'],
                                            namespaces=nsmap)
                if len(record_id) == 1:
                    record_id = record_id[0].text
                elif len(record_id) > 1:
                    raise AmbiguousIdentifier(
                        'multiple elements found for identifier xpath: %s' %
                        kwargs['xpath_record_id'])
                elif len(record_id) == 0:
                    raise AmbiguousIdentifier(
                        'no elements found for identifier xpath: %s' %
                        kwargs['xpath_record_id'])
            else:
                record_id = hashlib.md5(doc_string.encode('utf-8')).hexdigest()

            # return success Row
            return Row(
                record_id=record_id,
                document=etree.tostring(meta_root).decode('utf-8'),
                error='',
                job_id=int(job_id),
                oai_set='',
                success=1)

        # catch missing or ambiguous identifiers
        except AmbiguousIdentifier as e:

            # hash record string to produce a unique id
            record_id = hashlib.md5(doc_string.encode('utf-8')).hexdigest()

            # return error Row
            return Row(
                record_id=record_id,
                document=etree.tostring(meta_root).decode('utf-8'),
                error=str(e),
                job_id=int(job_id),
                oai_set='',
                success=0)

        # handle all other exceptions
        except Exception as e:

            # hash record string to produce a unique id
            record_id = hashlib.md5(doc_string.encode('utf-8')).hexdigest()

            # return error Row
            return Row(
                record_id=record_id,
                document='',
                error=str(e),
                job_id=int(job_id),
                oai_set='',
                success=0)

    # transform via rdd.map
    job_id = job.id
    records = static_rdd.map(
        lambda row: get_metadata_udf(job_id, row, kwargs))

    # index records to DB and index to ElasticSearch
    db_records = save_records(
        spark=spark,
        kwargs=kwargs,
        job=job,
        records_df=records.toDF())

    # run record validation scenarios if requested, using db_records from save_records() output
    vs = ValidationScenarioSpark(
        spark=spark,
        job=job,
        records_df=db_records,
        validation_scenarios=ast.literal_eval(kwargs['validation_scenarios']))
    vs.run_record_validation_scenarios()

    # remove temporary payload directory if static job was upload-based, not a location on disk
    if kwargs['static_type'] == 'upload':
        shutil.rmtree(kwargs['static_payload'])

    # finally, update finish_timestamp of job_track instance
    job_track.finish_timestamp = datetime.datetime.now()
    job_track.save()
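# AmbiguousIdentifier is raised and caught above but not defined in this
# section; a minimal sketch of the assumed exception class (no behavior
# beyond its name):

class AmbiguousIdentifier(Exception):

    '''
    Raised when xpath_record_id matches zero or multiple elements
    '''

    pass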
def spark_function(spark, **kwargs):

    '''
    Harvests records via OAI.

    As a harvest type job, unlike other jobs, this introduces various fields
    to the Record for the first time:
        - record_id
        - job_id
        - oai_set
        - publish_set_id
        - unique (TBD)

    Args:
        spark (pyspark.sql.session.SparkSession): provided by pyspark context
        kwargs:
            job_id (int): Job ID
            endpoint (str): OAI endpoint
            verb (str): OAI verb used
            metadataPrefix (str): metadataPrefix for OAI harvest
            scope_type (str): [setList, whiteList, blackList, harvestAllSets], used by DPLA Ingestion3
            scope_value (str): value for scope_type
            index_mapper (str): class name from core.spark.es, extending BaseMapper
            validation_scenarios (list): list of Validation Scenario IDs

    Returns:
        None:
        - harvests OAI records and writes to disk as avro files
        - indexes records into DB
        - maps / flattens records and indexes to ES
    '''

    # refresh Django DB connection
    refresh_django_db_connection()

    # get job
    job = Job.objects.get(pk=int(kwargs['job_id']))

    # start job_track instance, marking job start
    job_track = JobTrack(job_id=job.id)
    job_track.save()

    # harvest OAI records via Ingestion3
    df = spark.read.format("dpla.ingestion3.harvesters.oai")\
        .option("endpoint", kwargs['endpoint'])\
        .option("verb", kwargs['verb'])\
        .option("metadataPrefix", kwargs['metadataPrefix'])\
        .option(kwargs['scope_type'], kwargs['scope_value'])\
        .load()

    # select records with content
    records = df.select("record.*").where("record is not null")

    # repartition
    records = records.repartition(settings.SPARK_REPARTITION)

    # attempt to find and select <metadata> element from OAI record, else filter out
    def find_metadata_udf(document):
        if type(document) == str:
            xml_root = etree.fromstring(document)
            m_root = xml_root.find(
                '{http://www.openarchives.org/OAI/2.0/}metadata')
            if m_root is not None:
                # expecting only one child to <metadata> element
                m_children = m_root.getchildren()
                if len(m_children) == 1:
                    m_child = m_children[0]
                    m_string = etree.tostring(m_child).decode('utf-8')
                    return m_string
                else:
                    return 'none'
            else:
                return 'none'
        else:
            return 'none'

    metadata_udf = udf(lambda col_val: find_metadata_udf(col_val), StringType())
    records = records.select(*[
        metadata_udf(col).alias('document') if col == 'document' else col
        for col in records.columns
    ])
    records = records.filter(records.document != 'none')

    # establish 'success' column, setting all success for Harvest
    records = records.withColumn('success', pyspark_sql_functions.lit(1))

    # copy 'id' from OAI harvest to 'record_id' column
    records = records.withColumn('record_id', records.id)

    # add job_id as column
    job_id = job.id
    job_id_udf = udf(lambda id: job_id, IntegerType())
    records = records.withColumn('job_id', job_id_udf(records.id))

    # add oai_set
    records = records.withColumn('oai_set', records.setIds[0])

    # add blank error column
    error = udf(lambda id: '', StringType())
    records = records.withColumn('error', error(records.id))

    # index records to DB and index to ElasticSearch
    db_records = save_records(
        spark=spark,
        kwargs=kwargs,
        job=job,
        records_df=records)

    # run record validation scenarios if requested, using db_records from save_records() output
    vs = ValidationScenarioSpark(
        spark=spark,
        job=job,
        records_df=db_records,
        validation_scenarios=ast.literal_eval(kwargs['validation_scenarios']))
    vs.run_record_validation_scenarios()

    # finally, update finish_timestamp of job_track instance
    job_track.finish_timestamp = datetime.datetime.now()
    job_track.save()
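# A sketch of the kwargs this OAI harvest function expects, with illustrative
# values only: the endpoint, set names, and mapper name below are hypothetical
# ('ListRecords' is the standard OAI-PMH verb for record harvests):

example_oai_kwargs_sketch = {
    'job_id': '42',                        # hypothetical Job ID
    'endpoint': 'http://example.org/oai',  # hypothetical OAI-PMH endpoint
    'verb': 'ListRecords',
    'metadataPrefix': 'oai_dc',
    'scope_type': 'setList',               # or whiteList, blackList, harvestAllSets
    'scope_value': 'set1,set2',            # hypothetical set names
    'index_mapper': 'GenericMapper',       # assumed class name from core.spark.es
    'validation_scenarios': '[]',          # string-encoded list, per ast.literal_eval() above
}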
def spark_function(spark, **kwargs):

    '''
    Transforms records via XSLT, selects non-null, and writes to avro files.

    Args:
        spark (pyspark.sql.session.SparkSession): provided by pyspark context
        kwargs:
            job_id (int): Job ID
            input_job_id (int): ID of input Job whose records will be transformed
            transformation_id (str): id of Transformation Scenario, used to locate XSL file for transformation
            index_mapper (str): class name from core.spark.es, extending BaseMapper
            validation_scenarios (list): list of Validation Scenario IDs

    Returns:
        None
        - transforms records via XSL, writes new records to avro files on disk
        - indexes records into DB
        - maps / flattens records and indexes to ES
    '''

    # get job
    job = Job.objects.get(pk=int(kwargs['job_id']))

    # start job_track instance, marking job start
    job_track = JobTrack(job_id=job.id)
    job_track.save()

    # read output from input job, filtering by job_id, grabbing Combine Record schema fields
    input_job = Job.objects.get(pk=int(kwargs['input_job_id']))
    bounds = get_job_db_bounds(input_job)
    sqldf = spark.read.jdbc(
        settings.COMBINE_DATABASE['jdbc_url'],
        'core_record',
        properties=settings.COMBINE_DATABASE,
        column='id',
        lowerBound=bounds['lowerBound'],
        upperBound=bounds['upperBound'],
        numPartitions=settings.JDBC_NUMPARTITIONS)
    records = sqldf.filter(sqldf.job_id == int(kwargs['input_job_id']))

    # repartition
    records = records.repartition(settings.SPARK_REPARTITION)

    # get transformation
    transformation = Transformation.objects.get(
        pk=int(kwargs['transformation_id']))

    # if xslt type transformation
    if transformation.transformation_type == 'xslt':

        # define function for transformation, applied via rdd.map
        def transform_xml_udf(job_id, row, xslt_string):

            # attempt transformation and save output to 'document'
            try:

                # transform with pyjxslt gateway
                gw = pyjxslt.Gateway(6767)
                gw.add_transform('xslt_transform', xslt_string)
                result = gw.transform('xslt_transform', row.document)
                gw.drop_transform('xslt_transform')

                # set trans_result tuple
                trans_result = (result, '', 1)

            # catch transformation exception and save exception to 'error'
            except Exception as e:

                # set trans_result tuple
                trans_result = ('', str(e), 0)

            # return Row
            return Row(
                record_id=row.record_id,
                document=trans_result[0],
                error=trans_result[1],
                job_id=int(job_id),
                oai_set=row.oai_set,
                success=trans_result[2])

        # open XSLT transformation, pass to map as string
        with open(transformation.filepath, 'r') as f:
            xslt_string = f.read()

        # transform via rdd.map
        job_id = job.id
        records_trans = records.rdd.map(
            lambda row: transform_xml_udf(job_id, row, xslt_string))

    # back to DataFrame
    records_trans = records_trans.toDF()

    # index records to DB and index to ElasticSearch
    db_records = save_records(
        spark=spark,
        kwargs=kwargs,
        job=job,
        records_df=records_trans)

    # run record validation scenarios if requested, using db_records from save_records() output
    vs = ValidationScenarioSpark(
        spark=spark,
        job=job,
        records_df=db_records,
        validation_scenarios=ast.literal_eval(kwargs['validation_scenarios']))
    vs.run_record_validation_scenarios()

    # finally, update finish_timestamp of job_track instance
    job_track.finish_timestamp = datetime.datetime.now()
    job_track.save()
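# A note on the XSLT transform above: it creates a pyjxslt Gateway and
# registers / drops the stylesheet once per record. A sketch of a
# mapPartitions variant that reuses one Gateway per partition instead, using
# the same pyjxslt calls and Row fields as above (the _sketch suffix marks
# this as illustrative, not the actual implementation):

def transform_partition_sketch(rows, xslt_string, job_id):

    # one Gateway and one registered stylesheet per partition
    gw = pyjxslt.Gateway(6767)
    gw.add_transform('xslt_transform', xslt_string)

    for row in rows:
        try:
            result = gw.transform('xslt_transform', row.document)
            trans_result = (result, '', 1)
        except Exception as e:
            trans_result = ('', str(e), 0)
        yield Row(
            record_id=row.record_id,
            document=trans_result[0],
            error=trans_result[1],
            job_id=int(job_id),
            oai_set=row.oai_set,
            success=trans_result[2])

    # deregister stylesheet after partition is exhausted
    gw.drop_transform('xslt_transform')

# usage sketch:
# records_trans = records.rdd.mapPartitions(
#     lambda rows: transform_partition_sketch(rows, xslt_string, job_id))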