Example #1
    def spark_function(spark, **kwargs):
        '''
		Publish records in Combine, preparing for OAI server output

		Args:
			spark (pyspark.sql.session.SparkSession): provided by pyspark context
			kwargs:
				job_id (int): Job ID
				input_job_id (int): ID of input job whose records will be published

		Returns:
			None
			- creates symlinks on disk from the new job's avro files into the published directory
			- copies records in DB from input job to new published job
			- copies documents in ES from input to new published job index
		'''

        # refresh Django DB Connection
        refresh_django_db_connection()

        # get job
        job = Job.objects.get(pk=int(kwargs['job_id']))

        # start job_track instance, marking job start
        job_track = JobTrack(job_id=job.id)
        job_track.save()

        # read output from input job, filtering by job_id, grabbing Combine Record schema fields
        input_job = Job.objects.get(pk=int(kwargs['input_job_id']))
        bounds = get_job_db_bounds(input_job)
        sqldf = spark.read.jdbc(settings.COMBINE_DATABASE['jdbc_url'],
                                'core_record',
                                properties=settings.COMBINE_DATABASE,
                                column='id',
                                lowerBound=bounds['lowerBound'],
                                upperBound=bounds['upperBound'],
                                numPartitions=settings.JDBC_NUMPARTITIONS)
        records = sqldf.filter(sqldf.job_id == int(kwargs['input_job_id']))

        # repartition
        records = records.repartition(settings.SPARK_REPARTITION)

        # get rows with document content
        records = records[records['document'] != '']

        # update job column, overwriting job_id from input jobs in merge
        job_id = job.id
        job_id_udf = udf(lambda record_id: job_id, IntegerType())
        records = records.withColumn('job_id', job_id_udf(records.record_id))

        # write job output to avro
        records.select(CombineRecordSchema().field_names).write.format(
            "com.databricks.spark.avro").save(job.job_output)

        # confirm directory exists
        published_dir = '%s/published' % (
            settings.BINARY_STORAGE.split('file://')[-1].rstrip('/'))
        if not os.path.exists(published_dir):
            os.mkdir(published_dir)

        # get avro files
        job_output_dir = job.job_output.split('file://')[-1]
        avros = [f for f in os.listdir(job_output_dir) if f.endswith('.avro')]
        for avro in avros:
            os.symlink(os.path.join(job_output_dir, avro),
                       os.path.join(published_dir, avro))

        # index records to DB and index to ElasticSearch
        db_records = save_records(spark=spark,
                                  kwargs=kwargs,
                                  job=job,
                                  records_df=records,
                                  write_avro=False,
                                  index_records=False)

        # copy index from input job to new Publish job
        index_to_job_index = ESIndex.copy_es_index(
            source_index='j%s' % input_job.id,
            target_index='j%s' % job.id,
            wait_for_completion=False)

        # copy index from new Publish Job to /published index
        # NOTE: because of back-to-back reindexes and request timeout problems,
        # wait on the task from the previous reindex before starting the next
        es_handle_temp = Elasticsearch(hosts=[settings.ES_HOST])
        retry = 1
        while retry <= 100:

            # get task
            task = es_handle_temp.tasks.get(index_to_job_index['task'])

            # if task complete, index job index to published index
            if task['completed']:
                index_to_published_index = ESIndex.copy_es_index(
                    source_index='j%s' % job.id,
                    target_index='published',
                    wait_for_completion=False,
                    # do not pass the Job instance here, only the job_id
                    add_copied_from=job_id)
                break  # break from retry loop

            else:
                print(
                    "indexing to /published, waiting on task node %s, retry: %s/10"
                    % (task['task']['node'], retry))

                # bump retries, sleep, and continue
                retry += 1
                time.sleep(3)
                continue

        # get PublishedRecords handle
        pr = PublishedRecords()

        # set records from job as published
        pr.set_published_field(job_id)

        # update uniqueness of all published records
        pr.update_published_uniqueness()

        # finally, update finish_timestamp of job_track instance
        job_track.finish_timestamp = datetime.datetime.now()
        job_track.save()
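
The reindex-and-wait step above can be read on its own: kick off an asynchronous reindex, then poll the Elasticsearch tasks API until it completes. Below is a minimal sketch of that pattern using the elasticsearch-py client directly; the host and index names ('j42', 'published') are placeholders, and ESIndex.copy_es_index is assumed to wrap a call of roughly this shape.

    # Sketch only: async reindex followed by polling the tasks API.
    # Host and index names are illustrative placeholders.
    import time

    from elasticsearch import Elasticsearch

    es = Elasticsearch(hosts=['localhost'])

    # start the reindex without blocking; the response carries a task id
    resp = es.reindex(
        body={'source': {'index': 'j42'}, 'dest': {'index': 'published'}},
        wait_for_completion=False)

    # poll the tasks API until the reindex reports completion
    while True:
        task = es.tasks.get(task_id=resp['task'])
        if task['completed']:
            break
        time.sleep(3)
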
Example #2
    def spark_function(spark, sc, write_avro=True, **kwargs):
        '''
		Merge records from multiple input jobs and write to avro files

		Args:
			spark (pyspark.sql.session.SparkSession): provided by pyspark context
			kwargs:
				job_id (int): Job ID
				input_jobs_ids (list): list of input Job IDs whose records will be merged
				index_mapper (str): class name from core.spark.es, extending BaseMapper
				validation_scenarios (list): list of Validation Scenario IDs

		Returns:
			None
			- merges records from previous jobs, writes new aggregated records to avro files on disk
			- indexes records into DB
			- map / flatten records and indexes to ES
		'''

        # refresh Django DB Connection
        refresh_django_db_connection()

        # get job
        job = Job.objects.get(pk=int(kwargs['job_id']))

        # start job_track instance, marking job start
        job_track = JobTrack(job_id=job.id)
        job_track.save()

        # rehydrate list of input jobs
        input_jobs_ids = ast.literal_eval(kwargs['input_jobs_ids'])

        # get total range of ids from input jobs to help partition jdbc reader
        records_ids = []
        for input_job_id in input_jobs_ids:
            input_job_temp = Job.objects.get(pk=int(input_job_id))
            records = input_job_temp.get_records().order_by('id')
            start_id = records.first().id
            end_id = records.last().id
            records_ids += [start_id, end_id]
        records_ids.sort()

        # get list of RDDs from input jobs
        sqldf = spark.read.jdbc(settings.COMBINE_DATABASE['jdbc_url'],
                                'core_record',
                                properties=settings.COMBINE_DATABASE,
                                column='id',
                                lowerBound=records_ids[0],
                                upperBound=records_ids[-1],
                                numPartitions=settings.JDBC_NUMPARTITIONS)

        input_jobs_dfs = []
        for input_job_id in input_jobs_ids:

            # db
            job_df = sqldf.filter(sqldf.job_id == int(input_job_id))
            input_jobs_dfs.append(job_df)

        # create aggregate rdd of frames
        agg_rdd = sc.union([df.rdd for df in input_jobs_dfs])
        agg_df = spark.createDataFrame(agg_rdd,
                                       schema=input_jobs_dfs[0].schema)

        # repartition
        agg_df = agg_df.repartition(settings.SPARK_REPARTITION)

        # update job column, overwriting job_id from input jobs in merge
        job_id = job.id
        job_id_udf = udf(lambda record_id: job_id, IntegerType())
        agg_df = agg_df.withColumn('job_id', job_id_udf(agg_df.record_id))

        # if Analysis Job, do not write avro
        if job.job_type == 'AnalysisJob':
            write_avro = False

        # index records to DB and index to ElasticSearch
        db_records = save_records(spark=spark,
                                  kwargs=kwargs,
                                  job=job,
                                  records_df=agg_df,
                                  write_avro=write_avro)

        # run record validation scenarios if requested, using db_records from save_records() output
        vs = ValidationScenarioSpark(spark=spark,
                                     job=job,
                                     records_df=db_records,
                                     validation_scenarios=ast.literal_eval(
                                         kwargs['validation_scenarios']))
        vs.run_record_validation_scenarios()

        # finally, update finish_timestamp of job_track instance
        job_track.finish_timestamp = datetime.datetime.now()
        job_track.save()
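
The merge step above unions several per-job DataFrames that share the Combine record schema and then overwrites job_id for the new job. A minimal standalone sketch of that idea on small in-memory DataFrames, with a made-up new job id (99):

    # Sketch only: union DataFrames with a shared schema, then stamp a new job_id.
    from functools import reduce

    from pyspark.sql import DataFrame, SparkSession
    from pyspark.sql.functions import lit

    spark = SparkSession.builder.appName('merge_sketch').getOrCreate()

    df1 = spark.createDataFrame([('rec1', '<doc/>', 1)],
                                ['record_id', 'document', 'job_id'])
    df2 = spark.createDataFrame([('rec2', '<doc/>', 2)],
                                ['record_id', 'document', 'job_id'])

    # DataFrame.union plays the same role as sc.union over the underlying RDDs
    # when every input DataFrame has the same schema
    agg_df = reduce(DataFrame.union, [df1, df2])

    # overwrite job_id so all merged records belong to the new job (id 99 here)
    agg_df = agg_df.withColumn('job_id', lit(99))
    agg_df.show()
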
Example #3
    def spark_function(spark, **kwargs):
        '''
		Transform records based on Transformation Scenario.

		Args:
			spark (pyspark.sql.session.SparkSession): provided by pyspark context
			kwargs:
				job_id (int): Job ID
				input_job_id (int): ID of input job whose records will be transformed
				transformation_id (str): id of Transformation Scenario
				index_mapper (str): class name from core.spark.es, extending BaseMapper
				validation_scenarios (list): list of Validation Scenario IDs

		Returns:
			None
			- transforms records via XSL, writes new records to avro files on disk
			- indexes records into DB
			- map / flatten records and indexes to ES
		'''

        # refresh Django DB Connection
        refresh_django_db_connection()

        # get job
        job = Job.objects.get(pk=int(kwargs['job_id']))

        # start job_track instance, marking job start
        job_track = JobTrack(job_id=job.id)
        job_track.save()

        # read output from input job, filtering by job_id, grabbing Combine Record schema fields
        input_job = Job.objects.get(pk=int(kwargs['input_job_id']))
        bounds = get_job_db_bounds(input_job)
        sqldf = spark.read.jdbc(settings.COMBINE_DATABASE['jdbc_url'],
                                'core_record',
                                properties=settings.COMBINE_DATABASE,
                                column='id',
                                lowerBound=bounds['lowerBound'],
                                upperBound=bounds['upperBound'],
                                numPartitions=settings.JDBC_NUMPARTITIONS)
        records = sqldf.filter(sqldf.job_id == int(kwargs['input_job_id']))

        # repartition
        records = records.repartition(settings.SPARK_REPARTITION)

        # get transformation
        transformation = Transformation.objects.get(
            pk=int(kwargs['transformation_id']))

        # if xslt type transformation
        if transformation.transformation_type == 'xslt':
            records_trans = TransformSpark.transform_xslt(
                spark, kwargs, job, transformation, records)

        # if python type transformation
        if transformation.transformation_type == 'python':
            records_trans = TransformSpark.transform_python(
                spark, kwargs, job, transformation, records)

        # convert back to DataFrame
        records_trans = records_trans.toDF()

        # index records to DB and index to ElasticSearch
        db_records = save_records(spark=spark,
                                  kwargs=kwargs,
                                  job=job,
                                  records_df=records_trans)

        # run record validation scenarios if requested, using db_records from save_records() output
        vs = ValidationScenarioSpark(spark=spark,
                                     job=job,
                                     records_df=db_records,
                                     validation_scenarios=ast.literal_eval(
                                         kwargs['validation_scenarios']))
        vs.run_record_validation_scenarios()

        # finally, update finish_timestamp of job_track instance
        job_track.finish_timestamp = datetime.datetime.now()
        job_track.save()
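
Several of these jobs read their input records with a partitioned JDBC read. A minimal sketch of that call with placeholder connection details; note that lowerBound and upperBound only control how the id column is split across numPartitions parallel queries, they do not filter rows, which is why the job_id filter still follows the read.

    # Sketch only: partitioned JDBC read with placeholder connection settings.
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName('jdbc_sketch').getOrCreate()

    sqldf = spark.read.jdbc(
        url='jdbc:mysql://localhost:3306/combine',       # placeholder jdbc_url
        table='core_record',
        properties={'user': 'combine_user',              # placeholder credentials
                    'password': 'secret',
                    'driver': 'com.mysql.jdbc.Driver'},
        column='id',
        lowerBound=1,          # smallest id expected in the table slice
        upperBound=1000000,    # largest id expected in the table slice
        numPartitions=10)      # number of parallel JDBC queries

    # partitioning bounds do not filter, so the job filter is still applied
    records = sqldf.filter(sqldf.job_id == 42)
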
Example #4
    def spark_function(spark, **kwargs):
        '''
		Harvest static XML records provided by user.

		Expected input structure:
			/foo/bar <-- self.static_payload
				baz1.xml <-- record at self.xpath_query within file
				baz2.xml
				baz3.xml

		As a harvest type job, unlike other jobs, this introduces various fields to the Record for the first time:
			- record_id 
			- job_id
			- oai_set
			- publish_set_id
			- unique (TBD)

		Args:
			spark (pyspark.sql.session.SparkSession): provided by pyspark context
			kwargs:
				job_id (int): Job ID
				static_payload (str): path of static payload on disk
				# TODO: add other kwargs here from static job
				index_mapper (str): class name from core.spark.es, extending BaseMapper
				validation_scenarios (list): list of Validation Scenario IDs

		Returns:
			None:
			- opens and parses static files from payload
			- indexes records into DB
			- map / flatten records and indexes to ES
		'''

        # refresh Django DB Connection
        refresh_django_db_connection()

        # get job
        job = Job.objects.get(pk=int(kwargs['job_id']))

        # start job_track instance, marking job start
        job_track = JobTrack(job_id=job.id)
        job_track.save()

        # read directory of static files
        static_rdd = spark.sparkContext.wholeTextFiles(
            'file://%s' % kwargs['static_payload'],
            minPartitions=settings.SPARK_REPARTITION)

        # parse namespaces
        def get_namespaces(xml_node):
            nsmap = {}
            for ns in xml_node.xpath('//namespace::*'):
                if ns[0]:
                    nsmap[ns[0]] = ns[1]
            return nsmap

        def get_metadata_udf(job_id, row, kwargs):

            # get doc string
            doc_string = row[1]

            try:

                # parse with lxml
                xml_root = etree.fromstring(doc_string.encode('utf-8'))

                # get namespaces
                nsmap = get_namespaces(xml_root)

                # get metadata root
                if kwargs['xpath_document_root'] != '':
                    meta_root = xml_root.xpath(kwargs['xpath_document_root'],
                                               namespaces=nsmap)
                else:
                    meta_root = xml_root.xpath('/*', namespaces=nsmap)
                if len(meta_root) == 1:
                    meta_root = meta_root[0]
                elif len(meta_root) > 1:
                    raise Exception(
                        'multiple elements found for metadata root xpath: %s' %
                        kwargs['xpath_document_root'])
                elif len(meta_root) == 0:
                    raise Exception(
                        'no elements found for metadata root xpath: %s' %
                        kwargs['xpath_document_root'])

                # get unique identifier
                if kwargs['xpath_record_id'] != '':
                    record_id = meta_root.xpath(kwargs['xpath_record_id'],
                                                namespaces=nsmap)
                    if len(record_id) == 1:
                        record_id = record_id[0].text
                    elif len(record_id) > 1:
                        raise AmbiguousIdentifier(
                            'multiple elements found for identifier xpath: %s'
                            % kwargs['xpath_record_id'])
                    elif len(record_id) == 0:
                        raise AmbiguousIdentifier(
                            'no elements found for identifier xpath: %s' %
                            kwargs['xpath_record_id'])
                else:
                    record_id = hashlib.md5(
                        doc_string.encode('utf-8')).hexdigest()

                # return success Row
                return Row(record_id=record_id,
                           document=etree.tostring(meta_root).decode('utf-8'),
                           error='',
                           job_id=int(job_id),
                           oai_set='',
                           success=1)

            # catch missing or ambiguous identifiers
            except AmbiguousIdentifier as e:

                # hash record string to produce a unique id
                record_id = hashlib.md5(doc_string.encode('utf-8')).hexdigest()

                # return error Row
                return Row(record_id=record_id,
                           document=etree.tostring(meta_root).decode('utf-8'),
                           error=str(e),
                           job_id=int(job_id),
                           oai_set='',
                           success=0)

            # handle all other exceptions
            except Exception as e:

                # hash record string to produce a unique id
                record_id = hashlib.md5(doc_string.encode('utf-8')).hexdigest()

                # return error Row
                return Row(record_id=record_id,
                           document='',
                           error=str(e),
                           job_id=int(job_id),
                           oai_set='',
                           success=0)

        # transform via rdd.map
        job_id = job.id
        records = static_rdd.map(
            lambda row: get_metadata_udf(job_id, row, kwargs))

        # index records to DB and index to ElasticSearch
        db_records = save_records(spark=spark,
                                  kwargs=kwargs,
                                  job=job,
                                  records_df=records.toDF())

        # run record validation scenarios if requested, using db_records from save_records() output
        vs = ValidationScenarioSpark(spark=spark,
                                     job=job,
                                     records_df=db_records,
                                     validation_scenarios=ast.literal_eval(
                                         kwargs['validation_scenarios']))
        vs.run_record_validation_scenarios()

        # remove temporary payload directory if static job was upload based, not location on disk
        if kwargs['static_type'] == 'upload':
            shutil.rmtree(kwargs['static_payload'])

        # finally, update finish_timestamp of job_track instance
        job_track.finish_timestamp = datetime.datetime.now()
        job_track.save()
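
The per-file parsing inside get_metadata_udf boils down to: parse the document with lxml, try an identifier XPath, and fall back to an md5 hash of the document string when no identifier is found. A small sketch of just that piece, with a made-up document and XPath standing in for xpath_record_id:

    # Sketch only: XPath identifier lookup with an md5 fallback, on sample data.
    import hashlib

    from lxml import etree

    doc_string = '<record><id>abc123</id><title>Sample</title></record>'
    xml_root = etree.fromstring(doc_string.encode('utf-8'))

    matches = xml_root.xpath('/record/id')   # stand-in for xpath_record_id
    if len(matches) == 1:
        record_id = matches[0].text
    else:
        # no (or ambiguous) identifier: hash the document string instead
        record_id = hashlib.md5(doc_string.encode('utf-8')).hexdigest()

    print(record_id)   # -> abc123
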
Example #5
    def spark_function(spark, **kwargs):
        '''
		Harvest records via OAI.

		As a harvest type job, unlike other jobs, this introduces various fields to the Record for the first time:
			- record_id 
			- job_id			
			- oai_set
			- publish_set_id
			- unique (TBD)

		Args:
			spark (pyspark.sql.session.SparkSession): provided by pyspark context
			kwargs:
				job_id (int): Job ID
				endpoint (str): OAI endpoint
				verb (str): OAI verb used
				metadataPrefix (str): metadataPrefix for OAI harvest
				scope_type (str): [setList, whiteList, blackList, harvestAllSets], used by DPLA Ingestion3
				scope_value (str): value for scope_type
				index_mapper (str): class name from core.spark.es, extending BaseMapper
				validation_scenarios (list): list of Validation Scenario IDs

		Returns:
			None:
			- harvests OAI records and writes to disk as avro files
			- indexes records into DB
			- map / flatten records and indexes to ES
		'''

        # refresh Django DB Connection
        refresh_django_db_connection()

        # get job
        job = Job.objects.get(pk=int(kwargs['job_id']))

        # start job_track instance, marking job start
        job_track = JobTrack(job_id=job.id)
        job_track.save()

        # harvest OAI records via Ingestion3
        df = spark.read.format("dpla.ingestion3.harvesters.oai")\
            .option("endpoint", kwargs['endpoint'])\
            .option("verb", kwargs['verb'])\
            .option("metadataPrefix", kwargs['metadataPrefix'])\
            .option(kwargs['scope_type'], kwargs['scope_value'])\
            .load()

        # select records with content
        records = df.select("record.*").where("record is not null")

        # repartition
        records = records.repartition(settings.SPARK_REPARTITION)

        # attempt to find and select <metadata> element from OAI record, else filter out
        def find_metadata_udf(document):
            if type(document) == str:
                xml_root = etree.fromstring(document)
                m_root = xml_root.find(
                    '{http://www.openarchives.org/OAI/2.0/}metadata')
                if m_root is not None:
                    # expecting only one child of the <metadata> element
                    m_children = m_root.getchildren()
                    if len(m_children) == 1:
                        m_child = m_children[0]
                        m_string = etree.tostring(m_child).decode('utf-8')
                        return m_string
                # no <metadata> element, or unexpected number of children
                return 'none'
            else:
                return 'none'

        metadata_udf = udf(lambda col_val: find_metadata_udf(col_val),
                           StringType())
        records = records.select(*[
            metadata_udf(col).alias('document') if col == 'document' else col
            for col in records.columns
        ])
        records = records.filter(records.document != 'none')

        # establish 'success' column, setting all success for Harvest
        records = records.withColumn('success', pyspark_sql_functions.lit(1))

        # copy 'id' from OAI harvest to 'record_id' column
        records = records.withColumn('record_id', records.id)

        # add job_id as column
        job_id = job.id
        job_id_udf = udf(lambda id: job_id, IntegerType())
        records = records.withColumn('job_id', job_id_udf(records.id))

        # add oai_set
        records = records.withColumn('oai_set', records.setIds[0])

        # add blank error column
        error = udf(lambda id: '', StringType())
        records = records.withColumn('error', error(records.id))

        # index records to DB and index to ElasticSearch
        db_records = save_records(spark=spark,
                                  kwargs=kwargs,
                                  job=job,
                                  records_df=records)

        # run record validation scenarios if requested, using db_records from save_records() output
        vs = ValidationScenarioSpark(spark=spark,
                                     job=job,
                                     records_df=db_records,
                                     validation_scenarios=ast.literal_eval(
                                         kwargs['validation_scenarios']))
        vs.run_record_validation_scenarios()

        # finally, update finish_timestamp of job_track instance
        job_track.finish_timestamp = datetime.datetime.now()
        job_track.save()
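
The document-rewriting step above applies a Python UDF to a single column while passing every other column through unchanged. A minimal sketch of that select pattern on a tiny in-memory DataFrame, reusing the same <metadata> extraction idea on a sample OAI-style record:

    # Sketch only: rewrite one column via a UDF, keep the rest as-is.
    from lxml import etree
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import udf
    from pyspark.sql.types import StringType

    spark = SparkSession.builder.appName('oai_sketch').getOrCreate()

    OAI_NS = '{http://www.openarchives.org/OAI/2.0/}'
    sample = ('<record xmlns="http://www.openarchives.org/OAI/2.0/">'
              '<metadata><dc>hello</dc></metadata></record>')
    records = spark.createDataFrame([('id1', sample)], ['id', 'document'])

    def extract_metadata(document):
        # return the single child of <metadata>, or 'none' to be filtered out
        if isinstance(document, str):
            root = etree.fromstring(document)
            m_root = root.find(OAI_NS + 'metadata')
            if m_root is not None and len(m_root) == 1:
                return etree.tostring(m_root[0]).decode('utf-8')
        return 'none'

    metadata_udf = udf(extract_metadata, StringType())
    records = records.select(*[
        metadata_udf(c).alias('document') if c == 'document' else c
        for c in records.columns])
    records = records.filter(records.document != 'none')
    records.show(truncate=False)
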
Example #6
	def spark_function(spark, **kwargs):

		'''
		Transform records via XSLT and write to avro files

		Args:
			spark (pyspark.sql.session.SparkSession): provided by pyspark context
			kwargs:
				job_id (int): Job ID
				job_input (str): location of avro files on disk
				transformation_id (str): id of Transformation Scenario
				index_mapper (str): class name from core.spark.es, extending BaseMapper
				validation_scenarios (list): list of Validation Scenario IDs

		Returns:
			None
			- transforms records via XSL, writes new records to avro files on disk
			- indexes records into DB
			- map / flatten records and indexes to ES
		'''

		# get job
		job = Job.objects.get(pk=int(kwargs['job_id']))

		# start job_track instance, marking job start
		job_track = JobTrack(
			job_id = job.id
		)
		job_track.save()

		# read output from input job, filtering by job_id, grabbing Combine Record schema fields
		input_job = Job.objects.get(pk=int(kwargs['input_job_id']))
		bounds = get_job_db_bounds(input_job)
		sqldf = spark.read.jdbc(
				settings.COMBINE_DATABASE['jdbc_url'],
				'core_record',
				properties=settings.COMBINE_DATABASE,
				column='id',
				lowerBound=bounds['lowerBound'],
				upperBound=bounds['upperBound'],
				numPartitions=settings.JDBC_NUMPARTITIONS
			)
		records = sqldf.filter(sqldf.job_id == int(kwargs['input_job_id']))

		# repartition
		records = records.repartition(settings.SPARK_REPARTITION)

		# get transformation
		transformation = Transformation.objects.get(pk=int(kwargs['transformation_id']))

		# if xslt type transformation
		if transformation.transformation_type == 'xslt':

			# define udf function for transformation
			def transform_xml_udf(job_id, row, xslt_string):

				# attempt transformation and save output to 'document'
				try:
					
					# transform with pyjxslt gateway
					gw = pyjxslt.Gateway(6767)
					gw.add_transform('xslt_transform', xslt_string)
					result = gw.transform('xslt_transform', row.document)
					gw.drop_transform('xslt_transform')

					# set trans_result tuple
					trans_result = (result, '', 1)

				# catch transformation exception and save exception to 'error'
				except Exception as e:
					# set trans_result tuple
					trans_result = ('', str(e), 0)

				# return Row
				return Row(
						record_id = row.record_id,
						document = trans_result[0],
						error = trans_result[1],
						job_id = int(job_id),
						oai_set = row.oai_set,
						success = trans_result[2]
					)

			# open XSLT transformation, pass to map as string
			with open(transformation.filepath,'r') as f:
				xslt_string = f.read()

			# transform via rdd.map
			job_id = job.id			
			records_trans = records.rdd.map(lambda row: transform_xml_udf(job_id, row, xslt_string))

		# back to DataFrame
		records_trans = records_trans.toDF()

		# index records to DB and index to ElasticSearch
		db_records = save_records(
			spark=spark,
			kwargs=kwargs,
			job=job,
			records_df=records_trans
		)

		# run record validation scenarios if requested, using db_records from save_records() output
		vs = ValidationScenarioSpark(
			spark=spark,
			job=job,
			records_df=db_records,
			validation_scenarios = ast.literal_eval(kwargs['validation_scenarios'])
		)
		vs.run_record_validation_scenarios()

		# finally, update finish_timestamp of job_track instance
		job_track.finish_timestamp = datetime.datetime.now()
		job_track.save()
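
The transform_xml_udf above delegates the actual XSLT work to a pyjxslt gateway. A minimal sketch of that gateway round trip, assuming a pyjxslt server is already listening on port 6767 and substituting a trivial identity stylesheet for the real transformation file:

    # Sketch only: pyjxslt gateway round trip with an identity stylesheet.
    import pyjxslt

    IDENTITY_XSLT = '''<xsl:stylesheet version="1.0"
        xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
      <xsl:template match="@*|node()">
        <xsl:copy><xsl:apply-templates select="@*|node()"/></xsl:copy>
      </xsl:template>
    </xsl:stylesheet>'''

    # register the stylesheet, run one document through it, then clean up
    gw = pyjxslt.Gateway(6767)
    gw.add_transform('xslt_transform', IDENTITY_XSLT)
    try:
        result = gw.transform('xslt_transform',
                              '<record><title>Sample</title></record>')
        print(result)
    finally:
        gw.drop_transform('xslt_transform')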