def test_valid_hive_work(self):
    metadata = reg.get_metadata("small")
    self.t_hive.create_hive_table(metadata, reset=True, type="work", stage="valid")
    self.assertEqual(
        self.t_hive._query('describe ' + reg.db_name(metadata, type="work", stage="valid") +
                           '.' + reg.db_table(metadata, type="work", stage="valid")),
        [("date_field", "string", ""),
         ("time_field", "string", ""),
         ("globalactivepower", "decimal(8,3)", ""),
         ("instance_guid", "string", "")])
    self.t_hive._query("drop table if exists " + reg.db_name(metadata) + '.' + reg.db_table(metadata))
def test_sandbox_nodelim(self):
    metadata = reg.get_metadata("no_delim")
    test_header = 'Date;Time;Global_active_power;Global_reactive_power;Voltage;Global_intensity;'
    test_header += 'Sub_metering_1;Sub_metering_2;Sub_metering_3'
    self.t_hive.create_hive_table(metadata, reset=True, header=test_header)
    self.assertEqual(
        self.t_hive._query('describe ' + reg.db_name(metadata) + '.' + reg.db_table(metadata)),
        [('column1', 'string', '')])
def test_raw_hive_work(self):
    metadata = reg.get_metadata("small")
    self.t_hive.create_hive_table(metadata, reset=True, type="work")
    self.assertEqual(
        self.t_hive._query('describe ' + reg.db_name(metadata) + '.' + reg.db_table(metadata, type="work")),
        [('date_field', 'string', 'from deserializer'),
         ('time_field', 'string', 'from deserializer'),
         ('globalactivepower', 'string', 'from deserializer')])
def main(args):
    # Check the registry for entries and run the script appropriately
    script = ""
    # Check the number of arguments; if it's not exactly one, end
    if len(args) > 2:
        # the first argument is the file itself and is disregarded, so > 2 is really > 1
        logger.info("Too many arguments provided, ending process")
        exit(0)
    elif len(args) == 1:
        # the first argument is the file itself and is disregarded, so 1 is really 0
        logger.info("No arguments provided, ending process")
        exit(0)
    else:
        # we have a single argument: the name of the script we want to run
        script = args[1]
    # Get the metadata for the script
    metadata = reg.get_refined_metadata(script)
    # Start a hive connection and use it to create the table for the refined script if it doesn't already exist
    val = validator.Hive()
    val.create_hive_table(metadata['refinedDataset'], stage='refined')
    # Count the number of rows in the target table before execution
    initial_count = val.row_count(
        reg.db_name(metadata['refinedDataset'], stage='refined') + '.' +
        reg.db_table(metadata['refinedDataset'], stage='refined'))
    logger.info("Initial row count: " + str(initial_count))
    # print json.dumps(metadata['script'], indent=3)  # left for debugging
    # execute the script
    run_script(metadata['script']['path'], metadata)
    # get the row count of the target table after execution
    final_count = val.row_count(
        reg.db_name(metadata['refinedDataset'], stage='refined') + '.' +
        reg.db_table(metadata['refinedDataset'], stage='refined'))
    # register that the script was run
    reg.register_raw(metadata['refinedDataset'], metadata['script']['path'], 'refined',
                     final_count - initial_count)
    return 0
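# A minimal entry-point sketch (not part of the original listing, shown here as an assumption):
# main() inspects the raw argument list itself, so a hypothetical wrapper would pass sys.argv
# straight through.
#
#     if __name__ == '__main__':
#         sys.exit(main(sys.argv))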
def test_valid_hive(self):
    metadata = reg.get_metadata("small")
    self.t_hive.create_hive_table(metadata, reset=True, stage="valid")
    self.assertEqual(
        self.t_hive._query('describe ' + reg.db_name(metadata, stage="valid") + '.' + reg.db_table(metadata)),
        [('globalactivepower', 'decimal(8,3)', ''),
         ('instance_guid', 'string', ''),
         ('time_field', 'string', ''),
         ('date_field', 'string', ''),
         ('', None, None),
         ('# Partition Information', None, None),
         ('# col_name ', 'data_type ', 'comment '),
         ('', None, None),
         ('time_field', 'string', ''),
         ('date_field', 'string', '')])
def test_sandbox_full_MD(self):
    metadata = reg.get_metadata("fullsand")
    self.t_hive.create_hive_table(metadata, reset=True)
    self.assertEqual(
        self.t_hive._query('describe ' + reg.db_name(metadata) + '.' + reg.db_table(metadata)),
        [('date_field', 'string', 'from deserializer'),
         ('time_field', 'string', 'from deserializer'),
         ('globalactivepower', 'string', 'from deserializer'),
         ('globalreactivepower', 'string', 'from deserializer'),
         ('voltage', 'string', 'from deserializer'),
         ('globalintensity', 'string', 'from deserializer'),
         ('submetering1', 'string', 'from deserializer'),
         ('submetering2', 'string', 'from deserializer'),
         ('submetering3', 'string', 'from deserializer')])
def test_sandbox_header_delim(self):
    metadata = reg.get_metadata("sandbox")
    test_header = 'Date;Time;Global_active_power;Global_reactive_power;Voltage;Global_intensity;'
    test_header += 'Sub_metering_1;Sub_metering_2;Sub_metering_3'
    self.t_hive.create_hive_table(metadata, reset=True, header=test_header)
    self.assertEqual(
        self.t_hive._query('describe ' + reg.db_name(metadata) + '.' + reg.db_table(metadata)),
        [('date_field', 'string', 'from deserializer'),
         ('time_field', 'string', 'from deserializer'),
         ('globalactivepower', 'string', 'from deserializer'),
         ('globalreactivepower', 'string', 'from deserializer'),
         ('voltage', 'string', 'from deserializer'),
         ('globalintensity', 'string', 'from deserializer'),
         ('submetering1', 'string', 'from deserializer'),
         ('submetering2', 'string', 'from deserializer'),
         ('submetering3', 'string', 'from deserializer')])
def drop_table(self, metadata, **kwargs):
    # drops the provided table
    # get db and table name from the metadata
    db_name = reg.db_name(metadata, **kwargs)
    db_table = reg.db_table(metadata, **kwargs)
    # create the drop table query
    drop_table = 'drop table if exists ' + db_name + '.' + db_table
    logging.info('Dropping table for recreation ' + db_name + '.' + db_table)
    try:
        # execute drop table
        self._cur.execute(drop_table)
        logging.info('table dropped')
    except Exception as e:
        # log errors if they occur
        logging.error(e)
def tearDown(self):
    metadata = reg.get_metadata("household_power_consumption")
    self.t_hive._query("drop table if exists " + reg.db_name(metadata) + '.' + reg.db_table(metadata))
    self.t_hive._query("drop table if exists " + reg.db_name(metadata) + '.' + reg.db_table(metadata, type="work"))
    self.t_hive._query("drop table if exists " + reg.db_name(metadata, stage="valid") + '.' + reg.db_table(metadata))
    metadata2 = reg.get_metadata("small")
    self.t_hive._query("drop table if exists " + reg.db_name(metadata2) + '.' + reg.db_table(metadata2))
    self.t_hive._query("drop table if exists " + reg.db_name(metadata2) + '.' + reg.db_table(metadata2, type="work"))
    self.t_hive._query("drop table if exists " + reg.db_name(metadata2, stage="valid") + '.' + reg.db_table(metadata2))
def test_db_name_valid(self):
    self.assertEqual(
        registry.db_name(registry.get_metadata('sampleFile.txt'), stage='valid'),
        'dev_none_test')
def copy_and_compare(self, metadata, instance_guid, file_name):
    src_db_name = reg.db_name(metadata)
    src_db_table = reg.db_table(metadata, type='work')
    work_db_name = reg.db_name(metadata, stage="valid", type='work')
    work_db_table = reg.db_table(metadata, stage="valid", type='work')
    invalid_reason = {}
    if 'fileUpdateType' in metadata['file']['technical']:
        update_type = metadata['file']['technical']['fileUpdateType']
    else:
        update_type = 'append'
    field_order = sorted(metadata['fields'], key=lambda k: k['position'])
    select_list = partition = ''
    partition_list = []
    for field in field_order:
        # build the field list for the create statement
        if 'partitionPosition' in field:
            partition_list.append(field)
        else:
            # select_list += field["name"] + ', '
            select_list += field_conversion(field) + ', '
    now = time.time()
    logger.info('Comparison start')
    # drop the work table if it exists
    self._query('drop table if exists ' + work_db_name + '.' + work_db_table)
    self.create_hive_table(metadata, stage="valid", type='work')
    try:
        self.copy_table_data(metadata, instance_guid, copy_type='compare')
    except RuntimeError:
        invalid_reason["badMetadata"] = "Metadata didn't match file and caused hive to fail, check ingestion logs"
    except impala.dbapi.OperationalError:
        invalid_reason["badMetadata"] = "Metadata didn't match file and caused hive to fail, check ingestion logs"
    logger.info('Data loaded to validation table')
    # build a query that counts raw rows whose values fail to convert to their declared datatype
    compare = ('SELECT count(*) FROM ' + src_db_name + '.' + src_db_table + ' where ')
    for field in metadata['fields']:
        if str(field['datatype']).upper() in {'TIMESTAMP', 'DATE'}:
            compare += '((' + field["name"] + ' is not null and length(' + field["name"] + ') > 0) and ' + \
                       field_conversion(field) + ' is null) or '
        elif str(field['datatype']).upper() in {'FLOAT'}:
            compare += '((' + field["name"] + ' is not null and length(' + field["name"] + ') > 0) and not(' + \
                       field_conversion(field) + ' <=> cast(' + field["name"] + ' as float))) or '
        elif str(field['datatype']).upper() == 'BOOLEAN':
            compare += '( not(' + field_conversion(field) + ' <=> ' + field["name"] + ')) or '
        elif str(field['datatype']).upper() == 'BINARY':
            pass
        else:
            compare += '((' + field["name"] + ' is not null and length(' + field["name"] + ') > 0) and not(' + \
                       field_conversion(field) + ' <=> ' + field["name"] + ')) or '
    compare = compare[:-3]  # trim the trailing 'or '
    logger.info('comparison query : ' + compare)
    raw_rows = valid_rows = invalid_rows = 0
    try:
        self._cur.execute(compare)
    except Exception as e:
        logger.info(e.message)
        raise
    logger.info('Valid data check query complete')
    # get the number of invalid rows from the comparison query
    for row in self._cur:
        invalid_rows = row[0]
    if invalid_rows > 0:
        logger.info(str(invalid_rows) + " invalid rows ")
        invalid_reason["datatypeMismatch"] = invalid_rows
    else:
        logger.info("All copied rows are valid")
    raw_rows = self.row_count(src_db_name + '.' + src_db_table)
    logger.info('Raw row count complete')
    valid_rows = self.row_count(work_db_name + '.' + work_db_table,
                                'instance_guid = "' + str(instance_guid) + '"')
    logger.info('Valid row count complete')
    logger.info("Raw rows = " + str(raw_rows) + ": Valid Rows = " + str(valid_rows))
    if raw_rows - valid_rows != 0:
        logger.info("Mismatch count = " + str(raw_rows - valid_rows))
        invalid_reason["rowCountMismatch"] = raw_rows - valid_rows
    logger.info("End copy and compare " + str(time.time()))
    logger.info("finished in " + str(time.time() - now) + " seconds")
    if len(invalid_reason) > 0:
        reg.register_invalid(metadata, instance_guid, file_name, invalid_reason, valid_rows, compare)
    else:
        # append vs full vs delta file handling
        if 'fileUpdateType' in metadata['file']['technical']:
            if update_type == 'append':
                # this is the default path so we don't do anything
                logging.info('append file')
            elif update_type == 'full':
                # delete everything in the valid file location
                logger.info('Deleting existing data from valid table')
                if hdfs.path.exists(reg.file_path(metadata, stage="valid")):
                    hdfs.rmr(reg.file_path(metadata, stage="valid"))
            elif update_type == 'delta':
                logging.info('delta file')
                self.delta(metadata)
            else:
                logging.info('update type blank, treating as append file')
        else:
            logging.info('no update type or update type null, treating as append file')
        self.create_hive_table(metadata, stage="valid")
        try:
            self.copy_table_data(metadata, instance_guid, valid_copy=True, update_type=update_type)
            reg.register_valid(metadata, instance_guid, file_name, valid_rows, compare)
            self._query('drop table if exists ' + work_db_name + '.' + work_db_table)
            if 'fileUpdateType' in metadata['file']['technical'] \
                    and metadata['file']['technical']['fileUpdateType'] == 'full':
                logger.info('Deleting existing data from raw table')
                if hdfs.path.exists(reg.file_path(metadata, stage="raw")):
                    hdfs.rmr(reg.file_path(metadata, stage="raw"))
                hdfs.cp(reg.file_path(metadata, stage="raw", type='work'),
                        reg.file_path(metadata, stage="raw"))
        except (RuntimeError, impala.dbapi.OperationalError):
            invalid_reason["badMetadata"] = "Metadata didn't match file and " + \
                                            "caused hive to fail, check ingestion logs"
            reg.register_invalid(metadata, instance_guid, file_name, invalid_reason, valid_rows, compare)
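# The per-field predicates assembled above all follow the same pattern: count a raw row as
# invalid when the source value is present but does not survive conversion to its declared
# datatype. Assuming field_conversion() (defined elsewhere) emits the typed expression for a
# column, one OR-term of the comparison query looks roughly like this (illustrative only):
#
#     ((globalactivepower is not null and length(globalactivepower) > 0)
#       and not(<converted globalactivepower> <=> globalactivepower)) or ...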
def test_db_name(self):
    self.assertEqual(
        registry.db_name(registry.get_metadata('sample')),
        'dev_none_test_raw')
def test_db_name_refined(self):
    self.assertEqual(
        registry.db_name(registry.get_metadata('sample.txt'), stage='refined'),
        'dev_none_test',
        "Incorrect db returned")
def test_db_name_test(self):
    self.assertEqual(
        registry.db_name(registry.get_metadata('sample.txt'), env='test'),
        'test_none_test_raw')
def create_hive_table(self, metadata, **kwargs):
    logging.info('Create Hive Table')
    # set up the list for partition fields and other variables
    partition_list = []
    raw_table = work_table = sandbox = False
    if 'stage' in kwargs and kwargs['stage'] == 'sandbox':
        sandbox = True
    elif 'stage' not in kwargs or (kwargs['stage'] != 'valid' and kwargs['stage'] != 'refined'):
        raw_table = True
    if 'type' in kwargs and kwargs['type'] == 'work':
        work_table = True
    # check for a delimiter and set it up, or set it to nothing
    if 'technical' in metadata['file'] and 'fieldDelimiter' in metadata['file']['technical'] \
            and metadata['file']['technical']['fieldDelimiter'] is not None \
            and metadata['file']['technical']['fieldDelimiter'] != '':
        delimiter = metadata['file']['technical']['fieldDelimiter']
    else:
        delimiter = ''
    # get db name and table name from registry
    db_name = reg.db_name(metadata, **kwargs)
    db_table = reg.db_table(metadata, **kwargs)
    logging.info("Creating " + db_name + '.' + db_table)
    # create the DB if it doesn't exist
    self._cur.execute("create database if not exists " + db_name)
    # sort the fields in the metadata by order
    field_order = sorted(metadata['fields'], key=lambda k: k['position'])
    # start the table creation script
    logging.info('Begin table creation script')
    if not raw_table:
        table_create = 'CREATE '
    else:
        table_create = 'CREATE external '
    # add db and table to table create script
    table_create += 'TABLE if not exists ' + db_name + '.' + db_table + ' '
    # loop through the field list
    if len(field_order) > 0:
        # table with metadata
        field_order = sorted(metadata['fields'], key=lambda k: k['position'])
        table_create += ' ( '
        # build the field list and partition list for the create statement
        for field in field_order:
            # raw tables are always string, unless they're boolean, which we have to interpret as boolean
            if raw_table:
                if str(field["datatype"]).upper() == 'BOOLEAN':
                    table_create += field["name"] + ' ' + field["datatype"] + ', '
                else:
                    table_create += field["name"] + ' string, '
            elif 'partitionPosition' in field and not work_table and not sandbox:
                # if it's not raw, we may have partition fields; they go somewhere different
                # in the query than the rest of the fields
                partition_list.append(field)
            # elif sandbox and str(field['datatype']).upper() in {'DATE', 'TIMESTAMP'}:
            #     # sandboxes usually get the specified data types, but dates stay string
            #     table_create += field["name"] + ' string, '
            else:
                # add in the precision on data types that have it, i.e. decimal and varchar
                if "precision" in field and field["precision"] is not None and field["precision"] != '':
                    table_create += field["name"] + ' ' + field["datatype"] + '(' + str(field["precision"]) + '), '
                elif str(field["datatype"]).upper() == 'VARCHAR':
                    table_create += field["name"] + ' ' + field["datatype"] + '(65355), '
                elif str(field["datatype"]).upper() == 'CHAR':
                    table_create += field["name"] + ' ' + field["datatype"] + '(255), '
                else:
                    table_create += field["name"] + ' ' + field["datatype"] + ', '
        if 'stage' in kwargs and kwargs['stage'] == 'valid':
            # valid tables get the instance guid tacked on
            table_create += "instance_guid string)"
        elif len(partition_list) != len(field_order):
            # get rid of the trailing comma
            table_create = table_create[:-2] + ") "
        else:
            # if it gets to this point, there are no fields in the table that aren't in the
            # partition list. This is bad
            sys.exit('No non-partition fields in table')
            table_create = table_create[:-2]  # get rid of the trailing comma
    # dealing with the header row from sandbox files
    elif 'technical' in metadata['file'] \
            and ('containsHeaderRow' in metadata['file']['technical']
                 and metadata['file']['technical']['containsHeaderRow']) \
            and ('fieldDelimiter' in metadata['file']['technical']
                 and metadata['file']['technical']['fieldDelimiter'] is not None
                 and metadata['file']['technical']['fieldDelimiter'] != ''):
        # table with field names from a header row and a delimiter
        fields = kwargs['header'].replace('.', '_').replace('"', '').split(delimiter.decode('string_escape'))
        table_create += ' ( '
        for field in fields:
            table_create += escape_hive(field) + ' string, '
        table_create = table_create[:-2] + ") "
    # dealing with delimited files that don't have field names
    elif 'technical' in metadata['file'] and ('containsHeaderRow' not in metadata['file']['technical']
                                              or not metadata['file']['technical']['containsHeaderRow']):
        # no header row but there's a delimiter
        table_create += ' ( '
        if delimiter is not None and len(delimiter) > 0:
            fields = kwargs['header'].split(delimiter)
            for n in range(len(fields)):
                table_create += ' column' + str(n + 1) + ' string, '
        else:
            fields = kwargs['header']
            table_create += ' column1 string, '
        table_create = table_create[:-2] + ") "
    else:
        # no header row or delimiter
        table_create += '(column1 string) '
    logging.info('field list done, moving to partitioning')
    # if the partition list has anything in it, add partitioning to the table create
    if len(partition_list) > 0:
        partition = 'PARTITIONED BY ('
        partition_list.sort(key=lambda k: k['partitionPosition'])
        for field in partition_list:
            if "precision" in field and field["precision"] is not None:
                partition += field["name"] + ' ' + field["datatype"] + '(' + str(field["precision"]) + '), '
            else:
                partition += field["name"] + ' ' + field["datatype"] + ', '
        partition = partition[:-2] + ") "
        table_create += partition
    if delimiter == ';':
        delimiter = '\u0059'
    logging.info('partitioning done, moving to delimiting')
    # add in the delimiter; if there's a multi-character delimiter, include the necessary serde
    if 'technical' in metadata['file'] and 'fieldDelimiter' in metadata['file']['technical'] \
            and metadata['file']['technical']['fieldDelimiter'] is not None and len(delimiter) > 0:
        if len(delimiter) == 1 and (raw_table or sandbox):
            logging.info('single character delimiter ' + delimiter)
            table_create += 'ROW FORMAT DELIMITED FIELDS TERMINATED BY "' + delimiter + '" '
            # table_create += 'ROW FORMAT DELIMITED'
        elif raw_table or sandbox:
            logging.info('multi character delimiter ' + delimiter)
            table_create += 'ROW FORMAT SERDE "org.apache.hadoop.hive.contrib.serde2.MultiDelimitSerDe" ' + \
                            'WITH SERDEPROPERTIES ("field.delim"="' + delimiter + '") '
        else:
            logging.info(' no delimiter ')
    logging.info('delimiting done, moving to storage type')
    # add storage type: raw and sandbox are textfile, anything else attempts to use the
    # compression mentioned in the registry
    if raw_table or sandbox:
        logging.info('STORED AS TEXTFILE ')
        table_create += 'STORED AS TEXTFILE '
    else:
        logging.info('STORED AS compression ')
        if 'compression' in metadata['file']['technical'] and metadata['file']['technical']['compression'] != '' \
                and metadata['file']['technical']['compression'] is not None:
            table_create += 'STORED AS ' + metadata['file']['technical']['compression'] + ' '
        else:
            table_create += 'STORED AS parquet '
    logging.info('storage type done, moving to location')
    # add storage location for everything except the valid_work table
    if raw_table or not work_table:
        logging.info('location ')
        table_create += 'LOCATION \'' + reg.file_path(metadata, **kwargs) + '/\' '
    logging.info('location done, moving to header row')
    # set the flag to skip the header row on raw and sandbox files if necessary
    if (raw_table or sandbox) and 'technical' in metadata['file'] \
            and 'containsHeaderRow' in metadata['file']['technical'] \
            and metadata['file']['technical']['containsHeaderRow']:
        table_create += 'tblproperties("skip.header.line.count" = "1")'
    # allow for resetting/deleting the table
    if "reset" in kwargs and kwargs["reset"]:
        logger.info('dropping ' + db_name + '.' + db_table)
        self._cur.execute('drop table if exists ' + db_name + '.' + db_table)
    logger.info('table creation script : ' + table_create)
    # execute the table creation
    try:
        self._cur.execute(table_create)
        logging.info('Table ' + db_name + '.' + db_table + ' created')
    except Exception as e:
        logging.error(e)
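# Hedged usage sketch: the tests above exercise create_hive_table() roughly like this, first for
# the raw external table and then for the typed valid-stage table. The metadata key "small" is
# taken from the tests; treat the exact sequence as illustrative rather than the canonical API.
#
#     hive = validator.Hive()
#     metadata = reg.get_metadata("small")
#     hive.create_hive_table(metadata, reset=True)     # raw: external, all-string, textfile
#     hive.create_hive_table(metadata, stage="valid")  # valid: typed columns + instance_guid, parquet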
def copy_table_data(self, metadata, instance_guid, **kwargs):
    tgt_db_name = reg.db_name(metadata, stage="valid")
    work_copy = False
    if 'copy_type' in kwargs and kwargs['copy_type'] == 'compare':
        work_copy = True
        src_db_name = reg.db_name(metadata)
        src_db_table = reg.db_table(metadata, type='work')
        tgt_db_table = reg.db_table(metadata, type='work')
    else:
        src_db_name = reg.db_name(metadata, stage="valid")
        src_db_table = reg.db_table(metadata, type='work', stage="valid")
        tgt_db_table = reg.db_table(metadata)
    if 'update_type' in kwargs:
        update_type = kwargs['update_type']
    else:
        update_type = 'append'
    field_order = sorted(metadata['fields'], key=lambda k: k['position'])
    select_list = partition = partition_conversion = ''
    partition_list = []
    if 'copy_type' in kwargs and kwargs['copy_type'] == 'compare':
        work_copy = True
    for field in field_order:
        # build the field list for the create statement
        if 'partitionPosition' in field and not work_copy:
            partition_list.append(field)
        else:
            # select_list += field["name"] + ', '
            select_list += field_conversion(field) + ', '
    if update_type == 'delta':
        insert_statement = 'INSERT OVERWRITE TABLE ' + tgt_db_name + '.' + tgt_db_table
    else:
        insert_statement = 'INSERT INTO TABLE ' + tgt_db_name + '.' + tgt_db_table
    if len(partition_list) > 0:
        partition_list.sort(key=lambda k: k['partitionPosition'])
        for field in partition_list:
            partition += field["name"] + ', '
            partition_conversion += field_conversion(field) + ', '
        insert_statement += ' PARTITION (' + partition[:-2] + ')'
        if 'valid_copy' in kwargs and kwargs['valid_copy']:
            insert_statement += ' SELECT ' + select_list + ' instance_guid, ' + \
                                partition_conversion[:-2] + ' FROM ' + src_db_name + '.' + src_db_table
        else:
            insert_statement += ' SELECT ' + select_list + '"' + str(instance_guid) + '", ' + \
                                partition_conversion[:-2] + ' FROM ' + src_db_name + '.' + src_db_table
    else:
        if 'valid_copy' in kwargs and kwargs['valid_copy']:
            insert_statement += ' SELECT ' + select_list + ' instance_guid FROM ' + src_db_name + '.' + src_db_table
        else:
            insert_statement += ' SELECT ' + select_list + '"' + str(instance_guid) + \
                                '" FROM ' + src_db_name + '.' + src_db_table
    logger.info('insert statement : ' + insert_statement)
    try:
        self._cur.execute(insert_statement)
    except RuntimeError as e:
        logger.error("Metadata didn't match file" + e.message)
        raise
    except impala.dbapi.OperationalError as e:
        logger.error("Metadata didn't match file" + e.message)
        raise
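# For reference, the insert assembled above takes roughly this shape for a valid copy of a table
# partitioned on date_field (illustrative only; the real column list comes from field_conversion()
# and the registry metadata, and the table names come from reg.db_name()/reg.db_table()):
#
#     INSERT INTO TABLE <valid_db>.<valid_table> PARTITION (date_field)
#     SELECT <converted columns>, instance_guid, <converted date_field>
#     FROM <valid_db>.<valid_work_table>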
def test_db_name_work(self):
    self.assertEqual(
        registry.db_name(registry.get_metadata('sample.txt'), type='work'),
        'dev_none_test_raw')
def test_dev_db_name(self):
    meta = {"file": {"dataPartition": "xtrnl",
                     "subjectArea": "UCI",
                     "technical": {"tableName": "householdelectricpowerconsumption"}}}
    self.assertEquals(reg.db_name(meta), "dev_xtrnl_uci_raw", "Incorrect DB name")
def test_prod_db_name(self):
    meta = {"file": {"dataPartition": "xtrnl",
                     "subjectArea": "UCI",
                     "technical": {"tableName": "householdelectricpowerconsumption"}}}
    self.assertEquals(reg.db_name(meta, stage="raw", env="prod"), "xtrnl_uci_raw")
def delta(self, metadata):
    # Add rows that were in the main file, but not in the delta, back into the work table
    logging.info('Adding rows that were not new or edited to work table')
    src_db_name = reg.db_name(metadata, stage="valid")
    src_db_table = reg.db_table(metadata, stage="valid")
    tgt_db_name = reg.db_name(metadata, stage="valid")
    tgt_db_table = reg.db_table(metadata, type='work')
    join = select_list = ""
    partition_list = []
    logging.debug('starting field list in delta')
    for field in metadata['fields']:
        if 'pk' in field and str(field['pk']).lower() == 'true':
            join += "valid." + field["name"] + " = delta." + field["name"] + " and "
        if 'partitionPosition' in field and field['partitionPosition'] == 1:
            partition_list.append(field)
    logging.debug('starting insert_statement in delta')
    insert_statement = 'INSERT INTO TABLE ' + tgt_db_name + '.' + tgt_db_table
    logging.debug('starting field_order in delta')
    field_order = sorted(metadata['fields'], key=lambda k: k['position'])
    for field in field_order:
        select_list += field_conversion(field) + ', '
    insert_statement += ' SELECT ' + select_list + 'instance_guid ' + \
                        ' FROM ' + src_db_name + '.' + src_db_table + ' valid '
    insert_statement += 'where not exists (select 1 from ' + tgt_db_name + '.' + tgt_db_table + ' delta where '
    insert_statement += join
    if len(partition_list) > 0:
        partition_query = 'select ' + partition_list[0]["name"] + ' from ' + tgt_db_name + '.' + tgt_db_table
        partition_query += ' group by ' + partition_list[0]["name"]
        logging.info('Partition list query : ' + partition_query)
        compare_values = self._query(partition_query)
        if compare_values is not None and len(compare_values) > 0:
            existing_partitions = ''
            logging.info('Found ' + str(len(compare_values)) + ' values')
            for row in compare_values:
                logging.info(str(row))
                if str(partition_list[0]['datatype']).upper() in (
                        'DECIMAL', 'FLOAT', 'INTEGER', 'DOUBLE', 'BIGINT', 'SMALLINT', 'TINYINT'):
                    existing_partitions += str(row[0]) + ', '
                elif str(partition_list[0]['datatype']).upper() in ("TIMESTAMP"):
                    existing_partitions += "cast('" + str(row[0]) + "' as timestamp), "
                elif str(partition_list[0]['datatype']).upper() in ("DATE"):
                    existing_partitions += "cast('" + str(row[0]) + "' as date), "
                else:
                    existing_partitions += "'" + str(row[0]) + "', "
            insert_statement += partition_list[0]["name"] + ' in (' + existing_partitions[:-2] + ')) '
        else:
            insert_statement = insert_statement[:-5] + ' )'
    else:
        insert_statement = insert_statement[:-5] + ' )'
    logger.info("delta insert to work : " + insert_statement)
    try:
        self._cur.execute(insert_statement)
    except RuntimeError as e:
        logger.error("delta update failed" + e.message)
        raise
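# Roughly, the delta statement built above is an anti-join of the previously validated rows
# against the freshly loaded delta, limited to partitions that already exist in the work table
# (illustrative shape only; primary-key columns and partition values come from the registry
# metadata):
#
#     INSERT INTO TABLE <valid_db>.<valid_work_table>
#     SELECT <converted columns>, instance_guid FROM <valid_db>.<valid_table> valid
#     WHERE NOT EXISTS (SELECT 1 FROM <valid_db>.<valid_work_table> delta
#                       WHERE valid.<pk> = delta.<pk> AND <partition_col> IN (<existing values>))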