def testWriteMixed(self):
    del self.table[:]
    for i in xrange(33):
        self.table.add(FieldSpec("float_{}".format(i), 'FLOAT'))
        self.table.add(FieldSpec("int_{}".format(i), 'INT'))
        self.table.add(FieldSpec("str_{}".format(i), 'VARCHAR', 6))
    fts = FastTableStream(self.table, use_names=False)
    data = []
    for j in xrange(1000):
        row = []
        for i in xrange(33):
            k = 100 * j + i
            row.extend((int(k), float(k), str(k)))
        data.append(row)  # collect the row; without this, write_many would receive an empty list

    def do_write():
        with fts:
            for j in xrange(20):
                fts.write_many(data)

    pr = cProfile.Profile()
    pr.enable()
    pr.runcall(do_write)
    pr.disable()
    filename = os.path.join(self.run_context.logs_dir,
                            'fast_table_write_mixed_profile.txt')
    with open(filename, 'w') as f:
        stats = pstats.Stats(pr, stream=f)
        stats.print_stats()
    self.LOGGER.info("Table name {}".format(self.table))
def setUp(self):
    self.run_context = SuiteContext('unittest')
    self.db_context = self.run_context.getDBContext('unittest')
    self.LOGGER = self.run_context.get_logger()

    # Set up a test table definition
    self.table = get_temp_table(self.db_context)
    (self.table.add(FieldSpec("col1", "NVARCHAR", 8))
               .add(FieldSpec("col2", "FLOAT"))
               .add(FieldSpec("col3", "TINYINT"))
               .add(FieldSpec("col4", "INT"))
               .add(FieldSpec("col5", "BIGINT")))
def _initialize_table(self):
    """Do the housekeeping for new table creation, i.e. set up the table name,
    table schema, and preqc_id that will be used as a unique identifier.
    """
    self.table = TableSpec(self.db, self.tablename)
    self.table.table_name = self.tablename
    self.table.tableSchema = 'dbo'
    self.table.add(FieldSpec(field_name='ID', basic_type='int'))
    if self.item_table:
        self.table.add(FieldSpec(field_name='FLAT_TABLE_ID', basic_type='int'))
        self.item_table = False
def create_output(self, parent_summary, target_summary):
    self.ds_out.add_all(self.parent_group_cols)
    self.ds_out.add_all(self.extra_target_group_cols)

    with get_temp_table(self.db_context) as ds_temp:
        # Temp table accumulates the intermediate t-test statistics
        ds_temp.add_all(self.ds_out)
        ds_temp.add(FieldSpec(field_name="numerator", basic_type="FLOAT"))
        ds_temp.add(FieldSpec(field_name="ttest_se", basic_type="FLOAT"))
        ds_temp.add(FieldSpec(field_name="ttest_value", basic_type="FLOAT"))
        ds_temp.add(FieldSpec(field_name="p_css", basic_type="FLOAT"))
        ds_temp.add(FieldSpec(field_name="t_css", basic_type="FLOAT"))
        ds_temp.add(FieldSpec(field_name="p_t", basic_type="FLOAT"))
        ds_temp.add(FieldSpec(field_name="t_t", basic_type="FLOAT"))
        ds_temp.add(FieldSpec(field_name="p_n", basic_type="FLOAT"))
        ds_temp.add(FieldSpec(field_name="t_n", basic_type="FLOAT"))
        self.db_context.executeNoResults(ds_temp.definition)

        # Run the t-test query sequence against the temp table
        query = _TTEST_QUERY_1.format(
            ds_out=ds_temp,
            group_cols=Joiner(self.parent_group_cols, self.extra_target_group_cols),
            parent_summary=parent_summary,
            target_summary=target_summary,
            parent_group_cols=Joiner(self.parent_group_cols),
            target_group_cols=Joiner(self.target_group_cols))
        self.db_context.executeNoResults(query)
        query = _TTEST_QUERY_2.format(ds_out=ds_temp)
        self.db_context.executeNoResults(query)
        query = _TTEST_QUERY_3.format(ds_out=ds_temp)
        self.db_context.executeNoResults(query)
        query = _TTEST_QUERY_4.format(ds_out=ds_temp)
        self.db_context.executeNoResults(query)
        query = _TTEST_QUERY_5.format(ds_out=ds_temp, critval=self.critical_value)
        self.db_context.executeNoResults(query)
        query = _TTEST_QUERY_6.format(ds_out=ds_temp, critval=self.critical_value,
                                      output_col=self.output_col)
        self.db_context.executeNoResults(query)
        query = _TTEST_QUERY_7.format(ds_out=ds_temp, rrv=self.reciprocal_round_value,
                                      output_col=self.output_col)
        self.db_context.executeNoResults(query)

        # Copy the finished results into the output table
        self.db_context.executeNoResults(self.ds_out.definition)
        query = _FINAL_OUTPUT_QUERY.format(ds_out=self.ds_out, ds_temp=ds_temp,
                                           columns=Joiner(self.ds_out))
        self.db_context.executeNoResults(query)
def testWriteFloatMany(self):
    del self.table[:]
    for i in xrange(100):
        self.table.add(FieldSpec("col_{}".format(i), 'FLOAT'))
    fts = FastTableStream(self.table, use_names=False,
                          dumpfile="C:\\Scratch\\float.dat")
    data = [[100.0 * j + i for i in xrange(100)] for j in xrange(1000)]

    def do_write():
        with fts:
            for j in xrange(20):
                fts.write_many(data)

    pr = cProfile.Profile()
    pr.enable()
    pr.runcall(do_write)
    pr.disable()
    filename = os.path.join(self.run_context.logs_dir,
                            'fast_table_write_float_nonull_profile.txt')
    with open(filename, 'w') as f:
        stats = pstats.Stats(pr, stream=f)
        stats.print_stats()
    self.LOGGER.info("Table name {}".format(self.table))
def testWriteUnicodeNoNull(self):
    del self.table[:]
    for i in xrange(100):
        self.table.add(FieldSpec("col_{}".format(i), 'NVARCHAR', 8, nullable=False))
    fts = FastTableStream(self.table, use_names=False, raw=True)
    data = [[unicode(100 * j + i) for i in xrange(100)] for j in xrange(1000)]

    def do_write():
        with fts:
            for j in xrange(20):
                fts.write_many(data)

    pr = cProfile.Profile()
    pr.enable()
    pr.runcall(do_write)
    pr.disable()
    filename = os.path.join(self.run_context.logs_dir,
                            'fast_table_write_unicode_nonull_profile.txt')
    with open(filename, 'w') as f:
        stats = pstats.Stats(pr, stream=f)
        stats.print_stats()
    self.LOGGER.info("Table name {}".format(self.table))
def test1(self):
    run_context = SuiteContext('unittest')
    db_context = run_context.getDBContext('remote')
    self.LOGGER = run_context.get_logger()

    # Set up a test table definition
    with get_temp_table(db_context) as table:
        for i in xrange(100):
            table.add(FieldSpec("col_{}".format(i), 'NVARCHAR', 8))
        fts = FastTableStream(table, use_names=False, raw=True)
        data = [[unicode(100 * j + i) for i in xrange(100)] for j in xrange(1000)]

        def do_write():
            with fts:
                for j in xrange(5):
                    fts.write_many(data)

        pr = cProfile.Profile()
        pr.enable()
        pr.runcall(do_write)
        pr.disable()
        filename = os.path.join(
            run_context.logs_dir,
            'fast_table_write_remote_unicode_raw_profile.txt')
        with open(filename, 'w') as f:
            stats = pstats.Stats(pr, stream=f)
            stats.print_stats()
        self.LOGGER.info("Table name {}".format(table))
def testWriteIntegerMany(self):
    del self.table[:]
    for i in xrange(100):
        self.table.add(FieldSpec("col_{}".format(i), 'INT'))
    fts = FastTableStream(self.table, use_names=False)
    data = [[100 * j + i for i in xrange(100)] for j in xrange(1000)]

    def do_write():
        with fts:
            for j in xrange(20):
                fts.write_many(data)

    pr = cProfile.Profile()
    pr.enable()
    pr.runcall(do_write)
    pr.disable()
    filename = os.path.join(self.run_context.logs_dir,
                            'fast_table_write_int_with_checks_profile.txt')
    with open(filename, 'w') as f:
        stats = pstats.Stats(pr, stream=f)
        stats.print_stats()
    self.LOGGER.info("Table name {}".format(self.table))
def setUp(self):
    # Set up a field to define
    table = TableSpec("my_table", None)
    self.field = FieldSpec("col1", "NVARCHAR", 8)
    table.add(self.field)

    # Set up the type processor
    self.proc = NVarcharProcessor(1, self.field, False)
def _process_required_key_dups_and_missing(self, db_context, input_table, side,
                                           allow_dups):
    # Define duplicates table
    dup_table = get_temp_table(db_context)
    dup_table.create_foreign_key(input_table, True, 'fk_{}_'.format(side))
    dup_table.add_all(self.required_merge_keys)
    dup_table.add(FieldSpec(basic_type="TINYINT", field_name="has_dups"))
    dup_table.add(FieldSpec(basic_type="TINYINT", field_name="has_missing"))
    dup_table.add(FieldSpec(basic_type="TINYINT", field_name="has_dups_both"))
    db_context.executeNoResults(dup_table.definition)

    # Populate table
    query = _FIND_DUPLICATES_QUERY.format(
        dup_table=dup_table,
        input_table=input_table,
        foreign_key=Joiner(dup_table.foreign_keys[0]),
        required_key=Joiner(self.required_merge_keys),
        side=side)
    db_context.executeNoResults(query)

    # Define rejects table
    reject_table = get_temp_table(db_context)
    reject_table.create_foreign_key(input_table, True, 'fk_{}_'.format(side))
    reject_table.add_all(self.required_merge_keys)
    reject_table.add(FieldSpec(basic_type="NVARCHAR", data_length=4000,
                               field_name="merge_report"))
    db_context.executeNoResults(reject_table.definition)

    # Move missing keys to rejects table
    self._move_rejects(dup_table, reject_table, 0, "has_missing > 0",
                       "'Missing required key on {}'".format(side))

    # If required, move duplicates to rejects table
    if not allow_dups:
        self._move_rejects(dup_table, reject_table, 0, "has_dups > 0",
                           "'Duplicate required key on {}'".format(side))
    return dup_table, reject_table
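# Illustrative call-site sketch (hypothetical, not taken from the surrounding
# module): the method above would presumably be invoked once per merge side,
# with the returned duplicates and rejects tables consumed by the rest of the
# merge pipeline. The attribute names mirror those used in _create_merge_table.
#
#     left_dups, left_rejects = self._process_required_key_dups_and_missing(
#         db_context, self.left_input_table, 'left', allow_dups=False)
#     right_dups, right_rejects = self._process_required_key_dups_and_missing(
#         db_context, self.right_input_table, 'right', allow_dups=True)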
def testWriteUnicodeCharacters(self):
    del self.table[:]
    for i in xrange(100):
        self.table.add(FieldSpec("col_{}".format(i), 'NVARCHAR', 8))
    fts = FastTableStream(self.table, use_names=False)
    with fts:
        for j in xrange(10000):
            fts.write([100 * j + i for i in xrange(100)])
    self.LOGGER.info("Table name {}".format(self.table))
def _build_table_columns(self, eachfield='', tableind=0, fieldind=0, itemlist=''):
    """Determine the column type for each field and create the corresponding
    FieldSpec object."""
    create_missing_column = False
    field_type = self._define_type(eachfield)
    if field_type == 'VARCHAR':
        # For VARCHAR fields, +5 is added as a buffer so that NOMINAL_ID values
        # can be converted to integers, recoded (180 becomes 18.0, i.e. 4 bytes),
        # and stored back as strings.
        if eachfield in self.mc_items_table_field_names:
            self.table.add(FieldSpec(field_name=self.mc_items_table_field_names[eachfield],
                                     basic_type=field_type,
                                     data_length=self.layoutdict[eachfield][3] + 5))
        else:
            self.table.add(FieldSpec(field_name=eachfield,
                                     basic_type=field_type,
                                     data_length=self.layoutdict[eachfield][3] + 5))
    elif field_type == 'FLOAT':
        if eachfield in self.mc_items_table_field_names:
            self.table.add(FieldSpec(field_name=self.mc_items_table_field_names[eachfield],
                                     basic_type=field_type,
                                     data_length=self.layoutdict[eachfield][3]))
        else:
            self.table.add(FieldSpec(field_name=eachfield, basic_type=field_type))

    # If a '.' recoding value is encountered, the field is tracked and an
    # additional column prefixed with MISSING_ is created for it.
    for each in self.recodingsdict[eachfield][1]:
        if each == '.':
            create_missing_column = True

    # self.missing_columns is a dictionary of each such field and its position in
    # the item list; it is processed later and inserted into the item list.
    if create_missing_column:
        if eachfield in self.mc_items_table_field_names:
            fieldname = 'MISSING_' + self.mc_items_table_field_names[eachfield]
            missing_collist = []
            for each in range(len(self.mc_items_tables_collist[tableind][0])):
                missing_collist.append(fieldname)
            self.missing_columns[fieldname] = [itemlist, tableind, fieldind, missing_collist]
            self.table.add(FieldSpec(fieldname, basic_type='VARCHAR', data_length=5))
        else:
            field_name = 'MISSING_' + eachfield
            self.table.add(FieldSpec(field_name, basic_type='VARCHAR', data_length=5))
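# Minimal sketch of the "+5 buffer" arithmetic described above (the recode rule
# is an assumption for illustration, not part of the original module): a
# NOMINAL_ID value such as "180" (layout width 3) is read as an integer, recoded
# to 18.0, and written back as the string "18.0", which needs one extra
# character; the +5 padding leaves comfortable headroom.
def _recoded_width(raw_value):
    # Hypothetical helper assuming the recode divides the integer value by 10.
    return len(str(int(raw_value) / 10.0))

assert _recoded_width("180") == 4  # "18.0" fits easily within width 3 + 5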
def testValidateFailWrongColumnCount(self):
    drop_table_if_exists(self.table)
    self.db_context.executeNoResults(self.table.definition)
    self.table.add(FieldSpec('col7', 'NVARCHAR', 15))
    table_stream = FastTableStream(self.table)
    try:
        table_stream.validate_write_inputs()
    except ValueError:
        # Expected error
        return
    self.fail(
        "Expected a ValueError if TableSpec has different column count from db")
def validate(self):
    # Make sure that the input and output tables are in the form of
    # TableSpec objects
    self.ds_in = get_table_spec(self.ds_in, self.db_context)
    self.db_context = self.ds_in.db_context
    self.run_context = self.db_context.runContext

    # Make sure that ds_in exists
    # (SAS 88-96)
    if not table_exists(self.ds_in):
        raise ValueError('Input table {} does not exist'.format(self.ds_in))
    self.ds_in.populate_from_connection()

    # Process output table name
    # (SAS 98-101)
    drop_table_if_exists(self.ds_out, self.db_context)
    self.ds_out = self.db_context.getTableSpec(self.ds_out)

    # Process output variable name
    # (SAS 103-111)
    self.output_col = FieldSpec(field_name=self.output_col_name,
                                basic_type="TINYINT", nullable=True)
    self.ds_out.add(self.output_col)

    # Process critical value
    # (SAS 113-117)
    if self.critical_value is None:
        self.critical_value = DEFAULT_CRITICAL_VALUE
        self.run_context.warning("Critical value has been set to default {}"
                                 .format(DEFAULT_CRITICAL_VALUE))

    # Process target groups
    # (SAS 118-121; 135-140)
    if len(self.target_group_cols) == 0:
        raise ValueError("List of target group columns not supplied")
    try:
        for i in range(len(self.target_group_cols)):
            self.target_group_cols[i] = self.ds_in[self.target_group_cols[i]]
    except KeyError:
        raise ValueError("Target group column {} not found in table {}"
                         .format(self.target_group_cols[i], self.ds_in))

    # Process parent groups
    # (SAS 123-126; 141-146)
    if len(self.parent_group_cols) == 0:
        raise ValueError("List of parent group columns not supplied")
    for i in range(len(self.parent_group_cols)):
        try:
            self.parent_group_cols[i] = self.ds_in[self.parent_group_cols[i]]
        except KeyError:
            raise ValueError("Parent group column {} not found in table {}"
                             .format(self.parent_group_cols[i], self.ds_in))

    # Select target group columns that are not in parent group columns.
    # No corresponding code in SAS. Deals with the case where columns defining
    # the parent group and columns defining the target group overlap.
    self.extra_target_group_cols = []
    for col in self.target_group_cols:
        if col not in self.parent_group_cols:
            self.extra_target_group_cols.append(col)

    # Process input variable
    # (SAS 128-134)
    try:
        self.input_col = self.ds_in[self.input_col_name]
    except KeyError:
        raise ValueError("Input variable {} not found in table {}"
                         .format(self.input_col_name, self.ds_in))

    # Clean where clauses
    self.parent_where_expression = self.clean_where(self.parent_where_expression)
    self.target_where_expression = self.clean_where(self.target_where_expression)

    # A more useful number for rounding test
    self.reciprocal_round_value = 1.0 / self.round_value
def _create_merge_table(self):
    merge_table = get_temp_table(self.db_context)
    merge_table.add_all(self.required_merge_keys)

    i = 1
    left_fields = []
    right_fields = []
    for key in self.optional_merge_keys:
        lkey = MergeFieldSpec(key.left_field, None, PRIORITY_LEFT_ONLY)
        lkey.field_name = "LOptKey_" + str(i)
        merge_table.add(lkey)
        left_fields.append(lkey)
        rkey = MergeFieldSpec(None, key.right_field, PRIORITY_RIGHT_ONLY)
        rkey.field_name = "ROptKey_" + str(i)
        merge_table.add(rkey)
        right_fields.append(rkey)
        i += 1

    i = 1
    for keyset in self.fuzzy_merge_keys:
        if isinstance(keyset, MergeFieldSpec):
            keyset = (keyset, None)
        if len(keyset) == 1:
            keyset = (keyset[0], None)
        if len(keyset) != 2:
            raise ValueError(
                "Fuzzy keys must be supplied singly or in pairs; received {}"
                .format(len(keyset)))
        similarity_column = FieldSpec(field_name="Similarity_{}".format(i),
                                      basic_type="FLOAT")
        merge_table.add(similarity_column)
        j = 1
        for key in keyset:
            if key is None:
                lkey = MergeFieldSpec(None, None, PRIORITY_LEFT_ONLY)
                lkey.field_name = "LFuzzyKey_{}_{}".format(i, j)
                lkey.basic_type = "NVARCHAR"
                lkey.data_length = 1
                merge_table.add(lkey)
                left_fields.append(lkey)
                rkey = MergeFieldSpec(None, None, PRIORITY_RIGHT_ONLY)
                rkey.field_name = "RFuzzyKey_{}_{}".format(i, j)
                rkey.basic_type = "NVARCHAR"
                rkey.data_length = 1
                merge_table.add(rkey)
                right_fields.append(rkey)
            else:
                lkey = MergeFieldSpec(key.left_field, None, PRIORITY_LEFT_ONLY)
                lkey.field_name = "LFuzzyKey_{}_{}".format(i, j)
                merge_table.add(lkey)
                left_fields.append(lkey)
                rkey = MergeFieldSpec(None, key.right_field, PRIORITY_RIGHT_ONLY)
                rkey.field_name = "RFuzzyKey_{}_{}".format(i, j)
                merge_table.add(rkey)
                right_fields.append(rkey)
            j += 1
        i += 1

    merge_table.create_foreign_key(self.left_input_table, True, 'fk_left_')
    merge_table.create_foreign_key(self.right_input_table, True, 'fk_right_')
    merge_table.add(FieldSpec('has_dups_l', 'TINYINT'))
    merge_table.add(FieldSpec('has_dups_r', 'TINYINT'))
    merge_table.add(FieldSpec('reject', 'TINYINT'))
    merge_table.add(FieldSpec('merge_report', 'NVARCHAR', 4000))
    self.db_context.executeNoResults(merge_table.definition)
    return merge_table, left_fields, right_fields
def readAggData(self):
    # Validate the input file
    self.ds_in = get_table_spec(self.ds_in, self.db_context)
    self.ds_in.populate_from_connection()
    self.db_context = self.ds_in.db_context
    self.run_context = self.db_context.runContext
    self.run_context.debug("processing control file")
    if not table_exists(self.ds_in):
        raise ValueError("Input dataset {} does not exist".format(self.ds_in))

    # Read the control file
    # SAS 3-9
    if self.use_excel:
        reader = SafeExcelReader(self.run_context, self.agg_ds, self.agg_sheet)
        self.agg_data = [row for row in reader.getRows()]
    else:
        self.agg_ds = get_table_spec(self.agg_ds, self.db_context)
        self.agg_data = dump(self.agg_ds)

    # Validate the control file columns
    # SAS 10-28
    missing_vars = set()
    for var_name in ['outputvar', 'inputvar', 'targetlevel', 'targetid',
                     'wheret', 'wheret_value', 'parentlevel', 'parentid',
                     'wherep', 'wherep_value', 'critval']:
        if var_name not in self.agg_data[0]:
            missing_vars.add(var_name)
    if missing_vars:
        raise ValueError("TTest control sheet lacks required columns: {:', '}"
                         .format(Joiner(missing_vars)))

    # Validate existence of requested columns
    # SAS 29-86
    for row in self.agg_data:
        if row.wheret is None:
            row.wheret = []
        else:
            row.wheret = [x.strip().lower() for x in row.wheret.strip().split('*')]
        if row.wherep is None:
            row.wherep = []
        else:
            row.wherep = [x.strip().lower() for x in row.wherep.strip().split('*')]
        if row.wheret_value is None:
            row.wheret_value = []
        else:
            row.wheret_value = [x.strip() for x in row.wheret_value.strip().split(' ')]
        if row.wherep_value is None:
            row.wherep_value = []
        else:
            row.wherep_value = [x.strip() for x in row.wherep_value.strip().split(' ')]
        row.inputvar = row.inputvar.lower().strip()
        row.targetid = row.targetid.lower().strip()
        row.parentid = row.parentid.lower().strip()
        row.targetlevel = row.targetlevel.lower().strip()
        row.parentlevel = row.parentlevel.lower().strip()
        for var_name in (row.wheret + row.wherep
                         + [row.inputvar, row.targetid, row.parentid]):
            if var_name != '' and var_name not in self.ds_in:
                missing_vars.add(var_name)
    if missing_vars:
        raise ValueError("TTest input data lacks required variables: {:', '}"
                         .format(Joiner(missing_vars)))

    # Sort control data
    # SAS 87-90
    self.agg_data.sort(key=lambda row: (row.targetlevel, row.parentlevel))

    # Check for consistency across "target" and "parent" variables.
    # SAS 91-222
    last_targetlevel = _NONE_LEVEL
    last_parentlevel = _NONE_LEVEL
    self.target_levels = []
    messages = []
    for row in self.agg_data:
        wheret = tuple(row.wheret)
        wheret_value = tuple(row.wheret_value)
        if len(wheret) != len(wheret_value):
            messages.append('Number of wheret_value items must match number of '
                            'wheret items ("{0}" vs "{1}")'
                            .format(row.wheret, row.wheret_value))
        if row.targetlevel != last_targetlevel.level:
            last_targetlevel = LevelData(row.targetlevel, row.targetid,
                                         wheret, wheret_value)
            self.target_levels.append(last_targetlevel)
            last_parentlevel = _NONE_LEVEL
            # Create an output table in which to accumulate the results
            table_name = 'ttest_' + row.targetlevel
            last_targetlevel.output_table = TableSpec(self.db_context, table_name)
            last_targetlevel.output_table.add(self.ds_in[row.targetid].clone())
        else:
            last_targetlevel.check(row.targetid, wheret, messages)

        wherep = tuple(row.wherep)
        wherep_value = tuple(row.wherep_value)
        if len(wherep) != len(wherep_value):
            messages.append('Number of wherep_value items must match number of '
                            'wherep items ("{0}" vs "{1}")'
                            .format(row.wherep, row.wherep_value))
        if row.parentlevel != last_parentlevel.level:
            last_parentlevel = LevelData(row.parentlevel, row.parentid,
                                         wherep, wherep_value)
            last_targetlevel.contents.append(last_parentlevel)
        else:
            last_parentlevel.check(row.parentid, wherep, messages)

        last_parentlevel.contents.append(row)
        last_targetlevel.output_table.add(FieldSpec(row.outputvar, 'TINYINT'))

        try:
            row.critval = float(row.critval)
            if not MIN_CRITVAL <= row.critval <= MAX_CRITVAL:
                messages.append("Bad critical value {} is not between {} and {}"
                                .format(row.critval, MIN_CRITVAL, MAX_CRITVAL))
        except ValueError:
            messages.append("Critical value {} is not a float".format(row.critval))
        try:
            row.outputvar = db_identifier_quote(row.outputvar)
        except ValueError:
            messages.append("Output variable name {} is not a valid database identifier"
                            .format(row.outputvar))
        try:
            row.targetlevel = db_identifier_quote(row.targetlevel)
        except ValueError:
            messages.append("Target level name {} is not a valid database identifier"
                            .format(row.targetlevel))

    for message in messages:
        self.run_context.error(message)
    if messages:
        raise ValueError("Invalid inputs to ttest macro. See log for details")
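# Worked example (assumed sample values, not from an actual control file) of how
# readAggData tokenizes the where columns above: "wheret"/"wherep" strings are
# split on '*' and lower-cased, while the matching "_value" strings are split on
# spaces, so each token list must line up one-to-one with its value list.
wheret = 'Grade * Subject'
wheret_value = '03 Math'
tokens = [x.strip().lower() for x in wheret.strip().split('*')]
values = [x.strip() for x in wheret_value.strip().split(' ')]
assert tokens == ['grade', 'subject']
assert values == ['03', 'Math']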
def test_identity(self):
    o_u_t = FieldSpec('my_field', 'int', identity=(1, 1))
    self.assertEqual(o_u_t.definition, '[my_field] INT IDENTITY(1,1)',
                     "Did not get correct field definition")