def test_split():
    L = [(i*2, i*2+1) for i in range(10)]
    iters = common.split(L, n=2)
    assert zip(*iters) == L
    for i in [0, 1]:
        iters = common.split(L, n=2)
        iters[i].next()
        try:
            iters[i].next()
            assert False
        except IndexError as e:
            assert e.args == (i,)
def match_condition(nk):
    """FUTURE: t.udp_nk = s.udp_nk"""
    output = []
    for nk_column in split(nk):
        source_nk_column = add_alias(nk_column, 's')
        target_nk_column = add_alias(nk_column, 't')
        output.append(f'{target_nk_column}={source_nk_column}')
    return ' and '.join(output)
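# Hedged usage sketch for match_condition(): split() and add_alias() are
# defined elsewhere in this code base, so the stand-ins below are assumptions
# made only to show the shape of the output; the real helpers may differ.
def split(text):
    # assumed: comma-delimited string -> list of column names
    return [part.strip() for part in text.split(',') if part.strip()]

def add_alias(column_name, alias):
    # assumed: prefix a column name with a table alias
    return f'{alias}.{column_name}'

assert match_condition('customer_id, site_id') == \
    't.customer_id=s.customer_id and t.site_id=s.site_id'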
def modify_idx_label():
    df = get_df_1()

    # modify row names
    df.index.set_names('Prov.', inplace=True)

    # modify row index label
    df.rename_axis({'SH': 'HLJ'}, inplace=True)  # returns a new df by default if inplace is not set
    print(df)
    split()

    # modify column index label
    df.rename_axis({'b': 'B', 'c': 'cc'}, inplace=True, axis=1)
    print(df)
    split()
def read_data_input(filename):
    with open(filename) as fi:
        lines = fi.readlines()
        chunks = list(split(lines, "\n"))
        rules = chunks[0]
        values = [c.strip() for c in chunks[1]]
        rule_dict = parse_rule_dict(rules)
        return rule_dict, values
def read_passports(filename, required):
    with open(filename) as fi:
        passport_list = []
        lines = fi.readlines()
        for passport_info in split(lines, "\n"):
            if passport_info:
                passport_fields = parse_fields(passport_info, required)
                passport_list.append(Passport(passport_fields, required))
        return passport_list
def main():
    listener = ListenerFile('listener.txt')
    listener.clear()
    while True:
        time.sleep(1)
        command = listener.listen()
        if command in split('bye end exit quit stop'):
            logger.info(f'Exit command: {command}')
            listener.clear()
            break
def ffnp(t, T):
    """Compute the ffnp reduction of t, using T as auxiliary information.

    Args:
        t: a vector
        T: an LDL decomposition tree

    Format: coefficient
    """
    n = len(t[0])
    z = [None, None]
    if (n > 1):
        l10, T0, T1 = T
        z[1] = merge(ffnp(split(t[1]), T1))
        t0b = add(t[0], mul(sub(t[1], z[1]), l10))
        z[0] = merge(ffnp(split(t0b), T0))
        return z
    elif (n == 1):
        z[0] = [round(t[0][0])]
        z[1] = [round(t[1][0])]
        return z
def format_join(text, schema_name):
    text = clean_sql(text)
    join_keywords = split(
        'full, left, right, inner, outer, cross, join, on, and, or, not')
    output = []
    last_token = ''
    for token in text.split():
        if token in join_keywords or not token[0].isalpha():
            output.append(token)
        else:
            if '..' in token:
                token = q(token.partition('..')[2])
            elif token.startswith('dbo.'):
                token = q(token[4:])
            elif '.' in token:
                alias_name, separator, table_name = token.partition('.')
                token = f'{q(alias_name)}.{q(table_name)}'
            else:
                token = q(token)

            # add schema name if last token ends with 'join' and token missing schema name
            if last_token.endswith('join') and '.' not in token:
                token = f'{q(schema_name)}.{token}'
            output.append(token)
        last_token = token
    text = ' '.join(output)

    # convert join keyword phrases to tokens
    for join_keyword_phrase in join_keyword_phrases:
        join_keyword_token = join_keyword_phrase.replace(' ', '::')
        text = text.replace(join_keyword_phrase, join_keyword_token)

    # format joins into 2-line clauses
    output = []
    for token in text.split():
        if token.endswith('join'):
            token = f'\n{spaces(2)}{token}'
        elif token == 'on':
            token = f'\n{spaces(4)}{token}'
        output.append(token + ' ')

    # expand join keyword tokens back to join keyword phrases
    text = ''.join(output)
    text = text.replace('::', ' ')
    return text
def read_input_data(filename):
    with open(filename) as fi:
        lines = fi.readlines()
        decks = []
        for chunk in split(lines, "\n"):
            deck = []
            header = chunk[0]
            assert header[0:6] == "Player"
            for value in chunk[1:]:
                deck.append(int(value.strip()))
            decks.append(deck)
        return RecursiveCombatGame(deque(decks[0]), deque(decks[1]))
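# A minimal sketch of the blank-line chunking helper that read_data_input(),
# read_passports() and read_input_data() above all rely on. Its behavior is
# inferred from those call sites (readlines() yields "\n" for blank lines);
# the real split() may differ, e.g. list-based instead of a generator.
def split(items, sep):
    chunk = []
    for item in items:
        if item == sep:  # a separator element closes the current chunk
            if chunk:
                yield chunk
            chunk = []
        else:
            chunk.append(item)
    if chunk:
        yield chunk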
def main():
    trainset = common.load_data(TRAINSET_PATH, sep=',')
    trainset = common.onehot_encode(trainset, 0)
    for i in range(N_MODEL):
        x_train, x_test, y_train, y_test = common.split(trainset, i)
        x_train, x_test = common.normalize(x_train, x_test)
        model, history = train(x_train, y_train, N_EPOCH)
        model.evaluate(x_test, y_test)
        model.save(common.numbering(MODEL_PATH, i))
        save_history(history, common.numbering(HISTORY_PATH, i))
        print(i, ' is done.')
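# Hedged sketch only: common.split() is not shown here. From the call above it
# returns (x_train, x_test, y_train, y_test) for model index i; one plausible
# implementation (the label/feature layout is an assumption) is a seeded
# train/test split such as:
from sklearn.model_selection import train_test_split

def split(dataset, seed):
    features, labels = dataset[:, :-1], dataset[:, -1]
    return train_test_split(features, labels, test_size=0.2, random_state=seed)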
def __init__(self, schema_name, table_name, column_names):
    self.schema_name = schema_name
    self.table_name = table_name
    self.column_names = split(column_names)
    self.table_prefix = ""
    self.table_suffix = ""
    self.natural_key = ""
    self.cdc = ""
    self.timestamp = ""
    self.first_timestamp = ""
    self.rowversion = ""
    self.first_rowversion = ""
    self.select = ""
    self.where = ""
    self.ignore = ""
    self.order = ""
def get_probs_for_uncertain(uncertainset):
    trainset = common.load_data(TRAINSET_PATH, sep=',')
    encoded_uncertainset = common.onehot_encode(
        uncertainset[:, common.N_DISASTER:], 0)
    encoded_trainset = common.onehot_encode(trainset, 0)
    prob_sums = np.zeros((len(uncertainset), common.N_CLASS))
    for i in range(N_MODEL):
        x_train, _, _, _ = common.split(encoded_trainset, i)
        _, normalized_uncertainset = common.normalize(x_train, encoded_uncertainset)
        prob_sums += tf.keras.models.load_model(common.numbering(
            MODEL_PATH, i)).predict(normalized_uncertainset)
        print(f'{i} is done.')
    return prob_sums / N_MODEL
def timestamp_logic(self, current_timestamp, last_timestamp=None):
    timestamp_columns = add_aliases(split(self.table.timestamp))
    if not timestamp_columns:
        self.timestamp_value = f"'{current_timestamp:%Y-%m-%d %H:%M:%S}'"
        self.timestamp_where_condition = ''
    else:
        if len(timestamp_columns) == 1:
            timestamp_value = q(timestamp_columns[0])
        else:
            # build timestamp column values as ("created_at"), ("updated_at"), ("other_timestamp")
            timestamp_values = ', '.join([
                f'({q(column_name)})' for column_name in timestamp_columns
            ])
            timestamp_value = f'(select max("v") from (values {timestamp_values}) as value("v"))'
        self.timestamp_value = timestamp_value
        self.timestamp_where_condition = expand(
            self.timestamp_where_template)
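# Illustrative only: with self.table.timestamp = 'created_at, updated_at' the
# multi-column branch above builds roughly this SQL expression (the exact
# aliasing/quoting depends on add_aliases() and q(), which are defined
# elsewhere in this code base):
#   (select max("v") from (values ("created_at"), ("updated_at")) as value("v"))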
def fft(f):
    """Compute the FFT of a polynomial mod (x ** n + 1).

    Args:
        f: a polynomial

    Format: input as coefficients, output as FFT
    """
    n = len(f)
    if (n > 2):
        f0, f1 = split(f)
        f0_fft = fft(f0)
        f1_fft = fft(f1)
        f_fft = merge_fft([f0_fft, f1_fft])
    elif (n == 2):
        f_fft = [0] * n
        f_fft[0] = f[0] + 1j * f[1]
        f_fft[1] = f[0] - 1j * f[1]
    return f_fft
def ntt(f, q):
    """Compute the NTT of a polynomial.

    Args:
        f: a polynomial

    Format: input as coefficients, output as NTT
    """
    n = len(f)
    if (n > 2):
        f0, f1 = split(f)
        f0_ntt = ntt(f0, q)
        f1_ntt = ntt(f1, q)
        f_ntt = merge_ntt([f0_ntt, f1_ntt], q)
    elif (n == 2):
        f_ntt = [0] * n
        f_ntt[0] = (f[0] + sqr1[q] * f[1]) % q
        f_ntt[1] = (f[0] - sqr1[q] * f[1]) % q
    return f_ntt
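# A minimal sketch, assuming split()/merge() are the usual even/odd
# coefficient split used by this style of recursive FFT/NTT (as in the Falcon
# reference code); the helpers actually used here may differ in detail.
def split(f):
    # f(x) = f0(x^2) + x * f1(x^2): even-indexed and odd-indexed coefficients
    n = len(f)
    f0 = [f[2 * i + 0] for i in range(n // 2)]
    f1 = [f[2 * i + 1] for i in range(n // 2)]
    return [f0, f1]

def merge(f_list):
    # inverse of split(): interleave the two halves back into one polynomial
    f0, f1 = f_list
    n = 2 * len(f0)
    f = [0] * n
    for i in range(n // 2):
        f[2 * i + 0] = f0[i]
        f[2 * i + 1] = f1[i]
    return f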
def demodulate(symbols, filters, freqs, sampler):
    streams = []
    symbol_list = []
    errors = {}

    def error_handler(received, decoded, freq):
        errors.setdefault(freq, []).append(received / decoded)

    generators = common.split(symbols, n=len(freqs))
    for freq, S in zip(freqs, generators):
        S = filters[freq](S)
        if pylab:
            equalized = []
            S = common.icapture(S, result=equalized)
            symbol_list.append(equalized)
        freq_handler = functools.partial(error_handler, freq=freq)
        bits = modem.qam.decode(S, freq_handler)  # list of bit tuples
        streams.append(bits)  # stream per frequency

    stats['symbol_list'] = symbol_list
    stats['rx_bits'] = 0
    stats['rx_start'] = time.time()

    log.info('Demodulation started')
    for i, block in enumerate(itertools.izip(*streams)):  # block per frequency
        for bits in block:
            stats['rx_bits'] = stats['rx_bits'] + len(bits)
            yield bits

        if i and i % config.baud == 0:
            mean_err = np.array([e for v in errors.values() for e in v])
            correction = np.mean(np.angle(mean_err)) / (2 * np.pi)
            duration = time.time() - stats['rx_start']
            log.debug('%10.1f kB, realtime: %6.2f%%, sampling error: %+.3f%%',
                      stats['rx_bits'] / 8e3,
                      duration * 100.0 / (i * config.Tsym),
                      correction * 1e2)
            errors.clear()
            sampler.freq -= 0.01 * correction / config.Fc
            sampler.offset -= correction
def select_cell():
    df = get_df_1()

    # select a cell by index labels via .at, which is faster than .loc
    a = df.at['JS', 'b']
    print('df["JS"]["b"] = %d' % a)
    split()

    # equivalent to .at, but slower
    a = df.loc['JS', 'b']
    print('df["JS"]["b"] = %d' % a)
    split()

    # select a cell by positions via .iat, which is faster than .iloc
    a = df.iat[3, 1]
    print('df[3][1] = %d' % a)
    split()

    # equivalent to .iat, but slower
    a = df.iloc[3, 1]
    print('df[3][1] = %d' % a)
    split()
def sort():
    df = get_df_1()

    # sort by row index label
    df_new = df.sort_index()
    print(df_new)
    split()

    # sort by row index label in descending order
    df_new = df.sort_index(ascending=False)
    print(df_new)
    split()

    # sort by column index label in descending order
    df_new = df.sort_index(ascending=False, axis=1)
    print(df_new)
    split()

    # sort by column value in descending order
    df_new = df.sort_values('b', ascending=False)
    print(df_new)
    split()
def aggr():
    df = get_df_2()

    # select year 2015 onward, dropping earlier data
    df = df.loc['2015':]
    print(df)
    split()

    # sum rows by year; 'A' means annual resampling
    df = df.resample('A').sum()
    print(df)
    split()

    # add a column with the year-over-year growth percentage
    df['growth'] = df.pct_change() * 100
    print(df)
    split()
def handle_nan():
    df = get_df_1()

    # drop rows containing at least one NaN
    df_new = df.dropna(how='any')
    print(df_new)
    split()

    # drop rows where all values are NaN
    df_new = df.dropna(how='all')
    print(df_new)
    split()

    # fill NaN with a default value
    df_new = df.fillna(value=60)
    print(df_new)
    split()
def process_table(self, db, db_engine, schema_name, table_name, table_object,
                  table_history, current_timestamp):
    """Process a specific table."""

    # skip default table and ignored tables
    if table_name == 'default':
        return
    elif table_object.ignore_table:
        logger.info(f'Skipping table: {table_name} (ignore_table=1)')
        return
    elif table_object.drop_table:
        logger.info(f'Skipping table: {table_name} (drop_table=1)')
        return

    # initialize table history's last timestamp to first timestamp if not set yet
    if not table_history.last_timestamp:
        # default first timestamp to 1900-01-01 if project has no first timestamp
        if not table_object.first_timestamp:
            table_object.first_timestamp = '1900-01-01'
        table_history.last_timestamp = iso_to_datetime(
            table_object.first_timestamp)

    # skip table if last timestamp > current timestamp, eg. tables pre-configured for the future
    if table_history.last_timestamp > current_timestamp:
        explanation = f'first/last timestamp {table_history.last_timestamp} > current timestamp {current_timestamp}'
        logger.info(f'Skipping table: {table_name} ({explanation})')
        return

    # if we're here then we have a legit last timestamp value to use for CDC
    last_timestamp = table_history.last_timestamp

    self.stats.start(table_name, 'table')
    # logger.info(f'Processing {table_name} ...')

    # create a fresh cursor for each table
    cursor = db.conn.cursor()

    # save table object for stage
    output_stream = open(f'{self.work_folder_name}/{table_name}.table', 'wb')
    pickle.dump(table_object, output_stream)
    output_stream.close()

    # discover table schema
    table_schema = db_engine.select_table_schema(schema_name, table_name)

    # remove ignored columns from table schema
    if table_object.ignore_columns:
        # find columns to ignore (remove) based on ignore column names/glob-style patterns
        ignore_columns = []
        for column_name in table_schema.columns:
            for pattern in split(table_object.ignore_columns):
                # use fnmatch() to provide glob style matching
                if fnmatch.fnmatch(column_name.lower(), pattern.lower()):
                    ignore_columns.append(column_name)

        # delete ignored columns from our table schema
        for column_name in ignore_columns:
            logger.info(f'Ignore_column: {table_name}.{column_name}')
            table_schema.columns.pop(column_name)

    # save table schema for stage to use
    output_stream = open(f'{self.work_folder_name}/{table_name}.schema', 'wb')
    pickle.dump(table_schema, output_stream)
    output_stream.close()

    # save table pk for stage to use
    pk_columns = db_engine.select_table_pk(schema_name, table_name)
    if not pk_columns and table_object.primary_key:
        pk_columns = table_object.primary_key
    output_stream = open(f'{self.work_folder_name}/{table_name}.pk', 'w')
    output_stream.write(pk_columns)
    output_stream.close()

    # clear cdc if it doesn't match timestamp/rowversion
    table_object.cdc = table_object.cdc.lower()
    if not table_object.cdc or table_object.cdc not in ('timestamp', 'rowversion'):
        table_object.cdc = ''

    # if no pk_columns, then clear table cdc
    if not pk_columns:
        if table_object.cdc and table_object.cdc != 'none':
            logger.info(
                f'Warning: {table_name} cdc={table_object.cdc} but table has no pk column(s)'
            )
        table_object.cdc = 'none'
        # we still keep timestamp because it's required for filtering first_timestamp - current_timestamp
        # if table_object.timestamp:
        #     logger.info(f'Warning: {table_name} timestamp={table_object.timestamp} but table has no pk column(s)')
        #     table_object.timestamp = ''

    # update table object properties for cdc select build
    column_names = list(table_schema.columns.keys())
    table_object.schema_name = schema_name
    table_object.table_name = table_name
    table_object.column_names = column_names
    select_cdc = cdc_select.SelectCDC(table_object)
    sql = select_cdc.select(self.job_id, current_timestamp, last_timestamp)
    # logger.info(f'Capture SQL:\n{sql}\n')

    # run sql here vs via db_engine.capture_select
    # cursor = db_engine.capture_select(schema_name, table_name, column_names, last_timestamp, current_timestamp)
    cursor.execute(sql)

    # capture rows in fixed size batches to support unlimited size record counts
    # Note: Batching on capture side allows stage to insert multiple batches in parallel.
    if self.project.batch_size:
        batch_size = int(self.project.batch_size)
        # logger.info(f'Using project specific batch size: {self.project.batch_size}')
    else:
        batch_size = 1_000_000

    batch_number = 0
    row_count = 0
    file_size = 0
    while True:
        batch_number += 1
        rows = cursor.fetchmany(batch_size)
        if not rows:
            break

        logger.info(
            f'Table({table_name}): batch={batch_number} using batch size {batch_size:,}'
        )

        # flatten rows to list of column values
        json_rows = [list(row) for row in rows]
        output_file = f'{self.work_folder_name}/{table_name}#{batch_number:04}.json'
        with open(output_file, 'w') as output_stream:
            # indent=2 for debugging
            json.dump(json_rows, output_stream, indent=2, default=json_serializer)

        # track stats
        row_count += len(json_rows)
        file_size += pathlib.Path(output_file).stat().st_size

    # if no cdc but order is set, compute a file hash to see if output is identical to the last run
    if (not table_object.cdc or table_object.cdc == 'none') and table_object.order:
        print(
            f'Checking {table_name} file hash based on cdc={table_object.cdc} and order={table_object.order}'
        )
        table_data_files = f'{self.work_folder_name}/{table_name}#*.json'
        current_filehash = hash_files(table_data_files)
        if table_history.last_filehash == current_filehash:
            # suppress this update
            print(
                f'Table({table_name}): identical file hash, update suppressed'
            )
            logger.info(
                f'Table({table_name}): identical file hash, update suppressed'
            )
            row_count = 0
            file_size = 0

            # delete exported json files
            delete_files(table_data_files)
        else:
            print(
                f'Table({table_name}): {table_history.last_filehash} != {current_filehash}'
            )
            table_history.last_filehash = current_filehash

    # update table history with new last timestamp value
    table_history.last_timestamp = current_timestamp

    # track total row count and file size across all of a table's batched json files
    self.stats.stop(table_name, row_count, file_size)

    # save interim state of stats for diagnostics
    self.stats.save()
    self.job_row_count += row_count
    self.job_file_size += file_size

    # explicitly close cursor when finished
    # cursor.close()
    return
def display():
    df = get_df_1()

    print(df.info())      # basic info
    split()
    print(df.shape)       # dimension as (n, m); rows = df.shape[0], columns = df.shape[1]
    split()
    print(df.index)       # row index info
    split()
    print(df.columns)     # column index info
    split()
    print(df.head(4))     # first 4 rows
    split()
    print(df.tail())      # last 5 rows
    split()
    print(df.describe())  # basic stats
    split()
    print(df.values)      # all values
    split()
# Let's look at the error for this image:
# "/mnt/VegaSlowDataDisk/c3po/Images_aquises/DonneesPI/timeLapsePhotos_Pi1_4/image_2019-06-15_04-16-45.jpg"
# The false positives (fp) are found correctly
# ds_to_analyze = ["image_2019-06-15_04-16-45.jpg"]

path_model_saved = "/mnt/BigFast/VegaFastExtension/Rpackages/c3po_all/c3po_interface_mark/Materiels/Models/Yolo_models/"
neurone = "training_jeux_difficile_updated"
neurone = "training_jeux_difficile"
# neurone = "generateur_sans_flip_2000"
string = path_model_saved + neurone

imagettes = pd.read_csv(
    "/mnt/BigFast/VegaFastExtension/Rpackages/c3po_all/c3po/Images_aquises/imagettes.csv"
)
imagettes = common.to_reference_labels(imagettes, "classe")
index_train, index_test = common.split(imagettes)

# Choose index_test or index_train
index = index_test

# images, labels, labels2 = common.read_imagettes(imagettes[imagettes["filename"].isin(index)])
images, labels, labels2 = common.read_imagettes(
    imagettes[imagettes["filename"].isin(index)])
images = np.array(images, dtype=np.float32) / 255
labels = np.array(labels, dtype=np.float32)

dataset = tf.data.Dataset.from_tensor_slices(
    (images, labels)).batch(config.batch_size)

Model = model.model(config.nbr_classes, config.nbr_boxes, config.cellule_y,
                    config.cellule_x)
checkpoint = tf.train.Checkpoint(model=Model)
def iterate_null_columns(configs, dbs, workbook, worksheet, h_index=0, v_index=0):
    config_index = 0
    db_index = 0

    # Workbook Formats
    header_format = workbook.add_format({'bold': True, 'underline': True})
    sub_header_format = workbook.add_format({'bold': True})
    percent_format = workbook.add_format({'num_format': '0.00%'})

    # Loop through all configs and all dbs. config[1] is correlated with dbs[1] etc.
    while config_index < len(configs):
        worksheet.write(h_index, v_index, configs[config_index]('namespace').dataset, header_format)
        worksheet.write(h_index, v_index + 1, 'Null Count', header_format)
        worksheet.write(h_index, v_index + 2, 'UDP Null Count', header_format)
        worksheet.write(h_index, v_index + 3, 'Row Count Difference', header_format)
        worksheet.write(h_index, v_index + 4, 'Row Count % Difference', header_format)
        h_index += 1

        # Set up UDP variables
        target_schema = configs[config_index]('namespace').dataset
        target_db = dbs[len(dbs) - 1]

        # Loop through all tables in configs
        # add logic to ignore tables where ignore_flag = 1
        for table in (t for t in configs[config_index].sections if 'table:' in t):
            table_name = table.partition(':')[2]
            src_null_columns = dbs[db_index].select_nullable_columns(dbs[db_index].schema, table_name)
            worksheet.write(h_index, v_index, table_name, sub_header_format)

            # remove ignored columns from src_null_columns list
            ignore_columns = split(configs[config_index](table).ignore_columns)
            # ToDo: Switch this to support Glob Pattern (ex: first_*)
            if ignore_columns:
                for column in ignore_columns:
                    if column.lower().strip() in src_null_columns:
                        src_null_columns.remove(column.lower().strip())

            # Gap between tables
            h_index += 1

            # Loop through all nullable columns
            for null_column in src_null_columns:
                # ignore_flag = configs[config_index](table).ignore_columns
                worksheet.write(h_index, v_index, null_column)
                src_null_count = dbs[db_index].select_null_count(dbs[db_index].schema, table_name, null_column)
                target_null_count = target_db.select_null_count(target_schema, table_name, null_column)

                # write source null count
                worksheet.write(h_index, v_index + 1, src_null_count)
                if target_null_count is None:
                    worksheet.write(h_index, v_index + 2, 'Column Not Found')
                else:
                    # write udp null count
                    worksheet.write(h_index, v_index + 2, target_null_count)

                # write excel functions
                worksheet.write_formula(h_index, v_index + 3, f'=B{h_index+1} - C{h_index+1}')
                worksheet.write_formula(h_index, v_index + 4,
                                        f'=IF(AND(B{h_index+1}=0,'
                                        f'C{h_index+1}=0),"100",'
                                        f'C{h_index+1}/B{h_index+1} *100)&"%"',
                                        percent_format)
                h_index += 1
            h_index += 1
        db_index += 1
        config_index += 1
def set_nk_value(nk):
    nk_columns = split(nk)
    nk_column_names = ', '.join(add_aliases(nk_columns, 't'))
    return f"concat_ws(':', {nk_column_names})"
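# Hedged example (same split()/alias assumptions as in the match_condition()
# sketch above):
#   set_nk_value('customer_id, site_id')
#   -> "concat_ws(':', t.customer_id, t.site_id)"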
import pandas as pd
from pandas import DataFrame
import numpy as np

import advanced
import basic
from common import split

split()
# basic.run()
advanced.run()
def amp_out(time):
    """A double-decay diminuendo."""
    dest, times, curves = split(amp_in(time), 3)
    return interleave(dest[-1::-1], [0.] + times[-1:0:-1], curves)
def iterate_column_min_max(configs, dbs, workbook, worksheet, h_index=0, v_index=0):
    config_index = 0
    db_index = 0

    # Workbook Formats
    header_format = workbook.add_format({'bold': True, 'underline': True})
    sub_header_format = workbook.add_format({'bold': True})
    percent_format = workbook.add_format({'num_format': '0.00%'})

    # Loop through all configs and all dbs. config[1] is correlated with dbs[1] etc.
    while config_index < len(configs):
        worksheet.write(h_index, v_index, configs[config_index]('namespace').dataset, header_format)
        worksheet.write(h_index, v_index + 1, 'Source Column Min Length', header_format)
        worksheet.write(h_index, v_index + 2, 'Target Column Min Length', header_format)
        worksheet.write(h_index, v_index + 3, 'Source Column Max Length', header_format)
        worksheet.write(h_index, v_index + 4, 'Target Column Max Length', header_format)
        h_index += 1

        # Set up UDP variables
        target_schema = configs[config_index]('namespace').dataset
        target_db = dbs[len(dbs) - 1]

        # Loop through all tables in configs
        # add logic to ignore tables where ignore_flag = 1
        for table in (t for t in configs[config_index].sections if 'table:' in t):
            table_name = table.partition(':')[2]
            src_columns = dbs[db_index].select_columns_with_datatype(dbs[db_index].schema, table_name)

            # Lower all fields in src_columns
            # src_columns = [column.column.lower() for column in src_columns]

            # Write table name
            worksheet.write(h_index, v_index, table_name, sub_header_format)

            # remove ignored columns from src_null_columns list
            # ToDo: Use the split() method from Common.py instead
            ignore_columns = split(configs[config_index](table).ignore_columns)

            # MMG: be "truthy"
            # if ignore_columns[0] == '':
            if not ignore_columns:
                final_src_columns = src_columns
            else:
                final_src_columns = list()
                ignore_columns = [column.lower() for column in ignore_columns]
                for column_desc in src_columns:
                    if column_desc.column.lower() not in ignore_columns:
                        final_src_columns.append(column_desc)

            # Increment each column
            h_index += 1
            # print(final_src_columns)

            # Loop through all columns
            for column in final_src_columns:
                if column.data_type in ('char', 'nchar', 'nvarchar', 'varchar'):
                    worksheet.write(h_index, v_index, column.column)
                    src_column_min_max = dbs[db_index].select_min_max_len(dbs[db_index].schema, table_name, column.column)
                    target_column_min_max = target_db.select_min_max_len(target_schema, table_name, column.column)
                    worksheet.write(h_index, v_index + 1, src_column_min_max.min)
                    worksheet.write(h_index, v_index + 2, target_column_min_max.min)
                    worksheet.write(h_index, v_index + 3, src_column_min_max.max)
                    worksheet.write(h_index, v_index + 4, target_column_min_max.max)
                    h_index += 1
                else:
                    worksheet.write(h_index, v_index, column.column)
                    if column.data_type == 'bit':
                        src_column_min_max = dbs[db_index].select_min_max_len_cast(dbs[db_index].schema, table_name, column.column)
                        target_column_min_max = target_db.select_min_max_len_cast(target_schema, table_name, column.column)
                    else:
                        src_column_min_max = dbs[db_index].select_min_max(dbs[db_index].schema, table_name, column.column)
                        target_column_min_max = target_db.select_min_max(target_schema, table_name, column.column)

                    if column.data_type in ('date', 'datetime', 'datetime2', 'datetime3', 'smalldatetime'):
                        if src_column_min_max.min is None:
                            # This means the table is empty.
                            pass
                        else:
                            worksheet.write_datetime(h_index, v_index + 1, src_column_min_max.min)
                            worksheet.write_datetime(h_index, v_index + 2, target_column_min_max.min)
                            worksheet.write_datetime(h_index, v_index + 3, src_column_min_max.max)
                            worksheet.write_datetime(h_index, v_index + 4, target_column_min_max.max)
                    else:
                        worksheet.write(h_index, v_index + 1, src_column_min_max.min)
                        worksheet.write(h_index, v_index + 2, target_column_min_max.min)
                        worksheet.write(h_index, v_index + 3, src_column_min_max.max)
                        worksheet.write(h_index, v_index + 4, target_column_min_max.max)
                    h_index += 1
            h_index += 1
        db_index += 1
        config_index += 1
def process_table(self, db, db_engine, schema_name, table_name, table_object,
                  table_history, current_timestamp, current_sequence=0):
    """Process a specific table."""

    # skip default table and ignored tables
    if table_name == 'default':
        return
    # TODO: Allow ignore and drop table conditions to be passed to archive (log table state) and stage (to drop table and table references)
    elif table_object.ignore_table:
        logger.info(f'Skipping table: {table_name} (ignore_table=1)')
        return
    elif table_object.drop_table:
        logger.info(f'Skipping table: {table_name} (drop_table=1)')
        return

    # initialize table history's last timestamp to first timestamp if not set yet
    if not table_history.last_timestamp:
        # default first timestamp to 1900-01-01 if project has no first timestamp
        if not table_object.first_timestamp:
            table_object.first_timestamp = '1900-01-01'
        table_history.last_timestamp = iso_to_datetime(
            table_object.first_timestamp)

    # skip table if last timestamp > current timestamp, eg. tables pre-configured for the future
    if table_history.last_timestamp > current_timestamp:
        explanation = f'first/last timestamp {table_history.last_timestamp} > current timestamp {current_timestamp}'
        logger.info(f'Skipping table: {table_name} ({explanation})')
        return

    # if we're here then we have a legit last timestamp value to use for CDC
    last_timestamp = table_history.last_timestamp

    # initialize table's last_sequence to first_sequence if not set yet
    if not table_history.last_sequence:
        if not table_object.first_sequence:
            table_object.first_sequence = 0
        table_history.last_sequence = table_object.first_sequence

    self.events.start(table_name, 'table')
    # logger.info(f'Processing {table_name} ...')

    # create a fresh cursor for each table
    cursor = db.conn.cursor()

    # save table object for stage
    table_file_name = f'{self.work_folder}/{table_name}.table'
    save_jsonpickle(table_file_name, table_object)

    # discover table schema
    table_schema = db_engine.select_table_schema(schema_name, table_name)

    # handle non-existent tables
    if table_schema is None:
        if table_object.optional_table:
            logger.info(
                f'Optional table not found; skipped ({table_name})')
        else:
            logger.warning(f'Table not found; skipped ({table_name})')
        return

    # remove ignored columns from table schema
    if table_object.ignore_columns:
        # find columns to ignore (remove) based on ignore column names/glob-style patterns
        ignore_columns = []
        for column_name in table_schema.columns:
            for pattern in split(table_object.ignore_columns):
                if is_glob_match(column_name, pattern):
                    ignore_columns.append(column_name)

        # delete ignored columns from our table schema
        for column_name in ignore_columns:
            logger.info(f'Ignore_column: {table_name}.{column_name}')
            table_schema.columns.pop(column_name)

    # save table schema for stage to use
    schema_table_name = f'{self.work_folder}/{table_name}.schema'
    save_jsonpickle(schema_table_name, table_schema)

    # save table pk for stage to use
    pk_columns = db_engine.select_table_pk(schema_name, table_name)
    if not pk_columns and table_object.primary_key:
        pk_columns = table_object.primary_key
    save_text(f'{self.work_folder}/{table_name}.pk', pk_columns)

    # normalize cdc setting
    table_object.cdc = table_object.cdc.lower()
    if table_object.cdc == 'none':
        table_object.cdc = ''

    # clear unknown cdc settings
    if table_object.cdc and table_object.cdc not in (
            'filehash', 'rowhash', 'rowversion', 'sequence', 'timestamp'):
        logger.warning(
            f'Warning: Unknown CDC setting; CDC setting cleared ({table_name}.cdc={table_object.cdc})'
        )
        table_object.cdc = ''

    # clear cdc setting when no pk_columns are present
    # NOTE: filehash cdc does not require pk_columns.
    if table_object.cdc and table_object.cdc != 'filehash' and not pk_columns:
        logger.warning(
            f'Warning: CDC enabled but no PK; CDC setting cleared ({table_name}.cdc={table_object.cdc})'
        )
        table_object.cdc = ''

    # if no cdc, then clear cdc related attributes
    if not table_object.cdc:
        table_object.filehash = ''
        table_object.rowhash = ''
        table_object.rowversion = ''
        table_object.sequence = ''
        table_object.timestamp = ''

    # update table object properties for cdc select build
    column_names = list(table_schema.columns.keys())
    table_object.schema_name = schema_name
    table_object.table_name = table_name
    table_object.column_names = column_names
    select_cdc = cdc_select.SelectCDC(db_engine, table_object)
    sql = select_cdc.select(self.job_id, current_timestamp, last_timestamp)

    # save generated SQL to work folder for documentation purposes
    sql_file_name = f'{self.work_folder}/{table_name}.sql'
    save_text(sql_file_name, sql)

    # run sql here vs via db_engine.capture_select
    # cursor = db_engine.capture_select(schema_name, table_name, column_names, last_timestamp, current_timestamp)
    cursor.execute(sql)

    # capture rows in fixed size batches to support unlimited size record counts
    # Note: Batching on capture side allows stage to insert multiple batches in parallel.
    if self.project.batch_size:
        batch_size = int(self.project.batch_size)
        # logger.info(f'Using project specific batch size: {self.project.batch_size}')
    else:
        batch_size = 250_000

    batch_number = 0
    row_count = 0
    data_size = 0
    while True:
        batch_number += 1
        rows = cursor.fetchmany(batch_size)
        if not rows:
            break

        logger.info(
            f'Table({table_name}): batch={batch_number} using batch size {batch_size:,}'
        )
        self.progress_message(
            f'extracting({table_name}.{batch_number:04}) ...')

        # flatten rows to list of column values
        json_rows = [list(row) for row in rows]
        output_file = f'{self.work_folder}/{table_name}#{batch_number:04}.json'
        save_jsonpickle(output_file, json_rows)

        # track metrics
        row_count += len(json_rows)
        data_size += file_size(output_file)

    # update table history with new last timestamp and sequence values
    table_history.last_timestamp = current_timestamp
    table_history.last_sequence = current_sequence

    # track total row count and file size across all of a table's batched json files
    self.events.stop(table_name, row_count, data_size)

    # save interim metrics for diagnostics
    self.events.save()
    self.job_row_count += row_count
    self.job_data_size += data_size

    # explicitly close cursor when finished
    # cursor.close()
    return
def main():
    fullset = common.load_data(FULLSET_PATH, sep=',')
    codes = get_codes(fullset)
    uncertain_mask = (codes == common.N_CLASS)
    uncertain_set = fullset[uncertain_mask]
    uncertain_features = common.onehot_encode(
        uncertain_set[:, common.N_DISASTER:], 0)

    trainset = common.load_data(TRAINSET_PATH, sep=',')
    trainset = common.onehot_encode(trainset, 0)

    prob_sum = np.zeros((uncertain_features.shape[0], common.N_CLASS))
    for i in range(N_MODEL):
        x_train, _, _, _ = common.split(trainset, i)
        _, normalized_features = common.normalize(x_train, uncertain_features)
        prob_sum += tf.keras.models.load_model(common.numbering(
            MODEL_PATH, i)).predict(normalized_features)
        print(i, ' is done.')
    probs = prob_sum / N_MODEL
    linenum_to_prob = {
        idx: prob
        for idx, prob in zip(np.nonzero(uncertain_mask)[0], probs)
    }

    # unpredicted map
    common.save_map(codes.reshape(common.N_ROWS, -1), UNPRED_OUTPUT_PATH)

    # predicted map
    counter = [0] * common.N_CLASS
    predicted_map = codes.copy()
    for i, (row, code) in enumerate(zip(fullset, codes)):
        if code == common.N_CLASS:
            predicted_map[i], order = check_and_decide(row[:common.N_DISASTER],
                                                       linenum_to_prob[i])
            counter[order] += 1
    common.save_map(predicted_map.reshape(common.N_ROWS, -1), PRED_OUTPUT_PATH)
    print(counter)

    # full_probs
    encoded_codes = get_encoded_codes(fullset)
    certain_mask = (codes < common.N_CLASS) & (codes >= 0)
    certain_set = codes[certain_mask]
    certain_probs = get_hunnit_prob_vecs(certain_set)
    linenum_to_certain_prob = {
        idx: prob
        for idx, prob in zip(np.nonzero(certain_mask)[0], certain_probs)
    }
    full_probs = []
    for i, code in enumerate(encoded_codes):
        if i in linenum_to_prob:
            _, order = check_and_decide(fullset[i][:common.N_DISASTER],
                                        linenum_to_prob[i])
            full_probs.append([code] + linenum_to_prob[i].tolist() + [order + 1])
        elif i in linenum_to_certain_prob:
            full_probs.append([code] + linenum_to_certain_prob[i].tolist() + [0])
        else:
            full_probs.append([0])

    cur_id = 0
    reversed_full_probs = []
    for row in np.flipud(np.array(full_probs).reshape(common.N_ROWS, -1)).reshape(-1):
        if len(row) == 1:
            continue
        reversed_full_probs.append([cur_id] + row)
        cur_id += 1
    common.save_data(reversed_full_probs, PROBS_OUTPUT_PATH)
PETSc.Sys.Print(variant, velocityspace, order, cell, coordorder, xbcs, nxs)

# create mesh and spaces
mesh = create_box_mesh(cell, nxs, xbcs, coordorder)
elemdict = create_complex(cell, velocityspace, variant, order)
adjust_coordinates(mesh, c)

l2 = FunctionSpace(mesh, elemdict['l2'])
hdiv = FunctionSpace(mesh, elemdict['hdiv'])
mixedspace = MixedFunctionSpace([l2, hdiv])

hhat, uhat = TestFunctions(mixedspace)
xhat = TestFunction(mixedspace)
x = Function(mixedspace, name='x')
h, u = split(x)

# set boundary conditions
fullbcs = [
    DirichletBC(mixedspace.sub(1), 0.0, "on_boundary"),
]
ubcs = [
    DirichletBC(hdiv, 0.0, "on_boundary"),
]
if cell in ['tpquad', 'tphex', 'tptri']:
    fullbcs.append(DirichletBC(mixedspace.sub(1), 0.0, "top"))
    fullbcs.append(DirichletBC(mixedspace.sub(1), 0.0, "bottom"))
    ubcs.append(DirichletBC(hdiv, 0.0, "top"))
    ubcs.append(DirichletBC(hdiv, 0.0, "bottom"))

# set rhs/soln