Example #1
def test_split():
    L = [(i*2, i*2+1) for i in range(10)]
    iters = common.split(L, n=2)
    assert list(zip(*iters)) == L

    for i in [0, 1]:
        iters = common.split(L, n=2)
        next(iters[i])
        try:
            next(iters[i])
            assert False
        except IndexError as e:
            assert e.args == (i,)
Example #2
def match_condition(nk):
    """FUTURE: t.udp_nk = s.udp_nk"""
    output = []
    for nk_column in split(nk):
        source_nk_column = add_alias(nk_column, 's')
        target_nk_column = add_alias(nk_column, 't')
        output.append(f'{target_nk_column}={source_nk_column}')
    return ' and '.join(output)
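Several examples on this page (match_condition above, plus SQL/ETL helpers further down such as format_join, set_nk_value, timestamp_logic, and process_table) call split() on a comma- or space-delimited string of names and treat the result as a list of trimmed tokens, with an empty or missing string yielding an empty list. That helper is not shown here, so the following is only a minimal sketch of a compatible implementation, not the projects' actual code; the sample results are assumptions based on the call sites.

import re


def split(text, pattern=r'[,\s]+'):
    # Hypothetical helper: break a comma/space delimited string of names
    # into a list of non-empty, trimmed tokens; None or '' yields [].
    if not text:
        return []
    return [token for token in re.split(pattern, text.strip()) if token]


# Assumed behavior at the call sites on this page:
# split('full, left, right')       -> ['full', 'left', 'right']
# split('bye end exit quit stop')  -> ['bye', 'end', 'exit', 'quit', 'stop']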
Example #3
def modify_idx_label():
    df = get_df_1()

    # set the name of the row index
    df.index.set_names('Prov.', inplace=True)

    # modify a row index label
    df.rename(
        index={'SH': 'HLJ'},
        inplace=True)  # by default returns a new df if inplace is not set
    print(df)
    split()

    # modify column index label
    df.rename(columns={'b': 'B', 'c': 'cc'}, inplace=True)
    print(df)
    split()
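The pandas examples on this page (this one, #18-#21, #24, and #28) also call split() with no arguments. Judging from Example #28, which imports it from common and calls it between blocks of output, it appears to be a console separator rather than a string or data splitter. A minimal sketch under that assumption, not the module's actual code:

def split(width=60, char='-'):
    # Hypothetical separator: print a divider line between blocks of output.
    print(char * width)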
Example #4
def read_data_input(filename):
    with open(filename) as fi:
        lines = fi.readlines()
        chunks = list(split(lines, "\n"))
        rules = chunks[0]
        values = [c.strip() for c in chunks[1]]

        rule_dict = parse_rule_dict(rules)

        return rule_dict, values
Example #5
def read_passports(filename, required):
    with open(filename) as fi:
        passport_list = []
        lines = fi.readlines()
        for passport_info in split(lines, "\n"):
            if passport_info:
                passport_fields = parse_fields(passport_info, required)
                passport_list.append(Passport(passport_fields, required))

        return passport_list
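read_data_input and read_passports above, and read_input_data in Example #10, call split(lines, "\n") to group the lines returned by readlines() into chunks separated by blank lines. That chunking helper is not shown on this page; here is a minimal sketch of one consistent with these call sites (an assumption, not the original code):

def split(items, separator):
    # Hypothetical chunker: yield consecutive runs of items, treating items
    # equal to separator (e.g. the bare "\n" lines from readlines()) as
    # delimiters between chunks.
    chunk = []
    for item in items:
        if item == separator:
            if chunk:
                yield chunk
            chunk = []
        else:
            chunk.append(item)
    if chunk:
        yield chunk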
Example #6
def main():
	listener = ListenerFile('listener.txt')
	listener.clear()
	while True:
		time.sleep(1)
		command = listener.listen()
		if command in split('bye end exit quit stop'):
			logger.info(f'Exit command: {command}')
			listener.clear()
			break
Example #7
def ffnp(t, T):
    """Compute the ffnp reduction of t, using T as auxilary information.

    Args:
        t: a vector
        T: a ldl decomposition tree

    Format: coefficient
    """
    n = len(t[0])
    z = [None, None]
    if (n > 1):
        l10, T0, T1 = T
        z[1] = merge(ffnp(split(t[1]), T1))
        t0b = add(t[0], mul(sub(t[1], z[1]), l10))
        z[0] = merge(ffnp(split(t0b), T0))
        return z
    elif (n == 1):
        z[0] = [round(t[0][0])]
        z[1] = [round(t[1][0])]
        return z
Example #8
def format_join(text, schema_name):
    text = clean_sql(text)

    join_keywords = split(
        'full, left, right, inner, outer, cross, join, on, and, or, not')
    output = []
    last_token = ''
    for token in text.split():
        if token in join_keywords or not token[0].isalpha():
            output.append(token)
        else:
            if '..' in token:
                token = q(token.partition('..')[2])
            elif token.startswith('dbo.'):
                token = q(token[4:])
            elif '.' in token:
                alias_name, separator, table_name = token.partition('.')
                token = f'{q(alias_name)}.{q(table_name)}'
            else:
                token = q(token)

            # add schema name if last token ends with 'join' and token missing schema name
            if last_token.endswith('join') and '.' not in token:
                token = f'{q(schema_name)}.{token}'

            output.append(token)

        last_token = token

    text = ' '.join(output)

    # convert join keyword phrases to tokens
    for join_keyword_phrase in join_keyword_phrases:
        join_keyword_token = join_keyword_phrase.replace(' ', '::')
        text = text.replace(join_keyword_phrase, join_keyword_token)

    # format joins into 2-line clauses
    output = []
    for token in text.split():
        if token.endswith('join'):
            token = f'\n{spaces(2)}{token}'
        elif token == 'on':
            token = f'\n{spaces(4)}{token}'
        output.append(token + ' ')

    # expand join keyword tokens back to join keyword phrases
    text = ''.join(output)
    text = text.replace('::', ' ')

    return text
Example #9
def format_join(text, schema_name):
    text = clean_sql(text)

    join_keywords = split(
        "full, left, right, inner, outer, cross, join, on, and, or, not")
    output = []
    last_token = ""
    for token in text.split():
        if token in join_keywords or not token[0].isalpha():
            output.append(token)
        else:
            if ".." in token:
                token = q(token.partition("..")[2])
            elif token.startswith("dbo."):
                token = q(token[4:])
            elif "." in token:
                alias_name, separator, table_name = token.partition(".")
                token = f"{q(alias_name)}.{q(table_name)}"
            else:
                token = q(token)

            # add schema name if last token ends with 'join' and token missing schema name
            if last_token.endswith("join") and "." not in token:
                token = f"{q(schema_name)}.{token}"

            output.append(token)

        last_token = token

    text = " ".join(output)

    # convert join keyword phrases to tokens
    for join_keyword_phrase in join_keyword_phrases:
        join_keyword_token = join_keyword_phrase.replace(" ", "::")
        text = text.replace(join_keyword_phrase, join_keyword_token)

    # format joins into 2-line clauses
    output = []
    for token in text.split():
        if token.endswith("join"):
            token = f"\n{spaces(2)}{token}"
        elif token == "on":
            token = f"\n{spaces(4)}{token}"
        output.append(token + " ")

    # expand join keyword tokens back to join keyword phrases
    text = "".join(output)
    text = text.replace("::", " ")

    return text
Example #10
def read_input_data(filename):
    with open(filename) as fi:
        lines = fi.readlines()
        decks = []
        for chunk in split(lines, "\n"):
            deck = []
            header = chunk[0]
            assert header[0:6] == "Player"

            for value in chunk[1:]:
                deck.append(int(value.strip()))

            decks.append(deck)

        return RecursiveCombatGame(deque(decks[0]), deque(decks[1]))
Example #11
def main():
    trainset = common.load_data(TRAINSET_PATH, sep=',')
    trainset = common.onehot_encode(trainset, 0)

    for i in range(N_MODEL):
        x_train, x_test, y_train, y_test = common.split(trainset, i)
        x_train, x_test = common.normalize(x_train, x_test)

        model, history = train(x_train, y_train, N_EPOCH)
        model.evaluate(x_test, y_test)

        model.save(common.numbering(MODEL_PATH, i))
        save_history(history, common.numbering(HISTORY_PATH, i))

        print(i, ' is done.')
Example #12
    def __init__(self, schema_name, table_name, column_names):
        self.schema_name = schema_name
        self.table_name = table_name
        self.column_names = split(column_names)

        self.table_prefix = ""
        self.table_suffix = ""
        self.natural_key = ""
        self.cdc = ""
        self.timestamp = ""
        self.first_timestamp = ""
        self.rowversion = ""
        self.first_rowversion = ""
        self.select = ""
        self.where = ""
        self.ignore = ""
        self.order = ""
Example #13
def get_probs_for_uncertain(uncertainset):
    trainset = common.load_data(TRAINSET_PATH, sep=',')

    encoded_uncertainset = common.onehot_encode(
        uncertainset[:, common.N_DISASTER:], 0)
    encoded_trainset = common.onehot_encode(trainset, 0)

    prob_sums = np.zeros((len(uncertainset), common.N_CLASS))
    for i in range(N_MODEL):
        x_train, _, _, _ = common.split(encoded_trainset, i)
        _, normalized_uncertainset = common.normalize(x_train,
                                                      encoded_uncertainset)
        prob_sums += tf.keras.models.load_model(common.numbering(
            MODEL_PATH, i)).predict(normalized_uncertainset)
        print(f'{i} is done.')

    return prob_sums / N_MODEL
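Examples #11, #13, and #32 call common.split(dataset, i) and expect x_train, x_test, y_train, y_test back, with the split varying by model index i. The real implementation is not shown; the sketch below is only one plausible reading, assuming the dataset is a NumPy array whose last column holds the label and that i seeds the shuffle.

import numpy as np


def split(dataset, seed, test_ratio=0.2):
    # Hypothetical per-model train/test split: shuffle rows with a seed
    # derived from the model index, then separate features from the
    # (assumed) label column at the end.
    rng = np.random.default_rng(seed)
    indices = rng.permutation(len(dataset))
    n_test = int(len(dataset) * test_ratio)
    test_idx, train_idx = indices[:n_test], indices[n_test:]
    x_train, y_train = dataset[train_idx, :-1], dataset[train_idx, -1]
    x_test, y_test = dataset[test_idx, :-1], dataset[test_idx, -1]
    return x_train, x_test, y_train, y_test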
Example #14
    def timestamp_logic(self, current_timestamp, last_timestamp=None):
        timestamp_columns = add_aliases(split(self.table.timestamp))
        if not timestamp_columns:
            self.timestamp_value = f"'{current_timestamp:%Y-%m-%d %H:%M:%S}'"
            self.timestamp_where_condition = ''
        else:
            if len(timestamp_columns) == 1:
                timestamp_value = q(timestamp_columns[0])
            else:
                # build timestamp column values as ("created_at"), ("updated_at"), ("other_timestamp")
                timestamp_values = ', '.join([
                    f'({q(column_name)})' for column_name in timestamp_columns
                ])
                timestamp_value = f'(select max("v") from (values {timestamp_values}) as value("v"))'

            self.timestamp_value = timestamp_value
            self.timestamp_where_condition = expand(
                self.timestamp_where_template)
Example #15
def fft(f):
    """Compute the FFT of a polynomial mod (x ** n + 1).

    Args:
        f: a polynomial

    Format: input as coefficients, output as FFT
    """
    n = len(f)
    if (n > 2):
        f0, f1 = split(f)
        f0_fft = fft(f0)
        f1_fft = fft(f1)
        f_fft = merge_fft([f0_fft, f1_fft])
    elif (n == 2):
        f_fft = [0] * n
        f_fft[0] = f[0] + 1j * f[1]
        f_fft[1] = f[0] - 1j * f[1]
    return f_fft
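The recursive FFT/NTT examples (#15, #16, #22) and the ffnp example earlier on this page split a polynomial into its even- and odd-indexed coefficients and later interleave half-size results back together. The helpers themselves are not shown here; below is a short sketch of that split/merge pair as it is commonly written next to such FFT code (the FFT/NTT-domain counterparts merge_fft and merge_ntt are not covered):

def split(f):
    # Separate the even- and odd-indexed coefficients of a polynomial.
    return f[0::2], f[1::2]


def merge(f_list):
    # Interleave two half-size coefficient lists back into one polynomial.
    f0, f1 = f_list
    f = [0] * (2 * len(f0))
    f[0::2] = f0
    f[1::2] = f1
    return f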
Example #16
def ntt(f, q):
    """Compute the NTT of a polynomial.

    Args:
        f: a polynomial

    Format: input as coefficients, output as NTT
    """
    n = len(f)
    if (n > 2):
        f0, f1 = split(f)
        f0_ntt = ntt(f0, q)
        f1_ntt = ntt(f1, q)
        f_ntt = merge_ntt([f0_ntt, f1_ntt], q)
    elif (n == 2):
        f_ntt = [0] * n
        f_ntt[0] = (f[0] + sqr1[q] * f[1]) % q
        f_ntt[1] = (f[0] - sqr1[q] * f[1]) % q
    return f_ntt
Example #17
def demodulate(symbols, filters, freqs, sampler):
    streams = []
    symbol_list = []
    errors = {}

    def error_handler(received, decoded, freq):
        errors.setdefault(freq, []).append(received / decoded)

    generators = common.split(symbols, n=len(freqs))
    for freq, S in zip(freqs, generators):
        S = filters[freq](S)

        if pylab:
            equalized = []
            S = common.icapture(S, result=equalized)
            symbol_list.append(equalized)

        freq_handler = functools.partial(error_handler, freq=freq)
        bits = modem.qam.decode(S, freq_handler)  # list of bit tuples
        streams.append(bits)  # stream per frequency

    stats['symbol_list'] = symbol_list
    stats['rx_bits'] = 0
    stats['rx_start'] = time.time()

    log.info('Demodulation started')
    for i, block in enumerate(zip(*streams)):  # block per frequency
        for bits in block:
            stats['rx_bits'] = stats['rx_bits'] + len(bits)
            yield bits

        if i and i % config.baud == 0:
            mean_err = np.array([e for v in errors.values() for e in v])
            correction = np.mean(np.angle(mean_err)) / (2*np.pi)
            duration = time.time() - stats['rx_start']
            log.debug('%10.1f kB, realtime: %6.2f%%, sampling error: %+.3f%%',
                      stats['rx_bits'] / 8e3,
                      duration * 100.0 / (i*config.Tsym),
                      correction * 1e2)
            errors.clear()
            sampler.freq -= 0.01 * correction / config.Fc
            sampler.offset -= correction
Example #18
def select_cell():
    df = get_df_1()

    # select a cell by index labels via .at, which is faster than .loc
    a = df.at['JS', 'b']
    print('df["JS"]["b"] = %d' % a)
    split()

    # equivalent to .at, but slower
    a = df.loc['JS', 'b']
    print('df["JS"]["b"] = %d' % a)
    split()

    # select a cell by positions via .iat, which is faster than .iloc
    a = df.iat[3, 1]
    print('df[3][1] = %d' % a)
    split()

    # equivalent to .iat, but slower
    a = df.iloc[3, 1]
    print('df[3][1] = %d' % a)
    split()
Example #19
def sort():
    df = get_df_1()

    # sort by row index label
    df_new = df.sort_index()
    print(df_new)
    split()

    # sort by row index label in descending order
    df_new = df.sort_index(ascending=False)
    print(df_new)
    split()

    # sort by column index label in descending order
    df_new = df.sort_index(ascending=False, axis=1)
    print(df_new)
    split()

    # sort by column value in descending order
    df_new = df.sort_values('b', ascending=False)
    print(df_new)
    split()
Example #20
def aggr():
    df = get_df_2()

    # select from year 2015, filter out data before this
    df = df.loc['2015':]
    print(df)
    split()

    # sum rows by year
    df = df.resample('A').sum()  # 'A' means annually
    print(df)
    split()

    # add a column which denotes growth percentage
    df['growth'] = df.pct_change() * 100
    print(df)
    split()
Example #21
def handle_nan():
    df = get_df_1()

    # drop rows containing at least one NaN
    df_new = df.dropna(how='any')
    print(df_new)
    split()

    # drop rows where all values are NaN
    df_new = df.dropna(how='all')
    print(df_new)
    split()

    # fill NaN with a default value
    df_new = df.fillna(value=60)
    print(df_new)
    split()
Example #22
def fft(f):
    """
    Compute the FFT of a polynomial mod (x ** n + 1).

    Input:
    f           A polynomial

    Output:
    f_fft       The FFT of f

    Format:     Coefficient (Input)
                FFT (Output)
    """
    n = len(f)
    if (n > 2):
        f0, f1 = split(f)
        f0_fft = fft(f0)
        f1_fft = fft(f1)
        f_fft = merge_fft([f0_fft, f1_fft])
    elif (n == 2):
        f_fft = [0] * n
        f_fft[0] = f[0] + 1j * f[1]
        f_fft[1] = f[0] - 1j * f[1]
    return f_fft
Example #23
    def process_table(self, db, db_engine, schema_name, table_name,
                      table_object, table_history, current_timestamp):
        """Process a specific table."""

        # skip default table and ignored tables
        if table_name == 'default':
            return
        elif table_object.ignore_table:
            logger.info(f'Skipping table: {table_name} (ignore_table=1)')
            return
        elif table_object.drop_table:
            logger.info(f'Skipping table: {table_name} (drop_table=1)')
            return

        # initialize table history's last time stamp to first timestamp if not set yet
        if not table_history.last_timestamp:
            # default first timestamp to 1900-01-01 if project has no first timestamp
            if not table_object.first_timestamp:
                table_object.first_timestamp = '1900-01-01'
            table_history.last_timestamp = iso_to_datetime(
                table_object.first_timestamp)

        # skip table if last timestamp > current timestamp, e.g. tables pre-configured for the future
        if table_history.last_timestamp > current_timestamp:
            explanation = f'first/last timestamp {table_history.last_timestamp} > current timestamp {current_timestamp}'
            logger.info(f'Skipping table: {table_name} ({explanation})')
            return

        # if we're here then we have a legit last timestamp value to use for CDC
        last_timestamp = table_history.last_timestamp

        self.stats.start(table_name, 'table')
        # logger.info(f'Processing {table_name} ...')

        # create a fresh cursor for each table
        cursor = db.conn.cursor()

        # save table object for stage
        output_stream = open(f'{self.work_folder_name}/{table_name}.table',
                             'wb')
        pickle.dump(table_object, output_stream)
        output_stream.close()

        # discover table schema
        table_schema = db_engine.select_table_schema(schema_name, table_name)

        # remove ignored columns from table schema
        if table_object.ignore_columns:
            # find columns to ignore (remove) based on ignore column names/glob-style patterns
            ignore_columns = []
            for column_name in table_schema.columns:
                for pattern in split(table_object.ignore_columns):
                    # use fnmatch() to provide glob style matching
                    if fnmatch.fnmatch(column_name.lower(), pattern.lower()):
                        ignore_columns.append(column_name)

            # delete ignored columns from our table schema
            for column_name in ignore_columns:
                logger.info(f'Ignore_column: {table_name}.{column_name}')
                table_schema.columns.pop(column_name)

        # save table schema for stage to use
        output_stream = open(f'{self.work_folder_name}/{table_name}.schema',
                             'wb')
        pickle.dump(table_schema, output_stream)
        output_stream.close()

        # save table pk for stage to use
        pk_columns = db_engine.select_table_pk(schema_name, table_name)
        if not pk_columns and table_object.primary_key:
            pk_columns = table_object.primary_key
        output_stream = open(f'{self.work_folder_name}/{table_name}.pk', 'w')
        output_stream.write(pk_columns)
        output_stream.close()

        # clear cdc if it doesn't match timestamp/rowversion
        table_object.cdc = table_object.cdc.lower()
        if not table_object.cdc or table_object.cdc not in ('timestamp',
                                                            'rowversion'):
            table_object.cdc = ''

        # if no pk_columns, then clear table cdc
        if not pk_columns:
            if table_object.cdc and table_object.cdc != 'none':
                logger.info(
                    f'Warning: {table_name} cdc={table_object.cdc} but table has no pk column(s)'
                )
                table_object.cdc = 'none'

            # we still keep timestamp because it's required for filtering first_timestamp - current_timestamp
            # if table_object.timestamp:
            # 	logger.info(f'Warning: {table_name} timestamp={table_object.timestamp} but table has no pk column(s)')
            # 	table_object.timestamp = ''

        # update table object properties for cdc select build
        column_names = list(table_schema.columns.keys())
        table_object.schema_name = schema_name
        table_object.table_name = table_name
        table_object.column_names = column_names
        select_cdc = cdc_select.SelectCDC(table_object)
        sql = select_cdc.select(self.job_id, current_timestamp, last_timestamp)

        # logger.info(f'Capture SQL:\n{sql}\n')

        # run sql here vs via db_engine.capture_select
        # cursor = db_engine.capture_select(schema_name, table_name, column_names, last_timestamp, current_timestamp)
        cursor.execute(sql)

        # capture rows in fixed size batches to support unlimited size record counts
        # Note: Batching on capture side allows stage to insert multiple batches in parallel.

        if self.project.batch_size:
            batch_size = int(self.project.batch_size)
            # logger.info(f'Using project specific batch size: {self.project.batch_size}')
        else:
            batch_size = 1_000_000

        batch_number = 0
        row_count = 0
        file_size = 0
        while True:
            batch_number += 1
            rows = cursor.fetchmany(batch_size)
            if not rows:
                break

            logger.info(
                f'Table({table_name}): batch={batch_number} using batch size {batch_size:,}'
            )

            # flatten rows to list of column values
            json_rows = [list(row) for row in rows]
            output_file = f'{self.work_folder_name}/{table_name}#{batch_number:04}.json'
            with open(output_file, 'w') as output_stream:
                # indent=2 for debugging
                json.dump(json_rows,
                          output_stream,
                          indent=2,
                          default=json_serializer)

            # track stats
            row_count += len(json_rows)
            file_size += pathlib.Path(output_file).stat().st_size

        # if no cdc but order is set, hash the output files to see if they match the last file hash
        if (not table_object.cdc
                or table_object.cdc == 'none') and table_object.order:
            print(
                f'Checking {table_name} file hash based on cdc={table_object.cdc} and order={table_object.order}'
            )
            table_data_files = f'{self.work_folder_name}/{table_name}#*.json'
            current_filehash = hash_files(table_data_files)
            if table_history.last_filehash == current_filehash:
                # suppress this update
                print(
                    f'Table({table_name}): identical file hash, update suppressed'
                )
                logger.info(
                    f'Table({table_name}): identical file hash, update suppressed'
                )
                row_count = 0
                file_size = 0

                # delete exported json files
                delete_files(table_data_files)
            else:
                print(
                    f'Table({table_name}): {table_history.last_filehash} != {current_filehash}'
                )
                table_history.last_filehash = current_filehash

        # update table history with new last timestamp value
        table_history.last_timestamp = current_timestamp

        # track total row count and file size across all of a table's batched json files
        self.stats.stop(table_name, row_count, file_size)

        # save interim state of stats for diagnostics
        self.stats.save()

        self.job_row_count += row_count
        self.job_file_size += file_size

        # explicitly close cursor when finished
        # cursor.close()
        return
Example #24
def display():
    df = get_df_1()

    print(df.info())  # basic info
    split()

    print(df.shape)  # dimension as (n, m): rows = df.shape[0], columns = df.shape[1]
    split()

    print(df.index)  # row index info
    split()

    print(df.columns)  # column index info
    split()

    print(df.head(4))  # first 4 rows
    split()

    print(df.tail())  # last 5 rows
    split()

    print(df.describe())  # basic stats
    split()

    print(df.values)  # all values
    split()
Example #25
# Let's look at the error for this image "/mnt/VegaSlowDataDisk/c3po/Images_aquises/DonneesPI/timeLapsePhotos_Pi1_4/image_2019-06-15_04-16-45.jpg"
# It does find the fp (false positives)
#ds_to_analyze=["image_2019-06-15_04-16-45.jpg"]

path_model_saved = "/mnt/BigFast/VegaFastExtension/Rpackages/c3po_all/c3po_interface_mark/Materiels/Models/Yolo_models/"
neurone = "training_jeux_difficile_updated"
neurone = "training_jeux_difficile"
#neurone="generateur_sans_flip_2000"

string = path_model_saved + neurone

imagettes = pd.read_csv(
    "/mnt/BigFast/VegaFastExtension/Rpackages/c3po_all/c3po/Images_aquises/imagettes.csv"
)
imagettes = common.to_reference_labels(imagettes, "classe")
index_train, index_test = common.split(imagettes)

#Choose  index_test or index_train
index = index_test
#images, labels, labels2=common.read_imagettes(imagettes[imagettes["filename"].isin(index)])
images, labels, labels2 = common.read_imagettes(
    imagettes[imagettes["filename"].isin(index)])
images = np.array(images, dtype=np.float32) / 255
labels = np.array(labels, dtype=np.float32)

dataset = tf.data.Dataset.from_tensor_slices(
    (images, labels)).batch(config.batch_size)

Model = model.model(config.nbr_classes, config.nbr_boxes, config.cellule_y,
                    config.cellule_x)
checkpoint = tf.train.Checkpoint(model=Model)
Example #26
def iterate_null_columns(configs, dbs, workbook, worksheet, h_index=0, v_index=0):
    config_index = 0
    db_index = 0

    # Workbook Formats
    header_format = workbook.add_format({'bold': True, 'underline': True})
    sub_header_format = workbook.add_format({'bold': True})
    percent_format = workbook.add_format({'num_format': '0.00%'})

    # Loop through all configs and all dbs. config[1] is correlated with dbs[1] etc.
    while config_index < len(configs):
        worksheet.write(h_index, v_index, configs[config_index]('namespace').dataset, header_format)
        worksheet.write(h_index, v_index + 1, 'Null Count', header_format)
        worksheet.write(h_index, v_index + 2, 'UDP Null Count', header_format)
        worksheet.write(h_index, v_index + 3, 'Row Count Difference', header_format)
        worksheet.write(h_index, v_index + 4, 'Row Count % Difference', header_format)
        h_index += 1

        # Set up UDP variables
        target_schema = configs[config_index]('namespace').dataset
        target_db = dbs[len(dbs) - 1]

        # Loop through all tables in configs
        # add logic to ignore tables where ignore_flag = 1
        for table in (t for t in configs[config_index].sections if 'table:' in t):
            table_name = table.partition(':')[2]
            src_null_columns = dbs[db_index].select_nullable_columns(dbs[db_index].schema, table_name)
            worksheet.write(h_index, v_index, table_name, sub_header_format)

            # remove ignored columns from src_null_columns list
            ignore_columns = split(configs[config_index](table).ignore_columns)

            # ToDo: Switch this to support Glob Pattern (ex: first_*)
            if ignore_columns:
                for column in ignore_columns:
                    if column.lower().strip() in src_null_columns:
                        src_null_columns.remove(column.lower().strip())

            # Gap between tables
            h_index += 1

            # Loop through all nullable columns
            for null_column in src_null_columns:
                # ignore_flag = configs[config_index](table).ignore_columns
                worksheet.write(h_index, v_index, null_column)

                src_null_count = dbs[db_index].select_null_count(dbs[db_index].schema, table_name, null_column)
                target_null_count = target_db.select_null_count(target_schema, table_name, null_column)
                # write source null count
                worksheet.write(h_index, v_index + 1, src_null_count)
                if target_null_count is None:
                    worksheet.write(h_index, v_index + 2, 'Column Not Found')
                else:
                    # write udp null count
                    worksheet.write(h_index, v_index + 2, target_null_count)
                    # write excel functions
                    worksheet.write_formula(h_index, v_index + 3, f'=B{h_index+1} - C{h_index+1}')
                    worksheet.write_formula(h_index, v_index + 4,
                                            f'=IF(AND(B{h_index+1}=0,'f'C{h_index+1}=0),"100",'f'C{h_index+1}/B{h_index+1} *100)&"%"',
                                            percent_format)
                h_index += 1

        h_index += 1
        db_index += 1
        config_index += 1
Example #27
def set_nk_value(nk):
    nk_columns = split(nk)
    nk_column_names = ', '.join(add_aliases(nk_columns, 't'))
    return f"concat_ws(':', {nk_column_names})"
Example #28
import pandas as pd
from pandas import DataFrame
import numpy as np

import advanced
import basic
from common import split

split()

# basic.run()
advanced.run()
Example #29
def amp_out(time):
    """
    A double-decay diminuendo.
    """
    dest, times, curves = split(amp_in(time), 3)
    return interleave(dest[-1::-1], [0.] + times[-1:0:-1], curves)
Example #30
def iterate_column_min_max(configs, dbs, workbook, worksheet, h_index=0, v_index=0):
    config_index = 0
    db_index = 0

    # Workbook Formats
    header_format = workbook.add_format({'bold': True, 'underline': True})
    sub_header_format = workbook.add_format({'bold': True})
    percent_format = workbook.add_format({'num_format': '0.00%'})

    # Loop through all configs and all dbs. config[1] is correlated with dbs[1] etc.
    while config_index < len(configs):
        worksheet.write(h_index, v_index, configs[config_index]('namespace').dataset, header_format)
        worksheet.write(h_index, v_index + 1, 'Source Column Min Length', header_format)
        worksheet.write(h_index, v_index + 2, 'Target Column Min Length', header_format)
        worksheet.write(h_index, v_index + 3, 'Source Column Max Length', header_format)
        worksheet.write(h_index, v_index + 4, 'Target Column Max Length', header_format)
        h_index += 1

        # Set up UDP variables
        target_schema = configs[config_index]('namespace').dataset
        target_db = dbs[len(dbs) - 1]

        # Loop through all tables in configs
        # add logic to ignore tables where ignore_flag = 1
        for table in (t for t in configs[config_index].sections if 'table:' in t):
            table_name = table.partition(':')[2]
            src_columns = dbs[db_index].select_columns_with_datatype(dbs[db_index].schema, table_name)

            # Lower all fields in src_columns
            # src_columns = [column.column.lower() for column in src_columns]

            # Write table name
            worksheet.write(h_index, v_index, table_name, sub_header_format)

            # remove ignored columns from src_null_columns list
            # ToDo: Use the split() method from Common.py instead
            ignore_columns = split(configs[config_index](table).ignore_columns)

            # MMG: be "truthy"
            #if ignore_columns[0] == '':
            if not ignore_columns:
                final_src_columns = src_columns
            else:
                final_src_columns = list()
                ignore_columns = [column.lower() for column in ignore_columns]
                for column_desc in src_columns:
                    if column_desc.column.lower() not in ignore_columns:
                        final_src_columns.append(column_desc)

            # Increment each column
            h_index += 1
            # print(final_src_columns)
            # Loop through all columns
            for column in final_src_columns:
                if column.data_type in ('char', 'nchar', 'nvarchar', 'varchar'):
                    worksheet.write(h_index, v_index, column.column)

                    src_column_min_max = dbs[db_index].select_min_max_len(dbs[db_index].schema, table_name,
                                                                          column.column)
                    target_column_min_max = target_db.select_min_max_len(target_schema, table_name, column.column)

                    worksheet.write(h_index, v_index + 1, src_column_min_max.min)
                    worksheet.write(h_index, v_index + 2, target_column_min_max.min)
                    worksheet.write(h_index, v_index + 3, src_column_min_max.max)
                    worksheet.write(h_index, v_index + 4, target_column_min_max.max)

                    h_index += 1

                else:
                    worksheet.write(h_index, v_index, column.column)
                    if column.data_type == 'bit':
                        src_column_min_max = dbs[db_index].select_min_max_len_cast(dbs[db_index].schema, table_name,
                                                                                   column.column)
                        target_column_min_max = target_db.select_min_max_len_cast(target_schema, table_name,
                                                                                  column.column)
                    else:
                        src_column_min_max = dbs[db_index].select_min_max(dbs[db_index].schema, table_name,
                                                                          column.column)
                        target_column_min_max = target_db.select_min_max(target_schema, table_name, column.column)

                    if column.data_type in ('date', 'datetime', 'datetime2', 'datetime3', 'smalldatetime'):
                        if src_column_min_max.min is None:
                            # This means the table is empty.
                            pass
                        else:
                            worksheet.write_datetime(h_index, v_index + 1, src_column_min_max.min)
                            worksheet.write_datetime(h_index, v_index + 2, target_column_min_max.min)
                            worksheet.write_datetime(h_index, v_index + 3, src_column_min_max.max)
                            worksheet.write_datetime(h_index, v_index + 4, target_column_min_max.max)
                    else:
                        worksheet.write(h_index, v_index + 1, src_column_min_max.min)
                        worksheet.write(h_index, v_index + 2, target_column_min_max.min)
                        worksheet.write(h_index, v_index + 3, src_column_min_max.max)
                        worksheet.write(h_index, v_index + 4, target_column_min_max.max)

                    h_index += 1

        h_index += 1
        db_index += 1
        config_index += 1
Example #31
    def process_table(self,
                      db,
                      db_engine,
                      schema_name,
                      table_name,
                      table_object,
                      table_history,
                      current_timestamp,
                      current_sequence=0):
        """Process a specific table."""

        # skip default table and ignored tables
        if table_name == 'default':
            return

        # TODO: Allow ignore and drop table conditions to be passed to archive (log table state) and stage (to drop table and table references)
        elif table_object.ignore_table:
            logger.info(f'Skipping table: {table_name} (ignore_table=1)')
            return
        elif table_object.drop_table:
            logger.info(f'Skipping table: {table_name} (drop_table=1)')
            return

        # initialize table history's last time stamp to first timestamp if not set yet
        if not table_history.last_timestamp:
            # default first timestamp to 1900-01-01 if project has no first timestamp
            if not table_object.first_timestamp:
                table_object.first_timestamp = '1900-01-01'
            table_history.last_timestamp = iso_to_datetime(
                table_object.first_timestamp)

        # skip table if last timestamp > current timestamp, e.g. tables pre-configured for the future
        if table_history.last_timestamp > current_timestamp:
            explanation = f'first/last timestamp {table_history.last_timestamp} > current timestamp {current_timestamp}'
            logger.info(f'Skipping table: {table_name} ({explanation})')
            return

        # if we're here then we have a legit last timestamp value to use for CDC
        last_timestamp = table_history.last_timestamp

        # initialize table's last_sequence to first_sequence if not set yet
        if not table_history.last_sequence:
            if not table_object.first_sequence:
                table_object.first_sequence = 0
            table_history.last_sequence = table_object.first_sequence

        self.events.start(table_name, 'table')
        # logger.info(f'Processing {table_name} ...')

        # create a fresh cursor for each table
        cursor = db.conn.cursor()

        # save table object for stage
        table_file_name = f'{self.work_folder}/{table_name}.table'
        save_jsonpickle(table_file_name, table_object)

        # discover table schema
        table_schema = db_engine.select_table_schema(schema_name, table_name)

        # handle non-existent tables
        if table_schema is None:
            if table_object.optional_table:
                logger.info(
                    f'Optional table not found; skipped ({table_name})')
            else:
                logger.warning(f'Table not found; skipped ({table_name})')
            return

        # remove ignored columns from table schema
        if table_object.ignore_columns:
            # find columns to ignore (remove) based on ignore column names/glob-style patterns
            ignore_columns = []
            for column_name in table_schema.columns:
                for pattern in split(table_object.ignore_columns):
                    if is_glob_match(column_name, pattern):
                        ignore_columns.append(column_name)

            # delete ignored columns from our table schema
            for column_name in ignore_columns:
                logger.info(f'Ignore_column: {table_name}.{column_name}')
                table_schema.columns.pop(column_name)

        # save table schema for stage to use
        schema_table_name = f'{self.work_folder}/{table_name}.schema'
        save_jsonpickle(schema_table_name, table_schema)

        # save table pk for stage to use
        pk_columns = db_engine.select_table_pk(schema_name, table_name)
        if not pk_columns and table_object.primary_key:
            pk_columns = table_object.primary_key
        save_text(f'{self.work_folder}/{table_name}.pk', pk_columns)

        # normalize cdc setting
        table_object.cdc = table_object.cdc.lower()
        if table_object.cdc == 'none':
            table_object.cdc = ''

        # clear unknown cdc settings
        if table_object.cdc and table_object.cdc not in (
                'filehash', 'rowhash', 'rowversion', 'sequence', 'timestamp'):
            logger.warning(
                f'Warning: Unknown CDC setting; CDC setting cleared ({table_name}.cdc={table_object.cdc})'
            )
            table_object.cdc = ''

        # clear cdc setting when no pk_columns are present
        # NOTE: filehash cdc does not require pk_columns.
        if table_object.cdc and table_object.cdc != 'filehash' and not pk_columns:
            logger.warning(
                f'Warning: CDC enabled but no PK; CDC setting cleared ({table_name}.cdc={table_object.cdc})'
            )
            table_object.cdc = ''

        # if no cdc, then clear cdc related attributes
        if not table_object.cdc:
            table_object.filehash = ''
            table_object.rowhash = ''
            table_object.rowversion = ''
            table_object.sequence = ''
            table_object.timestamp = ''

        # update table object properties for cdc select build
        column_names = list(table_schema.columns.keys())
        table_object.schema_name = schema_name
        table_object.table_name = table_name
        table_object.column_names = column_names
        select_cdc = cdc_select.SelectCDC(db_engine, table_object)
        sql = select_cdc.select(self.job_id, current_timestamp, last_timestamp)

        # save generated SQL to work folder for documentation purposes
        sql_file_name = f'{self.work_folder}/{table_name}.sql'
        save_text(sql_file_name, sql)

        # run sql here vs via db_engine.capture_select
        # cursor = db_engine.capture_select(schema_name, table_name, column_names, last_timestamp, current_timestamp)
        cursor.execute(sql)

        # capture rows in fixed size batches to support unlimited size record counts
        # Note: Batching on capture side allows stage to insert multiple batches in parallel.

        if self.project.batch_size:
            batch_size = int(self.project.batch_size)
            # logger.info(f'Using project specific batch size: {self.project.batch_size}')
        else:
            batch_size = 250_000

        batch_number = 0
        row_count = 0
        data_size = 0
        while True:
            batch_number += 1
            rows = cursor.fetchmany(batch_size)
            if not rows:
                break

            logger.info(
                f'Table({table_name}): batch={batch_number} using batch size {batch_size:,}'
            )
            self.progress_message(
                f'extracting({table_name}.{batch_number:04}) ...')

            # flatten rows to list of column values
            json_rows = [list(row) for row in rows]
            output_file = f'{self.work_folder}/{table_name}#{batch_number:04}.json'
            save_jsonpickle(output_file, json_rows)

            # track metrics
            row_count += len(json_rows)
            data_size += file_size(output_file)

        # update table history with new last timestamp and sequence values
        table_history.last_timestamp = current_timestamp
        table_history.last_sequence = current_sequence

        # track total row count and file size across all of a table's batched json files
        self.events.stop(table_name, row_count, data_size)

        # save interim metrics for diagnostics
        self.events.save()

        self.job_row_count += row_count
        self.job_data_size += data_size

        # explicitly close cursor when finished
        # cursor.close()
        return
Example #32
def main():
    fullset = common.load_data(FULLSET_PATH, sep=',')
    codes = get_codes(fullset)
    uncertain_mask = (codes == common.N_CLASS)

    uncertain_set = fullset[uncertain_mask]
    uncertain_features = common.onehot_encode(
        uncertain_set[:, common.N_DISASTER:], 0)

    trainset = common.load_data(TRAINSET_PATH, sep=',')
    trainset = common.onehot_encode(trainset, 0)

    prob_sum = np.zeros((uncertain_features.shape[0], common.N_CLASS))
    for i in range(N_MODEL):
        x_train, _, _, _ = common.split(trainset, i)
        _, normalized_features = common.normalize(x_train, uncertain_features)
        prob_sum += tf.keras.models.load_model(common.numbering(
            MODEL_PATH, i)).predict(normalized_features)
        print(i, ' is done.')
    probs = prob_sum / N_MODEL
    linenum_to_prob = {
        idx: prob
        for idx, prob in zip(np.nonzero(uncertain_mask)[0], probs)
    }

    # unpredicted map
    common.save_map(codes.reshape(common.N_ROWS, -1), UNPRED_OUTPUT_PATH)

    # predicted map
    counter = [0] * common.N_CLASS
    predicted_map = codes.copy()
    for i, (row, code) in enumerate(zip(fullset, codes)):
        if code == common.N_CLASS:
            predicted_map[i], order = check_and_decide(row[:common.N_DISASTER],
                                                       linenum_to_prob[i])
            counter[order] += 1
    common.save_map(predicted_map.reshape(common.N_ROWS, -1), PRED_OUTPUT_PATH)
    print(counter)

    # full_probs
    encoded_codes = get_encoded_codes(fullset)
    certain_mask = (codes < common.N_CLASS) & (codes >= 0)

    certain_set = codes[certain_mask]
    certain_probs = get_hunnit_prob_vecs(certain_set)
    linenum_to_certain_prob = {
        idx: prob
        for idx, prob in zip(np.nonzero(certain_mask)[0], certain_probs)
    }

    full_probs = []
    for i, code in enumerate(encoded_codes):
        if i in linenum_to_prob:
            _, order = check_and_decide(fullset[i][:common.N_DISASTER],
                                        linenum_to_prob[i])
            full_probs.append([code] + linenum_to_prob[i].tolist() +
                              [order + 1])
        elif i in linenum_to_certain_prob:
            full_probs.append([code] + linenum_to_certain_prob[i].tolist() +
                              [0])
        else:
            full_probs.append([0])

    cur_id = 0
    reversed_full_probs = []
    for row in np.flipud(np.array(full_probs).reshape(common.N_ROWS,
                                                      -1)).reshape(-1):
        if len(row) == 1:
            continue
        reversed_full_probs.append([cur_id] + row)
        cur_id += 1
    common.save_data(reversed_full_probs, PROBS_OUTPUT_PATH)
Example #33
PETSc.Sys.Print(variant, velocityspace, order, cell, coordorder, xbcs, nxs)

# create mesh and spaces
mesh = create_box_mesh(cell, nxs, xbcs, coordorder)
elemdict = create_complex(cell, velocityspace, variant, order)
adjust_coordinates(mesh, c)

l2 = FunctionSpace(mesh, elemdict['l2'])
hdiv = FunctionSpace(mesh, elemdict['hdiv'])

mixedspace = MixedFunctionSpace([l2, hdiv])
hhat, uhat = TestFunctions(mixedspace)
xhat = TestFunction(mixedspace)
x = Function(mixedspace, name='x')
h, u = split(x)

# set boundary conditions
fullbcs = [
    DirichletBC(mixedspace.sub(1), 0.0, "on_boundary"),
]
ubcs = [
    DirichletBC(hdiv, 0.0, "on_boundary"),
]
if cell in ['tpquad', 'tphex', 'tptri']:
    fullbcs.append(DirichletBC(mixedspace.sub(1), 0.0, "top"))
    fullbcs.append(DirichletBC(mixedspace.sub(1), 0.0, "bottom"))
    ubcs.append(DirichletBC(hdiv, 0.0, "top"))
    ubcs.append(DirichletBC(hdiv, 0.0, "bottom"))

# set rhs/soln