def backward(s, transitions, emissions):
    s_length = len(s)  # n: length of the sequence
    num_of_states = len(emissions)  # k: number of states

    b = np.zeros((num_of_states, s_length), dtype=float)

    # initialize the last position: b[i, n-1] = log(1) = 0 for every state
    for i in range(0, num_of_states):
        b[i, len(s) - 1] = math.log(1)  # the rightmost column of the (k x n) table

    for i in reversed(range(0, len(s) - 1)):
        for j in range(0, num_of_states):
            a_max = -math.inf  # sys.float_info.min is the smallest positive float, not the most negative value
            a_l = []
            for l in range(0, num_of_states):
                emission = emissions[l].get(s[i + 1])  # the emission probability depends on l, so it is looked up inside this inner loop

                curr = b[l, i + 1] + mylog(transitions[j, l]) + mylog(emission)
                if curr > a_max:
                    a_max = curr
                a_l.append(curr)

                # Regular
                # b[j, i] += b[l, i + 1] * transition * emission

            b[j, i] = 0
            for l in range(0, num_of_states):
                b_l = a_l[l] - a_max
                b[j, i] += myexp(b_l)

            b[j, i] = mylog(b[j, i]) + a_max

    return b.T
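The log-space helpers mylog and myexp are assumed but not shown in these examples; a minimal sketch of what they presumably do (a log that tolerates zero or missing probabilities by returning -inf, plus a thin exp wrapper) is:

import math

def mylog(x):
    # hypothetical helper: treat a missing emission (None) or zero probability as log(0) = -inf
    if x is None or x <= 0:
        return -math.inf
    return math.log(x)

def myexp(x):
    # math.exp(-inf) already returns 0.0, so a plain wrapper is enough
    return math.exp(x)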
Example #2
def main(srcdir, destdir):
    def _dest_name(fname):
        rx = re.search(r"(.+?)(\d+)\.(\w+)$", fname)
        if rx:
            rx = rx.groups()
            # we need to rename/renumber the file for its destpath
            fname = "{}-{}.{}".format(rx[0], rx[1].rjust(3, "0"), rx[2])
        return fname

    zipnames = sorted(list(srcdir.glob("*.zip")))
    myinfo(f"Found {len(zipnames)} zipfiles")

    for zn in zipnames:
        # we actually make a subdir for each zip file
        _zsub = re.search(r"(\w+)_\d{8}", zn.stem).groups()[0]
        zdir = destdir.joinpath(_zsub)
        zdir.mkdir(exist_ok=True, parents=True)

        mylog(zn, label="Unpacking")
        zfile = ZipFile(zn)
        zlist = sorted(zfile.filelist, key=lambda x: x.filename)
        for _z in zlist:
            zname = _z.filename
            fname = _dest_name(zname) if len(zfile.filelist) > 1 else zname
            destpath = zdir.joinpath(fname)
            destpath.write_bytes(zfile.read(zname))
            mylog(
                destpath.name,
                destpath.parent,
                f"{existed_size(destpath)} bytes",
                label="Extracted",
            )
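Roughly, for a hypothetical zip member name, _dest_name renumbers like this:

# "osha_inspection12.csv" -> "osha_inspection-012.csv"  (trailing digits zero-padded to width 3)
# "osha_inspection.csv"   -> returned unchanged          (the regex needs trailing digits to match)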
Example #3
def main(src_dir):
    def _get_data_paths(src_dir):
        paths = sorted(p for p in src_dir.rglob('*.csv')
                       if all(_sk not in p.name for _sk in SKIPPED_FILES))
        # return [p for p in paths if 'violation_event' in  p.name]
        return paths

    def _group_data_paths(paths):
        d = defaultdict(list)
        for p in paths:
            q = p.parent.stem
            d[q].append(p)
        return d

    TARGET_DB_PATH.parent.mkdir(exist_ok=True, parents=True)
    mylog(TARGET_DB_PATH, label="Connecting to")
    conn = connect_to_db(TARGET_DB_PATH)

    mylog("Creating tables")
    create_tables(conn, schema_path=CREATE_PATH)

    allpaths = _get_data_paths(src_dir)
    myinfo(f"{len(allpaths)} total files")
    gpaths = _group_data_paths(allpaths)
    myinfo(f"{len(gpaths.keys())} groups")

    for i, (gname, srcpaths) in enumerate(gpaths.items()):
        myinfo(f"#{i+1} Group {gname} has {len(srcpaths)} files")
        for j, path in enumerate(srcpaths):
            myinfo(f"File {j+1} of {len(srcpaths)} {path}")
            insert_from_csv(conn, path)

    conn.close()
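_group_data_paths keys each file on the stem of its parent directory, so a hypothetical layout collapses into one group per subdirectory:

# given:  src_dir/osha_violation/a.csv, src_dir/osha_violation/b.csv, src_dir/osha_inspection/c.csv
# result: {"osha_violation": [a.csv, b.csv], "osha_inspection": [c.csv]}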
Example #4
def main():
    TARGET_DB_PATH.parent.mkdir(parents=True, exist_ok=True)
    mylog(TARGET_DB_PATH, label="Connecting to")
    conn = connect_to_db(TARGET_DB_PATH)
    conn = load_custom_functions(conn)

    mylog("Creating tables")
    create_tables(conn, schema_path=CREATE_DB_PATH)
    inserts(conn)

    conn.close()
def fetch_catalog_urls():
    mylog(CATALOG_URL, label="Fetching catalog")
    resp = requests.post(CATALOG_URL, data={"agency": "osha"})
    soup = lxsoup(resp.text)
    urls = soup.xpath('//a[contains(@href, "csv.zip")]/@href')
    """
    each url will look like this:
        https://enfxfr.dol.gov/../data_catalog/OSHA/osha_accident_injury_20200727.csv.zip
    so we tidy it to:
        https://enfxfr.dol.gov/data_catalog/OSHA/osha_accident_injury_20200727.csv.zip
    """
    return [u.replace("../data_catalog", "data_catalog") for u in urls]
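The tidy-up is a plain string replacement; a quick check against the URL quoted in the docstring:

u = "https://enfxfr.dol.gov/../data_catalog/OSHA/osha_accident_injury_20200727.csv.zip"
assert u.replace("../data_catalog", "data_catalog") == (
    "https://enfxfr.dol.gov/data_catalog/OSHA/osha_accident_injury_20200727.csv.zip")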
Example #6
def main(stash_dir, target_dir):
    def _fix_header(fields):
        """
        we expect every actual data table to have a "load_dt" column as its last column,
            but some tables have load_date or ld_dt
        """
        header = fields.copy()
        if fields[-1] in (
                'load_date',
                'ld_dt',
        ):
            header[-1] = 'load_dt'
            myinfo(f"From {fields[-1]} to {header[-1]}", label='Fixed header')
        return header

    # def init_csv(seriesname):
    #     """seriesname is a expected to be a string corresponding to
    #        a subdir like 'osha_violations/'
    #     """

    data_dirs = sorted(d for d in stash_dir.iterdir() if d.is_dir())
    myinfo(f"{stash_dir}",
           f"{len(data_dirs)} data directories",
           label="Main stash dir")

    target_dir.mkdir(parents=True, exist_ok=True)

    for datadir in data_dirs:
        src_paths = sorted(datadir.glob('*.csv'))
        myinfo(f"{datadir}", f"{len(src_paths)} files", label="Stash subdir")

        targetpath = target_dir.joinpath(f'{datadir.name}.csv')
        targetfile = open(targetpath, 'w')
        target = csv.writer(targetfile)

        _rowcount = 0
        for series_idx, srcpath in enumerate(src_paths):
            mylog(
                f"{series_idx}. {srcpath.name} | {existed_size(srcpath)} bytes",
                label="Reading")
            with open(srcpath) as srcfile:
                src = csv.reader(srcfile)
                header = next(src)

                if series_idx == 0:
                    # first file in series, write header to target
                    xheader = _fix_header(header)
                    target.writerow(xheader)
                for row in src:
                    target.writerow(row)
                    _rowcount += 1
        myinfo(targetpath, f"{_rowcount} rows total (+ header)", label="Wrote")
        targetfile.close()
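As a quick illustration with hypothetical field names, _fix_header only renames the last column, and only for the two known variants:

# _fix_header(["activity_nr", "estab_name", "load_date"]) -> ["activity_nr", "estab_name", "load_dt"]
# _fix_header(["activity_nr", "estab_name", "load_dt"])   -> returned unchanged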
Example #7
def viterbi(s, transitions, emissions):
    s_length = len(s)  # n: length of the sequence
    num_of_states = len(emissions)  # k: number of states

    v = np.zeros((num_of_states, s_length), dtype=object)
    # each cell holds (score, index of the previous-column state the maximum came from)
    v[0, 0] = (math.log(1), -1)
    # initialize the first column v[i, 0]; -1 marks "no previous state" in the leftmost column
    for i in range(1, num_of_states):
        v[i, 0] = (mylog(emissions[0].get(s[0])), -1)

    for i in range(1, len(s)):
        for j in range(0, num_of_states):
            curr_max = -math.inf
            max_prev_state_index = -1
            emission = emissions[j].get(s[i])
            for l in range(0, num_of_states):
                score = mylog(emission) + float(v[l, i - 1][0]) + mylog(
                    transitions[l, j])

                if score > curr_max:
                    curr_max = score
                    max_prev_state_index = l
            v[j, i] = (curr_max, max_prev_state_index)

    last_column_max = -math.inf
    result = []
    # Find the max in the last column
    prev_index = -1
    for idx in range(num_of_states):
        if v[idx, len(s) - 1][0] > last_column_max:
            last_column_max = v[idx, len(s) - 1][0]
            prev_index = idx

    # Reconstructing
    for k in reversed(range(0, len(s))):
        result.append((s[k], prev_index, v[prev_index, k][0]))
        prev_index = v[prev_index, k][1]

    result.reverse()

    return result, last_column_max
def forward(s, transitions, emissions):
    s_length = len(s)  # n: length of the sequence
    num_of_states = len(emissions)  # k: number of states

    f = np.zeros((num_of_states, s_length), dtype=float)

    # initialize the first column f[i, 0]
    # Regular
    # f[0, 0] = 1
    f[0, 0] = math.log(1)
    for i in range(1, num_of_states):
        # Regular
        # f[i, 0] = 0
        f[i, 0] = mylog(0)

    for i in range(1, len(s)):
        for j in range(0, num_of_states):
            emission = emissions[j].get(s[i])
            a_max = -math.inf  # sys.float_info.min is the smallest positive float, not the most negative value
            a_l = []
            for l in range(0, num_of_states):
                curr = f[l, i - 1] + mylog(transitions[l, j])
                if curr > a_max:
                    a_max = curr
                a_l.append(curr)

                # Regular
                # f[j, i] += f[l, i - 1] * transitions[l, j] * emission

            f[j, i] = 0
            for l in range(0, num_of_states):
                b_l = a_l[l] - a_max
                f[j, i] += math.exp(b_l)

            f[j, i] = mylog(f[j, i]) + a_max + mylog(emission)

    # termination: log P(s) is the log-sum-exp of the last column, not the sum of the logs
    a_max = -math.inf
    for i in range(0, num_of_states):
        if f[i, len(s) - 1] > a_max:
            a_max = f[i, len(s) - 1]
    likelihood = 0.0
    for i in range(0, num_of_states):
        likelihood += math.exp(f[i, len(s) - 1] - a_max)
    likelihood = mylog(likelihood) + a_max
    print(f"forward likelihood is: {likelihood}")
    return f
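A minimal sketch of how these routines might be called, assuming the data layout the code expects: a sequence s, a k-by-k numpy array of transition probabilities indexed as transitions[from_state, to_state], and one {symbol: probability} dict per state:

import numpy as np

transitions = np.array([[0.9, 0.1],
                        [0.2, 0.8]])
emissions = [{"A": 0.7, "B": 0.3},   # state 0
             {"A": 0.1, "B": 0.9}]   # state 1
s = "AABBB"

path, best_score = viterbi(s, transitions, emissions)   # most likely state path with scores
f = forward(s, transitions, emissions)                  # k x n log-space forward table
b = backward(s, transitions, emissions)                 # n x k log-space backward table (transposed)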
Example #9
def inserts(connection):
    def _get_paths():
        return sorted(INSERTS_DIR.glob("*.sql"))

    cursor = connection.cursor()
    cursor.execute(f"ATTACH DATABASE '{SRC_DB_PATH}' AS src_db;")
    cursor.execute(f"ATTACH DATABASE '{TARGET_DB_PATH}' AS target_db;")

    srcpaths = _get_paths()
    myinfo(f"{len(srcpaths)} INSERT SQL scripts", label="File count")

    for i, insertpath in enumerate(srcpaths):
        tname = re.match(r"insert_(\w+)", insertpath.stem).groups()[0]
        targettbl = f"target_db.{tname}"
        mylog(f"#{i+1} {targettbl}", insertpath, label="Running insert")
        stmt = insertpath.read_text()
        cursor.execute(stmt)

        myinfo(f"{count_rows(cursor, targettbl)} rows in {targettbl}",
               label="Inserted")
def main(dbpath):
    mylog(dbpath, label="Connecting to")
    conn = connect_to_db(dbpath)
    colindexes = collate_indexes(conn)

    outs = csv.DictWriter(
        stdout, fieldnames=("table_name", "rowcount", "colgroup", "pct_total_count")
    )
    outs.writeheader()

    for tablename, colstrings in colindexes.items():
        total_rows = count_table_rows(conn, tablename)

        outs.writerow({"table_name": tablename, "rowcount": total_rows})

        for colstr in colstrings:
            d = {"table_name": tablename, "colgroup": colstr}
            d["rowcount"] = count_colgroup_rows(conn, tablename, colstr)
            d["pct_total_count"] = floor(100.0 * d["rowcount"] / total_rows)
            outs.writerow(d)

    conn.close()
def fetch_and_save(url, destpath):
    xb = existed_size(destpath)
    purl = Path(url)
    if xb:
        mylog(f"{destpath}", f"{xb} bytes", label="Exists")
        mylog(purl.name, purl.parent, label="Skipping")
    else:
        mylog(purl.name, purl.parent, label="Downloading")
        resp = fetch(url)
        destpath.parent.mkdir(exist_ok=True, parents=True)
        with open(destpath, "wb") as dest:
            for data in resp:
                dest.write(data)

        mylog(destpath, f"{existed_size(destpath)} bytes", label="Saved")
def create_tables(connection, schema_path):
    def _parse_statements(txt):
        """
        assumes each create statement is delimited by ';'
        Returns a dict, with table names as keys, create statements as values
        """
        d = {}
        create_stmts = [s.strip("\n ") for s in txt.split(CREATE_TABLE_DELIMITER)]
        create_stmts = [s for s in create_stmts if s]

        for t in create_stmts:
            stmt = t.strip()
            tbl = re.search(r'CREATE TABLE[^"]*?"([^"]+)" *\(', stmt).groups()[0]
            d[tbl] = stmt
        return d

    txt = schema_path.read_text()
    statements = _parse_statements(txt)

    myinfo(f"Read {len(statements.keys())} create table statements")

    for tbl, stmt in statements.items():
        mylog(f'CREATE TABLE "{tbl}"...')
        connection.cursor().execute(stmt)
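For reference, _parse_statements expects schema text where each statement is closed by the delimiter (';' per the docstring) and names its table in double quotes; a sketch with hypothetical table names:

# CREATE TABLE "violation" (...);
# CREATE TABLE "inspection" (...);
# -> {"violation": 'CREATE TABLE "violation" (...)', "inspection": 'CREATE TABLE "inspection" (...)'}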
Example #13
def index_table(connection):
    mylog(INDEXES_PATH, label="Indexing tables")
    stmt = INDEXES_PATH.read_text()
    connection.cursor().execute(stmt)
Example #14
def honestParty(pid, N, t, controlChannel, broadcast, receive, send, B=-1):
    # RequestChannel is called by the client and it is the client's duty to broadcast the tx it wants to include
    if B < 0:
        B = int(math.ceil(N * math.log(N)))
    transactionCache = []
    finishedTx = set()
    proposals = []
    receivedProposals = False
    commonSet = []
    locks = defaultdict(lambda: Queue(1))
    doneCombination = defaultdict(lambda: False)
    ENC_THRESHOLD = N - 2 * t
    global finishcount
    encPK, encSKs = getEncKeys()
    encCounter = defaultdict(lambda: {})
    includeTransactionChannel = Queue()

    def probe(i):
        # the doneCombination[i] check ensures the share combination runs only once per proposal
        if (len(encCounter[i]) >= ENC_THRESHOLD and receivedProposals
                and not locks[i].full() and not doneCombination[i]):
            oriM = encPK.combine_shares(
                deserializeEnc(proposals[i][:ENC_SERIALIZED_LENGTH]),
                dict(itertools.islice(encCounter[i].iteritems(),
                                      ENC_THRESHOLD)))
            doneCombination[i] = True
            locks[i].put(oriM)

    def listener():
        while True:
            sender, msgBundle = receive()
            if msgBundle[0] == 'O':
                encCounter[msgBundle[1]][sender] = msgBundle[2]
                probe(msgBundle[1])
            else:
                includeTransactionChannel.put(
                    (sender, msgBundle))  # redirect to includeTransaction

    Greenlet(listener).start()

    while True:
        op, msg = controlChannel.get()
        if op == "IncludeTransaction":
            if isinstance(msg, Transaction):
                # transactionCache.add(msg)
                transactionCache.append(msg)
            elif isinstance(msg, set):
                for tx in msg:
                    transactionCache.append(tx)
            elif isinstance(msg, list):
                transactionCache.extend(msg)
        elif op == "Halt":
            break
        elif op == "Msg":
            broadcast(eval(msg))  # now the msg is something we manually send
        mylog("timestampB (%d, %lf)" % (pid, time.time()), verboseLevel=-2)
        if len(transactionCache) < B:  # Let's wait for many transactions. : )
            time.sleep(0.5)
            print "Not enough transactions", len(transactionCache)
            continue

        oldest_B = transactionCache[:B]
        selected_B = random.sample(oldest_B, min(B / N, len(oldest_B)))
        print "[%d] proposing %d transactions" % (pid, min(
            B / N, len(oldest_B)))
        aesKey = random._urandom(32)  #
        encrypted_B = encrypt(aesKey, ''.join(selected_B))
        encryptedAESKey = encPK.encrypt(aesKey)
        proposal = serializeEnc(encryptedAESKey) + encrypted_B
        mylog("timestampIB (%d, %lf)" % (pid, time.time()), verboseLevel=-2)
        commonSet, proposals = includeTransaction(
            pid, N, t, proposal, broadcast, includeTransactionChannel.get,
            send)
        mylog("timestampIE (%d, %lf)" % (pid, time.time()), verboseLevel=-2)
        receivedProposals = True
        for i in range(N):
            probe(i)
        for i, c in enumerate(commonSet):  # stx is the same for every party
            if c:
                share = encSKs[pid].decrypt_share(
                    deserializeEnc(proposals[i][:ENC_SERIALIZED_LENGTH]))
                broadcast(('O', i, share))
        mylog("timestampIE2 (%d, %lf)" % (pid, time.time()), verboseLevel=-2)
        recoveredSyncedTxList = []

        def prepareTx(i):
            rec = locks[i].get()
            encodedTxSet = decrypt(rec, proposals[i][ENC_SERIALIZED_LENGTH:])
            assert len(encodedTxSet) % TR_SIZE == 0
            recoveredSyncedTx = [
                encodedTxSet[i:i + TR_SIZE]
                for i in range(0, len(encodedTxSet), TR_SIZE)
            ]
            recoveredSyncedTxList.append(recoveredSyncedTx)

        thList = []
        for i, c in enumerate(commonSet):  # stx is the same for every party
            if c:
                s = Greenlet(prepareTx, i)
                thList.append(s)
                s.start()
        gevent.joinall(thList)
        mylog("timestampE (%d, %lf)" % (pid, time.time()), verboseLevel=-2)
        for rtx in recoveredSyncedTxList:
            finishedTx.update(set(rtx))

        mylog("[%d] %d distinct tx synced and %d tx left in the pool." %
              (pid, len(finishedTx), len(transactionCache) - len(finishedTx)),
              verboseLevel=-2)
        lock.get()
        finishcount += 1
        lock.put(1)
        if finishcount >= N - t:  # convenient for local experiments
            sys.exit()
    mylog("[%d] Now halting..." % (pid))
Example #16
    def _init_model(self):
        # Placeholders for the input values
        self.X = Input(shape=(self.params['img_height'],
                              self.params['img_width'],
                              self.params['img_channels']))
        self.z = Input(shape=(self.params['z_dim'], ))
        self.c = Input(shape=(self.params['c_dim'], ))

        self.D = self.create_discriminator()
        self.E = self.create_encoder()
        self.G = self.create_generator()

        # Encoding of real image
        self.Z_hat = self.encode(self.X)
        # Fake image generated by G
        self.X_hat = self.generate(Concatenate()([self.z, self.c]))
        # Encoding of fake image
        self.Z_gen = self.encode(self.X_hat)

        # D prediction for real images
        D_enc = self.discriminate(self.X, self.Z_hat)
        # D prediction for generated images
        D_gen = self.discriminate(self.X_hat, Concatenate()([self.z, self.c]))

        # Get disentangled components of the encoding
        c_gen = self.Z_gen[:, self.params['z_dim']:]
        c_gen_cont = c_gen[:, self.params['num_disc_vars']:]
        c_cont = self.c[:, self.params['num_disc_vars']:]
        c_gen_cat = c_gen[:, :self.params['num_disc_vars']]
        c_cat = self.c[:, :self.params['num_disc_vars']]

        # Crossentropy in continuous variables
        cont_stddev_c_gen = K.ones_like(c_gen_cont)
        eps_c_gen = (c_cont - c_gen_cont) / (cont_stddev_c_gen + 1e-8)
        crossent_c_gen_cont = K.mean(
            -K.sum(0.5*np.log(2*np.pi) - mylog(cont_stddev_c_gen) \
            - 0.5*K.square(eps_c_gen), 1))
        # Crossentropy in categorical variables
        crossent_c_gen_cat = K.mean(-K.sum(mylog(c_gen_cat) * c_cat, 1))

        # Loss for Discriminator and Generator/Encoder
        D_loss = -K.mean(mylog(D_enc) + mylog(1 - D_gen))
        G_loss = -K.mean(mylog(D_gen) + mylog(1 - D_enc)) + \
            crossent_c_gen_cat + crossent_c_gen_cont

        # Collect the trainable weights
        weights_D = self.D.trainable_weights
        weights_GE = self.G.trainable_weights + self.E.trainable_weights

        training_updates_D = Adam(lr=self.params['lr_D'],
                                  beta_1=self.params['beta1_D'],
                                  decay=self.params['ld_D']).get_updates(
                                      weights_D, [], D_loss)
        training_updates_GE = Adam(lr=self.params['lr_G'],
                                   beta_1=self.params['beta1_G'],
                                   decay=self.params['ld_G']).get_updates(
                                       weights_GE, [], G_loss)

        self.train_D_fn = K.function(inputs=[self.X, self.z, self.c],
                                     outputs=[D_loss],
                                     updates=training_updates_D)
        self.train_GE_fn = K.function(inputs=[self.X, self.z, self.c],
                                      outputs=[G_loss],
                                      updates=training_updates_GE)
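The two K.function objects are callables over a list of input arrays; a minimal sketch of one training step, assuming batch arrays X_batch, z_batch, c_batch with matching shapes:

# d_loss, = self.train_D_fn([X_batch, z_batch, c_batch])    # one discriminator update, returns [D_loss]
# g_loss, = self.train_GE_fn([X_batch, z_batch, c_batch])   # one generator/encoder update, returns [G_loss]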
Example #17
def insert_from_csv(connection, src_path):
    NULL_CELL_COUNT = 0
    NULL_ROW_COUNT = 0

    def _convert_blank_to_null(iterdata):
        """
        every cell is expected to be a string
        """
        nonlocal NULL_CELL_COUNT
        nonlocal NULL_ROW_COUNT
        for row in iterdata:
            _row_nulled = False
            for i, v in enumerate(row):
                val = v.strip()
                if not val:
                    row[i] = None
                    NULL_CELL_COUNT += 1
                    if _row_nulled is False:
                        _row_nulled = True
                        NULL_ROW_COUNT += 1
                else:
                    row[i] = val
            yield row

    def _get_insert_statement(tablename, fields):
        fields_qstr = ', '.join(fields)
        vals_qstr = ', '.join('?' for f in fields)
        return f"INSERT INTO {tablename}({fields_qstr}) VALUES ({vals_qstr})"

    def _get_table_name(path):
        """
        path can be anything from:
            osha_violation.csv
            to: osha_violation-004.csv

        Returns:
            violation
        """
        mx = re.match(r'osha_(\w+)(?:-\d+)?', path.stem)
        return mx.groups()[0]

    # ---------------------------------------------------------
    mylog(src_path.name, src_path.parent, label="Reading")
    srcfile = src_path.open()
    records = csv.reader(srcfile)

    tablename = _get_table_name(src_path)
    fieldnames = next(records)
    mylog(tablename, label="Inserting into table")

    iq = _get_insert_statement(tablename, fieldnames)
    #    myinfo(iq, label="INSERT query")
    xrecords = _convert_blank_to_null(records)

    # cursor = connection.cursor()
    # cursor.executemany(iq, xrecords)
    # myinfo(f"{count_rows(cursor, tablename)} rows in table: {tablename}", label="Row count")

    with connection as conn:  # context helper provides MASSIVE speed boost
        cursor = conn.cursor()
        cursor.executemany(iq, xrecords)

        myinfo(f"{count_rows(cursor, tablename)} rows in table: {tablename}",
               label="Row count")

    myinfo(NULL_ROW_COUNT, label="Empty rows NULLED")
    myinfo(NULL_CELL_COUNT, label="Empty cells NULLED")

    srcfile.close()
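Roughly, with a hypothetical path and field list, the two helpers produce:

# _get_table_name(Path("osha_violation-004.csv"))  -> "violation"
# _get_insert_statement("violation", ["activity_nr", "standard"])
#   -> "INSERT INTO violation(activity_nr, standard) VALUES (?, ?)"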