Example #1
0
    def __init__(self, name, cmd_config=None, yield_single=False):
        """Collect the input SQLite files and count the rows to process.

        Args:
            name: Unused by this constructor.
            cmd_config: Optional mapping of command options.  When it holds
                a non-empty "command_whitelist" (a single token or a list
                of tokens), only files whose path contains at least one
                token are kept.
            yield_single: When True, return one item at a time rather than
                chunks like (table_name, f_sql).

        Sets ``yield_single``, ``total_items`` (rows summed over every
        file / target-column pair, used for the progress bar), ``F_SQL``
        and ``config`` on the instance.
        """
        self.yield_single = yield_single

        score_config = simple_config.load("parse")
        input_data_dir = score_config["output_data_directory"]

        F_SQL = sorted(glob.glob(os.path.join(input_data_dir, '*')))

        # If there is a whitelist only keep the matching filenames.
        try:
            raw_whitelist = cmd_config["command_whitelist"]
        except (KeyError, TypeError):
            # cmd_config is None (or not subscriptable), or has no entry.
            raw_whitelist = None

        if raw_whitelist is None:
            whitelist = []
        elif isinstance(raw_whitelist, (list, tuple)):
            whitelist = [token.strip() for token in raw_whitelist]
        else:
            # A single token supplied as a bare string.
            whitelist = [raw_whitelist.strip()]

        # Drop blank tokens: "" is a substring of every path and would
        # silently turn the filter into a no-op.
        whitelist = [token for token in whitelist if token]

        if whitelist:
            # Filtering the sorted list keeps the file order deterministic.
            F_SQL = [
                f_sql for f_sql in F_SQL
                if any(token in f_sql for token in whitelist)
            ]

        # NOTE(review): `config` is not defined inside this method; it is
        # presumably a module-level name -- confirm.
        DB_ITR = itertools.product(F_SQL, config["target_columns"])

        # Get database sizes for progress bar
        self.total_items = 0
        for f_sql, target_col in DB_ITR:
            conn = sqlite3.connect(f_sql, check_same_thread=False)
            try:
                self.total_items += count_rows(conn, target_col)
            finally:
                # Release the handle even when counting fails.
                conn.close()

        self.F_SQL = F_SQL
        self.config = config
Example #2
0
    # Pair every input database file with every configured target column.
    DB_ITR = itertools.product(F_SQL, config["target_columns"])

    for f_sql, target_col in DB_ITR:

        # The output database sits in output_dir under the input file's name.
        f_sql_out = os.path.join(output_dir, os.path.basename(f_sql))
        mkdir(output_dir)
        conn_out  = sqlite3.connect(f_sql_out)
        conn = sqlite3.connect(f_sql, check_same_thread=False)

        tables = list_tables(conn_out)

        # The output database already has a table named after the target
        # column.
        if target_col in tables:

            if not _FORCE:

                # NOTE(review): `import_column` is not defined in the visible
                # code; presumably the source column being read -- confirm.
                row_n_conn = count_rows(conn, import_column)
                row_n_conn_out = count_rows(conn_out, target_col)

                # Equal row counts: report it and move on to the next pair.
                if row_n_conn == row_n_conn_out:
                    msg = "{}:{} already exists, skipping"
                    print msg.format(f_sql,target_col)
                    # NOTE(review): neither connection is closed on this
                    # path in the visible code -- confirm.
                    continue

                msg = "{} already exists but there is a size mismatch {} to {}"
                print msg.format(target_col, row_n_conn, row_n_conn_out)

            # Drop the existing table; reached when _FORCE is set or the
            # row counts differ.
            # NOTE(review): the table name is interpolated into the SQL
            # text -- confirm target_col only comes from trusted config.
            print "Removing table {}:{}".format(f_sql,target_col)
            conn_out.execute("DROP TABLE {}".format(target_col))