# These imports belong at the top of the module; simple_config, count_rows,
# and the config dict are project-level names assumed to be in scope.
import glob
import itertools
import os
import sqlite3

def __init__(self, name, cmd_config=None, yield_single=False):
    # yield_single returns one item at a time,
    # not in chunks like (table_name, f_sql)
    self.yield_single = yield_single

    score_config = simple_config.load("parse")
    input_data_dir = score_config["output_data_directory"]
    F_SQL = sorted(glob.glob(os.path.join(input_data_dir, '*')))

    # If there is a whitelist, only keep the matching filenames.
    # The whitelist is expected to be a list of filename tokens, so guard
    # against a missing cmd_config or key instead of using a bare except.
    try:
        whitelist = cmd_config["command_whitelist"]
    except (TypeError, KeyError):
        whitelist = None

    if whitelist:
        assert isinstance(whitelist, list)

        F_SQL2 = set()
        for f_sql in F_SQL:
            for token in whitelist:
                if token in f_sql:
                    F_SQL2.add(f_sql)

        # Sort again so the iteration order stays deterministic
        F_SQL = sorted(F_SQL2)

    # Randomizing the order of the input files is not needed for scoring:
    # F_SQL = random.sample(sorted(F_SQL), len(F_SQL))

    DB_ITR = itertools.product(F_SQL, config["target_columns"])

    # Get database sizes for the progress bar
    self.total_items = 0
    for f_sql, target_col in DB_ITR:
        conn = sqlite3.connect(f_sql, check_same_thread=False)
        self.total_items += count_rows(conn, target_col)
        conn.close()

    self.F_SQL = F_SQL
    self.config = config
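
# count_rows is a project helper that is not defined in this file. As a
# point of reference, here is a minimal sketch of what it is assumed to do:
# count the rows of one named table. Returning 0 for a missing table is an
# assumption; the real helper may raise instead.
def count_rows(conn, table):
    # Identifiers cannot be parameterized in SQLite, so the table name is
    # formatted in directly and must come from trusted configuration.
    try:
        query = "SELECT COUNT(*) FROM {}".format(table)
        return conn.execute(query).fetchone()[0]
    except sqlite3.OperationalError:
        return 0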
DB_ITR = itertools.product(F_SQL, config["target_columns"])

# output_dir does not change per file, so create it once up front
mkdir(output_dir)

for f_sql, target_col in DB_ITR:
    f_sql_out = os.path.join(output_dir, os.path.basename(f_sql))

    conn_out = sqlite3.connect(f_sql_out)
    conn = sqlite3.connect(f_sql, check_same_thread=False)

    tables = list_tables(conn_out)
    if target_col in tables:
        # _FORCE and import_column are assumed to be module-level names
        if not _FORCE:
            row_n_conn = count_rows(conn, import_column)
            row_n_conn_out = count_rows(conn_out, target_col)

            # Skip the table only if the row counts agree exactly
            if row_n_conn == row_n_conn_out:
                msg = "{}:{} already exists, skipping"
                print(msg.format(f_sql, target_col))
                continue

            msg = "{} already exists but there is a size mismatch {} to {}"
            print(msg.format(target_col, row_n_conn, row_n_conn_out))

        # Remove the stale table. SQLite cannot parameterize identifiers,
        # so target_col is formatted in directly; it comes from the config
        # file, not from user input.
        print("Removing table {}:{}".format(f_sql, target_col))
        conn_out.execute("DROP TABLE {}".format(target_col))
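
# list_tables is likewise a project helper not shown here. A minimal sketch
# under that assumption: read table names from the sqlite_master catalog.
def list_tables(conn):
    cursor = conn.execute(
        "SELECT name FROM sqlite_master WHERE type='table'")
    return [row[0] for row in cursor]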