def process_azm_file(args):
    proc_start_time = time.time()
    ret = -9
    use_popen_mode = True
    sql_dump_file = None
    try:
        dir_processing_azm = None
        dry_str = args['dry']
        dry_str = dry_str.strip().lower()
        dry_mode = (dry_str == "true")
        print "dry_mode setting: ", dry_mode
        if dry_mode:
            print "dry_mode - don't unzip azm for azqdata.db - let preprocess func handle it itself"
        else:
            print "normal import mode"
            dir_processing_azm = unzip_azm_to_tmp_folder(args)
            args['dir_processing_azm'] = dir_processing_azm

        preprocess_module = args['call_preprocess_func_in_module_before_import']
        if not preprocess_module is None:
            preprocess_module = preprocess_module.replace(".py", "", 1)
            print "get preprocess module: ", preprocess_module
            importlib.import_module(preprocess_module)
            mod = sys.modules[preprocess_module]
            preprocess = getattr(mod, 'preprocess')
            print "exec preprocess module > preprocess func"
            preprocess(dir_processing_azm, args['azm_file'])

        if dry_mode:
            print "dry_mode - end here"
            mv_azm_to_target_folder(args)
            return 0

        check_azm_azq_app_version(args)

        g_check_and_dont_create_if_empty = args['check_and_dont_create_if_empty']

        use_popen_mode = not args['dump_to_file_mode']

        if args['target_db_type'] == "sqlite3":
            if args['target_sqlite3_file'] is None:
                raise Exception(
                    "INVALID: sqlite3 merge mode requires --target_sqlite3_file option to be specified - ABORT"
                )
            else:
                use_popen_mode = False  # dump to .sql file for .read

        print "NOTE: now we delete pre y2k rows and if it was popen then the delete would error as 'database is locked' so always dump schema to sql - so force set use_popen_mode = False"
        use_popen_mode = False

        if (use_popen_mode):
            print "using live in-memory pipe of sqlite3 dump output parse mode"
        else:
            print "using full dump of sqlite3 to file mode"

        dump_process = None
        dumped_sql_fp = None

        if (use_popen_mode):
            print("starting sqlite3 subprocess...")
            dump_process = popen_sqlite3_dump(args)
            if dump_process is None:
                raise Exception("FATAL: dump_process is None in popen_mode - ABORT")
        else:
            print("starting sqlite3 to dump db to .sql file...")
            dumped_sql_fp = dump_db_to_sql(dir_processing_azm)
            if dumped_sql_fp is None:
                raise Exception("FATAL: dumped_sql_fp is None in non popen_mode - ABORT")

        # sqlite3 merge is simply running .read on args['dumped_sql_fp']
        if args['target_db_type'] == "sqlite3":
            is_target_exists = os.path.isfile(args['target_sqlite3_file'])
            print "sqlite3 - import to {} from {}".format(args['target_sqlite3_file'], dumped_sql_fp)
            dumped_sql_fp_adj = dumped_sql_fp + "_adj.sql"
            of = open(dumped_sql_fp, "r")
            # wb required for windows so that \n is 0x0A - otherwise \n would be 0x0D 0x0A and wouldn't go
            # with our fmt file and only 1 row would be inserted per table csv in bulk inserts...
            nf = open(dumped_sql_fp_adj, "wb")
            while True:
                ofl = of.readline()
                if ofl == "":
                    break
                ofl = ofl.replace("CREATE TABLE android_metadata (locale TEXT);", "", 1)
                ofl = ofl.replace('CREATE TABLE "', 'CREATE TABLE IF NOT EXISTS "', 1)
                if ofl.startswith('INSERT INTO "android_metadata"'):
                    ofl = ""
                if is_target_exists:
                    # don't insert or create qgis tables
                    if ofl.startswith("CREATE TABLE geometry_columns") or ofl.startswith(
                            "CREATE TABLE spatial_ref_sys") or ofl.startswith(
                            'INSERT INTO "spatial_ref_sys"') or ofl.startswith(
                            'INSERT INTO "geometry_columns"'):
                        ofl = ""
                nf.write(ofl)
                #nf.write('\n')
            nf.close()
            of.close()
            cmd = [
                args['sqlite3_executable'],
                args['target_sqlite3_file'],
                ".read {}".format(dumped_sql_fp_adj.replace("\\", "\\\\"))
            ]
            print "cmd: ", cmd
            ret = call(cmd, shell=False)
            print "import ret: " + str(ret)
            if (ret == 0):
                print("\n=== SUCCESS - import completed in %s seconds" % (time.time() - proc_start_time))
                if debug_helpers.debug == 1:
                    print "debug mode keep_tmp_dir..."
                else:
                    cleanup_tmp_dir(dir_processing_azm)
                return 0
            else:
                if debug_helpers.debug == 1:
                    print "debug mode keep_tmp_dir..."
                else:
                    cleanup_tmp_dir(dir_processing_azm)
                raise Exception(
                    "\n=== FAILED - ret %d - operation completed in %s seconds" %
                    (ret, time.time() - proc_start_time))
            raise Exception("FATAL: sqlite3 mode merge process failed - invalid state")

        # now we use bulk insert done at create/commit funcs instead
        g_insert_function = getattr(mod, 'handle_sqlite3_dump_insert')

        print "### connecting to dbms..."
        ret = g_connect_function(args)
        if ret == False:
            raise Exception("FATAL: connect_function failed")

        if (args['unmerge']):
            print "### unmerge mode"
            # unmerge mode is handled by the same check_if_already_merged_function below - the 'unmerge' flag is in args

        # check if this azm is already imported/merged in target db (and exit if already imported)
        # get log_hash
        sqlstr = "select log_hash from logs limit 1"
        cmd = [args['sqlite3_executable'], args['file'], sqlstr]
        print "call cmd:", cmd
        outstr = subprocess.check_output(cmd)
        log_hash = outstr.strip()
        args['log_hash'] = long(log_hash)
        if log_hash == 0:
            raise Exception("FATAL: invalid log_hash == 0 case")

        args['log_start_time_str'] = get_sql_result(
            "select log_start_time from logs limit 1", args)
        args['log_end_time_str'] = get_sql_result(
            "select log_end_time from logs limit 1", args)
        args['log_start_time'] = get_sql_result(
            "select strftime('%s', log_start_time) from logs limit 1", args)
        print "parse log_start_time:", args['log_start_time']
        args['log_start_time'] = datetime.fromtimestamp(long(args['log_start_time']))
        print "args['log_start_time']:", args['log_start_time']
        print "args['log_start_time_str']:", args['log_start_time_str']
        args['log_end_time'] = get_sql_result(
            "select strftime('%s', log_end_time) from logs limit 1", args)
        print "parse log_end_time:", args['log_end_time']
        args['log_end_time'] = datetime.fromtimestamp(long(args['log_end_time']))
        print "args['log_end_time']:", args['log_end_time']
        print "args['log_end_time_str']:", args['log_end_time_str']
        args['log_data_min_time'] = args['log_start_time'] - timedelta(hours=48)
        print "args['log_data_min_time']:", args['log_data_min_time']
        args['log_data_max_time'] = args['log_end_time'] + timedelta(hours=48)
        print "args['log_data_max_time']:", args['log_data_max_time']

        if log_hash == 0:
            raise Exception("FATAL: invalid log_hash == 0 case")

        g_check_if_already_merged_function(args, log_hash)

        '''now we're connected and ready to import - open the dumped file and handle
        CREATE/INSERT operations for the current target_type (DBMS type)'''

        if (use_popen_mode == False):
            sql_dump_file = open(dumped_sql_fp, 'rb')

        # output for try manual import mode
        # args['out_sql_dump_file'] = open("out_for_dbtype_{}.sql".format(args['file']), 'w')

        dprint("entering main loop")
        n_lines_parsed = 0
        while (True):
            if (use_popen_mode):
                line = dump_process.stdout.readline()
            else:
                line = sql_dump_file.readline()
            dprint("read line: " + line)
            # when EOF is reached, we'd get an empty string
            if (line == ""):
                print "\nreached end of file/output"
                break
            else:
                n_lines_parsed = n_lines_parsed + 1
                handle_sql3_dump_line(args, line)

        # finally call commit again in case the file didn't have a 'commit' line at the end
        print "### calling handler's commit func as we've reached the end..."
        handle_ret = g_commit_function(args, line)

        # call close() for that dbms handler
        operation = "merge/import"
        if (args['unmerge']):
            operation = "unmerge/delete"

        if (n_lines_parsed != 0):
            print(
                "\n=== SUCCESS - %s completed in %s seconds - total n_lines_parsed %d (not including bulk-inserted-table-content-lines)"
                % (operation, time.time() - proc_start_time, n_lines_parsed))
            ret = 0
            mv_azm_to_target_folder(args)
        else:
            raise Exception(
                "\n=== FAILED - %s - no lines parsed - total n_lines_parsed %d - operation completed in %s seconds ==="
                % (operation, n_lines_parsed, time.time() - proc_start_time))
    except Exception as e:
        type_, value_, traceback_ = sys.exc_info()
        exstr = traceback.format_exception(type_, value_, traceback_)
        mv_target_folder = args['move_failed_import_azm_files_to_folder']
        if not mv_target_folder is None and not os.path.exists(mv_target_folder):
            os.makedirs(mv_target_folder)
        if not mv_target_folder is None:
            azm_fp = os.path.abspath(args['azm_file'])
            target_fp = os.path.join(mv_target_folder, os.path.basename(azm_fp))
            try:
                os.remove(target_fp)
                os.remove(target_fp + "_output.txt")
            except Exception as x:
                pass
            print "move the failed_import_azm_files_to_folder: mv {} to {}".format(azm_fp, target_fp)
            try:
                os.rename(azm_fp, target_fp)
                try:
                    os.rename(azm_fp + "_output.txt", target_fp + "_output.txt")
                except:
                    pass
            except Exception as x:
                print "WARNING: move_failed_import_azm_files_to_folder failed"
                pass
        print "re-raise exception e - ", exstr
        raise e
    finally:
        print "cleanup start..."
        if (use_popen_mode):
            # clean up dump process
            try:
                dump_process.kill()
                dump_process.terminate()
            except:
                pass
        else:
            try:
                sql_dump_file.close()
            except:
                pass
        try:
            g_close_function(args)
        except:
            pass
        if debug_helpers.debug == 1:
            print "debug mode keep_tmp_dir..."
            pass  # keep files for analysis of exceptions in debug mode
        else:
            print "cleanup_tmp_dir..."
            cleanup_tmp_dir(dir_processing_azm)

    return ret

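# --- illustrative sketch only (not called by the merge flow above) ---
# Shows how the 48h guard window used by process_azm_file() is derived from the epoch
# strings returned by the sqlite3 CLI; rows whose modem 'time' falls outside this window
# are deleted before import. The helper name _demo_log_time_window is hypothetical and
# not part of the original module.
def _demo_log_time_window(log_start_epoch_str, log_end_epoch_str):
    from datetime import datetime as _dt, timedelta as _td
    start = _dt.fromtimestamp(long(log_start_epoch_str.strip()))
    end = _dt.fromtimestamp(long(log_end_epoch_str.strip()))
    # valid data range is [log_start_time - 48h, log_end_time + 48h]
    return start - _td(hours=48), end + _td(hours=48)
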
def handle_sql3_dump_line(args, line):
    global g_is_in_insert
    global g_is_in_create
    global g_insert_buf
    global g_create_buf
    global g_insert_function

    if g_is_in_insert is True:
        g_insert_buf = g_insert_buf + line
        if line.strip().endswith(");"):
            handle_ret = g_insert_function(args, g_insert_buf.strip())
            g_is_in_insert = False
            g_insert_buf = None
            # dprint("multi line insert END:")
            return handle_ret
        else:
            # dprint("multi line insert still not ending - continue")
            return True

    if g_is_in_create:
        g_create_buf += line.strip()
        if line.strip().endswith(");"):
            line = g_create_buf
            print "multi line create END\ng_is_in_create final line:", line
        else:
            return True

    is_omit_table = False

    if line.startswith("CREATE TABLE ") or g_is_in_create:
        g_is_in_create = False
        # in case the user is using an already 'sqlite3 merged azqdata.db' there will be the
        # CREATE TABLE IF NOT EXISTS lines which we created - restore them...
        line = line.replace("CREATE TABLE IF NOT EXISTS ", "CREATE TABLE ", 1)
        if not line.strip().endswith(");"):
            print("multi line create START")
            g_is_in_create = True
            g_create_buf = line.strip()
            return True
        table_name = line.split(" (")[0].replace("CREATE TABLE ", "").replace("\"", "")
        dprint("check table_name is_omit_table: " + table_name)
        is_omit_table = table_name in args['omit_tables_array']
        dprint("is this table in --omit_tables ? " + table_name + " = " + str(is_omit_table))
        if args['only_tables_on']:
            dprint("--only_tables on - check if we should exclude this table: " + table_name)
            is_omit_table = True
            if table_name in args['only_tables_array']:
                is_omit_table = False
            dprint("--only_tables on - exclude this table? " + table_name + " = " + str(is_omit_table))

    if (line.startswith("CREATE TABLE ")
            and not line.startswith("CREATE TABLE android_metadata")
            and not ("_layer_statistics" in line) and not is_omit_table):
        # get table name:
        table_name = line.split(" ")[2].replace("\"", "")
        if not args['unmerge']:
            print "first delete all rows with wrong modem timestamps outside 48h of log_start_time and log_end_time for this table:", table_name
            sqlstr = "delete from {} where time < '{}' or time > '{}';".format(
                table_name, args['log_data_min_time'], args['log_data_max_time'])
            cmd = [args['sqlite3_executable'], args['file'], sqlstr]
            print "call cmd:", cmd
            try:
                outstr = subprocess.check_output(cmd)
                print "delete from ret outstr:", outstr
            except Exception as se:
                print "WARNING: delete pre y2k rows from table failed exception:", se
        print("\nprocessing: create/alter/insert for table_name: " + table_name)
        # check if create is required for this table (omit if empty)
        create = True
        if (not g_check_and_dont_create_if_empty):
            pass
            # print "create_empty_tables specified in option - do create"
            # always create - flag override
        else:
            # checking can make final processing slower...
            print("checking if table is empty ...")
            sqlstr = "SELECT 1 FROM {} LIMIT 1".format(table_name)
            cmd = [args['sqlite3_executable'], args['file'], sqlstr]
            print "call cmd:", cmd
            outstr = subprocess.check_output(cmd)
            # print "check has_rows out: "+outstr
            has_rows = (outstr.strip() == "1")
            # print "check has_rows ret: " + str(has_rows)
            if (has_rows):
                print "table is not empty - do create"
                create = True
            else:
                print "table is empty - omit create"
                create = False
        if create:
            print "processing create at handler module..."
            # always create - flag override
            handle_ret = g_create_function(args, line)
    elif (line.startswith("COMMIT;")):
        print("\nprocessing: commit")
        handle_ret = g_commit_function(args, line)
        return handle_ret
    elif (line.startswith("INSERT INTO")):
        raise Exception(
            "ABORT: currently bulk insert mode is used so only the schema should be dumped/read... found INSERT INTO - abort"
        )
        table_name = line.split(" ")[2].replace("\"", "")
        if (table_name == "android_metadata"):
            return True  # omit
        line_stripped = line.strip()
        if line_stripped.endswith(");"):
            # dprint("single line insert")
            handle_ret = g_insert_function(args, line_stripped)
            return handle_ret
        else:
            dprint("multi line insert START")
            g_is_in_insert = True
            g_insert_buf = line
            return True
    else:
        # dprint("omit line: " + line)
        return True

    return False

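# --- illustrative sketch only: the statement-buffering idea used by handle_sql3_dump_line() ---
# A minimal, self-contained version of the "accumulate lines until the statement ends with );"
# logic above, under the same assumption that ');' never appears inside a string literal of a
# buffered statement. The class name _DemoStatementBuffer is hypothetical, not part of the module.
class _DemoStatementBuffer(object):
    def __init__(self):
        self.buf = None

    def feed(self, line):
        """Return the full statement once its final line arrives, else None."""
        if self.buf is None:
            self.buf = line
        else:
            self.buf += line
        if line.strip().endswith(");"):
            stmt = self.buf
            self.buf = None
            return stmt
        return None
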
def unzip_azm_to_tmp_folder(args):
    dprint("unzip_azm_to_tmp_folder 0")
    print "args['azm_file']: " + args['azm_file']
    azm_fp = os.path.abspath(args['azm_file'])
    print "azm_fp: " + azm_fp
    if os.path.isfile(azm_fp):
        pass
    else:
        raise Exception("INVALID: - azm file does not exist at given path: " + str(azm_fp) + " - ABORT")
    dir_azm_unpack = os.path.dirname(azm_fp)
    print "dir_azm_unpack: " + dir_azm_unpack
    azm_name_no_ext = os.path.splitext(os.path.basename(azm_fp))[0]
    print "azm_name_no_ext: " + azm_name_no_ext
    if 'TMP_GEN_PATH' in os.environ:
        dir_azm_unpack = os.environ['TMP_GEN_PATH']
        print "dir_azm_unpack using TMP_GEN_PATH:", dir_azm_unpack
    dir_processing_azm = os.path.join(
        dir_azm_unpack,
        "tmp_azm_db_merge_" + str(uuid.uuid4()) + "_" +
        azm_name_no_ext.replace(" ", "-"))  # replace 'space' in azm file name
    args['dir_processing_azm'] = dir_processing_azm
    dprint("unzip_azm_to_tmp_folder 1")

    # try to clear the tmp processing folder just in case it exists from a manual unzip or previous failed imports
    try:
        shutil.rmtree(dir_processing_azm)
    except Exception as e:
        estr = str(e)
        if ("cannot find the path specified" in estr or "No such file or" in estr):
            pass
        else:
            print("rmtree dir_processing_azm: " + str(e))
            raise e
    dprint("unzip_azm_to_tmp_folder 2")
    os.mkdir(dir_processing_azm)
    dprint("unzip_azm_to_tmp_folder 3")

    try:
        azm = zipfile.ZipFile(args['azm_file'], 'r')
        azm.extract("azqdata.db", dir_processing_azm)

        try:
            # handle malformed db cases
            import pandas as pd
            import sqlite3
            dbfile = os.path.join(dir_processing_azm, "azqdata.db")
            dbcon = sqlite3.connect(dbfile)
            integ_check_df = pd.read_sql("PRAGMA integrity_check;", dbcon)
            try:
                # we don't use dbcon in further azm_db_merge code, and the db file can be
                # removed if integrity is not ok - avoid file locks
                dbcon.close()
            except:
                pass
            print "azm_db_merge: sqlite db integ_check_df first row:", integ_check_df.iloc[0]
            if integ_check_df.iloc[0].integrity_check == "ok":
                print "azm_db_merge: sqlite3 db integrity check ok"
            else:
                print "azm_db_merge: sqlite3 db integrity check failed - try recover..."
                dump_ret = subprocess.call(
                    "sqlite3 '{}' .dump > '{}.txt'".format(dbfile, dbfile),
                    shell=True)
                print "azm_db_merge: dump_ret:", dump_ret
                if dump_ret != 0:
                    print "WARNING: azm_db_merge: recov corrupt sqlite db file - failed to dump sqlite db file"
                else:
                    os.remove(dbfile)
                    import_ret = subprocess.call(
                        "sqlite3 '{}' < '{}.txt'".format(dbfile, dbfile),
                        shell=True)
                    print "azm_db_merge: recov corrupt db file import ret:", import_ret
        except:
            type_, value_, traceback_ = sys.exc_info()
            exstr = traceback.format_exception(type_, value_, traceback_)
            print "WARNING: check malformed db exception:", exstr

        if args['get_schema_shasum_and_exit']:
            print "get_schema_shasum_and_exit start"
            sha1 = hashlib.sha1()
            #print "get_schema_shasum_and_exit 1"
            dbfile = os.path.join(dir_processing_azm, "azqdata.db")
            #print "get_schema_shasum_and_exit 2"
            cmd = [args['sqlite3_executable'], dbfile, ".schema"]
            print "call cmd:", cmd
            schema = subprocess.check_output(cmd)
            #print "get_schema_shasum_and_exit 3"
            sha1.update(schema)
            #print "get_schema_shasum_and_exit 4"
            print str(sha1.hexdigest()) + " is the sha1 for the schema of azqdata.db inside azm: " + args['azm_file']
            print "get_schema_shasum_and_exit done"
            azm.close()
            cleanup_tmp_dir(dir_processing_azm)
            exit(0)

        azm.close()
    except Exception as e:
        try:
            cleanup_tmp_dir(dir_processing_azm)
        except:
            pass
        raise Exception(
            "Invalid azm_file: azm file does not contain azqdata.db database - exception: " + str(e))

    dprint("unzip_azm_to_tmp_folder 4")
    args['file'] = os.path.join(dir_processing_azm, "azqdata.db")
    return dir_processing_azm

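# --- illustrative sketch only: the schema checksum idea behind --get_schema_shasum_and_exit ---
# Mirrors the hashlib.sha1 over ".schema" output used in unzip_azm_to_tmp_folder() above,
# as a standalone helper. Assumes the sqlite3 executable path and db path are valid; the
# helper name _demo_schema_sha1 is hypothetical.
def _demo_schema_sha1(sqlite3_executable, db_path):
    import hashlib
    import subprocess
    schema = subprocess.check_output([sqlite3_executable, db_path, ".schema"])
    return hashlib.sha1(schema).hexdigest()
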
def create(args, line):
    global g_cursor, g_conn
    global g_prev_create_statement_table_name
    global g_prev_create_statement_column_names
    global g_exec_buf
    global g_is_ms, g_is_postgre
    global g_unmerge_logs_row

    g_prev_create_statement_column_names = None

    if args["pg_schema"] != "public":
        g_cursor.execute("SET search_path = '{}','public';".format(args["pg_schema"]))

    line_adj = sql_adj_line(line)
    table_name = get_table_name(line_adj)
    schema_per_month_name = "per_month_{}".format(table_name)

    if table_name == "logs":
        uline = line.replace('"log_hash" BIGINT,', '"log_hash" BIGINT UNIQUE,', 1)
        print "'logs' table cre - make log_hash unique for this table: ", uline
        line_adj = sql_adj_line(uline)

    if table_name == "wifi_scanned":
        wifi_scanned_MIN_APP_V0 = 3
        wifi_scanned_MIN_APP_V1 = 0
        wifi_scanned_MIN_APP_V2 = 742
        print "check azm apk ver for wifi_scanned table omit: ", args["azm_apk_version"]
        if args["azm_apk_version"] < wifi_scanned_MIN_APP_V0 * 1000 * 1000 + wifi_scanned_MIN_APP_V1 * 1000 + wifi_scanned_MIN_APP_V2:
            print "omit invalidly huge wifi_scanned table in older app vers as requested by a customer - causes various db issues"
            return False

    if args['import_geom_column_in_location_table_only'] and table_name != "location":
        line_adj = sql_adj_line(line.replace(',"geom" BLOB', '', 1))

    if (g_unmerge_logs_row is not None):
        print "### unmerge mode - delete all rows for this azm in table: " + table_name
        """ now we use log_hash - no need to parse time
        # remove 3 trailing 000 from microsecs str
        start_dt_str = str(g_unmerge_logs_row['log_start_time'])[:-3]
        end_dt_str = str(g_unmerge_logs_row['log_end_time'])[:-3]
        """
        sqlstr = "delete from \"" + table_name + "\" where \"log_hash\" = {}".format(
            g_unmerge_logs_row['log_hash'])
        g_exec_buf.append(sqlstr)
        return True

    g_prev_create_statement_table_name = table_name
    sqlstr = line_adj

    '''
    Now get local columns.
    Example sqlstr:
    CREATE TABLE "browse" ("time" DATETIME,"time_ms" INT,"posid" INT,"seqid" INT,"netid" INT,
    "Browse_All_Session_Throughput_Avg" real, "Data_Browse_Throughput" real,
    "Data_Browse_Throughput_Avg" real, "Data_Browse_Total_Loaded_Obj" smallint,
    "Data_Browse_Total_Page_Obj" smallint, "Data_Browse_Page_Load_Time" real,
    "Data_Browse_Page_Load_Time_Avg" real, "Data_Browse_Total_Sessions" smallint,
    "Data_Browse_Total_Success" smallint, "Data_Browse_Total_Fail_Page" smallint,
    "Data_Browse_Total_Fail_Obj" smallint, "Data_Browse_Total_Timeout" smallint,
    "Data_Browse_Exterior_Fail_Page" smallint, "Data_Browse_Exterior_Fail_Obj" smallint,
    "Browse_Throughput" real, "Browse_Throughput_max" real, "Browse_Throughput_min" real,
    "Browse_Duration" real, "Browse_Duration_max" real, "Browse_Duration_min" real);
    '''
    # get part inside parenthesis
    ls = line_adj.split('" (')
    dprint("ls :" + str(ls))
    ls = ls[1].split(");")[0]
    # split by comma
    ls = ls.split(",")

    # parse column names and keep for insert commands
    local_columns = []
    local_column_names = []
    for lsp in ls:
        splitted = lsp.split('"')
        dprint("splitted: " + str(splitted))
        col_name = splitted[1]
        col_type = splitted[2].strip()
        omit_col = False
        """ import_geom_column_in_location_table_only feature already implemented at line_adj above
        if args['import_geom_column_in_location_table_only'] and col_name == "geom" and table_name != "location":
            omit_col = True
        """
        if omit_col == False:
            local_columns.append([col_name, col_type])
            local_column_names.append(col_name)

    # args['prev_create_statement_column_names']
    g_prev_create_statement_column_names = str(local_column_names).replace(
        "'", "").replace("[", "(").replace("]", ")")

    remote_column_names = None
    is_contains_geom_col = False

    try:
        dprint("create sqlstr: " + sqlstr)
        if g_is_postgre:
            if args['pg10_partition_by_month']:
                if table_name == "logs":
                    # don't partition the logs table
                    pass
                else:
                    # create target partition for this log + table
                    # ok - partition this table
                    sqlstr = sqlstr.replace(";", "") + " PARTITION BY RANGE (time);"
                    try:
                        with g_conn as c:
                            g_cursor.execute(
                                "SELECT schema_name FROM information_schema.schemata WHERE schema_name = '{}';"
                                .format(schema_per_month_name))
                            if bool(g_cursor.rowcount):
                                print "schema_per_month_name already exists:", schema_per_month_name
                                pass
                            else:
                                print "cre schema now because: NOT schema_per_month_name already exists:", schema_per_month_name
                                with g_conn as c:
                                    c_table_per_month_sql = "create schema {};".format(schema_per_month_name)
                                    ret = g_cursor.execute(c_table_per_month_sql)
                                    g_conn.commit()
                                    print "success: create per_month [" + c_table_per_month_sql + "] success"
                    except:
                        type_, value_, traceback_ = sys.exc_info()
                        exstr = str(traceback.format_exception(type_, value_, traceback_))
                        print "WARNING: create table_per_month schema failed - next insert/COPY commands would likely fail now - exstr:", exstr
            dprint("create sqlstr postgres mod: " + sqlstr)
            is_contains_geom_col = True  # postgis automatically creates/maintains the "geometry_columns" 'view'
        if g_is_ms:
            dprint("create sqlstr mod mssql geom: " + sqlstr)
            is_contains_geom_col = True

        if g_is_postgre:
            with g_conn as c:
                g_cursor.execute(
                    "select * from information_schema.tables where table_schema=%s and table_name=%s",
                    (args["pg_schema"], table_name,))
                if bool(g_cursor.rowcount):
                    print "omit already existing table - raise exception to check columns instead"
                    raise Exception("table {} already exists - no need to create".format(table_name))

        ret = None
        # use with for auto rollback() on g_conn on expected fails like 'already exists'
        with g_conn as c:
            ret = g_cursor.execute(sqlstr)
        # commit now otherwise COPY might not see partitions
        g_conn.commit()
        dprint("create execute ret: " + str(ret))
        """ if control reaches here then the create is successful
        - the table did not exist earlier - so remote cols must be the same """
        remote_column_names = local_column_names
    except Exception as e:
        emsg = str(e)
        dprint("create failed: " + emsg + "\n from sqlstr:\n" + sqlstr + "\nori line:\n" + line)
        if ("There is already an object named" in emsg or " already exists" in emsg):
            dprint("""This table already exists -
            checking if all local columns already exist in remote -
            otherwise will add each missing col to the remote table before inserting into it.""")
            remote_columns = get_remote_columns(args, table_name)
            remote_column_names = get_col_names(remote_columns)
            if (len(remote_columns) == 0):
                raise Exception("FATAL: failed to parse/list remote columns")

            # now get local columns that are not in remote
            local_columns_not_in_remote = []
            for col in local_columns:
                col_name = col[0]
                col_type = col[1]
                # quickfix: col_type override for unsigned int32 cols from sqlite (bindLong already)
                # - conv to bigint in pg as pg doesn't have unsigned
                if col_name == "lte_volte_rtp_source_ssrc" or col_name == "lte_volte_rtp_timestamp":
                    # might need psql to do it manually first if the log was already imported using an older azm_db_merge:
                    # alter table all_logs.lte_volte_rtp_msg alter column lte_volte_rtp_source_ssrc type bigint;
                    # alter table all_logs.lte_volte_rtp_msg alter column lte_volte_rtp_timestamp type bigint;
                    col_type = "bigint"
                is_already_in_table = col_name in remote_column_names
                dprint("local_col_name: " + col_name + " col_type: " + col_type +
                       " - is_already_in_table: " + str(is_already_in_table))
                if (not is_already_in_table):
                    local_columns_not_in_remote.append(' "{}" {}'.format(col_name, col_type))
                # TODO: handle if different type?

            n_cols_to_add = len(local_columns_not_in_remote)
            if (n_cols_to_add == 0):
                dprint("n_cols_to_add == 0 - no need to alter table")
            else:
                print "n_cols_to_add: " + str(n_cols_to_add) + \
                    " - need to alter table - add cols:" + str(local_columns_not_in_remote) + \
                    "\nremote_cols:\n" + str(remote_columns)
                # example: ALTER TABLE dbo.doc_exa ADD column_b VARCHAR(20) NULL, column_c INT NULL ;
                alter_str = "ALTER TABLE \"{}\" ".format(table_name)
                alter_cols = ""
                for new_col in local_columns_not_in_remote:
                    # not first
                    prefix = ""
                    if (alter_cols != ""):
                        prefix = ", "
                    alter_cols = alter_cols + prefix + " ADD " + new_col
                alter_str = alter_str + alter_cols + ";"
                sqlstr = sql_adj_line(alter_str)
                print "execute alter_str: " + sqlstr
                exec_creatept_or_alter_handle_concurrency(sqlstr)

                # re-get remote cols
                remote_columns = get_remote_columns(args, table_name)
                remote_column_names = get_col_names(remote_columns)
                print("get_remote_columns after alter: " + str(remote_column_names))
        else:
            raise Exception("FATAL: create table error - : \nemsg:\n " + emsg + " \nsqlstr:\n" + sqlstr)

    if g_bulk_insert_mode:
        if args['pg10_partition_by_month']:
            if table_name == "logs":
                # don't partition the logs table
                pass
            else:
                log_hash = args['log_hash']
                # check/create partitions for the month of this log_hash, the previous month and the next month
                ori_log_hash_datetime = datetime.datetime.fromtimestamp(
                    log_hash & 0xffffffff)  # log_hash lower 32 bits is the timestamp
                months_pt_check_list = [
                    ori_log_hash_datetime + relativedelta(months=-1),
                    ori_log_hash_datetime,
                    ori_log_hash_datetime + relativedelta(months=+1)
                ]
                for log_hash_datetime in months_pt_check_list:
                    log_hash_ym_str = log_hash_datetime.strftime('%Y_%m')
                    print "log_hash_datetime:", log_hash_datetime
                    # simpler name because we got cases where the schema's table name got truncated:
                    # activate_dedicated_eps_bearer_context_request_params_3170932708
                    ntn = "logs_{}".format(log_hash_ym_str)
                    pltn = "{}.{}".format(schema_per_month_name, ntn)
                    per_month_table_already_exists = False
                    with g_conn as c:
                        check_sql = "select * from information_schema.tables where table_schema='{}' and table_name='{}'".format(
                            schema_per_month_name, ntn)
                        print "check_sql:", check_sql
                        g_cursor.execute(check_sql)
                        if bool(g_cursor.rowcount):
                            per_month_table_already_exists = True
                    if per_month_table_already_exists:
                        print "omit create already existing per_month table:", pltn
                    else:
                        print "per_month table does not exist yet - create it:", pltn
                        cre_target_pt_sql = "CREATE TABLE {} PARTITION OF {} FOR VALUES from ('{}-1') to ('{}-1');".format(
                            pltn, table_name,
                            log_hash_datetime.strftime("%Y-%m"),
                            (log_hash_datetime + relativedelta(months=+1)).strftime("%Y-%m"))
                        if args['pg10_partition_index_log_hash']:
                            cre_index_for_pt_sql = "CREATE INDEX ON {} (log_hash);".format(pltn)
                            cre_target_pt_sql += " " + cre_index_for_pt_sql
                        print("cre_target_pt_sql:", cre_target_pt_sql)
                        exec_creatept_or_alter_handle_concurrency(
                            cre_target_pt_sql, allow_exstr_list=[" already exists"])

        ###### let sqlite3 dump the contents of this table into a file

        table_dump_fp = os.path.join(g_dir_processing_azm, table_name + ".csv")
        table_dump_format_fp = os.path.join(g_dir_processing_azm, table_name + ".fmt")
        #print("table_dump_fp: "+table_dump_fp)
        #print("table_dump_format_fp: "+table_dump_format_fp)

        # create dump csv of that table
        """ example dump of logs table:
        sqlite3 azqdata.db -list -newline "|" -separator "," ".out c:\\azq\\azq_report_gen\\azm_db_merge\\logs.csv" "select * from logs"
        """

        # get col list, and hex(col) for blob columns
        geom_col_index = -1
        i = 0
        col_select = ""
        first = True
        dprint("local_columns: " + str(local_columns))
        for col in local_columns:
            col_name = col[0]
            col_type = col[1]
            if first:
                first = False
            else:
                col_select = col_select + ","
            pre = " "
            post = ""
            if col_type == "geometry" or (g_is_postgre and col_type == "bytea") or (
                    g_is_ms and col_type.startswith("varbinary")):
                pre = " hex("
                post = ")"
                if col_name == "geom":
                    geom_col_index = i
            col_select = col_select + pre + col_name + post
            i = i + 1
        dprint("col_select: " + col_select)

        if g_is_ms:
            ret = call(
                [
                    args['sqlite3_executable'], args['file'], "-ascii", "-list",
                    '-separator', azm_db_constants.BULK_INSERT_COL_SEPARATOR_VALUE,
                    '-newline', azm_db_constants.BULK_INSERT_LINE_SEPARATOR_VALUE,
                    # double backslash because it needs to go through sqlite3 cmd parsing again
                    '.out ' + '"' + table_dump_fp.replace("\\", "\\\\") + '"',
                    'select ' + col_select + ' from ' + table_name
                ],
                shell=False)
        if g_is_postgre:
            dump_cmd = [
                args['sqlite3_executable'], args['file'], "-ascii", "-csv",
                '-separator', ',',
                '-newline', '\n',
                # double backslash because it needs to go through sqlite3 cmd parsing again
                '.out ' + '"' + table_dump_fp.replace("\\", "\\\\") + '"',
                'select ' + col_select + ' from ' + table_name
            ]
            dprint("dump_cmd:", dump_cmd)
            ret = call(dump_cmd, shell=False)

        table_dump_fp_adj = table_dump_fp + "_adj.csv"

        # don't add lines where all cols are null - bug in older azm files causing COPY to fail...
        all_cols_null_line = ""
        for ci in range(len(local_columns)):
            if ci != 0:
                all_cols_null_line += ","
        print "all_cols_null_line:", all_cols_null_line

        pd_csvadj_success = False
        # a preliminary pandas version (without geom conv yet) proved slower than python file
        # looping and also failed at the events table, so it is disabled for now...
        """
        if g_is_postgre:
            # try use pandas to adjust csv instead of looping through file...
            try:
                import pandas as pd
                df = pd.read_csv(table_dump_fp, header=None, names=local_column_names)
                print "df.columns:", df.columns
                print "pd table_dump_fp df len:", len(df)
                df.geom = None
                df = df.dropna(how='all')
                df.to_csv(table_dump_fp_adj, header=None)
                pd_csvadj_success = True
            except:
                type_, value_, traceback_ = sys.exc_info()
                exstr = str(traceback.format_exception(type_, value_, traceback_))
                print "pd_csvadj exception:", exstr
        print "pd_csvadj_success:", pd_csvadj_success
        """

        if not pd_csvadj_success:
            with open(table_dump_fp, "rb") as of:
                # wb required for windows so that \n is 0x0A - otherwise \n would be 0x0D 0x0A and wouldn't
                # go with our fmt file and only 1 row would be inserted per table csv in bulk inserts...
                with open(table_dump_fp_adj, "wb") as nf:
                    while True:
                        ofl = of.readline()
                        if g_is_postgre:
                            ofl = ofl.replace(',""', ',')
                        """ no need to check this, only old stale thread versions would have these cases
                        and would have other cases too, so let it crash in all those cases
                        if ofl.strip() == all_cols_null_line:
                            continue
                        """
                        ofl = find_and_conv_spatialite_blob_to_wkb(ofl)
                        if ofl == "":
                            break
                        nf.write(ofl)

        table_dump_fp = table_dump_fp_adj

        dprint("dump table: " + table_name + " for bulk insert ret: " + str(ret))

        if (ret != 0):
            print "WARNING: dump table: " + table_name + " for bulk insert failed - likely a sqlite db file error like: database disk image is malformed. In many cases, data is still correct/complete so continue."

        if (os.stat(table_dump_fp).st_size == 0):
            print "this table is empty..."
            return True

        # if control reaches here then the table is not empty

        if g_is_ms and is_contains_geom_col:
            # add this table:geom to 'geometry_columns' (f_table_name was set to UNIQUE so it will fail if it already exists)
            """ somehow not working - let QGIS detect automatically itself for now...
            try:
                insert_geomcol_sqlstr = "INSERT INTO \"geometry_columns\" VALUES('azq','dbo','{}','geom',NULL,4326,'POINT');".format(table_name)
                dprint("insert_geomcol_sqlstr: "+insert_geomcol_sqlstr)
                ret = g_cursor.execute(insert_geomcol_sqlstr)
                print "insert this table:geom into geometry_columns done"
            except Exception as e:
                estr = str(e)
                dprint("insert this table:geom into geometry_columns exception: "+estr)
                pass
            """

            # create fmt format file for that table
            """
            generate format file:
            https://msdn.microsoft.com/en-us/library/ms178129.aspx
            format file contents:
            https://msdn.microsoft.com/en-us/library/ms191479(v=sql.110).aspx
            """
            n_local_cols = len(local_column_names)

            fmt = open(table_dump_format_fp, "w")
            fmt.write("11.0\n")  # ver - 11.0 = SQL Server 2012
            fmt.write(str(n_local_cols) + "\n")  # n cols

            host_field_order = 0  # dyn gen - first inc will get it to 1
            host_file_data_type = "SQLCHAR"
            prefix_length = 0
            # When a delimited text file having a prefix length of 0 and a terminator is imported,
            # the field-length value is ignored, because the storage space used by the field equals
            # the length of the data plus the terminator
            host_file_data_length = 0
            terminator = None  # dyn gen
            server_col_order = None  # dyn gen
            server_col_name = None  # dyn gen
            col_collation = ""

            for col in local_column_names:
                host_field_order = host_field_order + 1
                if (n_local_cols == host_field_order):  # last
                    terminator = azm_db_constants.BULK_INSERT_LINE_SEPARATOR_PARAM
                else:
                    terminator = azm_db_constants.BULK_INSERT_COL_SEPARATOR_PARAM
                if not table_name.startswith("wifi_scanned"):
                    dprint("remote_column_names: " + str(remote_column_names))
                    pass
                dprint("col: " + str(col))
                server_col_order = remote_column_names.index(col) + 1  # not 0 based
                server_col_name = col  # always same col name
                fmt.write('{}\t{}\t{}\t{}\t"{}"\t{}\t"{}"\t"{}"\n'.format(
                    host_field_order, host_file_data_type, prefix_length,
                    host_file_data_length, terminator, server_col_order,
                    server_col_name, col_collation))
            fmt.flush()
            fmt.close()

        # both dump csv and format fmt files are ready
        # execute bulk insert sql now

        if g_is_ms:
            sqlstr = "bulk insert \"{}\" from '{}' with ( formatfile = '{}' );".format(
                table_name, table_dump_fp, table_dump_format_fp)

        if g_is_postgre:
            colnames = ""
            first = True
            for col in local_column_names:
                if not first:
                    colnames = colnames + ","
                if first:
                    first = False
                colnames = colnames + '"' + col + '"'
            sqlstr = "copy \"{}\" ({}) from STDIN with (format csv, NULL '')".format(
                table_name, colnames)

        dprint("START bulk insert sqlstr: " + sqlstr)
        g_exec_buf.append((sqlstr, table_dump_fp))
        # print("DONE bulk insert - nrows inserted: "+str(ret.rowcount))

    return True

def connect(args):
    global g_bulk_insert_mode
    global g_dir_processing_azm
    global g_cursor, g_conn
    global g_exec_buf
    global g_is_ms, g_is_postgre

    if (args['target_db_type'] == 'postgresql'):
        print "PostgreSQL mode initializing..."
        g_is_postgre = True
        import psycopg2
    elif (args['target_db_type'] == 'mssql'):
        g_is_ms = True
        import pyodbc

    # cleanup old stuff just in case
    close(args)

    g_bulk_insert_mode = True  # always bulk insert mode now

    g_dir_processing_azm = args['dir_processing_azm']

    if g_is_ms:
        print "Connecting... Target DBMS type: mssql"
        dprint("connect args: {} {} {} {}".format(
            args['server_url'], args['server_user'],
            args['server_password'], args['server_database']))
        driver = args['mssql_odbc_driver']
        connect_str = 'DRIVER={};SERVER={};DATABASE={};UID={};PWD={}'.format(
            driver, args['server_url'], args['server_database'],
            args['server_user'], args['server_password'])
        # unsafe as users might see it in logs:
        # print "using connect_str: "+connect_str
        """
        https://msdn.microsoft.com/en-us/library/ms131281.aspx
        ODBC applications should not use Transact-SQL transaction statements such as
        BEGIN TRANSACTION, COMMIT TRANSACTION, or ROLLBACK TRANSACTION because this can
        cause indeterminate behavior in the driver. An ODBC application should run in
        autocommit mode and not use any transaction management functions or statements,
        or run in manual-commit mode and use the ODBC SQLEndTran function to either
        commit or roll back transactions.

        https://mkleehammer.github.io/pyodbc/api.html
        >> 'autocommit' in our case is set to false and we buffer all atomic cmds into
        g_exec_buf to run once before commit
        """
        g_conn = pyodbc.connect(connect_str, autocommit=False)
    elif g_is_postgre:
        print "Connecting... Target DBMS type: PostgreSQL"
        # example: conn = psycopg2.connect("dbname=azqdb user=azqdb")
        connect_str = "dbname={} user={} password={} port={}".format(
            args['server_database'], args['server_user'],
            args['server_password'], args['pg_port'])
        print connect_str
        if args['pg_host'] != None:
            connect_str = "host=" + args['pg_host'] + " " + connect_str
        # unsafe as users might see it in logs:
        # print "using connect_str: "+connect_str
        args['connect_str'] = connect_str
        g_conn = psycopg2.connect(connect_str)

    if (g_conn is None):
        print "psycopg2.connect returned None"
        return False

    print "connected"

    g_cursor = g_conn.cursor()

    # post connect steps for each dbms
    if g_is_postgre and not args['unmerge']:
        try_cre_postgis(schema="public")  # create postgis at public schema first
        if args["pg_schema"] != "public":
            print "pg mode create pg_schema:", args["pg_schema"]
            try:
                with g_conn as c:
                    ret = g_cursor.execute("create schema " + args["pg_schema"])
                    c.commit()
                    print "success: create schema " + args["pg_schema"] + " success"
            except Exception as e:
                estr = str(e)
                if 'already exists' in estr:
                    dprint("schema already exists")
                    pass
                else:
                    print("FATAL: CREATE schema failed:" + args["pg_schema"])
                    raise e
            # create postgis in public only - print "pg using schema start"
            # try_cre_postgis(schema=args["pg_schema"])  # inside new schema

    if g_is_ms:
        pass
        ''' somehow not working - let qgis detect itself for now...
        try:
            # set 'f_table_name' to unique so we can blindly insert table_name:geom (on create handlers) to it without checking (let mssql check)
            ret = g_cursor.execute("""
            CREATE TABLE [dbo].[geometry_columns](
                [f_table_catalog] [varchar](50) NULL,
                [f_table_schema] [varchar](50) NULL,
                [f_table_name] [varchar](100) NULL UNIQUE,
                [f_geometry_column] [varchar](50) NULL,
                [coord_dimension] [int] NULL,
                [srid] [int] NULL,
                [geometry_type] [varchar](50) NULL
            )
            """)
            print "created qgis table: geometry_columns"
        except Exception as e:
            pass

        try:
            # below execute would raise an exception if it is already created
            ret = g_cursor.execute("""
            CREATE TABLE spatial_ref_sys (srid INTEGER NOT NULL PRIMARY KEY,auth_name VARCHAR(256) NOT NULL,auth_srid INTEGER NOT NULL,ref_sys_name VARCHAR(256),proj4text VARCHAR(2048) NOT NULL);
            """)
            print "created qgis table: spatial_ref_sys"
            # if control reaches here the table didn't exist (it was just created and is empty) so insert wgs84 into it...
            ret = g_cursor.execute("""
            INSERT INTO "spatial_ref_sys" VALUES(4326,'epsg',4326,'WGS 84','+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs');
            """)
            print "added wgs84 to qgis table: spatial_ref_sys"
        except Exception as e:
            pass
        '''

    return True

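# --- illustrative sketch only: the "buffer everything, commit once" pattern described above ---
# connect() runs with autocommit off and the handlers append work to g_exec_buf: either plain SQL
# strings or (copy_sql, csv_path) tuples built by create(). The module's actual commit function is
# not shown here; this hypothetical _demo_flush_exec_buf only sketches how such a buffer could be
# flushed in one transaction, assuming a psycopg2 cursor (copy_expert) for the COPY case.
def _demo_flush_exec_buf(conn, cursor, exec_buf):
    for item in exec_buf:
        if isinstance(item, tuple):
            copy_sql, csv_fp = item
            with open(csv_fp, "rb") as f:
                cursor.copy_expert(copy_sql, f)  # "copy ... from STDIN with (format csv, NULL '')"
        else:
            cursor.execute(item)
    conn.commit()
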
def find_and_conv_spatialite_blob_to_wkb(csv_line):
    #print "fac csv_line:", csv_line
    spat_blob_offset = csv_line.find('0001E6100000')
    if spat_blob_offset == -1:
        return csv_line
    part = csv_line[spat_blob_offset:spat_blob_offset + 120 + 1]
    #print "part[120]:", part[120]
    dprint("csv_line spatialite_geom_part: " + part)
    spatialite_geom_contents = ""
    if (g_is_postgre and (part[120] == ',' or part[120] == '\n')) or (g_is_ms and part[120] == '\t'):
        spatialite_geom_contents = part[0:120]
    else:
        dprint("check of spatialite_geom_part - failed - abort")
        return csv_line

    dprint("spatialite_geom_contents: len " + str(len(spatialite_geom_contents)) +
           " val: " + spatialite_geom_contents)

    # convert spatialite geometry blob to wkb
    """
    Spatialite BLOB Format (Point)
    ------------------------------
    http://www.gaia-gis.it/gaia-sins/BLOB-Geometry.html

    example:
    0001E6100000DD30C0F46C2A594041432013008E2B40DD30C0F46C2A594041432013008E2B407C01000000DD30C0F46C2A594041432013008E2B40FE

    parse:
    spatialite header: 00                      (str_off 0   str_len 2)
    endian:            01                      (str_off 2   str_len 2)
    SRID:              E6 10 00 00             (str_off 4   str_len 8)
    MBR_MIN_X:         DD 30 C0 F4 6C 2A 59 40 (str_off 12  str_len 16)
    MBR_MIN_Y:         41 43 20 13 00 8E 2B 40 (str_off 28  str_len 16)
    MBR_MAX_X:         DD 30 C0 F4 6C 2A 59 40 (str_off 44  str_len 16)
    MBR_MAX_Y:         41 43 20 13 00 8E 2B 40 (str_off 60  str_len 16)
    MBR_END:           7C                      (str_off 76  str_len 2)
    CLASS_TYPE:        01 00 00 00             (str_off 78  str_len 8)
    POINT:
      X:               DD 30 C0 F4 6C 2A 59 40 (str_off 86  str_len 16)
      Y:               41 43 20 13 00 8E 2B 40 (str_off 102 str_len 16)
    END:               FE                      (str_off 118 str_len 2)

    ---

    WKB Format
    ----------
    See "3.3.2.6 Description of WKBGeometry Representations" in
    https://portal.opengeospatial.org/files/?artifact_id=829

    Point {
        double x;
        double y;
    };

    WKBPoint {
        byte byteOrder;
        uint32 wkbType;  // 1
        Point point;
    }

    Therefore, for "Point" we need these parts from the spatialite blob: endian, CLASS_TYPE, POINT
    """

    # spatialite blob point size is 60 bytes = 120 chars in hex - as in the example above - and starts with 00
    if len(spatialite_geom_contents) == 120 and spatialite_geom_contents[0] == '0' and spatialite_geom_contents[1] == '0':
        endian = spatialite_geom_contents[2:4]  # 2 + len 2
        class_type = "<unset>"
        if g_is_postgre:
            """ old code:
            class_type = spatialite_geom_contents[78:86]  # 78 + 8

            change class_type to 'point' BITWISE_OR the SRID flag as per
            https://trac.osgeo.org/postgis/browser/trunk/doc/ZMSgeoms.txt
            "
            wkbSRID = 0x20000000
            If the SRID flag is set it's value is encoded as a 4byte integer
            right after the type integer.
            "
            so our class is point | wkbSRID = 0x20000001 (little endian 32: 01000020)
            then add srid "right after the type integer"
            our srid = 4326 = 0x10E6 (little endian 32: E6100000)
            therefore, class_type_point_with_srid_wgs84 little_endian is 01000020E6100000
            """
            class_type = "01000020E6100000"
        elif g_is_ms:
            class_type = ""
        point = spatialite_geom_contents[86:118]  # 86 + 16 + 16
        wkb = ""
        if g_is_postgre:
            wkb = endian + class_type + point
        if g_is_ms:
            """
            https://msdn.microsoft.com/en-us/library/ee320529.aspx
            0xE6100000 01 0C 0000000000001440 0000000000002440
            This string is interpreted as shown in the following table.

            Binary value       Description
            E6100000           SRID = 4326
            01                 Version = 1
            0C                 Serialization Properties = V + P (geometry is valid, single point)
            0000000000001440   X = 5
            0000000000002440   Y = 10
            """
            wkb = "E6100000010C" + point
        dprint("wkb: " + wkb)
        csv_line = csv_line.replace(spatialite_geom_contents, wkb, 1)
    else:
        dprint("not entering spatialite blob parse - len " + str(len(spatialite_geom_contents)))

    dprint("find_and_conv_spatialite_blob_to_wkb ret: " + csv_line)
    return csv_line

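# --- illustrative sketch only: decoding the docstring example blob above ---
# Uses only the example spatialite point blob quoted in find_and_conv_spatialite_blob_to_wkb()'s
# docstring to show the slices and the resulting PostGIS-style EWKB hex; the helper name
# _demo_decode_ewkb_point is hypothetical and not part of the original module.
def _demo_decode_ewkb_point():
    import binascii
    import struct
    spatialite_blob_hex = (
        "0001E6100000"                      # header, endian, SRID 4326
        "DD30C0F46C2A594041432013008E2B40"  # MBR min x/y
        "DD30C0F46C2A594041432013008E2B40"  # MBR max x/y
        "7C01000000"                        # MBR_END + CLASS_TYPE (point)
        "DD30C0F46C2A594041432013008E2B40"  # POINT x/y
        "FE")                               # END
    endian = spatialite_blob_hex[2:4]       # "01" = little endian
    point = spatialite_blob_hex[86:118]     # X + Y doubles, 16 hex chars each
    ewkb_hex = endian + "01000020E6100000" + point  # point-with-SRID type tag used above for postgres
    x, y = struct.unpack("<dd", binascii.unhexlify(point))  # decode the two little-endian doubles
    print "ewkb:", ewkb_hex, "x:", x, "y:", y
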
def check_if_already_merged(args, log_hash):
    global g_unmerge_logs_row
    global g_cursor
    global g_exec_buf
    global g_is_ms, g_is_postgre

    if args["pg_schema"] != "public":
        g_cursor.execute("SET search_path = '{}','public';".format(args["pg_schema"]))

    try:
        print "checking if this log_hash has already been imported/merged: " + log_hash
        sqlstr = "select \"log_hash\" from \"logs\" where \"log_hash\" = ?"
        if g_is_postgre:
            sqlstr = sqlstr.replace("?", "%s")
        print("check log cmd: " + sqlstr)

        row = None
        # use with for auto rollback() on g_conn on exception - otherwise we can't use the cursor
        # again - it would fail as: current transaction is aborted, commands ignored until end of
        # transaction block
        ret = None
        with g_conn as c:
            ret = g_cursor.execute(sqlstr, [log_hash])
            row = g_cursor.fetchone()

        print("after cmd check if exists row:", row)

        if (row is None):
            # azm never imported
            if (args['unmerge']):
                # unmerge mode - this azm is not in target db
                raise Exception("ABORT: This azm is already not present in target db's 'logs' table")
            else:
                print "This log hasn't been imported into target db yet - ok to proceed"
                pass
            return True
        else:
            # azm already imported
            if (args['unmerge']):
                dprint("um0: row: " + str(row))
                if g_is_postgre or g_is_ms:
                    dprint("upg 0")
                    # row is a tuple - make it a dict
                    dprint("upg 01")
                    # now we only need 'log_hash' to unmerge and the used odbc can't parse geom either
                    # - cols = get_remote_columns(args, 'logs')
                    cols = [['log_hash', 'bigint']]
                    dprint("upg 1: cols: " + str(cols))
                    drow = {}
                    i = 0
                    for col in cols:
                        print row[i]
                        drow[col[0]] = row[i]
                        i = i + 1
                    row = drow
                dprint("um1")
                print "### unmerge mode - delete start for azm: log_hash {}".format(row['log_hash'])
                g_unmerge_logs_row = row
                sqlstr = "delete from \"logs\" where \"log_hash\" = '{}'".format(log_hash)
                g_exec_buf.append(sqlstr)
                print "delete from logs table added to g_exec_buf..."
            else:
                raise Exception(
                    "ABORT: This log ({}) has already been imported/exists in target db (use --unmerge to remove it first if you want to re-import)."
                    .format(log_hash))
    except Exception as e:
        estr = str(e)
        if ("Invalid object name 'logs'" in estr or '42S02' in estr
                or 'relation "logs" does not exist' in estr):
            print "looks like this is the first-time log import - no table named logs exists yet - ok..."
            if args['unmerge']:
                raise Exception("--unmerge mode called on an empty database: no related 'logs' table exists yet")
            # first time import - no table named logs exists yet
        else:
            type_, value_, traceback_ = sys.exc_info()
            exstr = str(traceback.format_exception(type_, value_, traceback_))
            print "re-raise exception e - ", exstr
            raise e

    return False

def get_remote_columns(args, table_name):
    global g_cursor
    global g_is_ms, g_is_postgre

    dprint("table_name: " + table_name)
    sqlstr = ""
    if g_is_ms:
        sqlstr = "sp_columns @table_name=\"{}\"".format(table_name)
    if g_is_postgre:
        sqlstr = "select * from \"{}\" where false".format(table_name)

    dprint("check table columns sqlstr: " + sqlstr)
    ret = g_cursor.execute(sqlstr)
    dprint("query execute ret: " + str(ret))
    rows = g_cursor.fetchall()

    '''
    Now get the remote column list for this table...
    '''
    remote_columns = []

    if g_is_postgre:
        colnames = [desc[0] for desc in g_cursor.description]
        for col in colnames:
            remote_columns.append([col, ""])
        return remote_columns

    if g_is_ms:
        # MS SQL
        for row in rows:
            '''
            MSSQL column str return example:
            row n: {0: u'azqdemo', 1: u'dbo', 2: u'android_metadata', 3: u'locale', 4: -1, 5: u'text',
            u'DATA_TYPE': -1, 7: 2147483647, 8: None, 9: None, 10: 1, 11: None, 12: None, 13: -1,
            14: None, 15: 2147483647, u'COLUMN_DEF': None, 17: u'YES', 18: 35, u'SCALE': None,
            u'TABLE_NAME': u'android_metadata', u'SQL_DATA_TYPE': -1, 6: 2147483647, u'NULLABLE': 1,
            u'REMARKS': None, u'CHAR_OCTET_LENGTH': 2147483647, u'COLUMN_NAME': u'locale',
            u'SQL_DATETIME_SUB': None, u'TABLE_OWNER': u'dbo', 16: 1, u'RADIX': None,
            u'SS_DATA_TYPE': 35, u'TYPE_NAME': u'text', u'PRECISION': 2147483647,
            u'IS_NULLABLE': u'YES', u'LENGTH': 2147483647, u'ORDINAL_POSITION': 1,
            u'TABLE_QUALIFIER': u'azqdemo'}

            Result:
            col_name: locale
            col_type: text
            '''
            rs = str(row)
            dprint("row n: " + rs)
            splitted = rs.split(", u")
            col_name = splitted[3].split("'")[1]
            dprint("col_name: " + col_name)
            col_type = splitted[4].split("'")[1]
            dprint("col_type: " + col_type)
            remote_columns.append([col_name, col_type])

    return remote_columns

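# --- illustrative sketch only: a less string-parsing-dependent way to read sp_columns rows ---
# Not what get_remote_columns() does above (it splits str(row)); this hypothetical helper
# assumes pyodbc Row attribute access by the sp_columns result column names COLUMN_NAME and
# TYPE_NAME, and it mirrors the same "sp_columns @table_name=..." call used above.
def _demo_ms_remote_columns(cursor, table_name):
    cursor.execute("sp_columns @table_name=\"{}\"".format(table_name))
    return [[row.COLUMN_NAME, row.TYPE_NAME] for row in cursor.fetchall()]
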