def parse_code_macros(patches):
    # Parse file, look for PATCH_SYMBOL macros
    # This implementation leaves a bit to be desired - namely, it doesn't account for commented-out
    # code or #ifdef'd out code, and won't correctly mangle templates that have default parameters
    # print("Reading file %s" % sourcePath)
    regex = re.compile(r"^(.*)\((.*)\)(.*)$")
    pos = 0
    for patch in patches:
        # Replace spaces except ones denoting const parameters
        original = strip_spaces_except_const(patch[0])
        replacement = strip_spaces_except_const(patch[1])
        originalMatch = regex.search(original)
        replacementMatch = regex.search(replacement)
        if originalMatch is None or replacementMatch is None:
            raise IOError(f"Unable to parse patch: {original} -> {replacement}")
        origSymbol = "%s(%s)%s" % (originalMatch.group(1), originalMatch.group(2), originalMatch.group(3))
        patchSymbol = "%s(%s)%s" % (replacementMatch.group(1), replacementMatch.group(2), replacementMatch.group(3))
        dolPatches.extend(
            dolFile.generate_patches(mangle(origSymbol), patchSymbol))
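# Illustrative patch format (these signatures are made up, not from the original source):
# each entry in `patches` pairs an original function signature with its replacement, e.g.
#   ("void Foo::bar(int)", "void FooPatched::bar(int)")
# parse_code_macros() mangles the original symbol and appends the resulting patches to
# the module-level dolPatches list via dolFile.generate_patches().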
def _make_data(totalrows, dict_reader, _tbl, tablename, dates, exit_on_error=False):
    # TODO Possible alternative to dropping rows
    # create an error table and append bad rows (with original data as all text cols)
    data = ''
    index = 0
    max_errors_per_row = 5
    logger.info(False, "totalrows %s" % totalrows)
    for row in dict_reader:
        index += 1
        outrow = []
        errors_in_row = 0
        for k in dict_reader.fieldnames:
            assert k in row
            try:
                _k = mangle(k)
                if _k in _tbl and 'type' in _tbl[_k]:
                    dt = _tbl[_k]['type']
                else:
                    dt = str
                validify_date_len(dates, k, _tbl)
                maybe_col_data = psqlencode(row[k], dt)
                outrow.append(maybe_col_data)
            except ValueError as e:
                errors_in_row += 1
                if errors_in_row > max_errors_per_row:
                    outrow = None
                    break
                _handle_error(e, k, _k, row, index, dt, tablename, exit_on_error)
                # append NULL
                outrow.append('')
            except Exception as e:
                errors_in_row += 1
                if errors_in_row > max_errors_per_row:
                    outrow = None
                    break
                _handle_error(e, k, _k, row, index, dt, tablename, exit_on_error)
                outrow.append('')
        # skip dead or poorly formatted rows
        if outrow:
            # tab-separated columns, newline-terminated rows
            data += "\t".join(outrow)
            data += "\n"
        else:
            logger.error(False, "%s table has CSV ERROR: skipping row %s" % (tablename, str(index)))
        if index % 10000 == 0 and index != 0:
            logger.info(False, "\n%s table has progressed to the %s row.\n" % (tablename, str(index)))
            percent = ((index * 1.0) / totalrows) * 100
            logger.info(False, "\n%s %% complete for table %s.\n" % (str(percent), tablename))
    return data
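# Illustrative output shape (column values are made up): each accepted row becomes one
# tab-separated, newline-terminated line, suitable for feeding to COPY ... FROM STDIN:
#   "1\talice\t2020-01-01\n2\tbob\t2020-02-01\n"
# Rows that accumulate more than max_errors_per_row bad fields are dropped and logged.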
def _create_table(tablename, cascade, _tbl, f, default_to_null, default_user,
                  pkey, uniquekey, serial=None, timestamp=None):
    sql = ''
    sql += "DROP TABLE IF EXISTS %s" % tablename
    sql += " CASCADE;\n" if cascade else ";\n"
    sql += "CREATE TABLE %s (\n\t" % tablename
    cols = []
    for k in f.fieldnames:
        _k = mangle(k)
        if _k is None or len(_k) < 1:
            continue
        (dt, dw) = (_tbl[_k]['type'], _tbl[_k]['width'])
        if dt == str:
            if dw > 0 and dw <= 1024:
                sqldt = "VARCHAR(%d)" % (dw)
            else:
                sqldt = "TEXT"
        elif dt == int:
            if dw > 4:
                sqldt = "BIGINT"
            elif dw > 2:
                sqldt = "INTEGER"
            else:
                sqldt = "SMALLINT"
        elif dt == float:
            if dw > 4:
                sqldt = "DOUBLE PRECISION"
            else:
                sqldt = "REAL"
        else:
            sqldt = "TEXT"  # unlimited length
        if not default_to_null:
            sqldt += " NOT NULL"
        cols.append('%s %s' % (_psql_identifier(_k), sqldt))
    sql += ",\n\t".join(cols)
    sql += "\n);\n"
    if default_user is not None:
        sql += "ALTER TABLE %s OWNER TO %s;\n" % (tablename, default_user)
    # TODO remove as this is basically duplicated in joinKeys; also pkey looks to never have
    # been fleshed out -- this is the only part that does anything, the copy part does nothing on pkey
    if pkey is not None:
        sql += "ALTER TABLE %s ADD PRIMARY KEY (%s);\n" % (tablename, pkey)
    if uniquekey is not None:
        sql += "ALTER TABLE %s ADD UNIQUE (%s);\n" % (tablename, uniquekey)
    return sql
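# Rough sketch of the DDL this emits (table and column names are illustrative): for a
# sniffed table like {'id': {'type': int, 'width': 4}, 'name': {'type': str, 'width': 32}}
# with default_to_null=True, cascade=False and pkey='id', the generated SQL is roughly:
#   DROP TABLE IF EXISTS people;
#   CREATE TABLE people (
#       id INTEGER,
#       name VARCHAR(32)
#   );
#   ALTER TABLE people ADD PRIMARY KEY (id);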
def _sniffer(f, maxsniff=-1, datatype={}, do_log=False):
    '''sniffs out data types'''
    _tbl = dict()
    if do_log:
        logger.info(True, "-- fieldnames: %s" % f.fieldnames)
        logger.info(True, "-- datatype: %s" % datatype)
    # initialize data types
    for k in f.fieldnames:
        _k = mangle(k)
        assert len(_k) > 0
        _tbl[_k] = {'type': str, 'width': _grow_varchar(None)}  # default data type
        if _k in datatype:
            dt = datatype[_k]
            if dt in ['int', 'int4', 'integer']:
                _tbl[_k] = {'type': int, 'width': 4}
            elif dt in ['smallint', 'short']:
                _tbl[_k] = {'type': int, 'width': 2}
            elif dt in ['float', 'double', 'float8']:
                _tbl[_k] = {'type': float, 'width': 8}
            elif dt in ['text', 'str']:
                _tbl[_k] = {'type': str, 'width': -1}
            elif dt in ['int8', 'bigint']:
                _tbl[_k] = {'type': int, 'width': 8}
    _need_sniff = False
    for k in f.fieldnames:
        if mangle(k) not in datatype:
            _need_sniff = True
            break
    # sniff out data types
    if maxsniff != 0 and _need_sniff:
        i = 0
        for row in f:
            i += 1
            if maxsniff > 0 and i > maxsniff:
                break
            # if _verbose: print >>sys.stderr, 'sniffing row', i, '...', row, _tbl
            # sniff each data field
            for k in f.fieldnames:
                _k = mangle(k)
                assert len(_k) > 0
                v = row[k]
                assert type(v) == str
                if len(v) == 0:
                    continue  # skip empty strings
                if _k in datatype:
                    continue  # skip already typed column
                (dt, dw) = (_tbl[_k]['type'], _tbl[_k]['width'])
                try:
                    if (_isbool(v) or int(v) is not None) and not (dt == float):
                        _tbl[_k] = {'type': int, 'width': 4}
                except ValueError:
                    try:
                        if dt == int:
                            # revert to string
                            _tbl[_k] = {'type': str, 'width': _grow_varchar(v)}
                        if float(v) is not None:
                            _tbl[_k] = {'type': float, 'width': 8}
                    except ValueError:
                        if dt == float:
                            _tbl[_k] = {'type': str, 'width': _grow_varchar(v)}
                        if dt == str and dw < len(v):
                            _tbl[_k] = {'type': dt, 'width': _grow_varchar(v)}
    return _tbl
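# Rough usage sketch (the sample data is made up): given CSV rows where a column holds
# only integers, _sniffer promotes it from the default str type to int, while textual
# columns keep a str type with a grown VARCHAR width:
#   reader = dict_reader("id\tname\n1\talice\n2\tbob\n", '\t')
#   types = _sniffer(reader, maxsniff=100)
#   # types -> {'id': {'type': int, 'width': 4}, 'name': {'type': str, 'width': <grown width>}}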
def _psql_identifier(s):
    '''wraps any reserved word with double quote escapes'''
    k = mangle(s)
    if k.lower() in psql_reserved_words:
        return '"%s"' % (k)
    return k
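# For example (assuming "user" appears in psql_reserved_words and "email" does not):
#   _psql_identifier('user')  -> '"user"'
#   _psql_identifier('email') -> 'email'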
def csv2psql(stream, tablename, analyze_table=True, cascade=False, create_table=True,
             datatype={}, default_to_null=True, default_user=None, delimiter='\t',
             force_utf8=False, load_data=True, maxsniff=-1, pkey=None, quiet=True,
             schema=None, strip_prefix=False, truncate_table=False, uniquekey=None,
             database_name='', is_merge=False, joinkeys=None, dates=None, is_dump=False,
             make_primary_key_first=False, serial=None, timestamp=None, do_add_cols=False,
             is_std_in=True, result_prints_std_out=True, csv_filename=None,
             postgres_url=None, append_sql=False, new_table_name=None,
             skipp_stored_proc_modified_time=False, delete_temp_table=False,
             modified_timestamp=None):
    # maybe copy?
    _sql = ''
    _copy_sql = ''
    drop_temp_table_sql = ''
    _alter_sql = ''
    orig_tablename = tablename + ""
    skip = is_merge or is_dump
    logger.info(True, "-- skip: %s" % skip)
    if skip:
        tablename = "temp_" + tablename
    if schema is None and not skip:
        schema = os.getenv('CSV2PSQL_SCHEMA', 'public').strip()
        if schema == '':
            schema = None
    if default_user is None and not skip:
        default_user = os.getenv('CSV2PSQL_USER', '').strip()
        if default_user == '':
            default_user = None
    if not append_sql:
        # pass 1
        _tbl = {}
        # back up stream / data
        data = ''
        if not skip or is_merge:
            data += get_stdin()
        f = dict_reader(data, delimiter)
        mangled_field_names = []
        for key in f.fieldnames:
            mangled_field_names.append(mangle(key))
        _tbl = _sniffer(f, maxsniff, datatype)
        # logger.info(True, "-- _tbl: %s" % _tbl)
        if default_user is not None and not skip:
            _sql += "SET ROLE %s;\n" % default_user
        obj = get_schema_sql(schema, tablename, strip_prefix, skip)
        _sql += obj.sql
        tablename = obj.tablename
        # add explicit client encoding
        if force_utf8:
            _sql += "\\encoding UTF8\n"
        if quiet and not skip:
            _sql += "SET client_min_messages TO ERROR;\n"
        if create_table and not skip:
            create_ctr = 0
            logger.info(True, "-- CREATING TABLE\n")
            _sql += _create_table(tablename, cascade, _tbl, f, default_to_null,
                                  default_user, pkey, uniquekey, serial, timestamp)
            create_ctr += 1
            logger.info(True, "-- CREATE COUNTER: %s" % create_ctr)
            _sql += sql_procedures.modified_time_procedure.procedure_str
            # _sql += sql_triggers.modified_time_trigger(tablename)
        if truncate_table and not load_data and not skip:
            _sql += "TRUNCATE TABLE %s;\n" % tablename
        # pass 2
        if load_data and not skip:
            total_rows = data.count("\n")
            reader = dict_reader(data, delimiter)
            if is_std_in:
                _copy_sql = out_as_copy_stdin(total_rows, reader, tablename, delimiter, _tbl, dates)
            else:
                _copy_sql = out_as_copy_csv(total_rows, reader, tablename, delimiter, _tbl,
                                            csv_filename, dates)
        if load_data and analyze_table and not skip:
            _sql += "ANALYZE %s;\n" % tablename
        # fix bad date ints or strings to the correct int format
        if dates is not None:
            for date_format, cols in dates.iteritems():
                _alter_sql += sql_alters.dates(tablename, cols, date_format)
        # take cols and merge them into one primary_key
        join_keys_key_name = None
        if joinkeys is not None:
            (keys, key_name) = joinkeys
            join_keys_key_name = key_name
            _alter_sql += sql_alters.fast_delete_dupes(keys, key_name, tablename, True)
            # doing additional cols here as some types are not moved over correctly
            # (with table copy in dupes)
            _alter_sql += additional_cols(tablename, serial, timestamp, mangled_field_names,
                                          is_merge, modified_timestamp)
            _alter_sql += sql_alters.make_primary_key_w_join(tablename, key_name, keys)
        if do_add_cols and joinkeys is None:
            _alter_sql = additional_cols(tablename, serial, timestamp, mangled_field_names,
                                         is_merge, modified_timestamp)
        primary_key = pkey if pkey is not None else join_keys_key_name
        if is_array(primary_key):
            primary_key = primary_key[0]
        # take temporary table and merge it into a real table
        if primary_key is not None and is_dump:
            if create_table and database_name:
                _alter_sql += sql_alters.pg_dump(database_name, schema, tablename, new_table_name)
        # TODO re-order the primary_key to first column
        if is_merge and primary_key is not None:
            logger.info(True, "-- mangled_field_names: %s" % mangled_field_names)
            logger.info(True, "-- make_primary_key_first %s" % make_primary_key_first)
            time_tablename = new_table_name if new_table_name else orig_tablename
            if not skipp_stored_proc_modified_time:
                _sql += sql_triggers.modified_time_trigger(time_tablename)
            _sql += sql_alters.merge(mangled_field_names, orig_tablename, primary_key,
                                     make_primary_key_first, tablename, new_table_name)
        if delete_temp_table:
            logger.info(True, "dropping temp table: %s" % tablename)
            drop_temp_table_sql = "DROP TABLE %s;" % tablename
        # logger.info(True, _sql)
    if append_sql:
        obj = get_schema_sql(schema, tablename, strip_prefix, skip)
        _sql += obj.sql
        _sql += get_stdin()
    if result_prints_std_out:
        c_sql = ''
        if _copy_sql:
            c_sql = _copy_sql.to_psql()
        logger.info(False, "PRIOR CHAIN ATTEMPT")
        logger.info(False, "c_sql: %s" % c_sql)
        logger.info(False, "_alter_sql: %s" % _alter_sql)
        logger.info(False, "drop_temp_table_sql: %s" % drop_temp_table_sql)
        chained = chain(_sql + c_sql + _alter_sql + drop_temp_table_sql)
        chained.pipe()
    else:
        assert postgres_url, "postgres_url undefined"
        # first send regular sql, if we have it
        chained = chain(_sql)
        chained.to_postgres(postgres_url)
        # send copied data
        if not append_sql and _copy_sql:
            chained = chain(_copy_sql.copy_statement)
            chained.to_postgres_copy(postgres_url, _copy_sql.data)
        if _alter_sql:
            chained.to_postgres(postgres_url, _alter_sql)
        if drop_temp_table_sql:
            chained.to_postgres(postgres_url, drop_temp_table_sql)
    return chained
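# Minimal programmatic usage sketch (the table name and flag values are illustrative,
# mirroring what main() builds from the command line; CSV data is read from stdin):
#   chained = csv2psql(sys.stdin, 'people',
#                      schema='public',
#                      pkey=['id'],
#                      maxsniff=1000)
#   # With result_prints_std_out=True (the default) the assembled SQL is piped to stdout;
#   # with --now / postgres_url it is sent to Postgres via the chain helpers instead.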
def main(argv=None):
    # import pydevd
    # pydevd.settrace('localhost', port=9797, stdoutToServer=True, stderrToServer=True, suspend=False)
    '''command-line interface'''
    tablename = None
    if argv is None:
        argv = sys.argv[1:]
    # print "argv: "
    # print argv
    # print "end argv: "
    try:
        # init default flags
        flags = dict()
        flags['maxsniff'] = 1000
        opts, args = getopt.getopt(
            argv, "ak:s:q",
            ["help", "version", "schema=", "key=", "unique=", "cascade", "append", "utf8",
             "sniff=", "delimiter=", "datatype=", "role=", "is_merge=", "joinkeys=", "dates=",
             "tablename=", "databasename=", "is_dump=", "primaryfirst=", "serial=", "timestamp=",
             "do_add_cols=", "analyze_table=", "now", "postgres_url=", "append_sql",
             "new_table_name=", "skipp_stored_proc_modified_time", "delete_temp_table",
             "modified_timestamp="])
        # print "opts: "
        # print opts
        # print "end opts"
        for o, a in opts:
            # print a
            if o == "--version":
                print __version__
                return 0
            elif o == "--help":
                _usage()
                return 0
            elif o == "--cascade":
                flags['cascade'] = True
            elif o in ("-a", "--append"):
                flags['create_table'] = False
                flags['truncate_table'] = False
                flags['load_data'] = True
                flags['maxsniff'] = 0
            elif o in ("-s", "--schema"):
                flags['schema'] = a
            elif o == "--role":
                flags['default_user'] = a
            elif o == "--sniff":
                flags['maxsniff'] = int(a)
            elif o in ("-k", "--key"):
                flags['pkey'] = a.split(':')
            elif o == "--unique":
                flags['uniquekey'] = a.split(':')
            elif o == "--utf8":
                flags['force_utf8'] = True
            elif o == "--delimiter":
                flags['delimiter'] = a
            elif o == "--datatype":
                if 'datatype' not in flags:
                    flags['datatype'] = dict()
                (k, v) = a.split(':')
                v = v.strip().lower()
                if v in _data_types:
                    for k in [mangle(_k) for _k in k.split(',')]:
                        flags['datatype'][k] = v
                else:
                    raise getopt.GetoptError('unknown data type %s (use %s)' % (v, _data_types))
            elif o == "-q":
                _verbose = False
            elif o == "--is_merge":
                flags['is_merge'] = True if a.lower() == 'true' else False
            elif o == "--tablename":
                tablename = a.lower()
            elif o == "--joinkeys":
                (keys, key_name) = a.lower().split(':')
                keys = keys.lower().split(',')
                flags['joinkeys'] = (keys, key_name)
            elif o == "--dates":
                (dates_commas, date_format) = a.split(':')
                dates = dates_commas.lower().split(',')
                if 'dates' not in flags:
                    flags['dates'] = dict()
                flags['dates'][date_format] = dates
            elif o == "--databasename":
                flags["database_name"] = a.lower()
            elif o == "--is_dump":
                flags["is_dump"] = True if a.lower() == 'true' else False
            elif o == "--primaryfirst":
                flags["make_primary_key_first"] = True if a.lower() == 'true' else False
            elif o == "--serial":
                flags["serial"] = a.lower()
            elif o == "--timestamp":
                flags["timestamp"] = a.lower()
            elif o == "--do_add_cols":
                flags["do_add_cols"] = True if a.lower() == 'true' else False
            elif o == "--analyze_table":
                flags["analyze_table"] = True if a.lower() == 'true' else False
            elif o == "--now":
                flags["result_prints_std_out"] = False  # inverse of now
            elif o == "--postgres_url":
                flags['postgres_url'] = a
            elif o == "--append_sql":
                flags['append_sql'] = True
            elif o == "--new_table_name":
                flags['new_table_name'] = a.lower()
            elif o == "--skipp_stored_proc_modified_time":
                flags['skipp_stored_proc_modified_time'] = True
            elif o == "--delete_temp_table":
                flags['delete_temp_table'] = True
            elif o == "--modified_timestamp":
                flags['modified_timestamp'] = a.lower()
            else:
                raise getopt.GetoptError('unknown option %s' % (o))

        print "-- flags: %s" % flags
        if not tablename:
            assert False, 'tablename is required via --tablename'
        # --now sends results straight to Postgres instead of stdout, so it needs a URL
        if flags.has_key('result_prints_std_out') and not flags.has_key('postgres_url'):
            assert False, '--postgres_url required if --now is specified'
        print "-- tablename %s" % tablename
        tablename = mangle_table(tablename)
        print "-- mangled tablename %s" % tablename
        csv2psql(sys.stdin, tablename, **flags)
        return 0
    except getopt.GetoptError, err:
        print >> sys.stderr, 'ERROR:', str(err), "\n\n"
        _usage()
        return -1
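# Example invocation (illustrative; the entry-point script name is assumed, but the
# options correspond to the getopt spec above):
#   cat people.csv | python csv2psql.py --tablename people --schema public \
#       --delimiter , --key id --sniff 1000 > people.sql
# Adding --now --postgres_url <url> would execute the generated SQL directly instead
# of printing it to stdout.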