def parse_code_macros(patches):
    # Parse file, look for PATCH_SYMBOL macros
    # This implementation leaves a bit to be desired - namely, it doesn't account for commented-out
    # code or #ifdef'd out code, and won't correctly mangle templates that have default parameters
    # print("Reading file %s" % sourcePath)
    regex = re.compile(r"^(.*)\((.*)\)(.*)$")
    pos = 0

    for patch in patches:
        # Replace spaces except ones denoting const parameters
        original = strip_spaces_except_const(patch[0])
        replacement = strip_spaces_except_const(patch[1])

        originalMatch = regex.search(original)
        replacementMatch = regex.search(replacement)

        if originalMatch is None or replacementMatch is None:
            raise IOError(f"Unable to parse patch: {original} -> {replacement}")
        origSymbol = "%s(%s)%s" % (originalMatch.group(1),
                                   originalMatch.group(2),
                                   originalMatch.group(3))
        patchSymbol = "%s(%s)%s" % (replacementMatch.group(1),
                                    replacementMatch.group(2),
                                    replacementMatch.group(3))
        # dolFile and dolPatches are module-level objects defined elsewhere
        # in the surrounding patching script.
        dolPatches.extend(
            dolFile.generate_patches(mangle(origSymbol), patchSymbol))
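
# Usage sketch (not part of the original example): the patch pairs below are
# hypothetical, and dolFile, dolPatches, mangle and strip_spaces_except_const
# are assumed to be provided by the surrounding module.
example_patches = [
    ("Game::update(float delta)", "ModGame::update(float delta)"),
    ("Player::getName(void) const", "ModPlayer::getName(void) const"),
]
parse_code_macros(example_patches)
# dolPatches now holds the generated patch entries for the mangled symbols.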
Example #2
def _make_data(totalrows, dict_reader, _tbl, tablename, dates, exit_on_error=False):
    # TODO Possible alternative to dropping rows
    # create an error table and append bad rows (with original data as all text cols)

    data = ''
    index = 0
    max_errors_per_row = 5

    logger.info(False, "totalrows %s" % totalrows)

    for row in dict_reader:
        index += 1
        outrow = []
        errors_in_row = 0
        for k in dict_reader.fieldnames:
            assert k in row
            try:
                _k = mangle(k)
                if _k in _tbl and 'type' in _tbl[_k]:
                    dt = _tbl[_k]['type']
                else:
                    dt = str
                validify_date_len(dates, k, _tbl)
                maybe_col_data = psqlencode(row[k], dt)
                outrow.append(maybe_col_data)
            except ValueError as e:
                errors_in_row += 1
                if errors_in_row > max_errors_per_row:
                    outrow = None
                    break
                _handle_error(e, k, _k, row, index, dt, tablename, exit_on_error)
                #append NULL
                outrow.append('')
            except Exception as e:
                errors_in_row += 1
                if errors_in_row > max_errors_per_row:
                    outrow = None
                    break
                _handle_error(e, k, _k, row, index, dt, tablename, exit_on_error)
                outrow.append('')
        #skip dead or poorly formatted rows
        if outrow:
            #tab
            data += "\t".join(outrow)
            #newline
            data += "\n"
        else:
            logger.error(False, "%s table has CSV ERROR: skipping row %s" % (tablename, str(index)))

        if index % 10000 == 0 and index != 0:
            logger.info(False, "\n%s table has progressed to the %s row.\n" % (tablename, str(index)))

            percent = ((index * 1.0) / totalrows) * 100
            logger.info(False, "\n%s %% complete for table %s.\n" % (str(percent), tablename))


    return data
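
# Usage sketch (not part of the original example), driving _make_data with an
# in-memory CSV.  csv.DictReader provides the .fieldnames attribute the
# function expects; mangle, validify_date_len, psqlencode and _handle_error
# are assumed module helpers, with mangle() passing these names through.
import csv
import io

_sample = io.StringIO("id\tname\n1\talice\n2\tbob\n")
_reader = csv.DictReader(_sample, delimiter="\t")
_types = {"id": {"type": int, "width": 4}, "name": {"type": str, "width": 32}}
_block = _make_data(2, _reader, _types, "people", dates=None)
# _block is tab-separated text suitable for a COPY ... FROM STDIN payload.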
Example #3
def _create_table(tablename, cascade, _tbl, f, default_to_null,
                  default_user, pkey, uniquekey, serial=None, timestamp=None):
    sql = ''
    sql += "DROP TABLE IF EXISTS %s" % tablename
    sql += "CASCADE;" if cascade else ";\n"

    sql += "CREATE TABLE %s (\n\t" % tablename
    cols = []
    for k in f.fieldnames:
        _k = mangle(k)
        if _k is None or len(_k) < 1:
            continue

        (dt, dw) = (_tbl[_k]['type'], _tbl[_k]['width'])

        if dt == str:
            if dw > 0 and dw <= 1024:
                sqldt = "VARCHAR(%d)" % (dw)
            else:
                sqldt = "TEXT"
        elif dt == int:
            if dw > 4:
                sqldt = "BIGINT"
            elif dw > 2:
                sqldt = "INTEGER"
            else:
                sqldt = "SMALLINT"
        elif dt == float:
            if dw > 4:
                sqldt = "DOUBLE PRECISION"
            else:
                sqldt = "REAL"
        else:
            sqldt = "TEXT"  # unlimited length

        if not default_to_null:
            sqldt += " NOT NULL"
        cols.append('%s %s' % (_psql_identifier(_k), sqldt))

    sql += ",\n\t".join(cols)
    sql += ");"
    if default_user is not None:
        sql += "ALTER TABLE %s OWNER TO %s;\n" % (tablename, default_user)
    # TODO: remove; this is essentially duplicated in joinKeys. Also, pkey never
    # appears to have been fleshed out; this is the only part that does anything,
    # and the copy step does nothing with pkey.
    if pkey is not None:
        sql += "ALTER TABLE %s ADD PRIMARY KEY (%s);\n" % (tablename, pkey)
    if uniquekey is not None:
        sql += "ALTER TABLE %s ADD UNIQUE (%s);\n" % (tablename, uniquekey)

    return sql
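
# Usage sketch (not part of the original example): build a reader over an
# in-memory CSV and a type map of the shape _sniffer produces; mangle() and
# _psql_identifier() are assumed to leave these column names unchanged.
import csv
import io

_reader = csv.DictReader(io.StringIO("id\tname\n1\talice\n"), delimiter="\t")
_types = {"id": {"type": int, "width": 4}, "name": {"type": str, "width": 32}}
_ddl = _create_table("people", False, _types, _reader, True, None, "id", None)
# _ddl comes out roughly as:
#   DROP TABLE IF EXISTS people;
#   CREATE TABLE people (
#       id INTEGER,
#       name VARCHAR(32));
#   ALTER TABLE people ADD PRIMARY KEY (id);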
Example #4
def _sniffer(f, maxsniff=-1, datatype={}, do_log=False):
    '''sniffs out data types'''
    _tbl = dict()
    if do_log:
        logger.info(True, "-- fieldnames: %s" % f.fieldnames)
        logger.info(True, "-- datatype: %s" % datatype)
    # initialize data types
    for k in f.fieldnames:
        _k = mangle(k)
        assert len(_k) > 0
        _tbl[_k] = {'type': str, 'width': _grow_varchar(None)}  # default data type
        if _k in datatype:
            dt = datatype[_k]
            if dt in ['int', 'int4', 'integer']:
                _tbl[_k] = {'type': int, 'width': 4}
            elif dt in ['smallint', 'short']:
                _tbl[_k] = {'type': int, 'width': 2}
            elif dt in ['float', 'double', 'float8']:
                _tbl[_k] = {'type': float, 'width': 8}
            elif dt in ['text', 'str']:
                _tbl[_k] = {'type': str, 'width': -1}
            elif dt in ['int8', 'bigint']:
                _tbl[_k] = {'type': int, 'width': 8}

    _need_sniff = False
    for k in f.fieldnames:
        if mangle(k) not in datatype:
            _need_sniff = True
            break

    # sniff out data types
    if maxsniff != 0 and _need_sniff:
        i = 0
        for row in f:
            i += 1
            if maxsniff > 0 and i > maxsniff:
                break

            # if _verbose: print >>sys.stderr, 'sniffing row', i, '...', row, _tbl

            # sniff each data field
            for k in f.fieldnames:
                _k = mangle(k)
                assert len(_k) > 0

                v = row[k]
                assert type(v) == str
                if len(v) == 0:
                    continue  # skip empty strings

                if _k in datatype:
                    continue  # skip already typed column

                (dt, dw) = (_tbl[_k]['type'], _tbl[_k]['width'])
                try:
                    # int(v) raises ValueError for non-integer text; a column
                    # already sniffed as float is never demoted back to int.
                    if (_isbool(v) or int(v) is not None) and not (dt == float):
                        _tbl[_k] = {'type': int, 'width': 4}
                except ValueError:
                    try:
                        if dt == int:  # revert to string
                            _tbl[_k] = {'type': str, 'width': _grow_varchar(v)}
                        if float(v) is not None:
                            _tbl[_k] = {'type': float, 'width': 8}
                    except ValueError:
                        if dt == float:
                            _tbl[_k] = {'type': str, 'width': _grow_varchar(v)}
                        if dt == str and dw < len(v):
                            _tbl[_k] = {'type': dt, 'width': _grow_varchar(v)}

    return _tbl
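
# Usage sketch (not part of the original example): sniff types from an
# in-memory CSV.  mangle, _grow_varchar and _isbool are assumed module
# helpers, and mangle() is assumed to pass these column names through.
import csv
import io

_rows = io.StringIO("id,price,label\n1,9.99,foo\n2,12.50,bar\n")
_sniffed = _sniffer(csv.DictReader(_rows), maxsniff=100)
# Expected shape:
#   _sniffed["id"]    -> {'type': int,   'width': 4}
#   _sniffed["price"] -> {'type': float, 'width': 8}
#   _sniffed["label"] -> {'type': str,   'width': <default/grown varchar width>}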
Example #5
def _psql_identifier(s):
    '''wraps any reserved word with double quote escapes'''
    k = mangle(s)
    if k.lower() in psql_reserved_words:
        return '"%s"' % (k)
    return k
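
# Usage sketch (not part of the original example), assuming "user" appears in
# psql_reserved_words and mangle() leaves both names unchanged.
print(_psql_identifier("user"))   # -> "user"  (reserved word, so quoted)
print(_psql_identifier("email"))  # -> email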
Example #6
def csv2psql(stream,
             tablename,
             analyze_table=True,
             cascade=False,
             create_table=True,
             datatype={},
             default_to_null=True,
             default_user=None,
             delimiter='\t',
             force_utf8=False,
             load_data=True,
             maxsniff=-1,
             pkey=None,
             quiet=True,
             schema=None,
             strip_prefix=False,
             truncate_table=False,
             uniquekey=None,
             database_name='',
             is_merge=False,
             joinkeys=None,
             dates=None,
             is_dump=False,
             make_primary_key_first=False,
             serial=None,
             timestamp=None,
             do_add_cols=False,
             is_std_in=True,
             result_prints_std_out=True,
             csv_filename=None,
             postgres_url=None,
             append_sql=False,
             new_table_name=None,
             skipp_stored_proc_modified_time=False,
             delete_temp_table=False,
             modified_timestamp=None):
    # maybe copy?
    _sql = ''
    _copy_sql = ''
    drop_temp_table_sql = ''
    _alter_sql = ''

    orig_tablename = tablename + ""
    skip = is_merge or is_dump

    logger.info(True, "-- skip: %s" % skip)

    if skip:
        tablename = "temp_" + tablename

    if schema is None and not skip:
        schema = os.getenv('CSV2PSQL_SCHEMA', 'public').strip()
        if schema == '':
            schema = None

    if default_user is None and not skip:
        default_user = os.getenv('CSV2PSQL_USER', '').strip()
        if default_user == '':
            default_user = None

    if not append_sql:
        # pass 1
        _tbl = {}

        # back_up stream / data
        data = ''
        if not skip or is_merge:
            data += get_stdin()

            f = dict_reader(data, delimiter)
            mangled_field_names = []
            for key in f.fieldnames:
                mangled_field_names.append(mangle(key))
            _tbl = _sniffer(f, maxsniff, datatype)

        # logger.info(True, "-- _tbl: %s" % _tbl)

        if default_user is not None and not skip:
            _sql += "SET ROLE %s;\n" % default_user

        obj = get_schema_sql(schema, tablename, strip_prefix, skip)
        _sql += obj.sql
        tablename = obj.tablename

        # add explicit client encoding
        if force_utf8:
            _sql += "\\encoding UTF8\n"

        if quiet and not skip:
            _sql += "SET client_min_messages TO ERROR;\n"

        if create_table and not skip:
            create_ctr = 0
            logger.info(True, "-- CREATING TABLE\n")

            _sql += _create_table(
                tablename, cascade, _tbl, f, default_to_null,
                default_user, pkey,
                uniquekey, serial, timestamp)
            create_ctr += 1
            logger.info(True, "-- CREATE COUNTER: %s" % create_ctr)

            _sql += sql_procedures.modified_time_procedure.procedure_str
            # _s1ql += sql_triggers.modified_time_trigger(tablename)

        if truncate_table and not load_data and not skip:
            _sql += "TRUNCATE TABLE %s;\n" % tablename

        # pass 2
        if load_data and not skip:
            total_rows = data.count("\n")
            reader = dict_reader(data, delimiter)
            if is_std_in:

                _copy_sql = out_as_copy_stdin(total_rows, reader, tablename, delimiter, _tbl, dates)
            else:
                _copy_sql = out_as_copy_csv(total_rows, reader, tablename, delimiter, _tbl, csv_filename,
                                            dates)

        if load_data and analyze_table and not skip:
            _sql += "ANALYZE %s;\n" % tablename

        # fix bad date ints or strings to the correct int format
        if dates is not None:
            for date_format, cols in dates.items():
                _alter_sql += sql_alters.dates(tablename, cols, date_format)

        # take cols and merge them into one primary_key
        join_keys_key_name = None
        if joinkeys is not None:
            (keys, key_name) = joinkeys
            join_keys_key_name = key_name

            _alter_sql += sql_alters.fast_delete_dupes(keys, key_name, tablename, True)
            # doing additional cols here as some types are not moved over correctly (with table copy in dupes)
            _alter_sql += additional_cols(tablename, serial, timestamp, mangled_field_names, is_merge,
                                          modified_timestamp)

            _alter_sql += sql_alters.make_primary_key_w_join(tablename, key_name, keys)

        if do_add_cols and joinkeys is None:
            _alter_sql = additional_cols(tablename, serial, timestamp, mangled_field_names, is_merge,
                                         modified_timestamp)

        primary_key = pkey if pkey is not None else join_keys_key_name
        if is_array(primary_key):
            primary_key = primary_key[0]

        # take temporary table and merge it into a real table
        if primary_key is not None and is_dump:
            if create_table and database_name:
                _alter_sql += sql_alters.pg_dump(database_name, schema, tablename, new_table_name)
                # TODO re-order the primary_key to first column

        if is_merge and primary_key is not None:
            logger.info(True, "-- mangled_field_names: %s" % mangled_field_names)
            logger.info(True, "-- make_primary_key_first %s" % make_primary_key_first)

            time_tablename = new_table_name if new_table_name else orig_tablename
            if not skipp_stored_proc_modified_time:
                _sql += sql_triggers.modified_time_trigger(time_tablename)

            _sql += sql_alters.merge(mangled_field_names, orig_tablename,
                                     primary_key, make_primary_key_first, tablename, new_table_name)

            if delete_temp_table:
                logger.info(True, "dropping temp table: %s" % tablename)
                drop_temp_table_sql = "DROP TABLE %s;" % tablename
                # logger.info(True, _sql)

    if append_sql:
        obj = get_schema_sql(schema, tablename, strip_prefix, skip)
        _sql += obj.sql
        _sql += get_stdin()

    if result_prints_std_out:
        c_sql = ''
        if _copy_sql:
            c_sql = _copy_sql.to_psql()

        logger.info(False, "PRIOR CHAIN ATTEMPT")
        logger.info(False, "c_sql: %s" % c_sql)
        logger.info(False, "_alter_sql: %s" % _alter_sql)
        logger.info(False, "drop_temp_table_sql: %s" % drop_temp_table_sql)

        chained = chain(_sql + c_sql + _alter_sql + drop_temp_table_sql)
        chained.pipe()
    else:
        assert postgres_url, "postgres_url undefined"
        # first send regular sql, if we have it
        chained = chain(_sql)
        chained.to_postgres(postgres_url)
        # send copied data
        if not append_sql and _copy_sql:
            chained = chain(_copy_sql.copy_statement)
            chained.to_postgres_copy(postgres_url, _copy_sql.data)
        if _alter_sql:
            chained.to_postgres(postgres_url, _alter_sql)
        if drop_temp_table_sql:
            chained.to_postgres(postgres_url, drop_temp_table_sql)
    return chained
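
# Usage sketch (not part of the original example); parameter values are
# illustrative.  The CSV itself is read from stdin via get_stdin(), and with
# result_prints_std_out=True the generated SQL is piped to stdout rather than
# sent to a postgres_url.
import sys

chained = csv2psql(sys.stdin, "people",
                   delimiter="\t",
                   pkey="id",
                   maxsniff=1000,
                   result_prints_std_out=True)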
Example #7
def main(argv=None):
    # import pydevd
    # pydevd.settrace('localhost', port=9797, stdoutToServer=True, stderrToServer=True, suspend=False)
    '''command-line interface'''
    tablename = None
    if argv is None:
        argv = sys.argv[1:]
        # print "argv: "
        # print argv
        # print "end argv: "
    try:
        # init default flags
        flags = dict()
        flags['maxsniff'] = 1000

        opts, args = \
            getopt.getopt(argv, "ak:s:q", ["help", "version", "schema=", "key=",
                                           "unique=", "cascade", "append", "utf8",
                                           "sniff=", "delimiter=", "datatype=",
                                           "role=", "is_merge=", "joinkeys=",
                                           "dates=", "tablename=", "databasename=",
                                           "is_dump=", "is_merge=", "primaryfirst=", "serial=",
                                           "timestamp=", "do_add_cols=", "analyze_table=",
                                           "now", "postgres_url=", "append_sql",
                                           "new_table_name=", "skipp_stored_proc_modified_time",
                                           "delete_temp_table", "modified_timestamp="])
        # print "opts: "
        # print opts
        # print "end opts"
        # print

        for o, a in opts:
            # print a
            if o in ("--version"):
                print(__version__)
                return 0
            elif o in ("--help"):
                _usage()
                return 0
            elif o in ("--cascade"):
                flags['cascade'] = True
            elif o in ("-a", "--append"):
                flags['create_table'] = False
                flags['truncate_table'] = False
                flags['load_data'] = True
                flags['maxsniff'] = 0
            elif o in ("-s", "--schema"):
                flags['schema'] = a
            elif o in ("--role"):
                flags['default_user'] = a
            elif o in ("--sniff"):
                flags['maxsniff'] = int(a)
            elif o in ("-k", "--key"):
                flags['pkey'] = a.split(':')
            elif o in ("--unique"):
                flags['uniquekey'] = a.split(':')
            elif o in ("--utf8"):
                flags['force_utf8'] = True
            elif o in ("--delimiter"):
                flags['delimiter'] = a
            elif o in ("--datatype"):
                if 'datatype' not in flags:
                    flags['datatype'] = dict()
                (k, v) = a.split(':')
                v = v.strip().lower()
                if v in _data_types:
                    for k in [mangle(_k) for _k in k.split(',')]:
                        flags['datatype'][k] = v
                else:
                    raise getopt.GetoptError('unknown data type %s (use %s)' % (v, _data_types))
            elif o in ("-q"):
                _verbose = False
            elif o in ("--is_merge"):
                flags['is_merge'] = True if a.lower() == 'true' else False
            elif o in ("--tablename"):
                tablename = a.lower()
            elif o in ("--joinkeys"):
                ( keys, key_name ) = a.lower().split(':')
                keys = keys.lower().split(',')
                flags['joinkeys'] = (keys, key_name)

            elif o in ("--dates"):
                (dates_commas, date_format) = a.split(':')
                dates = dates_commas.lower().split(',')
                if 'dates' not in flags:
                    flags['dates'] = dict()
                flags['dates'][date_format] = dates
            elif o in ("--databasename"):
                flags["database_name"] = a.lower()
            elif o in ("--is_dump"):
                flags["is_dump"] = True if a.lower() == 'true' else False
            elif o in ("--primaryfirst"):
                flags["make_primary_key_first"] = True if a.lower() == 'true' else False
            elif o in ("--serial"):
                flags["serial"] = a.lower()
            elif o in ("--timestamp"):
                flags["timestamp"] = a.lower()
            elif o in ("--do_add_cols"):
                flags["do_add_cols"] = True if a.lower() == 'true' else False
            elif o in ("--analyze_table"):
                flags["analyze_table"] = True if a.lower() == 'true' else False
            elif o in ("--now"):
                flags["result_prints_std_out"] = False  # inverse of now
            elif o in ("--postgres_url"):
                flags['postgres_url'] = a
            elif o in ("--append_sql"):
                flags['append_sql'] = True
            elif o in ("--new_table_name"):
                flags['new_table_name'] = a.lower()
            elif o in ("--skipp_stored_proc_modified_time"):
                flags['skipp_stored_proc_modified_time'] = True
            elif o in ("--delete_temp_table"):
                flags['delete_temp_table'] = True
            elif o in ("--modified_timestamp"):
                flags['modified_timestamp'] = a.lower()
            else:
                raise getopt.GetoptError('unknown option %s' % (o))

        print "-- flags: %s" % flags

        if not tablename:
            assert False, 'tablename is required via --tablename'

        if 'postgres_url' in flags:
            if 'result_prints_std_out' not in flags or (flags["result_prints_std_out"] and flags['postgres_url']):
                assert False, '--postgres_url required if --now is specified'

        print "-- tablename %s" % tablename
        tablename = mangle_table(tablename)
        print "-- mangled tablename %s" % tablename

        csv2psql(sys.stdin, tablename, **flags)
        return 0

    except getopt.GetoptError as err:
        print('ERROR:', str(err), "\n\n", file=sys.stderr)
        _usage()
        return -1
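
# Usage sketch (not part of the original example): driving main() with an
# argv list; option values are illustrative and the CSV itself is read from
# stdin by csv2psql().
rc = main(["--tablename=people", "--delimiter=,", "-k", "id", "--sniff=1000"])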
Example #8
def csv2psql(stream,
             tablename,
             analyze_table=True,
             cascade=False,
             create_table=True,
             datatype={},
             default_to_null=True,
             default_user=None,
             delimiter=',',
             force_utf8=False,
             load_data=True,
             maxsniff=-1,
             pkey=None,
             quiet=True,
             schema=None,
             strip_prefix=False,
             truncate_table=False,
             uniquekey=None,
             database_name='',
             is_merge=False,
             joinkeys=None,
             dates=None,
             is_dump=False,
             make_primary_key_first=False,
             serial=None,
             timestamp=None,
             do_add_cols=False,
             is_std_in=True,
             result_prints_std_out=True,
             csv_filename=None,
             postgres_url=None,
             append_sql=False,
             new_table_name=None,
             skipp_stored_proc_modified_time=False,
             delete_temp_table=False,
             modified_timestamp=None):
    # maybe copy?
    _sql = ''
    _copy_sql = ''
    drop_temp_table_sql = ''
    _alter_sql = ''

    orig_tablename = tablename + ""
    skip = is_merge or is_dump

    logger.info(True, "-- skip: %s" % skip)

    if skip:
        tablename = "temp_" + tablename

    if schema is None and not skip:
        schema = os.getenv('CSV2PSQL_SCHEMA', 'public').strip()
        if schema == '':
            schema = None

    if default_user is None and not skip:
        default_user = os.getenv('CSV2PSQL_USER', '').strip()
        if default_user == '':
            default_user = None

    if not append_sql:
        # pass 1
        _tbl = {}

        # back_up stream / data
        data = ''
        if not skip or is_merge:
            data += get_stdin()

            f = dict_reader(data, delimiter)
            mangled_field_names = []
            for key in f.fieldnames:
                mangled_field_names.append(mangle(key))
            _tbl = _sniffer(f, maxsniff, datatype)

        # logger.info(True, "-- _tbl: %s" % _tbl)

        if default_user is not None and not skip:
            _sql += "SET ROLE %s;\n" % default_user

        obj = get_schema_sql(schema, tablename, strip_prefix, skip)
        _sql += obj.sql
        tablename = obj.tablename

        # add explicit client encoding
        if force_utf8:
            _sql += "\\encoding UTF8\n"

        if quiet and not skip:
            _sql += "SET client_min_messages TO ERROR;\n"

        if create_table and not skip:
            create_ctr = 0
            logger.info(True, "-- CREATING TABLE\n")

            _sql += _create_table(
                tablename, cascade, _tbl, f, default_to_null,
                default_user, pkey,
                uniquekey, serial, timestamp)
            create_ctr += 1
            logger.info(True, "-- CREATE COUNTER: %s" % create_ctr)

            _sql += sql_procedures.modified_time_procedure.procedure_str
            # _s1ql += sql_triggers.modified_time_trigger(tablename)

        if truncate_table and not load_data and not skip:
            _sql += "TRUNCATE TABLE %s;\n" % tablename

        # pass 2
        if load_data and not skip:
            total_rows = data.count("\n")
            reader = dict_reader(data, delimiter)
            if is_std_in:

                _copy_sql = out_as_copy_stdin(total_rows, reader, tablename, delimiter, _tbl, dates)
            else:
                _copy_sql = out_as_copy_csv(total_rows, reader, tablename, delimiter, _tbl, csv_filename,
                                            dates)

        if load_data and analyze_table and not skip:
            _sql += "ANALYZE %s;\n" % tablename

        # fix bad date ints or strings to the correct int format
        if dates is not None:
            for date_format, cols in dates.items():
                _alter_sql += sql_alters.dates(tablename, cols, date_format)

        # take cols and merge them into one primary_key
        join_keys_key_name = None
        if joinkeys is not None:
            (keys, key_name) = joinkeys
            join_keys_key_name = key_name

            _alter_sql += sql_alters.fast_delete_dupes(keys, key_name, tablename, True)
            # doing additional cols here as some types are not moved over correctly (with table copy in dupes)
            _alter_sql += additional_cols(tablename, serial, timestamp, mangled_field_names, is_merge,
                                          modified_timestamp)

            _alter_sql += sql_alters.make_primary_key_w_join(tablename, key_name, keys)

        if do_add_cols and joinkeys is None:
            _alter_sql = additional_cols(tablename, serial, timestamp, mangled_field_names, is_merge,
                                         modified_timestamp)

        primary_key = pkey if pkey is not None else join_keys_key_name
        if is_array(primary_key):
            primary_key = primary_key[0]

        # take temporary table and merge it into a real table
        if primary_key is not None and is_dump:
            if create_table and database_name:
                _alter_sql += sql_alters.pg_dump(database_name, schema, tablename, new_table_name)
                # TODO re-order the primary_key to first column

        if is_merge and primary_key is not None:
            logger.info(True, "-- mangled_field_names: %s" % mangled_field_names)
            logger.info(True, "-- make_primary_key_first %s" % make_primary_key_first)

            time_tablename = new_table_name if new_table_name else orig_tablename
            if not skipp_stored_proc_modified_time:
                _sql += sql_triggers.modified_time_trigger(time_tablename)

            _sql += sql_alters.merge(mangled_field_names, orig_tablename,
                                     primary_key, make_primary_key_first, tablename, new_table_name)

            if delete_temp_table:
                logger.info(True, "dropping temp table: %s" % tablename)
                drop_temp_table_sql = "DROP TABLE %s;" % tablename
                # logger.info(True, _sql)

    if append_sql:
        obj = get_schema_sql(schema, tablename, strip_prefix, skip)
        _sql += obj.sql
        _sql += get_stdin()

    if result_prints_std_out:
        c_sql = ''
        if _copy_sql:
            c_sql = _copy_sql.to_psql()

        logger.info(False, "PRIOR CHAIN ATTEMPT")
        logger.info(False, "c_sql: %s" % c_sql)
        logger.info(False, "_alter_sql: %s" % _alter_sql)
        logger.info(False, "drop_temp_table_sql: %s" % drop_temp_table_sql)

        chained = chain(_sql + c_sql + _alter_sql + drop_temp_table_sql)
        chained.pipe()
    else:
        assert postgres_url, "postgres_url undefined"
        # first send regular sql, if we have it
        chained = chain(_sql)
        chained.to_postgres(postgres_url)
        # send copied data
        if not append_sql and _copy_sql:
            chained = chain(_copy_sql.copy_statement)
            chained.to_postgres_copy(postgres_url, _copy_sql.data)
        if _alter_sql:
            chained.to_postgres(postgres_url, _alter_sql)
        if drop_temp_table_sql:
            chained.to_postgres(postgres_url, drop_temp_table_sql)
    return chained
Example #9
def _make_data(totalrows,
               dict_reader,
               _tbl,
               tablename,
               dates,
               exit_on_error=False):
    # TODO Possible alternative to dropping rows
    # create an error table and append bad rows (with original data as all text cols)

    data = ''
    index = 0
    max_errors_per_row = 5

    logger.info(False, "totalrows %s" % totalrows)

    for row in dict_reader:
        index += 1
        outrow = []
        errors_in_row = 0
        for k in dict_reader.fieldnames:
            assert k in row
            try:
                _k = mangle(k)
                if _k in _tbl and 'type' in _tbl[_k]:
                    dt = _tbl[_k]['type']
                else:
                    dt = str
                validify_date_len(dates, k, _tbl)
                maybe_col_data = psqlencode(row[k], dt)
                outrow.append(maybe_col_data)
            except ValueError as e:
                errors_in_row += 1
                if errors_in_row > max_errors_per_row:
                    outrow = None
                    break
                _handle_error(e, k, _k, row, index, dt, tablename,
                              exit_on_error)
                #append NULL
                outrow.append('')
            except Exception as e:
                errors_in_row += 1
                if errors_in_row > max_errors_per_row:
                    outrow = None
                    break
                _handle_error(e, k, _k, row, index, dt, tablename,
                              exit_on_error)
                outrow.append('')
        #skip dead or poorly formatted rows
        if outrow:
            #tab
            data += "\t".join(outrow)
            #newline
            data += "\n"
        else:
            logger.error(
                False, "%s table has CSV ERROR: skipping row %s" %
                (tablename, str(index)))

        if index % 10000 == 0 and index != 0:
            logger.info(
                False, "\n%s table has progressed to the %s row.\n" %
                (tablename, str(index)))

            percent = ((index * 1.0) / totalrows) * 100
            logger.info(
                False,
                "\n%s %% complete for table %s.\n" % (str(percent), tablename))

    return data