def fast_postgresql_to_df(table, schema):
    """Load a PostgreSQL table (or selectable) into a pandas DataFrame.

    Streams the data through PostgreSQL's ``COPY ... TO STDOUT`` (via the
    raw psycopg2 cursor), which is far faster than row-by-row fetching.

    Args:
        table: SQLAlchemy ``Table`` or selectable, bound to an engine.
        schema: project schema object describing the expected columns
            (``schema.cols`` entries may be ``dt`` / ``big_dt`` datetime types).

    Returns:
        pandas.DataFrame with datetime columns parsed.
    """
    engine = table.bind
    conn = engine.raw_connection()
    with conn.cursor() as cur:
        with io.StringIO() as f:
            table_name = str(table)
            # COPY requires a non-table selectable to be parenthesized.
            if not isinstance(table, Table):
                table_name = '({})'.format(table_name)
            sql = "COPY {table_name} TO STDOUT WITH (FORMAT CSV, HEADER TRUE)".format(
                table_name=table_name)
            cur.copy_expert(sql, f)

            f.seek(0)
            # Parse the CSV buffer according to the project schema.
            csv_loader = CsvDataStore(schema, f, with_header=True)
            df = csv_loader.load()
            for col in schema.cols:
                if isinstance(col, dt):
                    # BUG FIX: pandas removed the `coerce` keyword argument;
                    # errors='coerce' turns unparseable values into NaT.
                    df[col.name] = pandas.to_datetime(
                        df[col.name], format="%Y-%m-%d %H:%M:%S",
                        errors='coerce')
                if isinstance(col, big_dt):
                    # big_dt values may fall outside pandas' Timestamp range,
                    # so parse them with datetime.strptime instead.
                    strptime = datetime.datetime.strptime
                    parse_func = (lambda x: strptime(x, "%Y-%m-%d %H:%M:%S"))
                    df[col.name] = df[col.name].map(parse_func,
                                                    na_action='ignore')
    return df
def fast_postgresql_to_df(table, schema):
    """Load a PostgreSQL table (or selectable) into a pandas DataFrame.

    Streams the data through PostgreSQL's ``COPY ... TO STDOUT`` (via the
    raw psycopg2 cursor), which is far faster than row-by-row fetching.

    Args:
        table: SQLAlchemy ``Table`` or selectable, bound to an engine.
        schema: project schema object describing the expected columns
            (``schema.cols`` entries may be ``dt`` / ``big_dt`` datetime types).

    Returns:
        pandas.DataFrame with datetime columns parsed.
    """
    engine = table.bind
    conn = engine.raw_connection()
    with conn.cursor() as cur:
        with io.StringIO() as f:
            table_name = str(table)
            # COPY requires a non-table selectable to be parenthesized.
            if not isinstance(table, Table):
                table_name = '({})'.format(table_name)
            sql = "COPY {table_name} TO STDOUT WITH (FORMAT CSV, HEADER TRUE)".format(
                table_name=table_name)
            cur.copy_expert(sql, f)

            f.seek(0)
            # Parse the CSV buffer according to the project schema.
            csv_loader = CsvDataStore(schema, f, with_header=True)
            df = csv_loader.load()
            for col in schema.cols:
                if isinstance(col, dt):
                    # BUG FIX: pandas removed the `coerce` keyword argument;
                    # errors='coerce' turns unparseable values into NaT.
                    df[col.name] = pandas.to_datetime(
                        df[col.name], format="%Y-%m-%d %H:%M:%S",
                        errors='coerce')
                if isinstance(col, big_dt):
                    # big_dt values may fall outside pandas' Timestamp range,
                    # so parse them with datetime.strptime instead.
                    strptime = datetime.datetime.strptime
                    parse_func = (lambda x: strptime(x, "%Y-%m-%d %H:%M:%S"))
                    df[col.name] = df[col.name].map(parse_func,
                                                    na_action='ignore')
    return df
def fast_mysql_to_df(table, schema):
    """Load a MySQL table (or selectable) into a pandas DataFrame.

    Exports the data with ``SELECT ... INTO OUTFILE`` to a temp CSV file,
    then parses that file — much faster than row-by-row fetching.

    Args:
        table: SQLAlchemy ``Table`` or selectable, bound to an engine.
        schema: project schema object with a ``col_names()`` method.

    Returns:
        pandas.DataFrame loaded from the exported CSV.
    """
    from chatto_transform.config import config

    # The temp file is closed (and thus deleted) immediately: MySQL's
    # INTO OUTFILE refuses to write to an existing file, so we only need
    # the unique path it reserves.
    f = tempfile.NamedTemporaryFile('w',
                                    suffix='.csv',
                                    dir=config.data_dir + 'tmp')
    try:
        f.close()
        if not isinstance(table, Table):
            compiled = table.compile()
            table_name = '({})'.format(str(compiled))
            # Bind parameters must be supplied positionally to execute().
            params = [compiled.params[k] for k in compiled.positiontup]
        else:
            table_name = str(table)
            params = []

        # Export to CSV on the server side.
        # BUG FIX: the OUTFILE path was a hard-coded literal while
        # `filename=f.name` went unused — restore the {filename} placeholder.
        sql = """SELECT {cols} FROM {table} AS t INTO OUTFILE '{filename}'
        FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '"'
        ESCAPED BY '\\\\'
        LINES TERMINATED BY '\n'""".format(cols=', '.join(
            '`' + colname + '`' for colname in schema.col_names()),
                                           filename=f.name,
                                           table=table_name)

        table.bind.execute(sql, *params)

        # Parse the exported CSV; MySQL writes NULL as \N.
        csv_loader = CsvDataStore(schema,
                                  f.name,
                                  with_header=False,
                                  na_values=['\\N'])
        df = csv_loader.load()
    finally:
        # The file may never have been created if the query failed early.
        try:
            os.remove(f.name)
        except FileNotFoundError:
            pass

    return df
def fast_mysql_to_df(table, schema):
    """Load a MySQL table (or selectable) into a pandas DataFrame.

    Exports the data with ``SELECT ... INTO OUTFILE`` to a temp CSV file,
    then parses that file — much faster than row-by-row fetching.

    Args:
        table: SQLAlchemy ``Table`` or selectable, bound to an engine.
        schema: project schema object with a ``col_names()`` method.

    Returns:
        pandas.DataFrame loaded from the exported CSV.
    """
    from chatto_transform.config import config

    # The temp file is closed (and thus deleted) immediately: MySQL's
    # INTO OUTFILE refuses to write to an existing file, so we only need
    # the unique path it reserves.
    f = tempfile.NamedTemporaryFile('w', suffix='.csv', dir=config.data_dir+'tmp')
    try:
        f.close()
        if not isinstance(table, Table):
            compiled = table.compile()
            table_name = '({})'.format(str(compiled))
            # Bind parameters must be supplied positionally to execute().
            params = [compiled.params[k] for k in compiled.positiontup]
        else:
            table_name = str(table)
            params = []

        # Export to CSV on the server side.
        # BUG FIX: the OUTFILE path was a hard-coded literal while
        # `filename=f.name` went unused — restore the {filename} placeholder.
        sql = """SELECT {cols} FROM {table} AS t INTO OUTFILE '{filename}'
        FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '"'
        ESCAPED BY '\\\\'
        LINES TERMINATED BY '\n'""".format(
            cols=', '.join('`'+colname+'`' for colname in schema.col_names()),
            filename=f.name,
            table=table_name)

        table.bind.execute(sql, *params)

        # Parse the exported CSV; MySQL writes NULL as \N.
        csv_loader = CsvDataStore(schema, f.name, with_header=False, na_values=['\\N'])
        df = csv_loader.load()
    finally:
        # The file may never have been created if the query failed early.
        try:
            os.remove(f.name)
        except FileNotFoundError:
            pass

    return df
# Beispiel #5 (0) — listing marker left over from the source the file was
# copied from; kept as a comment so the module remains valid Python.
def load_csv(file_path, schema):
    """Read the CSV at *file_path* into a DataFrame using *schema*."""
    return CsvDataStore(schema, file_path).load()
def load_csv(file_path, schema):
    """Read the CSV at *file_path* into a DataFrame using *schema*."""
    data_store = CsvDataStore(schema, file_path)
    return data_store.load()