Example #1
def load(db_schema,
         table_name,
         load_postgis,
         json_table_schema,
         connection_string,
         rows,
         indexes_fields,
         batch_size=500):
    # Optionally enable PostGIS support before loading geometry data
    if load_postgis:
        load_postgis_support()

    # Parse the Carto connection string into credentials and prepare the target table
    creds = re.match(carto_connection_string_regex, connection_string).groups()
    table = get_table(table_name, json_table_schema)
    schema = jsontableschema.Schema(json_table_schema)

    # Start from an empty table
    truncate(creds, table_name)

    # Cast each row against the schema and insert in batches
    _buffer = []
    for row in rows:
        _buffer.append(type_fields(schema, row))
        if len(_buffer) >= batch_size:
            insert(creds, table, _buffer)
            _buffer = []

    # Flush any remaining rows
    if len(_buffer) > 0:
        insert(creds, table, _buffer)

    # Register the table with Carto, then add indexes and refresh planner statistics
    cartodbfytable(creds, db_schema, table_name)

    if indexes_fields:
        create_indexes(creds, table_name, indexes_fields)

    vacuum_analyze(creds, table_name)
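The load() snippet above calls a type_fields(schema, row) helper that is not shown. A minimal sketch of what it might look like, modelled on the inline variant in Example #4 below; the geojson pass-through and the InvalidObjectType import path are assumptions:

import json

from jsontableschema.exceptions import InvalidObjectType  # import path assumed


def type_fields(schema, row):
    # Cast each positional value to the type declared by the matching schema field
    typed_row = []
    for index, field in enumerate(schema.fields):
        value = row[index]
        if field.type != 'geojson':
            try:
                value = field.cast_value(value)
            except InvalidObjectType:
                # Values the cast rejects as objects are decoded from JSON instead
                value = json.loads(value)
        typed_row.append(value)
    return typed_row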
Example #2
    def write(self, bucket, rows):

        # Prepare
        BUFFER_SIZE = 1000
        descriptor = self.describe(bucket)
        schema = jsontableschema.Schema(descriptor)
        table = self.__get_table(bucket)

        # Write
        with self.__connection.begin():
            keyed_rows = []
            for row in rows:
                keyed_row = {}
                for index, field in enumerate(schema.fields):
                    value = row[index]
                    try:
                        value = field.cast_value(value)
                    except InvalidObjectType:
                        value = json.loads(value)
                    keyed_row[field.name] = value
                keyed_rows.append(keyed_row)
                if len(keyed_rows) > BUFFER_SIZE:
                    # Insert data
                    table.insert().execute(keyed_rows)
                    # Clean memory
                    keyed_rows = []
            if len(keyed_rows) > 0:
                # Insert data
                table.insert().execute(keyed_rows)
Example #3
def write(self, rows, keyed):
        # Prepare
        schema = jsontableschema.Schema(self.descriptor)

        # Write
        for row in rows:
            if not keyed:
                row = self.__convert_to_keyed(schema, row)

            keyed_row = row

            if self.__check_existing(keyed_row):
                for wr in self.__insert():
                    yield wr
                ret = self.__update(row)
                if ret is not None:
                    yield WrittenRow(keyed_row,
                                     True,
                                     ret if self.autoincrement else None)
                    continue

            self.__buffer.append(keyed_row)

            if len(self.__buffer) > BUFFER_SIZE:
                for wr in self.__insert():
                    yield wr

        for wr in self.__insert():
            yield wr
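The generator above yields WrittenRow objects, but the class itself is not shown. A minimal sketch, assuming a simple named tuple whose three fields match the constructor call above (the field names are guesses):

import collections

# Hypothetical record type: the written row, whether it was an update,
# and the autoincrement id returned by the update (or None)
WrittenRow = collections.namedtuple('WrittenRow', ['row', 'updated', 'updated_id'])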
Example #4
def copy_from(engine, table_name, table_schema, rows):
    schema = jsontableschema.Schema(table_schema)

    def type_fields(row):
        # Cast each value to its declared schema type; geojson values are passed
        # through as-is, and values that fail an object cast are JSON-decoded
        typed_row = []
        for index, field in enumerate(schema.fields):
            value = row[index]
            if field.type != 'geojson':
                try:
                    value = field.cast_value(value)
                except InvalidObjectType:
                    value = json.loads(value)
            typed_row.append(value)

        return typed_row

    def transform():
        # Emit one CSV-encoded row per call; an empty string signals end of input
        try:
            row = next(rows)
            typed_row = type_fields(row)
            with io.StringIO() as out:
                writer = csv.writer(out)
                writer.writerow(typed_row)
                return out.getvalue()
        except StopIteration:
            return ''

    transformed_rows = TransformStream(transform)

    conn = engine.raw_connection()
    with conn.cursor() as cur:
        copy = 'COPY {} FROM STDIN CSV'.format(table_name)
        cur.copy_expert(copy, transformed_rows)
        conn.commit()
    conn.close()
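copy_expert() expects a file-like object, so the snippet wraps the transform callable in a TransformStream. That class is not shown; a minimal sketch of such a wrapper, assuming transform() returns one CSV line per call and an empty string at end of input (the class name and read-based interface are inferred from how it is used above):

class TransformStream(object):
    # Hypothetical file-like wrapper: copy_expert() calls read(), and each call
    # to transform() yields one CSV-encoded line, or '' once the rows are exhausted
    def __init__(self, transform):
        self.transform = transform
        self._buffer = ''

    def read(self, size=-1):
        # Accumulate CSV lines until the requested size is available or input ends
        while size < 0 or len(self._buffer) < size:
            chunk = self.transform()
            if chunk == '':
                break
            self._buffer += chunk
        if size < 0:
            data, self._buffer = self._buffer, ''
        else:
            data, self._buffer = self._buffer[:size], self._buffer[size:]
        return data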
Example #5
def process_resource(spec, rows):
    schema = spec['schema']
    jts = jsontableschema.Schema(schema)
    field_names = list(map(lambda f: f['name'], schema['fields']))
    for row in rows:
        # Order the values by the schema's field order before casting
        flattened_row = [row.get(name) for name in field_names]
        try:
            flattened_row = jts.cast_row(flattened_row)
        except Exception:
            logging.error('Failed to cast row %r', flattened_row)
            raise
        row = dict(zip(field_names, flattened_row))
        yield row
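A short usage sketch for the processor above; the schema and rows are illustrative:

spec = {
    'schema': {
        'fields': [
            {'name': 'id', 'type': 'integer'},
            {'name': 'name', 'type': 'string'},
        ]
    }
}
rows = [{'id': '1', 'name': 'Alice'}, {'id': '2', 'name': 'Bob'}]

# Values are cast to their declared types ('1' -> 1) and yielded back as dicts
for typed_row in process_resource(spec, rows):
    print(typed_row)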
Example #6
def load(logger,
         db_schema,
         table_name,
         load_postgis,
         json_table_schema,
         connection_string,
         rows,
         indexes_fields,
         do_truncate,
         batch_size=500):
    if load_postgis:
        load_postgis_support()

    creds = re.match(carto_connection_string_regex, connection_string).groups()
    table = get_table(table_name, json_table_schema)
    schema = jsontableschema.Schema(json_table_schema)

    if do_truncate:
        truncate(logger, creds, table_name)

    _buffer = []
    num_rows_expected = 0
    total_num_rows_inserted = 0
    for row in rows:
        num_rows_expected += 1
        _buffer.append(type_fields(schema, row))
        buf_ln = len(_buffer)
        if buf_ln >= batch_size:
            num_rows_inserted = insert(logger, creds, table, _buffer)
            logger.info('{} - Inserted {} rows'.format(table_name,
                                                       num_rows_inserted))
            if buf_ln != num_rows_inserted:
                message = '{} - Number of rows inserted does not match expected - expected: {} actual: {}'.format(
                    table_name, buf_ln, num_rows_inserted)
                logger.error(message)
                raise Exception(message)
            total_num_rows_inserted += num_rows_inserted
            _buffer = []

    if len(_buffer) > 0:
        num_rows_inserted = insert(logger, creds, table, _buffer)
        logger.info('{} - Inserted {} rows'.format(table_name,
                                                   num_rows_inserted))
        total_num_rows_inserted += num_rows_inserted

    verify_count(logger, creds, table, num_rows_expected,
                 total_num_rows_inserted)

    cartodbfytable(logger, creds, db_schema, table_name)

    if indexes_fields:
        create_indexes(logger, creds, table_name, indexes_fields)

    vacuum_analyze(logger, creds, table_name)
Example #7
def _iter_from_tabulator(self, table, schema):
    model = None
    if schema is not None:
        model = jsontableschema.Schema(schema)
    for keyed_row in table.iter(keyed=True):
        if model is not None:
            for field in model.fields:
                value = keyed_row[field.name]
                try:
                    keyed_row[field.name] = field.cast_value(value)
                except JsonTableSchemaException as exception:
                    message = 'Cannot cast %r for <%s>' % (value, field.name)
                    six.raise_from(ValueError(message), exception)
        yield keyed_row
Example #8
def upsert(engine, db_schema, table_name, table_schema, rows):
    if 'primaryKey' not in table_schema:
        raise Exception('`primaryKey` required for upsert')

    schema = jsontableschema.Schema(table_schema)

    upsert_sql = get_upsert_sql(
        db_schema, table_name, table_schema['primaryKey'],
        list(map(lambda x: x['name'], table_schema['fields'])))

    conn = engine.raw_connection()
    with conn.cursor() as cur:
        try:
            for row in rows:
                typed_row = type_fields(schema, row)
                # Parameters are passed twice: once for the INSERT values and
                # once for the conflicting row's UPDATE SET clause
                cur.execute(upsert_sql, typed_row + typed_row)
            conn.commit()
        except:
            conn.rollback()
            raise
    conn.close()
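get_upsert_sql() is not shown. A minimal sketch of what it might build, assuming a PostgreSQL INSERT ... ON CONFLICT statement whose placeholders line up with the typed_row + typed_row parameter list above (the real project may generate different SQL):

def get_upsert_sql(db_schema, table_name, primary_key, field_names):
    # Hypothetical reconstruction based on how the statement is executed above
    if isinstance(primary_key, str):
        primary_key = [primary_key]
    columns = ', '.join('"{}"'.format(name) for name in field_names)
    values = ', '.join(['%s'] * len(field_names))
    conflict = ', '.join('"{}"'.format(name) for name in primary_key)
    updates = ', '.join('"{}" = %s'.format(name) for name in field_names)
    # Placeholders are consumed in order: first the INSERT values, then the SET values
    return ('INSERT INTO "{}"."{}" ({}) VALUES ({}) '
            'ON CONFLICT ({}) DO UPDATE SET {}').format(
                db_schema, table_name, columns, values, conflict, updates)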
Example #9
def copy_from(engine, table_name, table_schema, rows):
    schema = jsontableschema.Schema(table_schema)

    def transform():
        try:
            row = next(rows)
            typed_row = type_fields(schema, row)
            with io.StringIO() as out:
                writer = csv.writer(out)
                writer.writerow(typed_row)
                return out.getvalue()
        except StopIteration:
            return ''

    transformed_rows = TransformStream(transform)

    conn = engine.raw_connection()
    with conn.cursor() as cur:
        copy = 'COPY {} FROM STDIN CSV'.format(table_name)
        cur.copy_expert(copy, transformed_rows)
        conn.commit()
    conn.close()