def load(db_schema, table_name, load_postgis, json_table_schema, connection_string,
         rows, indexes_fields, batch_size=500):
    if load_postgis:
        load_postgis_support()

    creds = re.match(carto_connection_string_regex, connection_string).groups()
    table = get_table(table_name, json_table_schema)
    schema = jsontableschema.Schema(json_table_schema)

    truncate(creds, table_name)

    # Insert rows in batches of `batch_size`, flushing any remainder at the end
    _buffer = []
    for row in rows:
        _buffer.append(type_fields(schema, row))
        if len(_buffer) >= batch_size:
            insert(creds, table, _buffer)
            _buffer = []
    if len(_buffer) > 0:
        insert(creds, table, _buffer)

    cartodbfytable(creds, db_schema, table_name)

    if indexes_fields:
        create_indexes(creds, table_name, indexes_fields)

    vacuum_analyze(creds, table_name)
def write(self, bucket, rows):
    # Prepare
    BUFFER_SIZE = 1000
    descriptor = self.describe(bucket)
    schema = jsontableschema.Schema(descriptor)
    table = self.__get_table(bucket)

    # Write
    with self.__connection.begin():
        keyed_rows = []
        for row in rows:
            keyed_row = {}
            for index, field in enumerate(schema.fields):
                value = row[index]
                try:
                    value = field.cast_value(value)
                except InvalidObjectType:
                    value = json.loads(value)
                keyed_row[field.name] = value
            keyed_rows.append(keyed_row)
            if len(keyed_rows) > BUFFER_SIZE:
                # Insert data
                table.insert().execute(keyed_rows)
                # Clean memory
                keyed_rows = []
        if len(keyed_rows) > 0:
            # Insert data
            table.insert().execute(keyed_rows)
def write(self, rows, keyed):
    # Prepare
    schema = jsontableschema.Schema(self.descriptor)

    # Write
    for row in rows:
        if not keyed:
            row = self.__convert_to_keyed(schema, row)
        keyed_row = row
        if self.__check_existing(keyed_row):
            # Row already exists: drain pending inserts, then try an update
            for wr in self.__insert():
                yield wr
            ret = self.__update(row)
            if ret is not None:
                yield WrittenRow(keyed_row, True, ret if self.autoincrement else None)
                continue
        self.__buffer.append(keyed_row)
        if len(self.__buffer) > BUFFER_SIZE:
            for wr in self.__insert():
                yield wr
    # Flush whatever is left in the buffer
    for wr in self.__insert():
        yield wr
def copy_from(engine, table_name, table_schema, rows):
    schema = jsontableschema.Schema(table_schema)

    def type_fields(row):
        typed_row = []
        for index, field in enumerate(schema.fields):
            value = row[index]
            # Leave geojson values as-is; cast everything else
            if field.type != 'geojson':
                try:
                    value = field.cast_value(value)
                except InvalidObjectType:
                    value = json.loads(value)
            typed_row.append(value)
        return typed_row

    def transform():
        # Pull one row at a time and serialize it as a CSV line for COPY
        try:
            row = next(rows)
            typed_row = type_fields(row)
            with io.StringIO() as out:
                writer = csv.writer(out)
                writer.writerow(typed_row)
                return out.getvalue()
        except StopIteration:
            return ''

    transformed_rows = TransformStream(transform)

    conn = engine.raw_connection()
    with conn.cursor() as cur:
        copy = 'COPY {} FROM STDIN CSV'.format(table_name)
        cur.copy_expert(copy, transformed_rows)
    conn.commit()
    conn.close()
def process_resource(spec, rows):
    schema = spec['schema']
    jts = jsontableschema.Schema(schema)
    field_names = list(map(lambda f: f['name'], schema['fields']))
    for row in rows:
        flattened_row = [row.get(name) for name in field_names]
        try:
            flattened_row = jts.cast_row(flattened_row)
        except Exception:
            logging.error('Failed to cast row %r', flattened_row)
            raise
        row = dict(zip(field_names, flattened_row))
        yield row
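A minimal driver for process_resource above, using only the jsontableschema.Schema behaviour already shown; the spec and sample rows here are invented illustration data, not part of any real pipeline.

# Hypothetical sample data for illustration only
spec = {
    'schema': {
        'fields': [
            {'name': 'id', 'type': 'integer'},
            {'name': 'name', 'type': 'string'},
        ]
    }
}
sample_rows = [{'id': '1', 'name': 'alpha'}, {'id': '2', 'name': 'beta'}]

# Values arrive as strings and should come back cast per the schema,
# e.g. {'id': 1, 'name': 'alpha'}
for casted_row in process_resource(spec, sample_rows):
    print(casted_row)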
def load(logger, db_schema, table_name, load_postgis, json_table_schema, connection_string,
         rows, indexes_fields, do_truncate, batch_size=500):
    if load_postgis:
        load_postgis_support()

    creds = re.match(carto_connection_string_regex, connection_string).groups()
    table = get_table(table_name, json_table_schema)
    schema = jsontableschema.Schema(json_table_schema)

    if do_truncate:
        truncate(logger, creds, table_name)

    _buffer = []
    num_rows_expected = 0
    total_num_rows_inserted = 0
    for row in rows:
        num_rows_expected += 1
        _buffer.append(type_fields(schema, row))
        buf_ln = len(_buffer)
        if buf_ln >= batch_size:
            num_rows_inserted = insert(logger, creds, table, _buffer)
            logger.info('{} - Inserted {} rows'.format(table_name, num_rows_inserted))
            if buf_ln != num_rows_inserted:
                message = '{} - Number of rows inserted does not match expected - expected: {} actual: {}'.format(
                    table_name, buf_ln, num_rows_inserted)
                logger.error(message)
                raise Exception(message)
            total_num_rows_inserted += num_rows_inserted
            _buffer = []
    if len(_buffer) > 0:
        # Flush the final partial batch and count it toward the total so
        # verify_count sees every inserted row
        total_num_rows_inserted += insert(logger, creds, table, _buffer)

    verify_count(logger, creds, table, num_rows_expected, total_num_rows_inserted)
    cartodbfytable(logger, creds, db_schema, table_name)

    if indexes_fields:
        create_indexes(logger, creds, table_name, indexes_fields)

    vacuum_analyze(logger, creds, table_name)
def _iter_from_tabulator(self, table, schema):
    model = None
    if schema is not None:
        model = jsontableschema.Schema(schema)
    for keyed_row in table.iter(keyed=True):
        if model is not None:
            for field in model.fields:
                value = keyed_row[field.name]
                try:
                    keyed_row[field.name] = field.cast_value(value)
                except JsonTableSchemaException as exception:
                    message = 'Cannot cast %r for <%s>' % (value, field.name)
                    six.raise_from(ValueError(message), exception)
        yield keyed_row
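The per-field cast loop shared by the examples above boils down to the following standalone sketch; the descriptor and row are made-up sample values, shown only to illustrate Schema and cast_value.

import jsontableschema

# Hypothetical descriptor and row for illustration only
descriptor = {'fields': [{'name': 'id', 'type': 'integer'},
                         {'name': 'created', 'type': 'date'}]}
schema = jsontableschema.Schema(descriptor)

row = ['42', '2017-01-31']
typed_row = [field.cast_value(value)
             for field, value in zip(schema.fields, row)]
# typed_row should come back as [42, datetime.date(2017, 1, 31)]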
def upsert(engine, db_schema, table_name, table_schema, rows):
    if 'primaryKey' not in table_schema:
        raise Exception('`primaryKey` required for upsert')

    schema = jsontableschema.Schema(table_schema)
    upsert_sql = get_upsert_sql(
        db_schema,
        table_name,
        table_schema['primaryKey'],
        list(map(lambda x: x['name'], table_schema['fields'])))

    conn = engine.raw_connection()
    with conn.cursor() as cur:
        try:
            for row in rows:
                typed_row = type_fields(schema, row)
                # The values are passed twice: once for the INSERT and once for the UPDATE SET clause
                cur.execute(upsert_sql, typed_row + typed_row)
            conn.commit()
        except:
            conn.rollback()
            raise
    conn.close()
def copy_from(engine, table_name, table_schema, rows):
    schema = jsontableschema.Schema(table_schema)

    def transform():
        try:
            row = next(rows)
            typed_row = type_fields(schema, row)
            with io.StringIO() as out:
                writer = csv.writer(out)
                writer.writerow(typed_row)
                return out.getvalue()
        except StopIteration:
            return ''

    transformed_rows = TransformStream(transform)

    conn = engine.raw_connection()
    with conn.cursor() as cur:
        copy = 'COPY {} FROM STDIN CSV'.format(table_name)
        cur.copy_expert(copy, transformed_rows)
    conn.commit()
    conn.close()
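A hypothetical invocation of copy_from above; the connection string, table name, and schema are placeholders, and type_fields/TransformStream are assumed to come from the same module as the example.

from sqlalchemy import create_engine

engine = create_engine('postgresql://user:password@localhost/mydb')  # placeholder DSN
table_schema = {'fields': [{'name': 'id', 'type': 'integer'},
                           {'name': 'name', 'type': 'string'}]}
rows = iter([['1', 'alpha'], ['2', 'beta']])  # invented sample rows

copy_from(engine, 'my_table', table_schema, rows)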