def upload(self, with_lnglat=None, if_exists=FAIL, table_name=None, schema=None, context=None):
    """Upload the data held by this Dataset to CARTO and return ``self``.

    The data source is picked in priority order: DataFrame, then query. A
    GeoDataFrame currently only triggers a warning; the upload then proceeds
    with the DataFrame (or query) if one is set.

    Args:
        with_lnglat: Forwarded unchanged to ``_create_table`` and
            ``_copyfrom`` when uploading a DataFrame.
        if_exists: One of ``Dataset.FAIL``, ``Dataset.REPLACE`` or
            ``Dataset.APPEND``. The table is (re)created when the value is
            ``REPLACE`` or the table does not exist yet; ``FAIL`` raises if
            the table already exists; ``APPEND`` is rejected for query
            Datasets.
        table_name: Optional table name; normalized with ``normalize_name``
            and stored on the instance.
        schema: Optional schema; overrides the context's default schema.
        context: Optional context; stored on the instance together with its
            default schema.

    Returns:
        This Dataset instance.

    Raises:
        ValueError: If no table name/context is available, or there is
            nothing to upload.
        CartoException: If the table already exists and ``if_exists`` is
            ``FAIL``, or ``APPEND`` is used with a query Dataset.
    """
    if table_name:
        self.table_name = normalize_name(table_name)
    if context:
        self.cc = context
        self.schema = context.get_default_schema()
    if schema:
        self.schema = schema

    if self.table_name is None or self.cc is None:
        raise ValueError('You should provide a table_name and context to upload data.')

    if self.gdf is None and self.df is None and self.query is None:
        raise ValueError(
            'Nothing to upload. '
            'We need data in a DataFrame or GeoDataFrame or a query to upload data to CARTO.')

    def already_exists_error():
        # Built lazily: the message is only needed on the failure path.
        return CartoException(
            'Table with name {t} and schema {s} already exists in CARTO. '
            'Please choose a different `table_name` or use '
            'if_exists="replace" to overwrite it'.format(t=self.table_name, s=self.schema))

    # priority order: gdf, df, query
    if self.gdf is not None:
        warn('GeoDataFrame option is still under development. We will try the upload with DataFrame')
        # TODO: uncomment when we support GeoDataFrame
        # self.normalized_column_names = _normalize_column_names(self.gdf)

    if self.df is not None:
        self.normalized_column_names = _normalize_column_names(self.df)

        if if_exists == Dataset.REPLACE or not self.exists():
            self._create_table(with_lnglat)
        elif if_exists == Dataset.FAIL:
            raise already_exists_error()

        # Reached for freshly created tables and for appends to existing ones.
        self._copyfrom(with_lnglat)

    elif self.query is not None:
        if if_exists == Dataset.APPEND:
            raise CartoException(
                'Error using append with a query Dataset. '
                'It is not possible to append data to a query')
        elif if_exists == Dataset.REPLACE or not self.exists():
            self._create_table_from_query()
        elif if_exists == Dataset.FAIL:
            raise already_exists_error()

    return self
def send(self, relative_path, http_method, **requests_args):
    """Delegate to the parent client's ``send`` with a lower-cased HTTP method.

    Any exception raised while preparing or sending the request is re-raised
    as a ``CartoException`` wrapping the original exception.
    """
    try:
        parent_send = super(EtoolsCartoNoAuthClient, self).send
        verb = http_method.lower()
        response = parent_send(relative_path, verb, **requests_args)
    except Exception as err:
        raise CartoException(err)
    return response
def send(self, relative_path, http_method, **requests_args):
    """Delegate to the parent ``send`` using a lower-cased HTTP method.

    Every exception raised on the way is converted into a ``CartoException``
    that wraps the original exception.
    """
    try:
        parent_send = super().send
        verb = http_method.lower()
        response = parent_send(relative_path, verb, **requests_args)
    except Exception as err:  # pragma: no cover
        raise CartoException(err)
    return response
def get(self, method_name, mocker):
    """
    Registers on `mocker` the canned GET response stored for `method_name`

    :param method_name: The test method name, used as key into `self.requests`
    :param mocker: Object whose `get(url, text=...)` registers the response
    :return: Nothing; the response is only registered on `mocker`
    :raise: CartoException if `method_name` is unknown or registering fails
    """
    try:
        if method_name not in self.requests:
            raise CartoException('method_name not found: ' + method_name)
        canned = self.requests[method_name]
        mocker.get(canned['url'], text=canned['text'])
    except Exception as err:
        # Every failure, including the lookup miss above, is re-wrapped.
        raise CartoException(err)
def _create_table(self, with_lnglat=None): job = self.cc.batch_sql_client \ .create_and_wait_for_completion( '''BEGIN; {drop}; {create}; {cartodbfy}; COMMIT;''' .format(drop=self._drop_table_query(), create=self._create_table_query(with_lnglat), cartodbfy=self._cartodbfy_query())) if job['status'] != 'done': raise CartoException('Cannot create table: {}.'.format( job['failed_reason']))
def query_with_retries(self, query, offset, max_retries=5):
    """
    Query CartoDB, retrying when the request raises CartoException.

    :param query: SQL text passed to `self.sql_client.send`
    :param offset: Page offset, only used in the retry log message
    :param max_retries: Maximum number of attempts
    :return: The `rows` entry of the response
    :raise: CartoException if the response contains an `error` key, or if
        every attempt failed
    """
    for attempt in range(1, max_retries + 1):
        time.sleep(0.1)  # short pause before every attempt
        try:
            sites = self.sql_client.send(query)
        except CartoException:
            if attempt < max_retries:
                logger.warning(
                    'Retrying again table page at offset {}'.format(offset))
            # Without this the code below would read `sites` while it is
            # still unbound and crash instead of retrying.
            continue

        if 'error' in sites:
            raise CartoException('Invalid CartoDBTable')
        return sites['rows']

    raise CartoException('Cannot connect to CartoDB')
def _handle_import(self, import_job, table_name): """Handle state of import job""" if import_job['state'] == 'failure': if import_job['error_code'] == 8001: raise CartoException('Over CARTO account storage limit for ' 'user `{}`. Try subsetting your ' 'DataFrame or dropping columns to reduce ' 'the data size.'.format(self.username)) elif import_job['error_code'] == 6668: raise CartoException('Too many rows in DataFrame. Try ' 'subsetting DataFrame before writing to ' 'CARTO.') else: raise CartoException('Error code: `{}`. See CARTO Import ' 'API error documentation for more ' 'information: https://carto.com/docs/' 'carto-engine/import-api/import-errors' ''.format(import_job['error_code'])) elif import_job['state'] == 'complete': self._debug_print(final_table=import_job['table_name']) if import_job['table_name'] != table_name: try: res = self.sql_client.send(''' DROP TABLE IF EXISTS {orig_table}; ALTER TABLE {dupe_table} RENAME TO {orig_table}; '''.format( orig_table=table_name, dupe_table=import_job['table_name'])) self._debug_print(res=res) except Exception as err: self._debug_print(err=err) raise Exception('Cannot overwrite table `{table_name}` ' '({err}). DataFrame was written to ' '`{new_table}` instead.'.format( table_name=table_name, err=err, new_table=import_job['table_name'])) return table_name
def sync(self):
    """Run the whole location synchronisation inside one atomic transaction.

    Steps, in order: compute the remapping, handle obsolete locations, apply
    the remap, create/update locations, clean the upper level.

    Returns:
        The ``(new, updated, skipped, error)`` counters produced by
        ``create_or_update_locations``.

    Raises:
        CartoException: Re-raised (after logging) with the message of any
            ``CartoException`` raised during the run.
    """
    try:
        with transaction.atomic():
            remap, obsolete = get_remapping(self.sql_client, self.carto)
            self.handle_obsolete_locations(obsolete)
            self.apply_remap(remap)
            new, updated, skipped, error = self.create_or_update_locations()
            self.clean_upper_level()
            counters = (new, updated, skipped, error)
            return counters
    except CartoException as err:
        message = str(err)
        logger.error(message)
        raise CartoException(message)
def _send_dataframe(self, df, table_name, temp_dir, geom_col): """Send a DataFrame to CARTO to be imported as a SQL table Args: df (pandas.DataFrame): DataFrame that is will be sent to CARTO table_name (str): Name on CARTO for the table that will have the data from ``df`` temp_dir (str): Name of directory used for temporarily storing the DataFrame file to sent to CARTO geom_col (str): Name of geometry column Returns: final_table_name (str): Name of final table. This method will overwrite the table `table_name` if it already exists. """ def remove_tempfile(filepath): """removes temporary file""" os.remove(filepath) tempfile = '{temp_dir}/{table_name}.csv'.format(temp_dir=temp_dir, table_name=table_name) self._debug_print(tempfile=tempfile) df.drop(geom_col, axis=1, errors='ignore').to_csv(tempfile) with open(tempfile, 'rb') as f: res = self._auth_send('api/v1/imports', 'POST', files={'file': f}, params={'type_guessing': 'false'}, stream=True) self._debug_print(res=res) if not res['success']: remove_tempfile(tempfile) raise CartoException('Failed to send DataFrame') import_id = res['item_queue_id'] remove_tempfile(tempfile) final_table_name = table_name while True: import_job = self._check_import(import_id) self._debug_print(import_job=import_job) final_table_name = self._handle_import(import_job, table_name) if import_job['state'] == 'complete': break # Wait a second before doing another request time.sleep(1.0) return final_table_name
def get_cartodb_locations(self, cartodb_id_col='cartodb_id'):
    """
    Returns the rows of the CartoDB table referenced by `self.carto`.

    Rows are fetched in pages of 100 ids (by `cartodb_id_col` range) through
    `query_with_retries`. If the highest id is more than five times the row
    count, everything is requested in a single page instead. An empty table
    yields an empty list.

    :param cartodb_id_col: Name of the id column used for paging
    :return: List of row dicts (geometry as GeoJSON in `the_geom`, name,
        p-code and, when configured, parent code)
    :raise: CartoException if the row count / max id cannot be fetched
    """
    rows = []
    try:
        row_count = self.sql_client.send(
            f'select count(*) from {self.carto.table_name}'
        )['rows'][0]['count']
        max_id = self.sql_client.send(
            f'select MAX({cartodb_id_col}) from {self.carto.table_name}'
        )['rows'][0]['max']
    except CartoException:  # pragma: no-cover
        message = f"Cannot fetch pagination prerequisites from CartoDB for table {self.carto.table_name}"
        logger.exception(message)
        raise CartoException(message)

    # MAX() over an empty table comes back as None; comparing it below would
    # raise TypeError, and there is nothing to page through anyway.
    if max_id is None:
        return rows

    offset, limit = 0, 100

    # failsafe in the case when cartodb id's are too much off compared to the nr. of records
    if max_id > (5 * row_count):
        limit = max_id + 1
        logger.warning("The CartoDB primary key seems off, pagination is not possible")

    parent_qry = f', {self.carto.parent_code_col}' if self.carto.parent_code_col and self.carto.parent else ''
    base_qry = f'select st_AsGeoJSON(the_geom) as the_geom, {self.carto.name_col}, ' \
               f'{self.carto.pcode_col}{parent_qry} from {self.carto.table_name}'

    while offset <= max_id:
        logger.info(f'Requesting rows between {offset} and {offset + limit} for {self.carto.table_name}')
        paged_qry = base_qry + f' WHERE {cartodb_id_col} > {offset} AND {cartodb_id_col} <= {offset + limit}'
        time.sleep(0.1)  # do not spam Carto with requests
        new_rows = self.query_with_retries(paged_qry, offset)
        rows += new_rows
        offset += limit

    return rows
def data_augment(self, table_name, metadata):
    """Augment an existing CARTO table with `Data Observatory
    <https://carto.com/data-observatory>`__ measures. See the full `Data
    Observatory catalog
    <https://cartodb.github.io/bigmetadata/index.html>`__ for all available
    measures. The result of this operation is:

    1. It updates `table_name` by adding columns from the Data Observatory
    2. It returns a pandas DataFrame representation of that newly augmented
       table.

    Note:
        This method alters `table_name` in the user's CARTO database by
        adding additional columns. To avoid this, create a copy of the
        table first and use the new copy instead.

    Example:
        Add new measures to a CARTO table and pass it to a pandas
        DataFrame. Using the "Median Household Income in the past 12
        months" measure from the `Data Observatory Catalog
        <https://cartodb.github.io/bigmetadata/united_states/income.html#median-household-income-in-the-past-12-months>`__.
        ::

            import cartoframes
            cc = cartoframes.CartoContext(BASEURL, APIKEY)
            median_income = [{'numer_id': 'us.census.acs.B19013001',
                              'geom_id': 'us.census.tiger.block_group',
                              'numer_timespan': '2011 - 2015'}]
            df = cc.data_augment('transaction_events', median_income)

    Args:
        table_name (str): Name of table on CARTO account that Data
          Observatory measures are to be added to.
        metadata (list of dicts): List of all measures to add to
          `table_name`. Each `dict` has the following keys:

          - `numer_id` (str): The identifier for the desired measurement
          - `geom_id` (str, optional): Identifier for a desired
            geographic boundary level to use when calculating measures.
            Will be automatically assigned if undefined
          - `normalization` (str, optional): The desired normalization. One
            of 'area', 'prenormalized', or 'denominated'. 'Area' will
            normalize the measure per square kilometer, 'prenormalized'
            will return the original value, and 'denominated' will
            normalize by a denominator.
          - `denom_id` (str, optional): Measure ID from DO catalog
          - `numer_timespan` (str, optional): The desired timespan for the
            measurement. Defaults to most recent timespan available if
            left unspecified.
          - `geom_timespan` (str, optional): The desired timespan for the
            geometry. Defaults to timespan matching `numer_timespan` if
            left unspecified.
          - `target_area` (str, optional): Instead of aiming to have
            `target_geoms` in the area of the geometry passed as extent,
            fill this area. Unit is square degrees WGS84. Set this to `0`
            if you want to use the smallest source geometry for this
            element of metadata, for example if you're passing in points.
          - `target_geoms` (str, optional): Override global `target_geoms`
            for this element of metadata
          - `max_timespan_rank` (str, optional): Override global
            `max_timespan_rank` for this element of metadata
          - `max_score_rank` (str, optional): Override global
            `max_score_rank` for this element of metadata

    Returns:
        pandas.DataFrame: A DataFrame representation of `table_name` which
        has new columns for each measure in `metadata`.

    Raises:
        CartoException: If the bundled `obs_augment_table` SQL cannot be
          read or sent to the user account.
    """
    try:
        sql_path = os.path.join(os.path.dirname(__file__),
                                'assets/data_obs_augment.sql')
        with open(sql_path, 'r') as f:
            augment_functions = f.read()
        self.sql_client.send(augment_functions)
    except Exception as err:
        raise CartoException("Could not install `obs_augment_table` onto "
                             "user account ({})".format(err))

    # The JSON is embedded in a single-quoted SQL literal, so any apostrophe
    # inside it has to be doubled or it would terminate the literal early.
    cols_meta = json.dumps(metadata).replace("'", "''")

    # augment with data observatory metadata
    augment_query = (
        " select obs_augment_table('{username}.{tablename}', '{cols_meta}'); "
    ).format(username=self.username,
             tablename=table_name,
             cols_meta=cols_meta)
    self.sql_client.send(augment_query)

    # read full augmented table
    return self.read(table_name)
def _send_batches(self, df, table_name, temp_dir, geom_col):
    """Batch sending a dataframe

    The DataFrame is split into groups of at most ``MAX_IMPORT_ROWS`` rows,
    each group is uploaded to its own temporary table, and the temporary
    tables are then combined with ``UNION ALL`` into ``table_name``.

    Args:
        df (pandas.DataFrame): DataFrame that will be batched up for
          sending to CARTO
        table_name (str): Name of table to send DataFrame to
        temp_dir (str): Local directory for temporary storage of DataFrame
          written to file that will be sent to CARTO
        geom_col (str): Name of encoded geometry column (if any) that will
          be dropped or converted to `the_geom` column

    Returns:
        final_table_name (str): Final table name on CARTO that the
        DataFrame is stored in

    Exceptions:
        * CartoException: a chunk could not be uploaded (the temporary
          tables created so far are dropped first)
        * Exception: the temporary tables could not be combined
        * TODO: add more (Out of storage)
    """
    subtables = []
    # send dataframe chunks to carto
    chunk_ids = [i // MAX_IMPORT_ROWS for i in range(df.shape[0])]
    for chunk_num, chunk in tqdm(df.groupby(chunk_ids),
                                 desc='Uploading in batches: '):
        temp_table = '{orig}_cartoframes_temp_{chunk}'.format(
            orig=table_name[:40],
            chunk=chunk_num)
        try:
            # send dataframe chunk, get new name if collision
            temp_table = self._send_dataframe(chunk, temp_table, temp_dir,
                                              geom_col)
        except CartoException as err:
            self._drop_tables(subtables)
            raise CartoException(err)

        if temp_table:
            subtables.append(temp_table)
        self._debug_print(chunk_num=chunk_num,
                          chunk_shape=str(chunk.shape),
                          temp_table=temp_table)

    # combine chunks into final table
    try:
        select_base = ('SELECT %(schema)s '
                       'FROM "{table}"') % dict(schema=_df2pg_schema(df))
        unioned_tables = '\nUNION ALL\n'.join(
            [select_base.format(table=t) for t in subtables])
        self._debug_print(unioned=unioned_tables)
        # The table name is quoted in every statement so they all refer to
        # the same identifier.
        query = (
            ' DROP TABLE IF EXISTS "{table_name}"; '
            'CREATE TABLE "{table_name}" As {unioned_tables}; '
            'ALTER TABLE "{table_name}" DROP COLUMN IF EXISTS cartodb_id; '
            '{drop_tables} '
            "SELECT CDB_CartoDBFYTable('{org}', '{table_name}'); "
        ).format(table_name=table_name,
                 unioned_tables=unioned_tables,
                 org=self.username if self.is_org else 'public',
                 drop_tables=_drop_tables_query(subtables))
        self._debug_print(query=query)
        self.sql_client.send(query)
    except CartoException as err:
        # Best-effort cleanup. The handler below deliberately binds no name:
        # `except ... as err` would delete the outer `err` on exit and break
        # the final message.
        try:
            self._drop_tables(subtables)
        except CartoException:
            warn('Failed to drop the following subtables from CARTO '
                 'account: {}'.format(', '.join(subtables)))
        raise Exception('Failed to upload dataframe: {}'.format(err))

    return table_name
def to_carto(dataframe, table_name, credentials=None, if_exists='fail', geom_col=None, index=False, index_label=None,
             cartodbfy=True, log_enabled=True, retry_times=3, max_upload_size=MAX_UPLOAD_SIZE_BYTES,
             skip_quota_warning=False):
    """Upload a DataFrame to CARTO. The geometry's CRS must be WGS 84 (EPSG:4326) so you can use it on CARTO.

    Args:
        dataframe (pandas.DataFrame, geopandas.GeoDataFrame): data to be uploaded.
        table_name (str): name of the table to upload the data.
        credentials (:py:class:`Credentials <cartoframes.auth.Credentials>`, optional):
            instance of Credentials (username, api_key, etc).
        if_exists (str, optional): 'fail', 'replace', 'append'. Default is 'fail'.
        geom_col (str, optional): name of the geometry column of the dataframe.
        index (bool, optional): write the index in the table. Default is False.
        index_label (str, optional): name of the index column in the table. By default it
            uses the name of the index from the dataframe.
        cartodbfy (bool, optional): convert the table to CARTO format. Default True. More info
            `here <https://carto.com/developers/sql-api/guides/creating-tables/#create-tables>`.
        log_enabled (bool, optional): enable the logging mechanism. Default is True.
        retry_times (int, optional):
            Number of time to retry the upload in case it fails. Default is 3.
        max_upload_size (int, optional): defines the maximum size of the dataframe to be uploaded.
            Default is 2GB.
        skip_quota_warning (bool, optional): skip the quota exceeded check and force the upload.
            (The upload will still fail if the size of the dataset exceeds the remaining DB quota).
            Default is False.

    Returns:
        string: the table name normalized.

    Raises:
        ValueError: if the dataframe or table name provided are wrong or the if_exists param is not valid.
        CartoException: if the estimated dataset size exceeds the remaining DB quota.

    """
    if not isinstance(dataframe, DataFrame):
        raise ValueError('Wrong dataframe. You should provide a valid DataFrame instance.')

    if isinstance(dataframe, GeoDataFrame):
        if is_reprojection_needed(dataframe):
            dataframe = reproject(dataframe)

    if not is_valid_str(table_name):
        raise ValueError('Wrong table name. You should provide a valid table name.')

    if if_exists not in IF_EXISTS_OPTIONS:
        raise ValueError('Wrong option for the `if_exists` param. You should provide: {}.'.format(
            ', '.join(IF_EXISTS_OPTIONS)))

    context_manager = ContextManager(credentials)

    if not skip_quota_warning:
        me_data = context_manager.credentials.me_data
        if me_data is not None and me_data.get('user_data'):
            # Extrapolate the total size from the CSV size of a row sample.
            n = min(SAMPLE_ROWS_NUMBER, len(dataframe))
            if n > 0:
                estimated_byte_size = len(dataframe.sample(n=n).to_csv(header=False)) * len(dataframe) \
                    / n / CSV_TO_CARTO_RATIO
            else:
                # Empty DataFrame: nothing to sample, and dividing by n would fail.
                estimated_byte_size = 0
            remaining_byte_quota = me_data.get('user_data').get('remaining_byte_quota')

            if remaining_byte_quota is not None and estimated_byte_size > remaining_byte_quota:
                raise CartoException('DB Quota will be exceeded. '
                                     'The remaining quota is {} bytes and the dataset size is {} bytes.'.format(
                                         remaining_byte_quota, estimated_byte_size))

    gdf = GeoDataFrame(dataframe, copy=True)

    if index:
        index_name = index_label or gdf.index.name
        if index_name is not None and index_name != '':
            # Append the index as a column
            gdf[index_name] = gdf.index
        else:
            raise ValueError('Wrong index name. You should provide a valid index label.')

    if geom_col in gdf:
        set_geometry(gdf, geom_col, inplace=True, drop=True)
    elif has_geometry(dataframe):
        gdf.set_geometry(dataframe.geometry.name, inplace=True)

    if has_geometry(gdf):
        # NOTE(review): this reads the geometry name from `dataframe`, not `gdf`;
        # confirm that is intended when `dataframe` is a plain DataFrame with `geom_col` set.
        if GEOM_COLUMN_NAME in gdf and dataframe.geometry.name != GEOM_COLUMN_NAME:
            gdf.drop(columns=[GEOM_COLUMN_NAME], inplace=True)
        # Prepare geometry column for the upload
        gdf.rename_geometry(GEOM_COLUMN_NAME, inplace=True)
    elif isinstance(dataframe, GeoDataFrame):
        log.warning('Geometry column not found in the GeoDataFrame.')

    # Clamp both values to at least 1: an empty DataFrame would otherwise produce a
    # division by zero or a zero step in range().
    chunk_count = max(1, math.ceil(estimate_csv_size(gdf) / max_upload_size))
    chunk_row_size = max(1, int(math.ceil(len(gdf) / chunk_count)))
    chunked_gdf = [gdf[i:i + chunk_row_size] for i in range(0, gdf.shape[0], chunk_row_size)]
    if not chunked_gdf:
        # No rows: still hand the (empty) frame over once.
        # NOTE(review): assumes `copy_from` accepts an empty frame — confirm.
        chunked_gdf = [gdf]

    for i, chunk in enumerate(chunked_gdf):
        if i > 0:
            # Only the first chunk may create/replace the table; the rest extend it.
            if_exists = 'append'
        table_name = context_manager.copy_from(chunk, table_name, if_exists, cartodbfy, retry_times)

    if log_enabled:
        log.info('Success! Data uploaded to table "{}" correctly'.format(table_name))
    return table_name
def create_or_update_locations(self):
    """
    Create or update locations based on p-code (only active locations are considered)

    Rows without a name, p-code or geometry are skipped, as are rows whose
    parent p-code does not match exactly one active location.

    :return: Tuple `(new, updated, skipped, error)`; `error` is never
        incremented here and is always 0
    :raise: CartoException if several active locations share a p-code or a
        duplicate is created
    """
    logger.info('Create/Update new locations')
    rows = self.get_cartodb_locations()
    # Resolve the model once instead of on every row / except clause.
    location_model = get_location_model()
    new, updated, skipped, error = 0, 0, 0, 0

    for row in rows:
        pcode = row[self.carto.pcode_col]
        name = row[self.carto.name_col]
        geom = row['the_geom']

        if not all([name, pcode, geom]):
            skipped += 1
            logger.info(f"Skipping row pcode {pcode}")
            continue

        # Geometries whose GeoJSON mentions 'Point' go to the `point` field,
        # everything else to `geom`.
        geom_key = 'point' if 'Point' in geom else 'geom'
        default_dict = {
            'admin_level': self.carto.admin_level,
            'admin_level_name': self.carto.admin_level_name,
            'name': name,
            geom_key: geom,
        }

        parent_pcode = row[self.carto.parent_code_col] if self.carto.parent_code_col in row else None
        if parent_pcode:
            try:
                default_dict['parent'] = location_model.objects.get(p_code=parent_pcode, is_active=True)
            except (location_model.DoesNotExist, location_model.MultipleObjectsReturned):
                skipped += 1
                logger.info(f"Skipping row pcode {pcode}")
                continue

        try:
            location, created = location_model.objects.get_or_create(
                p_code=pcode, is_active=True, defaults=default_dict)
            if created:
                new += 1
            else:
                for attr, value in default_dict.items():
                    setattr(location, attr, value)
                location.save()
                updated += 1
        except location_model.MultipleObjectsReturned:
            message = f"Multiple locations found for: {self.carto.admin_level}, {name} ({pcode})"
            logger.exception(message)
            raise CartoException(message)
        except IntegrityError:
            message = f"Duplicate Creation {name} {pcode} {self.carto.location_type.name}"
            logger.exception(message)
            raise CartoException(message)

    return new, updated, skipped, error