Ejemplo n.º 1
0
    def upload(self,
               with_lnglat=None,
               if_exists=FAIL,
               table_name=None,
               schema=None,
               context=None):
        """Upload the data held by this dataset to CARTO.

        The source is chosen in priority order: the DataFrame (``self.df``)
        is used when present, otherwise the query (``self.query``). A
        GeoDataFrame (``self.gdf``) currently only triggers a warning.

        Args:
            with_lnglat: forwarded untouched to ``_create_table`` and
                ``_copyfrom``.
            if_exists: compared against ``Dataset.FAIL``, ``Dataset.REPLACE``
                and ``Dataset.APPEND`` to decide what happens when the table
                already exists.
            table_name: when given, it is normalized with ``normalize_name``
                and stored on the instance.
            schema: when given, overrides the schema stored on the instance
                (including the default one taken from ``context``).
            context: when given, stored as ``self.cc``; its default schema
                becomes ``self.schema``.

        Returns:
            The dataset itself, so calls can be chained.

        Raises:
            ValueError: if no table name or context is available, or if there
                is no DataFrame, GeoDataFrame or query to upload.
            CartoException: if the table already exists and ``if_exists`` is
                ``Dataset.FAIL``, or if ``Dataset.APPEND`` is used with a
                query.
        """
        if table_name:
            self.table_name = normalize_name(table_name)
        if context:
            self.cc = context
            self.schema = context.get_default_schema()
        if schema:
            self.schema = schema

        if self.table_name is None or self.cc is None:
            raise ValueError(
                'You should provide a table_name and context to upload data.')

        if self.gdf is None and self.df is None and self.query is None:
            raise ValueError(
                'Nothing to upload. '
                'We need data in a DataFrame or GeoDataFrame or a query to upload data to CARTO.'
            )

        # Built up front because both the DataFrame and the query branch may
        # raise it. The trailing spaces keep the concatenated sentences apart.
        already_exists_error = CartoException(
            'Table with name {t} and schema {s} already exists in CARTO. '
            'Please choose a different `table_name` or use '
            'if_exists="replace" to overwrite it'.format(t=self.table_name,
                                                         s=self.schema))

        # priority order: gdf, df, query
        if self.gdf is not None:
            warn(
                'GeoDataFrame option is still under development. We will try the upload with DataFrame'
            )
            # TODO: uncomment when we support GeoDataFrame
            # self.normalized_column_names = _normalize_column_names(self.gdf)

        if self.df is not None:
            self.normalized_column_names = _normalize_column_names(self.df)

            if if_exists == Dataset.REPLACE or not self.exists():
                self._create_table(with_lnglat)
            elif if_exists == Dataset.FAIL:
                raise already_exists_error

            # Reached for a fresh/replaced table and for append on an
            # existing one.
            self._copyfrom(with_lnglat)

        elif self.query is not None:
            if if_exists == Dataset.APPEND:
                raise CartoException(
                    'Error using append with a query Dataset. '
                    'It is not possible to append data to a query')
            elif if_exists == Dataset.REPLACE or not self.exists():
                self._create_table_from_query()
            elif if_exists == Dataset.FAIL:
                raise already_exists_error

        return self
Ejemplo n.º 2
0
 def send(self, relative_path, http_method, **requests_args):
     """Delegate to the parent client's ``send`` with a lower-cased method.

     Any exception raised while doing so is re-raised wrapped in a
     ``CartoException``.
     """
     try:
         parent_send = super(EtoolsCartoNoAuthClient, self).send
         verb = http_method.lower()
         return parent_send(relative_path, verb, **requests_args)
     except Exception as err:
         raise CartoException(err)
Ejemplo n.º 3
0
 def send(self, relative_path, http_method, **requests_args):
     """Delegate to the parent ``send`` using the lower-cased HTTP method.

     Every exception raised on the way is converted into a
     ``CartoException`` carrying the original error.
     """
     try:
         parent_send = super().send
         verb = http_method.lower()
         return parent_send(relative_path, verb, **requests_args)
     except Exception as err:  # pragma: no cover
         raise CartoException(err)
Ejemplo n.º 4
0
    def get(self, method_name, mocker):
        """
        Registers on `mocker` the GET request stored for `method_name`

        The entry is looked up in `self.requests`; its `url` and `text`
        values are handed to `mocker.get`.

        :param method_name: The test method name
        :param mocker: Object whose `get(url, text=...)` registers the mock

        :return:
            None
        :raise:
            CartoException if `method_name` is unknown or anything else
            fails while registering the mock
        """
        try:
            if method_name not in self.requests:
                raise CartoException('method_name not found: ' + method_name)
            entry = self.requests[method_name]
            mocker.get(entry['url'], text=entry['text'])
        except Exception as err:
            # Note: this also re-wraps the "not found" error raised above.
            raise CartoException(err)
Ejemplo n.º 5
0
    def _create_table(self, with_lnglat=None):
        """Run the drop / create / cartodbfy statements as one batch SQL job.

        The three statements come from ``_drop_table_query``,
        ``_create_table_query(with_lnglat)`` and ``_cartodbfy_query`` and are
        wrapped in a single ``BEGIN; ... COMMIT;`` block.

        Raises:
            CartoException: if the finished job's status is not ``'done'``;
                the message carries the job's ``failed_reason``.
        """
        run_job = self.cc.batch_sql_client.create_and_wait_for_completion
        statements = '''BEGIN; {drop}; {create}; {cartodbfy}; COMMIT;'''.format(
            drop=self._drop_table_query(),
            create=self._create_table_query(with_lnglat),
            cartodbfy=self._cartodbfy_query())
        job = run_job(statements)

        if job['status'] == 'done':
            return

        reason = job['failed_reason']
        raise CartoException('Cannot create table: {}.'.format(reason))
Ejemplo n.º 6
0
    def query_with_retries(self, query, offset, max_retries=5):
        """
        Query CartoDB, retrying when the SQL client raises CartoException

        :param query: SQL statement handed to ``self.sql_client.send``
        :param offset: page offset, only used in the retry log message
        :param max_retries: maximum number of attempts

        :return: the ``rows`` entry of the response
        :raise CartoException: if the response contains an ``error`` key, or
            if every attempt failed
        """
        for attempt in range(1, max_retries + 1):
            time.sleep(0.1)  # short pause before every attempt
            try:
                sites = self.sql_client.send(query)
            except CartoException:
                if attempt < max_retries:
                    logger.warning(
                        'Retrying again table page at offset {}'.format(
                            offset))
                # The request failed, so there is no response to inspect:
                # move on to the next attempt instead of reading `sites`.
                continue

            if 'error' in sites:
                raise CartoException('Invalid CartoDBTable')
            return sites['rows']
        raise CartoException('Cannot connect to CartoDB')
Ejemplo n.º 7
0
    def _handle_import(self, import_job, table_name):
        """Act on the current state of an import job.

        A job in the ``failure`` state is turned into a ``CartoException``
        whose message depends on the job's ``error_code``. A ``complete`` job
        whose table name differs from `table_name` is renamed onto
        `table_name`, dropping any table that already uses that name. Any
        other state is left untouched.

        Args:
            import_job (dict): job description; ``state`` is always read,
                ``error_code`` and ``table_name`` depending on the state.
            table_name (str): table name the caller asked for.

        Returns:
            str: `table_name`, unchanged.

        Raises:
            CartoException: if the job failed.
            Exception: if the imported table could not be renamed.
        """
        state = import_job['state']

        if state == 'failure':
            error_code = import_job['error_code']
            if error_code == 8001:
                raise CartoException('Over CARTO account storage limit for '
                                     'user `{}`. Try subsetting your '
                                     'DataFrame or dropping columns to reduce '
                                     'the data size.'.format(self.username))
            if error_code == 6668:
                raise CartoException('Too many rows in DataFrame. Try '
                                     'subsetting DataFrame before writing to '
                                     'CARTO.')
            raise CartoException('Error code: `{}`. See CARTO Import '
                                 'API error documentation for more '
                                 'information: https://carto.com/docs/'
                                 'carto-engine/import-api/import-errors'
                                 ''.format(error_code))

        if state != 'complete':
            return table_name

        imported_table = import_job['table_name']
        self._debug_print(final_table=imported_table)
        if imported_table == table_name:
            return table_name

        # The import landed in a differently named table: move it onto the
        # requested name.
        try:
            rename_query = '''
                        DROP TABLE IF EXISTS {orig_table};
                        ALTER TABLE {dupe_table}
                        RENAME TO {orig_table};
                        '''.format(
                            orig_table=table_name,
                            dupe_table=imported_table)
            res = self.sql_client.send(rename_query)
            self._debug_print(res=res)
        except Exception as err:
            self._debug_print(err=err)
            raise Exception('Cannot overwrite table `{table_name}` '
                            '({err}). DataFrame was written to '
                            '`{new_table}` instead.'.format(
                                table_name=table_name,
                                err=err,
                                new_table=imported_table))
        return table_name
Ejemplo n.º 8
0
    def sync(self):
        """
        Run the whole location synchronisation inside one DB transaction

        Steps, in order: compute the remapping, handle obsolete locations,
        apply the remap, create/update locations and clean the upper level.

        :return: tuple ``(new, updated, skipped, error)`` as produced by
            ``create_or_update_locations``
        :raise CartoException: a CartoException raised by any step is logged
            and re-raised with the same message
        """
        try:
            with transaction.atomic():
                remap, obsolete = get_remapping(self.sql_client, self.carto)
                self.handle_obsolete_locations(obsolete)
                self.apply_remap(remap)
                created, changed, ignored, failed = \
                    self.create_or_update_locations()
                self.clean_upper_level()
                return created, changed, ignored, failed

        except CartoException as exc:
            message = str(exc)
            logger.error(message)
            raise CartoException(message)
Ejemplo n.º 9
0
    def _send_dataframe(self, df, table_name, temp_dir, geom_col):
        """Send a DataFrame to CARTO to be imported as a SQL table

        The DataFrame (minus `geom_col`, if present) is written to a CSV file
        in `temp_dir`, posted to the imports endpoint, and the import job is
        then polled once per second until it completes.

        Args:
            df (pandas.DataFrame): DataFrame that will be sent to CARTO
            table_name (str): Name on CARTO for the table that will have the
                data from ``df``
            temp_dir (str): Name of directory used for temporarily storing the
                DataFrame file to send to CARTO
            geom_col (str): Name of geometry column

        Returns:
            final_table_name (str): Name of final table, as reported by
            ``_handle_import``.

        Raises:
            CartoException: if the import request is not accepted, or if
                ``_handle_import`` reports a failed job.
        """
        csv_path = '{temp_dir}/{table_name}.csv'.format(temp_dir=temp_dir,
                                                        table_name=table_name)
        self._debug_print(tempfile=csv_path)

        try:
            df.drop(geom_col, axis=1, errors='ignore').to_csv(csv_path)
            with open(csv_path, 'rb') as f:
                res = self._auth_send('api/v1/imports', 'POST',
                                      files={'file': f},
                                      params={'type_guessing': 'false'},
                                      stream=True)
        finally:
            # Runs once the file handle is closed, and also when writing or
            # sending fails, so the temporary file is never left behind.
            if os.path.exists(csv_path):
                os.remove(csv_path)
        self._debug_print(res=res)

        if not res['success']:
            raise CartoException('Failed to send DataFrame')
        import_id = res['item_queue_id']

        final_table_name = table_name
        while True:
            import_job = self._check_import(import_id)
            self._debug_print(import_job=import_job)
            # Raises on a failed job, so the loop cannot spin forever on it.
            final_table_name = self._handle_import(import_job, table_name)
            if import_job['state'] == 'complete':
                break
            # Wait a second before doing another request
            time.sleep(1.0)

        return final_table_name
Ejemplo n.º 10
0
    def get_cartodb_locations(self, cartodb_id_col='cartodb_id'):
        """
        Return the location rows referenced by the CartoDB table

        The table is read page by page over ranges of `cartodb_id_col`; each
        page is fetched through ``query_with_retries``.

        :param cartodb_id_col: name of the id column used for pagination

        :return: list with the rows of every page (empty for an empty table)
        :raise CartoException: if the row count or the maximum id cannot be
            fetched
        """
        rows = []
        try:
            row_count = self.sql_client.send(
                f'select count(*) from {self.carto.table_name}'
            )['rows'][0]['count']
            max_id = self.sql_client.send(
                f'select MAX({cartodb_id_col}) from {self.carto.table_name}'
            )['rows'][0]['max']
        except CartoException:  # pragma: no-cover
            message = f"Cannot fetch pagination prerequisites from CartoDB for table {self.carto.table_name}"
            logger.exception(message)
            raise CartoException(message)

        # MAX() over an empty table yields no value; there is nothing to page
        # through and the comparisons below would fail on None.
        if max_id is None:
            return rows

        offset, limit = 0, 100

        # failsafe in the case when cartodb id's are too much off compared to the nr. of records
        if max_id > (5 * row_count):
            limit = max_id + 1
            logger.warning(
                "The CartoDB primary key seems off, pagination is not possible"
            )

        parent_qry = f', {self.carto.parent_code_col}' if self.carto.parent_code_col and self.carto.parent else ''
        base_qry = f'select st_AsGeoJSON(the_geom) as the_geom, {self.carto.name_col}, ' \
                   f'{self.carto.pcode_col}{parent_qry} from {self.carto.table_name}'

        while offset <= max_id:
            logger.info(
                f'Requesting rows between {offset} and {offset + limit} for {self.carto.table_name}'
            )
            # Each page covers the half-open id range (offset, offset + limit].
            paged_qry = base_qry + f' WHERE {cartodb_id_col} > {offset} AND {cartodb_id_col} <= {offset + limit}'
            time.sleep(0.1)  # do not spam Carto with requests
            new_rows = self.query_with_retries(paged_qry, offset)
            rows += new_rows
            offset += limit

        return rows
Ejemplo n.º 11
0
    def data_augment(self, table_name, metadata):
        """Augment an existing CARTO table with `Data Observatory
        <https://carto.com/data-observatory>`__ measures. See the full `Data
        Observatory catalog
        <https://cartodb.github.io/bigmetadata/index.html>`__ for all available
        measures. The result of this operation is:

        1. It updates `table_name` by adding columns from the Data Observatory
        2. It returns a pandas DataFrame representation of that newly augmented
           table.

        Note:
            This method alters `table_name` in the user's CARTO database by
            adding additional columns. To avoid this, create a copy of the
            table first and use the new copy instead.

        Example:
            Add new measures to a CARTO table and pass it to a pandas DataFrame.
            Using the "Median Household Income in the past 12 months" measure
            from the `Data Observatory Catalog
            <https://cartodb.github.io/bigmetadata/united_states/income.html#median-household-income-in-the-past-12-months>`__.
            ::

                import cartoframes
                cc = cartoframes.CartoContext(BASEURL, APIKEY)
                median_income = [{'numer_id': 'us.census.acs.B19013001',
                                  'geom_id': 'us.census.tiger.block_group',
                                  'numer_timespan': '2011 - 2015'}]
                df = cc.data_augment('transaction_events',
                                     median_income)

        Args:
            table_name (str): Name of table on CARTO account that Data
                Observatory measures are to be added to.
            metadata (list of dicts): List of all measures to add to
                `table_name`. Each `dict` has the following keys:

                - `numer_id` (str): The identifier for the desired measurement
                - `geom_id` (str, optional): Identifier for a desired
                  geographic boundary level to use when calculating measures.
                  Will be automatically assigned if undefined
                - `normalization` (str, optional): The desired normalization. One
                  of 'area', 'prenormalized', or 'denominated'. 'Area' will
                  normalize the measure per square kilometer, 'prenormalized'
                  will return the original value, and 'denominated' will
                  normalize by a denominator.
                - `denom_id` (str, optional): Measure ID from DO catalog
                - `numer_timespan` (str, optional): The desired timespan for the
                  measurement. Defaults to most recent timespan available if
                  left unspecified.
                - `geom_timespan` (str, optional): The desired timespan for the
                  geometry. Defaults to timespan matching `numer_timespan` if
                  left unspecified.
                - `target_area` (str, optional): Instead of aiming to have
                  `target_geoms` in the area of the geometry passed as extent,
                  fill this area. Unit is square degrees WGS84. Set this to
                  `0` if you want to use the smallest source geometry for this
                  element of metadata, for example if you're passing in points.
                - `target_geoms` (str, optional): Override global `target_geoms`
                  for this element of metadata
                - `max_timespan_rank` (str, optional): Override global
                  `max_timespan_rank` for this element of metadata
                - `max_score_rank` (str, optional): Override global
                  `max_score_rank` for this element of metadata

        Returns:
            pandas.DataFrame: A DataFrame representation of `table_name` which
            has new columns for each measure in `metadata`.

        Raises:
            CartoException: If the `obs_augment_table` helper functions could
                not be read from the bundled SQL file or sent to the account.
        """

        try:
            with open(os.path.join(os.path.dirname(__file__),
                                   'assets/data_obs_augment.sql'), 'r') as f:
                augment_functions = f.read()
            self.sql_client.send(augment_functions)
        except Exception as err:
            raise CartoException("Could not install `obs_augment_table` onto "
                                 "user account ({})".format(err))

        # The JSON document is embedded in a single-quoted SQL literal, so any
        # single quote inside it has to be doubled to keep the literal intact.
        cols_meta = json.dumps(metadata).replace("'", "''")

        # augment with data observatory metadata
        augment_query = '''
            select obs_augment_table('{username}.{tablename}',
                                     '{cols_meta}');
        '''.format(username=self.username,
                   tablename=table_name,
                   cols_meta=cols_meta)
        self.sql_client.send(augment_query)

        # read full augmented table
        return self.read(table_name)
Ejemplo n.º 12
0
    def _send_batches(self, df, table_name, temp_dir, geom_col):
        """Batch sending a dataframe

        The DataFrame is split into groups of `MAX_IMPORT_ROWS` rows, each
        group is sent to its own temporary table, and the temporary tables
        are then unioned into `table_name` and dropped.

        Args:
            df (pandas.DataFrame): DataFrame that will be batched up for
                sending to CARTO
            table_name (str): Name of table to send DataFrame to
            temp_dir (str): Local directory for temporary storage of DataFrame
                written to file that will be sent to CARTO
            geom_col (str): Name of encoded geometry column (if any) that will
                be dropped or converted to `the_geom` column

        Returns:
            final_table_name (str): Final table name on CARTO that the
            DataFrame is stored in

        Raises:
            CartoException: if sending one of the chunks fails; the temporary
                tables created so far are dropped first.
            Exception: if combining the temporary tables fails.
        """
        subtables = []
        # send dataframe chunks to carto
        for chunk_num, chunk in tqdm(df.groupby([i // MAX_IMPORT_ROWS
                                                 for i in range(df.shape[0])]),
                                     desc='Uploading in batches: '):
            temp_table = '{orig}_cartoframes_temp_{chunk}'.format(
                orig=table_name[:40],
                chunk=chunk_num)
            try:
                # send dataframe chunk, get new name if collision
                temp_table = self._send_dataframe(chunk, temp_table,
                                                  temp_dir, geom_col)
            except CartoException as err:
                self._drop_tables(subtables)
                raise CartoException(err)

            if temp_table:
                subtables.append(temp_table)
            self._debug_print(chunk_num=chunk_num,
                              chunk_shape=str(chunk.shape),
                              temp_table=temp_table)

        # combine chunks into final table
        try:
            select_base = ('SELECT %(schema)s '
                           'FROM "{table}"') % dict(schema=_df2pg_schema(df))
            unioned_tables = '\nUNION ALL\n'.join([select_base.format(table=t)
                                                   for t in subtables])
            self._debug_print(unioned=unioned_tables)
            # The table name is quoted in every statement so that all of them
            # refer to the same identifier.
            query = '''
                DROP TABLE IF EXISTS "{table_name}";
                CREATE TABLE "{table_name}" As {unioned_tables};
                ALTER TABLE "{table_name}" DROP COLUMN IF EXISTS cartodb_id;
                {drop_tables}
                SELECT CDB_CartoDBFYTable('{org}', '{table_name}');
                '''.format(table_name=table_name,
                           unioned_tables=unioned_tables,
                           org=self.username if self.is_org else 'public',
                           drop_tables=_drop_tables_query(subtables))
            self._debug_print(query=query)
            self.sql_client.send(query)
        except CartoException as err:
            try:
                self._drop_tables(subtables)
            except CartoException:
                # Deliberately not bound to a name: rebinding `err` here would
                # unbind it when this handler ends and break the raise below.
                warn('Failed to drop the following subtables from CARTO '
                     'account: {}'.format(', '.join(subtables)))
            finally:
                raise Exception('Failed to upload dataframe: {}'.format(err))

        return table_name
Ejemplo n.º 13
0
def to_carto(dataframe,
             table_name,
             credentials=None,
             if_exists='fail',
             geom_col=None,
             index=False,
             index_label=None,
             cartodbfy=True,
             log_enabled=True,
             retry_times=3,
             max_upload_size=MAX_UPLOAD_SIZE_BYTES,
             skip_quota_warning=False):
    """Upload a DataFrame to CARTO. The geometry's CRS must be WGS 84 (EPSG:4326) so you can use it on CARTO.

    Args:
        dataframe (pandas.DataFrame, geopandas.GeoDataFrame): data to be uploaded.
        table_name (str): name of the table to upload the data.
        credentials (:py:class:`Credentials <cartoframes.auth.Credentials>`, optional):
            instance of Credentials (username, api_key, etc).
        if_exists (str, optional): 'fail', 'replace', 'append'. Default is 'fail'.
        geom_col (str, optional): name of the geometry column of the dataframe.
        index (bool, optional): write the index in the table. Default is False.
        index_label (str, optional): name of the index column in the table. By default it
            uses the name of the index from the dataframe.
        cartodbfy (bool, optional): convert the table to CARTO format. Default True. More info
            `here <https://carto.com/developers/sql-api/guides/creating-tables/#create-tables>`.
        log_enabled (bool, optional): enable the logging mechanism. Default is True.
        retry_times (int, optional):
            Number of time to retry the upload in case it fails. Default is 3.
        max_upload_size (int, optional): defines the maximum size of the dataframe to be uploaded.
            Default is 2GB.
        skip_quota_warning (bool, optional): skip the quota exceeded check and force the upload.
            (The upload will still fail if the size of the dataset exceeds the remaining DB quota).
            Default is False.

    Returns:
        string: the table name normalized.

    Raises:
        ValueError: if the dataframe or table name provided are wrong or the if_exists param is not valid.
        CartoException: if the estimated size of the data exceeds the remaining DB quota.

    """
    if not isinstance(dataframe, DataFrame):
        raise ValueError(
            'Wrong dataframe. You should provide a valid DataFrame instance.')

    if isinstance(dataframe, GeoDataFrame) and is_reprojection_needed(dataframe):
        dataframe = reproject(dataframe)

    if not is_valid_str(table_name):
        raise ValueError(
            'Wrong table name. You should provide a valid table name.')

    if if_exists not in IF_EXISTS_OPTIONS:
        raise ValueError(
            'Wrong option for the `if_exists` param. You should provide: {}.'.
            format(', '.join(IF_EXISTS_OPTIONS)))

    context_manager = ContextManager(credentials)

    if not skip_quota_warning:
        me_data = context_manager.credentials.me_data
        if me_data is not None and me_data.get('user_data'):
            n = min(SAMPLE_ROWS_NUMBER, len(dataframe))
            remaining_byte_quota = me_data.get('user_data').get(
                'remaining_byte_quota')

            # With an empty dataframe there is nothing to sample, and the
            # extrapolation below would divide by zero.
            if n > 0 and remaining_byte_quota is not None:
                # Extrapolate the size of the whole upload from the CSV size
                # of a sample of n rows.
                estimated_byte_size = len(dataframe.sample(n=n).to_csv(header=False)) * len(dataframe) \
                    / n / CSV_TO_CARTO_RATIO

                if estimated_byte_size > remaining_byte_quota:
                    raise CartoException(
                        'DB Quota will be exceeded. '
                        'The remaining quota is {} bytes and the dataset size is {} bytes.'
                        .format(remaining_byte_quota, estimated_byte_size))

    gdf = GeoDataFrame(dataframe, copy=True)

    if index:
        index_name = index_label or gdf.index.name
        if index_name is not None and index_name != '':
            # Append the index as a column
            gdf[index_name] = gdf.index
        else:
            raise ValueError(
                'Wrong index name. You should provide a valid index label.')

    if geom_col in gdf:
        set_geometry(gdf, geom_col, inplace=True, drop=True)
    elif has_geometry(dataframe):
        gdf.set_geometry(dataframe.geometry.name, inplace=True)

    if has_geometry(gdf):
        if GEOM_COLUMN_NAME in gdf and dataframe.geometry.name != GEOM_COLUMN_NAME:
            gdf.drop(columns=[GEOM_COLUMN_NAME], inplace=True)

        # Prepare geometry column for the upload
        gdf.rename_geometry(GEOM_COLUMN_NAME, inplace=True)

    elif isinstance(dataframe, GeoDataFrame):
        log.warning('Geometry column not found in the GeoDataFrame.')

    # Both values are clamped to at least 1: a zero chunk count or a zero
    # chunk size would otherwise divide by zero / give `range` a zero step.
    chunk_count = max(1, math.ceil(estimate_csv_size(gdf) / max_upload_size))
    chunk_row_size = max(1, int(math.ceil(len(gdf) / chunk_count)))
    # An empty dataframe yields no slices; it is then passed on as one chunk
    # so `copy_from` is still called once.
    chunked_gdf = [
        gdf[i:i + chunk_row_size]
        for i in range(0, gdf.shape[0], chunk_row_size)
    ] or [gdf]

    for i, chunk in enumerate(chunked_gdf):
        # Only the first chunk honours the caller's `if_exists`; the rest are
        # appended to the table that first chunk wrote to.
        if i > 0:
            if_exists = 'append'
        table_name = context_manager.copy_from(chunk, table_name, if_exists,
                                               cartodbfy, retry_times)

    if log_enabled:
        log.info('Success! Data uploaded to table "{}" correctly'.format(
            table_name))

    return table_name
Ejemplo n.º 14
0
    def create_or_update_locations(self):
        """
        Create or update locations from the CartoDB rows, matching on p-code
        (only active locations are considered)

        Rows lacking a name, p-code or geometry are skipped, as are rows whose
        parent p-code does not resolve to exactly one active location.

        :return: tuple ``(new, updated, skipped, error)`` of counters; nothing
            in this method increments ``error``, so it is always 0
        :raise CartoException: if several active locations share the row's
            p-code, or if saving hits an IntegrityError
        """
        logging.info('Create/Update new locations')
        new, updated, skipped, error = 0, 0, 0, 0

        for row in self.get_cartodb_locations():
            pcode = row[self.carto.pcode_col]
            name = row[self.carto.name_col]
            geom = row['the_geom']

            # Guard clause: incomplete rows are counted and ignored.
            if not (name and pcode and geom):
                skipped += 1
                logger.info(f"Skipping row pcode {pcode}")
                continue

            # The geometry goes into `point` or `geom` depending on whether
            # its text mentions "Point".
            if 'Point' in geom:
                geom_key = 'point'
            else:
                geom_key = 'geom'

            default_dict = {
                'admin_level': self.carto.admin_level,
                'admin_level_name': self.carto.admin_level_name,
                'name': name,
                geom_key: geom,
            }

            parent_col = self.carto.parent_code_col
            parent_pcode = row[parent_col] if parent_col in row else None
            if parent_pcode:
                try:
                    default_dict['parent'] = get_location_model().objects.get(
                        p_code=parent_pcode, is_active=True)
                except (get_location_model().DoesNotExist,
                        get_location_model().MultipleObjectsReturned):
                    skipped += 1
                    logger.info(f"Skipping row pcode {pcode}")
                    continue

            try:
                manager = get_location_model().objects
                location, created = manager.get_or_create(
                    p_code=pcode, is_active=True, defaults=default_dict)
                if created:
                    new += 1
                    continue

                # Existing location: overwrite it with the fresh values.
                for attr, value in default_dict.items():
                    setattr(location, attr, value)
                location.save()
                updated += 1

            except get_location_model().MultipleObjectsReturned:
                message = f"Multiple locations found for: {self.carto.admin_level}, {name} ({pcode})"
                logger.exception(message)
                raise CartoException(message)

            except IntegrityError:
                message = f"Duplicate Creation {name} {pcode} {self.carto.location_type.name}"
                logger.exception(message)
                raise CartoException(message)

        return new, updated, skipped, error