Esempio n. 1
0
    def _open_gcs_url(self, binary) -> object:
        """Open the object at ``self.full_url`` on Google Cloud Storage via smart_open.

        When the provider configuration carries a ``service_account_json`` entry, it is
        parsed and used to build an authenticated client; otherwise an anonymous client
        is used.

        :param binary: open in binary mode (``"rb"``) when truthy, text mode (``"r"``) otherwise
        :return: the file object returned by ``smart_open.open``; it is not closed here
        :raises ConfigurationError: if ``service_account_json`` is not valid JSON
        """
        mode = "rb" if binary else "r"
        service_account_json = self._provider.get("service_account_json")
        service_account_info = None
        if service_account_json:
            try:
                service_account_info = json.loads(service_account_json)
            except json.decoder.JSONDecodeError as err:
                error_msg = f"Failed to parse gcs service account json: {repr(err)}\n{traceback.format_exc()}"
                logger.error(error_msg)
                raise ConfigurationError(error_msg) from err

        if service_account_info:
            credentials = service_account.Credentials.from_service_account_info(
                service_account_info)
            # Use the public ``project_id`` property rather than the private ``_project_id`` attribute.
            client = GCSClient(credentials=credentials,
                               project=credentials.project_id)
        else:
            client = GCSClient.create_anonymous_client()

        return smart_open.open(self.full_url,
                               transport_params=dict(client=client),
                               mode=mode)
Esempio n. 2
0
    def load_dataframes(self, fp, skip_data=False) -> List:
        """Read ``fp`` with the pandas reader that matches the configured format.

        :param fp: file-like object handed to the pandas reader
        :param skip_data: when true and the format is csv, only the header is read (``nrows=0``)
        :return: a one-element list holding the value returned by the pandas reader
        :raises ConfigurationError: if the configured reader format has no matching reader
        """
        supported_readers = {
            # Extra pandas.read_csv keyword arguments can be supplied through the reader options, see
            # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
            "csv": pd.read_csv,
            # Nested JSON could later be flattened with pd.json_normalize if the user
            # describes how the nested columns should be flattened.
            "flat_json": pd.read_json,
            "html": pd.read_html,
            "excel": pd.read_excel,
            "feather": pd.read_feather,
            "parquet": pd.read_parquet,
            "orc": pd.read_orc,
            "pickle": pd.read_pickle,
        }

        try:
            read = supported_readers[self._reader_format]
        except KeyError as err:
            error_msg = f"Reader {self._reader_format} is not supported\n{traceback.format_exc()}"
            logger.error(error_msg)
            raise ConfigurationError(error_msg) from err

        # Work on a copy so the configured options are never mutated.
        options = {**self._reader_options}
        header_only = skip_data and self._reader_format == "csv"
        if header_only:
            options.update(nrows=0, index_col=0)

        loaded = read(fp, **options)
        return [loaded]
Esempio n. 3
0
    def list(self, fields: Sequence[str] = None) -> Iterator[dict]:
        """Yield one flat record per media item, combining identifiers with insight values.

        For every account in ``self._api.accounts`` the media are fetched through
        ``self._get_media`` and each item is merged with the first value of every
        insight returned by ``self._get_insights``.

        :param fields: not used by this implementation
        :return: iterator of dicts with ``id``, ``page_id``, ``business_account_id`` and one key per insight name
        :raises FacebookRequestError: for any insights error whose subcode is not 2108006
        """
        for account in self._api.accounts:
            business_account = account["instagram_business_account"]
            media_items = self._get_media(business_account,
                                          {"limit": self.result_return_limit},
                                          ["media_type"])
            for media_item in media_items:
                try:
                    row = {
                        "id": media_item.get("id"),
                        "page_id": account["page_id"],
                        "business_account_id": business_account.get("id"),
                    }
                    for record in self._get_insights(media_item):
                        row[record.get("name")] = record.get("values")[0]["value"]
                    yield row
                except FacebookRequestError as error:
                    # Subcode 2108006 may show up for media posted before the account was last
                    # converted from a personal account to a business account.
                    if error.api_error_subcode() != 2108006:
                        raise error

                    logger.error(
                        f"Insights error for business_account_id {business_account.get('id')}: {error.body()}"
                    )
                    # Media arrive newest first, so once one item fails this way every remaining
                    # (older) item would fail too: stop processing this account.
                    break
Esempio n. 4
0
    def storage_scheme(self) -> str:
        """Map the configured storage name to its URL prefix.

        :return: the URL prefix for a known storage name, otherwise the scheme parsed from
            ``self._url``, otherwise an empty string (after logging an error)
        """
        storage_name = self._provider["storage"].upper()
        parsed_url = urlparse(self._url)

        known_prefixes = {
            "GCS": "gs://",
            "S3": "s3://",
            "HTTPS": "https://",
            "SSH": "scp://",
            "SCP": "scp://",
            "SFTP": "sftp://",
            "WEBHDFS": "webhdfs://",
            "LOCAL": "file://",
        }
        prefix = known_prefixes.get(storage_name)
        if prefix is not None:
            return prefix

        # NOTE(review): unlike the prefixes above, this fallback returns the bare scheme
        # without a trailing "://" - confirm that callers expect this.
        if parsed_url.scheme:
            return parsed_url.scheme

        logger.error(f"Unknown Storage provider in: {self.full_url}")
        return ""
Esempio n. 5
0
    def health_check(self) -> Tuple[bool, str]:
        """Check connectivity by looking up the accounts.

        :return: ``(True, None)`` when ``self._find_accounts()`` succeeds, otherwise
            ``(False, <exception text>)`` for an ``InstagramAPIException``
        """
        try:
            self._find_accounts()
        except InstagramAPIException as exc:
            message = str(exc)
            logger.error(message)
            return False, message

        return True, None
Esempio n. 6
0
 def _request(self, url: str, **kwargs) -> Response:
     """Send a GET request to ``url`` and return the response when its status is OK.

     :param url: address to request
     :param kwargs: extra keyword arguments forwarded to ``requests.get``
     :return: the response of a request answered with the OK status
     :raises TooManyRequests: when the answer carries the TOO_MANY_REQUESTS status
     :raises Exception: for any other non-OK status
     """
     response = requests.get(url, headers=self._headers, **kwargs)
     if response.status_code == status_codes.OK:
         return response

     # Error bodies are not guaranteed to be JSON (for example an HTML page served by a proxy).
     # Fall back to the raw text so the status-specific exception below is raised instead of
     # a JSON decoding error that would hide the real failure.
     try:
         details = response.json()
     except ValueError:
         details = response.text

     if response.status_code == status_codes.TOO_MANY_REQUESTS:
         msg = f"Rate limit error: {details}"
         logger.error(msg)
         raise TooManyRequests(msg)

     raise Exception(
         f"Unable to get data, error during request from url: {url}. Error: {details}"
     )
Esempio n. 7
0
    def health_check(self) -> Tuple[bool, str]:
        """Check connectivity by looking up the configured account.

        :return: ``(True, None)`` when ``self._find_account(self._account_id)`` succeeds,
            otherwise ``(False, <exception text>)`` for a ``FacebookAPIException``
        """
        try:
            self._find_account(self._account_id)
        except FacebookAPIException as exc:
            message = str(exc)
            # Log the original exception text here in case extra details are needed later.
            logger.error(message)
            return False, message

        return True, None
Esempio n. 8
0
 def __init__(self, dataset_name: str, url: str, provider: dict, format: str = None, reader_options: str = None):
     """Store the dataset settings and decode the optional reader options.

     :param dataset_name: stored as ``_dataset_name``
     :param url: stored as ``_url``
     :param provider: stored as ``_provider``
     :param format: reader format; ``"csv"`` is used when it is empty or missing
     :param reader_options: JSON-encoded string; its decoded value is stored as ``_reader_options``
         (an empty dict when the string is empty or missing)
     :raises ConfigurationError: if ``reader_options`` is not valid JSON
     """
     self._dataset_name = dataset_name
     self._url = url
     self._provider = provider
     self._reader_format = format if format else "csv"
     self._reader_options = {}

     if not reader_options:
         return

     try:
         self._reader_options = json.loads(reader_options)
     except json.decoder.JSONDecodeError as err:
         error_msg = f"Failed to parse reader options {repr(err)}\n{reader_options}\n{traceback.format_exc()}"
         logger.error(error_msg)
         raise ConfigurationError(error_msg) from err
Esempio n. 9
0
    def _get_insights(self, item) -> Iterator[Any]:
        """Fetch the story insights of ``item``.

        Kept as a separate method because the callers of this endpoint return a generator,
        and the calls themselves need to be decorated with a backoff.

        :param item: object exposing ``get_insights(params=...)``
        :return: the insights for ``self.STORY_METRICS``, or an empty list on API error code 10
        :raises FacebookRequestError: for any API error whose code is not 10
        """
        try:
            insights = item.get_insights(params={"metric": self.STORY_METRICS})
        except FacebookRequestError as error:
            logger.error(f"Insights error: {error.api_error_message()}")
            # Story media whose metric values are below 5 answer with error code 10:
            # "(#10) Not enough viewers for the media to show insights."
            if error.api_error_code() != 10:
                raise error
            insights = []

        return insights
Esempio n. 10
0
    def _get_insights(self, item) -> Iterator[Any]:
        """Fetch the insights of ``item`` using the metric set that matches its media type.

        Kept as a separate method because the callers of this endpoint return a generator,
        and the calls themselves need to be decorated with a backoff.

        :param item: object exposing ``get("media_type")`` and ``get_insights(params=...)``
        :return: whatever ``item.get_insights`` returns for the selected metrics
        :raises FacebookRequestError: re-raised after being logged
        """
        media_type = item.get("media_type")
        if media_type == "VIDEO":
            selected_metrics = self.MEDIA_METRICS + ["video_views"]
        elif media_type == "CAROUSEL_ALBUM":
            selected_metrics = self.CAROUSEL_ALBUM_METRICS
        else:
            selected_metrics = self.MEDIA_METRICS

        request_params = {"metric": selected_metrics}
        try:
            return item.get_insights(params=request_params)
        except FacebookRequestError as error:
            # This may fail for media posted before the account was last converted
            # from a personal account to a business account.
            logger.error(f"Insights error: {error.body()}")
            raise error
Esempio n. 11
0
 def log_giveup(_details):
     """Log that the retry limit was hit; ``_details`` is accepted but ignored.

     NOTE(review): presumably registered as a give-up handler of a retry decorator - confirm at the call site.
     """
     message = "Max retry limit reached"
     logger.error(message)