def _open_gcs_url(self, binary) -> object:
    """Open ``self.full_url`` on Google Cloud Storage and return a file-like object.

    :param binary: open in binary mode ("rb") when truthy, text mode ("r") otherwise
    :return: the open file-like object produced by smart_open; the caller is
        responsible for closing it
    :raises ConfigurationError: if the configured service account JSON cannot be parsed
    """
    mode = "rb" if binary else "r"
    service_account_json = self._provider.get("service_account_json")
    credentials = None
    if service_account_json:
        try:
            # Reuse the value already fetched above instead of reading the
            # provider mapping a second time.
            credentials = json.loads(service_account_json)
        except json.decoder.JSONDecodeError as err:
            error_msg = f"Failed to parse gcs service account json: {repr(err)}\n{traceback.format_exc()}"
            logger.error(error_msg)
            raise ConfigurationError(error_msg) from err
    if credentials:
        credentials = service_account.Credentials.from_service_account_info(credentials)
        # Use the public ``project_id`` property rather than the private
        # ``_project_id`` attribute — same value, but a stable API surface.
        client = GCSClient(credentials=credentials, project=credentials.project_id)
    else:
        # No service account configured: fall back to anonymous (public-bucket) access.
        client = GCSClient.create_anonymous_client()
    file_to_close = smart_open.open(self.full_url, transport_params=dict(client=client), mode=mode)
    return file_to_close
def load_dataframes(self, fp, skip_data=False) -> List:
    """load and return the appropriate pandas dataframe.

    :param fp: file-like object to read from
    :param skip_data: limit reading data
    :return: a list of dataframe loaded from files described in the configuration
    """
    # Map each supported reader format to the pandas function that parses it.
    # pandas.read_csv additional arguments can be passed to customize how to parse csv.
    # see https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
    # We can add option to call to pd.normalize_json to normalize semi-structured JSON data
    # into a flat table by asking user to specify how to flatten the nested columns
    format_readers = {
        "csv": pd.read_csv,
        "flat_json": pd.read_json,
        "html": pd.read_html,
        "excel": pd.read_excel,
        "feather": pd.read_feather,
        "parquet": pd.read_parquet,
        "orc": pd.read_orc,
        "pickle": pd.read_pickle,
    }
    try:
        reader = format_readers[self._reader_format]
    except KeyError as err:
        error_msg = f"Reader {self._reader_format} is not supported\n{traceback.format_exc()}"
        logger.error(error_msg)
        raise ConfigurationError(error_msg) from err
    # Copy so the stored options are never mutated by the skip_data tweaks below.
    options = dict(self._reader_options)
    if skip_data and self._reader_format == "csv":
        options["nrows"] = 0
        options["index_col"] = 0
    return [reader(fp, **options)]
def list(self, fields: Sequence[str] = None) -> Iterator[dict]:
    """Yield one record per Instagram media item: identifiers plus its insight values."""
    for account in self._api.accounts:
        ig_account = account["instagram_business_account"]
        media_items = self._get_media(ig_account, {"limit": self.result_return_limit}, ["media_type"])
        for media_item in media_items:
            try:
                record = {
                    "id": media_item.get("id"),
                    "page_id": account["page_id"],
                    "business_account_id": ig_account.get("id"),
                }
                for insight in self._get_insights(media_item):
                    record[insight.get("name")] = insight.get("values")[0]["value"]
                yield record
            except FacebookRequestError as error:
                # An error might occur if the media was posted before the most recent time that
                # the user's account was converted to a business account from a personal account
                if error.api_error_subcode() == 2108006:
                    logger.error(
                        f"Insights error for business_account_id {ig_account.get('id')}: {error.body()}"
                    )
                    # We receive all Media starting from the last one, and if on the next Media
                    # we get an Insight error, then no reason to make inquiries for each Media
                    # further, since they were published even earlier.
                    break
                raise error
def storage_scheme(self) -> str:
    """Convert Storage Names to the proper URL Prefix

    :return: the corresponding URL prefix / scheme
    """
    # Known storage providers and the URL scheme each one maps to.
    scheme_by_storage = {
        "GCS": "gs://",
        "S3": "s3://",
        "HTTPS": "https://",
        "SSH": "scp://",
        "SCP": "scp://",
        "SFTP": "sftp://",
        "WEBHDFS": "webhdfs://",
        "LOCAL": "file://",
    }
    storage_name = self._provider["storage"].upper()
    known_scheme = scheme_by_storage.get(storage_name)
    if known_scheme is not None:
        return known_scheme
    # Unknown storage name: fall back to whatever scheme the URL itself carries.
    parsed = urlparse(self._url)
    if parsed.scheme:
        return parsed.scheme
    logger.error(f"Unknown Storage provider in: {self.full_url}")
    return ""
def health_check(self) -> Tuple[bool, str]:
    """Check API connectivity by resolving the accounts; return (alive, error message)."""
    try:
        self._find_accounts()
    except InstagramAPIException as exc:
        logger.error(str(exc))
        return False, str(exc)
    return True, None
def _request(self, url: str, **kwargs) -> Response:
    """GET ``url`` with the connector's headers, translating error statuses into exceptions.

    :param url: the URL to fetch
    :param kwargs: extra keyword arguments forwarded to ``requests.get``
    :return: the successful HTTP response
    :raises TooManyRequests: on an HTTP 429 response
    :raises Exception: on any other non-OK response
    """
    response = requests.get(url, headers=self._headers, **kwargs)
    if response.status_code == status_codes.OK:
        return response
    if response.status_code == status_codes.TOO_MANY_REQUESTS:
        msg = f"Rate limit error: {response.json()}"
        logger.error(msg)
        raise TooManyRequests(msg)
    raise Exception(
        f"Unable to get data, error during request from url: {url}. Error: {response.json()}"
    )
def health_check(self) -> Tuple[bool, str]:
    """Check API connectivity by resolving the configured account; return (alive, error message)."""
    try:
        self._find_account(self._account_id)
    except FacebookAPIException as exc:
        # we might need some extra details, so log original exception here
        logger.error(str(exc))
        return False, str(exc)
    return True, None
def __init__(self, dataset_name: str, url: str, provider: dict, format: str = None, reader_options: str = None):
    """Store the source configuration, parsing ``reader_options`` from JSON.

    :param dataset_name: name of the dataset this source produces
    :param url: location of the file(s) to read
    :param provider: storage-provider configuration mapping
    :param format: reader format name; defaults to "csv" when falsy
    :param reader_options: JSON-encoded options passed through to the reader
    :raises ConfigurationError: if ``reader_options`` is not valid JSON
    """
    self._dataset_name = dataset_name
    self._url = url
    self._provider = provider
    self._reader_format = format if format else "csv"
    self._reader_options = {}
    if not reader_options:
        return
    try:
        self._reader_options = json.loads(reader_options)
    except json.decoder.JSONDecodeError as err:
        error_msg = f"Failed to parse reader options {repr(err)}\n{reader_options}\n{traceback.format_exc()}"
        logger.error(error_msg)
        raise ConfigurationError(error_msg) from err
def _get_insights(self, item) -> Iterator[Any]:
    """
    This is necessary because the functions that call this endpoint return a generator,
    whose calls need decorated with a backoff.
    """
    try:
        return item.get_insights(params={"metric": self.STORY_METRICS})
    except FacebookRequestError as error:
        logger.error(f"Insights error: {error.api_error_message()}")
        # Story IG Media object metrics with values less than 5 will return an error code 10
        # with the message (#10) Not enough viewers for the media to show insights.
        if error.api_error_code() != 10:
            raise error
        return []
def _get_insights(self, item) -> Iterator[Any]:
    """
    This is necessary because the functions that call this endpoint return a generator,
    whose calls need decorated with a backoff.
    """
    # Pick the metric set by media type; anything unrecognized gets the plain media metrics.
    metrics_by_media_type = {
        "VIDEO": self.MEDIA_METRICS + ["video_views"],
        "CAROUSEL_ALBUM": self.CAROUSEL_ALBUM_METRICS,
    }
    metrics = metrics_by_media_type.get(item.get("media_type"), self.MEDIA_METRICS)
    # An error might occur if the media was posted before the most recent time that
    # the user's account was converted to a business account from a personal account
    try:
        return item.get_insights(params={"metric": metrics})
    except FacebookRequestError as error:
        logger.error(f"Insights error: {error.body()}")
        raise error
def log_giveup(_details):
    """Backoff give-up handler: log an error once the maximum retry limit is exhausted.

    :param _details: retry details supplied by the backoff library (unused here)
    """
    logger.error("Max retry limit reached")