def read(self, logger: AirbyteLogger, config_container: ConfigContainer, catalog_path, state_path: str = None) -> Generator[AirbyteMessage, None, None]:
    """
    Sync every configured Mailchimp stream, yielding Airbyte messages.

    :param logger: logger used to report sync progress
    :param config_container: container holding the connector configuration
    :param catalog_path: path to the configured catalog JSON file
    :param state_path: optional path to a JSON state file from a previous sync
    :return: generator of messages produced by ``self._read_record``
    """
    client = self._client(config_container)

    if state_path:
        logger.info("Starting sync with provided state file")
        # Use a context manager so the state file handle is closed deterministically
        # (the original left the handle open until garbage collection).
        with open(state_path, "r") as state_file:
            state_obj = json.load(state_file)
    else:
        logger.info("No state provided, starting fresh sync")
        state_obj = {}

    # defaultdict lets per-stream state entries be created lazily during the sync.
    state = defaultdict(dict, state_obj)
    catalog = ConfiguredAirbyteCatalog.parse_obj(self.read_config(catalog_path))

    logger.info("Starting syncing mailchimp")
    for configured_stream in catalog.streams:
        stream = configured_stream.stream
        # _read_record yields the messages directly; they are forwarded unchanged.
        for record in self._read_record(client=client, stream=stream.name, state=state):
            yield record
    logger.info("Finished syncing mailchimp")
def read(self, logger, config_container, catalog_path, state_path=None) -> Generator[AirbyteMessage, None, None]:
    """
    Read the configured file and emit one RECORD message per row / JSON object.

    :param logger: logger used to report progress and failures
    :param config_container: container whose ``rendered_config`` describes the
        file to read (provider/storage, url, optional ``format``)
    :param catalog_path: path to the configured catalog JSON file; the catalog
        selection restricts which dataframe columns are emitted
    :param state_path: unused; present for interface compatibility
    :return: generator of AirbyteMessage RECORD messages
    :raises Exception: re-raises any error hit while reading, after logging a
        detailed reason including the traceback
    """
    config = config_container.rendered_config
    storage = SourceFile.get_storage_scheme(logger, config["provider"]["storage"], config["url"])
    url = SourceFile.get_simple_url(config["url"])
    name = SourceFile.get_stream_name(config)
    logger.info(f"Reading {name} ({storage}{url}, {catalog_path}, {state_path})...")

    catalog = ConfiguredAirbyteCatalog.parse_obj(self.read_config(catalog_path))
    selection = SourceFile.parse_catalog(catalog)

    def record_message(data) -> AirbyteMessage:
        # Shared RECORD construction for both the JSON and dataframe paths.
        # NOTE(review): the timestamp is truncated to whole seconds before the
        # *1000 scaling, so emitted_at always has zero milliseconds — confirm
        # this is intentional before changing it.
        return AirbyteMessage(
            type=Type.RECORD,
            record=AirbyteRecordMessage(stream=name, data=data, emitted_at=int(datetime.now().timestamp()) * 1000),
        )

    try:
        if config.get("format") == "json":
            for data in SourceFile.load_nested_json(config, logger):
                yield record_message(data)
        else:
            for df in SourceFile.load_dataframes(config, logger):
                # Restrict to catalog-selected columns when a selection exists;
                # otherwise emit every column of the dataframe.
                columns = selection.intersection(set(df.columns)) if selection else df.columns
                # Replace NaN with a string so records stay JSON-serializable.
                df = df.replace(np.nan, "NaN", regex=True)
                for data in df[columns].to_dict(orient="records"):
                    yield record_message(data)
    except Exception as err:
        reason = f"Failed to read data of {name} at {storage}{url}: {repr(err)}\n{traceback.format_exc()}"
        logger.error(reason)
        raise err
def read(
    self, logger: AirbyteLogger, config_container: ConfigContainer, catalog_path: str, state_path: str = None
) -> Generator[AirbyteMessage, None, None]:
    """
    Implements the parent class read method.
    """
    # Discover the full singer catalog, then narrow it to the streams that the
    # configured Airbyte catalog actually selects.
    discovered = self._discover_internal(logger, config_container.config_path)
    configured_catalog = ConfiguredAirbyteCatalog.parse_obj(self.read_config(catalog_path))
    singer_catalog_path = SingerHelper.create_singer_catalog_with_selection(configured_catalog, discovered.singer_catalog)
    # Build the tap invocation against the narrowed catalog and stream its output.
    cmd = self.read_cmd(logger, config_container.config_path, singer_catalog_path, state_path)
    return SingerHelper.read(logger, cmd)
def read(self, logger: AirbyteLogger, config_container: ConfigContainer, catalog_path, state=None) -> Generator[AirbyteMessage, None, None]:
    """Read every stream in the configured catalog and yield its records as RECORD messages."""
    client = self._get_client(config_container)
    catalog = ConfiguredAirbyteCatalog.parse_obj(self.read_config(catalog_path))

    source_name = self.__class__.__name__
    logger.info(f"Starting syncing {source_name}")
    for configured_stream in catalog.streams:
        # The client yields bare record payloads; wrap each in a protocol message.
        for record in client.read_stream(configured_stream.stream):
            yield AirbyteMessage(type=airbyte_protocol.Type.RECORD, record=record)
    logger.info(f"Finished syncing {source_name}")
def read(self, logger: AirbyteLogger, config_container: ConfigContainer, catalog_path, state=None) -> Generator[AirbyteMessage, None, None]:
    """Sync all configured Mailchimp streams, yielding one RECORD message per record."""
    client = self._client(config_container)
    configured_catalog = ConfiguredAirbyteCatalog.parse_obj(self.read_config(catalog_path))

    logger.info("Starting syncing mailchimp")
    for configured_stream in configured_catalog.streams:
        stream_name = configured_stream.stream.name
        # Wrap each raw record from the client in a protocol RECORD message.
        for record in self._read_record(client=client, stream=stream_name):
            yield AirbyteMessage(type=Type.RECORD, record=record)
    logger.info("Finished syncing mailchimp")
def read(self, logger: AirbyteLogger, config_container, catalog_path, state=None) -> Generator[AirbyteMessage, None, None]:
    """
    Sync every available sheet of the configured spreadsheet.

    Rows are fetched in batches of ROW_BATCH_SIZE starting below the header
    row (row 2). A sheet's sync stops at the first blank row, or when the API
    returns no more values.

    :param logger: logger used to report per-sheet and per-batch progress
    :param config_container: container whose ``rendered_config`` holds
        ``credentials_json`` and ``spreadsheet_id``
    :param catalog_path: path to the configured catalog JSON file; it defines
        which sheets/columns are synced
    :param state: unused; present for interface compatibility
    :return: generator of AirbyteMessage RECORD messages
    """
    config = config_container.rendered_config
    client = Helpers.get_authenticated_sheets_client(json.loads(config["credentials_json"]))
    catalog = ConfiguredAirbyteCatalog.parse_obj(self.read_config(catalog_path))
    sheet_to_column_name = Helpers.parse_sheet_and_column_names_from_catalog(catalog)
    spreadsheet_id = config["spreadsheet_id"]
    logger.info(f"Starting syncing spreadsheet {spreadsheet_id}")
    # For each sheet in the spreadsheet, get a batch of rows, and as long as there hasn't been
    # a blank row, emit the row batch
    sheet_to_column_index_to_name = Helpers.get_available_sheets_to_column_index_to_name(client, spreadsheet_id, sheet_to_column_name)
    for sheet in sheet_to_column_index_to_name.keys():
        logger.info(f"Syncing sheet {sheet}")
        column_index_to_name = sheet_to_column_index_to_name[sheet]
        row_cursor = 2  # we start syncing past the header row
        encountered_blank_row = False
        while not encountered_blank_row:
            # Renamed from `range` — the original shadowed the builtin.
            row_range = f"{sheet}!{row_cursor}:{row_cursor + ROW_BATCH_SIZE}"
            logger.info(f"Fetching range {row_range}")
            row_batch = SpreadsheetValues.parse_obj(
                client.values().batchGet(spreadsheetId=spreadsheet_id, ranges=row_range, majorDimension="ROWS").execute()
            )
            row_cursor += ROW_BATCH_SIZE + 1
            # there should always be exactly one range since we requested only one
            value_range = row_batch.valueRanges[0]
            row_values = value_range.values
            # `not row_values` covers both a missing/None values field and an
            # empty list — the original checked these two cases separately,
            # making the second check unreachable.
            if not row_values:
                break
            for row in row_values:
                if Helpers.is_row_empty(row):
                    encountered_blank_row = True
                    break
                elif Helpers.row_contains_relevant_data(row, column_index_to_name.keys()):
                    yield AirbyteMessage(type=Type.RECORD, record=Helpers.row_data_to_record_message(sheet, row, column_index_to_name))
    logger.info(f"Finished syncing spreadsheet {spreadsheet_id}")
def read(self, logger: AirbyteLogger, config_container: ConfigContainer, catalog_path, state=None) -> Generator[AirbyteMessage, None, None]:
    """
    Sync every configured Recurly stream, skipping unrecognized ones.

    :param logger: logger used to report progress and skipped streams
    :param config_container: container holding the connector configuration
    :param catalog_path: path to the configured catalog JSON file
    :param state: unused; present for interface compatibility (incremental
        syncs are not implemented yet — see TODO below)
    :return: generator of AirbyteMessage RECORD messages
    """
    client = self._client(config_container)
    catalog = ConfiguredAirbyteCatalog.parse_obj(self.read_config(catalog_path))

    logger.info("Starting syncing recurly")
    for configured_stream in catalog.streams:
        # TODO handle incremental syncs
        stream = configured_stream.stream
        if stream.name not in client.ENTITIES:
            # BUG FIX: log the stream *name* — the original interpolated the
            # whole stream object, producing an unreadable message.
            logger.warn(f"Stream '{stream.name}' not found in the recognized entities")
            continue
        for record in self._read_record(client=client, stream=stream.name):
            yield AirbyteMessage(type=Type.RECORD, record=record)
    logger.info("Finished syncing recurly")
def get_catalog(self) -> ConfiguredAirbyteCatalog:
    """Load the ``configured_catalog.json`` bundled in this connector's package."""
    # The top-level package name is the first component of the module path.
    package = self.__class__.__module__.split(".")[0]
    raw_catalog = pkgutil.get_data(package, "configured_catalog.json")
    return ConfiguredAirbyteCatalog.parse_obj(json.loads(raw_catalog))
def get_catalog(self) -> ConfiguredAirbyteCatalog:
    """Load the configured catalog file (named by CONFIGURED_CATALOG_FILENAME) from this connector's package."""
    # The top-level package name is the first component of the module path.
    package = self.__class__.__module__.split(".")[0]
    raw_catalog = pkgutil.get_data(package, self.CONFIGURED_CATALOG_FILENAME)
    return ConfiguredAirbyteCatalog.parse_obj(json.loads(raw_catalog))
def read_catalog(self, catalog_path: str) -> ConfiguredAirbyteCatalog:
    """Parse the configured catalog JSON at *catalog_path* into a model object."""
    raw = self.read_config(catalog_path)
    return ConfiguredAirbyteCatalog.parse_obj(raw)
def read_catalog(self, catalog_path: str) -> ConfiguredAirbyteCatalog:
    """
    Parse the configured catalog and default ``reports_to_read`` to all streams.

    :param catalog_path: path to the configured catalog JSON file
    :return: the parsed ConfiguredAirbyteCatalog
    """
    catalog = ConfiguredAirbyteCatalog.parse_obj(self.read_config(catalog_path))
    if not self.reports_to_read:
        # Default to reading every stream named in the catalog.
        self.reports_to_read = [stream.stream.name for stream in catalog.streams]
    # BUG FIX: the original returned `catalog_path` (a str), contradicting the
    # declared return type — callers expect the parsed catalog object.
    return catalog