Ejemplo n.º 1
0
    def read(self, logger, config_container, catalog_path, state_path=None) -> Generator[AirbyteMessage, None, None]:
        """Yield one RECORD message per row of the dataframes loaded for the configured file.

        :param logger: logger used for the progress message and for error reporting
        :param config_container: object whose ``rendered_config`` mapping provides the "storage" and "url" entries
        :param catalog_path: path handed to ``self.read_config`` to obtain the catalog
        :param state_path: only included in the log message; not otherwise used here
        :return: generator of AirbyteMessage records whose stream name is the simplified URL
        :raises Exception: any error raised while loading or emitting data is logged and re-raised unchanged
        """
        config = config_container.rendered_config
        storage = SourceFile.get_storage_scheme(logger, config["storage"], config["url"])
        url = SourceFile.get_simple_url(config["url"])
        logger.info(f"Reading ({storage}{url}, {catalog_path}, {state_path})...")
        catalog = AirbyteCatalog.parse_obj(self.read_config(catalog_path))
        selection = SourceFile.parse_catalog(catalog)
        try:
            for df in SourceFile.load_dataframes(config, logger):
                # A DataFrame must be indexed with a list: set indexers are
                # deprecated in pandas and rejected by newer releases.
                columns = list(selection.intersection(df.columns))
                for data in df[columns].to_dict(orient="records"):
                    # Scale to milliseconds before truncating; truncating first
                    # would round every timestamp down to a whole second.
                    emitted_at = int(datetime.now().timestamp() * 1000)
                    yield AirbyteMessage(
                        type=Type.RECORD,
                        record=AirbyteRecordMessage(stream=url, data=data, emitted_at=emitted_at),
                    )
        except Exception as err:
            reason = f"Failed to read data of {storage}{url}: {repr(err)}\n{traceback.format_exc()}"
            logger.error(reason)
            # Bare raise re-raises the active exception with its original traceback.
            raise
Ejemplo n.º 2
0
    def read(self,
             logger: AirbyteLogger,
             config_container,
             catalog_path,
             state=None) -> Generator[AirbyteMessage, None, None]:
        """Yield RECORD messages for the rows of every selected sheet.

        Rows are fetched in batches, starting below the header row. A sheet is
        finished as soon as a batch comes back without values or an empty row
        is encountered.

        :param logger: logger used for progress messages
        :param config_container: object whose ``rendered_config`` mapping
            provides "credentials_json" and "spreadsheet_id"
        :param catalog_path: path handed to ``self.read_config`` to obtain the catalog
        :param state: not used by this method
        :return: generator of AirbyteMessage records
        """
        config = config_container.rendered_config
        credentials = json.loads(config["credentials_json"])
        client = Helpers.get_authenticated_sheets_client(credentials)

        catalog = AirbyteCatalog.parse_obj(self.read_config(catalog_path))
        sheet_to_column_name = Helpers.parse_sheet_and_column_names_from_catalog(catalog)
        spreadsheet_id = config["spreadsheet_id"]

        logger.info(f"Starting syncing spreadsheet {spreadsheet_id}")
        available_sheets = Helpers.get_available_sheets_to_column_index_to_name(
            client, spreadsheet_id, sheet_to_column_name)

        for sheet, column_index_to_name in available_sheets.items():
            logger.info(f"Syncing sheet {sheet}")
            next_row = 2  # row 1 is the header, so syncing starts at row 2
            sheet_finished = False
            while not sheet_finished:
                a1_range = f"{sheet}!{next_row}:{next_row + ROW_BATCH_SIZE}"
                logger.info(f"Fetching range {a1_range}")
                request = client.values().batchGet(
                    spreadsheetId=spreadsheet_id,
                    ranges=a1_range,
                    majorDimension="ROWS")
                batch = SpreadsheetValues.parse_obj(request.execute())
                # Move past the last row requested above
                # (the span was next_row .. next_row + ROW_BATCH_SIZE).
                next_row += ROW_BATCH_SIZE + 1

                # Only one range was requested, so only one is expected back.
                rows = batch.valueRanges[0].values
                if not rows:
                    break

                for row in rows:
                    if Helpers.is_row_empty(row):
                        # An empty row ends this sheet.
                        sheet_finished = True
                        break
                    if Helpers.row_contains_relevant_data(row, column_index_to_name.keys()):
                        record = Helpers.row_data_to_record_message(
                            sheet, row, column_index_to_name)
                        yield AirbyteMessage(type=Type.RECORD, record=record)
        logger.info(f"Finished syncing spreadsheet {spreadsheet_id}")
Ejemplo n.º 3
0
 def get_catalog(self) -> AirbyteCatalog:
     """Load the packaged ``catalog.json`` resource and parse it into an AirbyteCatalog."""
     # The resource is looked up in the first dotted component of the
     # module name that defines this instance's class.
     top_level_package = self.__class__.__module__.split(".")[0]
     raw_catalog = pkgutil.get_data(top_level_package, "catalog.json")
     catalog_dict = json.loads(raw_catalog)
     return AirbyteCatalog.parse_obj(catalog_dict)
Ejemplo n.º 4
0
 def get_catalog(self) -> AirbyteCatalog:
     """Load the packaged ``integration_test_catalog.json`` resource and parse it into an AirbyteCatalog."""
     # The resource is looked up in the first dotted component of the
     # module name that defines this instance's class.
     top_level_package = self.__class__.__module__.split(".")[0]
     catalog_bytes = pkgutil.get_data(top_level_package, "integration_test_catalog.json")
     catalog_dict = json.loads(catalog_bytes)
     return AirbyteCatalog.parse_obj(catalog_dict)