コード例 #1
0
    def read(self,
             logger: AirbyteLogger,
             config_container: ConfigContainer,
             catalog_path,
             state_path: str = None) -> Generator[AirbyteMessage, None, None]:
        """Yield everything ``self._read_record`` produces for each configured stream.

        :param logger: logger used for progress messages.
        :param config_container: passed to ``self._client`` to build the client.
        :param catalog_path: path handed to ``self.read_config`` to load the
            configured catalog.
        :param state_path: optional path to a JSON state file; when falsy the
            sync starts from an empty state.
        :return: generator of the items yielded by ``self._read_record``.
        """
        client = self._client(config_container)

        if state_path:
            logger.info("Starting sync with provided state file")
            # Use a context manager so the state file handle is always closed.
            with open(state_path, "r") as state_file:
                state_obj = json.load(state_file)
        else:
            logger.info("No state provided, starting fresh sync")
            state_obj = {}

        # Streams without saved state get an empty dict on first access.
        state = defaultdict(dict, state_obj)
        catalog = ConfiguredAirbyteCatalog.parse_obj(
            self.read_config(catalog_path))

        logger.info("Starting syncing mailchimp")
        for configured_stream in catalog.streams:
            stream = configured_stream.stream
            for record in self._read_record(client=client,
                                            stream=stream.name,
                                            state=state):
                yield record

        logger.info("Finished syncing mailchimp")
コード例 #2
0
    def read(self,
             logger,
             config_container,
             catalog_path,
             state_path=None) -> Generator[AirbyteMessage, None, None]:
        """Read the configured file and yield one RECORD message per item.

        When the config's ``format`` is ``"json"`` the items come from
        ``SourceFile.load_nested_json``; otherwise they are the rows of the
        dataframes returned by ``SourceFile.load_dataframes``.

        :param logger: logger used for progress and error messages.
        :param config_container: container whose ``rendered_config`` holds the
            source configuration (``url``, ``provider``, optional ``format``).
        :param catalog_path: path to the configured catalog, parsed to obtain
            the column selection.
        :param state_path: only reported in the log line; not otherwise used.
        :return: generator of ``AirbyteMessage`` records for the stream.
        :raises Exception: any error raised while loading or emitting data is
            logged together with its traceback and then re-raised.
        """
        config = config_container.rendered_config
        storage = SourceFile.get_storage_scheme(logger,
                                                config["provider"]["storage"],
                                                config["url"])
        url = SourceFile.get_simple_url(config["url"])
        name = SourceFile.get_stream_name(config)
        logger.info(
            f"Reading {name} ({storage}{url}, {catalog_path}, {state_path})..."
        )
        catalog = ConfiguredAirbyteCatalog.parse_obj(
            self.read_config(catalog_path))
        selection = SourceFile.parse_catalog(catalog)

        def to_message(data):
            # Single place that wraps a row/object into a RECORD message.
            return AirbyteMessage(
                type=Type.RECORD,
                record=AirbyteRecordMessage(
                    stream=name,
                    data=data,
                    emitted_at=int(datetime.now().timestamp()) * 1000),
            )

        try:
            if "format" in config and config["format"] == "json":
                for data in SourceFile.load_nested_json(config, logger):
                    yield to_message(data)
            else:
                for df in SourceFile.load_dataframes(config, logger):
                    if selection:
                        # Index with an ordered list rather than a set: pandas
                        # does not accept set indexers, and a list keeps the
                        # dataframe's own column order.
                        columns = [c for c in df.columns if c in selection]
                    else:
                        columns = df.columns
                    df = df.replace(np.nan, "NaN", regex=True)
                    for data in df[columns].to_dict(orient="records"):
                        yield to_message(data)
        except Exception as err:
            reason = f"Failed to read data of {name} at {storage}{url}: {repr(err)}\n{traceback.format_exc()}"
            logger.error(reason)
            # Bare raise re-raises the active exception with its traceback intact.
            raise
コード例 #3
0
ファイル: source.py プロジェクト: yevhenii-ldv/airbyte
    def read(
        self, logger: AirbyteLogger, config_container: ConfigContainer, catalog_path: str, state_path: str = None
    ) -> Generator[AirbyteMessage, None, None]:
        """
        Build a Singer catalog limited to the configured selection and return
        the result of ``SingerHelper.read`` for the corresponding read command.
        """
        discovered = self._discover_internal(logger, config_container.config_path)
        configured_catalog = ConfiguredAirbyteCatalog.parse_obj(self.read_config(catalog_path))
        singer_catalog_path = SingerHelper.create_singer_catalog_with_selection(
            configured_catalog,
            discovered.singer_catalog,
        )

        command = self.read_cmd(
            logger,
            config_container.config_path,
            singer_catalog_path,
            state_path,
        )
        return SingerHelper.read(logger, command)
コード例 #4
0
    def read(self,
             logger: AirbyteLogger,
             config_container: ConfigContainer,
             catalog_path,
             state=None) -> Generator[AirbyteMessage, None, None]:
        """Yield a RECORD message for every record of every configured stream.

        Records come from ``client.read_stream`` for each stream listed in the
        catalog loaded from ``catalog_path``. ``state`` is accepted but unused.
        """
        api_client = self._get_client(config_container)
        configured = ConfiguredAirbyteCatalog.parse_obj(
            self.read_config(catalog_path))

        logger.info(f"Starting syncing {self.__class__.__name__}")
        for entry in configured.streams:
            yield from (
                AirbyteMessage(type=airbyte_protocol.Type.RECORD, record=item)
                for item in api_client.read_stream(entry.stream)
            )
        logger.info(f"Finished syncing {self.__class__.__name__}")
コード例 #5
0
ファイル: source.py プロジェクト: upnrunnHQ/airbyte
    def read(self,
             logger: AirbyteLogger,
             config_container: ConfigContainer,
             catalog_path,
             state=None) -> Generator[AirbyteMessage, None, None]:
        """Yield a RECORD message for each record of each configured stream.

        Records are obtained from ``self._read_record`` by stream name.
        ``state`` is accepted but unused.
        """
        api_client = self._client(config_container)
        raw_catalog = self.read_config(catalog_path)
        configured = ConfiguredAirbyteCatalog.parse_obj(raw_catalog)

        logger.info("Starting syncing mailchimp")
        for entry in configured.streams:
            stream_name = entry.stream.name
            for item in self._read_record(client=api_client, stream=stream_name):
                yield AirbyteMessage(type=Type.RECORD, record=item)

        logger.info("Finished syncing mailchimp")
コード例 #6
0
    def read(self, logger: AirbyteLogger, config_container, catalog_path, state=None) -> Generator[AirbyteMessage, None, None]:
        """Yield a RECORD message for each relevant row of each available sheet.

        Rows are fetched in batches of ``ROW_BATCH_SIZE + 1`` starting at row 2.
        A sheet is finished when a batch comes back without values or when an
        empty row is encountered. ``state`` is accepted but unused.
        """
        settings = config_container.rendered_config
        sheets_client = Helpers.get_authenticated_sheets_client(json.loads(settings["credentials_json"]))

        configured_catalog = ConfiguredAirbyteCatalog.parse_obj(self.read_config(catalog_path))

        requested_columns = Helpers.parse_sheet_and_column_names_from_catalog(configured_catalog)
        spreadsheet_id = settings["spreadsheet_id"]

        logger.info(f"Starting syncing spreadsheet {spreadsheet_id}")
        # Walk every available sheet batch by batch, emitting rows until a
        # blank row (or an empty batch) signals the end of the data.
        available_sheets = Helpers.get_available_sheets_to_column_index_to_name(sheets_client, spreadsheet_id, requested_columns)
        for sheet, column_index_to_name in available_sheets.items():
            logger.info(f"Syncing sheet {sheet}")
            next_row = 2  # row 1 is the header, so data starts on row 2
            hit_blank_row = False
            while not hit_blank_row:
                cell_range = f"{sheet}!{next_row}:{next_row + ROW_BATCH_SIZE}"
                logger.info(f"Fetching range {cell_range}")
                response = sheets_client.values().batchGet(
                    spreadsheetId=spreadsheet_id, ranges=cell_range, majorDimension="ROWS"
                ).execute()
                batch = SpreadsheetValues.parse_obj(response)
                next_row += ROW_BATCH_SIZE + 1

                # Only one range was requested, so only one is expected back.
                rows = batch.valueRanges[0].values
                if not rows:
                    break

                for row in rows:
                    if Helpers.is_row_empty(row):
                        hit_blank_row = True
                        break
                    if Helpers.row_contains_relevant_data(row, column_index_to_name.keys()):
                        yield AirbyteMessage(type=Type.RECORD, record=Helpers.row_data_to_record_message(sheet, row, column_index_to_name))
        logger.info(f"Finished syncing spreadsheet {spreadsheet_id}")
コード例 #7
0
ファイル: source.py プロジェクト: vinhdangphuc/airbyte
    def read(self,
             logger: AirbyteLogger,
             config_container: ConfigContainer,
             catalog_path,
             state=None) -> Generator[AirbyteMessage, None, None]:
        """Yield a RECORD message for every record of each recognized stream.

        Streams whose name is not listed in ``client.ENTITIES`` are skipped
        with a warning.

        :param logger: logger used for progress and warning messages.
        :param config_container: passed to ``self._client`` to build the client.
        :param catalog_path: path handed to ``self.read_config`` to load the
            configured catalog.
        :param state: accepted but unused.
        :return: generator of ``AirbyteMessage`` records.
        """
        client = self._client(config_container)

        config = self.read_config(catalog_path)
        catalog = ConfiguredAirbyteCatalog.parse_obj(config)

        logger.info("Starting syncing recurly")
        for configured_stream in catalog.streams:
            # TODO handle incremental syncs
            stream_name = configured_stream.stream.name
            if stream_name not in client.ENTITIES:
                # Report the name that failed the lookup, not the whole
                # stream object.
                logger.warn(
                    f"Stream '{stream_name}' not found in the recognized entities")
                continue
            for record in self._read_record(client=client, stream=stream_name):
                yield AirbyteMessage(type=Type.RECORD, record=record)

        logger.info("Finished syncing recurly")
コード例 #8
0
 def get_catalog(self) -> ConfiguredAirbyteCatalog:
     """Load ``configured_catalog.json`` from this class's top-level package and parse it."""
     top_level_package = self.__class__.__module__.partition(".")[0]
     catalog_bytes = pkgutil.get_data(top_level_package, "configured_catalog.json")
     parsed = json.loads(catalog_bytes)
     return ConfiguredAirbyteCatalog.parse_obj(parsed)
コード例 #9
0
 def get_catalog(self) -> ConfiguredAirbyteCatalog:
     """Load the file named by ``CONFIGURED_CATALOG_FILENAME`` from this class's top-level package and parse it."""
     top_level_package = self.__class__.__module__.partition(".")[0]
     catalog_bytes = pkgutil.get_data(top_level_package, self.CONFIGURED_CATALOG_FILENAME)
     parsed = json.loads(catalog_bytes)
     return ConfiguredAirbyteCatalog.parse_obj(parsed)
コード例 #10
0
ファイル: integration.py プロジェクト: subodh1810/airbyte
 def read_catalog(self, catalog_path: str) -> ConfiguredAirbyteCatalog:
     """Parse the result of ``self.read_config(catalog_path)`` into a ``ConfiguredAirbyteCatalog``."""
     raw_catalog = self.read_config(catalog_path)
     return ConfiguredAirbyteCatalog.parse_obj(raw_catalog)
コード例 #11
0
ファイル: source.py プロジェクト: tesla-avant/airbyte
 def read_catalog(self, catalog_path: str) -> ConfiguredAirbyteCatalog:
     """Parse the configured catalog and default ``reports_to_read`` from it.

     The catalog at ``catalog_path`` is parsed; if ``self.reports_to_read`` is
     falsy, it is set to the names of the catalog's streams.

     NOTE(review): the method returns ``catalog_path`` itself, not the parsed
     catalog, which does not match the return annotation. Confirm with the
     callers which of the two is intended before changing either.
     """
     catalog = ConfiguredAirbyteCatalog.parse_obj(
         self.read_config(catalog_path))
     if not self.reports_to_read:
         # Default to every stream listed in the configured catalog.
         self.reports_to_read = [i.stream.name for i in catalog.streams]
     return catalog_path