Example #1
 def read(self, logger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, Any]) -> Generator[AirbyteMessage, None, None]:
     """
     Returns a generator of the AirbyteMessages generated by reading the source with the given configuration, catalog, and state.
     """
     storage = SourceFile.get_storage_scheme(logger, config["provider"]["storage"], config["url"])
     url = SourceFile.get_simple_url(config["url"])
     name = SourceFile.get_stream_name(config)
     logger.info(f"Reading {name} ({storage}{url})...")
     selection = SourceFile.parse_catalog(catalog)
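     # Column selection parsed from the configured catalog; an empty selection means "sync all columns".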
     try:
         if "format" in config and config["format"] == "json":
             data_list = SourceFile.load_nested_json(config, logger)
             for data in data_list:
                 yield AirbyteMessage(
                     type=Type.RECORD,
                     record=AirbyteRecordMessage(stream=name, data=data, emitted_at=int(datetime.now().timestamp() * 1000)),
                 )
         else:
             df_list = SourceFile.load_dataframes(config, logger)
             for df in df_list:
                 if len(selection) > 0:
                     columns = selection.intersection(set(df.columns))
                 else:
                     columns = df.columns
                 df = df.replace(np.nan, "NaN", regex=True)
                 for data in df[columns].to_dict(orient="records"):
                     yield AirbyteMessage(
                         type=Type.RECORD,
                         record=AirbyteRecordMessage(stream=name, data=data, emitted_at=int(datetime.now().timestamp() * 1000)),
                     )
     except Exception as err:
         reason = f"Failed to read data of {name} at {storage}{url}: {repr(err)}\n{traceback.format_exc()}"
         logger.error(reason)
         raise err
Example #2
    def read(
        self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, Any]
    ) -> Generator[AirbyteMessage, None, None]:
        client = self._get_client(config)

        logger.info(f"Starting syncing {self.__class__.__name__}")
        total_state = {**state}
        for configured_stream in catalog.streams:
            stream_name = configured_stream.stream.name

            if client.stream_has_state(stream_name) and state.get(stream_name):
                logger.info(f"Set state of {stream_name} stream to {state.get(stream_name)}")
                client.set_stream_state(stream_name, state.get(stream_name))

            logger.info(f"Syncing {stream_name} stream")
            for record in client.read_stream(configured_stream.stream):
                now = int(datetime.now().timestamp() * 1000)
                message = AirbyteRecordMessage(stream=stream_name, data=record, emitted_at=now)
                yield AirbyteMessage(type=MessageType.RECORD, record=message)

            if client.stream_has_state(stream_name):
                total_state[stream_name] = client.get_stream_state(stream_name)
                # output state object only together with other stream states
                yield AirbyteMessage(type=MessageType.STATE, state=AirbyteStateMessage(data=total_state))

        logger.info(f"Finished syncing {self.__class__.__name__}")
Example #3
    def _read_stream(self, logger: AirbyteLogger, client: BaseClient,
                     configured_stream: ConfiguredAirbyteStream,
                     state: MutableMapping[str, Any]):
        stream_name = configured_stream.stream.name
        use_incremental = configured_stream.sync_mode == SyncMode.incremental and client.stream_has_state(stream_name)

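        # Restore the saved checkpoint only for streams that support state.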
        if use_incremental and state.get(stream_name):
            logger.info(
                f"Set state of {stream_name} stream to {state.get(stream_name)}"
            )
            client.set_stream_state(stream_name, state.get(stream_name))

        logger.info(f"Syncing {stream_name} stream")
        for record in client.read_stream(configured_stream.stream):
            now = int(datetime.now().timestamp() * 1000)
            message = AirbyteRecordMessage(stream=stream_name,
                                           data=record,
                                           emitted_at=now)
            yield AirbyteMessage(type=MessageType.RECORD, record=message)

        if use_incremental and client.get_stream_state(stream_name):
            state[stream_name] = client.get_stream_state(stream_name)
            # output state object only together with other stream states
            yield AirbyteMessage(type=MessageType.STATE,
                                 state=AirbyteStateMessage(data=state))
Example #4
    def read(self, logger, config_container, catalog_path, state_path=None) -> Generator[AirbyteMessage, None, None]:
        logger.info(f"Reading ({config_container.rendered_config_path}, {catalog_path}, {state_path})...")

        message = AirbyteRecordMessage(stream="love_airbyte", data={"love": True}, emitted_at=int(time.time() * 1000))
        yield AirbyteMessage(type="RECORD", record=message)

        state = AirbyteStateMessage(data={"love_cursor": "next_version"})
        yield AirbyteMessage(type="STATE", state=state)
Example #5
    def read(self,
             logger,
             config_container,
             catalog_path,
             state_path=None) -> Generator[AirbyteMessage, None, None]:
        """

        :param logger:
        :param config_container:
        :param catalog_path:
        :param state_path:
        :return:
        """
        config = config_container.rendered_config
        storage = SourceFile.get_storage_scheme(logger,
                                                config["provider"]["storage"],
                                                config["url"])
        url = SourceFile.get_simple_url(config["url"])
        name = SourceFile.get_stream_name(config)
        logger.info(
            f"Reading {name} ({storage}{url}, {catalog_path}, {state_path})..."
        )
        catalog = ConfiguredAirbyteCatalog.parse_obj(
            self.read_config(catalog_path))
        selection = SourceFile.parse_catalog(catalog)
        try:
            if "format" in config and config["format"] == "json":
                data_list = SourceFile.load_nested_json(config, logger)
                for data in data_list:
                    yield AirbyteMessage(
                        type=Type.RECORD,
                        record=AirbyteRecordMessage(
                            stream=name,
                            data=data,
                            emitted_at=int(datetime.now().timestamp() * 1000)),
                    )
            else:
                df_list = SourceFile.load_dataframes(config, logger)
                for df in df_list:
                    if len(selection) > 0:
                        columns = selection.intersection(set(df.columns))
                    else:
                        columns = df.columns
                    df = df.replace(np.nan, "NaN", regex=True)
                    for data in df[columns].to_dict(orient="records"):
                        yield AirbyteMessage(
                            type=Type.RECORD,
                            record=AirbyteRecordMessage(
                                stream=name,
                                data=data,
                                emitted_at=int(datetime.now().timestamp() * 1000)),
                        )
        except Exception as err:
            reason = f"Failed to read data of {name} at {storage}{url}: {repr(err)}\n{traceback.format_exc()}"
            logger.error(reason)
            raise err
Example #6
def log_line(line, default_level):
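    # If the line starts with a recognized log level (e.g. INFO, ERROR), use it and strip it from the message.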
    split_line = line.split()
    first_word = next(iter(split_line), None)
    if first_word in valid_log_types:
        log_level = first_word
        rendered_line = " ".join(split_line[1:])
    else:
        log_level = default_level
        rendered_line = line
    log_record = AirbyteLogMessage(level=log_level, message=rendered_line)
    log_message = AirbyteMessage(type="LOG", log=log_record)
    print(log_message.serialize())
Example #7
 def read(
     logger,
     shell_command,
     is_message=(lambda x: True),
     transform=(lambda x: x)
 ) -> Generator[AirbyteMessage, None, None]:
     with subprocess.Popen(shell_command,
                           shell=True,
                           stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE,
                           universal_newlines=True) as p:
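         # Multiplex stdout and stderr with selectors so neither pipe blocks the other.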
         sel = selectors.DefaultSelector()
         sel.register(p.stdout, selectors.EVENT_READ)
         sel.register(p.stderr, selectors.EVENT_READ)
         ok = True
         while ok:
             for key, _ in sel.select():
                 line = key.fileobj.readline()
                 if not line:
                     ok = False
                 elif key.fileobj is p.stdout:
                     out_json = to_json(line)
                     if out_json is not None and is_message(out_json):
                         transformed_json = transform(out_json)
                         if transformed_json is not None:
                             if transformed_json.get("type") in ("SCHEMA", "ACTIVATE_VERSION"):
                                 pass
                             elif transformed_json.get("type") == "STATE":
                                 out_record = AirbyteStateMessage(
                                     data=transformed_json["value"])
                                 out_message = AirbyteMessage(
                                     type=Type.STATE, state=out_record)
                                 yield transform(out_message)
                             else:
                                 # todo: check that messages match the discovered schema
                                 stream_name = transformed_json["stream"]
                                 out_record = AirbyteRecordMessage(
                                     stream=stream_name,
                                     data=transformed_json["record"],
                                     emitted_at=int(datetime.now().timestamp() * 1000),
                                 )
                                 out_message = AirbyteMessage(
                                     type=Type.RECORD, record=out_record)
                                 yield transform(out_message)
                     else:
                         logger.log_by_prefix(line, "INFO")
                 else:
                     logger.log_by_prefix(line, "ERROR")
Example #8
    def run(self,
            cmd,
            config=None,
            state=None,
            catalog=None,
            **kwargs) -> Iterable[AirbyteMessage]:
        self._runs += 1
        volumes = self._prepare_volumes(config, state, catalog)
        logs = self._client.containers.run(image=self._image,
                                           command=cmd,
                                           working_dir="/data",
                                           volumes=volumes,
                                           network="host",
                                           stdout=True,
                                           stderr=True,
                                           **kwargs)
        logging.info("Docker run: \n%s\ninput: %s\noutput: %s", cmd,
                     self.input_folder, self.output_folder)

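        # Keep the raw connector output on disk for post-run debugging.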
        with open(str(self.output_folder / "raw"), "wb+") as f:
            f.write(logs)

        for line in logs.decode("utf-8").splitlines():
            try:
                yield AirbyteMessage.parse_raw(line)
            except ValidationError as exc:
                logging.warning("Unable to parse connector's output %s", exc)
Example #9
    def read(self, logger: AirbyteLogger, config: json,
             catalog: ConfiguredAirbyteCatalog,
             state: Dict[str, Any]) -> Generator[AirbyteMessage, None, None]:

        logger.info("read called")

        url = config["url"]
        username = config["username"]
        key = config["access_token"]
        client = WSClient(url)
        login = client.do_login(username, key, withpassword=False)
        query = config["query"]
        logger.info(query)
        data = client.do_query(query)
        try:
            for single_dict in data:
                yield AirbyteMessage(
                    type=Type.RECORD,
                    record=AirbyteRecordMessage(
                        stream=DATASET_ITEMS_STREAM_NAME,
                        data=single_dict,
                        emitted_at=int(datetime.now().timestamp() * 1000)),
                )
        except Exception as err:
            reason = f"Failed to read data of {DATASET_ITEMS_STREAM_NAME} at {url}"
            logger.error(reason)
            raise err
Example #10
    def read(self, logger: AirbyteLogger, config: json,
             catalog: ConfiguredAirbyteCatalog,
             state: Dict[str, Any]) -> Generator[AirbyteMessage, None, None]:
        """
        Returns a generator of the AirbyteMessages generated by reading the source with the given configuration,
        catalog, and state.

        :param logger: Logging object to display debug/info/error to the logs
            (logs will not be accessible via airbyte UI if they are not passed to this logger)
        :param config: Json object containing the configuration of this source, content of this json is as specified in
            the properties of the spec.json file
        :param catalog: The input catalog is a ConfiguredAirbyteCatalog which is almost the same as AirbyteCatalog
            returned by discover(), but
        in addition, it's been configured in the UI! For each particular stream and field, there may have been provided
        with extra modifications such as: filtering streams and/or columns out, renaming some entities, etc
        :param state: When a Airbyte reads data from a source, it might need to keep a checkpoint cursor to resume
            replication in the future from that saved checkpoint.
            This is the object that is provided with state from previous runs and avoid replicating the entire set of
            data everytime.

        :return: A generator that produces a stream of AirbyteRecordMessage contained in AirbyteMessage object.
        """
        stream_name = "TableName"  # Example
        data = {"columnName": "Hello World"}  # Example

        # Not Implemented

        yield AirbyteMessage(
            type=Type.RECORD,
            record=AirbyteRecordMessage(
                stream=stream_name,
                data=data,
                emitted_at=int(datetime.now().timestamp() * 1000)),
        )
Example #11
    def read(
            self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, Any]
    ) -> Generator[AirbyteMessage, None, None]:
        """
        Returns a generator of the AirbyteMessages generated by reading the source with the given configuration,
        catalog, and state.

        :param logger: Logging object to display debug/info/error to the logs
            (logs will not be accessible via airbyte UI if they are not passed to this logger)
        :param config: Json object containing the configuration of this source, content of this json is as specified in
            the properties of the spec.json file
        :param catalog: The input catalog is a ConfiguredAirbyteCatalog which is almost the same as AirbyteCatalog
            returned by discover(), but
        in addition, it's been configured in the UI! For each particular stream and field, there may have been provided
        with extra modifications such as: filtering streams and/or columns out, renaming some entities, etc
        :param state: When a Airbyte reads data from a source, it might need to keep a checkpoint cursor to resume
            replication in the future from that saved checkpoint.
            This is the object that is provided with state from previous runs and avoid replicating the entire set of
            data everytime.

        :return: A generator that produces a stream of AirbyteRecordMessage contained in AirbyteMessage object.
        """

        for stream in catalog.streams:
            name = stream.stream.name
            key = stream.stream.name
            logger.debug(f'****** mode {stream.sync_mode} state={state}')
            if key == 'SiteMetaData':
                url = sitemetadata_url(config)
            elif key == 'WellScreens':
                url = screens_url(config)
            elif key == 'ManualGWL':
                url = manual_water_levels_url(config)
            elif key == 'PressureGWL':
                url = pressure_water_levels_url(config)
            elif key == 'AcousticGWL':
                url = acoustic_water_levels_url(config)
            else:
                continue

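            # Page through the endpoint, using the last record's OBJECTID as the cursor.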
            while True:
                objectid = state.get(key)
                if objectid:
                    curl = f'{url}?objectid={objectid}'
                else:
                    curl = url

                logger.info(f'fetching url={curl}')
                jobj = get_json(logger, curl)
                if jobj:
                    state[key] = jobj[-1]['OBJECTID']
                else:
                    break

                for di in jobj:
                    di['import_uuid'] = str(uuid.uuid4())
                    yield AirbyteMessage(
                        type=Type.RECORD,
                        record=AirbyteRecordMessage(stream=name, data=di,
                                                    emitted_at=int(datetime.now().timestamp() * 1000)))
Example #12
    def read(self, logger, config_container, catalog_path, state_path=None) -> Generator[AirbyteMessage, None, None]:
        """

        :param logger:
        :param config_container:
        :param catalog_path:
        :param state_path:
        :return:
        """
        config = config_container.rendered_config
        storage = SourceFile.get_storage_scheme(logger, config["storage"], config["url"])
        url = SourceFile.get_simple_url(config["url"])
        logger.info(f"Reading ({storage}{url}, {catalog_path}, {state_path})...")
        catalog = AirbyteCatalog.parse_obj(self.read_config(catalog_path))
        selection = SourceFile.parse_catalog(catalog)
        try:
            df_list = SourceFile.load_dataframes(config, logger)
            for df in df_list:
                columns = selection.intersection(set(df.columns))
                for data in df[columns].to_dict(orient="records"):
                    yield AirbyteMessage(
                        type=Type.RECORD,
                        record=AirbyteRecordMessage(stream=url, data=data, emitted_at=int(datetime.now().timestamp() * 1000)),
                    )
        except Exception as err:
            reason = f"Failed to read data of {storage}{url}: {repr(err)}\n{traceback.format_exc()}"
            logger.error(reason)
            raise err
Example #13
    def read(self,
             logger,
             config_container,
             catalog_path,
             state_path=None) -> Generator[AirbyteMessage, None, None]:
        logger.info(
            f'Reading ({config_container.rendered_config_path}, {catalog_path}, {state_path})...'
        )

        message = AirbyteRecordMessage(stream='love_airbyte',
                                       data={'love': True},
                                       emitted_at=int(time.time() * 1000))
        yield AirbyteMessage(type='RECORD', record=message)

        state = AirbyteStateMessage(data={'love_cursor': 'next_version'})
        yield AirbyteMessage(type='STATE', state=state)
Example #14
def expected_records_fixture(inputs, base_path) -> List[AirbyteMessage]:
    path = getattr(inputs, "expected_records_path")
    if not path:
        return []

    with open(str(base_path / path)) as f:
        return [AirbyteMessage.parse_raw(line) for line in f]
Example #15
    def read(self, logger: AirbyteLogger, config: json,
             catalog: ConfiguredAirbyteCatalog,
             state: Dict[str, Any]) -> Generator[AirbyteMessage, None, None]:
        """
        Returns a generator of the AirbyteMessages generated by reading the source with the given configuration,
        catalog, and state.

        :param logger: Logging object to display debug/info/error to the logs
            (logs will not be accessible via airbyte UI if they are not passed to this logger)
        :param config: Json object containing the configuration of this source, content of this json is as specified in
            the properties of the spec.json file
        :param catalog: The input catalog is a ConfiguredAirbyteCatalog which is almost the same as AirbyteCatalog
            returned by discover(), but
        in addition, it's been configured in the UI! For each particular stream and field, there may have been provided
        with extra modifications such as: filtering streams and/or columns out, renaming some entities, etc
        :param state: When a Airbyte reads data from a source, it might need to keep a checkpoint cursor to resume
            replication in the future from that saved checkpoint.
            This is the object that is provided with state from previous runs and avoid replicating the entire set of
            data everytime.

        :return: A generator that produces a stream of AirbyteRecordMessage contained in AirbyteMessage object.
        """

        for stream in catalog.streams:
            key = stream.stream.name
            prid = None
            if stream.sync_mode == SyncMode.incremental and key in state:
                prid = state.get(key)

            ret = self._get_records(logger, config, prid)
            if ret is not None:
                header, rid, records = ret
                if records:
                    for data in records:
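                        # Leave digit-only strings as-is; coerce other numeric strings to float.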
                        for k, v in data.items():
                            if v.isdigit():
                                continue

                            try:
                                data[k] = float(v)
                            except ValueError:
                                pass

                        record = AirbyteRecordMessage(
                            stream=key,
                            data=data,
                            emitted_at=int(datetime.now().timestamp() * 1000))
                        yield AirbyteMessage(type=Type.RECORD, record=record)

                    state[key] = rid
                    output_message = {"type": "STATE", "state": {"data": state}}
                    print(json.dumps(output_message))
Example #16
    def read(
        shell_command, is_message=(lambda x: True), transform=(lambda x: x)
    ) -> Generator[AirbyteMessage, None, None]:
        with subprocess.Popen(shell_command,
                              shell=True,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE,
                              bufsize=1,
                              universal_newlines=True) as p:
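            # Note: zip() consumes stdout and stderr in lockstep, so a busy stream can be held up by a quiet one.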
            for out_line, err_line in zip(p.stdout, p.stderr):

                if out_line:
                    out_json = to_json(out_line)
                    if out_json is not None and is_message(out_json):
                        transformed_json = transform(out_json)
                        if transformed_json is not None:
                            if transformed_json.get('type') == "SCHEMA":
                                pass
                            elif transformed_json.get('type') == "STATE":
                                out_record = AirbyteStateMessage(
                                    data=transformed_json["value"])
                                out_message = AirbyteMessage(type="STATE",
                                                             state=out_record)
                                yield transform(out_message)
                            else:
                                # todo: remove type from record
                                # todo: handle stream designation
                                # todo: check that messages match the discovered schema
                                stream_name = transformed_json["stream"]
                                out_record = AirbyteRecordMessage(
                                    stream=stream_name,
                                    data=transformed_json["record"],
                                    emitted_at=int(datetime.now().timestamp() * 1000))
                                out_message = AirbyteMessage(type="RECORD",
                                                             record=out_record)
                                yield transform(out_message)
                    else:
                        log_line(out_line, "INFO")

                if err_line:
                    log_line(err_line, "ERROR")
Example #17
    def read(
        self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, Any]
    ) -> Generator[AirbyteMessage, None, None]:
        client = self._get_client(config)

        logger.info(f"Starting syncing {self.__class__.__name__}")
        for configured_stream in catalog.streams:
            for record in client.read_stream(configured_stream.stream):
                yield AirbyteMessage(type=airbyte_protocol.Type.RECORD, record=record)
        logger.info(f"Finished syncing {self.__class__.__name__}")
Example #18
 def _airbyte_message_from_json(
         transformed_json: Mapping[str, Any]) -> Optional[AirbyteMessage]:
     if transformed_json is None or transformed_json.get("type") in ("SCHEMA", "ACTIVATE_VERSION"):
         return None
     elif transformed_json.get("type") == "STATE":
         out_record = AirbyteStateMessage(data=transformed_json["value"])
         out_message = AirbyteMessage(type=Type.STATE, state=out_record)
     else:
         # todo: check that messages match the discovered schema
         stream_name = transformed_json["stream"]
         out_record = AirbyteRecordMessage(
             stream=stream_name,
             data=transformed_json["record"],
             emitted_at=int(datetime.now().timestamp() * 1000),
         )
         out_message = AirbyteMessage(type=Type.RECORD, record=out_record)
     return out_message
Example #19
    def read(self, logger: AirbyteLogger, config_container, catalog_path, state=None) -> Generator[AirbyteMessage, None, None]:
        r = self._make_request(config_container.rendered_config)
        if r.status_code != 200:
            raise Exception(f"Request failed. {r.text}")

        # need to eagerly fetch the json.
        message = AirbyteMessage(
            type=Type.RECORD,
            record=AirbyteRecordMessage(stream=SourceRestApi.STREAM_NAME, data=r.json(), emitted_at=int(datetime.now().timestamp() * 1000)),
        )
        return (m for m in [message])
Example #20
    def read(self,
             logger: AirbyteLogger,
             config_container,
             catalog_path,
             state=None) -> Generator[AirbyteMessage, None, None]:
        config = config_container.rendered_config
        client = Helpers.get_authenticated_sheets_client(
            json.loads(config["credentials_json"]))

        catalog = AirbyteCatalog.parse_obj(self.read_config(catalog_path))

        sheet_to_column_name = Helpers.parse_sheet_and_column_names_from_catalog(
            catalog)
        spreadsheet_id = config["spreadsheet_id"]

        logger.info(f"Starting syncing spreadsheet {spreadsheet_id}")
        # For each sheet in the spreadsheet, get a batch of rows, and as long as there hasn't been
        # a blank row, emit the row batch
        sheet_to_column_index_to_name = Helpers.get_available_sheets_to_column_index_to_name(
            client, spreadsheet_id, sheet_to_column_name)
        for sheet in sheet_to_column_index_to_name.keys():
            logger.info(f"Syncing sheet {sheet}")
            column_index_to_name = sheet_to_column_index_to_name[sheet]
            row_cursor = 2  # we start syncing past the header row
            encountered_blank_row = False
            while not encountered_blank_row:
                range = f"{sheet}!{row_cursor}:{row_cursor + ROW_BATCH_SIZE}"
                logger.info(f"Fetching range {range}")
                row_batch = SpreadsheetValues.parse_obj(
                    client.values().batchGet(spreadsheetId=spreadsheet_id,
                                             ranges=range,
                                             majorDimension="ROWS").execute())
                row_cursor += ROW_BATCH_SIZE + 1
                # there should always be one range since we requested only one
                value_ranges = row_batch.valueRanges[0]

                if not value_ranges.values:
                    break

                row_values = value_ranges.values
                if len(row_values) == 0:
                    break

                for row in row_values:
                    if Helpers.is_row_empty(row):
                        encountered_blank_row = True
                        break
                    elif Helpers.row_contains_relevant_data(
                            row, column_index_to_name.keys()):
                        yield AirbyteMessage(
                            type=Type.RECORD,
                            record=Helpers.row_data_to_record_message(
                                sheet, row, column_index_to_name))
        logger.info(f"Finished syncing spreadsheet {spreadsheet_id}")
Example #21
    def _read_stream(
            self, logger: AirbyteLogger, stream_instance: Stream,
            configured_stream: ConfiguredAirbyteStream,
            state: MutableMapping[str, Any]) -> Iterator[AirbyteMessage]:
        stream_name = configured_stream.stream.name
        use_incremental = configured_stream.sync_mode == SyncMode.incremental and stream_instance.supports_incremental

        stream_state = {}
        if use_incremental and state.get(stream_name):
            logger.info(
                f"Set state of {stream_name} stream to {state.get(stream_name)}"
            )
            stream_state = state.get(stream_name)

        logger.info(f"Syncing stream: {stream_name} ")
        record_counter = 0
        for record in stream_instance.read_stream(
                configured_stream=configured_stream,
                stream_state=copy.deepcopy(stream_state)):
            now_millis = int(datetime.now().timestamp() * 1000)
            message = AirbyteRecordMessage(stream=stream_name,
                                           data=record,
                                           emitted_at=now_millis)
            yield AirbyteMessage(type=MessageType.RECORD, record=message)

            record_counter += 1
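            # Checkpoint incremental state every state_checkpoint_interval records.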
            if use_incremental:
                stream_state = stream_instance.get_updated_state(
                    stream_state, record)
                if record_counter % stream_instance.state_checkpoint_interval == 0:
                    state[stream_name] = stream_state
                    yield AirbyteMessage(type=MessageType.STATE,
                                         state=AirbyteStateMessage(data=state))

        if use_incremental and stream_state:
            state[stream_name] = stream_state
            # output state object only together with other stream states
            yield AirbyteMessage(type=MessageType.STATE,
                                 state=AirbyteStateMessage(data=state))
Example #22
    def read(self, logger: AirbyteLogger, config: json,
             catalog: ConfiguredAirbyteCatalog,
             state: Dict[str, Any]) -> Generator[AirbyteMessage, None, None]:
        client = GoogleSheetsClient(json.loads(config["credentials_json"]))

        sheet_to_column_name = Helpers.parse_sheet_and_column_names_from_catalog(
            catalog)
        spreadsheet_id = config["spreadsheet_id"]

        logger.info(f"Starting syncing spreadsheet {spreadsheet_id}")
        # For each sheet in the spreadsheet, get a batch of rows, and as long as there hasn't been
        # a blank row, emit the row batch
        sheet_to_column_index_to_name = Helpers.get_available_sheets_to_column_index_to_name(
            client, spreadsheet_id, sheet_to_column_name)
        sheet_row_counts = Helpers.get_sheet_row_count(client, spreadsheet_id)
        logger.info(f"Row counts: {sheet_row_counts}")
        for sheet in sheet_to_column_index_to_name.keys():
            logger.info(f"Syncing sheet {sheet}")
            column_index_to_name = sheet_to_column_index_to_name[sheet]
            row_cursor = 2  # we start syncing past the header row
            # The loop requires the first row of the requested interval to exist when we call the API;
            # if the last row of the interval falls outside the sheet, that is fine: the API returns
            # only the rows that actually exist, and the next iteration exits the loop.
            while row_cursor <= sheet_row_counts[sheet]:
                range = f"{sheet}!{row_cursor}:{row_cursor + ROW_BATCH_SIZE}"
                logger.info(f"Fetching range {range}")
                row_batch = SpreadsheetValues.parse_obj(
                    client.get_values(spreadsheetId=spreadsheet_id,
                                      ranges=range,
                                      majorDimension="ROWS"))

                row_cursor += ROW_BATCH_SIZE + 1
                # there should always be one range since we requested only one
                value_ranges = row_batch.valueRanges[0]

                if not value_ranges.values:
                    break

                row_values = value_ranges.values
                if len(row_values) == 0:
                    break

                for row in row_values:
                    if not Helpers.is_row_empty(row) and Helpers.row_contains_relevant_data(
                            row, column_index_to_name.keys()):
                        yield AirbyteMessage(
                            type=Type.RECORD,
                            record=Helpers.row_data_to_record_message(
                                sheet, row, column_index_to_name))
        logger.info(f"Finished syncing spreadsheet {spreadsheet_id}")
Example #23
    def read(self, logger: AirbyteLogger, config: json,
             catalog: ConfiguredAirbyteCatalog,
             state: Dict[str, Any]) -> Generator[AirbyteMessage, None, None]:
        client = self._get_client(config)

        logger.info(f"Starting syncing {self.__class__.__name__}")
        for configured_stream in catalog.streams:
            stream = configured_stream.stream
            if stream.name not in client.ENTITY_MAP.keys():
                continue
            logger.info(f"Syncing {stream.name} stream")
            for record in self._read_record(client=client, stream=stream.name):
                yield AirbyteMessage(type=Type.RECORD, record=record)
        logger.info(f"Finished syncing {self.__class__.__name__}")
Example #24
    def read(self, logger: AirbyteLogger, config: json,
             catalog: ConfiguredAirbyteCatalog,
             state: Dict[str, Any]) -> Generator[AirbyteMessage, None, None]:

        access_token = config["access_token"]
        spreadsheet_id = config["spreadsheet_id"]
        smartsheet_client = smartsheet.Smartsheet(access_token)

        for configured_stream in catalog.streams:
            stream = configured_stream.stream
            properties = stream.json_schema["properties"]
            if isinstance(properties, list):
                columns = tuple(key for dct in properties
                                for key in dct.keys())
            elif isinstance(properties, dict):
                columns = tuple(i for i in properties.keys())
            else:
                logger.error("Could not read properties from the JSONschema in this stream")
                continue  # without usable properties we cannot map row values to columns
            name = stream.name

            try:
                sheet = smartsheet_client.Sheets.get_sheet(spreadsheet_id)
                sheet = json.loads(str(sheet))  # make it subscriptable
                logger.info(f"Starting syncing spreadsheet {sheet['name']}")
                logger.info(f"Row count: {sheet['totalRowCount']}")

                for row in sheet["rows"]:
                    values = tuple(i["value"] for i in row["cells"])
                    try:
                        data = dict(zip(columns, values))

                        yield AirbyteMessage(
                            type=Type.RECORD,
                            record=AirbyteRecordMessage(
                                stream=name,
                                data=data,
                                emitted_at=int(datetime.now().timestamp() * 1000)),
                        )
                    except Exception as e:
                        logger.error(
                            f"Unable to encode row into an AirbyteMessage with the following error: {e}"
                        )

            except Exception as e:
                logger.error(f"Could not read smartsheet: {name}")
                raise e
        logger.info(f"Finished syncing spreadsheet with ID: {spreadsheet_id}")
Example #25
    def read(self,
             logger: AirbyteLogger,
             config_container: ConfigContainer,
             catalog_path,
             state=None) -> Generator[AirbyteMessage, None, None]:
        client = self._get_client(config_container)

        raw_catalog = self.read_config(catalog_path)
        catalog = ConfiguredAirbyteCatalog.parse_obj(raw_catalog)

        logger.info(f"Starting syncing {self.__class__.__name__}")
        for configured_stream in catalog.streams:
            for record in client.read_stream(configured_stream.stream):
                yield AirbyteMessage(type=airbyte_protocol.Type.RECORD,
                                     record=record)
        logger.info(f"Finished syncing {self.__class__.__name__}")
Example #26
    def read(
        self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, Any]
    ) -> Generator[AirbyteMessage, None, None]:
        r = self._make_request(config)
        if r.status_code != 200:
            raise Exception(f"Request failed. {r.text}")

        # need to eagerly fetch the json.
        message = AirbyteMessage(
            type=Type.RECORD,
            record=AirbyteRecordMessage(
                stream=SourceHttpRequest.STREAM_NAME, data=r.json(), emitted_at=int(datetime.now().timestamp() * 1000)
            ),
        )

        return (m for m in [message])
Example #27
    def read(
        self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, Any]
    ) -> Generator[AirbyteMessage, None, None]:
        client = self._client(config)

        logger.info("Starting syncing sendgrid")
        for configured_stream in catalog.streams:
            # TODO handle incremental syncs
            stream = configured_stream.stream
            if stream.name not in client.ENTITY_MAP.keys():
                logger.warn(f"Stream '{stream}' not found in the recognized entities")
                continue
            for record in self._read_record(client=client, stream=stream.name):
                yield AirbyteMessage(type=Type.RECORD, record=record)

        logger.info("Finished syncing sendgrid")
Example #28
    def read(self,
             logger: AirbyteLogger,
             config_container: ConfigContainer,
             catalog_path,
             state=None) -> Generator[AirbyteMessage, None, None]:
        client = self._client(config_container)

        catalog = ConfiguredAirbyteCatalog.parse_obj(
            self.read_config(catalog_path))

        logger.info("Starting syncing mailchimp")
        for configured_stream in catalog.streams:
            stream = configured_stream.stream
            for record in self._read_record(client=client, stream=stream.name):
                yield AirbyteMessage(type=Type.RECORD, record=record)

        logger.info("Finished syncing mailchimp")
Example #29
    def read(
        self, logger: AirbyteLogger, config: json, catalog: ConfiguredAirbyteCatalog, state: Dict[str, Any]
    ) -> Generator[AirbyteMessage, None, None]:
        """
        Returns a generator of the AirbyteMessages generated by reading the source with the given configuration,
        catalog, and state.

        :param logger: Logging object to display debug/info/error to the logs
            (logs will not be accessible via airbyte UI if they are not passed to this logger)
        :param config: Json object containing the configuration of this source, content of this json is as specified in
            the properties of the spec.json file
        :param catalog: The input catalog is a ConfiguredAirbyteCatalog which is almost the same as AirbyteCatalog
            returned by discover(), but
        in addition, it's been configured in the UI! For each particular stream and field, there may have been provided
        with extra modifications such as: filtering streams and/or columns out, renaming some entities, etc
        :param state: When a Airbyte reads data from a source, it might need to keep a checkpoint cursor to resume
            replication in the future from that saved checkpoint.
            This is the object that is provided with state from previous runs and avoid replicating the entire set of
            data everytime.

        :return: A generator that produces a stream of AirbyteRecordMessage contained in AirbyteMessage object.
        """
        coda_token = config["api_key"]
        headers = {'Authorization': f'Bearer {config["api_key"]}'}
        docs_uri = 'https://coda.io/apis/v1/docs'
        docs_params = {'isOwner': True}

        stream_name = "CodaRows"  # Example
        #data = {"columnName": {"Hello World": "hi"}}
        data_res = self._api_call(docs_uri, coda_token, headers)
        data = data_res

        yield AirbyteMessage(
            type=Type.RECORD,
            record=AirbyteRecordMessage(stream=stream_name, data=data, emitted_at=int(datetime.now().timestamp() * 1000)),
        )
    # ********************** END - Implementing read connection *************************


# from airbyte-integrations/connectors/source-<source-name>
# python main_dev.py spec
# python main_dev.py check --config secrets/config.json
# python main_dev.py discover --config secrets/config.json
# python main_dev.py read --config secrets/config.json --catalog sample_files/configured_catalog.json
# python main_dev.py read --config secrets/config.json --catalog source_code_connector/schema/configured_catalog.json
Example #30
    def read(self, logger: AirbyteLogger, config: json,
             catalog: ConfiguredAirbyteCatalog,
             state: Dict[str, Any]) -> Generator[AirbyteMessage, None, None]:
        client = self._get_client(config)

        logger.info(f"Starting syncing {self.__class__.__name__}")
        for configured_stream in catalog.streams:
            stream = configured_stream.stream
            if stream.name not in client.ENTITY_MAP.keys():
                continue
            try:
                for record in self._read_record(client=client,
                                                stream=stream.name):
                    yield AirbyteMessage(type=Type.RECORD, record=record)
            except requests.exceptions.RequestException as e:
                error = json.loads(e.args[0])["error"]
                logger.error(
                    f"Get {stream.name} error. Error: {error['code']} {error['message']}"
                )
        logger.info(f"Finished syncing {self.__class__.__name__}")