Example #1
def test_infer_schemas():
    expected_schema = {
        "$schema": "http://json-schema.org/schema#",
        "properties": {
            "a": {
                "type": "integer"
            },
            "b": {
                "type": "string"
            }
        },
        "type": "object",
    }

    with tempfile.TemporaryDirectory() as temp_dir:
        os.chdir(temp_dir)
        record = {"a": 1, "b": "test"}
        record_message = AirbyteMessage(type=Type.RECORD,
                                        record=AirbyteRecordMessage(
                                            stream="stream",
                                            data=record,
                                            emitted_at=111)).json()
        sys.stdin = io.StringIO(record_message)
        infer_schemas()
        assert os.path.exists("schemas/stream.json")

        with open("schemas/stream.json") as f:
            schema = json.loads(f.read())
            assert schema == expected_schema
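The "$schema" URL in the expected output above is the default emitted by genson, which schema-inference helpers like this commonly build on. A minimal sketch of the inference core, under the assumption that infer_schemas() delegates to genson (the helper's body is not shown here):

# Hypothetical reconstruction of the inference step using genson.
from genson import SchemaBuilder

builder = SchemaBuilder()
builder.add_object({"a": 1, "b": "test"})  # feed one record per stream
schema = builder.to_schema()
# -> {"$schema": "http://json-schema.org/schema#", "type": "object",
#     "properties": {"a": {"type": "integer"}, "b": {"type": "string"}}}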
Example #2
    def read(self, logger: AirbyteLogger, config: Mapping[str, Any],
             catalog: ConfiguredAirbyteCatalog,
             state: Dict[str, Any]) -> Generator[AirbyteMessage, None, None]:
        try:
            for configured_stream in catalog.streams:
                # Resolve the name first so that both sync-mode branches and
                # the error handler below can reference it safely.
                stream_name = configured_stream.stream.name
                if configured_stream.sync_mode == SyncMode.full_refresh:
                    reader = Reader(logger, config)
                    table_client = reader.get_table_client(stream_name)
                    logger.info(f"Reading data from stream '{stream_name}'")

                    for row in reader.read(table_client, None):
                        # Timestamp property is in metadata object
                        # row.metadata.timestamp
                        row["additionalProperties"] = True
                        yield AirbyteMessage(
                            type=Type.RECORD,
                            record=AirbyteRecordMessage(
                                stream=stream_name,
                                data=row,
                                emitted_at=int(datetime.now().timestamp()) *
                                1000),
                        )
                if configured_stream.sync_mode == SyncMode.incremental:
                    logger.warn(
                        f"Incremental sync is not supported by stream {stream_name}"
                    )

        except Exception as err:
            reason = f"Failed to read data of {stream_name}: {repr(err)}\n{traceback.format_exc()}"
            logger.error(reason)
            raise err
Example #3
def airbyte_message_from_data(raw_data: List[Any], columns: List[str],
                              table_name: str) -> Optional[AirbyteMessage]:
    """
    Wrap data into an AirbyteMessage.

    :param raw_data: Raw data row returned from a fetch query. Each item in the list
        represents a row of data.
        Example: [10, "Oranges"]
    :param columns: List of column names
        Example: ["Quantity", "Fruit"]
    :param table_name: Name of a table where data was fetched from

    :return: AirbyteMessage containing parsed data
    """
    raw_data = format_fetch_result(raw_data)
    data = dict(zip(columns, raw_data))
    # Drop entries whose value is None
    data = {k: v for k, v in data.items() if v is not None}
    if not data:
        return None
    return AirbyteMessage(
        type=Type.RECORD,
        record=AirbyteRecordMessage(
            stream=table_name,
            data=data,
            emitted_at=int(datetime.now().timestamp()) * 1000,
        ),
    )
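format_fetch_result is called above but not shown. A plausible sketch of such a helper, converting driver-specific values into JSON-serializable ones (hypothetical; the real implementation may differ):

from datetime import date, datetime
from decimal import Decimal
from typing import Any, List

def format_fetch_result(raw_data: List[Any]) -> List[Any]:
    # Convert DB driver types that json cannot serialize directly.
    formatted: List[Any] = []
    for value in raw_data:
        if isinstance(value, Decimal):
            formatted.append(float(value))
        elif isinstance(value, (date, datetime)):
            formatted.append(value.isoformat())
        else:
            formatted.append(value)
    return formatted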
Example #4
def test_read(schema, record, should_fail):
    catalog = ConfiguredAirbyteCatalog(streams=[
        ConfiguredAirbyteStream(
            stream=AirbyteStream.parse_obj({
                "name": "test_stream",
                "json_schema": schema
            }),
            sync_mode="full_refresh",
            destination_sync_mode="overwrite",
        )
    ])
    input_config = BasicReadTestConfig()
    docker_runner_mock = MagicMock()
    docker_runner_mock.call_read.return_value = [
        AirbyteMessage(type=Type.RECORD,
                       record=AirbyteRecordMessage(stream="test_stream",
                                                   data=record,
                                                   emitted_at=111))
    ]
    t = _TestBasicRead()
    if should_fail:
        with pytest.raises(
                AssertionError,
                match="stream should have some fields mentioned by json schema"
        ):
            t.test_read(None, catalog, input_config, [], docker_runner_mock,
                        MagicMock())
    else:
        t.test_read(None, catalog, input_config, [], docker_runner_mock,
                    MagicMock())
Example #5
    def _read_stream(self, logger: AirbyteLogger, client: BaseClient,
                     configured_stream: ConfiguredAirbyteStream,
                     state: MutableMapping[str, Any]):
        stream_name = configured_stream.stream.name
        use_incremental = configured_stream.sync_mode == SyncMode.incremental and client.stream_has_state(
            stream_name)

        if use_incremental and state.get(stream_name):
            logger.info(
                f"Set state of {stream_name} stream to {state.get(stream_name)}"
            )
            client.set_stream_state(stream_name, state.get(stream_name))

        logger.info(f"Syncing {stream_name} stream")
        for record in client.read_stream(configured_stream.stream):
            now = int(datetime.now().timestamp()) * 1000
            message = AirbyteRecordMessage(stream=stream_name,
                                           data=record,
                                           emitted_at=now)
            yield AirbyteMessage(type=MessageType.RECORD, record=message)

        if use_incremental and client.get_stream_state(stream_name):
            state[stream_name] = client.get_stream_state(stream_name)
            # output state object only together with other stream states
            yield AirbyteMessage(type=MessageType.STATE,
                                 state=AirbyteStateMessage(data=state))
Example #6
    def read(self, logger: AirbyteLogger, config: Mapping[str, Any],
             catalog: ConfiguredAirbyteCatalog,
             state: Dict[str, Any]) -> Generator[AirbyteMessage, None, None]:
        """
        Returns a generator of the AirbyteMessages generated by reading the source with the given configuration,
        catalog, and state.

        :param logger: Logging object to display debug/info/error to the logs
            (logs will not be accessible via airbyte UI if they are not passed to this logger)
        :param config: Json object containing the configuration of this source; its content is specified in
            the properties of the spec.json file
        :param catalog: The input catalog is a ConfiguredAirbyteCatalog, which is almost the same as the
            AirbyteCatalog returned by discover(), except that it has been configured in the UI. Each stream
            and field may carry extra modifications, such as filtering streams and/or columns out, renaming
            some entities, etc.
        :param state: When Airbyte reads data from a source, it might need to keep a checkpoint cursor to
            resume replication from that saved checkpoint in the future. This object provides the state from
            previous runs and avoids replicating the entire data set every time.

        :return: A generator that produces a stream of AirbyteRecordMessages contained in AirbyteMessage objects.
        """
        stream_name = "TableName"  # Example
        data = {"columnName": "Hello World"}  # Example

        # Not Implemented

        yield AirbyteMessage(
            type=Type.RECORD,
            record=AirbyteRecordMessage(
                stream=stream_name,
                data=data,
                emitted_at=int(datetime.now().timestamp()) * 1000),
        )
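Downstream, each yielded AirbyteMessage is written as one JSON line on stdout. A minimal sketch of that step, assuming the Pydantic-based models from the Airbyte CDK (where .json() is available on the message):

import sys

def emit(message: AirbyteMessage) -> None:
    # One serialized message per line, as the Airbyte protocol expects.
    sys.stdout.write(message.json(exclude_unset=True) + "\n")
    sys.stdout.flush()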
Example #7
def test_validate_records_format(record, configured_catalog, valid):
    records = [AirbyteRecordMessage(stream="my_stream", data=record, emitted_at=0)]
    streams_with_errors = verify_records_schema(records, configured_catalog)
    if valid:
        assert not streams_with_errors
    else:
        assert streams_with_errors, f"Record {record} should produce errors against {configured_catalog.streams[0].stream.json_schema}"
Example #8
def expected_records_fixture(inputs, base_path) -> List[AirbyteRecordMessage]:
    expect_records = getattr(inputs, "expect_records")
    if not expect_records:
        return []

    with open(str(base_path / getattr(expect_records, "path"))) as f:
        return [AirbyteRecordMessage.parse_raw(line) for line in f]
Example #9
def _record(stream: str, str_value: str, int_value: int) -> AirbyteMessage:
    return AirbyteMessage(type=Type.RECORD,
                          record=AirbyteRecordMessage(stream=stream,
                                                      data={
                                                          "str_col": str_value,
                                                          "int_col": int_value
                                                      },
                                                      emitted_at=0))
Example #10
def retrieve_all_records(client):
    return [
        AirbyteMessage(type=Type.RECORD,
                       record=AirbyteRecordMessage(stream=collection.id,
                                                   data=doc.to_dict(),
                                                   emitted_at=0))
        for collection in client.collections() for doc in collection.order_by(
            "int_col", direction=firestore.Query.ASCENDING).stream()
    ]
Example #11
def record_message_from_record(record_: Dict) -> List[AirbyteMessage]:
    return [
        AirbyteMessage(
            type=Type.RECORD,
            record=AirbyteRecordMessage(stream="test_stream",
                                        data=record_,
                                        emitted_at=111),
        )
    ]
Example #12
def _record(stream: str, data: Dict[str, Any],
            seller_id: str) -> AirbyteMessage:
    now = int(datetime.now().timestamp()) * 1000
    if seller_id:
        data["seller_id"] = seller_id
    return AirbyteMessage(type=Type.RECORD,
                          record=AirbyteRecordMessage(stream=stream,
                                                      data=data,
                                                      emitted_at=now))
Example #13
    def _as_airbyte_record(self, stream_name: str, data: Mapping[str, Any]):
        now_millis = int(datetime.now().timestamp() * 1000)
        transformer, schema = self._get_stream_transformer_and_schema(stream_name)
        # Transform object fields according to config. Most likely you will
        # need it to normalize values against the json schema. By default no
        # action is taken unless configured. See
        # docs/connector-development/cdk-python/schemas.md for details.
        transformer.transform(data, schema)
        message = AirbyteRecordMessage(stream=stream_name, data=data, emitted_at=now_millis)
        return AirbyteMessage(type=MessageType.RECORD, record=message)
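The _get_stream_transformer_and_schema helper above is not shown. One plausible shape for it, assuming the Airbyte CDK's TypeTransformer and a hypothetical self._schemas lookup:

from airbyte_cdk.sources.utils.transform import TransformConfig, TypeTransformer

def _get_stream_transformer_and_schema(self, stream_name: str):
    # Normalize record values against the stream's JSON schema by default.
    transformer = TypeTransformer(TransformConfig.DefaultSchemaNormalization)
    schema = self._schemas[stream_name]  # hypothetical: stream name -> JSON schema
    return transformer, schema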
Example #14
def records_fixture():
    return [
        AirbyteMessage(
            type=Type.RECORD,
            record=AirbyteRecordMessage(
                stream="my_stream",
                data={"id": 1, "ts_created": "2015-11-01T22:03:11", "nested": {"ts_updated": "2015-05-01"}},
                emitted_at=0,
            ),
        )
    ]
Example #15
def airbyte_message2() -> AirbyteMessage:
    return AirbyteMessage(
        type=Type.RECORD,
        record=AirbyteRecordMessage(
            stream="table2",
            data={
                "key1": "value2",
                "key2": 3
            },
            emitted_at=int(datetime.now().timestamp()) * 1000,
        ),
    )
Example #16
def airbyte_message1(test_table_name: str):
    return AirbyteMessage(
        type=Type.RECORD,
        record=AirbyteRecordMessage(
            stream=test_table_name,
            data={
                "key1": "value1",
                "key2": 2
            },
            emitted_at=int(datetime.now().timestamp()) * 1000,
        ),
    )
Example #17
def generate_record(stream: ConfiguredAirbyteStream, data: Dict[str, Any]) -> AirbyteMessage:
    record_data = data.copy()  # copy so the caller's dict is not mutated

    # timestamps need to be emitted in ISO format
    for key in record_data:
        if isinstance(record_data[key], datetime.datetime):
            record_data[key] = record_data[key].isoformat()

    return AirbyteMessage(
        type=Type.RECORD,
        record=AirbyteRecordMessage(stream=stream.stream.name, data=record_data, emitted_at=int(datetime.datetime.now().timestamp()) * 1000),
    )
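The ISO conversion above matters because datetime objects are not JSON-serializable, so emitting the record would otherwise fail at serialization time. A quick illustration:

import datetime
import json

row = {"created_at": datetime.datetime(2021, 1, 1, 12, 0)}
# json.dumps(row) would raise TypeError: Object of type datetime is not JSON serializable
row["created_at"] = row["created_at"].isoformat()
print(json.dumps(row))  # {"created_at": "2021-01-01T12:00:00"}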
Example #18
    def read(
        self,
        logger: logging.Logger,
        config: Mapping[str, Any],
        catalog: ConfiguredAirbyteCatalog,
        state: MutableMapping[str, Any] = None
    ) -> Generator[AirbyteMessage, None, None]:
        """
        Returns a generator of the AirbyteMessages generated by reading the source with the given configuration,
        catalog, and state.

        :param logger: Logging object to display debug/info/error to the logs
            (logs will not be accessible via airbyte UI if they are not passed to this logger)
        :param config: Json object containing the configuration of this source; its content is specified in
            the properties of the spec.json/spec.yaml file
        :param catalog: The input catalog is a ConfiguredAirbyteCatalog, which is almost the same as the
            AirbyteCatalog returned by discover(), except that it has been configured in the UI. Each stream
            and field may carry extra modifications, such as filtering streams and/or columns out, renaming
            some entities, etc.
        :param state: When Airbyte reads data from a source, it might need to keep a checkpoint cursor to
            resume replication from that saved checkpoint in the future. This object provides the state from
            previous runs and avoids replicating the entire data set every time.

        :return: A generator that produces a stream of AirbyteRecordMessages contained in AirbyteMessage objects.
        """
        state = state or {}  # the parameter defaults to None, so guard before .get()
        report_name = config.get("report_name")

        response = self._run_report(config)
        rows = Client.response_to_list(response)

        last_cursor_value = state.get(report_name,
                                      {}).get(DEFAULT_CURSOR_FIELD, "")

        for row in rows:
            if last_cursor_value <= row[DEFAULT_CURSOR_FIELD]:
                yield AirbyteMessage(
                    type=Type.RECORD,
                    record=AirbyteRecordMessage(
                        stream=report_name,
                        data=row,
                        emitted_at=int(datetime.now().timestamp()) * 1000),
                )

                last_cursor_value = row[DEFAULT_CURSOR_FIELD]

        yield AirbyteMessage(
            type=Type.STATE,
            state=AirbyteStateMessage(
                data={report_name: {
                    DEFAULT_CURSOR_FIELD: last_cursor_value
                }}))
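The cursor check above compares raw string values with <=, which works only because ISO-8601 date strings sort lexicographically in the same order as they do chronologically. A one-line illustration:

# Lexicographic order matches chronological order for ISO-8601 strings:
assert "2021-01-02" > "2021-01-01" and "2021-02-01" > "2021-01-31"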
Example #19
    def read(self, logger: AirbyteLogger, config: Mapping[str, Any],
             catalog: ConfiguredAirbyteCatalog,
             state: Dict[str, Any]) -> Generator[AirbyteMessage, None, None]:

        access_token = config["access_token"]
        spreadsheet_id = config["spreadsheet_id"]
        smartsheet_client = smartsheet.Smartsheet(access_token)

        for configured_stream in catalog.streams:
            stream = configured_stream.stream
            properties = stream.json_schema["properties"]
            if isinstance(properties, list):
                columns = tuple(key for dct in properties
                                for key in dct.keys())
            elif isinstance(properties, dict):
                columns = tuple(properties.keys())
            else:
                logger.error(
                    "Could not read properties from the JSONschema in this stream"
                )
                continue  # without properties, `columns` would be unbound below
            name = stream.name

            try:
                sheet = smartsheet_client.Sheets.get_sheet(spreadsheet_id)
                sheet = json.loads(str(sheet))  # make it subscriptable
                logger.info(f"Starting syncing spreadsheet {sheet['name']}")
                logger.info(f"Row count: {sheet['totalRowCount']}")

                for row in sheet["rows"]:
                    values = tuple(i["value"] if "value" in i else ""
                                   for i in row["cells"])
                    try:
                        data = dict(zip(columns, values))

                        yield AirbyteMessage(
                            type=Type.RECORD,
                            record=AirbyteRecordMessage(
                                stream=name,
                                data=data,
                                emitted_at=int(datetime.now().timestamp()) *
                                1000),
                        )
                    except Exception as e:
                        logger.error(
                            f"Unable to encode row into an AirbyteMessage with the following error: {e}"
                        )

            except Exception as e:
                logger.error(f"Could not read smartsheet: {name}")
                raise e
        logger.info(f"Finished syncing spreadsheet with ID: {spreadsheet_id}")
Example #20
    def read(
        self,
        logger: AirbyteLogger,
        config: Mapping[str, Any],
        catalog: ConfiguredAirbyteCatalog,
        state: MutableMapping[str, Any] = None,
    ) -> Iterable[AirbyteMessage]:
        logger.info(I_AM_A_SECRET_VALUE)
        logger.info(I_AM_A_SECRET_VALUE + " plus Some non secret Value in the same log record" + NOT_A_SECRET_VALUE)
        logger.info(NOT_A_SECRET_VALUE)
        yield AirbyteMessage(
            record=AirbyteRecordMessage(stream="stream", data={"data": "stuff"}, emitted_at=1),
            type=Type.RECORD,
        )
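Example #20 logs values that mix secrets with non-secret text; connectors rely on the logging layer to redact secrets before they reach the log output. A generic illustration of the idea with a stdlib logging.Filter (not the CDK's actual implementation):

import logging

class RedactSecrets(logging.Filter):
    def __init__(self, secrets):
        super().__init__()
        self._secrets = secrets

    def filter(self, record: logging.LogRecord) -> bool:
        message = record.getMessage()
        for secret in self._secrets:
            message = message.replace(secret, "****")
        record.msg, record.args = message, None
        return True  # keep the (now redacted) record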
Example #21
def test_verify_records_schema(configured_catalog: ConfiguredAirbyteCatalog):
    """Test that correct records returned as records with errors, and verify specific error messages"""
    records = [
        {
            "text_or_null": 123,  # wrong format
            "number_or_null": 10.3,
            "text": "text",
            "number": "text",  # wrong format
        },
        {
            "text_or_null": "test",
            "number_or_null": None,
            "text": None,  # wrong value
            "number": None,  # wrong value
        },
        {
            "text_or_null": None,
            "number_or_null": None,
            "text": "text",
            "number": 77,
        },
        {
            "text_or_null": None,
            "number_or_null": None,
            "text": "text",
            "number": "text",  # wrong format
        },
    ]

    records = [
        AirbyteRecordMessage(stream="my_stream", data=record, emitted_at=0)
        for record in records
    ]

    streams_with_errors = verify_records_schema(records, configured_catalog)
    errors = [
        error.message for error in streams_with_errors["my_stream"].values()
    ]

    assert "my_stream" in streams_with_errors
    assert len(streams_with_errors) == 1, "only one stream"
    assert len(streams_with_errors["my_stream"]) == 3, "only first error for each field"
    assert errors == [
        "123 is not of type 'null', 'string'",
        "'text' is not of type 'number'",
        "None is not of type 'string'",
    ]
Example #22
    def read(self, logger: AirbyteLogger, config: Mapping[str, Any],
             catalog: ConfiguredAirbyteCatalog,
             state: Dict[str, Any]) -> Generator[AirbyteMessage, None, None]:
        """
        Returns a generator of the AirbyteMessages generated by reading the source with the given configuration,
        catalog, and state.

        :param logger: Logging object to display debug/info/error to the logs
            (logs will not be accessible via airbyte UI if they are not passed to this logger)
        :param config: Json object containing the configuration of this source; its content is specified in
            the properties of the spec.json file
        :param catalog: The input catalog is a ConfiguredAirbyteCatalog, which is almost the same as the
            AirbyteCatalog returned by discover(), except that it has been configured in the UI. Each stream
            and field may carry extra modifications, such as filtering streams and/or columns out, renaming
            some entities, etc.
        :param state: When Airbyte reads data from a source, it might need to keep a checkpoint cursor to
            resume replication from that saved checkpoint in the future. This object provides the state from
            previous runs and avoids replicating the entire data set every time.

        :return: A generator that produces a stream of AirbyteRecordMessages contained in AirbyteMessage objects.
        """
        logger.info("Reading data from Apify dataset")

        dataset_id = config["datasetId"]
        clean = config.get("clean", False)

        client = ApifyClient()
        dataset_client = client.dataset(dataset_id)

        # Get total number of items in dataset. This will be used in pagination
        dataset = dataset_client.get()
        num_items = dataset["itemCount"]

        with concurrent.futures.ThreadPoolExecutor() as executor:
            for result in executor.map(
                    partial(self._apify_get_dataset_items, dataset_client,
                            clean), range(0, num_items, BATCH_SIZE)):
                for data in result.items:
                    yield AirbyteMessage(
                        type=Type.RECORD,
                        record=AirbyteRecordMessage(
                            stream=DATASET_ITEMS_STREAM_NAME,
                            data=data,
                            emitted_at=int(datetime.now().timestamp()) * 1000),
                    )
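A plausible shape for the _apify_get_dataset_items helper being mapped over the offsets, assuming apify-client's DatasetClient.list_items pagination API (hypothetical; the helper is not shown):

def _apify_get_dataset_items(self, dataset_client, clean: bool, offset: int):
    # Fetch one page of up to BATCH_SIZE items starting at `offset`.
    return dataset_client.list_items(offset=offset, limit=BATCH_SIZE, clean=clean)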
Example #23
def test_run_read(entrypoint: AirbyteEntrypoint, mocker, spec_mock,
                  config_mock):
    parsed_args = Namespace(command="read",
                            config="config_path",
                            state="statepath",
                            catalog="catalogpath")
    expected = AirbyteRecordMessage(stream="stream",
                                    data={"data": "stuff"},
                                    emitted_at=1)
    mocker.patch.object(MockSource, "read_state", return_value={})
    mocker.patch.object(MockSource, "read_catalog", return_value={})
    mocker.patch.object(
        MockSource,
        "read",
        return_value=[AirbyteMessage(record=expected, type=Type.RECORD)])
    assert [_wrap_message(expected)] == list(entrypoint.run(parsed_args))
    assert spec_mock.called
Example #24
def test_airbyte_message_from_data(mock_datetime):
    mock_datetime.now.return_value.timestamp.return_value = 10
    raw_data = [1, "a", [1, 2, 3]]
    columns = ["Col1", "Col2", "Col3"]
    table_name = "dummy"
    expected = AirbyteMessage(
        type=Type.RECORD,
        record=AirbyteRecordMessage(
            stream="dummy",
            data={
                "Col1": 1,
                "Col2": "a",
                "Col3": [1, 2, 3]
            },
            emitted_at=10000,
        ),
    )
    result = airbyte_message_from_data(raw_data, columns, table_name)
    assert result == expected
Example #25
def _airbyte_message_from_json(
        transformed_json: Mapping[str, Any]) -> Optional[AirbyteMessage]:
    if transformed_json is None or transformed_json.get(
            "type") == "SCHEMA" or transformed_json.get(
                "type") == "ACTIVATE_VERSION":
        return None
    elif transformed_json.get("type") == "STATE":
        out_record = AirbyteStateMessage(data=transformed_json["value"])
        out_message = AirbyteMessage(type=Type.STATE, state=out_record)
    else:
        # todo: check that messages match the discovered schema
        stream_name = transformed_json["stream"]
        out_record = AirbyteRecordMessage(
            stream=stream_name,
            data=transformed_json["record"],
            emitted_at=int(datetime.now().timestamp()) * 1000,
        )
        out_message = AirbyteMessage(type=Type.RECORD, record=out_record)
    return out_message
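A quick usage sketch for the translator above, showing how Singer-style messages map onto the Airbyte protocol (assuming the usual airbyte_cdk.models imports):

# SCHEMA and ACTIVATE_VERSION messages are dropped:
assert _airbyte_message_from_json({"type": "SCHEMA"}) is None

# STATE messages become AirbyteStateMessage wrappers:
msg = _airbyte_message_from_json({"type": "STATE", "value": {"cursor": "2021-01-01"}})
assert msg.type == Type.STATE

# Everything else is treated as a record:
msg = _airbyte_message_from_json({"type": "RECORD", "stream": "users", "record": {"id": 1}})
assert msg.type == Type.RECORD and msg.record.stream == "users"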
Example #26
def test_verify_records_schema(configured_catalog: ConfiguredAirbyteCatalog):
    """Test that correct records returned as records with errors, and verify specific error messages"""
    records = [
        {
            "text_or_null": 123,  # wrong format
            "number_or_null": 10.3,
            "text": "text",
            "number": "text",  # wrong format
        },
        {
            "text_or_null": "test",
            "number_or_null": None,
            "text": None,  # wrong value
            "number": None,  # wrong value
        },
        {
            "text_or_null": None,
            "number_or_null": None,
            "text": "text",
            "number": 77,
        },
        {
            "text_or_null": None,
            "number_or_null": None,
            "text": "text",
            "number": "text",  # wrong format
        },
    ]

    records = [AirbyteRecordMessage(stream="my_stream", data=record, emitted_at=0) for record in records]

    records_with_errors, record_errors = zip(*verify_records_schema(records, configured_catalog))
    errors = [[error.message for error in errors] for errors in record_errors]

    assert len(records_with_errors) == 3, "only 3 out of 4 records have errors"
    assert records_with_errors[0] == records[0], "1st record should have errors"
    assert records_with_errors[1] == records[1], "2nd record should have errors"
    assert records_with_errors[2] == records[3], "4th record should have errors"
    assert errors[0] == ["'text' is not of type 'number'", "123 is not of type 'null', 'string'"]
    assert errors[1] == ["None is not of type 'number'", "None is not of type 'string'"]
    assert errors[2] == ["'text' is not of type 'number'"]
Example #27
    def read(
        self, logger: AirbyteLogger, config: Mapping,
        catalog: ConfiguredAirbyteCatalog,
        state_path: Mapping[str, Any]) -> Generator[AirbyteMessage, None, None]:
        """Returns a generator of the AirbyteMessages generated by reading the source with the given configuration, catalog, and state."""
        client = self._get_client(config)
        fields = self.selected_fields(catalog)
        name = client.stream_name

        logger.info(f"Reading {name} ({client.reader.full_url})...")
        try:
            for row in client.read(fields=fields):
                record = AirbyteRecordMessage(
                    stream=name,
                    data=row,
                    emitted_at=int(datetime.now().timestamp()) * 1000)
                yield AirbyteMessage(type=Type.RECORD, record=record)
        except Exception as err:
            reason = f"Failed to read data of {name} at {client.reader.full_url}: {repr(err)}\n{traceback.format_exc()}"
            logger.error(reason)
            raise err
Example #28
def _airbyte_message_from_json(
        transformed_json: Mapping[str, Any]) -> Optional[AirbyteMessage]:
    if transformed_json is None or transformed_json.get(
            "type") == "SCHEMA" or transformed_json.get(
                "type") == "ACTIVATE_VERSION":
        return None
    elif transformed_json.get("type") == "STATE":
        out_record = AirbyteStateMessage(data=transformed_json["value"])
        out_message = AirbyteMessage(type=Type.STATE, state=out_record)
    else:
        # todo: check that messages match the discovered schema
        stream_name = transformed_json["stream"]
        # Per issue "CDK: typing errors #9500", mypy flags the assignment below:
        # 'Incompatible types in assignment (expression has type "AirbyteRecordMessage",
        # variable has type "AirbyteStateMessage")'. out_record is first bound to an
        # AirbyteStateMessage above, so reassigning an AirbyteRecordMessage here
        # triggers the error; it is therefore ignored.
        out_record = AirbyteRecordMessage(  # type: ignore
            stream=stream_name,
            data=transformed_json["record"],
            emitted_at=int(datetime.now().timestamp()) * 1000,
        )
        out_message = AirbyteMessage(type=Type.RECORD, record=out_record)
    return out_message
Example #29
    def _as_airbyte_record(self, stream_name: str, data: Mapping[str, Any]):
        now_millis = int(datetime.now().timestamp()) * 1000
        message = AirbyteRecordMessage(stream=stream_name, data=data, emitted_at=now_millis)
        return AirbyteMessage(type=MessageType.RECORD, record=message)
Example #30
def _as_record(stream: str, data: Dict[str, Any]) -> AirbyteMessage:
    return AirbyteMessage(type=Type.RECORD,
                          record=AirbyteRecordMessage(
                              stream=stream,
                              data=data,
                              emitted_at=GLOBAL_EMITTED_AT))