Ejemplo n.º 1
0
def test_request_absolute_endpoint():
    """Check that an endpoint starting with "/" is requested from the server root.

    Only the root-anchored URL is registered with ``responses``, so the request
    made by the client has to target exactly that URL.
    """
    absolute_path = "/api/service/health"
    responses.add(
        responses.GET, f"http://localhost:9100{absolute_path}", json={}
    )
    tamr = Client(UsernamePasswordAuth("username", "password"))
    # A client that mishandles absolute paths would request a URL that has no
    # registered mock, and the call below would raise a connection error.
    tamr.get(absolute_path)
    def test_dataset_profile(self):
        """Fetch a dataset's profile and compare its attributes with the mocked payload."""
        tamr = Client(UsernamePasswordAuth("username", "password"))

        resource_id = "3"
        server_root = f"{tamr.protocol}://{tamr.host}:{tamr.port}"
        dataset_url = f"{server_root}/api/versioned/v1/datasets/{resource_id}"
        responses.add(responses.GET, dataset_url, json={})
        responses.add(
            responses.GET, f"{dataset_url}/profile", json=self.profile_stale
        )

        profile = tamr.datasets.by_resource_id(resource_id).profile()

        # Pairs of (key in the mocked JSON payload, attribute on the profile).
        expected_fields = (
            ("datasetName", "dataset_name"),
            ("relativeDatasetId", "relative_dataset_id"),
            ("isUpToDate", "is_up_to_date"),
            ("profiledDataVersion", "profiled_data_version"),
            ("profiledAt", "profiled_at"),
            ("simpleMetrics", "simple_metrics"),
            ("attributeProfiles", "attribute_profiles"),
        )
        for json_key, attribute_name in expected_fields:
            self.assertEqual(
                self.profile_stale[json_key], getattr(profile, attribute_name)
            )
Ejemplo n.º 3
0
def initiate_backup(
    client: Client,
    *,
    poll_interval_seconds: int = 30,
    polling_timeout_seconds: Optional[int] = None,
    connection_retry_timeout_seconds: int = 600,
) -> requests.Response:
    """Runs a backup of Tamr client and waits until it is finished.

    Args:
        client: A Tamr client object
        poll_interval_seconds: Amount of time in between polls of job state.
        polling_timeout_seconds: Amount of time before a timeout error is thrown.
        connection_retry_timeout_seconds: Amount of time before timeout error is thrown during
            connection retry

    Returns:
        The value returned by `utils.client.poll_endpoint` for the new backup's endpoint.

    Raises:
        RuntimeError: Raised if the request to start the backup is not successful
    """
    response = client.post("backups")
    if not response.ok:
        # An error body is not guaranteed to be JSON. Fall back to the raw text so the
        # RuntimeError below is raised instead of a JSON decoding error.
        try:
            details = response.json()
        except ValueError:
            details = response.text
        message = f"Received non-200 response code '{response.status_code}': {details}"
        LOGGER.error(message)
        raise RuntimeError(message)

    backup_id = response.json()["relativeId"]

    # Poll the endpoint of the backup that was just created.
    op = utils.client.poll_endpoint(
        client=client,
        api_endpoint=f"backups/{backup_id}",
        poll_interval_seconds=poll_interval_seconds,
        polling_timeout_seconds=polling_timeout_seconds,
        connection_retry_timeout_seconds=connection_retry_timeout_seconds,
    )
    return op
Ejemplo n.º 4
0
def test_binning_model_records():
    """Check that ``binning_model.records()`` yields the newline-delimited JSON records."""
    expected_records = [
        {
            "id": ["d8b7351d-24ce-49aa-8655-5b5809ab6bb8"],
            "isActive": ["true"],
            "clauseId": ["2e6c5f1b-ed49-40ab-8cbb-350aded25070"],
            "similarityFunction": ["COSINE"],
            "tokenizer": ["DEFAULT"],
            "fieldName": ["surname"],
            "threshold": ["0.75"],
        }
    ]
    records_endpoint = (
        "http://localhost:9100/api/versioned/v1/projects/1/binningModel/records"
    )
    # The records endpoint is mocked with one JSON document per line.
    ndjson_payload = "\n".join(json.dumps(record) for record in expected_records)

    responses.add(responses.GET, project_url, json=project_config)
    responses.add(responses.GET, records_endpoint, body=ndjson_payload)

    tamr_client = Client(UsernamePasswordAuth("username", "password"))
    mastering_project = tamr_client.projects.by_resource_id("1").as_mastering()
    fetched_records = list(mastering_project.binning_model().records())

    assert fetched_records == expected_records
Ejemplo n.º 5
0
def get_backup_by_id(client: Client, backup_id: str) -> JsonDict:
    """Fetches the json object for a given backup ID.

    Args:
        client: A Tamr client object.
        backup_id: The relativeID corresponding to the desired backup.

    Returns:
        Json dict corresponding to the desired backup.

    Raises:
        ValueError: Raised if GET request to Tamr fails
    """
    api_string = f"backups/{backup_id}"
    response = client.get(api_string)

    if not response.ok:
        # Build the error text defensively: the error body may not be JSON, and a JSON body
        # may lack a "message" entry. Neither case should mask the ValueError raised below.
        try:
            body = response.json()
        except ValueError:
            body = response.text
        detail = body.get("message") if isinstance(body, dict) else None
        message = (
            f"Received non-200 response code '{response.status_code}' "
            f"with message '{detail}': '{body}'"
        )
        LOGGER.error(message)
        raise ValueError(message)

    return response.json()
Ejemplo n.º 6
0
def client():
    """Build a Tamr client that authenticates with placeholder username/password credentials."""
    from tamr_unify_client import Client
    from tamr_unify_client.auth import UsernamePasswordAuth

    return Client(UsernamePasswordAuth("username", "password"))
def test_continuous_mastering():
    """Run the mastering workflow steps in order and require each operation to succeed."""
    tamr = Client(UsernamePasswordAuth("username", "password"))
    mastering = tamr.projects.by_resource_id("1").as_mastering()

    # Unified dataset first, then pairs.
    assert mastering.unified_dataset().refresh(poll_interval_seconds=0).succeeded()
    assert mastering.pairs().refresh(poll_interval_seconds=0).succeeded()

    # Train the pair-matching model, then predict with the same model object.
    matching_model = mastering.pair_matching_model()
    assert matching_model.train(poll_interval_seconds=0).succeeded()
    assert matching_model.predict(poll_interval_seconds=0).succeeded()

    # Record clusters, then published clusters.
    assert mastering.record_clusters().refresh(poll_interval_seconds=0).succeeded()
    assert mastering.published_clusters().refresh(poll_interval_seconds=0).succeeded()
Ejemplo n.º 8
0
def health_check(client: Client) -> bool:
    """
    Query the health check API and check if each service is healthy (returns True)

    Args:
        client: the tamr client

    Returns:
        True if all services are healthy, False if unhealthy or if the connection
        to the client's host fails
    """
    try:
        response = client.get(endpoint="/api/service/health")
        # Parse the body once and reuse it for both the check and the log output.
        service_statuses = response.json()
        # The generator lets all() stop at the first unhealthy service.
        healthy_status = all(
            value["healthy"] for value in service_statuses.values())
        report = dumps(service_statuses, indent=2)
        if healthy_status:
            LOGGER.info(f"Client is healthy: {report}")
        else:
            LOGGER.error(f"Client is unhealthy: {report}")
        return healthy_status

    except requests.exceptions.ConnectionError as e:
        # A failed connection is reported as "unhealthy" rather than propagated.
        LOGGER.error(f"Could not connect to {client.host}. Error: {e}")
        return False
Ejemplo n.º 9
0
def get_with_connection_retry(client: Client,
                              api_endpoint: str,
                              *,
                              timeout_seconds: int = 600,
                              sleep_seconds: int = 20) -> requests.Response:
    """Issue a GET request, retrying while connection errors occur.

    Intended for connection problems that appear while Tamr restarts due to a restore.

    Args:
        client: A Tamr client object
        api_endpoint: Tamr API endpoint
        timeout_seconds: Amount of time to keep retrying before a timeout error is raised.
            Default is 600 seconds. When None, retries continue without a time limit.
        sleep_seconds: Amount of time to wait between connection attempts.

    Returns:
        A response object from API request.

    Raises:
        TimeoutError: Raised if no attempt succeeds before `timeout_seconds` has elapsed
    """
    start_time = now()
    while True:
        # The deadline is checked before every attempt, including the first one.
        if timeout_seconds is not None and not (now() - start_time < timeout_seconds):
            break
        try:
            return client.get(api_endpoint)
        except ConnectionError as e:
            # If we got for example a connection refused exception, try again later
            LOGGER.warning(f"Caught exception in connect {e}")
            sleep(sleep_seconds)
    raise TimeoutError(
        f"Took longer than {timeout_seconds} seconds to connect to tamr.")
Ejemplo n.º 10
0
def test_client_repr():
    """Check the ``repr`` of a client built with default and with explicit settings."""
    credentials = UsernamePasswordAuth("username", "password")

    default_repr = repr(Client(credentials))

    assert default_repr.startswith("tamr_unify_client.client.Client(")
    assert "http" in default_repr
    assert default_repr.endswith(")")
    # The password must not appear in the representation.
    assert "password" not in default_repr

    custom_repr = repr(
        Client(credentials, protocol="http", port=1234, base_path="foo/bar")
    )

    for fragment in ("'http'", "1234", "foo/bar"):
        assert fragment in custom_repr
Ejemplo n.º 11
0
 def test_project_by_name(self):
     """Look a project up by name and compare its data with the first mocked project."""
     responses.add(responses.GET, self.project_list_url, json=self.project_json)
     tamr_client = Client(UsernamePasswordAuth("username", "password"))
     found_project = tamr_client.projects.by_name(self.project_name)
     assert found_project._data == self.project_json[0]
Ejemplo n.º 12
0
def initiate_restore(
    client: Client,
    backup_id: str,
    *,
    polling_timeout_seconds: Optional[int] = None,
    poll_interval_seconds: int = 30,
    connection_retry_timeout_seconds: int = 600,
) -> requests.Response:
    """Restores the Tamr client to the state of the supplied backup.

    Args:
        client: A Tamr client object
        backup_id: BackupId of the desired backup.
        polling_timeout_seconds: Amount of time before a timeout error is thrown.
        poll_interval_seconds: Amount of time in between polls of job state.
        connection_retry_timeout_seconds: Amount of time before timeout error is thrown during
            connection retry

    Returns:
        The value returned by `utils.client.poll_endpoint` for the restore endpoint.

    Raises:
        ValueError: Raised if the target backup did not succeed or contains errors
        RuntimeError: Raised if the restore fails to start
    """
    backup = get_backup_by_id(client=client, backup_id=backup_id)
    backup_state = backup["state"]
    if backup_state != "SUCCEEDED":
        value_error_message1 = (
            f"Backup file with ID {backup_id} did not succeed and has status {backup_state}"
        )
        # Log before raising, like every other failure path in this function.
        LOGGER.error(value_error_message1)
        raise ValueError(value_error_message1)

    error_message = backup["errorMessage"]
    if error_message != "":
        value_error_message2 = (
            f"Backup file with ID {backup_id} contains non-null error message {error_message}"
        )
        LOGGER.error(value_error_message2)
        raise ValueError(value_error_message2)

    response = client.post("instance/restore", data=backup_id)
    if not response.ok:
        # An error body is not guaranteed to be JSON. Fall back to the raw text so the
        # documented RuntimeError is raised instead of a JSON decoding error.
        try:
            details = response.json()
        except ValueError:
            details = response.text
        runtime_error_message = (
            f"Received non-200 response code '{response.status_code}' : {details}"
        )
        LOGGER.error(runtime_error_message)
        raise RuntimeError(runtime_error_message)

    op = utils.client.poll_endpoint(
        client=client,
        api_endpoint="instance/restore",
        poll_interval_seconds=poll_interval_seconds,
        polling_timeout_seconds=polling_timeout_seconds,
        connection_retry_timeout_seconds=connection_retry_timeout_seconds,
    )

    return op
def test_dataset_collection_repr():
    """Check the ``repr`` of the client's dataset collection."""
    tamr = Client(UsernamePasswordAuth("username", "password"))
    expected_prefix = "tamr_unify_client.models.dataset.collection.DatasetCollection("

    collection_repr = repr(tamr.datasets)

    assert collection_repr.startswith(expected_prefix)
    assert "api_path='datasets'" in collection_repr
    assert collection_repr.endswith(")")
Ejemplo n.º 14
0
def test_http_error():
    """Ensure that the client surfaces HTTP errors as exceptions."""
    endpoint = "http://localhost:9100/api/versioned/v1/projects/1"
    # Mock a 401 response for the project lookup.
    responses.add(responses.GET, endpoint, status=401)
    auth = UsernamePasswordAuth("nonexistent-username", "invalid-password")
    unify = Client(auth)
    with raises(HTTPError) as e:
        unify.projects.by_resource_id("1")
    # NOTE(review): assumes `raises` is pytest's, so `e` is an ExceptionInfo wrapper and
    # the raised exception itself is `e.value`; its text is what should be inspected.
    assert f"401 Client Error: Unauthorized for url: {endpoint}" in str(e.value)
Ejemplo n.º 15
0
def list_backups(client: Client) -> Generator[JsonDict, None, None]:
    """Yield every backup returned by the client's "backups" endpoint.

    Both succeeded and failed backups are listed.

    Args:
        client: A client object
    Returns:
        A generator of json dict objects for the backups available to client."""
    # The request is only issued once the generator is first advanced.
    available_backups = client.get("backups").json()
    for backup_entry in available_backups:
        yield backup_entry
def test_client_repr():
    """Check the ``repr`` of a client, without and with optional arguments."""
    credentials = UsernamePasswordAuth("username", "password")
    expected_prefix = "tamr_unify_client.client.Client" + "("

    plain_repr = repr(Client(credentials))

    assert plain_repr.startswith(expected_prefix)
    assert "http" in plain_repr
    # The password must not appear in the representation.
    assert "password" not in plain_repr
    assert plain_repr.endswith(")")

    # further testing when Client has optional arguments
    configured_repr = repr(
        Client(credentials, protocol="http", port=1234, base_path="foo/bar")
    )

    for fragment in ("'http'", "1234", "foo/bar"):
        assert fragment in configured_repr
def test_dataset_status():
    """Fetch a dataset's status and compare its raw data with the mocked payload."""
    resource_id = "1"
    dataset_endpoint = f"http://localhost:9100/api/versioned/v1/datasets/{resource_id}"
    responses.add(responses.GET, dataset_endpoint, json={})
    responses.add(responses.GET, f"{dataset_endpoint}/status", json=status_json)

    tamr_client = Client(UsernamePasswordAuth("username", "password"))
    fetched_status = tamr_client.datasets.by_resource_id(resource_id).status()

    assert fetched_status._data == status_json
Ejemplo n.º 18
0
def from_resource_id(tamr: Client, *, job_id: Union[int, str]) -> Operation:
    """Fetch the operation with the given job ID and wrap the response as an Operation.

    Args:
        tamr: A Tamr client
        job_id: A job ID

    Returns:
        A Tamr operation
    """
    operation_endpoint = f"/api/versioned/v1/operations/{job_id}"
    response = tamr.get(operation_endpoint)
    return Operation.from_response(tamr, response)
Ejemplo n.º 19
0
def test_request_session_cookie():
    """Check that a cookie set on a caller-supplied session is sent with client requests."""
    mocked_url = "http://localhost:9100/api/versioned/v1/test"
    responses.add(responses.GET, mocked_url, json={})

    custom_session = requests.Session()
    custom_session.cookies.set_cookie(
        requests.cookies.create_cookie(name="test_cookie", value="the-cookie-works")
    )

    tamr = Client(
        UsernamePasswordAuth("username", "password"), session=custom_session
    )

    # The client must keep the very session object it was given.
    assert tamr.session is custom_session

    tamr.get("test")

    assert len(responses.calls) == 1
    sent_request = responses.calls[0].request
    assert sent_request.url.endswith("test")
    cookie_header = sent_request.headers.get("Cookie")
    assert cookie_header is not None
    assert "test_cookie=" in cookie_header
Ejemplo n.º 20
0
def current(client: Client) -> str:
    """Return the "version" entry reported by the client's version endpoint.

    Args:
        client: Tamr client

    Returns:
        String representation of Tamr version

    """
    version_response = client.get("/api/versioned/service/version").successful()
    payload = json.loads(version_response.content)
    return payload["version"]
def test_continuous_mastering():
    """Run the mastering workflow steps in order, then read the estimated pair counts."""
    tamr = Client(UsernamePasswordAuth("username", "password"))
    mastering = tamr.projects.by_resource_id("1").as_mastering()

    # Unified dataset first, then pairs.
    assert mastering.unified_dataset().refresh(poll_interval_seconds=0).succeeded()
    assert mastering.pairs().refresh(poll_interval_seconds=0).succeeded()

    # Train the pair-matching model, then predict with the same model object.
    matching_model = mastering.pair_matching_model()
    assert matching_model.train(poll_interval_seconds=0).succeeded()
    assert matching_model.predict(poll_interval_seconds=0).succeeded()

    # Record clusters, then published clusters.
    assert mastering.record_clusters().refresh(poll_interval_seconds=0).succeeded()
    assert mastering.published_clusters().refresh(poll_interval_seconds=0).succeeded()

    # Mock the pair-count estimate endpoint.
    estimate_payload = {
        "isUpToDate": "true",
        "totalEstimate": {"candidatePairCount": "200", "generatedPairCount": "100"},
        "clauseEstimates": {
            "clause1": {"candidatePairCount": "50", "generatedPairCount": "25"},
            "clause2": {"candidatePairCount": "50", "generatedPairCount": "25"},
            "clause3": {"candidatePairCount": "100", "generatedPairCount": "50"},
        },
    }
    responses.add(
        responses.GET,
        "http://localhost:9100/api/versioned/v1/projects/1/estimatedPairCounts",
        json=estimate_payload,
    )

    # Each check below requests the estimate again.
    assert mastering.estimate_pairs().is_up_to_date

    total_candidates = mastering.estimate_pairs().total_estimate["candidatePairCount"]
    assert total_candidates == "200"

    first_clause = mastering.estimate_pairs().clause_estimates["clause1"]
    assert first_clause["generatedPairCount"] == "25"
def test_dataset_records():
    """Check that ``dataset.records()`` parses a newline-delimited JSON body into dicts."""
    resource_id = "1"
    dataset_endpoint = f"http://localhost:9100/api/versioned/v1/datasets/{resource_id}"
    responses.add(responses.GET, dataset_endpoint, json={})
    # Two records, one JSON document per line.
    responses.add(
        responses.GET,
        f"{dataset_endpoint}/records",
        body='{"attribute1": 1}\n{"attribute1": 2}',
    )

    tamr = Client(UsernamePasswordAuth("username", "password"))
    fetched_records = list(tamr.datasets.by_resource_id(resource_id).records())

    assert fetched_records == [{"attribute1": 1}, {"attribute1": 2}]
def test_dataset_profile():
    """Fetch a dataset's profile and compare its raw data with the mocked payload."""
    resource_id = "3"
    dataset_endpoint = f"http://localhost:9100/api/versioned/v1/datasets/{resource_id}"
    profile_endpoint = f"{dataset_endpoint}/profile"

    responses.add(responses.GET, dataset_endpoint, json={})
    responses.add(responses.GET, profile_endpoint, json=profile_json1)
    # The refresh endpoint is mocked as well.
    responses.add(
        responses.POST, f"{profile_endpoint}:refresh", json=[], status=204
    )

    tamr = Client(UsernamePasswordAuth("username", "password"))
    fetched_profile = tamr.datasets.by_resource_id(resource_id).profile()

    assert fetched_profile._data == profile_json1
Ejemplo n.º 24
0
def test_project_add_source_dataset():
    """Add a dataset to a project, then read the project's input datasets back."""
    responses.add(responses.GET, datasets_url, json=dataset_json)
    responses.add(responses.GET, projects_url, json=project_json)
    # Mock both directions of the input-datasets endpoint: POST to add, GET to list.
    responses.add(
        responses.POST, input_datasets_url, json=post_input_datasets_json, status=204
    )
    responses.add(responses.GET, input_datasets_url, json=get_input_datasets_json)

    tamr = Client(UsernamePasswordAuth("username", "password"))
    source_dataset = tamr.datasets.by_external_id(dataset_external_id)
    target_project = tamr.projects.by_external_id(project_external_id)
    target_project.add_source_dataset(source_dataset)

    input_datasets_path = target_project.api_path + "/inputDatasets"
    listing_response = target_project.client.get(input_datasets_path).successful()
    assert listing_response.json() == dataset_json
Ejemplo n.º 25
0
def test_record_clusters_with_data():
    """Check that a mastering project returns its record-clusters-with-data dataset."""
    project_payload = {
        "name": "Project 1",
        "description": "Mastering Project",
        "type": "DEDUP",
        "unifiedDatasetName": "Project 1 - Unified Dataset",
        "externalId": "Project1",
        "resourceId": "1",
    }
    unified_dataset_payload = {
        "id": "unify://unified-data/v1/datasets/8",
        "name": "Project_1_unified_dataset",
        "version": "10",
        "relativeId": "datasets/8",
        "externalId": "Project_1_unified_dataset",
    }
    clusters_with_data_payload = {
        "externalId": "1",
        "id": "unify://unified-data/v1/datasets/36",
        "name": "Project_1_unified_dataset_dedup_clusters_with_data",
        "relativeId": "datasets/36",
        "version": "251",
    }

    tamr = Client(UsernamePasswordAuth("username", "password"))

    resource_id = "1"
    api_root = "http://localhost:9100/api/versioned/v1"
    project_endpoint = f"{api_root}/projects/{resource_id}"

    responses.add(responses.GET, project_endpoint, json=project_payload)
    responses.add(
        responses.GET,
        f"{project_endpoint}/unifiedDataset",
        json=unified_dataset_payload,
    )
    # The dataset listing contains only the clusters-with-data dataset.
    responses.add(
        responses.GET, f"{api_root}/datasets", json=[clusters_with_data_payload]
    )

    mastering = tamr.projects.by_resource_id(resource_id).as_mastering()
    found_dataset = mastering.record_clusters_with_data()
    assert found_dataset.name == clusters_with_data_payload["name"]
Ejemplo n.º 26
0
def get_all(tamr: Client) -> List[Operation]:
    """
    Fetch the operations endpoint and build an Operation from every returned item.

    Args:
        tamr: A Tamr client

    Returns:
        A list of Operation objects.

    """
    operations_response = tamr.get(
        "/api/versioned/v1/operations",
        headers={"Accept": "application/json"},
        stream=True,
    )
    operation_items = operations_response.json()
    return [Operation.from_json(tamr, entry) for entry in operation_items]
    def test_profile_refresh(self):
        """Refresh an existing dataset profile and require the operation to succeed."""
        tamr = Client(UsernamePasswordAuth("username", "password"))

        resource_id = "3"
        server_root = f"{tamr.protocol}://{tamr.host}:{tamr.port}"
        dataset_url = f"{server_root}/api/versioned/v1/datasets/{resource_id}"
        profile_url = f"{dataset_url}/profile"

        responses.add(responses.GET, dataset_url, json={})
        responses.add(responses.GET, profile_url, json=self.profile_stale)
        responses.add(
            responses.POST, f"{profile_url}:refresh", json=self.operation_succeeded
        )

        stale_profile = tamr.datasets.by_resource_id(resource_id).profile()
        refresh_op = stale_profile.refresh()
        self.assertTrue(refresh_op.succeeded())
def test_continuous_categorization():
    """Run the categorization workflow steps in order and require each operation to succeed."""
    tamr = Client(UsernamePasswordAuth("username", "password"))
    categorization = tamr.projects.by_resource_id("3").as_categorization()

    assert categorization.unified_dataset().refresh(poll_interval_seconds=0).succeeded()

    # Train the model, then predict with the same model object.
    category_model = categorization.model()
    assert category_model.train(poll_interval_seconds=0).succeeded()
    assert category_model.predict(poll_interval_seconds=0).succeeded()
def test_dataset_status_repr():
    """Check that the ``repr`` of a DatasetStatus names the class and includes its values."""
    tamr = Client(UsernamePasswordAuth("username", "password"))
    payload = {
        "relativeId": "path/to/thing/1",
        "datasetName": "testdsname",
        "relativeDatasetId": "path/to/data/1",
        "isStreamable": True,
    }
    expected_prefix = "tamr_unify_client.models.dataset_status.DatasetStatus("

    status_repr = repr(DatasetStatus.from_json(tamr, payload))

    assert status_repr.startswith(expected_prefix)
    for fragment in ("testdsname", "True", "path/to/thing"):
        assert fragment in status_repr
    assert status_repr.endswith(")")
    def test_profile_create(self):
        """Create a profile for a dataset that has none and require the operation to succeed."""
        tamr = Client(UsernamePasswordAuth("username", "password"))

        resource_id = "3"
        server_root = f"{tamr.protocol}://{tamr.host}:{tamr.port}"
        dataset_url = f"{server_root}/api/versioned/v1/datasets/{resource_id}"
        profile_url = f"{dataset_url}/profile"

        responses.add(responses.GET, dataset_url, json={})
        # We need to ensure that, when creating the profile,
        # nothing ever tries to access the (non-existent) profile.
        responses.add(responses.GET, profile_url, status=404)
        responses.add(
            responses.POST, f"{profile_url}:refresh", json=self.operation_succeeded
        )

        target_dataset = tamr.datasets.by_resource_id(resource_id)
        create_op = target_dataset.create_profile()
        self.assertTrue(create_op.succeeded())