Example #1
def add_synthetic_keys_to_stream_schema(stream):
    """Add synthetic keys to stream's schema."""
    stream.schema.properties["_sdc_report_datetime"] = Schema.from_dict({
        "description": "DateTime of Report Run",
        "type": "string",
        "format": "date-time",
    })
    if stream.tap_stream_id in STATISTICS_REPORT_TYPES:
        stream.schema.properties["_sdc_report_currency"] = Schema.from_dict({
            "description": "Currency of all costs in report",
            "type": "string",
        })
        stream.schema.properties["_sdc_report_ignore_x_device"] = Schema.from_dict({
            "description": ("Ignore cross-device data. Also can explicitly "
                            "set to null for TransactionID ReportType to get "
                            "all data."),
            "type": "boolean",
        })
    return stream


def do_sync(client, catalog, state, config):
    selected_stream_names = get_selected_streams(catalog)
    validate_dependencies(selected_stream_names)
    populate_class_schemas(catalog, selected_stream_names)
    all_sub_stream_names = get_sub_stream_names()

    for stream in catalog.streams:
        stream_name = stream.tap_stream_id
        mdata = metadata.to_map(stream.metadata)
        if stream_name not in selected_stream_names:
            LOGGER.info("%s: Skipping - not selected", stream_name)
            continue

        key_properties = metadata.get(mdata, (), 'table-key-properties')
        sideload_objects = metadata.get(mdata, (), 'sideload-objects')
        if sideload_objects:
            stream_schema = get_side_load_schemas(sideload_objects, stream)
            stream.schema = Schema.from_dict(stream_schema)

        singer.write_schema(stream_name, stream.schema.to_dict(),
                            key_properties)

        sub_stream_names = SUB_STREAMS.get(stream_name)
        if sub_stream_names:
            for sub_stream_name in sub_stream_names:
                if sub_stream_name not in selected_stream_names:
                    continue
                sub_stream = STREAMS[sub_stream_name].stream
                sub_mdata = metadata.to_map(sub_stream.metadata)
                sub_key_properties = metadata.get(sub_mdata, (),
                                                  'table-key-properties')
                # Sideload objects come from the sub stream's own metadata.
                sideload_objects = metadata.get(sub_mdata, (),
                                                'sideload-objects')
                if sideload_objects:
                    sub_stream_schema = get_side_load_schemas(
                        sideload_objects, sub_stream)
                    sub_stream.schema = Schema.from_dict(sub_stream_schema)
                singer.write_schema(sub_stream.tap_stream_id,
                                    sub_stream.schema.to_dict(),
                                    sub_key_properties)

        # Sub streams are synced by their parent stream, so skip them here.
        if stream_name in all_sub_stream_names:
            continue

        LOGGER.info("%s: Starting sync", stream_name)
        instance = STREAMS[stream_name](client, config)
        counter_value = sync_stream(state, config.get('start_date'), instance)
        singer.write_state(state)
        LOGGER.info("%s: Completed sync (%s rows)", stream_name, counter_value)
        zendesk_metrics.log_aggregate_rates()

    singer.write_state(state)
    LOGGER.info("Finished sync")
    zendesk_metrics.log_aggregate_rates()
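
Note: every snippet in this listing leans on singer-python's Schema.from_dict / to_dict round trip. A minimal sketch of that round trip, reusing the synthetic key names from the example above:

from singer.schema import Schema

# Build a Schema object from a plain JSON-schema dict.
schema = Schema.from_dict({
    "type": "object",
    "properties": {
        "_sdc_report_datetime": {
            "description": "DateTime of Report Run",
            "type": "string",
            "format": "date-time",
        }
    },
})

# properties maps names to nested Schema objects, which is why
# add_synthetic_keys_to_stream_schema can attach keys one at a time.
schema.properties["_sdc_report_currency"] = Schema.from_dict({
    "description": "Currency of all costs in report",
    "type": "string",
})

# to_dict() converts back to the plain dict that singer.write_schema expects.
print(schema.to_dict())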
Example #3
def generate_catalog(client, report_config, standard_fields, custom_fields,
                     all_cubes, cubes_lookup, profile_ids):
    """
    Generate a catalog entry for each report specified in `report_config`
    """
    catalog_entries = []
    for report in report_config:
        # Normalize the report name into a BigQuery-safe identifier.
        report['name'] = report['name'].replace(' ', '_').lower()

        metrics_dimensions = set(report['metrics'] + report['dimensions'])
        selected_by_default = {
            *report['metrics'][:10],  # Use first 10 metrics in definition
            *report.get('default_dimensions', [])
        }
        premade_fields = [
            field for field in standard_fields
            if field['id'] in metrics_dimensions
        ]
        schema, mdata = generate_premade_catalog_entry(premade_fields,
                                                       all_cubes, cubes_lookup)

        mdata = reduce(
            lambda mdata, field_name: metadata.write(mdata, (
                "properties", field_name), "selected-by-default", True),
            selected_by_default, mdata)

        catalog_entries.append(
            CatalogEntry(schema=Schema.from_dict(schema),
                         key_properties=['_sdc_record_hash'],
                         stream=report['name'],
                         tap_stream_id=report['name'],
                         metadata=metadata.to_list(mdata)))

    # The generic per-profile path below is intentionally disabled:
    # iterating over an empty list emits no additional entries.
    for report in []:
        schema, mdata = generate_catalog_entry(client, standard_fields,
                                               custom_fields, all_cubes,
                                               cubes_lookup, profile_ids)

        catalog_entries.append(
            CatalogEntry(schema=Schema.from_dict(schema),
                         key_properties=['_sdc_record_hash'],
                         stream=report['name'],
                         tap_stream_id=report['id'],
                         metadata=metadata.to_list(mdata)))
    return Catalog(catalog_entries)
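
The reduce(...) over metadata.write is a standard singer pattern for flagging catalog fields. A self-contained sketch of it, with made-up field names:

from functools import reduce
from singer import metadata

# Hypothetical field names; real ones come from the report definition.
selected_by_default = {'sessions', 'users', 'date'}

# Compile an empty metadata list into map form, then mark each field as
# selected-by-default, exactly as the reduce(...) above does.
mdata = metadata.to_map([])
mdata = reduce(
    lambda acc, field_name: metadata.write(
        acc, ('properties', field_name), 'selected-by-default', True),
    selected_by_default, mdata)

# to_list() converts back to the breadcrumb list stored in the catalog.
print(metadata.to_list(mdata))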
Example #4
    def test_should_ignore_fields_that_dont_parse(self):
        record = {
            "id": "5d6ca50762a07c00045125fb",
            "created_at": "not a date",
            "edited_at": "2019-09-02T05:13:43.151"
        }
        schema = Schema.from_dict({
            "properties": {
                "created_at": {
                    "format": "date-time",
                    "type": "string"
                },
                "edited_at": {
                    "format": "date-time",
                    "type": "string"
                },
                "id": {
                    "type": "string"
                },
            }
        })
        updated_record = Stream.convert_dates_to_rfc3339(record, schema)
        assert updated_record["created_at"] == "not a date"
        assert updated_record["edited_at"] == "2019-09-02T05:13:43.151000+00:00"
    def test_one_pk_value_with_bookmark(self):
        catalog_entry = CatalogEntry(tap_stream_id='foo',
                                     schema=Schema.from_dict({
                                         'properties': {
                                             'id': {
                                                 'type': ['integer']
                                             }
                                         }
                                     }),
                                     metadata=[{
                                         'breadcrumb': (),
                                         'metadata': {
                                             'table-key-properties': ['id']
                                         }
                                     }])
        state = {
            'bookmarks': {
                'foo': {
                    'last_pk_fetched': {
                        'id': 4
                    },
                    'max_pk_values': {
                        'id': 10
                    }
                }
            }
        }

        expected = ' WHERE ((`id` > 4)) AND `id` <= 10 ORDER BY `id` ASC'
        actual = generate_pk_clause(catalog_entry, state)

        self.assertEqual(expected, actual)
    def test_fails_with_null_bookmark(self):
        catalog_entry = CatalogEntry(
            schema=Schema.from_dict({'properties': {}}))
        key_properties = []
        last_pk_fetched = None

        with self.assertRaises(AssertionError):
            generate_pk_bookmark_clause(key_properties, last_pk_fetched,
                                        catalog_entry)
    def test_no_pk_values(self):
        catalog_entry = CatalogEntry(schema=Schema.from_dict(
            {'properties': {}}),
                                     metadata=[])
        state = {}

        expected = ''
        actual = generate_pk_clause(catalog_entry, state)

        self.assertEqual(expected, actual)
    def test_empty_pk(self):
        catalog_entry = CatalogEntry(
            schema=Schema.from_dict({'properties': {}}))
        key_properties = []
        last_pk_fetched = {}

        expected = ''
        actual = generate_pk_bookmark_clause(key_properties, last_pk_fetched,
                                             catalog_entry)

        self.assertEqual(expected, actual)
def generate_catalog(client, standard_fields, custom_fields, all_cubes,
                     cubes_lookup, profile_id):
    schema, mdata = generate_catalog_entry(client, standard_fields,
                                           custom_fields, all_cubes,
                                           cubes_lookup, profile_id)
    # Wrap the generated schema and metadata in a single catalog entry.
    catalog_entry = CatalogEntry(schema=Schema.from_dict(schema),
                                 key_properties=['_sdc_record_hash'],
                                 stream='report',
                                 tap_stream_id='report',
                                 metadata=metadata.to_list(mdata))
    return Catalog([catalog_entry])
Example #10
    def test_one_pk(self):
        catalog_entry = CatalogEntry(schema=Schema.from_dict(
            {'properties': {
                'id1': {
                    'type': ['integer']
                }
            }}))
        key_properties = ['id1']
        last_pk_fetched = {'id1': 4}

        expected = '(`id1` > 4)'
        actual = generate_pk_bookmark_clause(key_properties, last_pk_fetched,
                                             catalog_entry)

        self.assertEqual(expected, actual)
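
The implementation of generate_pk_bookmark_clause is not part of this listing. A minimal sketch that satisfies the tests above, assuming values are quoted whenever the catalog schema types the column as a string:

def quote_for_column(catalog_entry, column, value):
    # Quote values for string-typed columns, based on the catalog schema.
    prop = catalog_entry.schema.properties[column]
    types = prop.type if isinstance(prop.type, list) else [prop.type]
    return "'{}'".format(value) if 'string' in types else str(value)


def generate_pk_bookmark_clause(key_properties, last_pk_fetched, catalog_entry):
    assert last_pk_fetched is not None, 'last_pk_fetched must not be None'

    clauses = []
    for i, column in enumerate(key_properties):
        # Pin every earlier key column to its bookmarked value, then
        # require the current column to be strictly greater.
        pinned = [
            '`{}` = {}'.format(c, quote_for_column(catalog_entry, c,
                                                   last_pk_fetched[c]))
            for c in key_properties[:i]
        ]
        greater = '`{}` > {}'.format(
            column,
            quote_for_column(catalog_entry, column, last_pk_fetched[column]))
        clauses.append('({})'.format(' AND '.join(pinned + [greater])))
    return ' OR '.join(clauses)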
Example #11
    def schema(self):
        if self._schema is None:
            raw_schema = self._client.item_schema(self.api)
            if self._config.merge_fields_array:
                # Replace merge fields object with array to make a separate table.
                mf_desc = raw_schema['properties']['merge_fields']['description']
                raw_schema['properties']['merge_fields'] = {
                    'description': mf_desc,
                    'type': 'array',
                    'items': {
                        'type': 'object',
                        'properties': {
                            'merge_id': {
                                'type': 'number'
                            },
                            'tag': {
                                'type': 'string'
                            },
                            'name': {
                                'type': 'string'
                            },
                            'type': {
                                'type': 'string'
                            },
                            'value': {
                                'type': 'string'
                            }
                        }
                    }
                }
            if self._config.interests_array:
                # Replace interest object with array to make a separate table.
                int_desc = raw_schema['properties']['interests']['description']
                raw_schema['properties']['interests'] = {
                    'description': int_desc,
                    'type': 'array',
                    'items': {
                        'type': 'object'
                    }
                }
            self._schema = Schema.from_dict(raw_schema)
        return self._schema
Example #12
    def test_two_pks(self):
        catalog_entry = CatalogEntry(schema=Schema.from_dict({
            'properties': {
                'id1': {
                    'type': ['integer']
                },
                'str': {
                    'type': ['string']
                }
            }
        }))
        key_properties = ['id1', 'str']
        last_pk_fetched = {'id1': 4, 'str': 'apples'}

        expected = '(`id1` > 4) OR (`id1` = 4 AND `str` > \'apples\')'
        actual = generate_pk_bookmark_clause(key_properties, last_pk_fetched,
                                             catalog_entry)

        self.assertEqual(expected, actual)
Example #13
    def test_three_pk_values_with_bookmark(self):
        catalog_entry = CatalogEntry(tap_stream_id='foo',
                                     schema=Schema.from_dict({
                                         'properties': {
                                             'id1': {
                                                 'type': ['integer']
                                             },
                                             'id2': {
                                                 'type': ['string']
                                             },
                                             'id3': {
                                                 'type': ['integer']
                                             }
                                         }
                                     }),
                                     metadata=[{
                                         'breadcrumb': (),
                                         'metadata': {
                                             'table-key-properties':
                                             ['id1', 'id2', 'id3']
                                         }
                                     }])
        state = {
            'bookmarks': {
                'foo': {
                    'last_pk_fetched': {
                        'id1': 4,
                        'id2': 6,
                        'id3': 2
                    },
                    'max_pk_values': {
                        'id1': 10,
                        'id2': 8,
                        'id3': 3
                    }
                }
            }
        }

        expected = ' WHERE ((`id1` > 4) OR (`id1` = 4 AND `id2` > \'6\') OR (`id1` = 4 AND `id2` = \'6\' AND `id3` > 2)) AND `id1` <= 10 AND `id2` <= \'8\' AND `id3` <= 3 ORDER BY `id1`, `id2`, `id3` ASC'
        actual = generate_pk_clause(catalog_entry, state)

        self.assertEqual(expected, actual)
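
generate_pk_clause can be sketched on top of the bookmark clause above: read the key properties from catalog metadata and the bookmark from state, resume after last_pk_fetched, and bound the scan by max_pk_values (same quoting assumption, reusing quote_for_column from the earlier sketch):

from singer import metadata


def generate_pk_clause(catalog_entry, state):
    # Key properties come from the stream's table-key-properties metadata.
    mdata = metadata.to_map(catalog_entry.metadata)
    key_properties = metadata.get(mdata, (), 'table-key-properties') or []

    bookmark = state.get('bookmarks', {}).get(catalog_entry.tap_stream_id, {})
    last_pk_fetched = bookmark.get('last_pk_fetched', {})
    max_pk_values = bookmark.get('max_pk_values', {})

    if not key_properties or not max_pk_values:
        return ''

    # Resume after the bookmark, but never past the max PK values captured
    # when the full-table sync began.
    lower = generate_pk_bookmark_clause(key_properties, last_pk_fetched,
                                        catalog_entry)
    upper = ' AND '.join(
        '`{}` <= {}'.format(
            c, quote_for_column(catalog_entry, c, max_pk_values[c]))
        for c in key_properties)
    order = ', '.join('`{}`'.format(c) for c in key_properties)
    return ' WHERE ({}) AND {} ORDER BY {} ASC'.format(lower, upper, order)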
Example #14
def get_schema_for_table(table):
    """ Given a table object, output its schema
    """
    schema = {"type": "object", "properties": {}, "required": []}
    table_fields = table.get("table_fields", [])

    # Add __id (record id) to every table

    record_id_field = {
        "id": "__id",
        "type": "integer",
        "required": True,
        "is_multiple": False
    }

    table_fields.append(record_id_field)

    for field in table_fields:
        property_schema = {"inclusion": "automatic", "type": []}

        if field["required"]:
            schema["required"].append(field["id"])
        else:
            property_schema["type"].append("null")

        if field["type"] in (STRING_TYPES | DATE_TYPES) \
                or field["is_multiple"]:
            property_schema["type"].append("string")

        if field["type"] in DATE_TYPES:
            property_schema["format"] = "date-time"

        if field["type"] in NUMERIC_TYPES:
            property_schema["type"].append("number")

        if field["type"] in INTEGER_TYPES or field["type"] == "integer":
            property_schema["type"].append("integer")

        schema["properties"][field["id"]] = property_schema

    return Schema.from_dict(schema)
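
The type sets referenced above (STRING_TYPES, DATE_TYPES, NUMERIC_TYPES, INTEGER_TYPES) are defined elsewhere in the tap. A hypothetical usage sketch, with made-up type names and fields:

# Hypothetical type sets; the real constants live elsewhere in the tap.
STRING_TYPES = {"text", "short_text"}
DATE_TYPES = {"date", "datetime"}
NUMERIC_TYPES = {"number", "currency"}
INTEGER_TYPES = {"id"}

table = {
    "table_fields": [
        {"id": "title", "type": "short_text", "required": True,
         "is_multiple": False},
        {"id": "due", "type": "datetime", "required": False,
         "is_multiple": False},
    ]
}

schema = get_schema_for_table(table)
# title -> {"inclusion": "automatic", "type": ["string"]}
# due   -> {"inclusion": "automatic", "type": ["null", "string"],
#           "format": "date-time"}
# __id  -> {"inclusion": "automatic", "type": ["integer"]}
print(schema.to_dict())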
Example #15
    def test_should_not_update_datetime_that_contains_timezone(self):
        record = {
            "id": "5d6ca50762a07c00045125fb",
            "domain": "pageup",
            "created_at": "2019-09-02T05:13:43.151+10:00"
        }
        schema = Schema.from_dict({
            "properties": {
                "created_at": {
                    "format": "date-time",
                    "type": "string"
                },
                "domain": {
                    "type": "string"
                },
                "id": {
                    "type": "string"
                },
            }
        })
        updated_record = Stream.convert_dates_to_rfc3339(record, schema)
        assert updated_record["created_at"] == "2019-09-02T05:13:43.151+10:00"
Example #16
def generate_catalog(
    client,
    report_config,
    standard_fields,
    custom_fields,
    all_cubes,
    cubes_lookup,
    profile_ids,
):
    """
    Generate a catalog entry for each report specified in `report_config`
    """
    catalog_entries = []

    for report in report_config:
        selected_by_default = {
            *report['metrics'][:10], *report.get('dimensions', [])
        }
        schema, mdata = generate_catalog_entry(client, standard_fields,
                                               custom_fields, all_cubes,
                                               cubes_lookup, profile_ids)

        mdata = reduce(
            lambda mdata, field_name: metadata.write(mdata, (
                "properties", field_name), "selected-by-default", True),
            selected_by_default, mdata)

        catalog_entries.append(
            CatalogEntry(
                schema=Schema.from_dict(schema),
                key_properties=['_sdc_record_hash'],
                stream=report['name'],
                tap_stream_id=report['name'],
                metadata=metadata.to_list(mdata),
            ))
    return Catalog(catalog_entries)
Example #17
    def test_should_append_timezone_to_nested_datetime_fields(self):
        record = {
            "id": "5d6ca50762a07c00045125fb",
            "date": "2019-08-13T00:00:00",
            "edited": {
                "author": "Sam Witwicky",
                "date": "2019-10-09T06:14:58.877"
            },
        }
        schema = Schema.from_dict({
            "properties": {
                "edited": {
                    "properties": {
                        "author": {
                            "type": "string"
                        },
                        "date": {
                            "format": "date-time",
                            "type": "string"
                        }
                    },
                    "type": "object",
                },
                "date": {
                    "format": "date-time",
                    "type": "string"
                },
                "id": {
                    "type": "string"
                },
            }
        })
        updated_record = Stream.convert_dates_to_rfc3339(record, schema)
        assert updated_record["date"] == "2019-08-13T00:00:00+00:00"
        assert updated_record["edited"]["date"] == "2019-10-09T06:14:58.877000+00:00"
Example #18
def load_static_schemas(streams):
    """ Load default schemas for all streams
    """
    for stream in streams:
        LOGGER.info('Loading schema for %s', stream.tap_stream_id)
        stream.discovered_schema.update(load_static_schema(stream))


STATIC_STREAMS = "members pipes cards tables".split()

catalog_entries = [
    CatalogEntry(tap_stream_id=stream,
                 stream=stream,
                 key_properties=["id"],
                 schema=Schema.from_dict(load_static_schema(stream)))
    for stream in STATIC_STREAMS
]

CATALOG = Catalog(catalog_entries)

LOGGER.info("There are %s static streams", len(CATALOG.streams))
LOGGER.info("STREAMS: %s", [stream.stream for stream in CATALOG.streams])


Example #19
    def schema(self):
        if self._schema is None:
            self._schema = Schema.from_dict(self._client.item_schema(self.api))
        return self._schema