Beispiel #1
0
def discover():
    """Build a catalog with one entry per endpoint from ``get_endpoints``.

    Returns:
        The Catalog; each entry uses the endpoint name as both stream name
        and tap stream id, and the endpoint config's ``'pk'`` value as its
        key properties.
    """
    result = Catalog([])

    for name, config in get_endpoints():
        raw_schema, mdata = get_schema(name, config)
        parsed_schema = Schema.from_dict(raw_schema)

        entry = CatalogEntry(
            stream=name,
            tap_stream_id=name,
            key_properties=config['pk'],
            schema=parsed_schema,
            metadata=mdata,
        )
        result.streams.append(entry)

    return result
Beispiel #2
0
def resolve_catalog(discovered_catalog, streams_to_sync):
    """Match each stream to sync against the discovered catalog.

    Args:
        discovered_catalog: catalog queried via ``get_stream(tap_stream_id)``.
        streams_to_sync: iterable of catalog entries requested for sync.

    Returns:
        A new Catalog with one entry per stream found in the discovered
        catalog. Each entry's schema holds only the columns returned by
        ``desired_columns`` for the selected properties (the replication key
        is always treated as selected). Streams missing from the discovered
        catalog are logged and skipped.
    """
    resolved = Catalog(streams=[])

    for entry in streams_to_sync:
        mdata_map = metadata.to_map(entry.metadata)
        # Stream-level metadata lives under the empty breadcrumb.
        replication_key = mdata_map.get((), {}).get("replication-key")

        discovered = discovered_catalog.get_stream(entry.tap_stream_id)
        database_name = common.get_database_name(entry)

        if not discovered:
            LOGGER.warning(
                "Database %s table %s was selected but does not exist",
                database_name,
                entry.table,
            )
            continue

        selected = set()
        for column, _ in discovered.schema.properties.items():
            if (common.property_is_selected(entry, column)
                    or column == replication_key):
                selected.add(column)

        # Columns that actually need to be queried.
        columns = desired_columns(selected, discovered.schema)
        narrowed_schema = Schema(
            type="object",
            properties={
                name: discovered.schema.properties[name]
                for name in columns
            },
        )

        resolved.streams.append(
            CatalogEntry(
                tap_stream_id=entry.tap_stream_id,
                metadata=entry.metadata,
                stream=entry.tap_stream_id,
                table=entry.table,
                schema=narrowed_schema,
            ))

    return resolved
Beispiel #3
0
def discover(service):
    """Build a catalog with one entry per entity exposed by ``service``.

    Args:
        service: object whose ``entities`` mapping is iterated; each entity
            provides an ``__odata_schema__`` attribute.

    Returns:
        The Catalog; each entry carries the schema, metadata and primary
        keys returned by ``get_schema`` for that entity.
    """
    result = Catalog([])

    for name, entity in service.entities.items():
        optionsets = get_optionset_metadata(service, name)
        raw_schema, mdata, primary_keys = get_schema(
            entity.__odata_schema__, optionsets)

        entry = CatalogEntry(
            stream=name,
            tap_stream_id=name,
            key_properties=primary_keys,
            schema=Schema.from_dict(raw_schema),
            metadata=mdata,
        )
        result.streams.append(entry)

    return result
Beispiel #4
0
def discover(client):
    """Build a catalog with one entry per resource listed in ``RESOURCES``.

    Args:
        client: passed through to ``get_schema`` for each resource.

    Returns:
        The Catalog; each entry is named after the ``RESOURCES`` value for
        the resource and uses ``['Id']`` as its key properties.
    """
    result = Catalog([])

    for resource_name, stream_name in RESOURCES.items():
        raw_schema, mdata = get_schema(client, resource_name)
        parsed_schema = Schema.from_dict(raw_schema)

        entry = CatalogEntry(
            stream=stream_name,
            tap_stream_id=stream_name,
            key_properties=['Id'],
            schema=parsed_schema,
            metadata=mdata,
        )
        result.streams.append(entry)

    return result
Beispiel #5
0
def discover(client):
    """Build a catalog from the schemas returned by ``get_schemas(client)``.

    Args:
        client: passed through to ``get_schemas``.

    Returns:
        The Catalog; each entry pairs a stream's schema with its metadata
        and the key properties returned by ``get_pk`` for that stream.
    """
    all_schemas, all_metadata = get_schemas(client)
    result = Catalog([])

    for name, raw_schema in all_schemas.items():
        parsed_schema = Schema.from_dict(raw_schema)
        mdata = all_metadata[name]
        primary_key = get_pk(name)

        entry = CatalogEntry(
            stream=name,
            tap_stream_id=name,
            key_properties=primary_key,
            schema=parsed_schema,
            metadata=mdata,
        )
        result.streams.append(entry)

    return result
Beispiel #6
0
def discover():
    """Build a catalog from the schemas returned by ``get_schemas``.

    Returns:
        A Catalog with one entry per schema; key properties are read from
        ``STREAMS[<schema name>]['key_properties']``.
    """
    all_schemas, all_metadata = get_schemas()

    entries = []
    for name, raw_schema in all_schemas.items():
        parsed_schema = Schema.from_dict(raw_schema)
        mdata = all_metadata[name]

        entry = CatalogEntry(
            tap_stream_id=name,
            stream=name,
            schema=parsed_schema,
            key_properties=STREAMS[name]['key_properties'],
            metadata=mdata,
        )
        entries.append(entry)

    return Catalog(entries)
Beispiel #7
0
def add_stream_to_catalog(catalog, ctx, stream):
    """Append a catalog entry for ``stream`` to ``catalog.streams``.

    The entry's metadata starts from ``metadata.get_standard_metadata`` and
    then marks every property in the loaded schema with inclusion
    ``'automatic'``.

    Args:
        catalog: catalog whose ``streams`` list is extended in place.
        ctx: passed through to ``load_schema``.
        stream: provides ``tap_stream_id`` and ``pk_fields``.
    """
    raw_schema = load_schema(ctx, stream.tap_stream_id)
    parsed_schema = Schema.from_dict(raw_schema)

    standard_mdata = metadata.get_standard_metadata(
        raw_schema, key_properties=stream.pk_fields)
    mdata_map = metadata.to_map(standard_mdata)

    for field_name in raw_schema['properties']:
        breadcrumb = ('properties', field_name)
        mdata_map = metadata.write(
            mdata_map, breadcrumb, 'inclusion', 'automatic')

    catalog.streams.append(
        CatalogEntry(
            stream=stream.tap_stream_id,
            tap_stream_id=stream.tap_stream_id,
            key_properties=stream.pk_fields,
            schema=parsed_schema,
            metadata=metadata.to_list(mdata_map),
        ))
Beispiel #8
0
def discover():
    """Build a catalog with one entry per stream in the ``streams`` global.

    For each stream the metadata list contains exactly one stream-level
    entry (empty breadcrumb), followed by one entry per schema property.
    Key properties and the ``start_date`` property get inclusion
    ``'automatic'``; all other properties get ``'available'``.

    Returns:
        A Catalog wrapping the generated entries.
    """
    entries = []

    for stream in streams:
        schema = Schema.from_dict(stream.get_schema())
        key_properties = stream.key_properties

        # The stream-level entry is emitted once, before the property loop.
        # Building it inside the loop duplicated it for every property and
        # dropped it entirely for schemas without properties.
        stream_metadata = [{
            'breadcrumb': [],
            'metadata': {
                'inclusion': 'available',
                'table-key-properties': key_properties,
                'schema-name': stream.tap_stream_id,
                'selected': True,
            }
        }]

        for prop in schema.properties:
            if prop in key_properties or prop == 'start_date':
                inclusion = 'automatic'
            else:
                inclusion = 'available'

            stream_metadata.append({
                'breadcrumb': ['properties', prop],
                'metadata': {
                    'inclusion': inclusion
                }
            })

        entries.append(
            CatalogEntry(
                tap_stream_id=stream.tap_stream_id,
                stream=stream.tap_stream_id,
                schema=schema,
                key_properties=key_properties,
                metadata=stream_metadata,
                replication_key=None,
                is_view=None,
                database=None,
                table=None,
                row_count=None,
                stream_alias=None,
                replication_method=None,
            )
        )
    return Catalog(entries)
def discover_streams(service, config):
    """Build a catalog dict with one stream per report of the profile.

    Args:
        service: API client; ``service.reports().list(...).execute()`` is
            called to fetch the reports.
        config: mapping read for ``'profile_id'``.

    Returns:
        ``catalog.to_dict()`` for a catalog whose entries use the sanitized
        report name as stream name and ``<name>_<report id>`` as tap stream
        id. Every property is marked with inclusion ``'automatic'``.
    """
    profile_id = config.get('profile_id')

    response = service.reports().list(profileId=profile_id).execute()
    reports = sorted(response.get('items'), key=lambda x: x['id'])

    # Keyed by (stream name, tap stream id); a later report with the same
    # key replaces an earlier one.
    report_configs = {}
    for report in reports:
        name = sanitize_name(report['name'])
        stream_id = '{}_{}'.format(name, report['id'])
        report_configs[(name, stream_id)] = report

    field_type_lookup = get_field_type_lookup()
    catalog = Catalog([])

    for (stream_name, tap_stream_id), report in report_configs.items():
        fieldmap = get_fields(field_type_lookup, report)
        raw_schema = get_schema(stream_name, fieldmap)
        parsed_schema = Schema.from_dict(raw_schema)

        # Stream-level entry records which report the stream came from.
        mdata = [{
            'metadata': {
                'tap-doubleclick-campaign-manager.report-id': report['id']
            },
            'breadcrumb': []
        }]
        mdata.extend(
            {
                'metadata': {
                    'inclusion': 'automatic'
                },
                'breadcrumb': ['properties', prop]
            }
            for prop in raw_schema['properties']
        )

        catalog.streams.append(
            CatalogEntry(
                stream=stream_name,
                stream_alias=stream_name,
                tap_stream_id=tap_stream_id,
                key_properties=[],
                schema=parsed_schema,
                metadata=mdata,
            ))

    return catalog.to_dict()
Beispiel #10
0
def discover(ctx):
    """Build a catalog for every stream id in ``streams_.stream_ids``.

    The event-log stream is left out when ``has_access_to_event_log(ctx)``
    is falsy.

    Args:
        ctx: passed through to the access check and to ``load_schema``.

    Returns:
        The Catalog; schemas are loaded with ``inclusion="automatic"`` and
        key properties come from ``schemas.PK_FIELDS``.
    """
    LOGGER.info("Running discover")
    event_log_allowed = has_access_to_event_log(ctx)
    result = Catalog([])

    for stream_id in streams_.stream_ids:
        is_event_log = stream_id == schemas.IDS.EVENT_LOG
        if is_event_log and not event_log_allowed:
            continue

        loaded = schemas.load_schema(ctx, stream_id)
        parsed_schema = Schema.from_dict(loaded, inclusion="automatic")
        entry = CatalogEntry(
            stream=stream_id,
            tap_stream_id=stream_id,
            key_properties=schemas.PK_FIELDS[stream_id],
            schema=parsed_schema,
        )
        result.streams.append(entry)

    return result
Beispiel #11
0
def discover() -> Catalog:  # noqa: WPS210
    """Build the stream catalog from the schemas on disk.

    Every stream uses ``['id']`` as its key properties; replication keys and
    replication method are not configured here and are passed as ``None``.

    Returns:
        Catalog -- one entry per schema returned by ``load_schemas``
    """
    raw_schemas: dict = load_schemas()
    entries: list = []

    for stream_id, schema in raw_schemas.items():
        # Per-stream settings; only the key properties are defined, so the
        # remaining lookups below all resolve to None.
        stream_meta: dict = {'key_properties': ['id']}
        key_properties = stream_meta.get('key_properties')
        replication_method = stream_meta.get('replication_method')

        mdata: list = metadata.get_standard_metadata(
            schema=schema.to_dict(),
            key_properties=key_properties,
            valid_replication_keys=stream_meta.get('replication_keys'),
            replication_method=replication_method,
        )

        entry = CatalogEntry(
            tap_stream_id=stream_id,
            stream=stream_id,
            schema=schema,
            key_properties=key_properties,
            metadata=mdata,
            replication_key=stream_meta.get('replication_key'),
            replication_method=replication_method,
        )
        entries.append(entry)

    return Catalog(entries)
Beispiel #12
0
def discover(ctx):
    """Refresh credentials, then build a catalog for ``streams.all_streams``.

    Args:
        ctx: context whose ``refresh_credentials()`` is called first.

    Returns:
        The Catalog; each entry carries the schema from
        ``load_correct_schema`` and the metadata from ``load_metadata``.
    """
    ctx.refresh_credentials()
    result = Catalog([])

    for stream in streams.all_streams:
        stream_id = stream.tap_stream_id
        raw_schema = load_correct_schema(stream_id)
        mdata = load_metadata(stream, raw_schema)
        parsed_schema = Schema.from_dict(raw_schema)

        entry = CatalogEntry(
            stream=stream.tap_stream_id,
            tap_stream_id=stream.tap_stream_id,
            key_properties=stream.pk_fields,
            schema=parsed_schema,
            metadata=mdata,
        )
        result.streams.append(entry)

    return result
Beispiel #13
0
def discover():
    """Build a catalog from ``load_schemas`` using generated metadata.

    Returns:
        A Catalog with one entry per schema. Key properties are read back
        from the stream-level ``'key_properties'`` value of the metadata
        produced by ``gen_metadata``.
    """
    entries = []

    for stream_id, schema in load_schemas().items():
        stream_metadata = gen_metadata(stream_id, schema, BASE_METADATA)

        # The empty breadcrumb addresses the stream-level metadata.
        mdata_map = metadata.to_map(stream_metadata)
        key_properties = metadata.get(mdata_map, (), 'key_properties')

        entry = CatalogEntry(
            tap_stream_id=stream_id,
            stream=stream_id,
            schema=schema,
            key_properties=key_properties,
            metadata=stream_metadata,
        )
        entries.append(entry)

    return Catalog(entries)
Beispiel #14
0
def discover():
    """Build a catalog from the schemas returned by ``get_schemas``.

    Returns:
        The Catalog; each entry's key properties come from
        ``STREAMS[<stream name>]['key_properties']``.
    """
    all_schemas, all_metadata = get_schemas()
    result = Catalog([])

    for name, raw_schema in all_schemas.items():
        parsed_schema = Schema.from_dict(raw_schema)
        stream_mdata = all_metadata[name]

        entry = CatalogEntry(
            stream=name,
            tap_stream_id=name,
            key_properties=STREAMS[name]['key_properties'],
            schema=parsed_schema,
            metadata=stream_mdata,
        )
        result.streams.append(entry)

    return result
def mock_catalog():
    """Return a Catalog holding a single selected ``records`` stream.

    The stream's schema is parsed from the ``SCHEMA`` JSON string; it is
    keyed on ``id`` and replicated incrementally on ``date``.
    """
    selected_metadata = [{
        "breadcrumb": [],
        "metadata": {
            "selected": True
        }
    }]
    records_entry = CatalogEntry(
        tap_stream_id="records",
        stream="records",
        schema=Schema.from_dict(json.loads(SCHEMA)),
        key_properties=["id"],
        metadata=selected_metadata,
        replication_key="date",
        replication_method="INCREMENTAL",
    )
    return Catalog(streams=[records_entry])
Beispiel #16
0
    def test_insights_start_dates(self):
        """Check the first and fifth time ranges yielded by ``job_params``.

        With a ``date_start`` bookmark of 2017-01-31, the first job is
        expected to cover 2017-01-03 and the fifth 2017-01-07, each as a
        single-day range.
        """
        entry = CatalogEntry(
            schema={'properties': {'something': {'type': 'object'}}},
            metadata=[{
                'breadcrumb': ('properties', 'something'),
                'metadata': {'selected': True},
            }],
        )
        bookmark_state = {
            'bookmarks': {'insights': {'date_start': '2017-01-31'}},
        }
        insights = AdsInsights(
            name='insights',
            account=None,
            stream_alias="insights",
            options={},
            catalog_entry=entry,
            state=bookmark_state,
        )

        first_five = list(itertools.islice(insights.job_params(), 5))

        for index, day in ((0, '2017-01-03'), (4, '2017-01-07')):
            self.assertEqual(first_five[index]['time_ranges'],
                             [{'since': day, 'until': day}])
def discover(config):
    """Build a catalog with one entry per stream in ``STREAM_OBJECTS``.

    Args:
        config: passed to ``Client``.

    Returns:
        A Catalog whose entries carry standard metadata built from each
        stream's key properties, replication keys and replication method.
    """
    # The client is constructed but not used afterwards.
    # NOTE(review): presumably kept for validation side effects - confirm.
    client = Client(config)

    entries = []
    for stream in STREAM_OBJECTS.values():
        raw_schema = load_schema(stream.tap_stream_id)
        parsed_schema = Schema.from_dict(raw_schema)
        entries.append(
            CatalogEntry(
                stream=stream.tap_stream_id,
                tap_stream_id=stream.tap_stream_id,
                key_properties=stream.pk_fields,
                schema=parsed_schema,
                metadata=metadata.get_standard_metadata(
                    schema=raw_schema,
                    schema_name=stream.tap_stream_id,
                    key_properties=stream.pk_fields,
                    valid_replication_keys=stream.replication_keys,
                    replication_method=stream.replication_method,
                ),
            ))

    return Catalog(entries)
Beispiel #18
0
def get_dynamic_streams():
    """Get dynamic table schemas.

    Fetches the organization named by ``CONFIG["organization_id"]`` and
    creates one catalog entry per table node found under
    ``tables -> edges -> node``.

    Returns:
        List of CatalogEntry objects, one per table. Each uses
        ``table_<id>`` as both stream name and tap stream id and ``__id`` as
        its key property. The list is empty when the organization has no
        tables.
    """
    org = get_organization(CONFIG["organization_id"])

    # "tables" is read as a mapping with an "edges" list. The fallback must
    # be a mapping too: a list default has no .get() and would raise
    # AttributeError when the key is missing. Null values are treated the
    # same as missing ones.
    tables_payload = org.pop("tables", None) or {}
    tables = [edge["node"] for edge in tables_payload.get("edges") or []]

    entries = []
    for table in tables:
        stream = "table_{}".format(table["id"])
        entries.append(
            CatalogEntry(tap_stream_id=stream,
                         stream=stream,
                         key_properties=["__id"],
                         schema=get_schema_for_table(table)))

    LOGGER.info("There are %s tables (dynamic schemas)", len(tables))
    return entries
Beispiel #19
0
    def test_two_pks(self):
        """Bookmark clause for a two-column key (integer, then string)."""
        properties = {
            'id1': {'type': ['integer']},
            'str': {'type': ['string']},
        }
        entry = CatalogEntry(
            schema=Schema.from_dict({'properties': properties}))

        pk_columns = ['id1', 'str']
        bookmark = {'id1': 4, 'str': 'apples'}

        clause = generate_pk_bookmark_clause(pk_columns, bookmark, entry)

        self.assertEqual(
            "(`id1` > 4) OR (`id1` = 4 AND `str` > 'apples')",
            clause,
        )
Beispiel #20
0
    def test_three_pk_values_with_bookmark(self):
        """PK clause for a three-column key with a stored bookmark.

        The expected clause resumes after ``last_pk_fetched``, is bounded by
        ``max_pk_values`` and orders by all three key columns.
        """
        properties = {
            'id1': {'type': ['integer']},
            'id2': {'type': ['string']},
            'id3': {'type': ['integer']},
        }
        stream_metadata = [{
            'breadcrumb': (),
            'metadata': {
                'table-key-properties': ['id1', 'id2', 'id3'],
            },
        }]
        entry = CatalogEntry(
            tap_stream_id='foo',
            schema=Schema.from_dict({'properties': properties}),
            metadata=stream_metadata,
        )

        foo_bookmark = {
            'last_pk_fetched': {'id1': 4, 'id2': 6, 'id3': 2},
            'max_pk_values': {'id1': 10, 'id2': 8, 'id3': 3},
        }
        state = {'bookmarks': {'foo': foo_bookmark}}

        expected = (
            " WHERE ((`id1` > 4) OR (`id1` = 4 AND `id2` > '6') "
            "OR (`id1` = 4 AND `id2` = '6' AND `id3` > 2)) "
            "AND `id1` <= 10 AND `id2` <= '8' AND `id3` <= 3 "
            "ORDER BY `id1`, `id2`, `id3` ASC"
        )

        self.assertEqual(expected, generate_pk_clause(entry, state))
Beispiel #21
0
def discover():
    """Build a catalog from ``get_schemas``, logging each stream.

    Returns:
        The Catalog; key properties are looked up in the mapping returned by
        ``flatten_streams`` and are ``None`` for streams not listed there.
    """
    all_schemas, all_metadata = get_schemas()
    result = Catalog([])
    stream_configs = flatten_streams()

    for name, raw_schema in all_schemas.items():
        LOGGER.info('discover schema for stream: {}'.format(name))
        parsed_schema = Schema.from_dict(raw_schema)
        stream_mdata = all_metadata[name]
        key_properties = stream_configs.get(name, {}).get('key_properties')

        result.streams.append(
            CatalogEntry(
                stream=name,
                tap_stream_id=name,
                key_properties=key_properties,
                schema=parsed_schema,
                metadata=stream_mdata,
            ))

    LOGGER.info('Returning catalog: {}'.format(result))
    return result
Beispiel #22
0
def tabsInfo(sheetsService, row):
    """Create one catalog entry per tab of the spreadsheet in ``row``.

    Names are lower-cased with spaces removed. The tap stream id is the
    ``?``-joined sequence sheet id, sheet name, tab index, tab name and
    ``<sheet name>_<tab name>``.

    Args:
        sheetsService: passed to ``makeRequestWithExponentialBackoff``.
        row: mapping with the spreadsheet's ``'id'`` and ``'name'``.

    Returns:
        List of CatalogEntry objects in tab order.
    """
    with rate_limiter:
        response = makeRequestWithExponentialBackoff(sheetsService, row)

    entries = []
    for position, tab in enumerate(response["sheets"]):
        sheet_id = row['id']
        sheet_name = row['name'].lower().replace(" ", "")
        tab_index = str(position)
        tab_name = tab["properties"]["title"].lower().replace(" ", "")

        stream_id = "?".join([
            sheet_id,
            sheet_name,
            tab_index,
            tab_name,
            sheet_name + "_" + tab_name,
        ])
        entries.append(
            CatalogEntry(
                tap_stream_id=stream_id,
                stream=tab_name,
                database=sheet_name + '&' + row['id'],
                table=tab_name + '&' + tab_index,
            ))

    return entries
Beispiel #23
0
def discover(config):
    """Build a catalog for the streams in ``streams_.all_streams``.

    Credentials are checked first. The account stream is left out when
    ``is_account_endpoint_authorized`` returns a falsy value.

    Args:
        config: passed to ``Client``.

    Returns:
        The Catalog; schemas are loaded with ``inclusion="automatic"``.
    """
    client = Client(config)
    ensure_credentials_are_authorized(client)
    account_allowed = is_account_endpoint_authorized(client)

    result = Catalog([])
    for stream in streams_.all_streams:
        is_account = (
            stream.tap_stream_id == streams_.ACCOUNT.tap_stream_id)
        if is_account and not account_allowed:
            continue

        raw_schema = load_schema(stream.tap_stream_id)
        parsed_schema = Schema.from_dict(raw_schema, inclusion="automatic")
        result.streams.append(
            CatalogEntry(
                stream=stream.tap_stream_id,
                tap_stream_id=stream.tap_stream_id,
                key_properties=stream.pk_fields,
                schema=parsed_schema,
            ))

    return result
Beispiel #24
0
    def discover_base(cls, base_id, base_name=None):
        """Fetch a base's table metadata and build a catalog entry per table.

        Args:
            base_id: appended to ``cls.metadata_url`` to form the request URL.
            base_name: used as the entry's ``database`` when truthy;
                otherwise ``base_id`` is used.

        Returns:
            List of CatalogEntry objects, one per table in the response.
            Fields whose column schema has inclusion ``"automatic"`` become
            the entry's key properties.

        Raises:
            Whatever ``response.raise_for_status()`` raises on an HTTP error.
        """
        cls.logger.info("discover base " + base_id)
        auth_headers = cls.__get_auth_header()
        response = requests.get(url=cls.metadata_url + base_id,
                                headers=auth_headers)
        response.raise_for_status()

        entries = []
        for table in response.json()["tables"]:
            # Every table schema starts with an "id" column.
            columns = {
                "id": Schema(inclusion="automatic", type=['null', "string"])
            }
            table_name = table["name"]
            key_fields = []

            mdata = metadata.write({}, (), "inclusion", "available")
            mdata = metadata.write(mdata, 'database_name', 'base_id', base_id)

            for field in table["fields"]:
                field_name = field["name"]
                column = cls.column_schema(field)
                if column.inclusion == "automatic":
                    key_fields.append(field_name)
                columns[field_name] = column

                breadcrumb = ('properties', field_name)
                mdata = metadata.write(mdata, breadcrumb,
                                       'inclusion', 'available')
                mdata = metadata.write(mdata, breadcrumb, 'airtable_type',
                                       field["config"]["type"] or None)

            table_schema = Schema(type='object', properties=columns)
            entries.append(
                CatalogEntry(
                    tap_stream_id=table["id"],
                    database=base_name or base_id,
                    table=table_name,
                    stream=table_name,
                    metadata=metadata.to_list(mdata),
                    key_properties=key_fields,
                    schema=table_schema,
                ))

        return entries
Beispiel #25
0
def discover():
    """Build a catalog from ``get_schemas`` with stream-level metadata only.

    Returns:
        The Catalog; each entry has empty key properties and one metadata
        record (empty breadcrumb) holding the stream's field metadata, or an
        empty dict when none is registered.
    """
    all_schemas, all_metadata = get_schemas()
    result = Catalog([])

    for name, raw_schema in all_schemas.items():
        parsed_schema = Schema.from_dict(raw_schema)
        stream_level = {
            "metadata": all_metadata.get(name, {}),
            "breadcrumb": []
        }

        result.streams.append(
            CatalogEntry(
                stream=name,
                tap_stream_id=name,
                key_properties=[],
                schema=parsed_schema,
                metadata=[stream_level],
            ))

    return result
Beispiel #26
0
def discover():
    """Build a catalog from ``get_schemas`` with every stream selected.

    Each stream is keyed on ``id``. Its metadata holds one stream-level
    record followed by one record per pair yielded by the stream's field
    metadata; the ``id`` property gets inclusion ``'automatic'``, all others
    ``'available'``.

    Returns:
        The Catalog with one entry per schema.
    """
    all_schemas, all_metadata = get_schemas()
    result = Catalog([])

    for name, raw_schema in all_schemas.items():
        parsed_schema = Schema.from_dict(raw_schema)
        field_pairs = all_metadata[name]

        # Assume an "id" key for each stream for now.
        key_properties = ['id']

        # Stream-level record; selects the stream by default.
        mdata = [{
            'metadata': {
                'inclusion': 'available',
                'table-key-properties': key_properties,
                'selected': True,
                'schema-name': name
            },
            'breadcrumb': []
        }]

        # NOTE(review): this unpacks (property, schema) pairs straight from
        # the field metadata - confirm get_schemas() returns that shape.
        for prop, _ in field_pairs:
            inclusion = 'automatic' if prop in key_properties else 'available'
            mdata.append({
                'breadcrumb': ['properties', prop],
                'metadata': {
                    'inclusion': inclusion
                }
            })

        result.streams.append(
            CatalogEntry(
                stream=name,
                tap_stream_id=name,
                key_properties=key_properties,
                schema=parsed_schema,
                metadata=mdata,
            ))

    return result
Beispiel #27
0
    def do_discover(self):
        """Build a catalog with one entry per stream in ``self.streams``.

        Each stream gets ``self`` assigned as its ``tap``. Properties that
        are key properties, or that equal the stream's ``state_field``, get
        inclusion ``"automatic"``; all others get ``"available"``.

        Returns:
            The Catalog; entries use ``stream.schema`` as both stream name
            and tap stream id.
        """
        logger.info("Starting discover")

        result = Catalog([])

        for stream in self.streams:
            stream.tap = self

            parsed_schema = Schema.from_dict(stream.get_schema())
            key_properties = stream.key_properties

            # Stream-level record; selects the stream by default.
            mdata = [{
                "metadata": {
                    "inclusion": "available",
                    "table-key-properties": ["id"],
                    "selected": True,
                    "schema-name": stream.get_name()
                },
                "breadcrumb": []
            }]

            for prop, _ in parsed_schema.properties.items():
                is_automatic = prop in key_properties or (
                    stream.state_field and prop == stream.state_field)
                mdata.append({
                    "breadcrumb": ["properties", prop],
                    "metadata": {
                        "inclusion":
                        "automatic" if is_automatic else "available"
                    },
                })

            entry = CatalogEntry(
                stream=stream.schema,
                tap_stream_id=stream.schema,
                key_properties=key_properties,
                schema=parsed_schema,
                metadata=mdata,
            )
            result.streams.append(entry)

        return result
Beispiel #28
0
def discover():
    """Build a catalog from the schemas returned by ``get_schemas``.

    Returns:
        The Catalog; each entry's key properties come from
        ``STREAMS[<stream name>]['key_properties']``.
    """
    all_schemas, all_metadata = get_schemas()
    result = Catalog([])

    for name, raw_schema in all_schemas.items():
        parsed_schema = Schema.from_dict(raw_schema)
        stream_mdata = all_metadata[name]
        key_properties = STREAMS[name]['key_properties']

        entry = CatalogEntry(
            stream=name,
            tap_stream_id=name,
            key_properties=key_properties,
            schema=parsed_schema,
            metadata=stream_mdata,
        )
        result.streams.append(entry)

    return result


# def discover():
#     raw_schemas = load_schemas()
#     streams = []
#     for stream_id, schema in raw_schemas.items():
#         # TODO: populate any metadata and stream's key properties here..
#         stream_metadata = []
#         key_properties = []
#         streams.append(
#             CatalogEntry(
#                 tap_stream_id=stream_id,
#                 stream=stream_id,
#                 schema=schema,
#                 key_properties=key_properties,
#                 metadata=stream_metadata,
#                 replication_key=None,
#                 is_view=None,
#                 database=None,
#                 table=None,
#                 row_count=None,
#                 stream_alias=None,
#                 replication_method=None,
#             )
#         )
#     return Catalog(streams)
Beispiel #29
0
def discover():
    """Build a catalog from ``get_schemas`` with standard metadata.

    Returns:
        A Catalog with one entry per schema. Key properties, valid
        replication keys and replication method are read from
        ``STREAMS[<schema name>]``.
    """
    entries = []

    for name, raw_schema in get_schemas().items():
        parsed_schema = Schema.from_dict(raw_schema)
        stream_config = STREAMS[name]
        mdata = metadata.get_standard_metadata(
            schema=raw_schema,
            schema_name=name,
            key_properties=stream_config['key_properties'],
            valid_replication_keys=stream_config['replication_keys'],
            replication_method=stream_config['replication_method'],
        )

        entry = CatalogEntry(
            tap_stream_id=name,
            stream=name,
            schema=parsed_schema,
            key_properties=stream_config['key_properties'],
            metadata=mdata,
        )
        entries.append(entry)

    return Catalog(entries)
def discover(client):
    """Build a catalog from the schemas returned by ``get_schemas(client)``.

    Args:
        client: passed through to ``get_schemas``.

    Returns:
        The Catalog; key properties come from ``STREAMS`` when the stream is
        listed there with a ``'key_properties'`` value, and otherwise fall
        back to ``['dimensions_hash_key', 'date']``.
    """
    all_schemas, all_metadata = get_schemas(client)
    result = Catalog([])

    for name, raw_schema in all_schemas.items():
        parsed_schema = Schema.from_dict(raw_schema)
        stream_mdata = all_metadata[name]

        stream_config = STREAMS.get(name, {})
        key_properties = stream_config.get(
            'key_properties', ['dimensions_hash_key', 'date'])

        entry = CatalogEntry(
            stream=name,
            tap_stream_id=name,
            key_properties=key_properties,
            schema=parsed_schema,
            metadata=stream_mdata,
        )
        result.streams.append(entry)

    return result