Example #1

def setUp(self):
        self.catalog = Catalog.load('test_catalog.json')

        with open('test_state.json') as f:
            self.state = json.load(f)

        self.start_date = "2019-11-12T06:47:14.000000Z"
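
The fixture above assumes these imports (Catalog is singer-python's catalog class) and must live inside a unittest.TestCase subclass:

import json
import unittest

from singer.catalog import Catalog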
Example #2

def main():

    # DEBUG
    # logger.setLevel('DEBUG')

    # Parse command line arguments
    args = utils.parse_args(REQUIRED_CONFIG_KEYS)
    config = args.config

    # create instance of BigCommerce client
    bigcommerce = BigCommerce(client_id=config['client_id'],
                              access_token=config['access_token'],
                              store_hash=config['store_hash'])

    # If discover flag was passed, run discovery mode and dump output to stdout
    if args.discover:
        do_discover(bigcommerce)
    # Otherwise run in sync mode
    else:
        if args.catalog:
            catalog = args.catalog
        else:
            catalog = Catalog.from_dict(discover_streams(bigcommerce))

        do_sync(client=bigcommerce,
                catalog=catalog,
                state=args.state,
                start_date=config['start_date'])
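
main() is normally wired to the module entry point; a minimal sketch, assuming the module above:

# Minimal entry point for the tap module sketched above.
if __name__ == '__main__':
    main()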
Example #3
def discover(client):
    LOGGER.info("Starting discovery mode")
    raw_schemas = _load_schemas(client)
    streams = []

    for stream_name, schema in raw_schemas.items():
        # create and add catalog entry
        stream = STREAM_OBJECTS[stream_name]
        catalog_entry = {
            "stream": stream_name,
            "tap_stream_id": stream_name,
            "schema": schema,
            "metadata": metadata.get_standard_metadata(
                schema=schema,
                key_properties=stream.key_properties,
                valid_replication_keys=stream.replication_keys,
                replication_method=stream.replication_method,
            ),
            "key_properties": stream.key_properties,
        }
        streams.append(catalog_entry)

    return Catalog.from_dict({"streams": streams})
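
The returned Catalog is what discovery mode emits on stdout; a minimal sketch of a call site, assuming a constructed client (Catalog.dump() serializes the catalog as JSON to stdout):

def do_discover(client):
    # Emit the discovered catalog on stdout, per the singer spec.
    discover(client).dump()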
Example #4
def sync(client: RechargeClient, config: dict, state: dict,
         catalog: Catalog) -> dict:
    """Sync data from tap source"""

    with Transformer() as transformer:
        for stream in catalog.get_selected_streams(state):
            tap_stream_id = stream.tap_stream_id
            stream_obj = STREAMS[tap_stream_id](client)
            stream_schema = stream.schema.to_dict()
            stream_metadata = metadata.to_map(stream.metadata)

            LOGGER.info('Starting sync for stream: %s', tap_stream_id)

            state = singer.set_currently_syncing(state, tap_stream_id)
            singer.write_state(state)

            singer.write_schema(tap_stream_id, stream_schema,
                                stream_obj.key_properties,
                                stream.replication_key)

            state = stream_obj.sync(state, stream_schema, stream_metadata,
                                    config, transformer)
            singer.write_state(state)

    state = singer.set_currently_syncing(state, None)
    singer.write_state(state)
    return state
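
Each value in STREAMS is assumed to expose key_properties and a sync(...) method matching the calls above; a hypothetical minimal stream illustrating that interface (names and endpoint are illustrative, not RechargeClient's real API):

import singer


class Customers:
    # Hypothetical stream; identifiers here are illustrative only.
    tap_stream_id = 'customers'
    key_properties = ['id']

    def __init__(self, client):
        self.client = client

    def sync(self, state, schema, stream_metadata, config, transformer):
        # Transform each raw record against the schema/metadata and emit it.
        for record in self.client.get('customers'):
            record = transformer.transform(record, schema, stream_metadata)
            singer.write_record(self.tap_stream_id, record)
        return state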
Example #5

def generate_catalog(client, standard_fields, custom_fields, all_cubes,
                     cubes_lookup, profile_id):
    schema, mdata = generate_catalog_entry(client, standard_fields,
                                           custom_fields, all_cubes,
                                           cubes_lookup, profile_id)
    # Build the schema and metadata for the single 'report' stream
    catalog_entry = CatalogEntry(schema=Schema.from_dict(schema),
                                 key_properties=['_sdc_record_hash'],
                                 stream='report',
                                 tap_stream_id='report',
                                 metadata=metadata.to_list(mdata))
    return Catalog([catalog_entry])
Example #6
def generate_catalog(client, report_config, standard_fields, custom_fields,
                     all_cubes, cubes_lookup, profile_ids):
    """
    Generate a catalog entry for each report specified in `report_config`
    """
    catalog_entries = []
    # for report in PREMADE_REPORTS:
    for report in report_config:
        # make the report name BigQuery-safe (spaces -> underscores, lowercase)
        report['name'] = report['name'].replace(' ', '_').lower()

        metrics_dimensions = set(report['metrics'] + report['dimensions'])
        selected_by_default = {
            *report['metrics'][:10],  # Use first 10 metrics in definition
            *report.get('default_dimensions', [])
        }
        premade_fields = [
            field for field in standard_fields
            if field['id'] in metrics_dimensions
        ]
        schema, mdata = generate_premade_catalog_entry(premade_fields,
                                                       all_cubes, cubes_lookup)

        mdata = reduce(
            lambda mdata, field_name: metadata.write(mdata, (
                "properties", field_name), "selected-by-default", True),
            selected_by_default, mdata)

        catalog_entries.append(
            CatalogEntry(schema=Schema.from_dict(schema),
                         key_properties=['_sdc_record_hash'],
                         stream=report['name'],
                         tap_stream_id=report['name'],
                         metadata=metadata.to_list(mdata)))

    # This block is disabled: iterating over an empty list means no
    # additional catalog entries are generated here.
    # for report in report_config:
    for report in []:
        schema, mdata = generate_catalog_entry(client, standard_fields,
                                               custom_fields, all_cubes,
                                               cubes_lookup, profile_ids)

        catalog_entries.append(
            CatalogEntry(schema=Schema.from_dict(schema),
                         key_properties=['_sdc_record_hash'],
                         stream=report['name'],
                         tap_stream_id=report['id'],
                         metadata=metadata.to_list(mdata)))
    return Catalog(catalog_entries)
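
The reduce over selected_by_default above (repeated in Example #10 below) is just a fold over metadata.write; an equivalent explicit loop:

from singer import metadata


def mark_selected_by_default(mdata, field_names):
    # Write selected-by-default: True onto each named property's metadata,
    # so those fields sync even without explicit selection.
    for field_name in field_names:
        mdata = metadata.write(mdata, ("properties", field_name),
                               "selected-by-default", True)
    return mdata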
Example #7
def generate_streams(conn, table_info):
    entries = []
    for schema_name in table_info.keys():
        for table in table_info[schema_name].keys():

            with conn.cursor() as cur:
                sql = f"""
SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.KEY_COLUMN_USAGE AS kcu
    INNER JOIN INFORMATION_SCHEMA.TABLE_CONSTRAINTS AS tc ON tc.CONSTRAINT_NAME = kcu.CONSTRAINT_NAME 
                                                                 AND tc.CONSTRAINT_TYPE = 'PRIMARY KEY'
WHERE kcu.TABLE_SCHEMA = '{schema_name}' AND kcu.TABLE_NAME = '{table}'"""
                cur.execute(sql)
                table_pks = [
                    col['COLUMN_NAME'] for col in convert_result_to_dict(cur)
                ]

                sql = """SELECT db_name()"""
                cur.execute(sql)
                database = cur.fetchone()[0]

            meta = {}
            columns = table_info[schema_name][table]['columns']

            metadata.write(meta, (), 'table-key-properties', table_pks)
            metadata.write(meta, (), 'schema-name', schema_name)
            metadata.write(meta, (), 'database-name', database)
            metadata.write(meta, (), 'row-count',
                           table_info[schema_name][table]['row_count'])
            metadata.write(meta, (), 'is-view',
                           table_info[schema_name][table]['is_view'])

            column_schemas = {
                col_name: schema_for_column(col_info, table_pks)
                for col_name, col_info in columns.items()
            }

            schema = Schema(type='object', properties=column_schemas)

            entry = CatalogEntry(table=table,
                                 stream=table,
                                 metadata=metadata.to_list(meta),
                                 tap_stream_id=get_tap_stream_id(
                                     database, schema_name, table),
                                 schema=schema)
            entries.append(entry)

    return Catalog(entries)
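
get_tap_stream_id is not shown in this snippet; a plausible, purely hypothetical reconstruction based on common singer SQL-tap conventions:

def get_tap_stream_id(database, schema_name, table):
    # Hypothetical helper: build a unique stream id from the fully
    # qualified table name.
    return f"{database}-{schema_name}-{table}"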
Example #8

def parse_args(required_config_keys):
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '-c', '--config',
        help='Config file',
        required=True)

    parser.add_argument(
        '-s', '--state',
        help='state file')

    parser.add_argument(
        '-p', '--properties',
        help='Property selections: DEPRECATED, Please use --catalog instead')

    parser.add_argument(
        '--catalog',
        help='Catalog file')

    parser.add_argument(
        '-d', '--discover',
        action='store_true',
        help='Do schema discovery')

    args = parser.parse_args()
    if args.config:
        setattr(args, 'config_path', args.config)
        args.config = utils.load_json(args.config)
    if args.state:
        setattr(args, 'state_path', args.state)
        args.state_file = args.state
        args.state = utils.load_json(args.state)
    else:
        args.state_file = None
        args.state = {}
    if args.properties:
        setattr(args, 'properties_path', args.properties)
        args.properties = utils.load_json(args.properties)
    if args.catalog:
        setattr(args, 'catalog_path', args.catalog)
        args.catalog = Catalog.load(args.catalog)

    utils.check_config(args.config, required_config_keys)

    return args
Example #9

def parse_args(required_config_keys):
    '''Parse standard command-line args.

    -c,--config     Config file
    -s,--state      State file
    -d,--discover   Run in discover mode
    -a,--select_all Select all streams and fields in discover mode
    --catalog       Catalog file

    Returns the parsed args object from argparse. Each argument that
    points to a JSON file (config, state) is loaded and parsed
    automatically; the catalog file is loaded into a Catalog object.
    '''
    parser = argparse.ArgumentParser()

    parser.add_argument('--config', '-c', help='Config file', required=True)
    parser.add_argument('--state', '-s', help='State file')
    parser.add_argument('--catalog', help='Catalog file')
    parser.add_argument('--discover',
                        '-d',
                        action='store_true',
                        help='Do schema discovery')
    parser.add_argument('--select_all',
                        '-a',
                        action='store_true',
                        help='Select all streams and fields in discover mode')

    args = parser.parse_args()
    if args.config:
        setattr(args, 'config_path', args.config)
        args.config = load_json(args.config)
    if args.state:
        setattr(args, 'state_path', args.state)
        args.state = load_json(args.state)
    else:
        args.state = {}
    if args.catalog:
        setattr(args, 'catalog_path', args.catalog)
        args.catalog = Catalog.load(args.catalog)
    if args.select_all and not args.discover:
        parser.error('Select all only available for discovery mode')

    check_config(args.config, required_config_keys)
    return args
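
Either parse_args variant is driven from the shell in the usual singer fashion; typical invocations (hypothetical tap name):

# Discovery, writing the catalog to a file:
#   tap-example --config config.json --discover > catalog.json
# Sync, resuming from saved state:
#   tap-example --config config.json --catalog catalog.json --state state.json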
Example #10
def generate_catalog(
    client,
    report_config,
    standard_fields,
    custom_fields,
    all_cubes,
    cubes_lookup,
    profile_ids,
):
    """
    Generate a catalog entry for each report specified in `report_config`
    """
    catalog_entries = []

    for report in report_config:
        selected_by_default = {
            *report['metrics'][:10], *report.get('dimensions', [])
        }
        schema, mdata = generate_catalog_entry(client, standard_fields,
                                               custom_fields, all_cubes,
                                               cubes_lookup, profile_ids)

        mdata = reduce(
            lambda mdata, field_name: metadata.write(mdata, (
                "properties", field_name), "selected-by-default", True),
            selected_by_default, mdata)

        catalog_entries.append(
            CatalogEntry(
                schema=Schema.from_dict(schema),
                key_properties=['_sdc_record_hash'],
                stream=report['name'],
                tap_stream_id=report['name'],
                metadata=metadata.to_list(mdata),
            ))
    return Catalog(catalog_entries)
Example #11
def dump_catalog(catalog: Catalog):
    catalog.dump()
Example #12
def discover(conn, config):

    with connect_with_backoff(conn) as open_conn:
        with open_conn.cursor() as cur:
            cur.execute("""
            SELECT TABLE_CATALOG, TABLE_SCHEMA, TABLE_NAME, TABLE_TYPE From INFORMATION_SCHEMA.TABLES
            """)

            table_info = {}

            tables = cur.fetchall()
            for (db, schema, table, table_type) in tables:
                if db not in table_info:
                    table_info[db] = {}
                if schema not in table_info[db]:
                    table_info[db][schema] = {}

                table_info[db][schema][table] = {
                    # 'row_count': rows,
                    'is_view': table_type == 'VIEW'
                }

            cur.execute("""
            SELECT
       C.TABLE_SCHEMA, C.TABLE_NAME, C.COLUMN_NAME, C.DATA_TYPE, C.CHARACTER_MAXIMUM_LENGTH, C.NUMERIC_PRECISION,
       C.NUMERIC_PRECISION, TC.CONSTRAINT_TYPE
FROM INFORMATION_SCHEMA.COLUMNS C
    LEFT JOIN INFORMATION_SCHEMA.CONSTRAINT_COLUMN_USAGE CCU On C.COLUMN_NAME = CCU.COLUMN_NAME
    LEFT JOIN INFORMATION_SCHEMA.TABLE_CONSTRAINTS TC ON CCU.CONSTRAINT_NAME = Tc.CONSTRAINT_NAME
ORDER BY C.TABLE_SCHEMA, C.TABLE_NAME
            """)
            # res = cur.fetchall()

            columns = []
            rec = cur.fetchone()
            while rec is not None:
                columns.append(Column(*rec))
                rec = cur.fetchone()

            entries = []
            for (k, cols) in itertools.groupby(
                    columns, lambda c: (c.table_schema, c.table_name)):
                cols = list(cols)
                (table_schema, table_name) = k
                schema = Schema(type='object',
                                properties={
                                    c.column_name: schema_for_column(c)
                                    for c in cols
                                })
                md = create_column_metadata(cols)
                md_map = metadata.to_map(md)

                md_map = metadata.write(md_map, (), 'database-name',
                                        table_schema)

                is_view = table_info[db][table_schema][table_name]['is_view']

                # table_info is keyed db -> schema -> table, so the lookup
                # must go through the current database first
                if (table_schema in table_info.get(db, {})
                        and table_name in table_info[db][table_schema]):
                    row_count = table_info[db][table_schema][table_name].get(
                        'row_count')

                    if row_count is not None:
                        md_map = metadata.write(md_map, (), 'row-count',
                                                row_count)

                    md_map = metadata.write(md_map, (), 'is-view', is_view)

                def column_is_key_prop(c, s):
                    # INFORMATION_SCHEMA.TABLE_CONSTRAINTS reports the type as
                    # 'PRIMARY KEY' ('PRI' is a MySQL-only shorthand)
                    return (c.constraint_type == 'PRIMARY KEY' and
                            s.properties[c.column_name].inclusion !=
                            'unsupported')

                key_properties = [
                    c.column_name for c in cols
                    if column_is_key_prop(c, schema)
                ]

                if not is_view:
                    md_map = metadata.write(md_map, (), 'table-key-properties',
                                            key_properties)

                entry = CatalogEntry(
                    table=table_name,
                    stream=table_name,
                    metadata=metadata.to_list(md_map),
                    tap_stream_id=common.generate_tap_stream_id(
                        table_schema, table_name),
                    schema=schema)

                entries.append(entry)

        return Catalog(entries)

Example #13

def discover():
    raw_schemas = load_schemas()
    streams = []

    for schema_name, schema in raw_schemas.items():
        # TODO: populate any metadata and stream's key properties here..
        stream_metadata = []
        stream_key_properties = []

        # create and add catalog entry
        catalog_entry = {
            'stream': schema_name,
            'tap_stream_id': schema_name,
            'schema': schema,
            'metadata': stream_metadata,
            'key_properties': stream_key_properties
        }
        streams.append(catalog_entry)

    return {'streams': streams}
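
This raw dict has the same {'streams': [...]} shape that Catalog.from_dict consumes in Example #3, so the two discovery styles are interchangeable; a minimal sketch:

from singer.catalog import Catalog

# Wrap the raw streams dict in a Catalog when the typed API is preferred
# over emitting plain JSON.
catalog = Catalog.from_dict(discover())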
Example #14

class TestValidateDependencies(unittest.TestCase):
    catalog = Catalog([
        CatalogEntry(tap_stream_id='boards',
                     schema=Schema(),
                     metadata=[{
                         'metadata': {
                             'selected': False
                         },
                         'breadcrumb': []
                     }]),
        CatalogEntry(tap_stream_id='issue_board',
                     schema=Schema(),
                     metadata=[{
                         'metadata': {
                             'selected': True
                         },
                         'breadcrumb': []
                     }]),
        CatalogEntry(tap_stream_id='project_board',
                     schema=Schema(),
                     metadata=[{
                         'metadata': {
                             'selected': False
                         },
                         'breadcrumb': []
                     }]),
        CatalogEntry(tap_stream_id='epics',
                     schema=Schema(),
                     metadata=[{
                         'metadata': {
                             'selected': False
                         },
                         'breadcrumb': []
                     }]),
        CatalogEntry(tap_stream_id='sprints',
                     schema=Schema(),
                     metadata=[{
                         'metadata': {
                             'selected': True
                         },
                         'breadcrumb': []
                     }]),
        CatalogEntry(tap_stream_id='issue_comments',
                     schema=Schema(),
                     metadata=[{
                         'metadata': {
                             'selected': True
                         },
                         'breadcrumb': []
                     }]),
    ])

    def test_is_selected(self):
        selected = utils.is_selected(streams.IssueBoard, self.catalog)
        self.assertTrue(selected)

    def test_raises_substream_error(self):
        test_streams = {'boards': streams.STREAMS['boards']}
        # test recursive checking
        test_streams['boards']['substreams']['issues'] = streams.STREAMS[
            'issues']
        self.assertRaises(utils.DependencyException,
                          utils.validate_dependencies, test_streams,
                          self.catalog)

    def test_raises_right_amount_of_substream_errors(self):
        test_streams = {'boards': streams.STREAMS['boards']}
        # test recursive checking
        test_streams['boards']['substreams']['issues'] = streams.STREAMS[
            'issues']
        with self.assertRaises(utils.DependencyException) as context:
            utils.validate_dependencies(test_streams, self.catalog)
        self.assertEqual(len(context.exception.errors), 3)