Example #1
def persist_lines_stream(config, lines=None, validate_records=True):
    state = None
    schemas = {}
    key_properties = {}
    tables = {}
    rows = {}
    errors = {}

    # bigquery_client = bigquery.Client(project=project_id)
    service = service_account.Credentials.from_service_account_file(
        config['key_file_location'])

    bigquery_client = bigquery.Client(project=config['project_id'],
                                      credentials=service)

    dataset_ref = bigquery_client.dataset(config['dataset_id'])
    dataset = Dataset(dataset_ref)
    try:
        dataset = bigquery_client.create_dataset(
            Dataset(dataset_ref)) or Dataset(dataset_ref)
    except exceptions.Conflict:
        pass

    for line in lines:
        try:
            js = json.loads(line)
            # msg = singer.parse_message(line)
            if js['type'] == 'RECORD':
                msg = singer.messages.RecordMessage(stream=js.get('stream'),
                                                    record=js.get('record'),
                                                    version=js.get('version'),
                                                    time_extracted=None)
            else:
                msg = singer.parse_message(line)
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(line))
            raise

        if isinstance(msg, singer.RecordMessage):
            if msg.stream not in schemas:
                raise Exception(
                    "A record for stream {} was encountered before a corresponding schema"
                    .format(msg.stream))

            schema = schemas[msg.stream]

            if validate_records:
                validate(msg.record, schema)

            errors[msg.stream] = bigquery_client.insert_rows_json(
                tables[msg.stream], [msg.record])
            rows[msg.stream] += 1

        elif isinstance(msg, singer.StateMessage):
            logger.debug('Setting state to {}'.format(msg.value))
            state = msg.value

        elif isinstance(msg, singer.SchemaMessage):
            table = msg.stream
            schemas[table] = msg.schema
            key_properties[table] = msg.key_properties
            tables[table] = bigquery.Table(dataset.table(table),
                                           schema=build_schema(schemas[table]))
            rows[table] = 0
            errors[table] = None
            try:
                tables[table] = bigquery_client.create_table(tables[table])
            except exceptions.Conflict:
                pass

        elif isinstance(msg, singer.ActivateVersionMessage):
            # This is experimental and won't be used yet
            pass

        else:
            raise Exception("Unrecognized message {}".format(msg))

    for table in errors.keys():
        if not errors[table]:
            logger.info('Loaded {} row(s) into {}:{} ({})'.format(
                rows[table], config['dataset_id'], table, tables[table].path))
            emit_state(state)
        else:
            logger.error('Errors: %s', errors[table])

    return state
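For context, a minimal driver sketch (not part of the original project) showing how a persist function like this is typically wired into a Singer target: read the config path from the command line, stream the tap's JSONL messages from stdin, and emit the final state. The persist_lines_stream and emit_state names come from the example above; everything else is assumed.

import argparse
import io
import json
import sys


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', required=True,
                        help='path to the target config JSON')
    args = parser.parse_args()

    with open(args.config) as config_file:
        config = json.load(config_file)

    # Singer taps write one JSON message per line; the target reads them from stdin.
    tap_input = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
    state = persist_lines_stream(config, tap_input)
    emit_state(state)


if __name__ == '__main__':
    main()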
Example #2
 def test_parse_message_record_missing_record(self):
     with self.assertRaises(Exception):
         singer.parse_message('{"type": "RECORD", "stream": "users"}')
Example #3
 def test_parse_message_schema_missing_stream(self):
     with self.assertRaises(Exception):
         message = singer.parse_message('{"type": "SCHEMA", "schema": {"type": "object", "properties": {"name": {"type": "string"}}}, "key_properties": ["name"]}')  # nopep8
Example #4
def persist_lines_stream(project_id,
                         dataset_id,
                         lines=None,
                         validate_records=True):
    state = None
    schemas = {}
    key_properties = {}
    tables = {}
    rows = {}
    errors = {}

    bigquery_client = bigquery.Client(project=project_id)

    dataset_ref = bigquery_client.dataset(dataset_id)
    dataset = Dataset(dataset_ref)
    try:
        dataset = bigquery_client.create_dataset(
            Dataset(dataset_ref)) or Dataset(dataset_ref)
    except exceptions.Conflict:
        pass

    for line in lines:
        try:
            msg = singer.parse_message(line)
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(line))
            raise

        if isinstance(msg, singer.RecordMessage):
            if msg.stream not in schemas:
                raise Exception(
                    "A record for stream {} was encountered before a corresponding schema"
                    .format(msg.stream))

            schema = schemas[msg.stream]

            if validate_records:
                validate(msg.record, schema)

            err = None
            try:
                err = bigquery_client.insert_rows_json(tables[msg.stream],
                                                       [msg.record])
            except Exception as exc:
                logger.error(
                    f"failed to insert rows for {tables[msg.stream]}: {str(exc)}\n{msg.record}"
                )
                raise

            errors[msg.stream] = err
            rows[msg.stream] += 1

            state = None

        elif isinstance(msg, singer.StateMessage):
            logger.debug("Setting state to {}".format(msg.value))
            state = msg.value

        elif isinstance(msg, singer.SchemaMessage):
            table = msg.stream
            schemas[table] = msg.schema
            key_properties[table] = msg.key_properties
            tables[table] = bigquery.Table(dataset.table(table),
                                           schema=build_schema(schemas[table]))
            rows[table] = 0
            errors[table] = None
            try:
                tables[table] = bigquery_client.create_table(tables[table])
            except exceptions.Conflict:
                pass

        elif isinstance(msg, singer.ActivateVersionMessage):
            # This is experimental and won't be used yet
            pass

        else:
            raise Exception("Unrecognized message {}".format(msg))

    for table in errors.keys():
        if not errors[table]:
            logging.info("Loaded {} row(s) from {} into {}:{}".format(
                rows[table], dataset_id, table, tables[table].path))
            emit_state(state)
        else:
            logging.error("Errors: %s", errors[table])

    return state
Example #5
def persist_messages(delimiter, quotechar, messages, destination_path,
                     without_headers):
    state = None
    schemas = {}
    key_properties = {}
    headers = {}
    validators = {}

    now = datetime.now().strftime('%Y%m%dT%H%M%S')

    for message in messages:
        try:
            o = singer.parse_message(message).asdict()
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(message))
            raise
        message_type = o['type']
        if message_type == 'RECORD':
            if o['stream'] not in schemas:
                raise Exception(
                    "A record for stream {}"
                    "was encountered before a corresponding schema".format(
                        o['stream']))

            validators[o['stream']].validate(o['record'])

            filename = o['stream'] + '-' + now + '.csv'
            filename = os.path.expanduser(
                os.path.join(destination_path, filename))
            file_is_empty = (
                not os.path.isfile(filename)) or os.stat(filename).st_size == 0

            flattened_record = flatten(o['record'])

            if o['stream'] not in headers and not file_is_empty:
                with open(filename, 'r') as csvfile:
                    reader = csv.reader(csvfile,
                                        delimiter=delimiter,
                                        quotechar=quotechar)
                    first_line = next(reader)
                    headers[o[
                        'stream']] = first_line if first_line else flattened_record.keys(
                        )
            else:
                headers[o['stream']] = flattened_record.keys()

            with open(filename, 'a') as csvfile:
                writer = csv.DictWriter(csvfile,
                                        headers[o['stream']],
                                        extrasaction='ignore',
                                        delimiter=delimiter,
                                        quotechar=quotechar)
                if file_is_empty and not without_headers:
                    writer.writeheader()

                writer.writerow(flattened_record)

            state = None
        elif message_type == 'STATE':
            logger.debug('Setting state to {}'.format(o['value']))
            state = o['value']
        elif message_type == 'SCHEMA':
            stream = o['stream']
            schemas[stream] = o['schema']
            validators[stream] = Draft4Validator(o['schema'])
            key_properties[stream] = o['key_properties']
        else:
            logger.warning("Unknown message type {} in message {}".format(
                o['type'], o))

    return state
Example #6
def persist_lines_stream(
    project_id,
    dataset_id,
    lines=None,
    validate_records=True,
    key_path=None,
):
    state = None
    schemas = {}
    key_properties = {}
    tables = {}
    rows = {}
    errors = {}

    if key_path:
        credentials = service_account.Credentials.from_service_account_file(
            key_path,
            scopes=['https://www.googleapis.com/auth/cloud-platform'],
        )
        bigquery_client = bigquery.Client(credentials=credentials,
                                          project=project_id)
    else:
        bigquery_client = bigquery.Client(project=project_id)

    dataset_ref = bigquery_client.dataset(dataset_id)
    dataset = Dataset(dataset_ref)
    try:
        dataset = bigquery_client.create_dataset(
            Dataset(dataset_ref)) or Dataset(dataset_ref)
    except exceptions.Conflict:
        pass

    for line in lines:
        try:
            msg = singer.parse_message(line)
        except json.decoder.JSONDecodeError:
            logger.error('Unable to parse:\n{}'.format(line))
            raise

        if isinstance(msg, singer.RecordMessage):
            if msg.stream not in schemas:
                raise Exception(
                    'A record for stream {} was encountered before a corresponding schema'
                    .format(msg.stream))

            schema = schemas[msg.stream]

            if validate_records:
                validate(msg.record, schema)

            errors[msg.stream] = bigquery_client.insert_rows_json(
                tables[msg.stream], [msg.record])
            rows[msg.stream] += 1

            state = None

        elif isinstance(msg, singer.StateMessage):
            logger.debug('Setting state to {}'.format(msg.value))
            state = msg.value

        elif isinstance(msg, singer.SchemaMessage):
            table = msg.stream
            schemas[table] = msg.schema
            key_properties[table] = msg.key_properties
            tables[table] = bigquery.Table(dataset.table(table),
                                           schema=build_schema(schemas[table]))
            rows[table] = 0
            errors[table] = None
            try:
                tables[table] = bigquery_client.create_table(tables[table])
            except exceptions.Conflict:
                pass

        elif isinstance(msg, singer.ActivateVersionMessage):
            # This is experimental and won't be used yet
            pass

        else:
            raise Exception('Unrecognized message {}'.format(msg))

    for table in errors.keys():
        if not errors[table]:
            logger.info('Loaded {} row(s) into {}:{} ({})'.format(
                rows[table], dataset_id, table, tables[table].path))
            emit_state(state)
        else:
            logger.error(errors[table])

    return state
Example #7
 def test_parse_message_state_good(self):
     message = singer.parse_message(
         '{"type": "STATE", "value": {"seq": 1}}')
     self.assertEqual(message, singer.StateMessage(value={'seq': 1}))
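Several of the persist_* examples call an emit_state helper that is never shown. By Singer convention a target writes the most recent STATE value to stdout as a single JSON line so the runner can checkpoint it; a minimal sketch of that assumption:

import json
import sys


def emit_state(state):
    # Write the latest state to stdout so the orchestrator can persist it.
    if state is None:
        return
    sys.stdout.write(json.dumps(state) + '\n')
    sys.stdout.flush()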
Example #8
def persist_lines_job(project_id, dataset_id, lines=None):
    state = None
    schemas = {}
    key_properties = {}
    tables = {}
    rows = {}
    errors = {}

    bigquery_client = bigquery.Client(project=project_id)

    # try:
    #     dataset = bigquery_client.create_dataset(Dataset(dataset_ref)) or Dataset(dataset_ref)
    # except exceptions.Conflict:
    #     pass

    for line in lines:
        try:
            msg = singer.parse_message(line)
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(line))
            raise

        if isinstance(msg, singer.RecordMessage):
            if msg.stream not in schemas:
                raise Exception("A record for stream {} was encountered before a corresponding schema".format(msg.stream))

            schema = schemas[msg.stream]

            validate(msg.record, schema)

            # The load job below uses NEWLINE_DELIMITED_JSON, so each row must be
            # valid JSON; str() on a dict emits Python repr (single quotes), which
            # BigQuery rejects, so serialize with json.dumps (as in Example #17).
            dat = bytes(
                json.dumps(json.loads(json.dumps(msg.record),
                                      object_pairs_hook=clear_dict_hook)) + '\n',
                'UTF-8')

            rows[msg.stream].write(dat)

            state = None

        elif isinstance(msg, singer.StateMessage):
            logger.debug('Setting state to {}'.format(msg.value))
            state = msg.value

        elif isinstance(msg, singer.SchemaMessage):
            table = msg.stream 
            schemas[table] = msg.schema
            key_properties[table] = msg.key_properties
            #tables[table] = bigquery.Table(dataset.table(table), schema=build_schema(schemas[table]))
            rows[table] = TemporaryFile(mode='w+b')
            errors[table] = None
            # try:
            #     tables[table] = bigquery_client.create_table(tables[table])
            # except exceptions.Conflict:
            #     pass
        else:
            raise Exception("Unrecognized message {}".format(msg))

    for table in rows.keys():
        table_ref = bigquery_client.dataset(dataset_id).table(table)
        SCHEMA = build_schema(schemas[table])
        load_config = LoadJobConfig()
        load_config.schema = SCHEMA
        load_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON
        rows[table].seek(0)
        logger.info("loading {} to Bigquery.\n".format(table))
        load_job = bigquery_client.load_table_from_file(
            rows[table], table_ref, job_config=load_config)
        logger.info("loading job {}".format(load_job.job_id))
        logger.info(load_job.result())


    # for table in errors.keys():
    #     if not errors[table]:
    #         print('Loaded {} row(s) into {}:{}'.format(rows[table], dataset_id, table), tables[table].path)
    #     else:
    #         print('Errors:', errors[table], sep=" ")

    return state
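The BigQuery examples above and below omit their module-level imports. A sketch of what they appear to assume (exact names can vary across google-cloud-bigquery releases):

import json
from tempfile import TemporaryFile

import singer
from google.api_core import exceptions
from google.cloud import bigquery
from google.cloud.bigquery import Dataset, LoadJobConfig, SourceFormat, WriteDisposition
from google.oauth2 import service_account
from jsonschema import validate

logger = singer.get_logger()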
Example #9
def persist_lines_stream(project_id, dataset_id, lines=None):
    state = None
    schemas = {}
    key_properties = {}
    tables = {}
    rows = {}
    errors = {}

    bigquery_client = bigquery.Client(project=project_id)

    dataset_ref = bigquery_client.dataset(dataset_id)
    dataset = Dataset(dataset_ref)
    try:
        dataset = bigquery_client.create_dataset(Dataset(dataset_ref)) or Dataset(dataset_ref)
    except exceptions.Conflict:
        pass

    for line in lines:
        try:
            msg = singer.parse_message(line)
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(line))
            raise

        if isinstance(msg, singer.RecordMessage):
            if msg.stream not in schemas:
                raise Exception("A record for stream {} was encountered before a corresponding schema".format(msg.stream))

            schema = schemas[msg.stream]

            validate(msg.record, schema)

            # create_rows() comes from older google-cloud-bigquery releases;
            # insert_rows_json() is the current equivalent for dict payloads.
            errors[msg.stream] = bigquery_client.insert_rows_json(tables[msg.stream], [msg.record])
            rows[msg.stream] += 1

            state = None

        elif isinstance(msg, singer.StateMessage):
            logger.debug('Setting state to {}'.format(msg.value))
            state = msg.value

        elif isinstance(msg, singer.SchemaMessage):
            table = msg.stream 
            schemas[table] = msg.schema
            key_properties[table] = msg.key_properties
            tables[table] = bigquery.Table(dataset.table(table), schema=build_schema(schemas[table]))
            rows[table] = 0
            errors[table] = None
            try:
                tables[table] = bigquery_client.create_table(tables[table])
            except exceptions.Conflict:
                pass
        else:
            raise Exception("Unrecognized message {}".format(msg))

    for table in errors.keys():
        if not errors[table]:
            print('Loaded {} row(s) into {}:{}'.format(rows[table], dataset_id, table), tables[table].path)
        else:
            print('Errors:', errors[table], sep=" ")

    return state
Example #10
def write_records(
    project_id,
    dataset_name,
    lines=None,
    stream=False,
    on_invalid_record="abort",
    partition_by=None,
    partition_type="day",
    partition_exp_ms=None,
    table_prefix="",
    table_ext="",
    load_config_properties=None,
    numeric_type="NUMERIC",
    max_warnings=20,
):
    if on_invalid_record not in ("abort", "skip", "force"):
        raise ValueError("on_invalid_record must be one of" +
                         " (abort, skip, force)")

    state = None
    schemas = {}
    bq_schemas = {}
    tables = {}
    key_properties = {}
    table_files = {}
    row_count = {}
    invalids = {}
    errors = {}

    client = bigquery.Client(project=project_id)

    dataset = get_or_create_dataset(client, project_id, dataset_name)

    count = 0
    for line in lines:
        try:
            message = singer.parse_message(line)
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(line))
            raise

        if isinstance(message, singer.RecordMessage):
            if stream:
                json_dumps = False
            else:
                json_dumps = True
            record, validation = clean_and_validate(
                message,
                schemas,
                json_dumps,
            )

            if not validation["is_valid"]:
                invalids[message.stream] += 1
                instance = validation["instance"]
                type_ = validation["type"]
                invalid_record_str = json.dumps(validation["record"])
                invalid_message = validation["message"]
                if invalids[message.stream] <= max_warnings:
                    logger.warn(
                        f"Invalid record found and the process will {on_invalid_record}. "
                        f"[{instance}] :: {type_} :: {invalid_record_str} :: {message}"
                    )
                if invalids[message.stream] == max_warnings:
                    logger.warn("Max validation warning reached. "
                                "Further validation warnings are suppressed.")

                if on_invalid_record == "abort":
                    raise Exception(
                        "Validation required and failed. Aborting.")

            if validation["is_valid"] or on_invalid_record == "force":
                # https://cloud.google.com/bigquery/streaming-data-into-bigquery
                if stream:
                    errors[message.stream] = client.insert_rows(
                        tables[message.stream], [record])
                else:
                    table_files[message.stream].write(record)
                row_count[message.stream] += 1

            state = None

        elif isinstance(message, singer.StateMessage):
            state = message.value
            # State may contain sensitive info. Not logging in production
            logger.debug("State: %s" % state)
            currently_syncing = state.get("currently_syncing")
            bookmarks = state.get("bookmarks")
            if currently_syncing and bookmarks:
                logger.info(
                    f"State: currently_syncing {currently_syncing} - bookmark: {bookmarks.get(currently_syncing)}"
                )

        elif isinstance(message, singer.SchemaMessage):
            table_name = message.stream

            if schemas.get(table_name):
                # Redundant schema rows
                continue

            schemas[table_name] = message.schema
            bq_schema = parse_schema(schemas[table_name], numeric_type)
            bq_schemas[table_name] = bq_schema

            tables[table_name] = get_or_create_table(
                client,
                project_id,
                dataset_name,
                f"{table_prefix}{table_name}{table_ext}",
                bq_schema,
                partition_by,
                partition_type,
                partition_exp_ms,
            )

            if stream:
                # Ensure the table is created before streaming...
                time.sleep(3)

            if not stream:
                table_files[table_name] = TemporaryFile(mode='w+b')

            key_properties[table_name] = message.key_properties
            row_count[table_name] = 0
            invalids[table_name] = 0
            errors[table_name] = None

        elif isinstance(message, singer.ActivateVersionMessage):
            # This is experimental and won't be used yet
            pass

        else:
            raise Exception("Unrecognized message {}".format(message))

        count = count + 1

    # We already wrote the data in the streaming mode
    if stream:
        for table_name in errors.keys():
            if not errors[table_name]:
                logger.info("Streamed {} row(s) into {}.{}.{}".format(
                    row_count[table_name], project_id, dataset_name,
                    table_name))
            else:
                logger.warn("Errors:", errors[table_name], sep=" ")
        return state

    # For batch job mode only
    for table_name in table_files.keys():
        if invalids[table_name] > 0:
            if on_invalid_record == "skip":
                logger.warn(
                    f"Persisting {table_name} stream by skipping the invalid records."
                )
            elif on_invalid_record == "force":
                logger.warn(
                    f"Persisting {table_name} stream by replacing invalids with null."
                )

        bq_schema = bq_schemas[table_name]

        # We should already have get-or-created:
        table = tables[table_name]

        load_config_props = {
            "schema": bq_schema,
            "source_format": SourceFormat.NEWLINE_DELIMITED_JSON
        }
        if load_config_properties:
            load_config_props.update(load_config_properties)
        load_config = LoadJobConfig(**load_config_props)

        if row_count[table_name] == 0:
            logger.info(f"Zero records for {table}. Skip loading.")
            continue
        logger.info(f"Batch loading {table} to Bigquery")
        table_files[table_name].seek(0)
        table_id = f"{project_id}.{dataset_name}.{table_prefix}{table_name}{table_ext}"
        try:
            load_job = client.load_table_from_file(table_files[table_name],
                                                   table_id,
                                                   job_config=load_config)
        except exceptions.BadRequest:
            logger.error("Error loading records for table " + table_name)
            logger.error(bq_schema)
            table_files[table_name].seek(0)
            logger.debug(table_files[table_name].read())
            raise
        logger.info("Batch loading job {}".format(load_job.job_id))
        try:
            logger.debug(load_job.result())
        except Exception as e:
            logger.critical(load_job.errors)
            raise

    for key, value in row_count.items():
        row_uploads = {
            "type": "counter",
            "metric": "row_uploads",
            "value": value,
            "tags": {
                "endpoint": key
            },
        }
        logger.info(f"{json.dumps(row_uploads)}")
    for key, value in invalids.items():
        invalid_rows = {
            "type": "counter",
            "metric": "invalid_records",
            "value": value,
            "tags": {
                "endpoint": key
            },
        }
        logger.info(f"{json.dumps(invalid_rows)}")

    return state
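write_records delegates to helpers such as get_or_create_dataset and get_or_create_table that are not shown here. As an illustration only, a plausible get_or_create_dataset that mirrors the create_dataset/Conflict pattern used in the streaming examples:

from google.api_core import exceptions
from google.cloud import bigquery


def get_or_create_dataset(client, project_id, dataset_name):
    # Create the dataset if it does not exist yet; otherwise fetch the existing one.
    dataset_id = '{}.{}'.format(project_id, dataset_name)
    try:
        return client.create_dataset(bigquery.Dataset(dataset_id))
    except exceptions.Conflict:
        return client.get_dataset(dataset_id)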
Example #11
def persist_messages(messages, config, s3_client):
    state = None
    schemas = {}
    key_properties = {}
    headers = {}
    validators = {}

    delimiter = config.get("delimiter", ",")
    quotechar = config.get("quotechar", '"')

    # Use the system specific temp directory if no custom temp_dir provided
    temp_dir = os.path.expanduser(config.get("temp_dir",
                                             tempfile.gettempdir()))

    # Create temp_dir if not exists
    if temp_dir:
        os.makedirs(temp_dir, exist_ok=True)

    filenames = []
    now = datetime.now().strftime("%Y%m%dT%H%M%S")

    for message in messages:
        try:
            o = singer.parse_message(message).asdict()
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(message))
            raise
        message_type = o["type"]
        if message_type == "RECORD":
            if o["stream"] not in schemas:
                raise Exception(
                    "A record for stream {}"
                    "was encountered before a corresponding schema".format(
                        o["stream"]))

            # Validate record
            try:
                validators[o["stream"]].validate(
                    utils.float_to_decimal(o["record"]))
            except Exception as ex:
                if type(ex).__name__ == "InvalidOperation":
                    logger.error(
                        "Data validation failed and cannot load to destination. RECORD: {}\n"
                        "'multipleOf' validations that allows long precisions are not supported"
                        " (i.e. with 15 digits or more). Try removing 'multipleOf' methods from JSON schema."
                        .format(o["record"]))
                    raise ex

            record_to_load = o["record"]
            if config.get("add_metadata_columns"):
                record_to_load = utils.add_metadata_values_to_record(o, {})
            else:
                record_to_load = utils.remove_metadata_values_from_record(o)

            filename = o["stream"] + "-" + now + ".csv"
            filename = os.path.expanduser(os.path.join(temp_dir, filename))
            target_key = utils.get_target_key(
                o,
                prefix=config.get("s3_key_prefix", ""),
                timestamp=now,
                naming_convention=config.get("naming_convention"),
            )
            if not (filename, target_key) in filenames:
                filenames.append((filename, target_key))

            file_is_empty = (
                not os.path.isfile(filename)) or os.stat(filename).st_size == 0

            flattened_record = utils.flatten_record(record_to_load)

            if o["stream"] not in headers and not file_is_empty:
                with open(filename, "r") as csvfile:
                    reader = csv.reader(csvfile,
                                        delimiter=delimiter,
                                        quotechar=quotechar)
                    first_line = next(reader)
                    headers[o["stream"]] = (first_line if first_line else
                                            flattened_record.keys())
            else:
                headers[o["stream"]] = flattened_record.keys()

            with open(filename, "a") as csvfile:
                if file_is_empty:
                    header = (",".join([
                        json.dumps(
                            v, ensure_ascii=False, default=decimal_default)
                        for v in headers[o["stream"]]
                    ]) + "\n")
                    # header = header.encode('UTF-8')
                    csvfile.write(header)

                row = (",".join([
                    json.dumps(
                        flattened_record[k],
                        ensure_ascii=False,
                        default=decimal_default,
                    ) for k in headers[o["stream"]]
                ]) + "\n")
                # row = row.encode('UTF-8')

                csvfile.write(row)

            state = None
        elif message_type == "STATE":
            logger.debug("Setting state to {}".format(o["value"]))
            state = o["value"]
        elif message_type == "SCHEMA":
            stream = o["stream"]
            schemas[stream] = o["schema"]
            if config.get("add_metadata_columns"):
                schemas[stream] = utils.add_metadata_columns_to_schema(o)

            schema = utils.float_to_decimal(o["schema"])
            validators[stream] = Draft7Validator(
                schema, format_checker=FormatChecker())
            key_properties[stream] = o["key_properties"]
        elif message_type == "ACTIVATE_VERSION":
            logger.debug("ACTIVATE_VERSION message")
        else:
            logger.warning("Unknown message type {} in message {}".format(
                o["type"], o))

    # Upload created CSV files to S3
    targets = []
    for filename, target_key in filenames:
        compressed_file = None
        if config.get("compression") is None or config["compression"].lower(
        ) == "none":
            pass  # no compression
        else:
            if config["compression"] == "gzip":
                compressed_file = f"{filename}.gz"
                try:
                    with open(filename, "rb") as f_in:
                        with gzip.open(compressed_file, "wb") as f_out:
                            logger.info(
                                f"Compressing file as '{compressed_file}'")
                            shutil.copyfileobj(f_in, f_out)
                except FileNotFoundError:
                    logger.error(
                        "Could not find file (most likely already in AWS): {}".
                        format(filename))

            else:
                raise NotImplementedError(
                    "Compression type '{}' is not supported. "
                    "Expected: 'none' or 'gzip'".format(config["compression"]))
        s3.upload_file(
            compressed_file or filename,
            s3_client,
            config.get("s3_bucket"),
            target_key,
            encryption_type=config.get("encryption_type"),
            encryption_key=config.get("encryption_key"),
        )

        # Remove the local file(s)
        os.remove(filename)
        if compressed_file:
            os.remove(compressed_file)

        targets.append(target_key)

    return state, targets
Example #12
def persist_messages(messages, config, s3_client, do_timestamp_file=True):
    logger.info('persist_messages')
    state = None
    schemas = {}
    key_properties = {}
    validators = {}

    filenames = []
    filename = None
    timestamp_file_part = '-' + datetime.now().strftime(
        '%Y%m%dT%H%M%S') if do_timestamp_file else ''
    max_file_size_mb = config.get('max_temp_file_size_mb', 50)
    stream = None

    if config.get('record_unique_field'):
        a = set()
        write_temp_pickle()

    for message in messages:
        try:
            o = singer.parse_message(message).asdict()
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(message))
            raise
        message_type = o['type']

        if message_type == 'RECORD':
            if o['stream'] not in schemas:
                raise Exception(
                    "A record for stream {}"
                    "was encountered before a corresponding schema".format(
                        o['stream']))

            # Validate record
            try:
                validators[o['stream']].validate(
                    utils.float_to_decimal(o['record']))
            except Exception as ex:
                if type(ex).__name__ == "InvalidOperation":
                    logger.error(
                        """Data validation failed and cannot load to destination. RECORD: {}\n
                    'multipleOf' validations that allows long precisions are not supported 
                    (i.e. with 15 digits or more). Try removing 'multipleOf' methods from JSON schema.
                    """.format(o['record']))
                    raise ex

            record_to_load = o['record']
            if config.get('add_metadata_columns'):
                record_to_load = utils.add_metadata_values_to_record(o, {})
            else:
                record_to_load = utils.remove_metadata_values_from_record(o)

            flattened_record = utils.flatten(record_to_load)
            filename = o['stream'] + timestamp_file_part + '.jsonl'
            filename = os.path.join(tempfile.gettempdir(), filename)
            filename = os.path.expanduser(filename)

            if not (filename, o['stream']) in filenames:
                filenames.append((filename, o['stream']))

            with open(filename, 'a') as f:
                f.write(json.dumps(flattened_record, cls=DecimalEncoder))
                f.write('\n')

            file_size = os.path.getsize(filename) if os.path.isfile(
                filename) else 0
            if file_size >> 20 > max_file_size_mb:
                logger.info('file_size: {} MB, filename: {}'.format(
                    round(file_size >> 20, 2), filename))
                upload_to_s3(s3_client, config.get("s3_bucket"),
                             os.environ["TARGET_S3_SOURCE_NAME"], filename,
                             o['stream'],
                             config.get('field_to_partition_by_time'),
                             config.get('record_unique_field'),
                             config.get("compression"),
                             config.get('encryption_type'),
                             config.get('encryption_key'))
                filenames.remove((filename, o['stream']))
            state = None
        elif message_type == 'STATE':
            logger.info('Setting state to {}'.format(o['value']))
            state = o['value']
        elif message_type == 'SCHEMA':
            stream = o['stream']
            schemas[stream] = o['schema']
            if config.get('add_metadata_columns'):
                schemas[stream] = utils.add_metadata_columns_to_schema(o)

            schema = utils.float_to_decimal(o['schema'])
            validators[stream] = Draft4Validator(
                schema, format_checker=FormatChecker())
            key_properties[stream] = o['key_properties']
        elif message_type == 'ACTIVATE_VERSION':
            logger.debug('ACTIVATE_VERSION message')
        else:
            logger.warning("Unknown message type {} in message {}".format(
                o['type'], o))

    # Upload the remaining JSONL files to S3
    for filename, stream in filenames:
        upload_to_s3(s3_client, config.get("s3_bucket"),
                     os.environ["TARGET_S3_SOURCE_NAME"], filename, stream,
                     config.get('field_to_partition_by_time'),
                     config.get('record_unique_field'),
                     config.get("compression"), config.get('encryption_type'),
                     config.get('encryption_key'))

    return state
Example #13
def persist_messages(messages, config, s3_client):
    state = None
    schemas = {}
    key_properties = {}
    headers = {}
    validators = {}

    delimiter = config.get('delimiter', ',')
    quotechar = config.get('quotechar', '"')

    # Use the system specific temp directory if no custom temp_dir provided
    temp_dir = os.path.expanduser(config.get('temp_dir',
                                             tempfile.gettempdir()))

    # Create temp_dir if not exists
    if temp_dir:
        os.makedirs(temp_dir, exist_ok=True)

    filenames = []
    now = datetime.now().strftime('%Y%m%dT%H%M%S')

    for message in messages:
        try:
            o = singer.parse_message(message).asdict()
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(message))
            raise
        message_type = o['type']
        if message_type == 'RECORD':
            if o['stream'] not in schemas:
                raise Exception(
                    "A record for stream {}"
                    "was encountered before a corresponding schema".format(
                        o['stream']))

            # Validate record
            try:
                validators[o['stream']].validate(
                    utils.float_to_decimal(o['record']))
            except Exception as ex:
                if type(ex).__name__ == "InvalidOperation":
                    logger.error(
                        "Data validation failed and cannot load to destination. RECORD: {}\n"
                        "'multipleOf' validations that allows long precisions are not supported"
                        " (i.e. with 15 digits or more). Try removing 'multipleOf' methods from JSON schema."
                        .format(o['record']))
                    raise ex

            record_to_load = o['record']
            if config.get('add_metadata_columns'):
                record_to_load = utils.add_metadata_values_to_record(o, {})
            else:
                record_to_load = utils.remove_metadata_values_from_record(o)

            filename = o['stream'] + '-' + now + '.csv'
            filename = os.path.expanduser(os.path.join(temp_dir, filename))
            target_key = utils.get_target_key(
                o,
                prefix=config.get('s3_key_prefix', ''),
                timestamp=now,
                naming_convention=config.get('naming_convention'))
            if not (filename, target_key) in filenames:
                filenames.append((filename, target_key))

            file_is_empty = (
                not os.path.isfile(filename)) or os.stat(filename).st_size == 0

            flattened_record = utils.flatten_record(record_to_load)

            if o['stream'] not in headers and not file_is_empty:
                with open(filename, 'r') as csvfile:
                    reader = csv.reader(csvfile,
                                        delimiter=delimiter,
                                        quotechar=quotechar)
                    first_line = next(reader)
                    headers[o[
                        'stream']] = first_line if first_line else flattened_record.keys(
                        )
            else:
                headers[o['stream']] = flattened_record.keys()

            with open(filename, 'a') as csvfile:
                writer = csv.DictWriter(csvfile,
                                        headers[o['stream']],
                                        extrasaction='ignore',
                                        delimiter=delimiter,
                                        quotechar=quotechar)
                if file_is_empty:
                    writer.writeheader()

                writer.writerow(flattened_record)

            state = None
        elif message_type == 'STATE':
            logger.debug('Setting state to {}'.format(o['value']))
            state = o['value']
        elif message_type == 'SCHEMA':
            stream = o['stream']
            schemas[stream] = o['schema']
            if config.get('add_metadata_columns'):
                schemas[stream] = utils.add_metadata_columns_to_schema(o)

            schema = utils.float_to_decimal(o['schema'])
            validators[stream] = Draft7Validator(
                schema, format_checker=FormatChecker())
            key_properties[stream] = o['key_properties']
        elif message_type == 'ACTIVATE_VERSION':
            logger.debug('ACTIVATE_VERSION message')
        else:
            logger.warning("Unknown message type {} in message {}".format(
                o['type'], o))

    # Upload created CSV files to S3
    for filename, target_key in filenames:
        compressed_file = None
        if config.get("compression") is None or config["compression"].lower(
        ) == "none":
            pass  # no compression
        else:
            if config["compression"] == "gzip":
                compressed_file = f"{filename}.gz"
                with open(filename, 'rb') as f_in:
                    with gzip.open(compressed_file, 'wb') as f_out:
                        logger.info(f"Compressing file as '{compressed_file}'")
                        shutil.copyfileobj(f_in, f_out)
            else:
                raise NotImplementedError(
                    "Compression type '{}' is not supported. "
                    "Expected: 'none' or 'gzip'".format(config["compression"]))
        s3.upload_file(compressed_file or filename,
                       s3_client,
                       config.get('s3_bucket'),
                       target_key,
                       encryption_type=config.get('encryption_type'),
                       encryption_key=config.get('encryption_key'))

        # Remove the local file(s)
        os.remove(filename)
        if compressed_file:
            os.remove(compressed_file)

    return state
Example #14
def persist_lines_stream(project_id, dataset_id, ensure_ascii, lines=None, validate_records=True, array_nodes=[],
                         force_to_string_fields=[]):
    state = None
    schemas = {}
    key_properties = {}
    tables = {}
    rows = {}
    errors = collections.defaultdict(list)
    data_holder = []
    lines_read = False
    stream = None

    if flags.no_records:
        no_records = int(flags.no_records)
    else:
        logger.info('Number of records not specified. Setting to maximum: {}'.format(MAX_NO_RECORDS))
        no_records = MAX_NO_RECORDS

    if flags.data_location:
        bigquery_client = bigquery.Client(project=project_id, location=flags.data_location)
    else:
        bigquery_client = bigquery.Client(project=project_id)

    dataset_ref = bigquery_client.dataset(dataset_id)
    dataset = Dataset(dataset_ref)
    try:
        dataset = bigquery_client.create_dataset(Dataset(dataset_ref)) or Dataset(dataset_ref)
    except exceptions.Conflict:
        pass

    payload_size = 0
    for line in lines:
        lines_read = True
        # skip SCHEMA messages (except for the initial one)
        if '{"anyOf": [{' in line:
            continue
        try:
            msg = singer.parse_message(line)
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(line))
            raise

        if isinstance(msg, singer.RecordMessage):
            if msg.stream not in schemas:
                raise Exception(
                    "A record for stream {} was encountered before a corresponding schema".format(msg.stream))

            schema = schemas[msg.stream]

            if is_record_deleted(msg.record, SDC_DELETED_AT):
                continue

            if validate_records:
                validate(msg.record, schema)

            modified_record = handle_decimal_values(msg.record)
            modified_record = handle_empty_arrays(array_nodes, modified_record)
            modified_record = force_fields_to_string(force_to_string_fields, modified_record, ensure_ascii)

            item_size = getsize(modified_record)
            if payload_size + item_size >= MAX_PAYLOAD_SIZE:
                logger.info('Near max request size. Sending: {} records, payload size: {}.'.format(len(data_holder),
                                                                                                   payload_size))
                upload_res = bigquery_client.insert_rows_json(tables[msg.stream], data_holder)
                if upload_res:
                    logger.error('Upload error: {}'.format(upload_res))
                else:
                    rows[msg.stream] += len(data_holder)
                data_holder = []
                payload_size = 0
                data_holder.append(modified_record)
                payload_size += item_size
            else:
                if len(data_holder) >= no_records:
                    logger.info(
                        "Max request size not reached, max #records reached. Sending: {} records, payload size: {} bytes.".format(
                            len(data_holder), item_size + payload_size))
                    upload_res = bigquery_client.insert_rows_json(tables[msg.stream], data_holder)
                    if upload_res:
                        logger.error('Upload error: {}'.format(upload_res))
                    else:
                        rows[msg.stream] += len(data_holder)
                    data_holder = []
                    payload_size = 0
                data_holder.append(modified_record)
                payload_size += item_size

            stream = msg.stream

            state = None

        elif isinstance(msg, singer.StateMessage):
            logger.debug('Setting state to {}'.format(msg.value))
            state = msg.value

        elif isinstance(msg, singer.SchemaMessage):
            table = msg.stream
            schemas[table] = msg.schema
            key_properties[table] = msg.key_properties
            tables[table] = bigquery.Table(dataset.table(table), schema=build_schema(schemas[table]))
            rows[table] = 0
            try:
                tables[table] = bigquery_client.create_table(tables[table])
            except exceptions.Conflict:
                pass

        elif isinstance(msg, singer.ActivateVersionMessage):
            # This is experimental and won't be used yet
            pass

        else:
            raise Exception("Unrecognized message {}".format(msg))

    if len(data_holder) > 0 and lines_read and stream:
        logger.info(
            "Remaining records. Sending: {} records, payload size: {} bytes.".format(len(data_holder), payload_size))
        upload_res = bigquery_client.insert_rows_json(tables[stream], data_holder)
        if upload_res:
            logger.error('Upload error: {}'.format(upload_res))
        else:
            rows[stream] += len(data_holder)

    for table in errors.keys():
        if not errors[table]:
            logger.info('Loaded {} row(s) into {}:{} ({})'.format(rows[table], dataset_id, table, tables[table].path))
            emit_state(state)
        else:
            logger.error('Errors: %s', errors[table])

    return state
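This variant batches streaming inserts so that each insert_rows_json call stays under both a payload ceiling (MAX_PAYLOAD_SIZE) and a record-count cap (no_records), since the BigQuery streaming API limits request size. Its getsize helper is not shown; a simple stand-in (an assumption, the original may measure size differently) is the UTF-8 length of the serialized record:

import json


def getsize(record):
    # Approximate the on-the-wire size of one record as its serialized JSON length in bytes.
    return len(json.dumps(record).encode('utf-8'))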
Example #15
def process(ProcessHandler, tap_stream, **kwargs):
    handler = ProcessHandler(logger, **kwargs)
    assert isinstance(handler, BaseProcessHandler)

    if handler.emit_initial_state():
        s = kwargs.get("initial_state", {})
        assert isinstance(s, dict)
        logger.info(f"Pushing state: {s}")
        yield s  # yield init state, so even if there is an exception right after we get proper state emitted

    update_fields = kwargs.get("update_fields", False)

    for line in tap_stream:

        if update_fields:
            obj = json.loads(line.strip())
            msg_type = obj['type']
            new_obj = dict()
            # only deal with the first depth of fields, for Google Analytics schemas and records
            if msg_type == 'RECORD':
                for key, value in obj.items():
                    if key == 'record':
                        new_obj[key] = dict()
                        for k, v in obj[key].items():
                            new_obj[key][k.replace(':', '_')] = v
                    else:
                        new_obj[key] = value
                line = json.dumps(new_obj)
            elif msg_type == 'SCHEMA':
                for key, value in obj.items():
                    if key == 'schema':
                        new_obj[key] = dict()
                        for schema_key, schema_value in obj[key].items():
                            if schema_key == 'properties':
                                new_obj[key][schema_key] = dict()
                                for k, v in obj[key][schema_key].items():
                                    new_obj[key][schema_key][k.replace(
                                        ':', '_')] = v
                            else:
                                new_obj[key][schema_key] = schema_value
                    else:
                        new_obj[key] = value
                line = json.dumps(new_obj)

        try:
            msg = singer.parse_message(line)
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(line))
            raise

        if isinstance(msg, singer.RecordMessage):
            for s in handler.handle_record_message(msg):
                logger.info(f"Pushing state: {s}")
                yield s

        elif isinstance(msg, singer.StateMessage):
            logger.info("Updating state with {}".format(msg.value))
            for s in handler.handle_state_message(msg):
                logger.info(f"Pushing state: {s}")
                yield s

        elif isinstance(msg, singer.SchemaMessage):
            logger.info("{} schema: {}".format(msg.stream, msg.schema))
            for s in handler.handle_schema_message(msg):
                logger.info(f"Pushing state: {s}")
                yield s

        elif isinstance(msg, singer.ActivateVersionMessage):
            # This is experimental and won't be used yet
            pass

        else:
            raise Exception("Unrecognized message {}".format(msg))

    for s in handler.on_stream_end():
        logger.info(f"Pushing state: {s}")
        yield s
Example #16
def persist_messages(messages, config, s3_client):
    state = None
    schemas = {}
    key_properties = {}
    headers = {}
    validators = {}

    filenames = []
    file_size_counters = dict()
    file_count_counters = dict()
    file_data = dict()
    filename = None
    s3_path, s3_filename = None, None
    now = datetime.now().strftime('%Y%m%dT%H%M%S')
    max_file_size_mb = config.get('max_temp_file_size_mb', 1000)
    stream = None

    if config.get('record_unique_field'):
        a = set()
        write_temp_pickle()

    for message in messages:
        try:
            o = singer.parse_message(message).asdict()
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(message))
            raise
        message_type = o['type']
        # if message_type != 'RECORD':
        #     logger.info("{} - message: {}".format(message_type, o))

        # if message_type not in message_types:
        #     logger.info("{} - message: {}".format(message_type, o))

        if message_type == 'RECORD':
            if o['stream'] not in schemas:
                raise Exception(
                    "A record for stream {}"
                    "was encountered before a corresponding schema".format(
                        o['stream']))

            # Validate record
            try:
                validators[o['stream']].validate(
                    utils.float_to_decimal(o['record']))
            except Exception as ex:
                if type(ex).__name__ == "InvalidOperation":
                    logger.error(
                        """Data validation failed and cannot load to destination. RECORD: {}\n
                    'multipleOf' validations that allows long precisions are not supported 
                    (i.e. with 15 digits or more). Try removing 'multipleOf' methods from JSON schema.
                    """.format(o['record']))
                    raise ex

            record_to_load = o['record']
            if config.get('add_metadata_columns'):
                record_to_load = utils.add_metadata_values_to_record(o, {})
            else:
                record_to_load = utils.remove_metadata_values_from_record(o)

            flattened_record = utils.flatten_record(record_to_load)

            if filename is None:
                filename = '{}.jsonl'.format(now)
                filename = os.path.join(tempfile.gettempdir(), filename)
                filename = os.path.expanduser(filename)
                file_size_counters[filename] = 0
                file_count_counters[filename] = file_count_counters.get(
                    filename, 1)

            full_s3_target = str(
                file_count_counters[filename]) + '_' + filename

            if not (filename, full_s3_target) in filenames:
                filenames.append((filename, full_s3_target))

            file_size = os.path.getsize(filename) if os.path.isfile(
                filename) else 0
            # Parenthesize: '%' binds tighter than '>>', so the original
            # 'file_size >> 20 % 100 == 0' evaluated as '(file_size >> 20) == 0'.
            if (file_size >> 20) > file_size_counters[filename] \
                    and (file_size >> 20) % 100 == 0:
                logger.info('file_size: {} MB, filename: {}'.format(
                    round(file_size >> 20, 2), filename))
                file_size_counters[filename] = file_size_counters.get(
                    filename, 0) + 10

            if file_size >> 20 > max_file_size_mb:
                logger.info(
                    'Max file size reached: {}, dumping to s3...'.format(
                        max_file_size_mb))

                upload_to_s3(s3_client, config.get("s3_bucket"), filename,
                             stream, config.get('field_to_partition_by_time'),
                             config.get('record_unique_field'),
                             config.get("compression"),
                             config.get('encryption_type'),
                             config.get('encryption_key'))
                file_size = 0
                file_count_counters[filename] = file_count_counters.get(
                    filename, 1) + 1
                if filename in headers:
                    del headers[filename]

            file_is_empty = file_size == 0
            if file_is_empty:
                logger.info('creating file: {}'.format(filename))

            with open(filename, 'a') as f:
                f.write(json.dumps(flattened_record))
                f.write('\n')

            state = None
        elif message_type == 'STATE':
            logger.debug('Setting state to {}'.format(o['value']))
            state = o['value']
        elif message_type == 'SCHEMA':
            stream = o['stream']
            schemas[stream] = o['schema']
            if config.get('add_metadata_columns'):
                schemas[stream] = utils.add_metadata_columns_to_schema(o)

            schema = utils.float_to_decimal(o['schema'])
            validators[stream] = Draft4Validator(
                schema, format_checker=FormatChecker())
            key_properties[stream] = o['key_properties']
            filename = None

            if config.get('field_to_partition_by_time'
                          ) not in key_properties[stream]:
                raise Exception(
                    """field_to_partition_by_time '{}' is not in key_properties: {}"""
                    .format(config.get('field_to_partition_by_time'),
                            key_properties[stream]))

        elif message_type == 'ACTIVATE_VERSION':
            logger.debug('ACTIVATE_VERSION message')
        else:
            logger.warning("Unknown message type {} in message {}".format(
                o['type'], o))

    # Upload the created JSONL files to S3
    for filename, s3_target in filenames:
        upload_to_s3(s3_client, config.get("s3_bucket"), filename, stream,
                     config.get('field_to_partition_by_time'),
                     config.get('record_unique_field'),
                     config.get("compression"), config.get('encryption_type'),
                     config.get('encryption_key'))

    return state
Example #17
def persist_lines_job(
    project_id,
    dataset_id,
    lines=None,
    truncate=False,
    validate_records=True,
    key_path=None,
):
    state = None
    schemas = {}
    key_properties = {}
    tables = {}
    rows = {}
    errors = {}

    if key_path:
        credentials = service_account.Credentials.from_service_account_file(
            key_path,
            scopes=['https://www.googleapis.com/auth/cloud-platform'],
        )
        bigquery_client = bigquery.Client(credentials=credentials,
                                          project=project_id)
    else:
        bigquery_client = bigquery.Client(project=project_id)

    # try:
    #     dataset = bigquery_client.create_dataset(Dataset(dataset_ref)) or Dataset(dataset_ref)
    # except exceptions.Conflict:
    #     pass

    for line in lines:
        try:
            msg = singer.parse_message(line)
        except json.decoder.JSONDecodeError:
            logger.error('Unable to parse:\n{}'.format(line))
            raise

        if isinstance(msg, singer.RecordMessage):
            if msg.stream not in schemas:
                raise Exception(
                    'A record for stream {} was encountered before a corresponding schema'
                    .format(msg.stream))

            schema = schemas[msg.stream]

            if validate_records:
                validate(msg.record, schema)

            # NEWLINE_DELIMITED_JSON expects literal JSON formatted data, with a newline character splitting each row.
            dat = bytes(json.dumps(msg.record) + '\n', 'UTF-8')

            rows[msg.stream].write(dat)
            # rows[msg.stream].write(bytes(str(msg.record) + '\n', 'UTF-8'))

            state = None

        elif isinstance(msg, singer.StateMessage):
            logger.debug('Setting state to {}'.format(msg.value))
            state = msg.value

        elif isinstance(msg, singer.SchemaMessage):
            table = msg.stream
            schemas[table] = msg.schema
            key_properties[table] = msg.key_properties
            # tables[table] = bigquery.Table(dataset.table(table), schema=build_schema(schemas[table]))
            rows[table] = TemporaryFile(mode='w+b')
            errors[table] = None
            # try:
            #     tables[table] = bigquery_client.create_table(tables[table])
            # except exceptions.Conflict:
            #     pass

        elif isinstance(msg, singer.ActivateVersionMessage):
            # This is experimental and won't be used yet
            pass

        else:
            raise Exception('Unrecognized message {}'.format(msg))

    for table in rows.keys():
        table_ref = bigquery_client.dataset(dataset_id).table(table)
        SCHEMA = build_schema(schemas[table])
        load_config = LoadJobConfig()
        load_config.schema = SCHEMA
        load_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON

        if truncate:
            load_config.write_disposition = WriteDisposition.WRITE_TRUNCATE

        rows[table].seek(0)
        logger.info('loading {} to Bigquery.\n'.format(table))
        load_job = bigquery_client.load_table_from_file(rows[table],
                                                        table_ref,
                                                        job_config=load_config)
        logger.info('loading job {}'.format(load_job.job_id))
        logger.info(load_job.result())

    # for table in errors.keys():
    #     if not errors[table]:
    #         print('Loaded {} row(s) into {}:{}'.format(rows[table], dataset_id, table), tables[table].path)
    #     else:
    #         print('Errors:', errors[table], sep=" ")

    return state
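Note: Exemple #17 buffers each stream as newline-delimited JSON in a TemporaryFile and rewinds it before handing it to load_table_from_file. A minimal sketch of just that buffering step, with illustrative records:

import json
from tempfile import TemporaryFile

buffer = TemporaryFile(mode='w+b')
for record in ({'id': 1, 'name': 'foo'}, {'id': 2, 'name': 'bar'}):
    # One JSON document per line, as NEWLINE_DELIMITED_JSON expects.
    buffer.write(bytes(json.dumps(record) + '\n', 'UTF-8'))

buffer.seek(0)  # rewind so the load job reads from the first byte
print(buffer.read().decode('UTF-8'))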
Exemple #18
0
def summarize_output(output):
    summary = OutputSummary()
    for line in output:
        summary.add(singer.parse_message(line))
    return summary
Exemple #19
0
 def test_parse_message_record_naive_extraction_time(self):
     with self.assertRaisesRegex(ValueError, "must be either None or an aware datetime"):
         message = singer.parse_message(
             '{"type": "RECORD", "record": {"name": "foo"}, "stream": "users", "version": 2, "time_extracted": "1970-01-02T00:00:00"}')
Exemple #20
0
def persist_lines_job(lines=None, truncate=False, validate_records=True):
    state = None
    schemas = {}
    key_properties = {}
    bq_schemas = {}
    rows = {}
    errors = {}

    # try:
    #     dataset = BIGQUERY_CLIENT.create_dataset(Dataset(dataset_ref)) or Dataset(dataset_ref)
    # except exceptions.Conflict:
    #     pass

    for line in lines:
        try:
            msg = singer.parse_message(line)
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(line))
            raise

        if isinstance(msg, singer.RecordMessage):
            if msg.stream not in schemas:
                raise Exception(
                    "A record for stream {} was encountered before a corresponding schema".format(msg.stream))

            schema = schemas[msg.stream]

            if validate_records:
                validate(msg.record, schema)

            msg.record = apply_string_conversions(msg.record, bq_schemas[msg.stream])
            # NEWLINE_DELIMITED_JSON expects literal JSON formatted data, with a newline character splitting each row.
            new_record = apply_decimal_conversions(msg.record)
            dat = bytes(json.dumps(new_record) + '\n', 'UTF-8')

            rows[msg.stream].write(dat)
            # rows[msg.stream].write(bytes(str(msg.record) + '\n', 'UTF-8'))

            state = None

        elif isinstance(msg, singer.StateMessage):
            logger.debug('Setting state to {}'.format(msg.value))
            state = msg.value

        elif isinstance(msg, singer.SchemaMessage):
            table = msg.stream
            schemas[table] = msg.schema
            key_properties[table] = msg.key_properties
            bq_schemas[table] = build_schema(schemas[table])
            rows[table] = TemporaryFile(mode='w+b')
            errors[table] = None
            # try:
            #     tables[table] = BIGQUERY_CLIENT.create_table(tables[table])
            # except exceptions.Conflict:
            #     pass

        elif isinstance(msg, singer.ActivateVersionMessage):
            # This is experimental and won't be used yet
            pass

        else:
            raise Exception("Unrecognized message {}".format(msg))

    for table in rows.keys():
        table_ref = BIGQUERY_CLIENT.dataset(DATASET_ID).table(fix_name(table))
        load_config = LoadJobConfig()
        load_config.schema = bq_schemas[table]
        load_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON

        if truncate:
            load_config.write_disposition = WriteDisposition.WRITE_TRUNCATE

        rows[table].seek(0, os.SEEK_END)
        if rows[table].tell() == 0:
            continue
        rows[table].seek(0)
        logger.info("loading {} to Bigquery.\n".format(table))
        load_job = BIGQUERY_CLIENT.load_table_from_file(
            rows[table], table_ref, job_config=load_config)
        logger.info("loading job {}".format(load_job.job_id))
        logger.info(load_job.result())
        sync_state_for_table(table, state)

    # for table in errors.keys():
    #     if not errors[table]:
    #         print('Loaded {} row(s) into {}:{}'.format(rows[table], DATASET_ID, table), tables[table].path)
    #     else:
    #         print('Errors:', errors[table], sep=" ")

    return state
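Note: unlike Exemple #17, this variant skips streams that produced no records by checking the buffer size before starting a load job. A standalone sketch of that guard:

import os
from tempfile import TemporaryFile

buf = TemporaryFile(mode='w+b')
buf.seek(0, os.SEEK_END)   # jump to the end of the buffer
if buf.tell() == 0:        # nothing was written for this stream
    print('no records buffered; skipping load job')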
Exemple #21
0
 def test_parse_message_state_missing_value(self):
     with self.assertRaises(Exception):
         singer.parse_message('{"type": "STATE"}')
Exemple #22
0
def persist_lines_stream(lines=None, validate_records=True):
    state = None
    schemas = {}
    key_properties = {}
    tables = {}
    rows = {}
    errors = {}

    dataset_ref = BIGQUERY_CLIENT.dataset(DATASET_ID)
    dataset = Dataset(dataset_ref)
    try:
        dataset = BIGQUERY_CLIENT.create_dataset(Dataset(dataset_ref)) or Dataset(dataset_ref)
    except exceptions.Conflict:
        pass

    for line in lines:
        try:
            msg = singer.parse_message(line)
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(line))
            raise

        if isinstance(msg, singer.RecordMessage):
            if msg.stream not in schemas:
                raise Exception(
                    "A record for stream {} was encountered before a corresponding schema".format(msg.stream))

            schema = schemas[msg.stream]

            if validate_records:
                validate(msg.record, schema)

            msg.record = apply_string_conversions(msg.record, tables[msg.stream].schema)
            msg.record = apply_decimal_conversions(msg.record)
            errors[msg.stream] = BIGQUERY_CLIENT.insert_rows_json(tables[msg.stream], [msg.record])
            rows[msg.stream] += 1

            state = None

        elif isinstance(msg, singer.StateMessage):
            logger.debug('Setting state to {}'.format(msg.value))
            state = msg.value
            sync_state(state)
            emit_state(state)

        elif isinstance(msg, singer.SchemaMessage):
            table = msg.stream
            schemas[table] = msg.schema
            key_properties[table] = msg.key_properties
            tables[table] = bigquery.Table(dataset.table(table), schema=build_schema(schemas[table]))
            rows[table] = 0
            errors[table] = None
            try:
                tables[table] = BIGQUERY_CLIENT.create_table(tables[table])
            except exceptions.Conflict:
                pass

        elif isinstance(msg, singer.ActivateVersionMessage):
            # This is experimental and won't be used yet
            pass

        else:
            raise Exception("Unrecognized message {}".format(msg))

    for table in errors.keys():
        if not errors[table]:
            logger.info('Loaded {} row(s) into {}:{} ({})'.format(
                rows[table], DATASET_ID, table, tables[table].path))
            emit_state(state)
        else:
            logger.error('Errors: {}'.format(errors[table]))

    return state
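Note: Exemple #22 calls emit_state after each successful load, but the helper itself is not shown. As an assumption, a typical Singer-style implementation writes the latest state to stdout, roughly like this sketch:

import json
import sys

def emit_state(state):
    # Assumed behaviour: forward the latest state line to stdout so the
    # orchestrator running the target can persist it.
    if state is not None:
        sys.stdout.write(json.dumps(state) + '\n')
        sys.stdout.flush()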
Exemple #23
0
def persist_lines_job(
    project_id,
    dataset_id,
    lines=None,
    truncate=False,
    validate_records=True,
    table_suffix=None,
):
    state = None
    schemas = {}
    key_properties = {}
    rows = {}
    errors = {}
    table_suffix = table_suffix or ""

    class DecimalEncoder(json.JSONEncoder):
        # pylint: disable=method-hidden
        def default(self, o):
            if isinstance(o, decimal.Decimal):
                return str(o)
            return super(DecimalEncoder, self).default(o)

    bigquery_client = bigquery.Client(project=project_id)

    for line in lines:
        try:
            msg = singer.parse_message(line)
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(line))
            raise

        if isinstance(msg, singer.RecordMessage):
            table_name = msg.stream + table_suffix

            if table_name not in schemas:
                raise Exception(
                    "A record for stream {} was encountered before a corresponding schema"
                    .format(table_name))

            schema = schemas[table_name]

            if validate_records:
                validate(msg.record, schema)

            new_rec = filter(schema, msg.record)

            # NEWLINE_DELIMITED_JSON expects literal JSON formatted data, with a newline character splitting each row.
            dat = bytes(
                json.dumps(new_rec, cls=DecimalEncoder) + "\n", "UTF-8")

            rows[table_name].write(dat)

            state = None

        elif isinstance(msg, singer.StateMessage):
            logger.debug("Setting state to {}".format(msg.value))
            state = msg.value

        elif isinstance(msg, singer.SchemaMessage):
            table_name = msg.stream + table_suffix

            if table_name in rows:
                continue

            schemas[table_name] = msg.schema
            key_properties[table_name] = msg.key_properties
            rows[table_name] = TemporaryFile(mode="w+b")
            errors[table_name] = None

        elif isinstance(msg, singer.ActivateVersionMessage):
            # This is experimental and won't be used yet
            pass

        else:
            raise Exception("Unrecognized message {}".format(msg))

    for table in rows.keys():
        table_ref = bigquery_client.dataset(dataset_id).table(table)
        SCHEMA = build_schema(schemas[table])
        load_config = LoadJobConfig()
        load_config.schema = SCHEMA
        load_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON

        if truncate:
            load_config.write_disposition = WriteDisposition.WRITE_TRUNCATE

        logger.info("loading {} to Bigquery.\n".format(table))

        load_job = None
        try:
            load_job = bigquery_client.load_table_from_file(
                rows[table], table_ref, job_config=load_config, rewind=True)
            logger.info("loading job {}".format(load_job.job_id))
            logger.info(load_job.result())
        except exceptions.BadRequest as err:
            logger.error("failed to load table {} from file: {}".format(
                table, str(err)))
            # load_job is only populated if the job was created before the error
            if load_job is not None and load_job.errors:
                messages = [
                    f"reason: {e['reason']}, message: {e['message']}"
                    for e in load_job.errors
                ]
                logger.error("errors:\n{}".format("\n".join(messages)))
            raise

    return state
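Note: the DecimalEncoder above exists because json.dumps cannot serialize decimal.Decimal. A minimal usage sketch (the class is repeated here so the snippet runs on its own):

import decimal
import json

class DecimalEncoder(json.JSONEncoder):
    def default(self, o):
        # Render Decimal values as strings instead of raising TypeError.
        if isinstance(o, decimal.Decimal):
            return str(o)
        return super().default(o)

print(json.dumps({'price': decimal.Decimal('19.99')}, cls=DecimalEncoder))
# prints: {"price": "19.99"}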
Exemple #24
0
def persist_messages(delimiter, quotechar, file, messages):
    state = None
    schemas = {}
    key_properties = {}
    headers = {}
    validators = {}

    logger.info("do persists")

    now = datetime.now().strftime('%Y%m%dT%H%M%S')

    if file is not None:
        try:
            os.remove(file)
        except FileNotFoundError:
            pass

    for message in messages:
        try:
            o = singer.parse_message(message).asdict()
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(message))
            raise
        message_type = o['type']
        if message_type == 'RECORD':
            if o['stream'] not in schemas:
                raise Exception(
                    "A record for stream {}"
                    "was encountered before a corresponding schema".format(
                        o['stream']))

            validators[o['stream']].validate(o['record'])

            filename = o['stream'] + '-' + now + '.csv'

            if file is not None:
                filename = file

            file_is_empty = (
                not os.path.isfile(filename)) or os.stat(filename).st_size == 0

            flattened_record = flatten(o['record'])

            with open(filename, 'a') as csvfile:
                if o['stream'] not in headers:
                    if not file_is_empty:
                        with open(filename, 'r') as csvfile:
                            reader = csv.reader(csvfile,
                                                delimiter=delimiter,
                                                quotechar=quotechar)
                            first_line = next(reader)
                            headers[o[
                                'stream']] = first_line if first_line else flattened_record.keys(
                                )
                    else:
                        headers[o['stream']] = extract_header_names(
                            property=schemas[o['stream']]['properties'])
                    logger.info(f"generated headers: {headers[o['stream']]}")

                writer = csv.DictWriter(csvfile,
                                        headers[o['stream']],
                                        extrasaction='ignore',
                                        delimiter=delimiter,
                                        quotechar=quotechar)
                if file_is_empty:
                    writer.writeheader()

                for header in headers[o['stream']]:
                    if header not in flattened_record:
                        flattened_record[header] = None

                writer.writerow(flattened_record)

            state = None
        elif message_type == 'STATE':
            logger.debug('Setting state to {}'.format(o['value']))
            state = o['value']
        elif message_type == 'SCHEMA':
            stream = o['stream']
            schemas[stream] = o['schema']
            validators[stream] = Draft4Validator(o['schema'])
            key_properties[stream] = o['key_properties']
        else:
            raise Exception("Unknown message type {} in message {}".format(
                o['type'], o))

    return state
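Note: the CSV writing in Exemple #24 relies on DictWriter with extrasaction='ignore' plus explicit None padding for missing columns. A small sketch of that pattern with illustrative data:

import csv
import io

headers = ['id', 'name', 'email']
record = {'id': 1, 'name': 'foo', 'unexpected': 'silently dropped'}

out = io.StringIO()
writer = csv.DictWriter(out, headers, extrasaction='ignore',
                        delimiter=',', quotechar='"')
writer.writeheader()
# Pad columns the record does not have, as the example does with None.
writer.writerow({h: record.get(h) for h in headers})
print(out.getvalue())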
Exemple #25
0
 def test_parse_message_record_with_version_good(self):
     message = singer.parse_message(
         '{"type": "RECORD", "record": {"name": "foo"}, "stream": "users", "version": 2}')
     self.assertEqual(
         message,
         singer.RecordMessage(record={'name': 'foo'}, stream='users', version=2))
Exemple #26
0
def persist_messages(messages, config, s3_client):
    state = None
    schemas = {}
    key_properties = {}
    headers = {}
    validators = {}
    file_counts = {}

    delimiter = config.get('delimiter', ',')
    quotechar = config.get('quotechar', '"')
    max_file_size = config.get('max_file_size_mb', 1000) * 1000000
    compression = config.get('compression')
    flatten = config.get('flatten', True)
    s3_bucket = config.get('s3_bucket')
    skip_upload = s3_bucket == 'localhost'

    # Use the system specific temp directory if no custom temp_dir provided
    temp_dir = os.path.expanduser(config.get('temp_dir',
                                             tempfile.gettempdir()))

    # Create temp_dir if not exists
    if temp_dir:
        os.makedirs(temp_dir, exist_ok=True)

    filenames = []
    now = datetime.now().strftime('%Y%m%dT%H%M%S')

    for message in messages:
        try:
            o = singer.parse_message(message).asdict()
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(message))
            raise
        message_type = o['type']
        if message_type == 'RECORD':
            if o['stream'] not in schemas:
                raise Exception(
                    "A record for stream {}"
                    "was encountered before a corresponding schema".format(
                        o['stream']))

            # Validate record
            try:
                # validators[o['stream']].validate(utils.float_to_decimal(o['record']))
                pass  # Skipping validation as it slows things way down
            except Exception as ex:
                if type(ex).__name__ == "InvalidOperation":
                    logger.error(
                        "Data validation failed and cannot load to destination. RECORD: {}\n"
                        "'multipleOf' validations that allows long precisions are not supported"
                        " (i.e. with 15 digits or more). Try removing 'multipleOf' methods from JSON schema."
                        .format(o['record']))
                    raise ex

            record_to_load = o['record']
            if config.get('add_metadata_columns'):
                record_to_load = utils.add_metadata_values_to_record(o, {})
            else:
                record_to_load = utils.remove_metadata_values_from_record(o)

            filename = o['stream'] + '-' + now + '.csv'
            filename = os.path.expanduser(os.path.join(temp_dir, filename))
            target_key = utils.get_target_key(
                o,
                prefix=config.get('s3_key_prefix', ''),
                timestamp=now,
                naming_convention=config.get('naming_convention'))
            if not (filename, target_key) in filenames:
                filenames.append((filename, target_key))

            file_is_empty = (
                not os.path.isfile(filename)) or os.stat(filename).st_size == 0

            flattened_record = utils.flatten_record(
                record_to_load) if flatten else record_to_load

            if o['stream'] not in headers and not file_is_empty:
                with open(filename, 'r') as csvfile:
                    reader = csv.reader(csvfile,
                                        delimiter=delimiter,
                                        quotechar=quotechar)
                    first_line = next(reader)
                    headers[o[
                        'stream']] = first_line if first_line else flattened_record.keys(
                        )
            else:
                headers[o['stream']] = flattened_record.keys()

            with open(filename, 'a') as csvfile:
                writer = csv.DictWriter(csvfile,
                                        headers[o['stream']],
                                        extrasaction='ignore',
                                        delimiter=delimiter,
                                        quotechar=quotechar)
                if file_is_empty:
                    writer.writeheader()

                writer.writerow(flattened_record)

            if os.stat(filename).st_size > max_file_size:
                cnt = file_counts[filename] = file_counts.get(filename, 0) + 1
                # Add counter sequence to filename
                rename_file = utils.add_file_count(filename, cnt)
                os.rename(filename, rename_file)
                filename = rename_file

                compressed_file = utils.compress_file(filename, compression)
                comp_ext = '.gz' if compressed_file else ''

                if skip_upload:
                    continue  # Skip S3 upload and keep local files

                # upload to s3 with amended target_key
                s3.upload_file(compressed_file or filename,
                               s3_client,
                               config.get('s3_bucket'),
                               utils.add_file_count(target_key, cnt) +
                               comp_ext,
                               encryption_type=config.get('encryption_type'),
                               encryption_key=config.get('encryption_key'))

            state = None
        elif message_type == 'STATE':
            logger.debug('Setting state to {}'.format(o['value']))
            state = o['value']
        elif message_type == 'SCHEMA':
            stream = o['stream']
            schemas[stream] = o['schema']
            if config.get('add_metadata_columns'):
                schemas[stream] = utils.add_metadata_columns_to_schema(o)

            schema = utils.float_to_decimal(o['schema'])
            validators[stream] = Draft7Validator(
                schema, format_checker=FormatChecker())
            key_properties[stream] = o['key_properties']
        elif message_type == 'ACTIVATE_VERSION':
            logger.debug('ACTIVATE_VERSION message')
        else:
            logger.warning("Unknown message type {} in message {}".format(
                o['type'], o))

    # Upload created CSV files to S3
    for filename, target_key in filenames:
        if not os.path.isfile(filename):
            continue

        cnt = file_counts.get(filename, 0) + 1
        if cnt > 1:
            target_key = utils.add_file_count(target_key, cnt)

        compressed_file = utils.compress_file(filename, compression)
        comp_ext = '.gz' if compressed_file else ''

        if skip_upload:
            continue  # Skip S3 upload and keep local files

        s3.upload_file(compressed_file or filename,
                       s3_client,
                       s3_bucket,
                       target_key + comp_ext,
                       encryption_type=config.get('encryption_type'),
                       encryption_key=config.get('encryption_key'))

    return state
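Note: Exemple #26 depends on a utils.compress_file helper that is not shown. As an assumption, a gzip-based version could look roughly like this sketch (helper name and behaviour are illustrative):

import gzip
import shutil

def compress_file_sketch(filename, compression):
    # Return the path of the .gz copy when gzip compression is requested,
    # otherwise None so the caller uploads the original file.
    if compression != 'gzip':
        return None
    compressed = filename + '.gz'
    with open(filename, 'rb') as src, gzip.open(compressed, 'wb') as dst:
        shutil.copyfileobj(src, dst)
    return compressed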
Exemple #27
0
 def test_parse_message_record_missing_stream(self):
     with self.assertRaises(Exception):
         singer.parse_message(
             '{"type": "RECORD", "record": {"name": "foo"}}')
Exemple #28
0
def write_records(cluster,
                  username,
                  password,
                  bucket,
                  lines=None,
                  collection_map=None,
                  index_keys=None,
                  on_invalid_record="abort"):
    if on_invalid_record not in ("abort", "skip", "force"):
        raise ValueError("on_invalid_record must be one of" +
                         " (abort, skip, force)")

    state = None
    schemas = {}
    tables = {}
    key_properties = {}
    table_files = {}
    row_count = {}
    errors = {}

    cluster = Cluster(
        "couchbase://" + cluster,
        ClusterOptions(PasswordAuthenticator(username, password)))

    count = 0
    invalids = 0
    current_batch = defaultdict(dict)

    for line in lines:
        try:
            message = singer.parse_message(line)
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(line))
            raise

        if isinstance(message, singer.RecordMessage):
            json_dumps = False
            record, invalids = clean_and_validate(message, schemas, invalids,
                                                  on_invalid_record,
                                                  json_dumps)

            if invalids == 0 or on_invalid_record == "force":
                record["_stream"] = message.stream
                collection_name = None
                if collection_map:
                    collection_name = collection_map.get(message.stream)
                if index_keys and index_keys.get(message.stream):
                    key = record[index_keys[message.stream]]
                else:
                    key = uuid.uuid4().hex

                current_batch[collection_name or "_"][key] = record

                if is_batch_ready(current_batch[collection_name or "_"]):
                    flush_batch(cluster, bucket,
                                current_batch.pop(collection_name or "_"))

            row_count[message.stream] += 1
            state = None

        elif isinstance(message, singer.StateMessage):
            state = message.value
            # State may contain sensitive info. Not logging in production
            logger.debug("State: %s" % state)
            currently_syncing = state.get("currently_syncing")
            bookmarks = state.get("bookmarks")
            if currently_syncing and bookmarks:
                logger.info(
                    "State: currently_syncing %s - last_update: %s" %
                    (currently_syncing, bookmarks.get(
                        currently_syncing, dict()).get("last_update")))

        elif isinstance(message, singer.SchemaMessage):
            table_name = message.stream

            if schemas.get(table_name):
                # Redundant schema rows
                continue

            schemas[table_name] = message.schema
            key_properties[table_name] = message.key_properties
            row_count[table_name] = 0
            errors[table_name] = None

        elif isinstance(message, singer.ActivateVersionMessage):
            # This is experimental and won't be used yet
            pass

        else:
            raise Exception("Unrecognized message {}".format(message))

        count = count + 1

    for collection_name, batch in current_batch.items():
        if batch:
            if collection_name == "_":
                collection_name = None
            flush_batch(cluster, bucket, batch, collection_name)

    return state
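Note: Exemple #28 batches records per collection in a defaultdict(dict) and flushes once is_batch_ready says so; neither helper is shown. A rough sketch of that pattern with an arbitrary size threshold:

from collections import defaultdict

BATCH_SIZE = 500  # illustrative threshold; the real helper is not shown

def is_batch_ready(batch):
    return len(batch) >= BATCH_SIZE

current_batch = defaultdict(dict)
current_batch['_']['some-key'] = {'_stream': 'users', 'name': 'foo'}

if is_batch_ready(current_batch['_']):
    batch = current_batch.pop('_')
    # flush_batch(cluster, bucket, batch) would upsert these documents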
Exemple #29
0
 def test_parse_message_schema_missing_schema(self):
     with self.assertRaises(Exception):
         message = singer.parse_message(
             '{"type": "SCHEMA", "stream": "users", "key_properties": ["name"]}')  # nopep8
Exemple #30
0
def persist_lines(project_id, dataset_id, table_id, lines):
    state = None
    schemas = {}
    key_properties = {}

    rows = []

    for line in lines:
        try:
            msg = singer.parse_message(line)
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(line))
            raise

        if isinstance(msg, singer.RecordMessage):
            if msg.stream not in schemas:
                raise Exception(
                    "A record for stream {} was encountered before a corresponding schema"
                    .format(msg.stream))

            schema = schemas[msg.stream]
            validate(msg.record, schema)

            bigquery_client = bigquery.Client(project=project_id)

            dataset_ref = bigquery_client.dataset(dataset_id)
            dataset = Dataset(dataset_ref)

            try:
                dataset = bigquery_client.create_dataset(
                    Dataset(dataset_ref)) or Dataset(dataset_ref)
            except exceptions.Conflict:
                pass

            table_ref = dataset.table(table_id)
            table_schema = build_schema(schema)

            table = bigquery.Table(table_ref, schema=table_schema)
            try:
                table = bigquery_client.create_table(table)
            except exceptions.Conflict:
                pass

            rows.append(msg.record)

            state = None
        elif isinstance(msg, singer.StateMessage):
            logger.debug('Setting state to {}'.format(msg.value))
            state = msg.value
        elif isinstance(msg, singer.SchemaMessage):
            schemas[msg.stream] = msg.schema
            key_properties[msg.stream] = msg.key_properties
        elif isinstance(msg, singer.ActivateVersionMessage):
            # This is experimental and won't be used yet
            pass
        else:
            raise Exception("Unrecognized message {}".format(msg))

    errors = bigquery_client.create_rows(table, rows)

    if not errors:
        print('Loaded {} row(s) into {}:{}'.format(len(rows), dataset_id,
                                                   table_id))
    else:
        print('Errors:')
        pprint(errors)

    return state
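Note: several examples call a build_schema helper that converts the Singer JSON schema into BigQuery fields but never show it. A deliberately simplified sketch (scalar types only, everything NULLABLE) of what such a mapping might look like:

from google.cloud import bigquery

TYPE_MAP = {'string': 'STRING', 'integer': 'INTEGER',
            'number': 'FLOAT', 'boolean': 'BOOLEAN'}

def build_schema_sketch(json_schema):
    fields = []
    for name, prop in json_schema.get('properties', {}).items():
        json_type = prop.get('type', 'string')
        if isinstance(json_type, list):  # e.g. ["null", "string"]
            json_type = [t for t in json_type if t != 'null'][0]
        fields.append(bigquery.SchemaField(
            name, TYPE_MAP.get(json_type, 'STRING'), mode='NULLABLE'))
    return fields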