Example #1
    def for_each_time_entry(time_entry, time_extracted, stream_version=None):
        # Extract external_reference
        external_reference_schema = load_and_write_schema("external_reference")
        load_and_write_schema(
            "time_entry_external_reference",
            key_properties=["time_entry_id", "external_reference_id"])
        if time_entry['external_reference'] is not None:
            with Transformer() as transformer:
                external_reference = time_entry['external_reference']
                external_reference = transformer.transform(
                    external_reference, external_reference_schema)

                new_record = singer.RecordMessage(
                    stream="external_reference",
                    record=external_reference,
                    version=stream_version,
                    time_extracted=time_extracted)
                singer.write_message(new_record)

                # Create pivot row for time_entry and external_reference
                pivot_row = {
                    'time_entry_id': time_entry['id'],
                    'external_reference_id': external_reference['id']
                }

                new_record = singer.RecordMessage(
                    stream="time_entry_external_reference",
                    record=pivot_row,
                    version=stream_version,
                    time_extracted=time_extracted)
                singer.write_message(new_record)
Example #2
def row_to_singer_message(stream, row, version, columns, time_extracted):
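    # Coerce each column value to the type declared for it in the stream's JSON schema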
    row_to_persist = ()
    for idx, elem in enumerate(row):
        property_type = stream.schema.properties[columns[idx]].type
        multiple_of = stream.schema.properties[columns[idx]].multipleOf
        format = stream.schema.properties[columns[idx]].format #date-time
        if elem is None:
            row_to_persist += (elem,)
        elif 'integer' in property_type or property_type == 'integer':
            integer_representation = int(elem)
            row_to_persist += (integer_representation,)
        elif ('number' in property_type or property_type == 'number') and multiple_of:
            decimal_representation = decimal.Decimal(elem)
            row_to_persist += (decimal_representation,)
        elif ('number' in property_type or property_type == 'number'):
            row_to_persist += (float(elem),)
        elif format == 'date-time':
            row_to_persist += (elem,)
        else:
            row_to_persist += (elem,)

    rec = dict(zip(columns, row_to_persist))
    return singer.RecordMessage(
        stream=stream.stream,
        record=rec,
        version=version,
        time_extracted=time_extracted)
Example #3
def sync_report(report, stream, config):
    report_url = report['report_url']
    username = config['username']
    password = config['password']

    LOGGER.info('Syncing report "%s".', report_url)

    record_count = 0

    record = {}

    stream_version = int(time.time() * 1000)
    extraction_time = utils.now().isoformat()

    singer.write_version(stream.tap_stream_id, stream_version)

    with Transformer() as transformer:
        for record in stream_report(report_url, username, password):
            to_write = transformer.transform(record, stream.schema.to_dict(),
                                             metadata.to_map(stream.metadata))
            to_write['_sdc_extracted_at'] = extraction_time
            record_message = singer.RecordMessage(stream.tap_stream_id,
                                                  to_write,
                                                  version=stream_version)
            singer.write_message(record_message)
            record_count += 1

    return record_count
Example #4
def row_to_singer_record(catalog_entry, version, db_column_map, row,
                         time_extracted):
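    # Convert each database value to a JSON-friendly representation based on its column type and schema type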
    row_to_persist = {}

    for column_name, val in row.items():
        property_type = catalog_entry.schema.properties[column_name].type
        db_column_type = db_column_map.get(column_name)

        if isinstance(val,
                      (datetime.datetime, datetime.date, datetime.timedelta)):
            the_utc_date = common.to_utc_datetime_str(val)
            row_to_persist[column_name] = the_utc_date

        elif db_column_type == FIELD_TYPE.JSON:
            row_to_persist[column_name] = json.dumps(json_bytes_to_string(val))

        elif 'boolean' in property_type or property_type == 'boolean':
            if val is None:
                boolean_representation = None
            elif val == 0:
                boolean_representation = False
            elif db_column_type == FIELD_TYPE.BIT:
                boolean_representation = int(val) != 0
            else:
                boolean_representation = True
            row_to_persist[column_name] = boolean_representation

        else:
            row_to_persist[column_name] = val

    return singer.RecordMessage(stream=catalog_entry.stream,
                                record=row_to_persist,
                                version=version,
                                time_extracted=time_extracted)
Example #5
def sync_file(bucket, s3_path, stream, version=None):
    LOGGER.info('Syncing file "%s".', s3_path)

    table_name = stream['stream']

    s3_file_handle = s3.get_file_handle(bucket, s3_path)
    iterator = fastavro.reader(s3_file_handle._raw_stream)
    mdata = metadata.to_map(stream['metadata'])
    schema = generate_schema_from_avro(iterator.schema)

    key_properties = metadata.get(mdata, (), 'table-key-properties')
    singer.write_schema(table_name, schema, key_properties)

    # Activate a version so we execute a full table sync
    if version is not None:
        LOGGER.info('Sending Activate Version Message with version %d',
                    version)
        message = singer.ActivateVersionMessage(stream=table_name,
                                                version=version)
        singer.write_message(message)

    records_synced = 0
    with Transformer() as transformer:
        for row in iterator:
            to_write = transformer.filter_data_by_metadata(row, mdata)
            singer.write_message(
                singer.RecordMessage(table_name, to_write, version=version))
            records_synced += 1

    return records_synced
Example #6
def row_to_singer_record(catalog_entry, version, row, columns, time_extracted):
    row_to_persist = ()
    for idx, elem in enumerate(row):
        property_type = catalog_entry.schema.properties[columns[idx]].type

        if isinstance(elem,
                      (datetime.datetime, datetime.date, datetime.timedelta)):
            the_utc_date = to_utc_datetime_str(elem)
            row_to_persist += (the_utc_date, )

        elif isinstance(elem, bytes):
            # for BIT value, treat 0 as False and anything else as True
            boolean_representation = elem != b'\x00'
            row_to_persist += (boolean_representation, )

        elif 'boolean' in property_type or property_type == 'boolean':
            if elem is None:
                boolean_representation = None
            elif elem == 0:
                boolean_representation = False
            else:
                boolean_representation = True
            row_to_persist += (boolean_representation, )

        else:
            row_to_persist += (elem, )
    rec = dict(zip(columns, row_to_persist))

    return singer.RecordMessage(stream=catalog_entry.stream,
                                record=rec,
                                version=version,
                                time_extracted=time_extracted)
Example #7
    def do_sync(self):
        bookmark_date_as_date = str_to_date(self.bookmark_date)
        max_product_date = bookmark_date_as_date

        product_response = self.client.make_request("/products/mine")
        product_ids = []

        with singer.metrics.Counter('record_count',
                                    {'endpoint': 'products'}) as counter:

            for product in product_response.json().values():
                product_ids.append(product['id'])

                # Only upsert messages which have changed
                product_date = product['updated_date'] if product['updated_date']\
                    else product['added_date']
                product_date = str_to_date(product_date)

                product = tidy_dates(product)

                if product_date > bookmark_date_as_date:
                    singer.write_message(
                        singer.RecordMessage(
                            stream='products',
                            record=product,
                        ))
                max_product_date = max(max_product_date, product_date)

                counter.increment()

        self.state = singer.write_bookmark(self.state, self.STREAM_NAME,
                                           'last_record',
                                           date_to_str(max_product_date))

        self.product_ids = product_ids
Example #8
def row_to_singer_message(stream, row, version, columns, time_extracted,
                          md_map, conn_info):
    row_to_persist = ()
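    # Register sql-datatypes for the _sdc metadata columns so the per-column lookup below does not fail for them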
    md_map[('properties', '_sdc_deleted_at')] = {
        'sql-datatype': 'timestamp with time zone'
    }
    md_map[('properties', '_sdc_lsn')] = {'sql-datatype': "character varying"}

    for idx, elem in enumerate(row):
        sql_datatype = md_map.get(
            ('properties', columns[idx])).get('sql-datatype')

        if not sql_datatype:
            LOGGER.info("No sql-datatype found for stream %s: %s", stream,
                        columns[idx])
            raise Exception(
                "Unable to find sql-datatype for stream {}".format(stream))

        cleaned_elem = selected_value_to_singer_value(elem, sql_datatype,
                                                      conn_info)
        row_to_persist += (cleaned_elem, )

    rec = dict(zip(columns, row_to_persist))

    return singer.RecordMessage(
        stream=post_db.calculate_destination_stream_name(stream, md_map),
        record=rec,
        version=version,
        time_extracted=time_extracted)
Example #9
    def do_sync(self):
        """
        Main sync functionality
        Most of the streams use this
        A few of the streams work differently and override this method
        """
        start_date = str_to_date(self.bookmark_date).strftime('%Y-%m-%d')

        try:
            response = self.client.make_request(self.URI.format(start_date))
        except RequestError:
            return

        new_bookmark_date = self.bookmark_date

        with singer.metrics.Counter('record_count',
                                    {'endpoint': self.STREAM_NAME}) as counter:
            for entry in self.traverse_nested_dicts(response.json(),
                                                    self.RESPONSE_LEVELS):
                new_bookmark_date = max(new_bookmark_date, entry['date'])
                entry = strings_to_floats(entry)
                singer.write_message(
                    singer.RecordMessage(
                        stream=self.STREAM_NAME,
                        record=entry,
                    ))
                counter.increment()

        self.state = singer.write_bookmark(self.state, self.STREAM_NAME,
                                           'last_record', new_bookmark_date)
Example #10
def row_to_singer_record(catalog_entry, version, row, columns, time_extracted):
    row_to_persist = ()
    for idx, elem in enumerate(row):
        property_type = catalog_entry.schema.properties[columns[idx]].type
        if isinstance(elem, datetime.datetime):
            row_to_persist += (elem.isoformat() + '+00:00', )

        elif isinstance(elem, datetime.date):
            row_to_persist += (elem.isoformat() + 'T00:00:00+00:00', )

        elif isinstance(elem, datetime.timedelta):
            epoch = datetime.datetime.utcfromtimestamp(0)
            timedelta_from_epoch = epoch + elem
            row_to_persist += (timedelta_from_epoch.isoformat() + '+00:00', )

        elif 'boolean' in property_type or property_type == 'boolean':
            if elem is None:
                boolean_representation = None
            elif elem == 0:
                boolean_representation = False
            else:
                boolean_representation = True
            row_to_persist += (boolean_representation, )

        else:
            row_to_persist += (elem, )
    rec = dict(zip(columns, row_to_persist))

    return singer.RecordMessage(stream=catalog_entry.stream,
                                record=rec,
                                version=version,
                                time_extracted=time_extracted)
Example #11
def row_to_singer_record(stream, row, version, time_extracted):
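    # Convert each value in the document with transform_value before building the record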
    row_to_persist = {k: transform_value(v) for k, v in row.items()}

    return singer.RecordMessage(stream=stream['tap_stream_id'],
                                record=row_to_persist,
                                version=version,
                                time_extracted=time_extracted)
Example #12
    def do_sync(self):
        max_product_date = self.bookmark_date

        product_response = self.client.make_request("/products/mine")
        product_ids = []
        product_types = []
        with singer.metrics.Counter('record_count',
                                    {'endpoint': 'products'}) as counter:

            for product in product_response.json().values():
                record = ProductRecord(product, self.schema)
                product_ids.append(record.clean_data['id'])
                product_types.append(record.clean_data['type'])

                # Only upsert messages which have changed
                if record.product_date > self.bookmark_date:
                    singer.write_message(
                        singer.RecordMessage(
                            stream='products',
                            record=product,
                        ))
                    max_product_date = max(max_product_date,
                                           record.product_date)

                    counter.increment()

        self.state = singer.write_bookmark(self.state, self.STREAM_NAME,
                                           'last_record',
                                           date_to_str(max_product_date))

        self.product_ids = product_ids
        self.product_types = product_types
Example #13
    def test_round_trip(self):
        record_message = singer.RecordMessage(record={'name': 'foo'},
                                              stream='users')

        schema_message = singer.SchemaMessage(stream='users',
                                              key_properties=['name'],
                                              schema={
                                                  'type': 'object',
                                                  'properties': {
                                                      'name': {
                                                          'type': 'string'
                                                      }
                                                  }
                                              })

        state_message = singer.StateMessage(value={'seq': 1})

        self.assertEqual(
            record_message,
            singer.parse_message(singer.format_message(record_message)))
        self.assertEqual(
            schema_message,
            singer.parse_message(singer.format_message(schema_message)))
        self.assertEqual(
            state_message,
            singer.parse_message(singer.format_message(state_message)))
Example #14
    def do_sync(self):
        """
        Main sync functionality
        Allows for differences in schemas between catalog and the actual received data to unravel lists
        This permits the user to get more granular ratings info (e.g. number of reviews for each rating)
        """
        start_date = str_to_date(self.bookmark_date).strftime('%Y-%m-%d')

        while str_to_date(start_date).date() < datetime.date.today():
            end_date = min(
                str_to_date(start_date).date() + datetime.timedelta(days=28),
                datetime.date.today() - datetime.timedelta(days=1))

            try:
                response = self.client.make_request(
                    self.URI.format(start_date, end_date.strftime('%Y-%m-%d')))
            except RequestError:
                return

            new_bookmark_date = self.bookmark_date
            with singer.metrics.Counter(
                    'record_count', {'endpoint': self.STREAM_NAME}) as counter:
                for entry in self.traverse_nested_dicts(
                        response.json(), self.RESPONSE_LEVELS):
                    new_bookmark_date = max(new_bookmark_date, entry['date'])

                    schema_keys = [
                        x for x in self.schema['properties'].keys()
                        if x not in entry.keys()
                    ]
                    entry_keys = [
                        x for x in entry.keys()
                        if x not in self.schema['properties'].keys()
                    ]
                    if schema_keys and entry_keys:
                        entries = list(
                            itertools.chain.from_iterable([
                                entry[entry_item] for entry_item in entry_keys
                            ]))
                        for j, schema_item in enumerate(schema_keys):
                            entry[schema_item] = entries[j]
                        for key in entry_keys:
                            del entry[key]

                    entry = strings_to_floats(entry)

                    singer.write_message(
                        singer.RecordMessage(
                            stream=self.STREAM_NAME,
                            record=entry,
                        ))
                    counter.increment()

            self.state = singer.write_bookmark(self.state, self.STREAM_NAME,
                                               'last_record',
                                               new_bookmark_date)
            if end_date == datetime.date.today() - datetime.timedelta(days=1):
                break
            start_date = end_date.strftime('%Y-%m-%d')
Example #15
def sync_stream(kafka_config, stream, state):
    consumer = KafkaConsumer(
        kafka_config['topic'],
        group_id=kafka_config['group_id'],
        enable_auto_commit=False,
        consumer_timeout_ms=kafka_config.get('consumer_timeout_ms', 10000),
        auto_offset_reset='earliest',
        value_deserializer=lambda m: json.loads(m.decode('ascii')),
        bootstrap_servers=kafka_config['bootstrap_servers'])

    send_schema_message(stream)
    stream_version = singer.get_bookmark(state, stream['tap_stream_id'],
                                         'version')
    if stream_version is None:
        stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, stream['tap_stream_id'], 'version',
                                  stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
    activate_version_message = singer.ActivateVersionMessage(
        stream=stream['tap_stream_id'], version=stream_version)

    singer.write_message(activate_version_message)

    time_extracted = utils.now()
    rows_saved = 0
    for message in consumer:
        LOGGER.info("%s:%s:%s: key=%s value=%s" %
                    (message.topic, message.partition, message.offset,
                     message.key, message.value))
        # stream['schema']
        record = singer.RecordMessage(stream=stream['tap_stream_id'],
                                      record=message.value,
                                      time_extracted=time_extracted)

        [valid, error] = validate_record(stream['schema'], record)
        rows_saved = rows_saved + 1

        if valid:
            singer.write_message(record)
        elif kafka_config.get('reject_topic'):
            send_reject_message(kafka_config, record, error)
        else:
            raise Exception(
                "record failed validation and no reject_topic was specified")

        state = singer.write_bookmark(state, stream['tap_stream_id'], 'offset',
                                      message.offset)

        #commit offsets because we processed the message
        tp = TopicPartition(message.topic, message.partition)
        consumer.commit({tp: OffsetAndMetadata(message.offset + 1, None)})

        if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
            singer.write_message(
                singer.StateMessage(value=copy.deepcopy(state)))

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
Example #16
def sync_query(config, state, stream):
    table_name = stream['tap_stream_id']

    #before writing the table version to state, check if we had one to begin with
    first_run = singer.get_bookmark(state, table_name, 'version') is None

    # last run was interrupted if there is a last_id_fetched bookmark
    was_interrupted = singer.get_bookmark(state, table_name,
                                          'last_evaluated_key') is not None

    #pick a new table version if last run wasn't interrupted
    if was_interrupted:
        stream_version = singer.get_bookmark(state, table_name, 'version')
    else:
        stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, table_name, 'version', stream_version)
    singer.write_state(state)

    # For the initial replication, emit an ACTIVATE_VERSION message
    # at the beginning so the records show up right away.
    if first_run:
        singer.write_version(table_name, stream_version)

    mdata = metadata.to_map(stream['metadata'])
    queries = metadata.get(mdata, (), "queries")

    rows_saved = 0

    deserializer = Deserializer()
    for result in query_table(table_name, queries, config):
        for item in result.get('Items', []):
            rows_saved += 1
            # TODO: Do we actually have to put the item we retrieve from
            # dynamo into a map before we can deserialize?
            record = deserializer.deserialize_item(item)
            record_message = singer.RecordMessage(stream=table_name,
                                                  record=record,
                                                  version=stream_version)

            singer.write_message(record_message)
        if result.get('LastEvaluatedKey'):
            state = singer.write_bookmark(state, table_name,
                                          'last_evaluated_key',
                                          result.get('LastEvaluatedKey'))
            singer.write_state(state)

    state = singer.clear_bookmark(state, table_name, 'last_evaluated_key')

    state = singer.write_bookmark(state, table_name,
                                  'initial_full_table_complete', True)

    singer.write_state(state)

    singer.write_version(table_name, stream_version)

    return rows_saved
Example #17
    def test_extraction_time_strftime(self):
        """ Test that we're not corrupting timestamps with cross platform parsing. (Test case for OSX, specifically) """
        message = singer.RecordMessage(
            record={'name': 'foo'},
            stream='users',
            version=2,
            time_extracted=dateutil.parser.parse("1970-01-02T00:00:00.000Z"))
        expected = "1970-01-02T00:00:00.000000Z"
        self.assertEqual(message.asdict()["time_extracted"], expected)
Example #18
    def do_sync(self):
        start_date = self.bookmark_date
        new_bookmark_date = self.bookmark_date

        # Ranks cannot be fetched for inapp
        product_ids = ','.join([
            str(id) for i, id in enumerate(self.product_ids)
            if self.product_types[i] != "inapp"
        ])

        if any(
            [product_type == "inapp" for product_type in self.product_types]):
            LOGGER.info(
                "Skipping id={} since ranks cannot be fetched for inapp purchases."
                .format(','.join([
                    str(id) for i, id in enumerate(self.product_ids)
                    if self.product_types[i] == "inapp"
                ])))

        while start_date.date() <= date.today():
            end_date = start_date + timedelta(days=28)
            uri = '/ranks/{}/daily/{}/{}'.format(
                product_ids, start_date.strftime('%Y-%m-%d'),
                end_date.strftime('%Y-%m-%d'))

            data = self.client.make_request(uri).json()
            rank_dates = data['dates']
            rank_data = data['data']

            with singer.metrics.Counter('record_count',
                                        {'endpoint': 'ranks'}) as counter:
                for rank_entry in rank_data:
                    for i, rank_date in enumerate(rank_dates):
                        record = RankRecord(
                            dict(
                                country=rank_entry['country'],
                                category=rank_entry['category'],
                                product_id=rank_entry['product_id'],
                                position=rank_entry['positions'][i],
                                delta=rank_entry['deltas'][i],
                                date=rank_date,
                            ), self.schema)

                        new_bookmark_date = max(new_bookmark_date,
                                                record.bookmark)
                        singer.write_message(
                            singer.RecordMessage(
                                stream=self.STREAM_NAME,
                                record=record.for_export,
                            ))
                        counter.increment()

            self.state = singer.write_bookmark(self.state, self.STREAM_NAME,
                                               'last_record',
                                               date_to_str(new_bookmark_date))

            start_date = end_date
Example #19
    def test_parse_message_record_with_version_good(self):
        message = singer.parse_message(
            '{"type": "RECORD", "record": {"name": "foo"}, "stream": "users", "version": 2}'
        )
        self.assertEqual(
            message,
            singer.RecordMessage(record={'name': 'foo'},
                                 stream='users',
                                 version=2))
Example #20
def row_to_record(catalog_entry, version, row, columns, time_extracted):
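    # Serialize datetime values to ISO 8601 UTC strings; all other values pass through unchanged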
    row_to_persist = ()
    for idx, elem in enumerate(row):
        if isinstance(elem, datetime.datetime):
            elem = elem.isoformat('T') + 'Z'
        row_to_persist += (elem, )
    return singer.RecordMessage(stream=catalog_entry.stream,
                                record=dict(zip(columns, row_to_persist)),
                                version=version,
                                time_extracted=time_extracted)
Example #21
def resume_syncing_bulk_query(sf, catalog_entry, job_id, state, counter):
    bulk = Bulk(sf)
    current_bookmark = singer.get_bookmark(
        state, catalog_entry['tap_stream_id'],
        'JobHighestBookmarkSeen') or sf.get_start_date(state, catalog_entry)
    current_bookmark = singer_utils.strptime_with_tz(current_bookmark)
    batch_ids = singer.get_bookmark(state, catalog_entry['tap_stream_id'],
                                    'BatchIDs')

    start_time = singer_utils.now()
    stream = catalog_entry['stream']
    stream_id = catalog_entry['tap_stream_id']
    stream_alias = catalog_entry.get('stream_alias')
    catalog_metadata = metadata.to_map(catalog_entry.get('metadata'))
    replication_key = catalog_metadata.get((), {}).get('replication-key')
    stream_version = get_stream_version(catalog_entry, state)
    schema = catalog_entry['schema']

    if not bulk.job_exists(job_id):
        LOGGER.info(
            "Found stored Job ID that no longer exists, resetting bookmark and removing JobID from state."
        )
        return counter

    # Iterate over the remaining batches, removing them once they are synced
    for batch_id in batch_ids[:]:
        with Transformer(pre_hook=transform_bulk_data_hook) as transformer:
            for rec in bulk.get_batch_results(job_id, batch_id, catalog_entry):
                counter.increment()
                rec = transformer.transform(rec, schema)
                rec = fix_record_anytype(rec, schema)
                singer.write_message(
                    singer.RecordMessage(stream=(stream_id or stream_alias
                                                 or stream),
                                         record=rec,
                                         version=stream_version,
                                         time_extracted=start_time))

                # Update bookmark if necessary
                replication_key_value = replication_key and singer_utils.strptime_with_tz(
                    rec[replication_key])
                if replication_key_value and replication_key_value <= start_time and replication_key_value > current_bookmark:
                    current_bookmark = singer_utils.strptime_with_tz(
                        rec[replication_key])

        state = singer.write_bookmark(state, catalog_entry['tap_stream_id'],
                                      'JobHighestBookmarkSeen',
                                      singer_utils.strftime(current_bookmark))
        batch_ids.remove(batch_id)
        LOGGER.info("Finished syncing batch %s. Removing batch from state.",
                    batch_id)
        LOGGER.info("Batches to go: %d", len(batch_ids))
        singer.write_state(state)

    return counter
Example #22
def sync_table_file(config, s3_path, table_spec, stream, last_modified):
    LOGGER.info('Syncing file "%s".', s3_path)

    bucket = config["bucket"]
    table_name = table_spec["table_name"]

    s3_file_handle = s3.get_file_handle(config, s3_path)
    # We observed data whose field size exceeded the default maximum of
    # 131072. We believe the primary consequence of the following setting
    # is that a malformed, wide CSV would potentially parse into a single
    # large field rather than giving this error, but we also think the
    # chances of that are very small and at any rate the source data would
    # need to be fixed. The other consequence of this could be larger
    # memory consumption but that's acceptable as well.
    csv.field_size_limit(sys.maxsize)

    encoding_module = singer_encodings_csv
    if 'encoding_module' in config:
        try:
            encoding_module = importlib.import_module(
                config['encoding_module'])
        except ModuleNotFoundError:
            LOGGER.warning(
                f'Failed to load encoding module [{config["encoding_module"]}]. Defaulting to [singer_encodings.csv]'
            )

    iterator = encoding_module.get_row_iterator(s3_file_handle._raw_stream,
                                                table_spec)  # pylint:disable=protected-access

    records_synced = 0

    for row in iterator:
        custom_columns = {
            s3.SDC_SOURCE_BUCKET_COLUMN:
            bucket,
            s3.SDC_SOURCE_FILE_COLUMN:
            s3_path,
            # index zero, +1 for header row
            s3.SDC_SOURCE_LINENO_COLUMN:
            records_synced + 2,
        }
        rec = {**row, **custom_columns}

        with Transformer() as transformer:
            to_write = transformer.transform(
                rec, stream["schema"], metadata.to_map(stream["metadata"]))

        to_write_with_sequence = RecordMessageWithSequence(
            singer.RecordMessage(stream=table_name, record=to_write),
            last_modified)

        singer.write_message(to_write_with_sequence)
        records_synced += 1

    return records_synced
Example #23
def row_to_singer_record(catalog_entry, version, db_column_map, row,
                         time_extracted):
    row_to_persist = {}

    for column_name, val in row.items():
        property_type = catalog_entry.schema.properties[column_name].type
        db_column_type = db_column_map.get(column_name)

        if isinstance(val, datetime.datetime):
            if db_column_type in MYSQL_TIMESTAMP_TYPES:
                # The mysql-replication library creates datetimes from TIMESTAMP columns using
                # fromtimestamp, which uses the local timezone, so we must set tzinfo accordingly. See:
                # https://github.com/noplay/python-mysql-replication/blob/master/pymysqlreplication/row_event.py#L143-L145
                timezone = tzlocal.get_localzone()
                local_datetime = timezone.localize(val)
                utc_datetime = local_datetime.astimezone(pytz.UTC)
                row_to_persist[column_name] = utc_datetime.isoformat()
            else:
                row_to_persist[column_name] = val.isoformat() + '+00:00'

        elif isinstance(val, datetime.date):
            row_to_persist[column_name] = val.isoformat() + 'T00:00:00+00:00'

        elif isinstance(val, datetime.timedelta):
            timedelta_from_epoch = datetime.datetime.utcfromtimestamp(0) + val
            row_to_persist[column_name] = timedelta_from_epoch.isoformat(
            ) + '+00:00'

        elif db_column_type == FIELD_TYPE.JSON:
            row_to_persist[column_name] = json.dumps(json_bytes_to_string(val))

        elif isinstance(val, bytes):
            # encode bytes as hex bytes then to utf8 string
            row_to_persist[column_name] = codecs.encode(val,
                                                        'hex').decode('utf-8')

        elif 'boolean' in property_type or property_type == 'boolean':
            if val is None:
                boolean_representation = None
            elif val == 0:
                boolean_representation = False
            elif db_column_type == FIELD_TYPE.BIT:
                boolean_representation = int(val) != 0
            else:
                boolean_representation = True
            row_to_persist[column_name] = boolean_representation

        else:
            row_to_persist[column_name] = val

    return singer.RecordMessage(stream=catalog_entry.stream,
                                record=row_to_persist,
                                version=version,
                                time_extracted=time_extracted)
Example #24
    def test_parse_message_record_aware_extraction_time(self):
        message = singer.parse_message(
            '{"type": "RECORD", "record": {"name": "foo"}, "stream": "users", "version": 2, "time_extracted": "1970-01-02T00:00:00.000Z"}')
        expected = singer.RecordMessage(
            record={'name': 'foo'},
            stream='users',
            version=2,
            time_extracted=dateutil.parser.parse("1970-01-02T00:00:00.000Z"))
        print(message)
        print(expected)
        self.assertEqual(message, expected)
Example #25
    def sync(self):
        """
        Perform sync action
        These steps are the same for all streams
        Differences between streams are implemented by overriding .do_sync() method
        """
        if not self.KEEP_IDS and not self.include_stream:
            LOGGER.info('Skipping stream %s - excluded in catalog',
                        self.STREAM_NAME)
            return

        new_bookmark_date = self.bookmark_date = self.starting_bookmark_date()
        # Amazon doesn't guarantee that all orders created after the CreatedAfter date that you specify will be returned

        # Will be set to false if we stop early due to reaching the end of a batch
        # to tell the runner to continue with the next batch
        all_done = True

        singer.write_schema(self.STREAM_NAME, self.schema, self.key_properties)
        rows = self.request_list()
        self.ids = []
        with singer.metrics.Counter('record_count',
                                    {'endpoint': self.STREAM_NAME}) as counter:
            for row in rows:
                row_as_dict = self.row_to_dict(row)
                if self.KEEP_IDS:
                    self.ids.append(row_as_dict[self.ID_FIELD])
                self.remove_excluded_fields(row_as_dict)
                message = singer.RecordMessage(
                    stream=self.STREAM_NAME,
                    record=row_as_dict,
                    time_extracted=singer.utils.now())
                if self.include_stream:
                    singer.write_message(message)
                if self.BOOKMARK_FIELD:
                    new_bookmark_date = max(new_bookmark_date,
                                            row_as_dict[self.BOOKMARK_FIELD])
                counter.increment()

                # Stop if we've done enough for one batch
                if self.BATCH_SIZE and counter.value >= self.BATCH_SIZE:
                    # Sync action stopped due to end of batch - so probably more rows
                    # Note that there is a 1/BATCH_SIZE chance that the end of a
                    # batch is exactly the end of the whole process. In that case
                    # the runner will make one more .sync request, for one more (empty) batch
                    all_done = False
                    break

        if self.BOOKMARK_FIELD:
            singer.write_bookmark(self.state, self.STREAM_NAME,
                                  self.BOOKMARK_FIELD, new_bookmark_date)

        return all_done
Example #26
    def for_each_role(role, time_extracted, stream_version=None):
        # Extract user_roles
        load_and_write_schema("user_roles",
                              key_properties=["user_id", "role_id"])
        for user_id in role['user_ids']:
            pivot_row = {'role_id': role['id'], 'user_id': user_id}

            new_record = singer.RecordMessage(stream="user_roles",
                                              record=pivot_row,
                                              version=stream_version,
                                              time_extracted=time_extracted)
            singer.write_message(new_record)
Example #27
def row_to_singer_record(stream, row, version, time_extracted):
    # pylint: disable=unidiomatic-typecheck
    try:
        row_to_persist = {k:transform_value(v, [k]) for k, v in row.items()
                          if type(v) not in [bson.min_key.MinKey, bson.max_key.MaxKey]}
    except MongoInvalidDateTimeException as ex:
        raise Exception("Error syncing collection {}, object ID {} - {}".format(stream["tap_stream_id"], row['_id'], ex))

    return singer.RecordMessage(
        stream=calculate_destination_stream_name(stream),
        record=row_to_persist,
        version=version,
        time_extracted=time_extracted)
Example #28
def row_to_singer_record(catalog_entry, version, row, columns, time_extracted):
    """Transform SQL row to singer compatible record message"""
    row_to_persist = ()
    for idx, elem in enumerate(row):
        property_type = catalog_entry.schema.properties[columns[idx]].type
        if isinstance(elem, datetime.datetime):
            row_to_persist += (elem.isoformat() + '+00:00', )

        elif isinstance(elem, datetime.date):
            row_to_persist += (elem.isoformat() + 'T00:00:00+00:00', )

        elif isinstance(elem, datetime.timedelta):
            epoch = datetime.datetime.utcfromtimestamp(0)
            timedelta_from_epoch = epoch + elem
            row_to_persist += (timedelta_from_epoch.isoformat() + '+00:00', )

        elif isinstance(elem, datetime.time):
            row_to_persist += (str(elem), )

        elif isinstance(elem, bytes):
            # for BIT value, treat 0 as False and anything else as True
            if 'boolean' in property_type:
                boolean_representation = elem != b'\x00'
                row_to_persist += (boolean_representation, )
            else:
                row_to_persist += (elem.hex(), )

        elif 'boolean' in property_type or property_type == 'boolean':
            if elem is None:
                boolean_representation = None
            elif elem == 0:
                boolean_representation = False
            else:
                boolean_representation = True
            row_to_persist += (boolean_representation, )

        elif 'object' in property_type or property_type == 'object':
            obj_rep = None
            if elem:
                obj_rep = json.loads(elem)
            row_to_persist += (obj_rep, )

        else:
            row_to_persist += (elem, )

    rec = dict(zip(columns, row_to_persist))

    return singer.RecordMessage(stream=catalog_entry.stream,
                                record=rec,
                                version=version,
                                time_extracted=time_extracted)
Example #29
def sync_shard(shard, seq_number_bookmarks, streams_client, stream_arn,
               projection, deserializer, table_name, stream_version, state):
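    # Resume from the last bookmarked sequence number for this shard, if one exists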
    seq_number = seq_number_bookmarks.get(shard['ShardId'])

    rows_synced = 0

    for record in get_shard_records(streams_client, stream_arn, shard,
                                    seq_number):
        if record['eventName'] == 'REMOVE':
            record_message = deserializer.deserialize_item(
                record['dynamodb']['Keys'])
            record_message[SDC_DELETED_AT] = singer.utils.strftime(
                record['dynamodb']['ApproximateCreationDateTime'])
        else:
            record_message = deserializer.deserialize_item(
                record['dynamodb'].get('NewImage'))
            if record_message is None:
                LOGGER.fatal(
                    'Dynamo stream view type must be either "NEW_IMAGE" or "NEW_AND_OLD_IMAGES"'
                )
                raise RuntimeError(
                    'Dynamo stream view type must be either "NEW_IMAGE" or "NEW_AND_OLD_IMAGES"'
                )
            if projection is not None and projection != '':
                try:
                    record_message = deserializer.apply_projection(
                        record_message, projection)
                except:
                    LOGGER.fatal("Projection failed to apply: %s", projection)
                    raise RuntimeError(
                        'Projection failed to apply: {}'.format(projection))

        record_message = singer.RecordMessage(stream=table_name,
                                              record=record_message,
                                              version=stream_version)
        singer.write_message(record_message)

        rows_synced += 1

        seq_number_bookmarks[
            shard['ShardId']] = record['dynamodb']['SequenceNumber']
        state = singer.write_bookmark(state, table_name, 'shard_seq_numbers',
                                      seq_number_bookmarks)

        # Every 100 rows write the state
        if rows_synced % 100 == 0:
            singer.write_state(state)

    singer.write_state(state)
    return rows_synced
Example #30
def selected_row_to_singer_message(stream, row, version, columns, time_extracted, md_map):
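    # Clean each column value according to the sql-datatype recorded in the stream metadata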
    row_to_persist = ()
    for idx, elem in enumerate(row):
        sql_datatype = md_map.get(('properties', columns[idx]))['sql-datatype']
        cleaned_elem = selected_value_to_singer_value(elem, sql_datatype)
        row_to_persist += (cleaned_elem,)

    rec = dict(zip(columns, row_to_persist))

    return singer.RecordMessage(
        stream=calculate_destination_stream_name(stream, md_map),
        record=rec,
        version=version,
        time_extracted=time_extracted)