Esempio n. 1
0
def sync(  # noqa: WPS210, WPS213
        wp: WordpressReviews,
        catalog: Catalog,
) -> None:
    """Write the schema and the review rows for every selected stream.

    Arguments:
        wp {WordpressReviews} -- client whose ``reviews()`` supplies the rows
        catalog {Catalog} -- catalog from which the selected streams are read
    """
    LOGGER.info('Sync')

    # The selection is requested with an empty state dictionary.
    selected_streams = catalog.get_selected_streams({})

    for stream in selected_streams:
        LOGGER.info(f'Syncing stream: {stream.tap_stream_id}')
        stream_id = stream.tap_stream_id

        # The schema message is emitted before any record of the stream.
        schema_dict = stream.schema.to_dict()
        singer.write_schema(
            stream_name=stream_id,
            schema=schema_dict,
            key_properties=stream.key_properties,
        )

        # Each selected stream is fed from the same ``reviews()`` call.
        for review in wp.reviews():
            extracted_at = datetime.now(timezone.utc)
            singer.write_record(
                stream_id,
                review,
                time_extracted=extracted_at,
            )
Esempio n. 2
0
def sync(config: Dict[str, Any], state: Dict[str, Any],
         catalog: Catalog) -> None:
    """Sync every selected non-substream stream and activate its versions.

    Substream catalog entries are skipped in the main loop. Each remaining
    entry is prepared with ``prepare_stream``; the pairs produced by its
    definition's ``sync`` are passed to ``handle_record``, whose return
    value replaces the running state. The state is written after each
    stream and once more at the end, after ``set_currently_syncing`` has
    been called with ``None``.
    """
    # Singer keeps catalog entries in a list that .get_stream() walks, so
    # definitions and versions are kept in dictionaries keyed by stream id.
    stream_defs: Dict[str, Union["Stream", "Substream"]] = {}
    stream_versions: Dict[str, Optional[int]] = {}

    check_dependency_conflicts(catalog)

    for entry in catalog.get_selected_streams(state):
        stream_id = entry.tap_stream_id

        if is_substream(AVAILABLE_STREAMS[stream_id]):
            LOGGER.info(
                'Skipping substream "%s" until parent stream is reached',
                stream_id,
            )
            continue

        LOGGER.info("Syncing stream: %s", stream_id)

        since = prepare_stream(
            stream_id,
            stream_defs,
            stream_versions,
            catalog,
            config,
            state,
        )
        parent_def = stream_defs[stream_id]

        LOGGER.info("Querying since: %s", since)

        # Every yielded pair names the stream the record belongs to; that
        # id selects the definition and version handed to handle_record.
        for record_stream_id, record in parent_def.sync(since):  # type: ignore
            state = handle_record(
                record_stream_id,
                record,
                stream_defs[record_stream_id],
                stream_versions[record_stream_id],
                state,
            )

        write_state(state)

        for child_def in parent_def.substreams:  # type: ignore
            if child_def.is_selected:
                # All substreams are necessarily FULL_TABLE and thus have a
                # version, so ACTIVATE_VERSION is written without a check.
                write_activate_version(
                    child_def.tap_stream_id,
                    stream_versions[child_def.tap_stream_id],
                )

        parent_version = stream_versions[parent_def.tap_stream_id]
        if parent_version is not None:
            write_activate_version(parent_def.tap_stream_id, parent_version)

    state = set_currently_syncing(state, None)
    write_state(state)
Esempio n. 3
0
def sync(
    basecone: Basecone,
    state: dict,
    catalog: Catalog,
    start_date: str,
) -> None:
    """Sync data from tap source.

    For every selected stream: mark it as currently syncing, look up its
    state, write its schema, then pass each row produced by the matching
    client method to ``sync_record``.

    Arguments:
        basecone {Basecone} -- Basecone client
        state {dict} -- Tap state
        catalog {Catalog} -- Stream catalog
        start_date {str} -- Start date (not read inside this function)
    """
    # For every stream in the catalog
    LOGGER.info('Sync')
    # The f prefix is required; without it the literal text "{state}" is
    # logged instead of the state itself.
    LOGGER.debug(f'Current state:\n{state}')

    # Only selected streams are synced, whether a stream is selected is
    # determined by whether the key-value: "selected": true is in the schema
    # file.
    for stream in catalog.get_selected_streams(state):
        LOGGER.info(f'Syncing stream: {stream.tap_stream_id}')

        # Update the current stream as active syncing in the state.
        # The return value is not used here.
        singer.set_currently_syncing(state, stream.tap_stream_id)

        # Retrieve the state of the stream
        stream_state: dict = tools.get_stream_state(
            state,
            stream.tap_stream_id,
        )

        LOGGER.info(f'Stream state: {stream_state}')

        # Write the schema
        singer.write_schema(
            stream_name=stream.tap_stream_id,
            schema=stream.schema.to_dict(),
            key_properties=stream.key_properties,
        )

        # Every stream has a method of the same name on the Basecone client:
        # the stream id is used as the attribute name to look it up.
        tap_data: Callable = getattr(basecone, stream.tap_stream_id)

        # The tap_data method yields rows of data from the API
        # The state of the stream is used as kwargs for the method
        # E.g. if the state of the stream has a key 'start_date', it will be
        # used in the method as start_date='2021-01-01T00:00:00+0000'
        for row in tap_data(**stream_state):
            sync_record(stream, row, state)
Esempio n. 4
0
 def test_one_selected_stream(self):
     """Only the entry whose metadata marks it as selected is returned."""
     selected_entry = CatalogEntry(
         tap_stream_id='a',
         schema=Schema(),
         metadata=[{'metadata': {'selected': True}, 'breadcrumb': []}],
     )
     catalog = Catalog([
         selected_entry,
         CatalogEntry(tap_stream_id='b', schema=Schema(), metadata=[]),
         CatalogEntry(tap_stream_id='c', schema=Schema(), metadata=[]),
     ])
     state = {}
     selected_streams = catalog.get_selected_streams(state)
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed in Python 3.12.
     self.assertEqual(list(selected_streams), [selected_entry])
Esempio n. 5
0
def sync(config: Dict[str, Any], state: Dict[str, Any],
         catalog: Catalog) -> None:
    """Write schema, records and bookmark state for each selected stream.

    A client is built from ``config["access_token"]`` and
    ``config["page_size"]``. For every selected stream the schema is
    written, then each non-empty batch returned by the stream's
    ``get_records`` is written record by record. When the stream has a
    replication key, the bookmark is written after every batch if the
    stream class reports ``replication_key_is_sorted``, otherwise the
    largest value seen is written once after the last batch.
    """
    client = Client(config["access_token"], config["page_size"])

    for entry in catalog.get_selected_streams(state):
        stream_id = entry.tap_stream_id
        LOGGER.info("Syncing stream: %s", stream_id)

        replication_key = entry.replication_key
        if entry.replication_method:
            method = ReplicationMethod[entry.replication_method]
        else:
            method = None
        previous_bookmark = state.get(stream_id)

        if replication_key:
            bookmark_properties = [replication_key]
        else:
            bookmark_properties = None
        singer.write_schema(
            stream_name=stream_id,
            schema=entry.schema.to_dict(),
            key_properties=entry.key_properties,
            bookmark_properties=bookmark_properties,
        )

        stream_cls = streams.get(stream_id)

        # Only incremental replication starts from the stored bookmark.
        if method == ReplicationMethod.INCREMENTAL:
            highest_bookmark = previous_bookmark
        else:
            highest_bookmark = None

        batches = stream_cls().get_records(
            client,
            config,
            replication_key,
            previous_bookmark,
            method,
        )
        for batch in batches:
            if len(batch) == 0:
                continue

            # Write one or more rows to the stream.
            for raw_record in batch:
                converted = Stream.convert_dates_to_rfc3339(
                    raw_record, entry.schema)
                singer.write_record(
                    stream_id,
                    converted,
                    time_extracted=datetime.now(timezone.utc),
                )

            if not replication_key:
                continue

            if stream_cls.replication_key_is_sorted:
                # Sorted data: the last row of the batch is the bookmark.
                singer.write_state({stream_id: batch[-1][replication_key]})
                continue

            # Unsorted data: keep the maximum until all batches are written.
            batch_max = max(row[replication_key] for row in batch)
            if highest_bookmark:
                highest_bookmark = max(highest_bookmark, batch_max)
            else:
                highest_bookmark = batch_max

        if replication_key and not stream_cls.replication_key_is_sorted:
            singer.write_state({stream_id: highest_bookmark})
Esempio n. 6
0
 def test_resumes_currently_syncing_stream(self):
     """The stream named by ``currently_syncing`` is returned first."""
     selected_entry_a = CatalogEntry(
         tap_stream_id='a',
         schema=Schema(),
         metadata=[{'metadata': {'selected': True}, 'breadcrumb': []}],
     )
     selected_entry_c = CatalogEntry(
         tap_stream_id='c',
         schema=Schema(),
         metadata=[{'metadata': {'selected': True}, 'breadcrumb': []}],
     )
     catalog = Catalog([
         selected_entry_a,
         CatalogEntry(tap_stream_id='b', schema=Schema(), metadata=[]),
         selected_entry_c,
     ])
     state = {'currently_syncing': 'c'}
     selected_streams = catalog.get_selected_streams(state)
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed in Python 3.12.
     self.assertEqual(list(selected_streams)[0], selected_entry_c)
Esempio n. 7
0
def sync(config: Dict[str, Any], state: Dict[str, Any],
         catalog: Catalog) -> None:
    """Sync every selected non-substream stream and persist the state.

    Substream catalog entries are skipped in the main loop. Each remaining
    entry is prepared with ``prepare_stream``; the pairs produced by its
    definition's ``sync`` are passed to ``handle_record``, whose return
    value replaces the running state. The state is written after each
    stream and once more at the end, after ``set_currently_syncing`` has
    been called with ``None``.
    """
    # Singer keeps catalog entries in a list that .get_stream() walks, so
    # definitions and versions are kept in dictionaries keyed by stream id.
    stream_defs: Dict[str, Union["Stream", "Substream"]] = {}
    stream_versions: Dict[str, Optional[int]] = {}

    check_dependency_conflicts(catalog)

    for entry in catalog.get_selected_streams(state):
        stream_id = entry.tap_stream_id

        if is_substream(AVAILABLE_STREAMS[stream_id]):
            LOGGER.info(
                'Skipping substream "%s" until parent stream is reached',
                stream_id,
            )
            continue

        LOGGER.info("Syncing stream: %s", stream_id)

        since = prepare_stream(
            stream_id,
            stream_defs,
            stream_versions,
            catalog,
            config,
            state,
        )
        parent_def = stream_defs[stream_id]

        LOGGER.info("Querying since: %s", since)

        # Every yielded pair names the stream the record belongs to; that
        # id selects the definition and version handed to handle_record.
        for record_stream_id, record in parent_def.sync(since):  # type: ignore
            state = handle_record(
                record_stream_id,
                record,
                stream_defs[record_stream_id],
                stream_versions[record_stream_id],
                state,
            )

        write_state(state)

    state = set_currently_syncing(state, None)
    write_state(state)
def sync(  # noqa: WPS210, WPS213
    wp: WordPressSupportForums,
    catalog: Catalog,
) -> None:
    """Write the schema and the rows for every selected stream.

    Arguments:
        wp {WordPressSupportForums} -- client exposing one method per stream
        catalog {Catalog} -- catalog from which the selected streams are read
    """
    LOGGER.info('Sync')

    # The selection is requested with an empty state dictionary.
    selected_streams = catalog.get_selected_streams({})

    for stream in selected_streams:
        LOGGER.info(f'Syncing stream: {stream.tap_stream_id}')
        stream_id = stream.tap_stream_id

        # The schema message is emitted before any record of the stream.
        schema_dict = stream.schema.to_dict()
        singer.write_schema(
            stream_name=stream_id,
            schema=schema_dict,
            key_properties=stream.key_properties,
        )

        # The stream id doubles as the name of the client method that
        # yields the rows, e.g. the stream "mysql" is served by wp.mysql.
        fetch_rows: Callable = getattr(wp, stream_id)

        for record in fetch_rows():
            extracted_at = datetime.now(timezone.utc)
            singer.write_record(
                stream_id,
                record,
                time_extracted=extracted_at,
            )
Esempio n. 9
0
def sync(  # noqa: WPS210, WPS213
    twinfield: Twinfield,
    state: dict,
    catalog: Catalog,
    start_date: str,
) -> None:
    """Sync data from tap source.

    For every selected stream: mark it as currently syncing, look up its
    state, write its schema, write each row produced by the matching client
    method, then update the bookmark with the value taken from the last row
    and flush stdout.

    Arguments:
        twinfield {Twinfield} -- Twinfield client
        state {dict} -- Tap state
        catalog {Catalog} -- Stream catalog
        start_date {str} -- Start date (not read inside this function)
    """
    # For every stream in the catalog
    LOGGER.info('Sync')
    # The f prefix is required; without it the literal text "{state}" is
    # logged instead of the state itself.
    LOGGER.debug(f'Current state:\n{state}')

    # Only selected streams are synced, whether a stream is selected is
    # determined by whether the key-value: "selected": true is in the schema
    # file.
    for stream in catalog.get_selected_streams(state):
        LOGGER.info(f'Syncing stream: {stream.tap_stream_id}')

        # Update the current stream as active syncing in the state.
        # The return value is not used here.
        singer.set_currently_syncing(state, stream.tap_stream_id)

        # Retrieve the state of the stream
        stream_state: dict = tools.get_stream_state(
            state,
            stream.tap_stream_id,
        )

        LOGGER.debug(f'Stream state: {stream_state}')

        # Write the schema
        singer.write_schema(
            stream_name=stream.tap_stream_id,
            schema=stream.schema.to_dict(),
            key_properties=stream.key_properties,
        )

        # Every stream has a method of the same name on the Twinfield client:
        # the stream id is used as the attribute name to look it up.
        tap_data: Callable = getattr(twinfield, stream.tap_stream_id)

        # Reset the bookmark for each stream. Without this, a stream that
        # yields no rows would either hit an unbound local (first stream) or
        # silently reuse the bookmark of the previously synced stream.
        # NOTE(review): assumes tools.update_bookmark tolerates None, as the
        # Optional[str] annotation of the per-row value suggests -- confirm.
        bookmark: Optional[str] = None

        # The tap_data method yields rows of data from the API
        # The state of the stream is used as kwargs for the method
        # E.g. if the state of the stream has a key 'start_date', it will be
        # used in the method as start_date='2021-01-01T00:00:00+0000'
        for row in tap_data(**stream_state):

            # Write a row to the stream
            singer.write_record(
                stream.tap_stream_id,
                row,
                time_extracted=datetime.now(timezone.utc),
            )

            # Only the value of the last row survives the loop.
            bookmark = tools.get_bookmark_value(
                stream.tap_stream_id,
                row,
            )

        # Update bookmark
        tools.update_bookmark(stream, bookmark, state)
        sys.stdout.flush()