Example #1
0
    def test_datetime_transform(self):
        """Check date-time output for string and integer inputs.

        Calls ``transform`` with a ``date-time`` string schema under each
        integer datetime parsing mode, then asserts that
        ``Transformer._transform_datetime`` returns ``None`` for ``'cat'``
        and, with integer parsing disabled, for ``0``.
        """
        schema = {"type": "string", "format": "date-time"}
        iso_input = "2017-01-01T00:00:00Z"
        iso_output = "2017-01-01T00:00:00.000000Z"

        # (expected result, raw value, integer datetime parsing mode)
        cases = [
            (iso_output, iso_input, NO_INTEGER_DATETIME_PARSING),
            ('1970-01-02T00:00:00.000000Z', 86400,
             UNIX_SECONDS_INTEGER_DATETIME_PARSING),
            (iso_output, iso_input, UNIX_SECONDS_INTEGER_DATETIME_PARSING),
            ('1970-01-01T00:01:26.400000Z', 86400,
             UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING),
            (iso_output, iso_input,
             UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING),
        ]
        for expected, raw_value, parsing_mode in cases:
            self.assertEqual(expected,
                             transform(raw_value, schema, parsing_mode))

        transformer = Transformer(NO_INTEGER_DATETIME_PARSING)
        for unparseable in ('cat', 0):
            self.assertIsNone(transformer._transform_datetime(unparseable))

        transformer.integer_datetime_fmt = UNIX_SECONDS_INTEGER_DATETIME_PARSING
        self.assertIsNone(transformer._transform_datetime('cat'))
Example #2
0
 def test_null_object_transform(self):
     """A nullable object property comes back unchanged for ``None`` and ``{}``."""
     addrs_schema = {"type": ["null", "object"],
                     "properties": {"city": {"type": "string"}}}
     schema = {"type": "object", "properties": {"addrs": addrs_schema}}
     for record in ({'addrs': None}, {'addrs': {}}):
         self.assertDictEqual(record, transform(record, schema))
Example #3
0
 def test_multi_type_array_transform(self):
     """A null/array/integer schema formats list items and returns ints as-is."""
     item_schema = {"type": "date-time", "format": "date-time"}
     schema = {"type": ["null", "array", "integer"], "items": item_schema}
     self.assertEqual(["2017-01-01T00:00:00.000000Z"],
                      transform(["2017-01-01"], schema))
     self.assertEqual(23, transform(23, schema))
Example #4
0
 def test_multi_type_object_transform(self):
     """A null/object/string schema formats dict properties and returns strings as-is."""
     whatever_schema = {"type": "date-time", "format": "date-time"}
     schema = {"type": ["null", "object", "string"],
               "properties": {"whatever": whatever_schema}}
     self.assertDictEqual({"whatever": "2017-01-01T00:00:00.000000Z"},
                          transform({"whatever": "2017-01-01"}, schema))
     self.assertEqual("justastring", transform("justastring", schema))
Example #5
0
 def test_anyof_datetime(self):
     """An ``anyOf`` of null and date-time string formats strings and keeps ``None``."""
     null_branch = {'type': 'null'}
     datetime_branch = {'format': 'date-time', 'type': 'string'}
     schema = {'anyOf': [null_branch, datetime_branch]}
     self.assertEqual('2016-03-10T18:47:20.000000Z',
                      transform('2016-03-10T18:47:20Z', schema))
     self.assertIsNone(transform(None, schema))
 def test_multi_type_array_transform(self):
     """Check list and integer inputs against a null/array/integer schema."""
     schema = {'type': ['null', 'array', 'integer'],
               'items': {'type': 'date-time', 'format': 'date-time'}}
     # (input value, expected transform result)
     cases = [
         (['2017-01-01'], ['2017-01-01T00:00:00.000000Z']),
         (23, 23),
     ]
     for value, expected in cases:
         self.assertEqual(expected, transform(value, schema))
 def test_nested_transform(self):
     """String amounts nested in an array of objects come back as integers."""
     item_properties = {name: {'type': 'string'}
                        for name in ('addr1', 'city', 'state')}
     item_properties['amount'] = {'type': 'integer'}
     addrs_schema = {
         'type': 'array',
         'items': {'type': 'object', 'properties': item_properties},
     }
     schema = {'type': 'object', 'properties': {'addrs': addrs_schema}}

     raw_amounts = ('123', '456')
     data = {'addrs': [{'amount': amount} for amount in raw_amounts]}
     expected = {'addrs': [{'amount': int(amount)} for amount in raw_amounts]}
     self.assertDictEqual(expected, transform(data, schema))
Example #8
0
 def test_pattern_properties_match_multiple(self):
     """A record is returned unchanged when its keys match different patterns."""
     pattern_schemas = {".+?cost": {"type": "number"},
                        ".+(?<!cost)$": {"type": "string"}}
     schema = {"type": "object", "patternProperties": pattern_schemas}
     record = {"name": "chicken", "unit_cost": 1.45, "SKU": '123456'}
     # Snapshot the input before the call so the comparison uses a separate dict.
     snapshot = record.copy()
     self.assertEqual(snapshot, transform(record, schema))
Example #9
0
 def test_nested_transform(self):
     """Check that string ``amount`` values inside ``addrs`` items come back as ints."""
     address_schema = {
         "type": "object",
         "properties": {
             "addr1": {"type": "string"},
             "city": {"type": "string"},
             "state": {"type": "string"},
             'amount': {'type': 'integer'},
         },
     }
     schema = {
         "type": "object",
         "properties": {"addrs": {"type": "array", "items": address_schema}},
     }
     actual = transform({'addrs': [{'amount': '123'}, {'amount': '456'}]},
                        schema)
     self.assertDictEqual({'addrs': [{'amount': 123}, {'amount': 456}]},
                          actual)
Example #10
0
def get_all_pull_requests(stream, config, state):
    '''
    Write every pull request updated after the bookmark, then move the
    bookmark to the last pull request returned by the API.

    https://developer.github.com/v3/pulls/#list-pull-requests

    stream is passed to singer.transform as the schema for each record.
    config[REPOSITORY] supplies the repository path used in the URL.
    state is read for the PULL_REQUESTS bookmark, updated in place and
    returned.
    '''
    query = urllib.parse.urlencode({
        'state': 'all',
        'sort': 'updated',
        'direction': 'asc'
    })
    repo = config[REPOSITORY]
    pr_state = state.get(PULL_REQUESTS)
    pr_state = dateutil.parser.parse(pr_state) if pr_state else _MIN_TS
    # Last pull request seen on any page; stays None when the API returns
    # nothing. Tracked explicitly rather than relying on the loop variable
    # leaking out of the loop and catching NameError afterwards.
    last_pr = None
    with metrics.record_counter(PULL_REQUESTS) as counter:
        url = 'https://api.github.com/repos/{}/pulls?{}'.format(repo, query)
        for response in authed_get_all_pages(PULL_REQUESTS, url):
            pull_requests = response.json()
            extraction_time = singer.utils.now()
            for pr in pull_requests:
                last_pr = pr
                if dateutil.parser.parse(pr['updated_at']) > pr_state:
                    rec = singer.transform(pr, stream)
                    singer.write_record(PULL_REQUESTS,
                                        rec,
                                        time_extracted=extraction_time)
                    counter.increment()
    if last_pr is not None:
        # Results are requested in ascending 'updated' order, so the last
        # pull request carries the most recent timestamp.
        state[PULL_REQUESTS] = last_pr['updated_at']
    else:
        # No pull requests at all: keep whatever bookmark was already there.
        state[PULL_REQUESTS] = state.get(PULL_REQUESTS)
    return state
Example #11
0
    def _sync(self, ctx, path=None, product_id=None):
        """Fetch pages of records, transform them, and hand each batch on.

        ``path`` defaults to ``self.path``. When ``product_id`` is truthy the
        bookmark name is scoped to that product. Paging stops when
        ``on_batch_complete`` returns a falsy value or a page has no records.
        """
        request_path = self.path if path is None else path

        bookmark_name = ('product_{}.since_date'.format(product_id)
                         if product_id else 'since_date')
        ctx.update_start_date_bookmark([self.tap_stream_id, bookmark_name])

        schema = ctx.catalog.get_stream(self.tap_stream_id).schema.to_dict()

        page = 1
        while True:
            opts = {"path": request_path,
                    "params": self.get_params(ctx, page)}
            resp = ctx.client.GET(self.version, opts, self.tap_stream_id)
            records = [transform(raw, schema)
                       for raw in self.format_response(resp)]

            # The batch callback runs even for an empty page; either a falsy
            # callback result or an empty page ends the loop.
            if not self.on_batch_complete(ctx, records, product_id) or not records:
                break
            page += 1
Example #12
0
def get_all_issues(stream, config, state):
    '''
    Write every issue returned for the repository and bookmark the most
    recent 'updated_at' timestamp seen.

    https://developer.github.com/v3/issues/#list-issues-for-a-repository

    stream is passed to singer.transform as the schema for each record.
    config[REPOSITORY] supplies the repository path used in the URL.
    state is read for the ISSUES bookmark (sent as the 'since' parameter
    when present), updated in place and returned. When no issues are
    returned the existing bookmark is kept.
    '''
    repo_path = config[REPOSITORY]
    params = {'sort': 'updated', 'direction': 'asc'}
    if ISSUES in state and state[ISSUES] is not None:
        # format() with no format spec gives the same text as str().
        params['since'] = format(state[ISSUES])
    query = urllib.parse.urlencode(params)
    url = 'https://api.github.com/repos/{}/issues?{}'.format(repo_path, query)
    # Running maximum of 'updated_at', never lower than _MIN_TS. The flag
    # records whether any issue was seen, so an empty result does not
    # overwrite the stored bookmark.
    ts_state = _MIN_TS
    seen_any = False
    with metrics.record_counter(ISSUES) as counter:
        for response in authed_get_all_pages(ISSUES, url):
            issues = response.json()
            extraction_time = singer.utils.now()
            for issue in issues:
                updated_at = dateutil.parser.parse(issue['updated_at'])
                # Compare against the running maximum, not only _MIN_TS, so
                # the bookmark does not depend on iteration order.
                ts_state = max(updated_at, ts_state)
                seen_any = True
                rec = singer.transform(issue, stream)
                singer.write_record(ISSUES,
                                    rec,
                                    time_extracted=extraction_time)
                counter.increment()
    if seen_any:
        state[ISSUES] = ts_state.isoformat()
    else:
        state[ISSUES] = state.get(ISSUES)
    return state
Example #13
0
def get_all_commits(stream, config, state):
    '''
    Write every commit returned for the repository and bookmark the most
    recent author date seen.

    https://developer.github.com/v3/repos/commits/#list-commits-on-a-repository

    stream is passed to singer.transform as the schema for each record.
    config[REPOSITORY] supplies the repository path used in the URL.
    state is read for the COMMITS bookmark (sent as the 'since' parameter
    when present), updated in place and returned. When no commits are
    returned the existing bookmark is kept instead of being reset.
    '''
    repo_path = config[REPOSITORY]
    query_string = ''
    if COMMITS in state and state[COMMITS] is not None:
        query_string = '?since={}'.format(state[COMMITS])

    # Running maximum of the author date, never lower than _MIN_TS. The flag
    # records whether any commit was seen: ts_state is always bound, so an
    # UnboundLocalError check can never detect the "no commits" case.
    ts_state = _MIN_TS
    seen_any = False
    url = 'https://api.github.com/repos/{}/commits{}'.format(
        repo_path, query_string)
    with metrics.record_counter(COMMITS) as counter:
        for response in authed_get_all_pages(COMMITS, url):
            commits = response.json()
            extraction_time = singer.utils.now()
            for commit in commits:
                commit_date = dateutil.parser.parse(
                    commit['commit']['author']['date'])
                ts_state = max(commit_date, ts_state)
                seen_any = True
                rec = singer.transform(commit, stream)
                singer.write_record(COMMITS,
                                    rec,
                                    time_extracted=extraction_time)
                counter.increment()
    if seen_any:
        state[COMMITS] = clean_tz(ts_state)
    else:
        # Nothing new: leave the stored bookmark untouched.
        state[COMMITS] = state.get(COMMITS)
    return state
Example #14
0
def get_all_stargazers(stream, config, state):
    '''
    Write every stargazer returned for the repository and bookmark the most
    recent 'starred_at' timestamp seen.

    https://developer.github.com/v3/activity/starring/#list-stargazers

    stream is passed to singer.transform as the schema for each record.
    config[REPOSITORY] supplies the repository path used in the URL.
    state is read for the STARGAZERS bookmark (sent as the 'since'
    parameter when truthy), updated in place and returned. When no
    stargazers are returned the existing bookmark is kept instead of being
    reset.
    '''
    repo_path = config[REPOSITORY]
    params = {'sort': 'updated', 'direction': 'asc'}
    if state.get(STARGAZERS):
        params['since'] = state[STARGAZERS]
    query = urllib.parse.urlencode(params)
    stargazers_headers = {'Accept': 'application/vnd.github.v3.star+json'}
    url = 'https://api.github.com/repos/{}/stargazers?{}'.format(
        repo_path, query)
    # Running maximum of 'starred_at', never lower than _MIN_TS. The flag
    # records whether any stargazer was seen: ts_state is always bound, so
    # an UnboundLocalError check can never detect the empty case.
    ts_state = _MIN_TS
    seen_any = False
    with metrics.record_counter(STARGAZERS) as counter:
        for response in authed_get_all_pages(STARGAZERS, url,
                                             stargazers_headers):
            stargazers = response.json()
            extraction_time = singer.utils.now()
            for stargazer in stargazers:
                starred_at = dateutil.parser.parse(stargazer['starred_at'])
                ts_state = max(starred_at, ts_state)
                seen_any = True
                rec = singer.transform(stargazer, stream)
                # Copy the nested user id to a top-level field on the record.
                rec['user_id'] = rec['user']['id']
                singer.write_record(STARGAZERS,
                                    rec,
                                    time_extracted=extraction_time)
                counter.increment()
    if seen_any:
        state[STARGAZERS] = ts_state.isoformat()
    else:
        # Nothing new: leave the stored bookmark untouched.
        state[STARGAZERS] = state.get(STARGAZERS)
    return state
 def test_multi_type_object_transform(self):
     """Check dict and string inputs against a null/object/string schema."""
     schema = {
         'type': ['null', 'object', 'string'],
         'properties': {
             'whatever': {'type': 'date-time', 'format': 'date-time'},
         },
     }
     transformed_dict = transform({'whatever': '2017-01-01'}, schema)
     self.assertDictEqual({'whatever': '2017-01-01T00:00:00.000000Z'},
                          transformed_dict)
     plain_text = 'justastring'
     self.assertEqual(plain_text, transform(plain_text, schema))
 def test_null_object_transform(self):
     """Check ``None`` and empty-dict values for a null-or-object property."""
     city_properties = {'city': {'type': 'string'}}
     schema = {
         'type': 'object',
         'properties': {
             'addrs': {'type': ['null', 'object'],
                       'properties': city_properties},
         },
     }
     with_none = {'addrs': None}
     with_empty = {'addrs': {}}
     self.assertDictEqual(with_none, transform(with_none, schema))
     self.assertDictEqual(with_empty, transform(with_empty, schema))
Example #17
0
 def test_drops_fields_which_are_unsupported(self):
     """A property whose metadata inclusion is ``unsupported`` is dropped."""
     schema = {"type": "object", "properties": {"name": {"type": "string"}}}
     name_path = ('properties', 'name')
     result = transform({"name": "chicken"},
                        schema,
                        NO_INTEGER_DATETIME_PARSING,
                        metadata={name_path: {"inclusion": "unsupported"}})
     self.assertEqual({}, result)
Example #18
0
 def test_keeps_fields_without_metadata(self):
     """A property with no metadata entry of its own is kept in the output."""
     schema = {"type": "object", "properties": {"name": {"type": "string"}}}
     # The only metadata entry refers to 'age', not to the 'name' property.
     unrelated_metadata = {('properties', 'age'): {"inclusion": "automatic"}}
     result = transform({"name": "chicken"},
                        schema,
                        NO_INTEGER_DATETIME_PARSING,
                        metadata=unrelated_metadata)
     self.assertEqual({"name": "chicken"}, result)
Example #19
0
 def test_keeps_selected_data_from_dicts(self):
     """A property marked ``selected`` in metadata is kept in the output."""
     schema = {"type": "object", "properties": {"name": {"type": "string"}}}
     selection = {('properties', 'name'): {"selected": True}}
     result = transform({"name": "chicken"},
                        schema,
                        NO_INTEGER_DATETIME_PARSING,
                        metadata=selection)
     self.assertEqual({"name": "chicken"}, result)
 def test_keeps_selected_data_from_dicts(self):
     """Check that a ``selected: True`` property survives ``transform``."""
     name_schema = {'type': 'string'}
     schema = {'type': 'object', 'properties': {'name': name_schema}}
     record = {'name': 'chicken'}
     actual = transform(record,
                        schema,
                        NO_INTEGER_DATETIME_PARSING,
                        metadata={('properties', 'name'): {'selected': True}})
     self.assertEqual({'name': 'chicken'}, actual)
 def test_drops_fields_which_are_unsupported(self):
     """Check that an ``unsupported`` property is removed by ``transform``."""
     name_schema = {'type': 'string'}
     schema = {'type': 'object', 'properties': {'name': name_schema}}
     record = {'name': 'chicken'}
     unsupported = {('properties', 'name'): {'inclusion': 'unsupported'}}
     actual = transform(record,
                        schema,
                        NO_INTEGER_DATETIME_PARSING,
                        metadata=unsupported)
     self.assertEqual({}, actual)
 def test_keeps_fields_without_metadata(self):
     """Check that a property absent from the metadata map is left in place."""
     name_schema = {'type': 'string'}
     schema = {'type': 'object', 'properties': {'name': name_schema}}
     record = {'name': 'chicken'}
     # Metadata only mentions 'age', which is not in the schema or the record.
     other_field_metadata = {('properties', 'age'): {'inclusion': 'automatic'}}
     actual = transform(record,
                        schema,
                        NO_INTEGER_DATETIME_PARSING,
                        metadata=other_field_metadata)
     self.assertEqual({'name': 'chicken'}, actual)
Example #23
0
 def test_drops_no_data_when_not_dict(self):
     """A plain string is returned unchanged when empty metadata is supplied."""
     greeting = "hello"
     result = transform(greeting,
                        {"type": "string"},
                        NO_INTEGER_DATETIME_PARSING,
                        metadata={})
     self.assertEqual(greeting, result)
 def test_drops_no_data_when_not_dict(self):
     """Check that a non-dict value passes through with an empty metadata map."""
     string_schema = {'type': 'string'}
     empty_metadata = {}
     text = 'hello'
     actual = transform(text,
                        string_schema,
                        NO_INTEGER_DATETIME_PARSING,
                        metadata=empty_metadata)
     self.assertEqual(text, actual)
Example #25
0
def write_record(record, state, stream, replication_keys):
    """Emit one transformed record, then write the updated bookmark state.

    The record is transformed with the stream's schema and written under the
    stream's ``tap_stream_id``. The bookmark value comes from
    ``get_replication_value`` and is stored under the last entry of
    ``replication_keys``. Nothing is returned.
    """
    stream_id = stream.tap_stream_id
    singer.write_record(stream_id,
                        singer.transform(record, stream.schema.to_dict()))

    bookmark_value = get_replication_value(record, replication_keys)
    updated_state = singer.write_bookmark(state=state,
                                          tap_stream_id=stream_id,
                                          key=replication_keys[-1],
                                          val=bookmark_value)
    singer.write_state(updated_state)
Example #26
0
def sync_file_ids(file_ids, client, state, stream, api, counter):
    """Stream each file in ``file_ids``, emit its rows, and track progress in state.

    Files are popped from the front of ``file_ids`` one at a time. Each
    non-empty line after the header is parsed, zipped with the header into a
    row, transformed with ``stream['schema']`` and written as a record. When
    the stream has a replication key, rows without a bookmark value or with
    one below the starting bookmark are skipped, and state is written after
    every emitted record. After each file the remaining ids are stored under
    ``file_ids`` in the stream's bookmark; once all files are done that entry
    is set to ``None``.

    Returns ``counter``. Raises ``Exception`` (chained from the
    ``ApiException``) when a file returns 404; other ``ApiException`` errors
    are re-raised unchanged.
    """
    start_date = (
        state["bookmarks"][stream["tap_stream_id"]][stream["replication_key"]]
        if stream.get("replication_key") else None)

    while file_ids:
        current_id = file_ids.pop(0)
        try:
            line_iter = api.stream_file(client, current_id)
        except ApiException as ex:
            if ex.resp.status_code != 404:
                raise
            # If the file has been deleted, write state with "file_ids" removed and re-raise.
            # Don't advance the bookmark until all files in the window have been synced.
            state["bookmarks"][stream["tap_stream_id"]].pop("file_ids", None)
            singer.write_state(state)
            message = (
                "File ID {} has been deleted, making the sync window invalid. "
                "Removing partially exported files from state and will resume from bookmark on the next extraction."
            ).format(current_id)
            raise Exception(message) from ex

        header = parse_header_line(next(line_iter), stream["tap_stream_id"])
        for raw_line in line_iter:
            if not raw_line:
                continue

            row = dict(zip(header, parse_csv_line(raw_line)))
            record = transform(row, stream['schema'])

            if not stream.get("replication_key"):
                # No replication key: emit every row without touching the bookmark.
                singer.write_record(stream["tap_stream_id"], record)
                counter.increment()
                continue

            bookmark = record.get(stream["replication_key"])
            # There's a chance we get back a bad record here, and we don't want to
            # null the bookmark; rows older than the starting bookmark are skipped too.
            if not bookmark or bookmark < start_date:
                continue

            singer.write_record(stream["tap_stream_id"], record)
            state["bookmarks"][stream["tap_stream_id"]][
                stream["replication_key"]] = bookmark
            singer.write_state(state)
            counter.increment()

        # Record which files are still pending once this file is finished.
        state["bookmarks"][stream["tap_stream_id"]]["file_ids"] = file_ids
        singer.write_state(state)

    state["bookmarks"][stream["tap_stream_id"]]["file_ids"] = None
    singer.write_state(state)
    return counter
 def test_pattern_properties_match(self):
     """String values under a catch-all ``patternProperties`` entry are returned unchanged."""
     schema = {'type': 'object',
               'patternProperties': {'.+': {'type': 'string'}}}
     record = {'name': 'chicken', 'unit_cost': '1.45', 'SKU': '123456'}
     # Snapshot the input before the call so the comparison uses a separate dict.
     snapshot = record.copy()
     self.assertEqual(snapshot, transform(record, schema))
Example #28
0
 def test_pattern_properties_match(self):
     """Check that a ``.+`` string pattern leaves an all-string record as it was."""
     any_key_is_string = {".+": {"type": "string"}}
     schema = {"type": "object", "patternProperties": any_key_is_string}
     original = {"name": "chicken", "unit_cost": '1.45', "SKU": '123456'}
     expected = {key: value for key, value in original.items()}
     self.assertEqual(expected, transform(original, schema))
Example #29
0
 def test_drops_nested_object_fields_which_are_unselected(self):
     """A nested property marked ``selected: False`` is removed from the output."""
     addr_properties = {
         "addr1": {"type": "string"},
         "city": {"type": "string"},
         "state": {"type": "string"},
         'amount': {'type': 'integer'},
     }
     schema = {
         "type": "object",
         "properties": {
             "addr": {"type": "object", "properties": addr_properties},
         },
     }
     metadata = {
         ('properties', 'addr'): {"selected": True},
         ('properties', 'addr', 'properties', 'amount'): {"selected": False},
     }

     kept_fields = {
         'addr1': 'address_1',
         'city': 'city_1',
         'state': 'state_1',
     }
     # The input carries the unselected 'amount' on top of the kept fields.
     data = {'addr': dict(kept_fields, amount='123')}
     expected = {'addr': dict(kept_fields)}

     actual = transform(data,
                        schema,
                        NO_INTEGER_DATETIME_PARSING,
                        metadata=metadata)
     self.assertDictEqual(expected, actual)
 def test_pattern_properties_match_multiple(self):
     """Check a record whose keys fall under two different ``patternProperties``."""
     cost_pattern = '.+?cost'
     not_cost_pattern = '.+(?<!cost)$'
     schema = {
         'type': 'object',
         'patternProperties': {
             cost_pattern: {'type': 'number'},
             not_cost_pattern: {'type': 'string'},
         },
     }
     original = {'name': 'chicken', 'unit_cost': 1.45, 'SKU': '123456'}
     expected = {key: value for key, value in original.items()}
     self.assertEqual(expected, transform(original, schema))