def cdms_list(client, entity_name, offset):
    '''
    Call the `cdms_api.list` method, passing through the entity_name and
    offset, and return the response body.

    On a cache hit the cached value is returned straight from redis and no
    request is made. Otherwise the request is timed, the response is checked
    by `raise_on_cdms_resp_errors`, and both the duration and the decoded
    body are written to redis.

    Args:
        client: object exposing `list(entity_name, skip=offset)`. When it is
            `None` a new `CDMSRestApi` is created.
        entity_name: name of the entity to list.
        offset: passed to the API as `skip`.

    Returns:
        The value stored in redis on a cache hit, otherwise `resp.content`.

    Raises:
        Whatever `raise_on_cdms_resp_errors` raises for a bad response.
    '''
    cached, cache_path = is_cached(entity_name, offset)
    if cached:
        # nothing to do, just load resp from cache
        return services.redis.get(cache_path)

    start_time = datetime.datetime.now()
    if client is None:
        # note: constructing the client is included in the timed span
        client = CDMSRestApi()
    resp = client.list(entity_name, skip=offset)  # the actual request
    # `timedelta.seconds` is only the seconds *component* of the delta (it
    # wraps at one day); `total_seconds()` covers the whole elapsed span.
    elapsed = datetime.datetime.now() - start_time
    time_delta = int(elapsed.total_seconds())

    # the below will raise something useful, or pass by quietly
    raise_on_cdms_resp_errors(entity_name, offset, resp)

    # record our expensive network request
    services.redis.set(duration_record(entity_name, offset), str(time_delta))
    services.redis.set(
        cache_path, resp.content.decode(resp.encoding or 'utf-8'))
    LOGGER.info("{0} ({1}) {2}s".format(entity_name, offset, time_delta))
    return resp.content
def test_setup_session_if_cookie_expired(self):
    """
    With an expired cookie, a call to an arbitrary endpoint should
    reauthenticate and retry once, transparently to the caller.
    """
    url = 'https://test/'
    body_response = 'success'
    seen = {'requests': 0}

    def callback(request):
        # 401 for the very first request, 200 for every later one
        is_first = seen['requests'] == 0
        seen['requests'] += 1
        status_code = 401 if is_first else 200
        return (status_code, [], json.dumps({'d': body_response}))

    responses.add_callback(
        responses.GET, url, match_querystring=True, callback=callback)

    self.mock_initial_login()
    for step in (1, 2, 3):
        self.mock_login_step(step)

    api = CDMSRestApi()
    resp = api.make_request('get', url)

    self.assertEqual(resp, body_response)
    self.assertTrue(api.auth.session)
def test_create(self):
    """
    Call the create endpoint and check that one request was made, that the
    expected value came back and that the request body matches the data.
    """
    client = CDMSRestApi()
    result = client.create(self.service, data=self.data)

    self.assertEqual(len(responses.calls), 1)
    self.assertEqual(result, 'something')

    sent_body = json.loads(responses.calls[0].request.body)
    self.assertDictEqual(sent_body, self.data)
def test_defaults(self):
    """
    Listing with no extra params sends the default query string.
    """
    client = CDMSRestApi()
    client.list(self.service)

    self.assertEqual(len(responses.calls), 1)
    query = urlparse(responses.calls[0].request.url).query
    self.assertEqual(query, '$top=50&$skip=0&')
def test_order_by_as_string(self):
    """
    Listing with order_by given as a plain string rather than a list.
    """
    client = CDMSRestApi()
    client.list(self.service, order_by='something')

    self.assertEqual(len(responses.calls), 1)
    query = urlparse(responses.calls[0].request.url).query
    self.assertEqual(query, '$top=50&$skip=0&$orderby=something')
def delete_odata(odata_tablename, ident):
    """
    Delete the row identified by `ident` from `odata_tablename`.

    If the response body cannot be parsed as JSON, the session is set up
    again and the delete is issued a second time; a JSON failure on that
    second response propagates to the caller.

    Returns True when the last response has status code 204.
    """
    cdms_client = CDMSRestApi()

    response = cdms_client.delete(odata_tablename, "guid'{0}'".format(ident))
    print(response)
    try:
        # TODO: handle deauth (could raise json.JSONDecodeError)
        response.json()
    except json.JSONDecodeError:
        cdms_client.auth.setup_session(True)
        response = cdms_client.delete(
            odata_tablename, "guid'{0}'".format(ident))
        print(response)
        response.json()
    return response.status_code == 204
def test_complete(self):
    """
    Listing with every param supplied builds the full query string.
    """
    client = CDMSRestApi()
    client.list(
        self.service,
        top=10,
        skip=1,
        select=['a', 'b'],
        filters='c,d',
        order_by=['e', 'f'])

    self.assertEqual(len(responses.calls), 1)
    query = urlparse(responses.calls[0].request.url).query
    self.assertEqual(
        query, '$top=10&$skip=1&$filter=c,d&$orderby=e,f&$select=a,b')
def fetch_missing(metadata, missing, attempts=0):
    '''
    Backfill missing entries table by table, recursing with whatever is
    still missing until `constants.DJANGO_INITIAL_MISSING_ATTEMPTS` is
    reached.

    Args:
        metadata: object exposing `tables`, indexed by table name; also
            passed to `django_tables_dep_order`.
        missing: mapping indexed by table name, giving the guids to fetch.
        attempts: number of attempts made so far.

    Returns:
        None (directly, or as the result of the recursive call).
    '''
    # give up once the attempt budget is used
    if attempts < constants.DJANGO_INITIAL_MISSING_ATTEMPTS:
        pass
    else:
        return
    client = CDMSRestApi()
    for _, django_name in django_tables_dep_order(metadata):
        guids = missing[django_name]
        if not guids:
            continue
        LOGGER.info('Backfilling %s entries for %s after %s attempts',
                    len(guids), django_name, attempts)
        table = metadata.tables[django_name]
        get_fn = functools.partial(utils.get_django, client, table.name)
        # each result is unpacked as a pair; only the second item is used
        django_dicts = list(map(get_fn, guids))
        # load only the truthy second items
        results, still_missing = etl.load.to_sqla_table_idempotent(
            table, [x for _, x in django_dicts if x])
        # a second item that is exactly False is counted as non-existent
        count_non_existant = len([x for _, x in django_dicts if x is False])
        if count_non_existant:
            LOGGER.info('%s has %s non-existant entries',
                        django_name, count_non_existant)
        # NOTE(review): the original nesting of this check was lost; it is
        # placed inside the loop because `still_missing` is only bound
        # there. Confirm against the upstream file.
        if still_missing:
            return fetch_missing(metadata, still_missing, attempts=attempts + 1)
def test_reuse_existing_cookie(self):
    """
    When the cookie file already exists it is used as-is, with no auth
    calls being made.
    """
    self.mock_cookie()

    client = CDMSRestApi()

    calls_made = len(responses.calls)
    self.assertEqual(calls_made, 0)
    self.assertTrue(client.session)
def test_500(self):
    """
    An endpoint answering with an error other than 401/404 should raise
    ErrorResponseException.
    """
    url = 'https://test/'
    responses.add(responses.GET, url, match_querystring=True, status=500)

    client = CDMSRestApi()
    with self.assertRaises(ErrorResponseException):
        client.make_request('get', url)
def test_404(self):
    """
    An endpoint answering with 404 should raise CDMSNotFoundException.
    """
    url = 'https://test/'
    responses.add(responses.GET, url, match_querystring=True, status=404)

    client = CDMSRestApi()
    with self.assertRaises(CDMSNotFoundException):
        client.make_request('get', url)
def test_invalid_credentials(self):
    """
    CDMSRestApi raises LoginErrorException on init when un/pw are invalid
    """
    self.mock_initial_login()
    self.mock_login_step(1, errors=True)

    # constructing the client is what triggers the failing login
    self.assertRaises(LoginErrorException, CDMSRestApi)
def test_exception_with_initial_form(self):
    """
    CDMSRestApi raises if AD login returns 500

    When the initial login url answers with an error, the constructor
    should raise UnexpectedResponseException.
    """
    self.mock_initial_login(status_code=500)

    self.assertRaises(UnexpectedResponseException, CDMSRestApi)
def test_first_successful_login(self):
    """
    CDMSRestApi logs in using AD on init

    On a first login (no cookie exists yet) the constructor logs in and
    saves the valid cookie on the filesystem.
    """
    self.mock_initial_login()
    for step in (1, 2, 3):
        self.mock_login_step(step)

    client = CDMSRestApi()

    self.assertTrue(self.cookie_storage.exists())
    self.assertTrue(client.auth.session)
def test_setup_session_tries_only_once_if_cookie_expired(self):
    """
    With an expired cookie, a call to an arbitrary endpoint should retry
    exactly once and then give up.
    """
    url = 'https://test/'
    responses.add(responses.GET, url, match_querystring=True, status=401)

    self.mock_initial_login()
    for step in (1, 2, 3):
        self.mock_login_step(step)

    client = CDMSRestApi()
    with self.assertRaises(CDMSUnauthorizedException):
        client.make_request('get', url)

    self.assertEqual(len(responses.calls), 6)
def validate_credentials(request):
    """
    Validate a set of CDMS credentials.

    Reads `username` and `password` from `request.json_body` and attempts a
    login with them. Returns True when the login goes through; returns
    False (after reporting to Sentry) when a credential is missing or when
    ValueError / RequestException is raised along the way.
    """
    cdms_cookie_path = uuid.uuid4().hex
    try:
        payload = request.json_body
        username = payload.get('username')
        password = payload.get('password')
        if not username or not password:
            SENTRY_CLIENT.captureMessage(
                'Missing credentials from validate-credentials request body')
            return False
        auth = ActiveDirectoryAuth(username, password, cdms_cookie_path)
        CDMSRestApi(auth).auth.login()
    except (ValueError, RequestException):
        SENTRY_CLIENT.captureException()
        return False
    else:
        return True
def cdms_client_fn(username, password):
    """
    Build a CDMSRestApi authenticated through ActiveDirectoryAuth.

    `cookie_path` is not a parameter; it is resolved from the surrounding
    scope at call time.
    """
    return CDMSRestApi(
        auth=ActiveDirectoryAuth(
            username=username,
            password=password,
            cookie_path=cookie_path))
def cdms_client():
    'Return a new CDMSRestApi (placeholder for disconnect management).'
    return CDMSRestApi()
)  # closes a construct that begins outside this view
def main(client, traversal_spec):
    '''
    Download everything, traversing from company to contact and then
    interaction. Tee the data to the OData database and Leeloo web API.

    Args:
        client: passed through unchanged to `traverse`.
        traversal_spec: a pair `((root_table, root_pkey), children)`; the
            children are handed to `traverse` for every root row.
    '''
    (root_table, root_pkey), children = traversal_spec
    odata_metadata = services.db.get_odata_metadata()
    odata_table = odata_metadata.tables[root_table]
    # select every row of the root table, consumed chunk by chunk below
    base_select = sqla.select([odata_table])
    execute = odata_metadata.bind.execute
    odata_chunks = select_chunks(execute, odata_table, base_select)
    for odata_chunk in odata_chunks:
        for odata_row in odata_chunk:
            # the root primary key value seeds the traversal for this row
            guid = getattr(odata_row, root_pkey)
            traverse(client, odata_metadata, guid, children)


if __name__ == '__main__':
    # root table / key first, then (child table, reference path) pairs
    traversal_spec = (
        ('AccountSet', 'AccountId'),
        (
            ('ContactSet', 'ParentCustomerId/Id'),
            ('detica_interactionSet', 'optevia_Organisation/Id'),
        ),
    )
    client = CDMSRestApi()
    main(client, traversal_spec)
def test_delete(self):
    """Deleting an entity results in exactly one recorded request."""
    client = CDMSRestApi()
    client.delete(self.service, self.guid)

    calls_made = len(responses.calls)
    self.assertEqual(calls_made, 1)
def test_exception_if_credentials_configured(self):
    """
    CDMSRestApi raises when CDMS un / pw settings are left blank
    """
    # the constructor itself is expected to raise
    self.assertRaises(ImproperlyConfigured, CDMSRestApi)
def test_get(self):
    """Fetching an entity makes one request and returns its payload."""
    client = CDMSRestApi()
    result = client.get(self.service, self.guid)

    self.assertEqual(len(responses.calls), 1)
    self.assertEqual(result, 'something')
def main(names=None, client=None):
    '''
    Scrape the selected entity types, polling the worker pool on reporting
    ticks until every entity chunk is complete or spent.

    Args:
        names: comma-separated entity names. When `None`, every key of
            `etl.spec.MAPPINGS` is used.
        client: passed through to each `classes.EntityChunk`. When falsy, a
            forced login is done up front and the process ends through
            `exit(1)` once all chunks are done instead of returning.

    Entity names recorded under `SPENT_KEY` in redis are skipped.
    '''
    if not client:  # assume this is not a testing case
        # force login to setup cookie to be used by subsequent client instances
        CDMSRestApi().auth.setup_session(True)
    if names is None:
        names = etl.spec.MAPPINGS.keys()
    else:
        names = set(names.split(','))
    pool = multiprocessing.Pool(processes=scrape_constants.PROCESSES)
    entity_chunks = []
    metadata = services.db.get_odata_metadata()
    spent = set(json.loads(services.redis.get(SPENT_KEY) or '[]'))
    len_spent = len(spent)
    if len_spent:
        # backslash escaped explicitly; the emitted text is unchanged
        LOGGER.info("Skipping {0} entity types \\o/".format(len_spent))
    to_scrape = names - spent
    LOGGER.info('Scraping the following entities:')
    for name in names:
        LOGGER.info(' %s %s', name, '✔' if name in to_scrape else '✘')

    for entity_name in to_scrape:
        try:
            # validate cache is in good shape (ie. no missing requests)
            cache_names = map(
                lambda path: path.split('/')[-1],
                services.redis.keys(
                    os.path.join('cache', 'json', entity_name, '*')))
            caches = sorted(map(int, cache_names))
            # NOTE(review): at index 0, `caches[index - 1]` is `caches[-1]`
            # (the highest offset), so the comparison below is made against
            # the last entry rather than a predecessor. Left as-is; confirm
            # the intended resume point before changing it.
            for index, offset in list(enumerate(caches)):
                if caches[index - 1] != offset - 50:
                    start = caches[index - 1] + 50
                    LOGGER.info('In a previous run %s broke at %s',
                                entity_name, start)
                    break
            else:
                # also reached when `caches` is empty: max() then raises
                # ValueError and the handler below starts from 0
                start = max(caches) + 50
        except (FileNotFoundError, ValueError):
            start = 0
        end = start + (scrape_constants.CHUNKSIZE * scrape_constants.PAGESIZE)
        entity_chunks.append(
            classes.EntityChunk(client, entity_name, start, end))

    last_report = 0
    final_tick = False
    while True:
        # take a deep breath
        # use the magic of modulo
        now = datetime.datetime.now()
        # a tick needs: a non-zero second, a multiple of INTERVAL, and a
        # second different from the one last reported
        report_conditions = (
            now.second,
            now.second % scrape_constants.INTERVAL == 0,
            last_report != now.second,
        )
        if not all(report_conditions):
            # NOTE(review): no sleep on this path, so the loop spins until
            # the next matching second
            continue  # this isn't a report loop
        LOGGER.info("Tick at {0}".format(now.strftime("%Y-%m-%d %H:%M:%S")))
        last_report = now.second
        reauthd_this_tick = False
        for entity_chunk in random.sample(entity_chunks, len(entity_chunks)):
            if entity_chunk.state in (types.EntityChunkState.complete,
                                      types.EntityChunkState.spent):
                continue  # NOQA
            # how many tasks pending in total
            pending = sum(
                entity_chunk.pending() for entity_chunk in entity_chunks)
            if pending <= scrape_constants.PROCESSES:  # throttling
                if entity_chunk.state == types.EntityChunkState.incomplete:
                    entity_chunk.start(pool)
            else:
                fmt_str = "Throttling {0.entity_name} ({0.offset_start}-{0.offset_end})"  # noqa: E501
                LOGGER.info(fmt_str.format(entity_chunk))
            for entity_page in entity_chunk.entity_pages:
                entity_page.poll()  # updates the state of the EntityPage
                if entity_page.state == types.EntityPageState.complete:
                    # make cheeky call to etl.load
                    results, _ = etl.main.from_odata_json(
                        metadata.tables[entity_page.entity_name],
                        utils.json_cache_key(entity_page.entity_name,
                                             entity_page.offset))
                    LOGGER.info("Records {0}-{1} went into {2}".format(
                        entity_page.offset,
                        entity_page.offset + sum(
                            result.rowcount for result in results),
                        entity_page.entity_name))
                    entity_page.state = types.EntityPageState.inserted
                if entity_page.state == types.EntityPageState.spent:
                    # make cheeky call to etl.load
                    try:
                        results, _ = etl.main.from_odata_json(
                            metadata.tables[entity_page.entity_name],
                            utils.json_cache_key(entity_page.entity_name,
                                                 entity_page.offset))
                        LOGGER.info("Records {0}-{1} went into {2}".format(
                            entity_page.offset,
                            entity_page.offset + sum(
                                result.rowcount for result in results),
                            entity_page.entity_name))
                    except TypeError:
                        # happens when spent EntityPage doesn't have any data
                        pass
                    # if there is no pending requests, stop requesting this
                    # entity (it's spent)
                    entitypage_states = set(
                        x.state for x in entity_chunk.entity_pages)
                    if types.EntityPageState.pending not in entitypage_states:
                        entity_chunk.state = types.EntityChunkState.spent
                        # re-read the spent set before updating it
                        spent = set(
                            json.loads(services.redis.get(SPENT_KEY) or '[]'))
                        spent.add(entity_chunk.entity_name)
                        services.redis.set(SPENT_KEY, json.dumps(tuple(spent)))
                        LOGGER.error("{0} ({1}) spent".format(
                            entity_page.entity_name, entity_page.offset))
                if entity_page.state == types.EntityPageState.deauthd:
                    # re-authenticate at most once per tick
                    if not reauthd_this_tick:
                        CDMSRestApi().auth.setup_session(True)
                        reauthd_this_tick = True
                    entity_page.reset()
            entity_chunk.poll()  # update state of EntityChunk

        # ask if all the EntityChunks are done; built as a list because it
        # is read twice (all() below, then the progress count) and a
        # generator would already be partly consumed by the second read
        done = [
            (entity_chunk.state == types.EntityChunkState.complete or
             entity_chunk.state == types.EntityChunkState.spent)
            for entity_chunk in entity_chunks]
        if all(done):
            if not final_tick:  # make sure last page is processed
                final_tick = True
                continue
            LOGGER.info('Waiting for Pool.close ...')
            pool.close()
            LOGGER.info('Waiting for Pool.join ...')
            pool.join()
            if not client:  # assume this is not a testing case
                exit(1)
            return
        LOGGER.info("{0}/{1} entity chunks report complete".format(
            len([x for x in done if x]), len(entity_chunks)))
        time.sleep(1)  # don't spam
def test_exception_if_urls_not_configured(self):
    """
    CDMSRestApi raises when CDMS URL settings are left blank
    """
    # the constructor itself is expected to raise
    self.assertRaises(ImproperlyConfigured, CDMSRestApi)