def fetch_comtrade_services_data(
    table_name: str,
    ts_nodash: str,
    **kwargs,
):
    s3 = S3Data(table_name, ts_nodash)
    expected_keys = [
        'Classification',
        'Year',
        'Period',
        'Period Desc.',
        'Aggregate Level',
        'Is Leaf Code',
        'Trade Flow Code',
        'Trade Flow',
        'Reporter Code',
        'Reporter',
        'Reporter ISO',
        'Partner Code',
        'Partner',
        'Partner ISO',
        'Commodity Code',
        'Commodity',
        'Trade Value (US$)',
        'Flag',
    ]
    _fetch(s3, 'S', expected_keys)

def fetch_from_hosted_csv(
    table_name: str,
    source_url: str,
    page_size: int = 1000,
    allow_empty_strings: bool = True,
    **kwargs,
):
    s3 = S3Data(table_name, kwargs["ts_nodash"])
    results = []
    page = 1
    with closing(requests.get(source_url, stream=True)) as request:
        reader = csv.DictReader(codecs.iterdecode(request.iter_lines(), 'utf-8'))
        for row in reader:
            if not allow_empty_strings:
                row = {k: v if v != '' else None for k, v in row.items()}  # type: ignore
            results.append(row)
            if len(results) >= page_size:
                s3.write_key(f"{page:010}.json", results)
                results = []
                page += 1
        if results:
            s3.write_key(f"{page:010}.json", results)

    logger.info("Fetching from source completed")

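# Illustrative sketch (not part of the original module): how a fetch callable like
# `fetch_from_hosted_csv` can be exercised outside Airflow. These callables read
# `ts_nodash` from the Airflow task context via **kwargs; here it is supplied by hand.
# The helper name, table name and URL below are assumptions.
def _example_fetch_from_hosted_csv_usage():
    fetch_from_hosted_csv(
        'example_table',                  # hypothetical target table name
        'https://example.com/data.csv',   # hypothetical CSV source
        page_size=500,
        ts_nodash='20200101T000000',      # normally injected by Airflow from the run timestamp
    )
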
def query_database(
    query: str, target_db: str, table_name, batch_size: int = 100000, **kwargs
):
    s3 = S3Data(table_name, kwargs["ts_nodash"])
    total_records = 0
    next_batch = 1
    connection = PostgresHook(postgres_conn_id=target_db).get_conn()

    try:
        # create connection with named cursor to fetch data in batches
        cursor = connection.cursor(name='query_database')
        cursor.execute(query)
        rows = cursor.fetchmany(batch_size)
        fields = [d[0] for d in cursor.description]
        while rows:
            records = []
            for row in rows:
                record = {fields[col]: row[col] for col in range(len(row))}
                records.append(record)
            s3.write_key(f'{next_batch:010}.json', records)
            next_batch += 1
            total_records += len(records)
            rows = cursor.fetchmany(batch_size)
    finally:
        if connection:
            cursor.close()
            connection.close()

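# Illustrative sketch (not part of the original module): `query_database` relies on a
# psycopg2 *named* (server-side) cursor, so `fetchmany(batch_size)` streams rows from
# the server in batches instead of materialising the whole result set in memory.
# The connection id, SQL and table name below are assumptions.
def _example_query_database_usage():
    query_database(
        'SELECT id, name FROM some_source_table',
        target_db='datasets_db',
        table_name='example_table',
        batch_size=10000,
        ts_nodash='20200101T000000',  # normally injected by Airflow
    )
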
def fetch_apple_mobility_data(
    table_name: str,
    base_url: str,
    config_path: str,
    df_transform: Callable[[pd.DataFrame], pd.DataFrame],
    page_size: int = 1000,
    **kwargs,
):
    s3 = S3Data(table_name, kwargs['ts_nodash'])
    api_config = requests.get(base_url + config_path).json()
    source_url = (
        base_url + api_config['basePath'] + api_config['regions']['en-us']['csvPath']
    )
    logger.info(f'Fetching csv from {source_url}')
    response = requests.get(source_url)
    df = pd.read_csv(io.StringIO(response.content.decode('utf-8')))
    df = df_transform(df)
    page = 1
    for i in range((len(df) // page_size) + 1):
        results = df.iloc[page_size * i : page_size * (i + 1)].to_json(
            orient="records", date_format="iso"
        )
        s3.write_key(f"{page:010}.json", results, jsonify=False)
        page += 1
    logger.info('Fetching from source completed')

def write_prediction(table_name, df, context):
    # note: if making this into a separate task, then **context needs to be passed to the
    # function and df = context['task_instance'].xcom_pull(task_ids='predict-tags')
    df_json = df.to_json(orient="records")
    df_json = json.loads(df_json)
    s3 = S3Data(table_name, context["ts_nodash"])
    s3.write_key('tags_prediction.json', df_json)

def fetch_from_sharepoint_list(table_name: str, sub_site_id: str, list_id: str, **kwargs):
    s3 = S3Data(table_name, kwargs["ts_nodash"])
    app = ConfidentialClientApplication(
        DIT_SHAREPOINT_CREDENTIALS['client_id'],
        authority=f'https://login.microsoftonline.com/{DIT_SHAREPOINT_CREDENTIALS["tenant_id"]}',
        client_credential=DIT_SHAREPOINT_CREDENTIALS['client_secret'],
    )
    token_response = app.acquire_token_for_client(
        scopes=['https://graph.microsoft.com/.default']
    )
    if 'access_token' not in token_response:
        raise InvalidAuthCredentialsError(
            f'Failed to acquire token: {token_response.get("error_description")}'
        )
    access_token = token_response['access_token']

    tenant = DIT_SHAREPOINT_CREDENTIALS["tenant_domain"]
    sharepoint_site = (
        f':/sites/{DIT_SHAREPOINT_CREDENTIALS["site_name"]}:/sites/{sub_site_id}'
    )
    list_url = f'https://graph.microsoft.com/v1.0/sites/{tenant}{sharepoint_site}/lists/{list_id}'
    items_url = f'{list_url}/items'

    # Fetch a copy of the column names; this needs to be done separately from
    # fetching the items, as otherwise paging will not work.
    logger.info('Fetching column names from %s', list_url)
    graph_data = _make_request(list_url, access_token, {'expand': 'columns'})
    column_map = {col['name']: col['displayName'] for col in graph_data['columns']}

    # Fetch the list item data
    page = 1
    while items_url is not None:
        logger.info('Fetching from %s', items_url)
        graph_data = _make_request(items_url, access_token, {'expand': 'fields'})
        records = [
            {
                **{
                    column_map[col_id]: row['fields'][col_id]
                    for col_id in row['fields'].keys()
                    if col_id in column_map
                },
                'createdBy': row['createdBy']['user'],
                'lastModifiedBy': row['lastModifiedBy']['user']
                if row.get('lastModifiedBy') is not None
                else None,
            }
            for row in graph_data['value']
        ]
        s3.write_key(f"{page:010}.json", records)
        page += 1
        items_url = graph_data.get('@odata.nextLink')

    logger.info('Finished fetching from sharepoint')

def fetch_from_ons_sparql(table_name: str, query: str, index_query: Optional[str], **kwargs):
    s3 = S3Data(table_name, kwargs["ts_nodash"])
    if index_query is None:
        _store_ons_sparql_pages(s3, query)
    else:
        index_values = _ons_sparql_request(config.ONS_SPARQL_URL, index_query)[
            "results"
        ]["bindings"]
        for index in index_values:
            _store_ons_sparql_pages(
                s3, query.format(**index), index['label']['value'] + "-"
            )

def fetch_daily_tickets(
    schema_name: str,
    table_name: str,
    account: str,
    **kwargs,
):
    """Download Zendesk JSON data and reformat it for ingestion into the table."""
    today = kwargs["execution_date"]
    yesterday = (today - timedelta(days=1)).strftime("%Y%m%d")
    logger.info(f"Fetching data from source for day '{yesterday}' on '{today}'")

    results = []
    query = f"type:ticket updated:{yesterday} status:closed"
    base_url = config.ZENDESK_CREDENTIALS[account]["url"]
    for page in itertools.count(1):
        url = f"{base_url}/search.json?query={query}&sort_by=created_at&sort_order=asc&page={page}"
        data = _query(url=url, account=account)
        results.extend(data["results"])
        if not data["next_page"]:
            break
        if page >= 11:
            raise Exception("Too many iterations")

    # Get metrics fields which aren't returned from the search endpoint
    for ticket in results:
        metrics = _get_ticket_metrics(id=ticket["id"], account=account)
        ticket.update(metrics)

    # Remove covid-19 related tickets from DIT zendesk tickets
    if account == 'dit':
        results = _remove_covid_related_tickets(results)

    s3upstream = S3Upstream(f"{schema_name}_{table_name}")
    s3upstream.write_key(f"{yesterday}.json", results, jsonify=True)
    logger.info("Fetching from source completed")

    s3data = S3Data(table_name, kwargs["ts_nodash"])
    for source in s3upstream.list_keys():
        s3data.client.copy_object(
            source_bucket_key=source,
            dest_bucket_key=source.replace(s3upstream.prefix, s3data.prefix),
            source_bucket_name=s3data.bucket,
            dest_bucket_name=s3data.bucket,
        )
    logger.info("Copy from upstream completed")

def _store_ons_sparql_pages(s3: S3Data, query: str, prefix: str = ""):
    next_page = 1
    total_records = 0
    while next_page:
        logger.info(f"Fetching page {prefix}{next_page}")
        data = _ons_sparql_request(config.ONS_SPARQL_URL, query, page=next_page)

        if not data["results"]["bindings"]:
            next_page = 0
            continue

        total_records += len(data["results"]["bindings"])
        s3.write_key(f"{prefix}{next_page:010}.json", data["results"]["bindings"])

        logger.info(f"Fetched {total_records} records")
        next_page += 1

    logger.info(f"Fetching from source completed, total {total_records}")

def fetch_from_activity_stream(table_name: str, index_name: str, query: dict, **kwargs):
    s3 = S3Data(table_name, kwargs["ts_nodash"])
    query = {
        "query": query,
        "size": config.ACTIVITY_STREAM_RESULTS_PER_PAGE,
        "sort": [{"id": "asc"}],
    }
    next_page = 1
    source_url = f"{config.ACTIVITY_STREAM_BASE_URL}/v3/{index_name}/_search"

    while next_page:
        logger.info(f"Fetching page {next_page} of {source_url}")
        data = _hawk_api_request(
            source_url,
            "GET",
            query,
            ACTIVITY_STREAM_HAWK_CREDENTIALS,
            'hits',
        )

        if "failures" in data["_shards"]:
            logger.warning(
                "Request failed on {} shards: {}".format(
                    data['_shards']['failed'], data['_shards']['failures']
                )
            )

        if not data["hits"]["hits"]:
            next_page = 0
            continue

        s3.write_key(
            f"{next_page:010}.json",
            [item["_source"] for item in data["hits"]["hits"]],
        )

        logger.info(
            f"Fetched {len(data['hits']['hits'])} of {data['hits']['total']} records"
        )

        query = query.copy()
        query["search_after"] = data["hits"]["hits"][-1]["sort"]
        next_page += 1

    logger.info(f"Fetching from source completed, total {data['hits']['total']}")

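# Illustrative sketch (not part of the original module): `fetch_from_activity_stream`
# wraps the supplied Elasticsearch query in a paged request and walks the index with
# `search_after`, using the sort values of the last hit on each page as the cursor for
# the next page. The index name and the bool/term filter below are assumptions.
def _example_fetch_from_activity_stream_usage():
    fetch_from_activity_stream(
        'example_table',
        'activities',
        {"bool": {"filter": [{"term": {"object.type": "dit:Activity"}}]}},
        ts_nodash='20200101T000000',  # normally injected by Airflow
    )
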
def fetch_companies_house_companies(
    table_name: str,
    source_url: str,
    number_of_files: int,
    page_size: int = 10000,
    **kwargs,
):
    """
    Loop through `number_of_files`, build the url, download the zip file,
    extract and write data in batches of `page_size` to s3
    """
    s3 = S3Data(table_name, kwargs['ts_nodash'])
    page = 1
    results = []
    publish_date = datetime(
        kwargs['next_execution_date'].year, kwargs['next_execution_date'].month, 1
    ).strftime('%Y-%m-01')
    for file_num in range(1, number_of_files + 1):
        url = source_url.format(
            file_date=publish_date,
            file_num=file_num,
            num_files=number_of_files,
        )
        logger.info('Fetching zip file from %s', url)
        with zipfile.ZipFile(io.BytesIO(_download(url))) as archive:
            with archive.open(archive.namelist()[0], 'r') as f:
                reader = csv.DictReader(codecs.iterdecode(f, 'utf-8'))
                if reader.fieldnames is not None:
                    reader.fieldnames = [x.strip() for x in reader.fieldnames]
                for row in reader:
                    row['publish_date'] = publish_date
                    results.append(row)
                    if len(results) >= page_size:
                        s3.write_key(f'{page:010}.json', results)
                        results = []
                        page += 1
    if results:
        s3.write_key(f'{page:010}.json', results)
    logger.info('Fetching from source completed')

def run_ipython_ons_extraction(table_name: str, script_name: str, **kwargs):
    with TemporaryDirectory() as tempdir:
        os.chdir(tempdir)
        shutil.copytree('/app/dataflow/ons_scripts', 'ons_scripts')
        logger.info("ONS scraper: start")
        subprocess.call(['ipython', 'main.py'], cwd=f'ons_scripts/{script_name}')
        logger.info("ONS scraper: completed")
        s3 = S3Data(table_name, kwargs['ts_nodash'])
        for filename in sorted(
            glob.glob(f"ons_scripts/{script_name}/out/observations*.csv")
        ):
            logger.info(f"Writing {filename} to S3.")
            s3.write_key(
                os.path.basename(filename),
                open(filename, "r").read(),
                jsonify=False,
            )

def fetch_from_gtr_api(table_name: str, resource_type: str, **kwargs):
    source_url = 'https://gtr.ukri.org/gtr/api'
    s3 = S3Data(table_name, kwargs["ts_nodash"])
    page = 1
    while True:
        response = requests.get(
            f'{source_url}/{resource_type}s',
            params={'p': page, 's': 100},
            headers={'Accept': 'application/json'},
        )
        try:
            response.raise_for_status()
        except requests.exceptions.HTTPError:
            logger.error(f"Request failed: {response.text}")
            raise

        response_json = response.json()
        total_pages = response_json['totalPages']
        total_number_of_results = response_json['totalSize']
        results = response_json[resource_type]
        s3.write_key(f"{page:010}.json", results)
        logger.info(
            f"Fetched {len(results) * page} out of {total_number_of_results} {resource_type} records"
        )
        page += 1
        if page > total_pages:
            break

    logger.info("Fetching from source completed")

def fetch_from_hawk_api(
    table_name: str,
    source_url: str,
    hawk_credentials: dict,
    results_key: str = "results",
    next_key: Optional[str] = "next",
    validate_response: Optional[bool] = True,
    force_http: Optional[bool] = False,
    **kwargs,
):
    s3 = S3Data(table_name, kwargs["ts_nodash"])
    total_records = 0
    page = 1

    while True:
        data = _hawk_api_request(
            source_url,
            credentials=hawk_credentials,
            results_key=results_key,
            next_key=next_key,
            validate_response=validate_response,
            force_http=force_http,
        )

        results = get_nested_key(data, results_key)
        s3.write_key(f"{page:010}.json", results)

        total_records += len(results)
        logger.info(f"Fetched {total_records} records")

        source_url = get_nested_key(data, next_key) if next_key else None
        if not source_url:
            break

        page += 1

    logger.info("Fetching from source completed")

def fetch_mapped_hosted_csvs(
    table_name: str,
    source_urls: Dict[str, str],
    df_transform: Callable[[pd.DataFrame], pd.DataFrame],
    page_size: int = 10000,
    **kwargs,
):
    s3 = S3Data(table_name, kwargs["ts_nodash"])
    page = 1
    for type_, source_url in source_urls.items():
        logger.info(f"Fetching {source_url}")
        response = requests.get(source_url)
        df = pd.read_csv(io.StringIO(response.content.decode('utf-8')))
        df = df_transform(df)
        df["source_url_key"] = type_
        for i in range((len(df) // page_size) + 1):
            results = df.iloc[page_size * i : page_size * (i + 1)].to_json(
                orient="records", date_format="iso"
            )
            s3.write_key(f"{page:010}.json", results, jsonify=False)
            page += 1
    logger.info("Fetching from source completed")

def fetch_companies_house_significant_persons(
    table_name: str,
    source_url: str,
    **kwargs,
):
    s3 = S3Data(table_name, kwargs['ts_nodash'])
    page_size = 10000
    page = 1
    results = []
    for file in range(1, config.COMPANIES_HOUSE_PSC_TOTAL_FILES + 1):
        url = source_url.format(file=file, total=config.COMPANIES_HOUSE_PSC_TOTAL_FILES)
        logger.info('Fetching zip file from %s', url)
        with zipfile.ZipFile(io.BytesIO(_download(url))) as archive:
            with archive.open(archive.namelist()[0], 'r') as f:
                for line in f:
                    results.append(json.loads(line))
                    if len(results) >= page_size:
                        s3.write_key(f'{page:010}.json', results)
                        results = []
                        page += 1
    if results:
        s3.write_key(f'{page:010}.json', results)
    logger.info("Fetching from source completed")

def fetch_hmrc_trade_data(
    table_name: str,
    base_filename: str,
    records_start_year: int,
    num_expected_fields: Tuple[int, int],
    num_per_page: int = 10000,
    **kwargs,
):
    s3 = S3Data(table_name, kwargs["ts_nodash"])
    min_fields, max_fields = num_expected_fields

    # New files are uploaded to uktradeinfo 2 months later, usually on the 10th of the month.
    # The files have a predictable name, but not a predictable directory. The best way we have
    # of finding the URLs is to scrape the pages from which they're linked.
    #
    # As much as possible, we stream-process the zips, since some of the unzipped files are
    # > 100mb. Without streaming, we sometimes hit memory limits when parsing the contents.
    latest_file_date = kwargs.get(
        "run_date", kwargs.get("execution_date")
    ) - relativedelta(months=1)
    previous_years = list(range(records_start_year, latest_file_date.year))

    def get_file_linked_from(url, filename):
        logger.info('Looking on %s for links to %s', url, filename)
        html = _download(url)
        soup = BeautifulSoup(html, "html.parser")
        links = [link.get('href') for link in soup.find_all('a') if link.get('href')]
        logger.info("Found links %s", links)
        matching_links = [link for link in links if link.endswith(filename)]
        if not matching_links:
            raise Exception(f'Unable to find link to {filename}')
        if len(matching_links) > 1:
            raise Exception(f'Too many links for {filename}')
        return _download(urljoin(url, matching_links[0]))

    def first_file_from_zip(zip_bytes):
        with zipfile.ZipFile(io.BytesIO(zip_bytes)) as archive:
            name = archive.namelist()[0]
            logger.info('Opening file in zip %s', name)
            with archive.open(name, "r") as file:
                yield file, name

    def nested_files_from_zip(zip_bytes):
        with zipfile.ZipFile(io.BytesIO(zip_bytes)) as archive:
            for name in archive.namelist():
                if not name.lower().startswith(base_filename):
                    # Some sesx16 files seem to contain unrelated data
                    logger.info('Skipping file %s', name)
                    continue
                logger.info('Opening file in zip %s', name)
                with archive.open(name, "r") as file:
                    with zipfile.ZipFile(file) as inner_archive:
                        inner_name = inner_archive.namelist()[0]
                        logger.info('Opening inner file in zip %s', inner_name)
                        with inner_archive.open(inner_name, "r") as inner_file:
                            logger.info('Opened inner file in zip %s', inner_name)
                            yield inner_file, inner_name

    def get_files():
        for year in previous_years:
            yield from nested_files_from_zip(
                get_file_linked_from(
                    config.HMRC_UKTRADEINFO_ARCHIVE_URL,
                    f"/{base_filename}_{year}archive.zip",
                ),
            )
        if latest_file_date.month > 1:
            yield from nested_files_from_zip(
                get_file_linked_from(
                    config.HMRC_UKTRADEINFO_LATEST_URL,
                    f"/{base_filename}_{latest_file_date:%Y}archive.zip",
                ),
            )
        yield from first_file_from_zip(
            get_file_linked_from(
                config.HMRC_UKTRADEINFO_LATEST_URL,
                f"/{base_filename}{latest_file_date:%y%m}.zip",
            ),
        )

    def get_lines(files):
        for file, source_name in files:
            logger.info('Parsing file %s', file)
            for line in _without_first_and_last(file):
                data = line.strip().decode('utf-8', errors='replace').split("|")
                if min_fields <= len(data) < max_fields:
                    yield data + [source_name]
                else:
                    logger.warning(
                        "Ignoring row with %s fields instead of expected %s: %s",
                        len(data),
                        num_expected_fields,
                        line,
                    )

    def paginate(lines):
        page = []
        for line in lines:
            page.append(line)
            if len(page) == num_per_page:
                yield page
                page = []
        if page:
            yield page

    files = get_files()
    lines = get_lines(files)
    pages = paginate(lines)
    for i, page in enumerate(pages):
        output_filename = f"{i:010}.json"
        logger.info('Saving file to S3 %s', output_filename)
        s3.write_key(output_filename, page)
        logger.info("Fetched from source to %s", output_filename)

def fetch_from_api_endpoint(
    table_name: str,
    source_url: str,
    auth_token: Optional[str] = None,
    auth_token_builder: Optional[Callable] = None,
    results_key: Optional[str] = "results",
    next_key: Optional[str] = "next",
    auth_type: Optional[str] = "Token",
    extra_headers: Optional[Mapping] = None,
    **kwargs,
):
    if auth_token is not None and auth_token_builder is not None:
        raise ValueError(
            "You can provide at most one of `auth_token` and `auth_token_builder`"
        )

    s3 = S3Data(table_name, kwargs["ts_nodash"])
    total_records = 0
    page = 1

    while True:
        if auth_token:
            request_headers = {"Authorization": f'{auth_type} {auth_token}'}
        elif auth_token_builder:
            request_headers = {"Authorization": f'{auth_type} {auth_token_builder()}'}
        else:
            request_headers = {}

        if extra_headers:
            request_headers = {**request_headers, **extra_headers}

        response = requests.get(source_url, headers=request_headers)
        try:
            response.raise_for_status()
        except requests.exceptions.HTTPError:
            logger.error(f"Request failed: {response.text}")
            raise

        response_json = response.json()

        if (next_key and next_key not in response_json) or (
            results_key and results_key not in response_json
        ):
            raise ValueError("Unexpected response structure")

        if results_key is not None:
            results = get_nested_key(response_json, results_key)
        else:
            results = response_json
        s3.write_key(f"{page:010}.json", results)

        total_records += len(results)
        logger.info(f"Fetched {total_records} records")

        source_url = get_nested_key(response_json, next_key) if next_key else None
        if not source_url:
            break

        page += 1

    logger.info("Fetching from source completed")

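# Illustrative sketch (not part of the original module): `auth_token_builder` lets a
# caller supply a callable that mints a fresh token for every page request (useful for
# short-lived tokens), as an alternative to a static `auth_token`. The helper name,
# token builder, URL and headers below are assumptions.
def _example_fetch_from_api_endpoint_usage():
    def build_token():
        # hypothetical stand-in for a real token exchange
        return 'short-lived-token'

    fetch_from_api_endpoint(
        'example_table',
        'https://api.example.com/v1/things?page=1',
        auth_token_builder=build_token,
        auth_type='Bearer',
        extra_headers={'Accept': 'application/json'},
        ts_nodash='20200101T000000',  # normally injected by Airflow
    )
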
def insert_data_into_db(
    target_db: str,
    table: Optional[sa.Table] = None,
    field_mapping: Optional[SingleTableFieldMapping] = None,
    table_config: Optional[TableConfig] = None,
    contexts: Tuple = tuple(),
    **kwargs,
):
    """Insert fetched response data into temporary DB tables.

    Goes through the stored response contents and loads individual records
    into the temporary DB table.

    DB columns are populated according to the field mapping, which is a list
    of `(response_field, column)` tuples, where the field can either be a
    string or a tuple of keys/indexes forming a path to a nested value.
    """
    if table_config:
        if table or field_mapping:
            raise RuntimeError(
                "You must exclusively provide either (table_config) or (table && field_mapping), not bits of both."
            )
        table_config.configure(**kwargs)
        s3 = S3Data(table_config.table_name, kwargs["ts_nodash"])
    elif table is not None and field_mapping is not None:
        warnings.warn(
            (
                "`table` and `field_mapping` parameters are deprecated. "
                "This pipeline should be migrated to use `table_config`/`TableConfig`."
            ),
            DeprecationWarning,
        )
        s3 = S3Data(table.name, kwargs["ts_nodash"])
        temp_table = get_temp_table(table, kwargs["ts_nodash"])
    else:
        raise RuntimeError(
            f"No complete table/field mapping configuration provided: {table}, {field_mapping}"
        )

    engine = sa.create_engine(
        'postgresql+psycopg2://',
        creator=PostgresHook(postgres_conn_id=target_db).get_conn,
    )

    count = 0
    for page, records in s3.iter_keys():
        logger.info('Processing page %s', page)
        count += 1

        with engine.begin() as conn:
            if table_config:
                for record in records:
                    for transform in table_config.transforms:
                        record = transform(record, table_config, contexts)
                    conn.execute(
                        table_config.temp_table.insert(),
                        **_get_data_to_insert(table_config.columns, record),
                    )
                    if table_config.related_table_configs:
                        _insert_related_records(
                            conn, table_config, contexts + (record,)
                        )
            elif table is not None and field_mapping:
                for record in records:
                    conn.execute(
                        temp_table.insert(),  # pylint: disable=E1120
                        **_get_data_to_insert(field_mapping, record),
                    )
        logger.info('Page %s ingested successfully', page)

    if count == 0:
        raise MissingDataError("There are no pages of records in S3 to insert.")

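# Illustrative sketch (not part of the original module) of the deprecated
# `table`/`field_mapping` form described in the docstring: each `(response_field, column)`
# pair copies one field from every stored S3 record into the matching column of a
# temporary copy of the table. The helper name, metadata, table and mapping below are
# assumptions; real pipelines should use `TableConfig` instead.
def _example_insert_data_into_db_usage():
    metadata = sa.MetaData()
    example_table = sa.Table(
        'example_table',
        metadata,
        sa.Column('id', sa.Integer, primary_key=True),
        sa.Column('name', sa.Text),
    )
    field_mapping = [
        ('id', example_table.columns.id),
        ('name', example_table.columns.name),
    ]
    insert_data_into_db(
        target_db='datasets_db',
        table=example_table,
        field_mapping=field_mapping,
        ts_nodash='20200101T000000',  # normally injected by Airflow
    )
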