def index():
    """Render the scrape result for ``?landingPage=...`` as HTML.

    Known scraper failures (NotImplementedError, MetadataError) get their own
    friendly pages; anything else falls through to a generic error page.
    """
    landing_page = request.args.get('landingPage')
    try:
        # Scraper construction and rendering both happen inside the try so
        # that any failure is turned into an error page rather than a 500.
        return markdown(Scraper(landing_page)._repr_markdown_())
    except NotImplementedError as err_not_implemented:
        return markdown(
            f"# Not Implemented\n\n{err_not_implemented}{getPartialMatches(landing_page)}"
        )
    except MetadataError as err_metadata:
        return markdown(f"# Metadata Error\n\n{err_metadata}")
    except Exception as err:
        # Deliberately broad: this is the top-level boundary for the view.
        return markdown(f"# Error\n\n{err}\n\n{GENERIC_EXCEPTION_HELP}")
def step_impl(context, error_message):
    """Assert that building a Scraper from the seed raises the given ValueError."""
    observed_message = ""
    try:
        # NOTE: the attribute is spelled "scrapper" elsewhere in the suite,
        # so the (misspelled) name is kept as-is.
        context.scrapper = Scraper(seed=context.seed_path)
    except ValueError as exc:
        observed_message = str(exc)
    assert_equal(error_message, observed_message)
def step_impl(context, uri):
    """Scrape *uri* with HTTP traffic replayed/recorded via a per-URI cassette."""
    record_mode = context.config.userdata.get("record_mode", DEFAULT_RECORD_MODE)
    with vcr.use_cassette(cassette(uri), record_mode=record_mode):
        context.scraper = Scraper(uri, requests.Session())
def get_current_and_next_release_date() -> Tuple[datetime, datetime]:
    """Return (current release datetime, next expected update datetime).

    The current release is the *older* of the exports/imports issue dates
    (both datasets must be out for the combined data to be current); the
    next update is the *later* of the two ``updateDueOn`` dates.
    """
    exports_scraper = Scraper(EXPORTS_DATASET_URL, session=requests.Session())
    imports_scraper = Scraper(IMPORTS_DATASET_URL, session=requests.Session())
    oldest_current_release_date: date = min(
        exports_scraper.dataset.issued, imports_scraper.dataset.issued
    )
    # BUG FIX: this previously took max(exports..., exports...) — comparing
    # the exports date with itself and ignoring the imports dataset entirely.
    latest_update_date = max(
        exports_scraper.dataset.updateDueOn, imports_scraper.dataset.updateDueOn
    )
    return (
        # Normalise the release date to midnight of that day.
        datetime(
            year=oldest_current_release_date.year,
            month=oldest_current_release_date.month,
            day=oldest_current_release_date.day,
            hour=0,
            minute=0,
            second=0,
        ),
        latest_update_date,
    )
def get_scrape(seed_path):
    """Build a Scraper from *seed_path*; wrapped so backoff can retry the HTTP get."""
    printlog(f'Attempting scrape for seed {seed_path}')
    scraper = Scraper(seed=seed_path)
    return scraper
def step_impl(context):
    """Scrape the URL previously stashed on the behave context."""
    target = context.url
    context.scrape = Scraper(target)
def step_impl(context, file_name):
    """Build a Scraper from a seed fixture that lives next to the features dir."""
    features_dir = os.path.dirname(os.path.abspath(__file__))
    # Fixtures sit one level above this steps module, under "fixtures/".
    seed = os.path.join(features_dir, "..", "fixtures", file_name)
    context.scraper = Scraper(seed=seed)
def step_impl(context, uri):
    """Scrape *uri* with HTTP traffic replayed/recorded from the shared cassette."""
    mode = context.config.userdata.get('record_mode', DEFAULT_RECORD_MODE)
    with vcr.use_cassette('features/fixtures/scrape.yml', record_mode=mode):
        context.scraper = Scraper(uri, requests.Session())
def process_data(flow_type, dataset_url):
    """Download the zipped monthly trade Excel workbook behind *dataset_url*,
    melt it into tidy long form and return a DataFrame with columns
    ons_iso_alpha_2_code, ons_region_name, period, period_type, direction,
    product_code, product_name, seasonal_adjustment, measure_type, value,
    unit, marker.

    flow_type is stored verbatim in the 'direction' column (presumably
    'imports'/'exports' — set by the caller; confirm there).
    """
    # Matches e.g. "2018JAN" — monthly periods only in this dataset.
    YEAR_MONTH_RE = re.compile(
        r'([0-9]{4})(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)')

    # Cascading-regex helper that remembers the last match, from
    # https://stackoverflow.com/questions/597476/how-to-concisely-cascade-through-multiple-regex-statements-in-python
    class Re(object):
        def __init__(self):
            self.last_match = None

        def fullmatch(self, pattern, text):
            self.last_match = re.fullmatch(pattern, text)
            return self.last_match

    def time2period(t):
        # Normalise "2018JAN" -> "2018-01"; returns None (and logs) on no match.
        gre = Re()
        if gre.fullmatch(YEAR_MONTH_RE, t):
            year, month = gre.last_match.groups()
            month_num = {
                'JAN': '01',
                'FEB': '02',
                'MAR': '03',
                'APR': '04',
                'MAY': '05',
                'JUN': '06',
                'JUL': '07',
                'AUG': '08',
                'SEP': '09',
                'OCT': '10',
                'NOV': '11',
                'DEC': '12',
            }.get(month)
            return f"{year}-{month_num}"
        else:
            print(f"no match for {t}")

    print(datetime.now(), f'process_data({flow_type}, {dataset_url}) start')
    scraper = Scraper(dataset_url, session=requests.Session())
    # Pick the latest distribution whose media type mentions 'zip'.
    distribution = scraper.distribution(mediaType=lambda x: 'zip' in x,
                                        latest=True)
    # NOTE(review): the whole zip is held in memory; `zip` shadows the builtin.
    with ZipFile(BytesIO(
            scraper.session.get(distribution.downloadURL).content)) as zip:
        # The archive is expected to contain exactly one workbook.
        assert len(zip.namelist()) == 1
        with zip.open(zip.namelist()[0]) as excelFile:
            table = pd.read_excel(
                excelFile,
                sheet_name=1,
                dtype={
                    'COMMODITY': 'category',
                    'COUNTRY': 'category',
                    'DIRECTION': 'category',
                },
                # Only '' and 'N/A' count as missing; keep other NA-like strings.
                na_values=['', 'N/A'],
                keep_default_na=False,
            )
    print(datetime.now(), flow_type, "loaded dataframe")
    # DIRECTION is redundant — flow_type already tells us the direction.
    table.drop(columns='DIRECTION', inplace=True)
    table.rename(columns={
        'COMMODITY': 'Product',
        'COUNTRY': 'Geography'
    }, inplace=True)
    # Wide (one column per period) -> long (Period/Value rows).
    table = pd.melt(
        table,
        id_vars=['Product', 'Geography'],
        var_name='Period',
        value_name='Value',
    )
    print(datetime.now(), flow_type, "melted")
    table['Period'] = table['Period'].astype('category')
    # 'Product' cells look like "<code> <name>"; split on the first space.
    product = table['Product'].str.split(' ', n=1, expand=True)
    table['Product Code'], table['Product Name'] = (
        product[0].astype('category'),
        product[1].astype('category'),
    )
    # Same "<code> <name>" convention for geographies.
    geography = table['Geography'].str.split(' ', n=1, expand=True)
    table['Geography Code'], table['Geography Name'] = (
        geography[0].astype('category'),
        geography[1].astype('category'),
    )
    table.drop(columns=['Product', 'Geography'], inplace=True)
    print(datetime.now(), flow_type, "dropped product+geography")
    # Rewrite the period labels in place on the categorical's categories —
    # one map call per distinct period instead of per row.
    # NOTE(review): assigning to .cat.categories is deprecated in newer pandas
    # (rename_categories is the replacement) — confirm the pinned version.
    table['Period'].cat.categories = table['Period'].cat.categories.map(
        time2period)
    table['Period Type'] = 'month'
    table['Period Type'] = table['Period Type'].astype('category')
    table['Seasonal Adjustment'] = pd.Series('NSA',
                                             index=table.index,
                                             dtype='category')
    table['Measure Type'] = pd.Series('gbp-total',
                                      index=table.index,
                                      dtype='category')
    table['Unit'] = pd.Series('gbp', index=table.index, dtype='category')
    table['Flow'] = pd.Series(flow_type, index=table.index, dtype='category')
    print(datetime.now(), flow_type, "starting apply")
    # NOTE(review): table['Value'].dtype == 'str' is a single scalar comparison
    # of the column's dtype (not an element-wise test), so the second condition
    # is constant for the whole column. It looks like this was meant to flag
    # string-valued cells as markers — confirm intent before changing.
    table['Marker'] = numpy.select(
        condlist=[table['Value'].isna(), table['Value'].dtype == 'str'],
        choicelist=['not-available', table['Value']],
        default='',
    )
    print(datetime.now(), flow_type, "finished apply")
    table['Marker'] = table['Marker'].astype('category')
    # Any non-numeric leftovers in Value become NaN.
    table['Value'] = pd.to_numeric(table['Value'], errors='coerce')
    # Fix the column order before renaming to the output schema.
    table = table[[
        'Geography Code',
        'Geography Name',
        'Period',
        'Period Type',
        'Flow',
        'Product Code',
        'Product Name',
        'Seasonal Adjustment',
        'Measure Type',
        'Value',
        'Unit',
        'Marker',
    ]]
    table.rename(
        columns={
            'Geography Code': 'ons_iso_alpha_2_code',
            'Geography Name': 'ons_region_name',
            'Period': "period",
            'Period Type': "period_type",
            'Flow': "direction",
            'Product Code': "product_code",
            'Product Name': "product_name",
            'Seasonal Adjustment': "seasonal_adjustment",
            'Measure Type': "measure_type",
            'Value': "value",
            'Unit': "unit",
            'Marker': "marker",
        },
        inplace=True,
    )
    print(datetime.now(), f'process_data({flow_type}, {dataset_url}) finished',
          len(table))
    return table
def process_data():
    """Scrape the total-trade workbook, flatten every product sheet with
    databaker and return a one-element list holding the combined tidy
    DataFrame (columns: ons_iso_alpha_2_code, ons_region_name, period,
    period_type, direction, product_name, measure_type, value, unit, marker).
    """
    print(datetime.now(), 'process_data start')

    # Period labels come in three shapes: "2018", "2018 JAN", "2018Q1".
    YEAR_RE = re.compile(r'[0-9]{4}')
    YEAR_MONTH_RE = re.compile(
        r'([0-9]{4})\s+(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)')
    YEAR_QUARTER_RE = re.compile(r'([0-9]{4})(Q[1-4])')

    def product(name):
        # Map a sheet name onto the product classification it contains.
        if 'Total Trade' in name:
            return 'goods-and-services'
        elif 'TiG' in name:
            return 'goods'
        elif 'TiS' in name:
            return 'services'
        # BUG FIX: the message previously used JS-style "${name}" inside a
        # Python f-string, which rendered a stray literal "$".
        raise ValueError(f'Unknown product type {name}')

    # Cascading-regex helper that remembers the last match, from
    # https://stackoverflow.com/questions/597476/how-to-concisely-cascade-through-multiple-regex-statements-in-python
    class Re(object):
        def __init__(self):
            self.last_match = None

        def fullmatch(self, pattern, text):
            self.last_match = re.fullmatch(pattern, text)
            return self.last_match

    def time2periodtype(t):
        # Classify a raw period label; returns None (and logs) on no match.
        gre = Re()
        if gre.fullmatch(YEAR_RE, t):
            return "year"
        elif gre.fullmatch(YEAR_MONTH_RE, t):
            return "month"
        elif gre.fullmatch(YEAR_QUARTER_RE, t):
            return "quarter"
        else:
            print(f"no match for {t}")

    def time2period(t):
        # Normalise: "2018" -> "2018", "2018 JAN" -> "2018-01",
        # "2018Q1" -> "2018-Q1"; returns None (and logs) on no match.
        gre = Re()
        if gre.fullmatch(YEAR_RE, t):
            return t
        elif gre.fullmatch(YEAR_MONTH_RE, t):
            year, month = gre.last_match.groups()
            month_num = {
                'JAN': '01',
                'FEB': '02',
                'MAR': '03',
                'APR': '04',
                'MAY': '05',
                'JUN': '06',
                'JUL': '07',
                'AUG': '08',
                'SEP': '09',
                'OCT': '10',
                'NOV': '11',
                'DEC': '12',
            }.get(month)
            return f"{year}-{month_num}"
        elif gre.fullmatch(YEAR_QUARTER_RE, t):
            year, quarter = gre.last_match.groups()
            return f"{year}-{quarter}"
        else:
            print(f"no match for {t}")

    def process_sheet(sheetname, tab) -> pandas.DataFrame:
        # Turn one databaker tab into a tidy frame; index/contact sheets
        # contribute an empty frame.
        print(datetime.now(), f'spreadsheet scrape - {sheetname} - start')
        if 'Index' in sheetname or 'Contact Sheet' in sheetname:
            print(datetime.now(), f"skipping {sheetname}")
            return pandas.DataFrame()
        # Observations start at C7 and extend down/right.
        observations = (tab.excel_ref('C7').expand(DOWN).expand(
            RIGHT).is_not_blank().is_not_whitespace())
        # Period labels run along row 4; flow markers appear in-column.
        Year = tab.excel_ref('C4').expand(
            RIGHT).is_not_blank().is_not_whitespace()
        Flow = tab.fill(DOWN).one_of(['Exports', 'Imports'])
        # Geography code (col A) and name (col B) run down the left edge.
        geo = tab.excel_ref('A7').expand(
            DOWN).is_not_blank().is_not_whitespace()
        geo_name = tab.excel_ref('B7').expand(
            DOWN).is_not_blank().is_not_whitespace()
        Dimensions = [
            HDim(Year, 'Period', DIRECTLY, ABOVE),
            HDim(geo, 'Geography Code', DIRECTLY, LEFT),
            HDim(geo_name, 'Geography Name', DIRECTLY, LEFT),
            HDim(Flow, 'Flow', CLOSEST, ABOVE),
            HDimConst('Measure Type', 'gbp-total'),
            HDimConst('Unit', 'gbp-million'),
        ]
        c1 = ConversionSegment(observations, Dimensions, processTIMEUNIT=True)
        new_table = c1.topandas()
        new_table.rename(columns={
            'OBS': 'Value',
            'DATAMARKER': 'Marker'
        }, inplace=True)
        new_table['Flow'] = new_table['Flow'].map(lambda s: s.lower().strip())
        new_table['Product'] = product(sheetname)
        new_table['Period'] = new_table['Period'].astype(str)
        # DATAMARKER only exists when at least one cell had a marker.
        new_table['Marker'] = (new_table['Marker'].fillna('')
                               if 'Marker' in new_table else '')
        new_table = new_table[[
            'Geography Code',
            'Geography Name',
            'Period',
            'Flow',
            'Product',
            'Measure Type',
            'Value',
            'Unit',
            'Marker',
        ]]
        print(datetime.now(), f'scrape {sheetname} - complete')
        return new_table

    print(datetime.now(), "gathering info")
    scraper = Scraper(DATASET_URL)
    tabs = {
        tab.name: tab
        for tab in scraper.distribution(latest=True).as_databaker()
    }
    print(datetime.now(), 'spreadsheet scrape - start')
    table = pandas.concat([process_sheet(*args) for args in tabs.items()])
    print(datetime.now(), 'spreadsheet scrape - complete')
    # Year cells read as floats stringify as e.g. "2018.0" — strip the ".0".
    # BUG FIX: the pattern is a regex, so use a raw string and pin regex=True
    # (pandas >= 2.0 defaults str.replace to literal matching).
    table['Period'] = table.Period.str.replace(r'\.0', '', regex=True)
    table['Period Type'] = table['Period'].apply(time2periodtype)
    table['Period Type'] = table['Period Type'].astype('category')
    table['Period'] = table['Period'].apply(time2period)
    table['Period'] = table['Period'].astype('category')
    # (removed a duplicated no-op astype('category') on 'Period Type' here)
    # (pandas) "Int64" type allows null values, unlike (numpy) "int64" -
    # yes, the case matters.
    table['Value'] = pandas.to_numeric(table['Value'],
                                       errors='coerce').astype('Int64')
    print(datetime.now(), "dropping duplicates")
    table = table.drop_duplicates()
    print(datetime.now(), "renaming columns")
    table.rename(
        columns={
            'Geography Code': 'ons_iso_alpha_2_code',
            'Geography Name': 'ons_region_name',
            'Period': "period",
            'Period Type': "period_type",
            'Flow': "direction",
            'Product': "product_name",
            'Measure Type': "measure_type",
            'Value': "value",
            'Unit': "unit",
            'Marker': "marker",
        },
        inplace=True,
    )
    print(datetime.now(), 'process_data finished', len(table))
    return [table]