def test_numbers(self):
    """clean_data(number=...) should normalise messy numbers, with optional formatting."""
    raw = [
        ['Organisation', 'Cluster', 'District', 'Count'],
        ['#org', '#sector+list', '#adm1', '#meta+count'],
        ['NGO A', 'WASH', 'Coast', ' 200'],
        ['NGO B', 'Education', 'Plains', '1,100 '],
        ['NGO B', 'Child Protection', 'Coast', 300],
        ['NGO A', 'Logistics', 'Coast', '1.7E5'],
    ]
    expected_plain = [
        ['NGO A', 'WASH', 'Coast', '200'],
        ['NGO B', 'Education', 'Plains', '1100'],
        ['NGO B', 'Child Protection', 'Coast', '300'],
        ['NGO A', 'Logistics', 'Coast', '170000'],
    ]
    expected_formatted = [
        ['NGO A', 'WASH', 'Coast', '200.00'],
        ['NGO B', 'Education', 'Plains', '1100.00'],
        ['NGO B', 'Child Protection', 'Coast', '300.00'],
        ['NGO A', 'Logistics', 'Coast', '170000.00'],
    ]
    # default normalisation strips whitespace/commas and expands scientific notation
    self.assertEqual(expected_plain, hxl.data(raw).clean_data(number='meta+count').values)
    # an explicit number_format applies printf-style formatting
    self.assertEqual(
        expected_formatted,
        hxl.data(raw).clean_data(number='meta+count', number_format='0.2f').values
    )
def test_purge_malformed_data(self):
    """Malformed values survive cleaning by default, but are blanked when purge=True."""
    rows = [
        ['#date', '#affected', '#geo+lat', '#geo+lon'],
        ['1/Mar/2017', 'bad', '45N30', 'bad'],
        ['bad', '2,000', 'bad', '75W30'],
    ]
    unpurged = [
        ['2017-03-01', 'bad', '45.5000', 'bad'],
        ['bad', '2000', 'bad', '-75.5000'],
    ]
    purged = [
        ['2017-03-01', '', '45.5000', ''],
        ['', '2000', '', '-75.5000'],
    ]
    clean_args = dict(date='date', number='affected', latlon='geo')
    self.assertEqual(
        unpurged,
        hxl.data(rows).clean_data(purge=False, **clean_args).values)
    self.assertEqual(
        purged,
        hxl.data(rows).clean_data(purge=True, **clean_args).values)
def test_dates(self):
    """clean_data(date=...) should normalise dates, honouring an optional strftime format."""
    rows = [
        ['Organisation', 'Cluster', 'District', 'Date'],
        ['#org', '#sector+list', '#adm1', '#date'],
        ['NGO A', 'WASH', 'Coast', 'January 1 2015'],
        ['NGO B', 'Education', 'Plains', '2/2/15'],
        ['NGO B', 'Child Protection', 'Coast', '1 Mar/15'],
    ]
    # (date_format, expected normalised dates in row order); None means the default
    cases = [
        (None, ['2015-01-01', '2015-02-02', '2015-03-01']),
        ("%Y-W%V", ['2015-W01', '2015-W06', '2015-W09']),
        ("%Y-%m", ['2015-01', '2015-02', '2015-03']),
        ("%Y", ['2015', '2015', '2015']),
    ]
    for date_format, dates in cases:
        expected = [row[:3] + [date] for row, date in zip(rows[2:], dates)]
        if date_format is None:
            actual = hxl.data(rows).clean_data(date='date').values
        else:
            actual = hxl.data(rows).clean_data(date='date', date_format=date_format).values
        self.assertEqual(expected, actual)
def assertRowErrors(self, row_values, errors_expected, schema_values=None, columns=None):
    """Set up a HXL row and count the errors in it."""
    collected = []
    # fall back to the class-level defaults when no schema/columns supplied
    schema_source = self.DEFAULT_SCHEMA if schema_values is None else schema_values
    schema = hxl.schema(hxl.data(schema_source), callback=collected.append)
    if columns is None:
        columns = self.DEFAULT_COLUMNS
    row = Row(values=row_values, columns=[Column.parse(tag) for tag in columns])
    schema.start()
    if errors_expected == 0:
        self.assertTrue(schema.validate_row(row))
    else:
        self.assertFalse(schema.validate_row(row))
    self.assertEqual(len(collected), errors_expected)
def test_merge_patterns(self):
    """merge_data() should accept both a bare tag pattern and an explicit tag list."""
    source_rows = [
        ['P-code', 'District'],
        ['#adm1+code', '#adm1+name'],
        ['001', 'Coast'],
        ['002', 'Plains'],
    ]
    merge_rows = [
        ['P-code', 'Population (female)', 'Population (male)', 'Population (total)'],
        ['#adm1+code', '#population+f', '#population+m', '#population+total'],
        ['002', '51000', '49000', '100000'],
        ['001', '76000', '74000', '150000'],
    ]
    expected = [
        ['P-code', 'District', 'Population (female)', 'Population (male)', 'Population (total)'],
        ['#adm1+code', '#adm1+name', '#population+f', '#population+m', '#population+total'],
        ['001', 'Coast', '76000', '74000', '150000'],
        ['002', 'Plains', '51000', '49000', '100000'],
    ]
    # both spellings of the tag selection must produce identical output
    for tags in ('#population', '#population+f,#population+m,#population+total'):
        result = hxl.data(source_rows).merge_data(
            hxl.data(merge_rows), keys='#adm1+code', tags=tags)
        self.assertEqual(expected[0], result.headers)
        self.assertEqual(expected[1], result.display_tags)
        self.assertEqual(expected[2:], result.values)
def is_hxl(url):
    """Try to parse as a HXL dataset.

    Args:
        url: the URL (or other source) to probe.

    Returns:
        True if the source parses as HXL, False otherwise.
    """
    try:
        # accessing .columns forces a download/parse attempt
        hxl.data(url).columns
        return True
    except Exception:
        # narrowed from a bare except: so KeyboardInterrupt/SystemExit
        # still propagate instead of being reported as "not HXL"
        return False
def assertRowErrors(self, row_values, errors_expected, schema_values=None, columns=None):
    """Build a HXL row, validate it, and check the reported error count."""
    reported = []
    if schema_values is None:
        schema_values = self.DEFAULT_SCHEMA
    schema = hxl.schema(hxl.data(schema_values), callback=reported.append)
    tag_list = self.DEFAULT_COLUMNS if columns is None else columns
    row = Row(
        values=row_values,
        columns=[Column.parse(tag) for tag in tag_list],
    )
    schema.start()
    result = schema.validate_row(row)
    if errors_expected == 0:
        self.assertTrue(result)
    else:
        self.assertFalse(result)
    self.assertEqual(len(reported), errors_expected)
def countriesdata(cls, use_live=True):
    # type: (bool) -> List[Dict[Dict]]
    """ Read countries data from OCHA countries feed (falling back to file)

    Args:
        use_live (bool): Try to get use latest data from web rather than file in package. Defaults to True.

    Returns:
        List[Dict[Dict]]: Countries dictionaries
    """
    # result is memoised on the class; short-circuit when already loaded
    if cls._countriesdata is not None:
        return cls._countriesdata
    countries = None
    if use_live:
        try:
            countries = hxl.data(cls._ochaurl)
        except IOError:
            logger.exception(
                'Download from OCHA feed failed! Falling back to stored file.'
            )
    if countries is None:
        # live fetch disabled or failed: read the packaged CSV instead
        countries = hxl.data(
            script_dir_plus_file(
                'Countries & Territories Taxonomy MVP - C&T Taxonomy with HXL Tags.csv',
                Country),
            allow_local=True)
    cls.set_countriesdata(countries)
    return cls._countriesdata
def test_truthy(self):
    """A truthy rule value in a JSON schema should behave like 'true'."""
    schema_source = hxl.data(
        resolve_path('files/test_validation/truthy-schema.json'),
        allow_local=True)
    schema = hxl.schema(schema_source)
    self.assertFalse(schema.validate(hxl.data([['#sector'], ['Health']])))
    self.assertTrue(schema.validate(hxl.data([['#adm2+code'], ['xxx']])))
def test_minmax_numbers(self):
    """'is max'/'is min' row queries should pick the numeric extremes."""
    rows = [
        ['#date+year', '#affected', '#adm1'],
        ['2016', '200', 'Coast'],
        ['2016', '100', 'Plains'],
        ['2015', '300', 'Coast'],
        ['2015', '200', 'Plains'],
        ['2014', '400', 'Coast'],
        ['2014', '300', 'Plains'],
    ]
    # row 5 holds the maximum (400); row 2 holds the minimum (100)
    self.assertEqual([rows[5]], hxl.data(rows).with_rows('#affected is max').values)
    self.assertEqual([rows[2]], hxl.data(rows).with_rows('#affected is min').values)
def test_blank_merge(self):
    """A blank key cell must not stop the merge from matching the later key column."""
    left = hxl.data([
        ['#sector+list', '#org+name', '#org+name'],
        ['Health', '', 'Red Cross'],
    ])
    right = hxl.data([
        ['#org+name', '#org+code'],
        ['XX', 'YY'],
        ['Red Cross', 'IFRC'],
    ])
    merged = left.merge_data(right, '#org+name', '#org+code')
    self.assertEqual([['Health', '', 'Red Cross', 'IFRC']], merged.values)
def test_empty_header_row(self):
    """Test for exception parsing an empty header row."""
    rows = [
        [],
        ['X', 'Y'],
        ['#adm1', '#affected'],
        ['Coast', '100'],
    ]
    # forcing column parsing must not raise despite the leading empty row
    hxl.data(rows).columns
def test_remote_google(self):
    """Test reading from a Google Sheet (will fail without connectivity)."""
    # first the default tab, then a specific tab selected via the URL hash
    for url in (URL_GOOGLE_NOHASH, URL_GOOGLE_HASH):
        with hxl.data(url, timeout=5) as source:
            self.compare_input(source)
def test_values_displaced_key(self):
    """merge_data() must scan every candidate key column, not only the first."""
    left = hxl.data([
        ['#sector+list', '#org+name', '#org+name'],
        ['Health', 'xxx', 'Red Cross'],
    ])
    right = hxl.data([
        ['#org+name', '#org+code'],
        ['XX', 'YY'],
        ['Red Cross', 'IFRC'],
    ])
    merged = left.merge_data(right, '#org+name', '#org+code')
    self.assertEqual([['Health', 'xxx', 'Red Cross', 'IFRC']], merged.values)
def test_aggregates_mixed_types(self):
    """Test selecting on rows with mixed datatypes."""
    rows = [
        ["#affected"],
        ["1"],
        [2],
        ["N/A"],
    ]
    dataset = hxl.data(rows)
    # numeric-looking values sort below non-numeric ones
    self.assertEqual([["1"]], dataset.with_rows('#affected is min').values)
    self.assertEqual([["N/A"]], hxl.data(rows).with_rows('#affected is max').values)
def test_remote_google(self):
    """Test reading from a Google Sheet (will fail without connectivity)."""
    # default tab first, then a specific tab
    for sheet_url in (URL_GOOGLE_NOHASH, URL_GOOGLE_HASH):
        with hxl.data(sheet_url) as source:
            self.compare_input(source)
def test_optional_params(self):
    """make_input() and hxl.data() should both accept the optional HTTP kwargs."""
    url = 'https://data.humdata.org/dataset/hxl-master-vocabulary-list/resource/d22dd1b6-2ff0-47ab-85c6-08aeb911a832'
    http_options = dict(
        verify_ssl=True,
        timeout=30,
        http_headers={'User-Agent': 'libhxl-python'},
    )
    hxl.input.make_input(url, **http_options)
    hxl.data(url, **http_options)
def test_aggregator_dates(self):
    """count() min()/max() aggregators should compare values chronologically."""
    rows = [
        ['#event', '#date'],
        ['Flood', '2017-01-10 00:00:00'],
        ['Flood', '1 Jan 2018'],
        ['Flood', '06/30/2018'],
    ]
    # earliest date wins for min(); original formatting is preserved
    self.assertEqual(
        [['Flood', '2017-01-10 00:00:00']],
        hxl.data(rows).count('event', 'min(#date)').values)
    # latest date wins for max()
    self.assertEqual(
        [['Flood', '06/30/2018']],
        hxl.data(rows).count('event', 'max(#date)').values)
def test_minmax_numbers(self):
    """Row queries '#affected is max' / 'is min' select the numeric extremes."""
    rows = [
        ['#date+year', '#affected', '#adm1'],
        ['2016', '200', 'Coast'],
        ['2016', '100', 'Plains'],
        ['2015', '300', 'Coast'],
        ['2015', '200', 'Plains'],
        ['2014', '400', 'Coast'],
        ['2014', '300', 'Plains'],
    ]
    max_rows = hxl.data(rows).with_rows('#affected is max').values
    min_rows = hxl.data(rows).with_rows('#affected is min').values
    self.assertEqual(max_rows, [rows[5]])
    self.assertEqual(min_rows, [rows[2]])
def test_truthy(self):
    """Truthy schema values loaded from JSON should behave like booleans."""
    schema_path = resolve_path('files/test_validation/truthy-schema.json')
    schema = hxl.schema(hxl.data(schema_path, allow_local=True))
    bad_rows = [
        ['#sector'],
        ['Health'],
    ]
    good_rows = [
        ['#adm2+code'],
        ['xxx'],
    ]
    self.assertFalse(schema.validate(hxl.data(bad_rows)))
    self.assertTrue(schema.validate(hxl.data(good_rows)))
def test_aggregator_strings(self):
    """count() min()/max() aggregators fall back to string comparison."""
    rows = [['#event', '#sector'], ['Flood', 'Food'], ['Flood', 'Health'], ['Flood', 'Education']]
    # 'Education' sorts first lexically ...
    self.assertEqual([['Flood', 'Education']], hxl.data(rows).count('event', 'min(#sector)').values)
    # ... and 'Health' sorts last
    self.assertEqual([['Flood', 'Health']], hxl.data(rows).count('event', 'max(#sector)').values)
def test_json_selector(self):
    """Both legacy selectors and JSONPath selectors should pick the right branch.

    Uses 'inp' instead of 'input' for the context-manager target to avoid
    shadowing the builtin input().
    """
    SEL1_DATA = [["Coast", "100"]]
    SEL2_DATA = [["Plains", "200"]]
    # make sure legacy selectors still work
    with make_input(FILE_JSON_SELECTOR, True, selector="sel1") as inp:
        self.assertEqual(SEL1_DATA, hxl.data(inp).values)
    with make_input(FILE_JSON_SELECTOR, True, selector="sel2") as inp:
        self.assertEqual(SEL2_DATA, hxl.data(inp).values)
    # test JSONPath support
    with make_input(FILE_JSON_SELECTOR, True, selector="$.sel1") as inp:
        self.assertEqual(SEL1_DATA, hxl.data(inp).values)
def test_min_year(self):
    """min() should treat bare years as comparable dates."""
    rows = [['#date'], ['2018'], ['2017']]
    self.assertEqual('2017', hxl.data(rows).min('#date'))
def test_min_date(self):
    """min() should compare mixed date formats chronologically."""
    rows = [['#date'], ['2018-01-01'], ['1/1/2019']]
    self.assertEqual('2018-01-01', hxl.data(rows).min('#date'))
def assertDatasetErrors(self, dataset, errors_expected, schema=None):
    """Validate a whole dataset and assert how many errors the callback sees."""
    collected = []
    if schema is None:
        schema = self.SCHEMA
    schema = hxl.schema(schema, collected.append)
    result = schema.validate(hxl.data(dataset))
    if errors_expected == 0:
        self.assertTrue(result)
    else:
        self.assertFalse(result)
    self.assertEqual(len(collected), errors_expected)
def test_blank_merge(self):
    """A blank value in one key column must not block a match via the other."""
    merged = hxl.data([
        ['#sector', '#org+name', '#org+name'],
        ['Health', '', 'Red Cross'],
    ]).merge_data(
        hxl.data([
            ['#org+name', '#org+code'],
            ['XX', 'YY'],
            ['Red Cross', 'IFRC'],
        ]),
        '#org+name',
        '#org+code')
    self.assertEqual([['Health', '', 'Red Cross', 'IFRC']], merged.values)
def convert(self, url_or_filename, allow_local=False):
    """Top-level method to convert a HXLated 3W to IATI.

    Args:
        url_or_filename: the data source (usually a URL).
        allow_local: pass True to permit reading local files.
    """
    # open the data source
    source = hxl.data(url_or_filename, allow_local=allow_local)
    # fail early if any required hashtag is missing
    check_hashtags(source, self.REQUIRED_HASHTAGS)
    self.xmlout.start_document()
    root_attributes = {
        "generated-datetime": datetime.datetime.now().isoformat(),
        "version": "2.03",
    }
    self.xmlout.start_block("iati-activities", root_attributes)
    for row in source:
        self.do_activity(row)
    self.xmlout.end_block("iati-activities")
    self.xmlout.end_document()
def process_dataset(dataset):
    """Do something with a dataset tagged hxl.

    Builds a summary record listing each resource's HXL columns.

    Args:
        dataset: a CKAN-style dataset dict with 'name', 'title',
            'dataset_source' and 'resources' keys.

    Returns:
        The record dict if at least one resource parsed as HXL; False if no
        resource did; None if a resource failed to parse (interface preserved
        from the original).
    """
    record = {
        'type': 'dataset',
        'name': dataset['name'],
        'title': dataset['title'],
        'source': dataset['dataset_source'],
        'resources': []
    }
    for resource in dataset['resources']:
        try:
            resource_info = {
                'type': 'resource',
                'name': resource['name'],
                'url': resource['url'],
                'columns': []
            }
            # forcing .columns triggers the download/parse
            columns = hxl.data(resource['url']).columns
            resource_info['columns'].append([{
                'tag': column.tag,
                'display_tag': column.display_tag,
                'attributes': list(column.attributes)
            } for column in columns])
            record['resources'].append(resource_info)
        except Exception:
            # narrowed from a bare except: so KeyboardInterrupt still aborts the scan
            print("    Skipped {} (not valid HXL)".format(resource['name']), file=sys.stderr)
            return None
    if record['resources']:
        return record
    else:
        return False
def test_repeat(self):
    """A cache filter must be safely iterable more than once."""
    cached = hxl.data(DATA).cache()
    first_pass = [row.values for row in cached]
    second_pass = [row.values for row in cached]
    self.assertEqual(3, len(first_pass))
    self.assertEqual(first_pass, second_pass)
def test_row_count(self):
    """Count the logical (non-header) rows in the CSV fixture."""
    with hxl.data(FILE_CSV, True) as source:
        # iterate the whole dataset, counting logical rows
        count = sum(1 for _ in source)
    self.assertEqual(TestParser.EXPECTED_ROW_COUNT, count)
def test_repeat_sub(self):
    """A cache filter feeding another filter must be re-iterable."""
    filtered = hxl.data(DATA).cache().with_rows('org=NGO A')
    pass1 = [row.values for row in filtered]
    pass2 = [row.values for row in filtered]
    self.assertEqual(1, len(pass1))
    self.assertEqual(pass1, pass2)
def readXlsx(fileLocation):
    # Download an XLSX workbook from a URL, convert its active sheet to a HXL
    # dataset, and run it through processHXLData.  Returns the processed
    # output, or False on any download/parse failure.
    # (Python 2 code: print statements and basestring.)
    print "Trying to download XLSX"
    try:
        response = urlopen(fileLocation)
        try:
            print "Reading XLSX"
            wb = load_workbook(BytesIO(response.read()))
        except:
            print "Error reading " + str(fileLocation)
            return False
        sheet = wb.active
        data = {}
    except URLError as e:
        # NOTE(review): on a download failure execution falls through to the
        # second try-block with 'sheet' undefined; the resulting NameError is
        # caught below and the function returns False.
        print("XLS Failed to download")
    try:
        rows_iter = sheet.iter_rows(min_col=1, min_row=1, max_col=sheet.max_column, max_row=sheet.max_row)
        dataset = [[cell.value for cell in row] for row in rows_iter]
        for i, row in enumerate(dataset):
            for j, cell in enumerate(dataset[i]):
                if isinstance(cell, datetime.date):
                    # normalise dates to US-style strings
                    dataset[i][j] = cell.strftime('%m/%d/%Y')
                elif isinstance(cell, basestring):
                    # drop non-ASCII characters
                    dataset[i][j] = cell.encode('ascii', 'ignore')
        dataset = hxl.data(dataset).cache()
        output = processHXLData(dataset)
        print "HXL output"
        return output
    except Exception as e:
        print e
        return False
def read_external_filter(self, datasetinfo):
    # type: (Dict) -> Tuple[List[str],Iterator[Union[List,Dict]]]
    """Read a filter list from an external URL pointing to a HXLated file.

    Populates self.filters (header -> list of country codes) as a side
    effect.  NOTE(review): the type comment above claims a Tuple return,
    but every path in the body returns None — confirm which is intended.

    Args:
        datasetinfo (Dict): Dictionary of information about dataset

    Returns:
        None
    """
    external_filter = datasetinfo.get('external_filter')
    if not external_filter:
        return
    hxltags = external_filter['hxltags']
    data = hxl.data(external_filter['url'])
    use_hxl = datasetinfo.get('use_hxl', False)
    for row in data:
        for hxltag in data.columns:
            if hxltag.display_tag in hxltags:
                # key the filter by HXL tag or by human-readable header
                if use_hxl:
                    header = hxltag.display_tag
                else:
                    header = hxltag.header
                dict_of_lists_add(self.filters, header, row.get('#country+code'))
def readXls(fileLocation):
    # Download a legacy XLS workbook from a URL, convert its first sheet to a
    # HXL dataset, and run it through processHXLData.  Returns the processed
    # output, or False on any download/parse failure.
    # (Python 2 code: print statements and basestring.)
    print "Trying to download XLS"
    try:
        response = urlopen(fileLocation).read()
        try:
            print "Reading XLS"
            wb = xlrd.open_workbook(file_contents=response)
        except Exception as e:
            print e
            print "Error reading " + str(fileLocation)
            return False
        xl_sheet = wb.sheet_by_index(0)
    except URLError as e:
        # NOTE(review): after a download failure 'xl_sheet' is undefined; the
        # NameError raised below is caught and the function returns False.
        print("XLS Failed to download")
    try:
        dataset = []
        for row in range(0, xl_sheet.nrows):
            r = []
            for col in range(0, xl_sheet.ncols):
                if isinstance(xl_sheet.cell_value(row, col), basestring):
                    # drop non-ASCII characters
                    r.append(
                        xl_sheet.cell_value(row, col).encode('ascii', 'ignore'))
                else:
                    r.append(xl_sheet.cell_value(row, col))
                #if isinstance(cell, datetime.date):
                #    dataset[i][j] = cell.strftime('%m/%d/%Y')
            dataset.append(r)
        dataset = hxl.data(dataset).cache()
        output = processHXLData(dataset)
        print "HXL output"
        return output
    except Exception as e:
        print e
        return False
def test_write_json_attribute_normalisation(self):
    """write_json should emit HXL attributes in normalised order."""
    source = hxl.data([['#sector+es+cluster'], ['Hygiene']])
    buffer = StringIO()
    hxl.input.write_json(buffer, source, use_objects=True)
    # +es+cluster on input becomes +cluster+es on output
    self.assertEqual(
        [{'#sector+cluster+es': 'Hygiene'}],
        json.loads(buffer.getvalue()))
def test_google_row_number(self):
    """Source row numbers should be exposed when reading a Google Sheet."""
    source = hxl.data('https://docs.google.com/spreadsheets/d/1rOO0-xYa3kIOfI-6KR-mLgMTdgIEijNxM52Nfhs8uvg/edit#gid=0')
    for row in source:
        self.assertIsNotNone(row.source_row_number)
        # there are two header rows and the hashtags
        self.assertEqual(row.source_row_number, row.row_number + 1)
        for position, column in enumerate(row.columns):
            self.assertEqual(position, column.column_number)
def test_repeat_sub(self):
    """Repeat iteration over a cache filter that backs another filter."""
    filtered = hxl.data(DATA).cache().with_rows('org=NGO A')
    first = [row.values for row in filtered]
    second = [row.values for row in filtered]
    self.assertEqual(2, len(first))
    self.assertEqual(first, second)
def test_attributes(self):
    """Each parsed column should carry exactly the expected attribute set."""
    with hxl.data(FILE_CSV, True) as source:
        for row in source:
            for column_number, column in enumerate(row.columns):
                expected = set(TestParser.EXPECTED_ATTRIBUTES[column_number])
                self.assertEqual(expected, column.attributes)
def test_write_json_objects(self):
    """write_json with use_objects should reproduce the JSON-objects fixture.

    The fixture handle is named 'expected_file' to avoid shadowing the
    builtin input().
    """
    with open(FILE_JSON_OBJECTS_OUT) as expected_file:
        expected = expected_file.read()
    buffer = StringIO()
    with hxl.data(FILE_CSV, True) as source:
        hxl.input.write_json(buffer, source, use_objects=True)
    self.assertEqual(expected, buffer.getvalue())
def test_repeat(self):
    """Repeat iteration over a cache filter directly."""
    cached = hxl.data(DATA).cache()
    first = [row.values for row in cached]
    second = [row.values for row in cached]
    self.assertEqual(4, len(first))
    self.assertEqual(first, second)
def test_filter(self):
    """Confirm that the JSONPath implementation supports filters."""
    rows = [
        ["#x"],
        ['[{"a": 1, "b": 2}, {"a": 3, "b": 4}]'],
    ]
    # the filter selects the object whose 'a' is 1 and extracts its 'b'
    result = hxl.data(rows).jsonpath('$[?(@.a=1)].b').values
    self.assertEqual("2", result[0][0])
def test_write_json_objects(self):
    """write_json with use_objects should reproduce the JSON-objects fixture.

    The fixture handle is named 'expected_file' to avoid shadowing the
    builtin input().
    """
    with open(FILE_JSON_OBJECTS_OUT) as expected_file:
        expected = expected_file.read()
    buffer = StringIO()
    with hxl.data(FILE_CSV, True) as source:
        hxl.io.write_json(buffer, source, use_objects=True)
    self.assertEqual(expected, buffer.getvalue())
def test_outliers(self):
    """Statistical outliers should be flagged by #valid_value+outliers."""
    bad_values = ['1', '1000000']
    raw_data = [
        ['#affected'],
        ['1'],
        ['1000000'],
    ]
    # pad with 30 unremarkable rows so the two extremes stand out
    raw_data += [['100'], ['200'], ['800']] * 10
    seen = []
    def callback(e):
        self.assertTrue(e.value in bad_values)
        seen.append(e)
    schema = hxl.schema([
        ['#valid_tag', '#valid_value+outliers'],
        ['#affected', 'true'],
    ], callback=callback)
    self.assertFalse(schema.validate(hxl.data(raw_data)))
    # the callback must actually have fired
    self.assertTrue(len(seen) > 0)
def test_values_displaced_key(self):
    """Test that the filter scans all candidate keys."""
    merged = hxl.data([
        ['#sector+list', '#org+name', '#org+name'],
        ['Health', 'xxx', 'Red Cross'],
    ]).merge_data(
        hxl.data([
            ['#org+name', '#org+code'],
            ['XX', 'YY'],
            ['Red Cross', 'IFRC'],
        ]),
        '#org+name',
        '#org+code')
    self.assertEqual([['Health', 'xxx', 'Red Cross', 'IFRC']], merged.values)
def test_write_csv(self):
    """write_hxl output must match the CSV fixture byte-for-byte (CRLF included).

    The fixture handle is named 'expected_file' to avoid shadowing the
    builtin input().
    """
    # read the fixture as bytes to preserve CRLF line endings
    with open(FILE_CSV_OUT, 'rb') as expected_file:
        expected = expected_file.read()
    buffer = StringIO()
    with hxl.data(FILE_CSV, True) as source:
        hxl.io.write_hxl(buffer, source)
    # Need to work with bytes to handle CRLF
    self.assertEqual(expected, buffer.getvalue().encode('utf-8'))
def test_taxonomy_bad(self):
    """A value outside the external taxonomy is a hard validation error."""
    schema = hxl.schema(SCHEMA_TAXONOMY)
    report = hxl.validate(hxl.data(DATA_TAXONOMY_BAD), schema)
    self.assertFalse(report['is_valid'])
    self.assertEqual(1, report['stats']['error'])
    self.assertEqual(0, report['stats']['external'])
    self.assertEqual(1, len(report['issues']))
    self.assertEqual(0, len(report['external_issues']))
def add_append_filter(source, args, index):
    """Add the hxlappend filter to the end of the chain."""
    exclude_columns = args.get('append-exclude-columns%02d' % index, False)
    # scan every possible sub-dataset slot; gaps in the numbering are tolerated
    for slot in range(1, 100):
        dataset_url = args.get('append-dataset%02d-%02d' % (index, slot))
        if dataset_url:
            source = source.append(hxl.data(dataset_url), not exclude_columns)
    return source
def add_merge_filter(source, args, index):
    """Add the hxlmerge filter to the end of the pipeline."""
    tag_patterns = hxl.TagPattern.parse_list(args.get('merge-tags%02d' % index, []))
    key_patterns = hxl.TagPattern.parse_list(args.get('merge-keys%02d' % index, []))
    # checkbox-style parameters arrive as the literal string 'on'
    replace = args.get('merge-replace%02d' % index) == 'on'
    overwrite = args.get('merge-overwrite%02d' % index) == 'on'
    merge_source = hxl.data(args.get('merge-url%02d' % index))
    return source.merge_data(
        merge_source,
        keys=key_patterns,
        tags=tag_patterns,
        replace=replace,
        overwrite=overwrite)
def test_wide_data(self):
    """Test for very wide data."""
    specs = [
        ('cod_wardsr', '#adm3+code',),
        ('food_monthly', '#value+expenditure+food_monthly',),
    ]
    path = resolve_path("files/test_converters/wide-tagging-test.csv")
    tagger = hxl.converters.Tagger(hxl.io.make_input(path, allow_local=True), specs)
    source = hxl.data(tagger).cache()
    self.assertIn('#value+expenditure+food_monthly', source.display_tags)
def test_latlon(self):
    """clean_data(latlon=...) should normalise DMS coordinates to decimal degrees."""
    rows = [
        ['#foo', '#geo+lat', '#geo+lon', '#geo+coord'],
        ['75W 30 00', '45N 30 00', '75W 30 00', '45N 30 00,75W 30 00'],
    ]
    # the untagged #foo column is left untouched
    expected = [
        ['75W 30 00', '45.5000', '-75.5000', '45.5000,-75.5000'],
    ]
    self.assertEqual(expected, hxl.data(rows).clean_data(latlon='geo').values)
def test_taxonomy_missing(self):
    """Handle a missing external taxonomy."""
    schema = hxl.schema(SCHEMA_TAXONOMY_MISSING)
    report = hxl.validate(hxl.data(DATA_TAXONOMY_GOOD), schema)
    # the dataset still validates; the problem is downgraded to an external issue
    self.assertTrue(report['is_valid'])
    self.assertIn('external_issues', report)
    self.assertEqual(0, report['stats']['error'])
    self.assertEqual(1, report['stats']['external'])
    self.assertEqual(0, len(report['issues']))
    self.assertEqual(1, len(report['external_issues']))
def test_aggregator_strings(self):
    """count() with min()/max() should order plain strings lexically."""
    rows = [
        ['#event', '#sector'],
        ['Flood', 'Food'],
        ['Flood', 'Health'],
        ['Flood', 'Education'],
    ]
    result_min = hxl.data(rows).count('event', 'min(#sector)').values
    result_max = hxl.data(rows).count('event', 'max(#sector)').values
    # 'Education' sorts first, 'Health' last
    self.assertEqual([['Flood', 'Education']], result_min)
    self.assertEqual([['Flood', 'Health']], result_max)