def test_read_parametric(sample_pridb):
    param = sample_pridb.read_parametric()
    assert len(param) == len(PARAMETRIC_EXPECTED)
    assert param.index.name == "set_id"
    assert param.index.dtype == int64
    # Note: the dict previously listed "param_id" twice; duplicate keys in a
    # dict literal silently collapse, so the redundant entry is removed here.
    assert dict(param.dtypes) == {
        "param_id": Int64Dtype(),
        "time": float64,
        "pctd": Int64Dtype(),
        "pcta": Int64Dtype(),
    }
def test_read(sample_pridb):
    df = sample_pridb.read()
    assert df.index.name == "set_id"
    assert df.index.dtype == int64
    assert dict(df.dtypes) == {
        "set_type": int64,
        "time": float64,
        "channel": Int64Dtype(),
        "param_id": Int64Dtype(),
        "threshold": float64,
        "amplitude": float64,
        "rise_time": float64,
        "cascade_counts": Int64Dtype(),
        "cascade_energy": float64,
        "cascade_hits": Int64Dtype(),
        "cascade_signal_strength": float64,
        "counts": Int64Dtype(),
        "duration": float64,
        "energy": float64,
        "rms": float64,
        "signal_strength": float64,
        "trai": Int64Dtype(),
        "pctd": Int64Dtype(),
        "pcta": Int64Dtype(),
    }
def test_read_markers(sample_pridb):
    markers = sample_pridb.read_markers()
    assert len(markers) == len(LABELS_EXPECTED)
    assert markers.index.name == "set_id"
    assert markers.index.dtype == int64
    assert dict(markers.dtypes) == {
        "time": float64,
        "set_type": Int64Dtype(),
        "number": Int64Dtype(),
        "data": dtype("O"),
    }
    labels = list(markers["data"])
    assert labels == LABELS_EXPECTED
class TemplatePipelineChain(PipelineChain):
    """
    Pipeline chain for `template` data, which outputs a table with the schema
    described below. For very simple pipelines (e.g. a single source), this
    class can be placed in the same file as the one defining the pipeline. See
    [MetadataPipelineChain] for an example of a very simple pipeline.
    """

    schema: Dict[str, type] = {
        "date": str,
        "key": str,
        "column1": Int64Dtype(),
        "column2": str,
    }
    """ Defines the schema of the output table; only the str, float and Int64
    dtypes are supported. """

    pipelines: List[Tuple[DataPipeline, Dict[str, Any]]] = [
        (
            SourceNamePipeline(),
            {"parse_opts": {}, "merge_opts": {}, "filter_func": None},
        ),
        (RExamplePipeline(), {}),
    ]
    """ Defines the pipelines to be run, in order, to produce the combined,
    full output. """
def _parse_dtype(dtype_name: str) -> type:
    if dtype_name == "str":
        return str
    if dtype_name == "int":
        return Int64Dtype()
    if dtype_name == "float":
        return float
    raise TypeError(f"Unsupported dtype: {dtype_name}")
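# A minimal usage sketch (not part of the original module, assumes a recent
# pandas): _parse_dtype maps the "int" label to the nullable Int64 extension
# dtype, so an integer column can still carry missing values after the cast.
import pandas as pd
from pandas import Int64Dtype

frame = pd.DataFrame({"count": [1, None, 3]})          # float64 with NaN
frame = frame.astype({"count": _parse_dtype("int")})   # nullable Int64; NaN becomes pd.NA
assert frame["count"].dtype == Int64Dtype()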
class EconomyPipelineChain(PipelineChain):
    schema: Dict[str, type] = {
        "key": str,
        "gdp": Int64Dtype(),
        "gdp_per_capita": Int64Dtype(),
    }

    pipelines: List[Tuple[DataPipeline, Dict[str, Any]]] = [
        (
            WikidataPipeline(),
            {
                "parse_opts": {
                    "properties": {
                        "gdp": "P2131",
                        "gdp_per_capita": "P2132",
                    }
                }
            },
        )
    ]
def integral_extension_dtypes(self):
    return (
        [
            "Int8",
            "Int16",
            "Int32",
            "Int64",
            Int8Dtype(),
            Int16Dtype(),
            Int32Dtype(),
            Int64Dtype(),
        ]
        if extension_dtypes_available
        else []
    )
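# Side note (a sketch, not from the original test suite): pandas treats the
# string alias "Int64" and the Int64Dtype() instance as the same dtype, which
# is why the fixture above can list both spellings interchangeably.
import pandas as pd
from pandas import Int64Dtype

assert pd.array([1, 2], dtype="Int64").dtype == Int64Dtype()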
def test_as_spark_type_extension_dtypes(self):
    from pandas import Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype

    type_mapper = {
        Int8Dtype(): ByteType(),
        Int16Dtype(): ShortType(),
        Int32Dtype(): IntegerType(),
        Int64Dtype(): LongType(),
    }
    for extension_dtype, spark_type in type_mapper.items():
        self.assertEqual(as_spark_type(extension_dtype), spark_type)
class GeographyPipelineChain(PipelineChain):
    schema: Dict[str, type] = {
        "key": str,
        "latitude": float,
        "longitude": float,
        "elevation": Int64Dtype(),
        "area": Int64Dtype(),
    }

    pipelines: List[Tuple[DataPipeline, Dict[str, Any]]] = [
        (
            WikidataPipeline(),
            {
                "parse_opts": {
                    "properties": {
                        "latitude": "P625",
                        "longitude": "P625",
                        "elevation": "P2044",
                        "area": "P2046",
                    }
                }
            },
        )
    ]
def spark_type_to_pandas_dtype(
    spark_type: types.DataType, *, use_extension_dtypes: bool = False
) -> Dtype:
    """Convert the given Spark DataType to a pandas dtype."""
    if use_extension_dtypes and extension_dtypes_available:
        # IntegralType
        if isinstance(spark_type, types.ByteType):
            return Int8Dtype()
        elif isinstance(spark_type, types.ShortType):
            return Int16Dtype()
        elif isinstance(spark_type, types.IntegerType):
            return Int32Dtype()
        elif isinstance(spark_type, types.LongType):
            return Int64Dtype()

        if extension_object_dtypes_available:
            # BooleanType
            if isinstance(spark_type, types.BooleanType):
                return BooleanDtype()
            # StringType
            elif isinstance(spark_type, types.StringType):
                return StringDtype()

        # FractionalType
        if extension_float_dtypes_available:
            if isinstance(spark_type, types.FloatType):
                return Float32Dtype()
            elif isinstance(spark_type, types.DoubleType):
                return Float64Dtype()

    if isinstance(
        spark_type,
        (
            types.DateType,
            types.NullType,
            types.ArrayType,
            types.MapType,
            types.StructType,
            types.UserDefinedType,
        ),
    ):
        return np.dtype("object")
    elif isinstance(spark_type, types.TimestampType):
        return np.dtype("datetime64[ns]")
    else:
        return np.dtype(to_arrow_type(spark_type).to_pandas_dtype())
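# Hedged illustration (assumes a PySpark environment where the pandas
# extension dtypes are available): integral Spark types map to nullable pandas
# extension dtypes only when use_extension_dtypes=True; otherwise the Arrow
# fallback at the bottom of the function yields plain numpy dtypes.
import numpy as np
from pandas import Int64Dtype
from pyspark.sql import types

assert spark_type_to_pandas_dtype(types.LongType(), use_extension_dtypes=True) == Int64Dtype()
assert spark_type_to_pandas_dtype(types.LongType()) == np.dtype("int64")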
class IndexPipelineChain(PipelineChain):
    schema: Dict[str, type] = {
        "key": str,
        "wikidata": str,
        "datacommons": str,
        "country_code": str,
        "country_name": str,
        "subregion1_code": str,
        "subregion1_name": str,
        "subregion2_code": str,
        "subregion2_name": str,
        "3166-1-alpha-2": str,
        "3166-1-alpha-3": str,
        "aggregation_level": Int64Dtype(),
    }

    pipelines: List[Tuple[DataPipeline, Dict[str, Any]]] = [(IndexPipeline(), {})]
def parse_dtype(dtype_name: str) -> Any:
    """
    Parse a dtype name into its pandas name. Only the following dtypes are
    supported in our table schemas:

    | column type label | pandas dtype |
    | ----------------- | ------------ |
    | str               | str          |
    | int               | Int64        |
    | float             | float        |

    Arguments:
        dtype_name: label of the dtype object
    Returns:
        type: dtype object
    """
    if dtype_name == "str":
        return "str"
    if dtype_name == "int":
        return Int64Dtype()
    if dtype_name == "float":
        return "float"
    raise TypeError(f"Unsupported dtype: {dtype_name}")
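# Illustrative only: applying parse_dtype over a schema mapping like the ones
# used by the pipeline chains in this collection. The column names and values
# below are hypothetical, chosen just to exercise all three labels.
import pandas as pd

schema = {"key": "str", "population": "int", "life_expectancy": "float"}
records = pd.DataFrame({"key": ["US"], "population": [331002651.0], "life_expectancy": [78.8]})
records = records.astype({col: parse_dtype(label) for col, label in schema.items()})
# records.dtypes -> key: object, population: Int64, life_expectancy: float64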
class DemographicsPipelineChain(PipelineChain):
    schema: Dict[str, type] = {
        "key": str,
        "population": Int64Dtype(),
        "life_expectancy": float,
        "human_development_index": float,
    }

    pipelines: List[Tuple[DataPipeline, Dict[str, Any]]] = [
        (
            WikidataPipeline(),
            {
                "parse_opts": {
                    "properties": {
                        "population": "P1082",
                        "life_expectancy": "P2250",
                        "human_development_index": "P1081",
                    }
                }
            },
        )
    ]
def test_read_hits(sample_pridb):
    hits = sample_pridb.read_hits()
    assert len(hits) == len(HITS_EXPECTED)
    assert hits.index.name == "set_id"
    assert hits.index.dtype == int64
    assert dict(hits.dtypes) == {
        "time": float64,
        "channel": Int64Dtype(),
        "param_id": Int64Dtype(),
        "threshold": float64,
        "amplitude": float64,
        "rise_time": float64,
        "cascade_counts": Int64Dtype(),
        "cascade_energy": float64,
        "cascade_hits": Int64Dtype(),
        "cascade_signal_strength": float64,
        "counts": Int64Dtype(),
        "duration": float64,
        "energy": float64,
        "rms": float64,
        "signal_strength": float64,
        "trai": Int64Dtype(),
    }
def test_london_cleaner():
    unclean_input = pd.DataFrame.from_dict(
        {
            "Place (Overall)": [12547, 34146],
            "Place (Gender)": [9390, 20833],
            "Place (Category)": [4345, 3132],
            "Name": ["»A Smith, Matthew (GBR) \n", "»Aalders, Jennifer (GBR) \n"],
            "Sex": ["M", "W"],
            "Club": ["Lymm Runners", "Tynny Trotters"],
            "Running Number": ["Runner Number40546", "Runner Number23235"],
            "Category": ["18-39", pd.NA],
            "Finish": ["0 days 03:59:33", "0 days 06:22:20"],
            "Year": [2021, 2021],
        }
    )
    exp_output = pd.DataFrame.from_dict(
        {
            "Place (Overall)": [12547, 34146],
            "Place (Gender)": [9390, 20833],
            "Place (Category)": [4345, 3132],
            "Name": ["A Smith Matthew", "Aalders Jennifer"],
            "Sex": ["M", "F"],
            "Club": ["Lymm Runners", "Tynny Trotters"],
            "Running Number": ["40546", "23235"],
            "Category": ["18-39", "Unknown"],
            "Finish": [
                pd.Timedelta("0 days 03:59:33"),
                pd.Timedelta("0 days 06:22:20"),
            ],
            "Year": [2021, 2021],
            "Country": ["GBR", "GBR"],
            "FirstName": ["Matthew", "Jennifer"],
            "LastName": ["A Smith", "Aalders"],
            "DSQ": [False, False],
            "Finish (Total Seconds)": [14373.0, 22940.0],
        }
    ).astype(
        {
            "Place (Overall)": Int64Dtype(),
            "Place (Gender)": Int64Dtype(),
            "Place (Category)": Int64Dtype(),
            "Name": dtype("O"),
            "Sex": dtype("O"),
            "Club": dtype("O"),
            "Running Number": dtype("O"),
            "Category": CategoricalDtype(
                categories=[
                    "18-39",
                    "40-44",
                    "45-49",
                    "50-54",
                    "55-59",
                    "60-64",
                    "65-69",
                    "70+",
                    "70-74",
                    "75-79",
                    "80-84",
                    "85+",
                    "80+",
                    "Unknown",
                ],
                ordered=False,
            ),
            "Finish": dtype("<m8[ns]"),
            "Year": Int64Dtype(),
            "Country": dtype("O"),
            "FirstName": dtype("O"),
            "LastName": dtype("O"),
            "DSQ": dtype("bool"),
            "Finish (Total Seconds)": dtype("float64"),
        }
    )
    actual_output = london_cleaner(unclean_input)
    pd.testing.assert_frame_equal(actual_output, exp_output, check_categorical=False)
def test_output_attributes(scraper_output):
    results = scraper_output
    exp_cols = [
        "Place (Overall)",
        "Place (Gender)",
        "Place (Category)",
        "Name",
        "Sex",
        "Club",
        "Running Number",
        "Category",
        "Finish",
        "Year",
        "Country",
        "FirstName",
        "LastName",
        "DSQ",
        "Finish (Total Seconds)",
    ]
    exp_dtypes = pd.Series(
        {
            "Place (Overall)": Int64Dtype(),
            "Place (Gender)": Int64Dtype(),
            "Place (Category)": dtype("float64"),
            "Name": dtype("O"),
            "Sex": dtype("O"),
            "Club": dtype("O"),
            "Running Number": dtype("O"),
            "Category": CategoricalDtype(
                categories=[
                    "18-39",
                    "40-44",
                    "45-49",
                    "50-54",
                    "55-59",
                    "60-64",
                    "65-69",
                    "70+",
                    "70-74",
                    "75-79",
                    "80-84",
                    "85+",
                    "80+",
                    "Unknown",
                ],
                ordered=False,
            ),
            "Finish": dtype("<m8[ns]"),
            "Year": Int64Dtype(),
            "Country": dtype("O"),
            "FirstName": dtype("O"),
            "LastName": dtype("O"),
            "DSQ": dtype("bool"),
            "Finish (Total Seconds)": dtype("float64"),
        }
    )
    exp_rows_min = 1000  # One sex for one year should give at least this many
    assert exp_cols == list(results.columns), "Expected columns not found"
    assert exp_rows_min <= results.shape[0], "Fewer than minimum expected number of rows"
    assert exp_dtypes.values.tolist() == results.dtypes.values.tolist()
class EpidemiologyPipelineChain(PipelineChain):
    schema: Dict[str, type] = {
        "date": str,
        "key": str,
        "new_confirmed": Int64Dtype(),
        "new_deceased": Int64Dtype(),
        "new_recovered": Int64Dtype(),
        "new_tested": Int64Dtype(),
        "total_confirmed": Int64Dtype(),
        "total_deceased": Int64Dtype(),
        "total_recovered": Int64Dtype(),
        "total_tested": Int64Dtype(),
    }

    pipelines: List[Tuple[DataPipeline, Dict[str, Any]]] = [
        # Start with yesterday's data to make sure that we carry over datapoints in case the data
        # source has gone offline or is temporarily unavailable
        # (OpenCovid19Pipeline(), {}),
        # Data sources for all countries, level 1
        (OurWorldInDataPipeline(), {}),
        (ECDCPipeline(), {}),
        # Data sources for AR level 2
        (
            WikipediaPipeline(
                "{}/{}/Argentina_medical_cases".format(_wiki_base_url, _wiki_template_path)
            ),
            {
                "parse_opts": {
                    "date_format": "%d %b",
                    "country": "AR",
                    "skiprows": 1,
                    "cumsum": True,
                }
            },
        ),
        # Data sources for AT level 2
        (
            Covid19EuDataPipeline("AT"),
            # Remove dates with known bad data
            # TODO: apply patch to make up for missing dates
            {"filter_func": lambda x: x.date not in ["2020-04-14", "2020-04-15"]},
        ),
        # Data sources for AU level 2
        (Covid19AuPipeline(), {}),
        (
            WikipediaPipeline(
                "{}/{}/Australia_medical_cases".format(_wiki_base_url, _wiki_template_path)
            ),
            {"parse_opts": {"date_format": "%d %B", "country": "AU", "cumsum": True}},
        ),
        # Data sources for BO level 2
        (
            WikipediaPipeline(
                "{}/{}/Bolivia_medical_cases".format(_wiki_base_url, _wiki_template_path)
            ),
            {
                "parse_opts": {
                    "date_format": "%b %d",
                    "country": "BO",
                    "skiprows": 1,
                    "droprows": "Date(2020)",
                }
            },
        ),
        # Data sources for BR level 2
        (Covid19BrazilTimeseriesPipeline(), {}),
        # Data sources for CA level 2
        (CanadaPipeline(), {}),
        # Data sources for CH level 2
        (OpenZHPipeline(), {}),
        # Data sources for CL level 2
        (
            WikipediaPipeline(
                "{}/{}/Chile_medical_cases".format(_wiki_base_url, _wiki_template_path)
            ),
            {"parse_opts": {"date_format": "%Y-%m-%d", "country": "CL", "skiprows": 1}},
        ),
        # Data sources for CN level 2
        (DXYPipeline(), {"parse_opts": {"country_name": "China"}}),
        # Data sources for CO levels 2 + 3
        (ColombiaPipeline(), {}),
        # Data sources for CZ level 2
        (Covid19EuDataPipeline("CZ"), {}),
        # Data sources for DE level 2
        (Covid19GermanyPipeline(), {}),
        # Data sources for ES levels 1 + 2
        # (DatadistaPipeline(), {}),
        (ISCIIIPipeline(), {}),
        # Data sources for FR level 2
        (
            WikipediaPipeline(
                "{}/{}/France_medical_cases".format(_wiki_base_url, _wiki_template_path)
            ),
            {"parse_opts": {"date_format": "%Y-%m-%d", "country": "FR", "skiprows": 1}},
        ),
        (FranceCovid19Pipeline(), {}),
        # Data sources for GB levels 2 + 3
        (Covid19UkDataL2Pipeline(), {}),
        (Covid19UkDataL3Pipeline(), {}),
        # Data sources for ID level 2
        (CatchmeupPipeline(), {}),
        # Data sources for IN level 2
        (
            WikipediaPipeline("{}/2020_coronavirus_pandemic_in_India".format(_wiki_base_url)),
            {"parse_opts": {"date_format": "%b-%d", "country": "IN", "skiprows": 1}},
        ),
        # Data sources for IT level 2
        (PcmDpcL1Pipeline(), {}),
        (PcmDpcL2Pipeline(), {}),
        # Data sources for JP level 2
        (
            WikipediaPipeline(
                "{}/{}/Japan_medical_cases".format(_wiki_base_url, _wiki_template_path)
            ),
            {"parse_opts": {"date_format": "%Y/%m/%d", "country": "JP", "skiprows": 2}},
        ),
        (Jp2019NcovJapanByDate(), {}),
        # Data sources for KR level 2
        (
            WikipediaPipeline(
                "{}/{}/South_Korea_medical_cases".format(_wiki_base_url, _wiki_template_path)
            ),
            {"parse_opts": {"date_format": "%Y-%m-%d", "country": "KR", "skiprows": 1}},
        ),
        # Data sources for MY level 2
        (
            WikipediaPipeline("{}/2020_coronavirus_pandemic_in_Malaysia".format(_wiki_base_url)),
            {
                "parse_opts": {
                    "date_format": "%d/%m",
                    "country": "MY",
                    "cumsum": True,
                    "drop_column": "deceased",
                }
            },
        ),
        # Data sources for MX level 2
        (MexicoCovid19Pipeline(), {}),
        # Data sources for NL levels 2 + 3
        (CoronaWatchNlPipeline(), {}),
        # Data sources for NO level 2
        (Covid19EuDataPipeline("NO"), {}),
        # Data sources for PE level 2
        (
            WikipediaPipeline(
                "https://es.wikipedia.org/wiki/Pandemia_de_enfermedad_por_coronavirus_de_2020_en_Per%C3%BA"
            ),
            {
                "parse_opts": {
                    "date_format": "%d de %B",
                    "country": "PE",
                    "locale": "es_ES",
                    "skiprows": 1,
                }
            },
        ),
        # Data sources for PK level 2
        (
            WikipediaPipeline(
                "{}/{}/Pakistan_medical_cases".format(_wiki_base_url, _wiki_template_path)
            ),
            {
                "parse_opts": {
                    "date_format": "%b %d",
                    "country": "PK",
                    "skiprows": 1,
                    "cumsum": True,
                }
            },
        ),
        # Data sources for PL level 2
        (Covid19EuDataPipeline("PL"), {}),
        # Data sources for PT level 2
        (Covid19PtPipeline(), {}),
        # Data sources for RU level 2
        (
            WikipediaPipeline(
                "{}/{}/Russia_medical_cases".format(_wiki_base_url, _wiki_template_path)
            ),
            {"parse_opts": {"date_format": "%d %b", "country": "RU", "skiprows": 1}},
        ),
        # Data sources for SE level 2
        (Covid19EuDataPipeline("SE"), {}),
        # Data sources for SI level 1
        (SloveniaPipeline(), {}),
        # Data sources for US levels 2 + 3
        (CovidTrackingPipeline(), {}),
        (NytCovidL2Pipeline(), {}),
        (NytCovidL3Pipeline(), {}),
    ]
def test_registry_byte_size_dtype(sound_subreg):
    from pandas import Int64Dtype

    assert sound_subreg["byte_size"].dtype == Int64Dtype()
class OxfordGovernmentResponsePipelineChain(PipelineChain):
    schema: Dict[str, type] = {
        "date": str,
        "key": str,
        "school_closing": Int64Dtype(),
        "workplace_closing": Int64Dtype(),
        "cancel_public_events": Int64Dtype(),
        "restrictions_on_gatherings": Int64Dtype(),
        "public_transport_closing": Int64Dtype(),
        "stay_at_home_requirements": Int64Dtype(),
        "restrictions_on_internal_movement": Int64Dtype(),
        "international_travel_controls": Int64Dtype(),
        "income_support": Int64Dtype(),
        "debt_relief": Int64Dtype(),
        "fiscal_measures": Int64Dtype(),
        "international_support": Int64Dtype(),
        "public_information_campaigns": Int64Dtype(),
        "testing_policy": Int64Dtype(),
        "contact_tracing": Int64Dtype(),
        "emergency_investment_in_healthcare": Int64Dtype(),
        "investment_in_vaccines": Int64Dtype(),
        "stringency_index": float,
    }

    pipelines: List[Tuple[DataPipeline, Dict[str, Any]]] = [
        (OxfordGovernmentResponsePipeline(), {})
    ]
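# A brief sketch (not from any of the snippets above) of why these schemas use
# Int64Dtype() rather than plain int: the nullable Int64 extension dtype keeps
# integer columns intact when merged sources leave gaps, whereas numpy int64
# columns would be upcast to float64 as soon as a missing value appears.
import pandas as pd
from pandas import Int64Dtype

s = pd.Series([1, None, 3], dtype="Int64")
assert s.dtype == Int64Dtype()
assert s.isna().sum() == 1   # missing value preserved as pd.NA, not NaN
assert s.dropna().sum() == 4  # arithmetic stays integer-valued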