def test_different_all(self):
    """difference_all over three tables keeps only rows unique to the first."""
    frames = {
        "FT1": {"NUM_ENQ": ["1", "2", "3"],
                "EXE_SOI_DTD": ["01/01/2015", "01/02/2016", "01/03/2017"]},
        "FT2": {"NUM_ENQ": ["1"], "EXE_SOI_DTD": ["01/01/2015"]},
        "FT3": {"NUM_ENQ": ["2"], "EXE_SOI_DTD": ["01/02/2016"]},
    }
    keys = ["NUM_ENQ", "EXE_SOI_DTD"]
    tables = [
        FlatTable(name, self.spark.createDataFrame(pd.DataFrame(data)),
                  name, keys, {})
        for name, data in frames.items()
    ]
    result = FlatTable.difference_all(tables, keys)
    # Row ("3", "01/03/2017") appears only in FT1, so it is the whole result.
    expected_df = self.spark.createDataFrame(
        pd.DataFrame(
            OrderedDict([("NUM_ENQ", ["3"]),
                         ("EXE_SOI_DTD", ["01/03/2017"])])))
    expected = FlatTable("result", expected_df, "result", keys, {})
    self.assertEqual(expected, result)
def test_getitem(self):
    """__getitem__ projects a single column; non-string keys raise TypeError."""
    source = self.spark.createDataFrame(
        pd.DataFrame({
            "NUM_ENQ": ["1", "2", "3"],
            "EXE_SOI_DTD": ["01/01/2015", "01/02/2016", "01/03/2017"],
        }))
    ft1 = FlatTable("FT1", source, "FT1", ["NUM_ENQ", "EXE_SOI_DTD"], {})
    projected = FlatTable("FT2", ft1["NUM_ENQ"], "FT2", ["NUM_ENQ"], {})
    expected_df = self.spark.createDataFrame(
        pd.DataFrame({"NUM_ENQ": ["1", "2", "3"]}))
    expected = FlatTable("FT3", expected_df, "FT3", ["NUM_ENQ"], {})
    self.assertEqual(expected, projected)
    # Indexing with anything but a str is rejected.
    with self.assertRaises(TypeError) as context:
        ft1[1]
    self.assertTrue("Expected a str" in str(context.exception))
def from_json(json_file: str) -> "FlatTableCollection":
    """
    Build FlatTableCollection from metadata json file.

    Parameters
    ----------
    json_file : str
        Flattening metadata json file, which is generated by flattening.

    Examples
    --------
    >>> with open("metadata_flattening.json", "r") as f:
    ...     collection = FlatTableCollection.from_json(f.read())
    """

    def get_single_tables(tables_json, names):
        # Perf fix: hoist the set construction out of the comprehension —
        # the original rebuilt set(names) once per table_json entry.
        wanted = set(names)
        return {
            table_json["output_table"]: SingleTable.from_json(table_json)
            for table_json in tables_json
            if table_json["output_table"] in wanted
        }

    metadata_json = json.loads(json_file)
    # Keep only "flat_table" operations; each pulls its source single tables
    # from the same operations list.
    return FlatTableCollection(
        {
            table["output_table"]: FlatTable.from_json(
                table,
                get_single_tables(metadata_json["operations"], table["sources"]),
            )
            for table in metadata_json["operations"]
            if table["output_type"] == "flat_table"
        }
    )
def test_eq(self):
    """Equality holds for an unmodified copy and fails when attributes differ.

    Bug fix: the original built an "FT2" variant (from a df2 with reordered
    rows), then immediately clobbered it with `ft3 = copy.copy(ft1)` — the
    whole first mutation block and df2 were dead code and are removed here.
    """
    df1 = self.spark.createDataFrame(
        pd.DataFrame({
            "NUM_ENQ": ["1", "2", "3"],
            "EXE_SOI_DTD": ["01/01/2015", "01/02/2016", "01/03/2017"],
        }))
    df3 = self.spark.createDataFrame(
        pd.DataFrame({
            "NUM_ENQ": ["3", "2", "1"],
            "EXE_SOI_DTF": ["01/03/2017", "01/02/2016", "01/01/2015"],
        }))
    ft1 = FlatTable("FT1", df1, "FT1", ["NUM_ENQ", "EXE_SOI_DTD"], {})
    ft2 = copy.copy(ft1)
    ft3 = copy.copy(ft1)
    ft3.name = "FT3"
    ft3.source = df3
    ft3.characteristics = "FT3"
    ft3.join_keys = ["NUM_ENQ", "EXE_SOI_DTF"]
    self.assertEqual(ft1, ft2)
    self.assertNotEqual(ft1, ft3)
def plot_patient_events_each_year_on_months(
    figure: Figure,
    cohort: FlatTable,
    show=False,
    show_func=print,
    save_path=None,
    id_col: str = "NUM_ENQ",
    date_col: str = "EXE_SOI_DTD",
    years: List[int] = None,
) -> Figure:
    """
    Visualize the 'patient events each year on months' stat in seaborn context.

    Parameters
    ----------
    figure: matplotlib.figure.Figure,
        users can define it like plt.figure() or plt.gcf().
    cohort: FlatTable, a flat table.
    show: {False, True}, optional,
        If show the pandas table of confidence degree, default first when optional.
    show_func: optional
        Function to show a pandas table, print by default.
    save_path: str, optional
        the HDFS path to persist the pandas table, None by default, the save
        data can be used in stat history api.
    id_col: str, identity column default = 'NUM_ENQ'.
    date_col: str, data column used for 'group by' statement, default = 'EXE_SOI_DTD'.
    years: a list of special years in which the data will be loaded, default is None.

    Examples
    --------
    This is an example to illustrate how to use the function in jupyter.

    >>> with open("metadata_flattening.json", "r") as f:
    ...     dcir_collection = FlatTableCollection.from_json(f.read())
    >>> dcir = dcir_collection.get("DCIR")
    >>> plot_patient_events_each_year_on_months(plt.figure(figsize=(12, 8)), dcir)
    >>> plt.show()
    """
    # Project the cohort down to (id, year, month) columns for the stat.
    projection = "{0} as id, year({1}) as year, month({1}) as month".format(
        id_col, date_col
    )
    monthly_cohort = FlatTable(
        cohort.name,
        cohort[projection],
        cohort.characteristics,
        ["id", "year", "month"],
        cohort.single_tables,
    )
    stat = FlatteningEventsEachYearOnMonthsStat()
    return stat(
        figure,
        monthly_cohort,
        show=show,
        show_func=show_func,
        save_path=save_path,
        id_col=id_col,
        date_col=date_col,
        years=years,
    )
def test_union_all(self):
    """union_all concatenates rows and merges the single-table mappings."""
    rows = [("1", "01/01/2015"), ("2", "01/02/2016"), ("3", "01/03/2017")]
    frames = [
        self.spark.createDataFrame(
            pd.DataFrame({"NUM_ENQ": [num], "EXE_SOI_DTD": [date]}))
        for num, date in rows
    ]
    single_1 = SingleTable("ST1", frames[0], "ST1")
    single_2 = SingleTable("ST2", frames[1], "ST2")
    keys = ["NUM_ENQ", "EXE_SOI_DTD"]
    ft1 = FlatTable("FT1", frames[0], "FT1", keys, {"ST1": single_1})
    ft2 = FlatTable("FT2", frames[1], "FT2", keys, {"ST2": single_2})
    ft3 = FlatTable("FT3", frames[2], "FT3", keys, {})
    merged = FlatTable.union_all([ft1, ft2, ft3])
    expected_df = self.spark.createDataFrame(
        pd.DataFrame({
            "NUM_ENQ": ["1", "2", "3"],
            "EXE_SOI_DTD": ["01/01/2015", "01/02/2016", "01/03/2017"],
        }))
    # The union carries every input's single tables along.
    expected = FlatTable(
        "result", expected_df, "result", keys,
        {"ST1": single_1, "ST2": single_2},
    )
    self.assertEqual(expected, merged)
def test_difference(self):
    """difference keeps rows absent from the other table; mismatched join keys raise."""
    df1 = self.spark.createDataFrame(
        pd.DataFrame({
            "NUM_ENQ": ["1", "2"],
            "EXE_SOI_DTD": ["01/01/2015", "01/02/2016"],
        }))
    df2 = self.spark.createDataFrame(
        pd.DataFrame({
            "NUM_ENQ": ["3", "2"],
            "EXE_SOI_DTD": ["01/03/2017", "01/02/2016"],
        }))
    df3 = self.spark.createDataFrame(
        pd.DataFrame({
            "NUM_ENQ": ["3", "2"],
            "EXE_SOI_DTF": ["01/03/2017", "01/02/2016"],
        }))
    left = FlatTable("FT1", df1, "FT1", ["NUM_ENQ", "EXE_SOI_DTD"], {})
    right = FlatTable("FT2", df2, "FT2", ["NUM_ENQ", "EXE_SOI_DTD"], {})
    mismatched = FlatTable("FT3", df3, "FT3", ["NUM_ENQ", "EXE_SOI_DTF"], {})
    result = left.difference(right)
    # Only ("1", "01/01/2015") is in left but not in right.
    expected_df = self.spark.createDataFrame(
        pd.DataFrame(
            OrderedDict([("NUM_ENQ", ["1"]),
                         ("EXE_SOI_DTD", ["01/01/2015"])])))
    expected = FlatTable("result", expected_df, "result",
                         ["NUM_ENQ", "EXE_SOI_DTD"], {})
    self.assertEqual(expected, result)
    # Differencing tables with different join keys is an error.
    self.assertRaises(ValueError, left.difference, mismatched)
def test_in(self):
    """Containment ignores duplicate rows; mismatched join keys never contain."""
    superset_df = self.spark.createDataFrame(
        pd.DataFrame({
            "NUM_ENQ": ["1", "2", "3"],
            "EXE_SOI_DTD": ["01/01/2015", "01/02/2016", "01/03/2017"],
        }))
    subset_df = self.spark.createDataFrame(
        pd.DataFrame({
            "NUM_ENQ": ["3", "2", "2"],
            "EXE_SOI_DTD": ["01/03/2017", "01/02/2016", "01/02/2016"],
        }))
    other_keys_df = self.spark.createDataFrame(
        pd.DataFrame({
            "NUM_ENQ": ["3", "2"],
            "EXE_SOI_DTF": ["01/03/2017", "01/02/2016"],
        }))
    superset = FlatTable("FT1", superset_df, "FT1",
                         ["NUM_ENQ", "EXE_SOI_DTD"], {})
    subset = FlatTable("FT2", subset_df, "FT2",
                       ["NUM_ENQ", "EXE_SOI_DTD"], {})
    other_keys = FlatTable("FT3", other_keys_df, "FT3",
                           ["NUM_ENQ", "EXE_SOI_DTF"], {})
    self.assertIn(subset, superset)
    self.assertNotIn(superset, subset)
    self.assertNotIn(other_keys, superset)
def test_setter_validation(self):
    """Each property setter rejects a value of the wrong type (None)."""
    df1 = self.spark.createDataFrame(
        pd.DataFrame({
            "NUM_ENQ": ["1", "2", "3"],
            "EXE_SOI_DTD": ["01/01/2015", "01/02/2016", "01/03/2017"],
        }))
    ft1 = FlatTable("FT1", df1, "FT1", ["NUM_ENQ", "EXE_SOI_DTD"], {})
    checks = [
        ("name", "Expected a string"),
        ("source", "Expected a Spark DataFrame"),
        ("characteristics", "Expected a string"),
        ("join_keys", "Expected a List"),
    ]
    for attribute, message in checks:
        with self.assertRaises(TypeError) as context:
            setattr(ft1, attribute, None)
        self.assertTrue(message in str(context.exception))
    # NOTE(review): the original asserts no message here; confirm what the
    # single_tables setter reports before tightening this check.
    with self.assertRaises(TypeError) as context:
        ft1.single_tables = None