Example #1
0
 def test_read_csv(self):
     file_path = os.path.join(self.resource_path,
                              "friesian/feature/data.csv")
     feature_tbl = FeatureTable.read_csv(file_path, header=True)
     assert feature_tbl.size() == 4
     columns = feature_tbl.columns
     assert columns == ["col1", "col2", "col3"]
     records = feature_tbl.df.collect()
     assert isinstance(records[0][0], float)
     assert isinstance(records[0][1], str) and isinstance(
         records[0][1], str)
     file_path2 = os.path.join(self.resource_path,
                               "friesian/feature/data_no_header.csv")
     feature_tbl2 = FeatureTable.read_csv(file_path2,
                                          names=["col1", "_col2", "col3"],
                                          dtype={"col1": "int"})
     assert feature_tbl2.size() == 4
     columns2 = feature_tbl2.columns
     assert columns2 == ["col1", "_col2", "col3"]
     records2 = feature_tbl2.df.collect()
     assert isinstance(records2[0][0], int)
     assert isinstance(records2[0][1], str) and isinstance(
         records2[0][1], str)
     feature_tbl3 = FeatureTable.read_csv(file_path,
                                          header=True,
                                          dtype=["int", "str", "str"])
     records3 = feature_tbl3.df.collect()
     assert isinstance(records3[0][0], int)
     assert isinstance(records3[0][1], str) and isinstance(
         records3[0][1], str)
Example #2
0
 def test_split(self):
     file_path = os.path.join(self.resource_path, "orca/learn/ncf.csv")
     feature_tbl = FeatureTable.read_csv(file_path,
                                         header=True,
                                         dtype="int")
     tbl1, tbl2 = feature_tbl.split([0.8, 0.2], seed=1128)
     total_size = feature_tbl.size()
     size1 = tbl1.size()
     size2 = tbl2.size()
     assert size1 + size2 == total_size
Example #3
0
 def test_append_column(self):
     file_path = os.path.join(self.resource_path, "friesian/feature/")
     df = FeatureTable.read_csv(file_path + "data.csv", header=True)
     df = df.append_column("z", 0)
     assert df.select("z").size() == 4
     assert df.filter("z == 0").size() == 4
     df = df.append_column("str", "a")
     assert df.select("str").size() == 4
     assert df.filter("str == 'a'").size() == 4
     df = df.append_column("float", 1.2)
     assert df.select("float").size() == 4
     assert df.filter("float == 1.2").size() == 4
Example #4
0
 def test_category_encode_and_one_hot_encode(self):
     file_path = os.path.join(self.resource_path,
                              "friesian/feature/data.csv")
     feature_tbl = FeatureTable.read_csv(file_path, header=True)
     feature_tbl, indices = feature_tbl.category_encode(
         columns=["col2", "col3"])
     assert isinstance(indices, list) and len(indices) == 2
     assert isinstance(indices[0], StringIndex) and isinstance(
         indices[1], StringIndex)
     assert indices[0].size() == 3 and indices[1].size() == 4
     dict1 = indices[0].to_dict()
     dict2 = indices[1].to_dict()
     records = feature_tbl.df.collect()
     assert records[0][1] == dict1["x"] and records[0][2] == dict2["abc"]
     assert records[3][1] == dict1["z"] and records[2][2] == dict2["aaa"]
     feature_tbl = feature_tbl.one_hot_encode(columns=["col2", "col3"],
                                              prefix=["o1", "o2"])
     feature_tbl.show()
     columns = feature_tbl.columns
     assert columns == [
         "col1", "o1_0", "o1_1", "o1_2", "o1_3", "o2_0", "o2_1", "o2_2",
         "o2_3", "o2_4"
     ]
     records = feature_tbl.df.collect()
     record = records[0]
     value1 = dict1["x"]
     value2 = dict2["abc"]
     for i in range(1, 4):
         if i == value1:
             assert record[i + 1] == 1
         else:
             assert record[i + 1] == 0
     for i in range(1, 5):
         if i == value2:
             assert record[i + 5] == 1
         else:
             assert record[i + 5] == 0