def test_feature_variable_column_name():
    """Check the line built when the feature name is given as ``Col("a")``.

    Column "a" holds "col_x" and column "x" holds 2, so the first converted
    line is expected to be "1 | col_x:2".
    """
    frame = pd.DataFrame({"y": [1], "x": [2], "a": ["col_x"]})
    label = SimpleLabel(Col("y"))
    namespace = Namespace(Feature(name=Col("a"), value=Col("x")))
    converter = DFtoVW(label=label, namespaces=namespace, df=frame)

    lines = converter.process_df()

    assert lines[0] == "1 | col_x:2"
def test_multiple_lines_conversion():
    """Check that a two-row dataframe converts to the two expected lines."""
    frame = pd.DataFrame({"y": [1, -1], "x": [1, 2]})
    label = SimpleLabel(Col("y"))
    namespace = Namespace(Feature(value=Col("x")))
    converter = DFtoVW(label=label, namespaces=namespace, df=frame)

    produced = converter.process_df()

    assert produced == ["1 | 1", "-1 | 2"]
def test_multiple_lines():
    """Check that ``convert_df`` returns the two expected lines for two rows."""
    frame = pd.DataFrame({"y": [1, -1], "x": [1, 2]})
    label = SimpleLabel("y")
    feature = Feature(value="x")
    converter = DFtoVW(label=label, features=feature, df=frame)

    produced = converter.convert_df()

    assert produced == ["1 | 1", "-1 | 2"]
def test_variable_feature_name():
    """Check the line built when ``name_from_df=True`` is set on the feature.

    Column "a" holds "col_x" and column "x" holds 2, so the first converted
    line is expected to be "1 | col_x:2".
    """
    frame = pd.DataFrame({"y": [1], "x": [2], "a": ["col_x"]})
    label = SimpleLabel("y")
    feature = Feature(name="a", value="x", name_from_df=True)
    converter = DFtoVW(label=label, features=feature, df=frame)

    lines = converter.convert_df()

    assert lines[0] == "1 | col_x:2"
def test_feature_constant_column_with_empty_name():
    """Check a literal feature value with an empty name, combined with a tag.

    The feature is given ``name=""`` and the literal ``value=2``; with the tag
    taken from column "idx" the first line is expected to be "1 id_1| :2".
    """
    frame = pd.DataFrame({"idx": ["id_1"], "y": [1], "x": [2]})
    label = SimpleLabel(Col("y"))
    tag = Col("idx")
    namespace = Namespace([Feature(name="", value=2)])
    converter = DFtoVW(label=label, tag=tag, namespaces=namespace, df=frame)

    lines = converter.process_df()

    assert lines[0] == "1 id_1| :2"
def test_feature_column_renaming_and_tag():
    """Check a feature renamed to "col_x" together with a tag column.

    The value comes from ``Col("x")`` and the tag from ``Col("idx")``; the
    first converted line is expected to be "1 id_1| col_x:2".
    """
    # NOTE(review): another function with this exact name is defined later in
    # this file; within a single module the later definition replaces this
    # one, so only one of them would be collected -- confirm which is intended.
    frame = pd.DataFrame({"idx": ["id_1"], "y": [1], "x": [2]})
    label = SimpleLabel(Col("y"))
    tag = Col("idx")
    namespace = Namespace([Feature(name="col_x", value=Col("x"))])
    converter = DFtoVW(label=label, tag=tag, namespaces=namespace, df=frame)

    lines = converter.process_df()

    assert lines[0] == "1 id_1| col_x:2"
def test_absent_col_error():
    """Check the ValueError raised when feature columns are not in the dataframe.

    The dataframe only has column "a" while features are requested for "a",
    "c" and "d"; the error message is expected to name 'c' and 'd'.
    """
    # NOTE(review): another function with this exact name is defined later in
    # this file; within a single module the later definition replaces this
    # one, so only one of them would be collected -- confirm which is intended.

    # Build the fixture outside the context manager so that only the call
    # expected to raise is guarded by pytest.raises.
    df = pd.DataFrame({"a": [1]})
    with pytest.raises(ValueError) as value_error:
        DFtoVW(
            df=df,
            label=SimpleLabel("a"),
            features=[Feature(col) for col in ["a", "c", "d"]],
        )
    expected = "In 'Feature': column(s) 'c', 'd' not found in dataframe."
    assert expected == str(value_error.value)
def test_constant_feature_value_with_empty_name():
    """Check a literal feature value (``value_from_df=False``) with an empty name.

    With the tag given as "idx", the first converted line is expected to be
    "1 id_1| :2".
    """
    frame = pd.DataFrame({"idx": ["id_1"], "y": [1], "x": [2]})
    label = SimpleLabel("y")
    feature = Feature(name="", value=2, value_from_df=False)
    converter = DFtoVW(label=label, tag="idx", features=feature, df=frame)

    lines = converter.convert_df()

    assert lines[0] == "1 id_1| :2"
def test_feature_column_renaming_and_tag():
    """Check a feature renamed to "col_x" together with the "idx" tag.

    The first line returned by ``convert_df`` is expected to be
    "1 id_1| col_x:2".
    """
    # NOTE(review): another function with this exact name is defined earlier
    # in this file; within a single module this later definition replaces the
    # earlier one -- confirm which is intended.
    frame = pd.DataFrame({"idx": ["id_1"], "y": [1], "x": [2]})
    label = SimpleLabel("y")
    feature = Feature(name="col_x", value="x")
    converter = DFtoVW(label=label, tag="idx", features=feature, df=frame)

    lines = converter.convert_df()

    assert lines[0] == "1 id_1| col_x:2"
def test_absent_col_error():
    """Check the ValueError raised when a ``Col`` feature is not in the dataframe.

    The dataframe only has column "a". Features are built from ``Col("a")``,
    ``Col("c")`` and the plain string "d"; the expected message names only 'c'.
    """
    # NOTE(review): another function with this exact name is defined earlier
    # in this file; within a single module this later definition replaces the
    # earlier one -- confirm which is intended.

    # Build the fixture outside the context manager so that only the call
    # expected to raise is guarded by pytest.raises. The converter is never
    # used afterwards, so its result is not bound to a name.
    df = pd.DataFrame({"a": [1]})
    with pytest.raises(ValueError) as value_error:
        DFtoVW(
            df=df,
            label=SimpleLabel(Col("a")),
            namespaces=Namespace(
                [Feature(Col("a")), Feature(Col("c")), Feature("d")]
            ),
        )
    expected = "In argument 'features', column(s) 'c' not found in dataframe"
    assert expected == str(value_error.value)
def test_multiple_namespaces():
    """Check the line produced for two named namespaces.

    The second namespace is given ``value=2``; the first converted line is
    expected to be "1 |FirstNameSpace 2 |DoubleIt:2 3".
    """
    frame = pd.DataFrame({"y": [1], "a": [2], "b": [3]})
    label = SimpleLabel(Col("y"))
    first_ns = Namespace(name="FirstNameSpace", features=Feature(Col("a")))
    second_ns = Namespace(name="DoubleIt", value=2, features=Feature(Col("b")))
    converter = DFtoVW(df=frame, label=label, namespaces=[first_ns, second_ns])

    lines = converter.process_df()

    assert lines[0] == "1 |FirstNameSpace 2 |DoubleIt:2 3"
def test_wrong_feature_type_error():
    """Check the TypeError raised when ``features`` is passed a plain string."""
    frame = pd.DataFrame({"y": [1], "x": [2]})
    label = SimpleLabel("y")

    with pytest.raises(TypeError) as raised:
        DFtoVW(df=frame, label=label, features="x")

    message = str(raised.value)
    assert message == "Argument 'features' should be a Feature or a list of Feature."
def test_non_numerical_simplelabel_error():
    """Check the TypeError raised when the label column holds a string value."""
    frame = pd.DataFrame({"y": ["a"], "x": ["featX"]})
    label = SimpleLabel(name="y")
    feature = Feature("x")

    with pytest.raises(TypeError) as raised:
        DFtoVW(df=frame, label=label, features=feature)

    message = str(raised.value)
    assert message == "In argument 'name' of 'SimpleLabel', column 'y' should be either of the following type(s): 'int', 'float'."