def test_ifelse_wrong_type():
    """ifelse() must raise TypeError when its condition is not boolean."""
    DT = dt.Frame(A=range(10))
    DT["B"] = dt.str32(f.A)
    expected = r"The condition argument in ifelse\(\) must be a boolean column"
    # A is built from a range of ints and B is its str32 cast; neither is
    # a boolean column.
    for bad_condition in (f.A, f.B):
        with pytest.raises(TypeError, match=expected):
            DT[:, ifelse(bad_condition, f.A, f.A)]
def test_ifelse_with_groupby():
    """Check ifelse() under by(), mixing group reducers with plain columns."""
    DT = dt.Frame(A=[2, 5, 2, 5, 2, 2], B=range(6))
    is_two = f.A == 2
    lowest = dt.min(f.B)
    highest = dt.max(f.B)

    # Each case pairs an ifelse() expression with the frame it must produce
    # when evaluated grouped by column A.
    cases = [
        # Both branches are reducers.
        (ifelse(is_two, lowest, highest),
         dt.Frame(A=[2, 5], C0=[0, 3])),
        # Only the "false" branch is a reducer.
        (ifelse(is_two, f.B, highest),
         dt.Frame(A=[2, 2, 2, 2, 5, 5], C0=[0, 2, 4, 5, 3, 3])),
        # Only the "true" branch is a reducer.
        (ifelse(is_two, lowest, f.B),
         dt.Frame(A=[2, 2, 2, 2, 5, 5], C0=[0, 0, 0, 0, 1, 3])),
        # The condition itself varies within a group.
        (ifelse(f.B > 2, lowest, f.B),
         dt.Frame(A=[2, 2, 2, 2, 5, 5], C0=[0, 2, 0, 0, 1, 1])),
    ]
    results = [DT[:, expr, by(f.A)] for expr, _ in cases]
    for actual, (_, wanted) in zip(results, cases):
        assert_equals(actual, wanted)
def test_ifelse_columnsets():
    """ifelse() must reject a multi-column expression in any argument."""
    DT = dt.Frame(A=range(10), B=[7] * 10, C=list('abcdefghij'))
    expected = (
        r"Multi-column expressions are not supported in ifelse\(\) function"
    )
    bad_calls = [
        (f[:], 0, 1),            # multi-column condition
        (f.A > 3, f[:], f.A),    # multi-column "true" value
        (f.A > 3, f.A, f[:]),    # multi-column "false" value
        # We could in theory make this work...
        (f[int] > 3, 3, f[int]),
    ]
    for call_args in bad_calls:
        with pytest.raises(TypeError, match=expected):
            DT[:, ifelse(*call_args)]
def test_ifelse_bad_signature():
    """ifelse() must raise TypeError for unusable argument counts."""
    DT = dt.Frame(A=range(10))
    positive = f.A > 0

    # Zero, one or two arguments: too few.
    too_few = r"Function datatable\.ifelse\(\) requires at least 3 arguments"
    for call_args in [(), (positive,), (positive, f.A)]:
        with pytest.raises(TypeError, match=too_few):
            DT[:, ifelse(*call_args)]

    # Four arguments form a condition/value pair plus a second condition
    # with no value and no trailing default.
    no_default = (
        r"Missing the required default argument in function "
        r"datatable\.ifelse\(\)"
    )
    with pytest.raises(TypeError, match=no_default):
        DT[:, ifelse(positive, f.A, f.A, f.A)]
def test_ifelse_bad_signature():
    """ifelse() must raise TypeError unless given exactly three arguments."""
    DT = dt.Frame(A=range(10))
    positive = f.A > 0

    # Fewer than three arguments.
    too_few = r"Function ifelse\(\) requires 3 arguments"
    for call_args in [(), (positive,), (positive, f.A)]:
        with pytest.raises(TypeError, match=too_few):
            DT[:, ifelse(*call_args)]

    # More than three arguments.
    too_many = (
        r"ifelse\(\) takes at most 3 positional arguments, "
        r"but 4 were given"
    )
    with pytest.raises(TypeError, match=too_many):
        DT[:, ifelse(positive, f.A, f.A, f.A)]
def test_countries():
    """Multi-branch ifelse(): map country names to their historic names."""
    # inspired by @knapply's example
    countries = dt.Frame(
        name=["Czech Republic", "Czecho-Slovakia", "Mexico", "Czech Republic",
              "Canada", "Czechoslovakia", "USA", "Britain"],
        year=[1918, 1990, 1991, 1992, 1993, 1994, 1995, 1996],
    )

    def is_czech_name(x):
        # OR together one equality test per known spelling, left to right.
        spellings = ("Czechoslovak Republic", "Czechoslovakia",
                     "Czech Republic", "Czecho-Slovakia")
        matched = x == spellings[0]
        for spelling in spellings[1:]:
            matched = matched | (x == spelling)
        return matched

    name, year = countries.export_names()
    # Conditions are listed as (condition, value) pairs followed by the
    # default; the expected frame below shows the first true one winning.
    historic = ifelse(
        is_czech_name(name) & (year <= 1938), "Czechoslovak Republic",
        is_czech_name(name) & (year <= 1992), "Czechoslovakia",
        is_czech_name(name) & (year >= 1993), "Czech Republic",
        (name == "USA"), "United States of America",
        (name == "Britain"), "United Kingdom",
        name,
    )
    RES = countries[:, {"historic_name": historic}]

    wanted = dt.Frame(historic_name=[
        "Czechoslovak Republic",
        "Czechoslovakia",
        "Mexico",
        "Czechoslovakia",
        "Canada",
        "Czech Republic",
        "United States of America",
        "United Kingdom",
    ])
    assert_equals(RES, wanted)
def test_ifelse_multi_different_grouplevels():
    """Multi-branch ifelse() mixing reducers, a scalar and a plain column."""
    DT = dt.Frame(A=[1, 2, 3, 4, 5, 6])
    expr = ifelse(
        f.A <= 2, dt.min(f.A),   # reducer branch
        f.A >= 5, dt.max(f.A),   # reducer branch
        f.A == 0, 1000000,       # scalar branch; no value in A equals 0
        f.A,                     # default: the column itself
    )
    RES = DT[:, expr]
    assert_equals(RES, dt.Frame([1, 1, 3, 4, 6, 6]))
def addRiskColumn(table, incidencePrognosisColumn, newColumn, darkFactor):
    """Select every column of ``table`` plus a computed risk column.

    The new column, named ``newColumn``, holds 99999 where the column named
    ``incidencePrognosisColumn`` is <= 0, and
    ``100000 / (incidence * darkFactor)`` everywhere else.

    Returns the result of the selection on ``table``.
    """
    incidence = dt.f[incidencePrognosisColumn]
    risk = dt.ifelse(incidence <= 0,
                     99999,
                     100000 / (incidence * darkFactor))
    all_columns_plus_risk = dt.f[:].extend({newColumn: risk})
    return table[:, all_columns_plus_risk]
def test_ifelse_columnsets():
    """Each ifelse() argument must be reported by name when multi-column."""
    DT = dt.Frame(A=range(10), B=[7]*10, C=list('abcdefghij'))

    bad_condition = (
        r"The condition1 argument in ifelse\(\) cannot be a multi-column "
        r"expression"
    )
    bad_value1 = (
        r"The value1 argument in ifelse\(\) cannot be a multi-column "
        r"expression"
    )
    bad_value2 = (
        r"The value2 argument in ifelse\(\) cannot be a multi-column "
        r"expression"
    )

    # (expected message, ifelse arguments), in the order they are checked.
    cases = [
        (bad_condition, (f[:], 0, 1)),
        (bad_condition, (f[int] > 3, 3, f[int])),
        (bad_value1, (f.A > 3, f[:], f.A)),
        (bad_value2, (f.A > 3, f.A, f[:])),
    ]
    for expected, call_args in cases:
        with pytest.raises(TypeError, match=expected):
            DT[:, ifelse(*call_args)]
def arreglar_infinitos(dataset: Frame) -> Frame:
    """Replace infinite values with ``None`` in every feature column.

    ("arreglar infinitos" is Spanish for "fix infinities".)

    Every column except 'clase_ternaria' is overwritten in place with a
    version in which the entries flagged by ``math.isinf`` become ``None``.
    NOTE(review): ``math`` is presumably ``datatable.math`` rather than the
    standard-library module -- confirm against the file's imports.

    Returns the same ``dataset`` object that was passed in.
    """
    feature_names = [name for name in dataset.names
                     if name != 'clase_ternaria']
    for name in feature_names:
        values = f[name]
        cleaned = ifelse(math.isinf(values) == 1, None, values)
        dataset[name] = dataset[:, cleaned]
    return dataset
def test_example():
    """Assigning an ifelse() expression creates a labelled string column."""
    domestic = [4500, 2500, 1500, 4000]
    international = [2000, 5000, 1000, 4500]

    DT = dt.Frame(domestic_income=domestic,
                  internationaL_income=international)
    earns_more_at_home = f.domestic_income > f.internationaL_income
    DT["profit_loss"] = ifelse(earns_more_at_home, "profit", "loss")

    wanted = dt.Frame(domestic_income=domestic,
                      internationaL_income=international,
                      profit_loss=["profit", "loss", "profit", "loss"])
    assert_equals(DT, wanted)
def calcular_ganancia(data: Frame, votos: int) -> float:
    """Sum the per-row gain over rows whose 'votos' reaches ``votos``.

    ("calcular ganancia" is Spanish for "compute gain".)

    A row contributes 29250 when its 'clase_ternaria' is 'BAJA+2' and -750
    otherwise; rows with 'votos' below the threshold contribute nothing.
    The result is the dot product of the selection mask and the gains.
    """
    vote_counts = data[:, 'votos'].to_numpy().flatten()
    datos_votos = vote_counts >= votos

    gain_per_row = ifelse(f.clase_ternaria == 'BAJA+2', 29250, -750)
    ganancias = data[:, gain_per_row].to_numpy().flatten()

    return np.dot(datos_votos, ganancias)
def test_condition_with_NAs():
    """A missing condition value must produce a missing result."""
    DT = dt.Frame(A=[True, False, None], B=[5, 7, 9])
    picked = ifelse(f.A, f.B, -f.B)
    wanted = dt.Frame([5, -7, None])
    assert_equals(DT[:, picked], wanted)
def test_different_stypes():
    """Integer and float branches must combine into a float result."""
    DT = dt.Frame(A=[3], B=[7.1])
    mixed = ifelse(f.A > 0, f.A, f.B)
    # The integer 3 from column A is expected back as the float 3.0.
    wanted = dt.Frame([3.0])
    assert_equals(DT[:, mixed], wanted)
def test_ifelse_with_scalars():
    """Both branches of ifelse() may be plain string scalars."""
    DT = dt.Frame(A=range(10))
    is_even = f.A % 2 == 0
    parity = ifelse(is_even, "even", "odd")
    wanted = dt.Frame(["even", "odd"] * 5)
    assert_equals(DT[:, parity], wanted)
def test_ifelse_simple():
    """Basic two-branch ifelse() assigned to a new column."""
    DT = dt.Frame(A=range(10))
    a = f.A
    # Values above 5 are shifted down by 5, the others shifted up by 5.
    DT["B"] = ifelse(a > 5, a - 5, a + 5)
    wanted = dt.Frame(A=range(10), B=[5, 6, 7, 8, 9, 10, 1, 2, 3, 4])
    assert_equals(DT, wanted)
"""Transform dataset features""" # Map and create new features by adding new columns or with in-place update. # For example, use for mapping multi-valued key to single column or # any other types of map (row by row) transformations. # # Specification: # Inputs: # X: datatable - primary dataset # Parameters: # transformations: map - map with datatable transformation in the form of key: value pairs where key # is new / existing column name and value is datatable expression for this column. # Output: # dataset containing original and transformed features from datatable import f, isna, ifelse transformations = {'title_with_type': f['primaryTitle'] + '-' + f['titleType'], # concatentate 2 columns 'startYear': ifelse(f['startYear']=='\\N', None, f['startYear']), # override empty value with NULL 'endYear': ifelse(f['endYear']=='\\N', None, f['endYear']), # override empty value with NULL in another column 'spanYears': ifelse((f['startYear']=='\\N') | (f['endYear']=='\\N'), 0, dt.int32(f['endYear']) - dt.int32(f['startYear'])) # compute the different between two columns } X[:, dt.update(**transformations)] return {"temp_to_delte": X}
# Seed NumPy's global random generator before the random 'azar' column is
# drawn below.
np.random.seed(TRAIN_PARAMS['seed'])

if __name__ == '__main__':
    # Copy the command-line options into the shared parameter dict.
    args = parser.parse_args()
    TRAIN_PARAMS['binaria_especial'] = args.binaria_especial
    TRAIN_PARAMS['model'] = args.model
    TRAIN_PARAMS['file_data'] = f'../datasets/datos_fe_hist_v{args.version}.gz'

    # Load the dataset and keep only rows up to 'max_foto_mes_entero'.
    dataset_original = fread(TRAIN_PARAMS['file_data'])
    dataset = dataset_original[f.foto_mes <= TRAIN_PARAMS['max_foto_mes_entero'], :]

    # 'azar' (Spanish for "random"): one uniform draw per row, used by the
    # row filter further down.
    dataset['azar'] = np.random.uniform(size=dataset.shape[0])
    # 'clase01': 0 for 'CONTINUA' rows, 1 for every other class.
    dataset['clase01'] = dataset[:, ifelse(f.clase_ternaria == 'CONTINUA', 0, 1)]

    if args.binaria_especial:
        # Special binary mode: the target is "anything but 'CONTINUA'", and
        # 'BAJA+2' rows get a weight of 1.0000001 while all others get 1.
        dataset['target'] = dataset[:, f.clase_ternaria != 'CONTINUA']
        dataset['weight'] = dataset[:, ifelse(f.clase_ternaria == 'BAJA+2', 1.0000001, 1)]
        # Feature columns: everything except the label and helper columns.
        campos_buenos = f[:].remove([f.clase_ternaria, f.target, f.azar, f.clase01, f.weight])
    else:
        # Default mode: the target is exactly the 'BAJA+2' class.
        dataset['target'] = dataset[:, f.clase_ternaria == 'BAJA+2']
        campos_buenos = f[:].remove([f.clase_ternaria, f.target, f.azar, f.clase01])

    # Training rows: months up to 'max_foto_mes_train', keeping every row
    # with clase01 == 1 and, of the remaining rows, only those whose 'azar'
    # draw is below 0.1. The same filter is repeated for X, y and weights.
    X = dataset[(f.foto_mes <= TRAIN_PARAMS['max_foto_mes_train']) &
                ((f.clase01 == 1) | (f.azar < 0.1)), campos_buenos]
    y = dataset[(f.foto_mes <= TRAIN_PARAMS['max_foto_mes_train']) &
                ((f.clase01 == 1) | (f.azar < 0.1)), f.target]

    # Weights exist only in special binary mode; otherwise they stay None.
    weights = None
    if args.binaria_especial:
        weights = dataset[(f.foto_mes <= TRAIN_PARAMS['max_foto_mes_train']) &
                          ((f.clase01 == 1) | (f.azar < 0.1)), f.weight]
# any other types of map (row by row) transformations. # # Specification: # Inputs: # X: datatable - primary dataset # Parameters: # transformations: map - map with datatable transformation in the form of key: value pairs where key # is new / existing column name and value is datatable expression for this column. # Output: # dataset containing original and transformed features from datatable import f, isna, ifelse transformations = { 'title_with_type': f['primaryTitle'] + '-' + f['titleType'], # concatentate 2 columns 'startYear': ifelse(f['startYear'] == '\\N', None, f['startYear']), # override empty value with NULL 'endYear': ifelse(f['endYear'] == '\\N', None, f['endYear']), # override empty value with NULL in another column 'spanYears': ifelse((f['startYear'] == '\\N') | (f['endYear'] == '\\N'), 0, dt.int32(f['endYear']) - dt.int32(f['startYear'])) # compute the different between two columns } X[:, dt.update(**transformations)] return {"temp_to_delte": X}
# Build the dataset path from the requested version and load it.
TRAIN_PARAMS['file_data'] = f'../datasets/datos_fe_hist_v{args.version}.gz'
dataset = fread(TRAIN_PARAMS['file_data'])

# For each experiment, take the first file in its directory whose name ends
# with 'stacking_apply.csv' and attach its 'prob' column to the dataset as
# '<experimento>_prob'. Experiments with no such file add no column.
for experimento in args.experimentos:
    for file in os.listdir(f'../experimentos/{experimento}/'):
        if file.endswith('stacking_apply.csv'):
            stacking = fread(f'../experimentos/{experimento}/{file}')
            dataset[f'{experimento}_prob'] = stacking['prob']
            break

# Rows of the 'foto_mes_kaggle' month, without the label column. This is
# taken before the helper columns below are added, so it holds only the
# loaded columns plus the stacked probabilities.
dapply_kaggle = dataset[f.foto_mes == TRAIN_PARAMS['foto_mes_kaggle'],
                        f[:].remove([f.clase_ternaria])]

# Keep only rows up to 'max_foto_mes_entero'.
dataset = dataset[f.foto_mes <= TRAIN_PARAMS['max_foto_mes_entero'], :]

# 'azar' (Spanish for "random"): one uniform draw per row, used by the row
# filter further down.
dataset['azar'] = np.random.uniform(size=dataset.shape[0])
# 'clase01': 0 for 'CONTINUA' rows, 1 for every other class.
dataset['clase01'] = dataset[:, ifelse(f.clase_ternaria == 'CONTINUA', 0, 1)]

if args.binaria_especial:
    # Special binary mode: the target is "anything but 'CONTINUA'", and
    # 'BAJA+2' rows get a weight of 1.0000001 while all others get 1.
    dataset['target'] = dataset[:, f.clase_ternaria != 'CONTINUA']
    dataset['weight'] = dataset[:, ifelse(f.clase_ternaria == 'BAJA+2', 1.0000001, 1)]
    # Feature columns: everything except the label and helper columns.
    campos_buenos = f[:].remove(
        [f.clase_ternaria, f.target, f.azar, f.clase01, f.weight])
else:
    # Default mode: the target is exactly the 'BAJA+2' class.
    dataset['target'] = dataset[:, f.clase_ternaria == 'BAJA+2']
    campos_buenos = f[:].remove(
        [f.clase_ternaria, f.target, f.azar, f.clase01])

# Training rows: months up to 'max_foto_mes_train', keeping every row with
# clase01 == 1 and, of the remaining rows, only those whose 'azar' draw is
# below 0.1.
X = dataset[(f.foto_mes <= TRAIN_PARAMS['max_foto_mes_train']) &
            ((f.clase01 == 1) | (f.azar < 0.1)), campos_buenos]
def test_ifelse_multi():
    """Multi-branch ifelse() falls through to the default when nothing matches."""
    DT = dt.Frame(A=['fox', 'cat', 'jay', 'cow'])
    animal = f.A
    score = ifelse(
        animal == 'fox', 3,
        animal == 'dog', 7,   # no row is 'dog', so 7 never appears
        animal == 'cow', 2,
        -1,                   # default for every other animal
    )
    wanted = dt.Frame([3, -1, -1, 2])
    assert_equals(DT[:, score], wanted)