Exemple #1
0
def test_ifelse_wrong_type():
    """ifelse() must reject a condition that is not a boolean column."""
    frame = dt.Frame(A=range(10))
    frame["B"] = dt.str32(f.A)
    err = r"The condition argument in ifelse\(\) must be a boolean column"
    # A is built from a range and B is its str32 cast; neither is boolean.
    for condition in (f.A, f.B):
        with pytest.raises(TypeError, match=err):
            frame[:, ifelse(condition, f.A, f.A)]
Exemple #2
0
def test_ifelse_with_groupby():
    """Check ifelse() inside a by(f.A) selection.

    The branches mix per-group reducers (dt.min / dt.max) with the plain
    column f.B. Only the all-reducer case is expected to produce one row
    per group; the other cases are expected to keep one row per input row.
    """
    frame = dt.Frame(A=[2, 5, 2, 5, 2, 2], B=range(6))
    selectors = [
        ifelse(f.A == 2, dt.min(f.B), dt.max(f.B)),
        ifelse(f.A == 2, f.B, dt.max(f.B)),
        ifelse(f.A == 2, dt.min(f.B), f.B),
        ifelse(f.B > 2, dt.min(f.B), f.B),
    ]
    results = [frame[:, selector, by(f.A)] for selector in selectors]

    expected = [
        dt.Frame(A=[2, 5], C0=[0, 3]),
        dt.Frame(A=[2, 2, 2, 2, 5, 5], C0=[0, 2, 4, 5, 3, 3]),
        dt.Frame(A=[2, 2, 2, 2, 5, 5], C0=[0, 0, 0, 0, 1, 3]),
        dt.Frame(A=[2, 2, 2, 2, 5, 5], C0=[0, 2, 0, 0, 1, 1]),
    ]
    for actual, wanted in zip(results, expected):
        assert_equals(actual, wanted)
Exemple #3
0
def test_ifelse_columnsets():
    """ifelse() must reject a multi-column expression in any position."""
    frame = dt.Frame(A=range(10), B=[7] * 10, C=list('abcdefghij'))
    err = r"Multi-column expressions are not supported in ifelse\(\) function"
    rejected_calls = [
        (f[:], 0, 1),
        (f.A > 3, f[:], f.A),
        (f.A > 3, f.A, f[:]),
        # We could in theory make this work...
        (f[int] > 3, 3, f[int]),
    ]
    for call_args in rejected_calls:
        with pytest.raises(TypeError, match=err):
            frame[:, ifelse(*call_args)]
Exemple #4
0
def test_ifelse_bad_signature():
    """ifelse() must reject calls with too few arguments or no default."""
    frame = dt.Frame(A=range(10))

    # Zero, one and two arguments are all below the three-argument minimum.
    too_few = r"Function datatable\.ifelse\(\) requires at least 3 arguments"
    for call_args in [(), (f.A > 0,), (f.A > 0, f.A)]:
        with pytest.raises(TypeError, match=too_few):
            frame[:, ifelse(*call_args)]

    # Four arguments are expected to be reported as a missing default value.
    no_default = (r"Missing the required default argument in function "
                  r"datatable\.ifelse\(\)")
    with pytest.raises(TypeError, match=no_default):
        frame[:, ifelse(f.A > 0, f.A, f.A, f.A)]
Exemple #5
0
def test_ifelse_bad_signature():
    """ifelse() must reject calls that do not pass exactly three arguments."""
    frame = dt.Frame(A=range(10))

    # Zero, one and two arguments are all reported as "requires 3 arguments".
    too_few = r"Function ifelse\(\) requires 3 arguments"
    for call_args in [(), (f.A > 0,), (f.A > 0, f.A)]:
        with pytest.raises(TypeError, match=too_few):
            frame[:, ifelse(*call_args)]

    # A fourth positional argument is reported with a different message.
    too_many = (r"ifelse\(\) takes at most 3 positional arguments, "
                r"but 4 were given")
    with pytest.raises(TypeError, match=too_many):
        frame[:, ifelse(f.A > 0, f.A, f.A, f.A)]
Exemple #6
0
def test_countries():
    """Map country names to a historic name with a multi-branch ifelse().

    Names recognised as Czech are mapped according to the year, "USA" and
    "Britain" are expanded, and every other name is passed through.
    """
    # inspired by @knapply's example
    countries = dt.Frame(
        name=["Czech Republic", "Czecho-Slovakia", "Mexico", "Czech Republic",
              "Canada", "Czechoslovakia", "USA", "Britain"],
        year=[1918] + list(range(1990, 1997))
    )

    def is_czech_name(x):
        # OR together one equality test per known spelling, left to right.
        aliases = ("Czechoslovak Republic", "Czechoslovakia",
                   "Czech Republic", "Czecho-Slovakia")
        matches = x == aliases[0]
        for alias in aliases[1:]:
            matches = matches | (x == alias)
        return matches

    name, year = countries.export_names()
    czech = is_czech_name(name)
    historic_name = ifelse(
        czech & (year <= 1938), "Czechoslovak Republic",
        czech & (year <= 1992), "Czechoslovakia",
        czech & (year >= 1993), "Czech Republic",
        (name == "USA"),        "United States of America",
        (name == "Britain"),    "United Kingdom",
        name
    )
    RES = countries[:, {"historic_name": historic_name}]

    expected = dt.Frame(historic_name=[
        "Czechoslovak Republic", "Czechoslovakia", "Mexico", "Czechoslovakia",
        "Canada", "Czech Republic", "United States of America",
        "United Kingdom"])
    assert_equals(RES, expected)
Exemple #7
0
def test_ifelse_multi_different_grouplevels():
    """Multi-branch ifelse() mixing reducers, a scalar and a plain column."""
    frame = dt.Frame(A=[1, 2, 3, 4, 5, 6])
    selector = ifelse(
        f.A <= 2, dt.min(f.A),
        f.A >= 5, dt.max(f.A),
        f.A == 0, 1000000,
        f.A,
    )
    result = frame[:, selector]
    expected = dt.Frame([1, 1, 3, 4, 6, 6])
    assert_equals(result, expected)
Exemple #8
0
def addRiskColumn(table, incidencePrognosisColumn, newColumn, darkFactor):
    """Return `table` with an extra column derived from an incidence column.

    The new column, named `newColumn`, holds 99999 where the incidence is
    less than or equal to zero, and 100000 / (incidence * darkFactor)
    otherwise. The result is produced by a selection over `table`; the
    function itself does not assign into `table`.

    Parameters:
        table: frame to select from.
        incidencePrognosisColumn: selector of the incidence column in `table`.
        newColumn: name given to the computed column.
        darkFactor: multiplier applied to the incidence before dividing.
    """
    incidence = dt.f[incidencePrognosisColumn]
    risk = dt.ifelse(incidence <= 0, 99999, 100000 / (incidence * darkFactor))
    all_plus_risk = dt.f[:].extend({newColumn: risk})
    return table[:, all_plus_risk]
Exemple #9
0
def test_ifelse_columnsets():
    """Each ifelse() argument must be reported by name when multi-column."""
    frame = dt.Frame(A=range(10), B=[7]*10, C=list('abcdefghij'))

    condition1_err = (r"The condition1 argument in ifelse\(\) cannot be a "
                      r"multi-column expression")
    value1_err = (r"The value1 argument in ifelse\(\) cannot be a "
                  r"multi-column expression")
    value2_err = (r"The value2 argument in ifelse\(\) cannot be a "
                  r"multi-column expression")

    # (expected error pattern, arguments passed to ifelse)
    cases = [
        (condition1_err, (f[:], 0, 1)),
        (condition1_err, (f[int] > 3, 3, f[int])),
        (value1_err, (f.A > 3, f[:], f.A)),
        (value2_err, (f.A > 3, f.A, f[:])),
    ]
    for err, call_args in cases:
        with pytest.raises(TypeError, match=err):
            frame[:, ifelse(*call_args)]
def arreglar_infinitos(dataset: Frame) -> Frame:
    """Replace infinite values with missing values, column by column.

    Every column except 'clase_ternaria' is overwritten with a copy in which
    entries flagged by `math.isinf` become None. The frame is modified in
    place and the same object is returned.

    NOTE(review): `math` is presumably `datatable.math` (it is applied to an
    f-expression) - confirm against the file's imports.
    """
    targets = [name for name in dataset.names if name != 'clase_ternaria']
    for name in targets:
        is_infinite = math.isinf(f[name]) == 1
        cleaned = dataset[:, ifelse(is_infinite, None, f[name])]
        dataset[name] = cleaned

    return dataset
Exemple #11
0
def test_example():
    """Assigning an ifelse() expression adds a 'profit'/'loss' label column."""
    domestic = [4500, 2500, 1500, 4000]
    international = [2000, 5000, 1000, 4500]

    DT = dt.Frame(domestic_income=domestic,
                  internationaL_income=international)
    is_profit = f.domestic_income > f.internationaL_income
    DT["profit_loss"] = ifelse(is_profit, "profit", "loss")

    expected = dt.Frame(domestic_income=domestic,
                        internationaL_income=international,
                        profit_loss=["profit", "loss", "profit", "loss"])
    assert_equals(DT, expected)
def calcular_ganancia(data: Frame, votos: int) -> float:
    """Sum the per-row gain over the rows whose 'votos' reach the threshold.

    A row contributes 29250 when its 'clase_ternaria' equals 'BAJA+2' and
    -750 otherwise; only rows with 'votos' >= `votos` are counted.

    Parameters:
        data: frame with 'votos' and 'clase_ternaria' columns.
        votos: minimum value of 'votos' for a row to be counted.

    Returns:
        The dot product of the row mask and the per-row gains.
    """
    reaches_threshold = data[:, 'votos'].to_numpy().flatten() >= votos
    gain_per_row = data[:, ifelse(f.clase_ternaria == 'BAJA+2', 29250, -750)]
    return np.dot(reaches_threshold, gain_per_row.to_numpy().flatten())
Exemple #13
0
def test_condition_with_NAs():
    """A missing value in the condition must give a missing result."""
    frame = dt.Frame(A=[True, False, None], B=[5, 7, 9])
    selected = frame[:, ifelse(f.A, f.B, -f.B)]
    expected = dt.Frame([5, -7, None])
    assert_equals(selected, expected)
Exemple #14
0
def test_different_stypes():
    """Integer and float branches must combine into a float result."""
    frame = dt.Frame(A=[3], B=[7.1])
    selected = frame[:, ifelse(f.A > 0, f.A, f.B)]
    # The integer 3 is picked, but it is expected back as the float 3.0.
    expected = dt.Frame([3.0])
    assert_equals(selected, expected)
Exemple #15
0
def test_ifelse_with_scalars():
    """Both branches of ifelse() may be plain string scalars."""
    frame = dt.Frame(A=range(10))
    is_even = f.A % 2 == 0
    labels = frame[:, ifelse(is_even, "even", "odd")]
    expected = dt.Frame(["even", "odd"] * 5)
    assert_equals(labels, expected)
Exemple #16
0
def test_ifelse_simple():
    """An ifelse() expression can be assigned directly as a new column."""
    frame = dt.Frame(A=range(10))
    above_five = f.A > 5
    frame["B"] = ifelse(above_five, f.A - 5, f.A + 5)
    expected = dt.Frame(A=range(10), B=[5, 6, 7, 8, 9, 10, 1, 2, 3, 4])
    assert_equals(frame, expected)
"""Transform dataset features"""

# Map and create new features by adding new columns or with in-place update.
# For example, use for mapping multi-valued key to single column or
# any other types of map (row by row) transformations.
#
# Specification:
# Inputs:
#   X: datatable - primary dataset
# Parameters:
#   transformations: map - map with datatable transformation in the form of key: value pairs where key
#                    is new / existing column name and value is datatable expression for this column.
# Output:
#   dataset containing original and transformed features
from datatable import f, isna, ifelse

# Map of output column name -> datatable expression for that column.
# The string '\N' (backslash followed by N) is treated as the marker for a
# missing value in 'startYear' and 'endYear'.
# NOTE(review): `X` and `dt` are not defined in this snippet; presumably they
# are supplied by the environment that executes it - confirm.
transformations = {'title_with_type': f['primaryTitle'] + '-' + f['titleType'], # concatenate 2 columns
                   'startYear': ifelse(f['startYear']=='\\N', None, f['startYear']), # override empty value with NULL
                   'endYear': ifelse(f['endYear']=='\\N', None, f['endYear']), # override empty value with NULL in another column
                   'spanYears': ifelse((f['startYear']=='\\N') | (f['endYear']=='\\N'), 
                                       0, dt.int32(f['endYear']) - dt.int32(f['startYear'])) # compute the difference between two columns (0 when either year is missing)
                  } 

# Apply all transformations to X in place via dt.update().
X[:, dt.update(**transformations)]

# NOTE(review): a `return` at this level is only valid if the snippet is
# executed as a function body by its host - confirm.
return {"temp_to_delte": X}
# Seed NumPy's global generator so the random 'azar' column is reproducible.
np.random.seed(TRAIN_PARAMS['seed'])

if __name__ == '__main__':
    args = parser.parse_args()

    # Record the command-line choices alongside the other training parameters.
    TRAIN_PARAMS['binaria_especial'] = args.binaria_especial
    TRAIN_PARAMS['model'] = args.model
    TRAIN_PARAMS['file_data'] = f'../datasets/datos_fe_hist_v{args.version}.gz'

    dataset_original = fread(TRAIN_PARAMS['file_data'])

    # Keep only the months up to the configured upper bound.
    dataset = dataset_original[f.foto_mes <= TRAIN_PARAMS['max_foto_mes_entero'], :]

    # 'azar' is a uniform random draw per row; 'clase01' is 0 for 'CONTINUA'
    # rows and 1 for every other class.
    dataset['azar'] = np.random.uniform(size=dataset.shape[0])
    dataset['clase01'] = dataset[:, ifelse(f.clase_ternaria == 'CONTINUA', 0, 1)]

    # Columns that must not be used as features.
    excluidos = [f.clase_ternaria, f.target, f.azar, f.clase01]
    if args.binaria_especial:
        # Target is "anything but CONTINUA"; 'BAJA+2' rows get a marginally
        # larger weight than the rest.
        dataset['target'] = dataset[:, f.clase_ternaria != 'CONTINUA']
        dataset['weight'] = dataset[:, ifelse(f.clase_ternaria == 'BAJA+2', 1.0000001, 1)]
        excluidos.append(f.weight)
    else:
        dataset['target'] = dataset[:, f.clase_ternaria == 'BAJA+2']
    campos_buenos = f[:].remove(excluidos)

    # Training rows: months up to the training bound, keeping every row with
    # clase01 == 1 and only the rows with azar < 0.1 among the others.
    filas_train = (f.foto_mes <= TRAIN_PARAMS['max_foto_mes_train']) & ((f.clase01 == 1) | (f.azar < 0.1))
    X = dataset[filas_train, campos_buenos]
    y = dataset[filas_train, f.target]
    weights = dataset[filas_train, f.weight] if args.binaria_especial else None
Exemple #19
0
# any other types of map (row by row) transformations.
#
# Specification:
# Inputs:
#   X: datatable - primary dataset
# Parameters:
#   transformations: map - map with datatable transformation in the form of key: value pairs where key
#                    is new / existing column name and value is datatable expression for this column.
# Output:
#   dataset containing original and transformed features
from datatable import f, isna, ifelse

# Map of output column name -> datatable expression for that column.
# The string '\N' (backslash followed by N) is treated as the marker for a
# missing value in 'startYear' and 'endYear'.
# NOTE(review): `X` and `dt` are not defined in this snippet; presumably they
# are supplied by the environment that executes it - confirm.
transformations = {
    'title_with_type':
    f['primaryTitle'] + '-' + f['titleType'],  # concatenate 2 columns
    'startYear':
    ifelse(f['startYear'] == '\\N', None,
           f['startYear']),  # override empty value with NULL
    'endYear':
    ifelse(f['endYear'] == '\\N', None, f['endYear']),
    # override empty value with NULL in another column
    'spanYears':
    ifelse((f['startYear'] == '\\N') | (f['endYear'] == '\\N'), 0,
           dt.int32(f['endYear']) - dt.int32(f['startYear']))
    # compute the difference between two columns (0 when either year is missing)
}

# Apply all transformations to X in place via dt.update().
X[:, dt.update(**transformations)]

# NOTE(review): a `return` at this level is only valid if the snippet is
# executed as a function body by its host - confirm.
return {"temp_to_delte": X}
Exemple #20
0
    # NOTE(review): this is an excerpt; the enclosing block header and the
    # definitions of args / TRAIN_PARAMS are outside the visible source.
    TRAIN_PARAMS['file_data'] = f'../datasets/datos_fe_hist_v{args.version}.gz'

    dataset = fread(TRAIN_PARAMS['file_data'])
    # For each experiment, attach the 'prob' column of the first listed file
    # ending in 'stacking_apply.csv' as a new '<experimento>_prob' column.
    for experimento in args.experimentos:
        for file in os.listdir(f'../experimentos/{experimento}/'):
            if file.endswith('stacking_apply.csv'):
                stacking = fread(f'../experimentos/{experimento}/{file}')
                dataset[f'{experimento}_prob'] = stacking['prob']
                break

    # Rows of the Kaggle month, with the 'clase_ternaria' column removed.
    dapply_kaggle = dataset[f.foto_mes == TRAIN_PARAMS['foto_mes_kaggle'],
                            f[:].remove([f.clase_ternaria])]
    # Keep only the months up to the configured upper bound.
    dataset = dataset[f.foto_mes <= TRAIN_PARAMS['max_foto_mes_entero'], :]
    # 'azar' is a uniform random draw per row; 'clase01' is 0 for 'CONTINUA'
    # rows and 1 for every other class.
    dataset['azar'] = np.random.uniform(size=dataset.shape[0])
    dataset['clase01'] = dataset[:,
                                 ifelse(f.clase_ternaria == 'CONTINUA', 0, 1)]

    if args.binaria_especial:
        # Target is "anything but CONTINUA"; 'BAJA+2' rows get a marginally
        # larger weight than the rest.
        dataset['target'] = dataset[:, f.clase_ternaria != 'CONTINUA']
        dataset['weight'] = dataset[:,
                                    ifelse(f.clase_ternaria ==
                                           'BAJA+2', 1.0000001, 1)]
        campos_buenos = f[:].remove(
            [f.clase_ternaria, f.target, f.azar, f.clase01, f.weight])
    else:
        dataset['target'] = dataset[:, f.clase_ternaria == 'BAJA+2']
        campos_buenos = f[:].remove(
            [f.clase_ternaria, f.target, f.azar, f.clase01])

    # Training rows: months up to the training bound, keeping every row with
    # clase01 == 1 and only the rows with azar < 0.1 among the others.
    X = dataset[(f.foto_mes <= TRAIN_PARAMS['max_foto_mes_train']) &
                ((f.clase01 == 1) | (f.azar < 0.1)), campos_buenos]
Exemple #21
0
def test_ifelse_multi():
    """Multi-branch ifelse(): unmatched rows must fall through to the default."""
    frame = dt.Frame(A=['fox', 'cat', 'jay', 'cow'])
    score = ifelse(
        f.A == 'fox', 3,
        f.A == 'dog', 7,
        f.A == 'cow', 2,
        -1,
    )
    result = frame[:, score]
    # 'cat' and 'jay' match no branch, so they receive the default -1.
    expected = dt.Frame([3, -1, -1, 2])
    assert_equals(result, expected)