#print idx
    #i = i + 1
    #if i < 50:
        #print prej, "<=", salarie, "<", after

    #if not (prej <= salarie < after):
        #print "NI OK"
        #print prej, "<=", salarie, "<", after
        #break



# Disabled alternatives for preparing the target values: a binned encoding
# via encode_salaries, or an explicit np.log transform of both salary arrays.
#valid_salaries_enc = encode_salaries(valid_salaries, 4)
#salaries = np.log(salaries)
#valid_salaries = np.log(valid_salaries)
# Set the is_log flag on dio.
# NOTE(review): the np.log calls above are commented out, so presumably the
# salaries are already log-transformed when they reach this point -- confirm.
dio.is_log = True



# Debug output: dimensions of the training salary array.
print salaries.shape
# Short alias for the error metric exposed by dio.
metric = dio.error_metric


# Print the model name and construct a GridSearchCV over `pipeline` using the
# `parameters` grid.
# NOTE(review): only the beginning of this function is visible here. Nothing
# in the visible part fits or returns `grid_search`, and `params` is not used
# yet -- confirm against the full definition.
def make_grid_search(pipeline, parameters, model_name, params):
    print model_name
    # Search settings: 4 parallel jobs, verbose level 3, "f1" scoring, and a
    # refit once the search is done.
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=4, verbose=3,
                               #loss_func=f1_score,
                               scoring="f1",
                               iid=False,
                               refit=True)
    # Disabled hard-coded override of the model name.
    #model_name = "ExtraTree_min_sample2_10trees_gridcv_desc_log"
#after = bin_edges[idx]
#except IndexError as iex:
#print idx
#i = i + 1
#if i < 50:
#print prej, "<=", salarie, "<", after

#if not (prej <= salarie < after):
#print "NI OK"
#print prej, "<=", salarie, "<", after
#break

# Disabled alternatives for preparing the target values: a binned encoding
# via encode_salaries, or an explicit np.log transform of both salary arrays.
#valid_salaries_enc = encode_salaries(valid_salaries, 4)
#salaries = np.log(salaries)
#valid_salaries = np.log(valid_salaries)
# Set the is_log flag on dio.
# NOTE(review): the np.log calls above are commented out, so presumably the
# salaries are already log-transformed when they reach this point -- confirm.
dio.is_log = True

# Debug output: dimensions of the training salary array.
print salaries.shape
# Short alias for the error metric exposed by dio.
metric = dio.error_metric


def make_grid_search(pipeline, parameters, model_name, params):
    print model_name
    grid_search = GridSearchCV(
        pipeline,
        parameters,
        n_jobs=4,
        verbose=3,
        #loss_func=f1_score,
        scoring="f1",
        iid=False,
Beispiel #3
0
def tfidf_cloud(n_trees, min_samples_split=2):
    """Fit an ExtraTreesRegressor on cached features and score or submit it.

    Original run note: "Normal count vector with max 200. New submission
    which is repeatable and nicer", with the vectorizer given as::

        count_vector_titles = TfidfVectorizer(
            read_column(train_filename, column_name),
            max_features=200, norm='l1', smooth_idf=True,
            sublinear_tf=False, use_idf=True)

    The feature and salary arrays are read with joblib from files under
    ``/data``. In validation mode (``submission`` is False, the current
    setting) the predictions are compared with the validation salaries, the
    error is printed, and both model and predictions are saved through
    ``dio``. In submission mode the predictions are saved and a CSV
    submission is written instead.

    Args:
        n_trees: number of trees (``n_estimators``) of the regressor; also
            embedded in the saved model/prediction name.
        min_samples_split: ``min_samples_split`` of the regressor; also
            embedded in the name. Defaults to 2, the previously hard-coded
            value.

    Returns:
        None.
    """
    dio = DataIO("/data/Settings_cloud.json")
    submission = False
    # Only the validation-side label is used; it tags the saved predictions.
    type_v = "valid_full" if submission else "valid"

    # NOTE(review): judging by the commented-out code this replaced, the
    # cached arrays were built with dio.join_features(...) over the "Title",
    # "FullDescription" and "LocationRaw" columns and
    # dio.get_salaries(..., log=True) -- confirm before regenerating them.
    def load(filename):
        return joblib.load(path_join("/data", filename))

    features = load("train_200f_noNorm_categoryTimeType_tfidfl1_features_jl")
    validation_features = load(
        "train_200f_noNorm_categoryTimeType_tfidfl1_valid_features_jl")

    # Single-argument print calls produce the same output as the former
    # print statements and stay valid syntax on Python 3 as well.
    print("features %s" % (features.shape,))
    print("valid features %s" % (validation_features.shape,))

    salaries = load("train_200f_noNorm_categoryTimeType_tfidfl2_salaries_jl")
    # Validation salaries are only read in validation mode, so skip the load
    # when producing a submission.
    valid_salaries = None
    if not submission:
        # TODO: valid salaries are dumped incorrectly
        valid_salaries = load(
            "train_200f_noNorm_categoryTimeType_tfidfl2_valid_salaries_jl")
    dio.is_log = True

    print(salaries.shape)

    name = "ExtraTree_min_sample%d_%dtrees_200f_noNorm_categoryTimeType_tfidfl1_new_log" % (
        min_samples_split, n_trees)
    print(name)
    classifier = ExtraTreesRegressor(
        n_estimators=n_trees,
        verbose=2,
        n_jobs=4,  # 2 jobs on submission / 4 on valid test
        oob_score=False,
        min_samples_split=min_samples_split,
        random_state=3465343)  # fixed seed keeps runs repeatable

    classifier.fit(features, salaries)
    predictions = classifier.predict(validation_features)
    if submission:
        dio.save_prediction(name, predictions, type_n=type_v)
        dio.write_submission(name + ".csv", predictions=predictions)
    else:
        dio.compare_valid_pred(valid_salaries, predictions)
        metric = dio.error_metric
        mae = metric(valid_salaries, predictions)
        # Two spaces after the colon reproduce the old statement's output
        # (the literal's trailing space plus the item separator).
        print("MAE validation:  %s" % (mae,))
        dio.save_model(classifier, name, mae)
        dio.save_prediction(name, predictions, type_n=type_v)
def tfidf_cloud(n_trees, min_samples_split=2):
    """Fit an ExtraTreesRegressor on cached features and score or submit it.

    Original run note: "Normal count vector with max 200. New submission
    which is repeatable and nicer", with the vectorizer given as::

        count_vector_titles = TfidfVectorizer(
            read_column(train_filename, column_name),
            max_features=200, norm='l1', smooth_idf=True,
            sublinear_tf=False, use_idf=True)

    The feature and salary arrays are read with joblib from files under
    ``/data``. In validation mode (``submission`` is False, the current
    setting) the predictions are compared with the validation salaries, the
    error is printed, and both model and predictions are saved through
    ``dio``. In submission mode the predictions are saved and a CSV
    submission is written instead.

    Args:
        n_trees: number of trees (``n_estimators``) of the regressor; also
            embedded in the saved model/prediction name.
        min_samples_split: ``min_samples_split`` of the regressor; also
            embedded in the name. Defaults to 2, the previously hard-coded
            value.

    Returns:
        None.
    """
    dio = DataIO("/data/Settings_cloud.json")
    submission = False
    # Only the validation-side label is used; it tags the saved predictions.
    type_v = "valid_full" if submission else "valid"

    # NOTE(review): judging by the commented-out code this replaced, the
    # cached arrays were built with dio.join_features(...) over the "Title",
    # "FullDescription" and "LocationRaw" columns and
    # dio.get_salaries(..., log=True) -- confirm before regenerating them.
    def load(filename):
        return joblib.load(path_join("/data", filename))

    features = load("train_200f_noNorm_categoryTimeType_tfidfl1_features_jl")
    validation_features = load("train_200f_noNorm_categoryTimeType_tfidfl1_valid_features_jl")

    # Single-argument print calls produce the same output as the former
    # print statements and stay valid syntax on Python 3 as well.
    print("features %s" % (features.shape,))
    print("valid features %s" % (validation_features.shape,))

    salaries = load("train_200f_noNorm_categoryTimeType_tfidfl2_salaries_jl")
    # Validation salaries are only read in validation mode, so skip the load
    # when producing a submission.
    valid_salaries = None
    if not submission:
        # TODO: valid salaries are dumped incorrectly
        valid_salaries = load("train_200f_noNorm_categoryTimeType_tfidfl2_valid_salaries_jl")
    dio.is_log = True

    print(salaries.shape)

    name = "ExtraTree_min_sample%d_%dtrees_200f_noNorm_categoryTimeType_tfidfl1_new_log" % (min_samples_split, n_trees)
    print(name)
    classifier = ExtraTreesRegressor(n_estimators=n_trees,
                                    verbose=2,
                                    n_jobs=4, # 2 jobs on submission / 4 on valid test
                                    oob_score=False,
                                    min_samples_split=min_samples_split,
                                    random_state=3465343)  # fixed seed keeps runs repeatable

    classifier.fit(features, salaries)
    predictions = classifier.predict(validation_features)
    if submission:
        dio.save_prediction(name, predictions, type_n=type_v)
        dio.write_submission(name + ".csv", predictions=predictions)
    else:
        dio.compare_valid_pred(valid_salaries, predictions)
        metric = dio.error_metric
        mae = metric(valid_salaries, predictions)
        # Two spaces after the colon reproduce the old statement's output
        # (the literal's trailing space plus the item separator).
        print("MAE validation:  %s" % (mae,))
        dio.save_model(classifier, name, mae)
        dio.save_prediction(name, predictions, type_n=type_v)