Example #1
0
def log_mean_absolute_error(y_true, y_pred):
    """Return the mean absolute error of the exponentiated inputs.

    Both arguments are passed through ``np.exp`` and the results are handed
    to ``mean_absolute_error``.  Presumably the targets are log-transformed
    and this reports the error on the original scale -- confirm with callers.
    """
    actual = np.exp(y_true)
    predicted = np.exp(y_pred)
    return mean_absolute_error(actual, predicted)


# Directory layout, all derived from the settings file.
paths = get_paths("Settings_loc5.json")
data_dir = paths["data_path"]
cache_dir = path_join(data_dir, "tmp")
prediction_dir = path_join(data_dir, "predictions")


# Columns that go through label encoding.
names = [
    "Category", "ContractTime", "ContractType", "Loc1", "Loc2", "Loc3", "Loc4",
    "Loc5", "Company", "SourceName",
]

# One encoder per column, requested with the "train_full" file id.  This has to
# be a concrete list: it is zipped with ``names`` twice below, and a lazy
# Python 3 ``map`` object would already be exhausted the second time.
le_features = [
    label_encode_column_fit_only(
        column_name, file_id="train_full_data_path", type_n="train_full")
    for column_name in names
]

# Training features: each column transformed with its own encoder.  A list
# comprehension is used because tuple parameters in a lambda are a syntax
# error on Python 3, and ``features`` must be a real list for the ``append``
# calls that follow.
features = [
    label_encode_column_transform(
        encoder, column_name, file_id="train_data_path", type_n="train")
    for encoder, column_name in zip(le_features, names)
]

# ``len`` of every entry of the two text columns, appended as extra features.
description_length = [
    len(text) for text in read_column(paths["train_data_path"], "FullDescription")
]
title_length = [
    len(text) for text in read_column(paths["train_data_path"], "Title")
]

features.append(description_length)
features.append(title_length)


# Validation features: the same encoders applied with the "valid" file id.
validation_features = [
    label_encode_column_transform(
        encoder, column_name, file_id="valid_data_path", type_n="valid")
    for encoder, column_name in zip(le_features, names)
]

# Rebinds the two length variables to the validation file's columns.
description_length = [
    len(text) for text in read_column(paths["valid_data_path"], "FullDescription")
]
title_length = [
    len(text) for text in read_column(paths["valid_data_path"], "Title")
]
Example #2
0
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import AdaBoostRegressor
import joblib


def log_mean_absolute_error(y_true, y_pred):
    """Return the mean absolute error of the exponentiated inputs.

    Both arguments are passed through ``np.exp`` and the results are handed
    to ``mean_absolute_error``.  Presumably the targets are log-transformed
    and this reports the error on the original scale -- confirm with callers.
    """
    actual = np.exp(y_true)
    predicted = np.exp(y_pred)
    return mean_absolute_error(actual, predicted)


# Directory layout, all derived from the settings file.
paths = get_paths("Settings.json")
data_dir = paths["data_path"]
cache_dir = path_join(data_dir, "tmp")
prediction_dir = path_join(data_dir, "predictions")


def _fit_and_transform(column_name):
    """Fit a label encoder for ``column_name`` and then transform the column.

    Calls ``label_encode_column_fit`` followed by
    ``label_encode_column_transform`` with the encoder it returned, and
    returns ``(encoder, fit_result, transform_result)``.
    """
    encoder, fitted_values = label_encode_column_fit(column_name)
    transformed_values = label_encode_column_transform(encoder, column_name)
    return encoder, fitted_values, transformed_values


# Each column is fitted and then transformed before the next one is touched.
le_category, category_train, category_valid = _fit_and_transform("Category")
le_contractTime, contractTime_train, contractTime_valid = _fit_and_transform(
    "ContractTime")
le_contractType, contractType_train, contractType_valid = _fit_and_transform(
    "ContractType")


# Alternative matrix name noted alongside this call: train_tfidf_matrix_max_f_200
features = join_features(
    "%strain_count_vector_matrix_max_f_100",
    ["Title", "FullDescription", "LocationRaw"],
    data_dir,
    [contractTime_train, contractType_train, category_train],
)
validation_features = join_features("%svalid_count_vector_matrix_max_f_100",#valid_tfidf_matrix_max_f_200
                                    ["Title", "FullDescription", "LocationRaw"],
                                    data_dir,
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.feature_extraction.text import CountVectorizer
import joblib

def log_mean_absolute_error(y_true, y_pred):
    """Return the mean absolute error of the exponentiated inputs.

    Both arguments are passed through ``np.exp`` and the results are handed
    to ``mean_absolute_error``.  Presumably the targets are log-transformed
    and this reports the error on the original scale -- confirm with callers.
    """
    actual = np.exp(y_true)
    predicted = np.exp(y_pred)
    return mean_absolute_error(actual, predicted)


# Directory layout, all derived from the submission settings file.
paths = get_paths("Settings_submission.json")
data_dir = paths["data_path"]
cache_dir = path_join(data_dir, "tmp")
prediction_dir = path_join(data_dir, "predictions")


def _fit_and_transform(column_name):
    """Fit a label encoder for ``column_name`` and then transform the column.

    Calls ``label_encode_column_fit`` followed by
    ``label_encode_column_transform`` with the encoder it returned, and
    returns ``(encoder, fit_result, transform_result)``.
    """
    encoder, fitted_values = label_encode_column_fit(column_name)
    transformed_values = label_encode_column_transform(encoder, column_name)
    return encoder, fitted_values, transformed_values


# Each column is fitted and then transformed before the next one is touched.
le_category, category_train, category_valid = _fit_and_transform("Category")
le_contractTime, contractTime_train, contractTime_valid = _fit_and_transform(
    "ContractTime")
le_contractType, contractType_train, contractType_valid = _fit_and_transform(
    "ContractType")

# Alternative matrix name noted alongside this call:
# "%s_train_tfidf_matrix_max_f_200"
features = join_features(
    "%s_train_full_count_vector_matrix_max_f_200",
    ["Title", "FullDescription", "LocationRaw"],
    data_dir,
    [contractTime_train, contractType_train, category_train],
)
#for column_name in ["Title", "FullDescription", "LocationRaw"]:
    #vocabulary = joblib.load(path_join(cache_dir, column_name + "count_vectorizer_vocabulary"))
    #stop_words = joblib.load(path_join(cache_dir, column_name + "count_vectorizer_stop_words"))
Example #4
0
# Directory layout, all derived from the settings file.
paths = get_paths("Settings_loc5.json")
data_dir = paths["data_path"]
cache_dir = path_join(data_dir, "tmp")
prediction_dir = path_join(data_dir, "predictions")

# Columns that go through label encoding.
names = [
    "Category", "ContractTime", "ContractType", "Loc1", "Loc2", "Loc3", "Loc4",
    "Loc5", "Company", "SourceName"
]

# One encoder per column, requested with the "train_full" file id.  Kept as a
# concrete list because it is zipped with ``names`` twice below; a lazy
# Python 3 ``map`` object would be exhausted after the first pass.
le_features = [
    label_encode_column_fit_only(
        column_name, file_id="train_full_data_path", type_n="train_full")
    for column_name in names
]

# Training features: each column transformed with its own encoder.  Written as
# a comprehension because ``lambda (le, name): ...`` does not parse on
# Python 3, and ``features`` must be a list for the ``append`` calls below.
features = [
    label_encode_column_transform(
        encoder, column_name, file_id="train_data_path", type_n="train")
    for encoder, column_name in zip(le_features, names)
]

# ``len`` of every entry of the two text columns, appended as extra features.
description_length = [
    len(text)
    for text in read_column(paths["train_data_path"], "FullDescription")
]
title_length = [
    len(text) for text in read_column(paths["train_data_path"], "Title")
]

features.append(description_length)
features.append(title_length)

# Validation features: the same encoders applied with the "valid" file id.
validation_features = [
    label_encode_column_transform(
        encoder, column_name, file_id="valid_data_path", type_n="valid")
    for encoder, column_name in zip(le_features, names)
]