# Text columns that are vectorized; the same list is passed to make_counts
# and to both join_features calls below.
tfidf_columns = ["Title", "FullDescription", "LocationRaw"]

# NOTE: an earlier variant of this experiment (previously kept here as
# commented-out code) label-encoded the categorical columns Category,
# ContractTime, ContractType and SourceName via dio.get_le_features /
# dio.get_features and joined them onto the
# "%s_<split>_count_vector_matrix_max_f_200" matrices, casting the validation
# side to np.int64.  The current run joins no extra features.

# Tag that is embedded in the matrix names requested below.
short_id = "tfidf_200f_l2"
# `vectorizer`, `type_n` and `type_v` are defined earlier in the script.
dio.make_counts(vectorizer, short_id, tfidf_columns, type_n, type_v)

# No additional feature blocks are appended in this run.
extra_features = []
extra_valid_features = []

# The leading "%s" is deliberately left unformatted here; presumably
# join_features substitutes each entry of tfidf_columns -- confirm in DataIO.
features = dio.join_features("%s_" + type_n + "_" + short_id + "_matrix",
                             tfidf_columns,
                             extra_features)
validation_features = dio.join_features("%s_" + type_v + "_" + short_id + "_matrix",
                                        tfidf_columns,
                                        extra_valid_features)

# Quick sanity check of the value range of the training matrix.
# Parenthesised single-argument print behaves the same on Python 2 and is
# valid syntax on Python 3.
print(features.max())
print(features.min())

# Raw (log=False) training targets, cast to 64-bit integers.
salaries = dio.get_salaries(type_n, log=False).astype(np.int64)
# Validation targets are only loaded when not building a submission;
# `valid_salaries` is therefore undefined in submission mode.
if not submission:
    valid_salaries = dio.get_salaries(type_v, log=False)
# --- Example no. 2 ---
# 0
# Not read anywhere in this section; presumably a model hyper-parameter that
# is consumed further down the script.
min_samples_split = 2

# Submission runs work on the "*_full" splits, ordinary runs on the plain
# train/valid splits.
type_n, type_v = ("train_full", "valid_full") if submission else ("train", "valid")

# Text columns to vectorize and the tag used in the resulting matrix names
# (an earlier run used the tag "tfidf_200f_l1").
tfidf_columns = ["Title", "FullDescription", "LocationRaw"]
short_id = "tfidf_05df_stopwords"

# Presumably scikit-learn's TfidfVectorizer: English stop-word list, a
# document-frequency cap of 0.5 and sublinear term-frequency scaling.
vectorizer = TfidfVectorizer(stop_words='english',
                             max_df=0.5,
                             sublinear_tf=True)
dio.make_counts(vectorizer, short_id, tfidf_columns, type_n, type_v)

# Categorical columns added as extra features (a previous variant used only
# the first three).
columns = [
    "Category",
    "ContractTime",
    "ContractType",
    "Company",
    "SourceName",
]
# The label-encoder data always comes from "train_full", independent of the
# split selected above.
le_features = dio.get_le_features(columns, "train_full")
extra_features = dio.get_features(columns, type_n, le_features)
extra_valid_features = dio.get_features(columns, type_v, le_features)
#features = dio.join_features("%s_" + type_n + "_count_vector_matrix_max_f_200",
#["Title", "FullDescription", "LocationRaw"],
#extra_features)
#validation_features = dio.join_features("%s_" + type_v + "_count_vector_matrix_max_f_200",
#["Title", "FullDescription", "LocationRaw"],
#extra_valid_features)
features = dio.join_features("%s_" + type_n + "_" + short_id + "_matrix",
                             tfidf_columns,
                             extra_features,
# --- Example no. 3 ---
# 0
import joblib
import cloud
import os

# Text columns that are vectorized; reused by make_counts and join_features.
tfidf_columns = ["Title", "FullDescription", "LocationRaw"]
# Project data-access helper, configured from Settings.json.
dio = DataIO("Settings.json")

# Presumably scikit-learn's TfidfVectorizer: vocabulary capped at 200 terms,
# L1 normalisation, smoothed IDF weighting, no sublinear TF scaling.
vectorizer = TfidfVectorizer(max_features=200,
                             norm='l1',
                             smooth_idf=True,
                             sublinear_tf=False,
                             use_idf=True)
# Tag embedded in the matrix names requested below.
short_id = "tfidf_200f_l1"
type_n = "train"
type_v = "valid"
# Pass the split variables rather than repeating the literals, so the
# matrices that get generated always match the ones loaded below.
dio.make_counts(vectorizer, short_id, tfidf_columns, type_n, type_v)

# Categorical columns added as extra features.  The label-encoder data is
# taken from "train_full" regardless of the split being used.
columns = ["Category", "ContractTime", "ContractType"]
le_features = dio.get_le_features(columns, "train_full")
extra_features = dio.get_features(columns, type_n, le_features)
extra_valid_features = dio.get_features(columns, type_v, le_features)

# The leading "%s" stays unformatted; presumably join_features fills in each
# entry of tfidf_columns -- confirm in DataIO.
features = dio.join_features("%s_" + type_n + "_" + short_id + "_matrix",
                             tfidf_columns, extra_features)
validation_features = dio.join_features(
    "%s_" + type_v + "_" + short_id + "_matrix", tfidf_columns,
    extra_valid_features)

# Show the matrix shapes and ask for confirmation before continuing.
# Parenthesised single-argument print is identical on Python 2 and valid
# syntax on Python 3; raw_input itself is still the Python 2 builtin.
print(features.shape)
print(validation_features.shape)
run = raw_input("OK (Y/N)?")
print(run)
if run != "Y":
import os

# Columns holding free text; each is vectorized and later joined by name.
tfidf_columns = ["Title", "FullDescription", "LocationRaw"]
# Project I/O helper, set up from Settings.json.
dio = DataIO("Settings.json")

# Presumably scikit-learn's TfidfVectorizer.  Settings: at most 200 terms in
# the vocabulary, L1-normalised output, IDF weighting with smoothing, and
# plain (non-sublinear) term frequencies.
vectorizer = TfidfVectorizer(
    max_features=200,
    norm='l1',
    smooth_idf=True,
    sublinear_tf=False,
    use_idf=True
)
# Identifier that becomes part of the matrix names used below.
short_id = "tfidf_200f_l1"
type_n = "train"
type_v = "valid"
# Use type_n / type_v instead of hard-coded "train" / "valid" so that changing
# the split names above cannot desynchronise generation from loading.
dio.make_counts(vectorizer, short_id, tfidf_columns, type_n, type_v)

# Categorical columns supplied as extra features; the label-encoder data is
# always requested for "train_full", not for type_n.
columns = ["Category", "ContractTime", "ContractType"]
le_features = dio.get_le_features(columns, "train_full")
extra_features = dio.get_features(columns, type_n, le_features)
extra_valid_features = dio.get_features(columns, type_v, le_features)

# "%s" is intentionally left in the template; presumably join_features
# substitutes the column name -- verify against DataIO.
features = dio.join_features("%s_" + type_n + "_" + short_id + "_matrix",
                             tfidf_columns,
                             extra_features)
validation_features = dio.join_features("%s_" + type_v + "_" + short_id + "_matrix",
                                        tfidf_columns,
                                        extra_valid_features)

# Report both shapes, then prompt the operator and echo the answer.
# print(...) with one argument prints the same on Python 2 and also parses on
# Python 3; raw_input remains the Python 2 builtin.
print(features.shape)
print(validation_features.shape)
run = raw_input("OK (Y/N)?")
print(run)