# Text columns whose vectorized matrices are joined into the feature set.
tfidf_columns = ["Title", "FullDescription", "LocationRaw"]

# Disabled variant: count-vector matrices joined with extra features obtained
# through dio.get_le_features / dio.get_features.  Kept for reference only.
# NOTE(review): "Category" is listed twice in the disabled column list below
# -- confirm the intended columns before re-enabling.
#columns = ["Category", "ContractTime", "ContractType", "Category", "SourceName"]
#le_features = dio.get_le_features(columns, "train_full")
#extra_features = dio.get_features(columns, type_n, le_features)
#extra_valid_features = dio.get_features(columns, type_v, le_features)
#features = dio.join_features("%s_" + type_n + "_count_vector_matrix_max_f_200",
#["Title", "FullDescription", "LocationRaw"],
#extra_features)
#validation_features = dio.join_features("%s_" + type_v + "_count_vector_matrix_max_f_200",
#["Title", "FullDescription", "LocationRaw"],
#extra_valid_features).astype(np.int64)

# Identifier embedded in the matrix names requested from join_features below.
short_id = "tfidf_200f_l2"
# presumably produces/stores the per-column matrices for both datasets under
# short_id -- confirm against DataIO.make_counts.
dio.make_counts(vectorizer, short_id, tfidf_columns, type_n, type_v)

# This configuration joins no additional features to the text matrices.
extra_features = []
extra_valid_features = []

features = dio.join_features(
    "%s_" + type_n + "_" + short_id + "_matrix",
    tfidf_columns,
    extra_features
)
validation_features = dio.join_features(
    "%s_" + type_v + "_" + short_id + "_matrix",
    tfidf_columns,
    extra_valid_features
)

# Quick look at the value range of the training features.  A single-argument
# print(...) behaves the same as the Python 2 print statement.
print(features.max())
print(features.min())

# Training targets, requested with log=False and cast to int64.
salaries = dio.get_salaries(type_n, log=False).astype(np.int64)
# Validation targets are only fetched when this is not a submission run.
if not submission:
    valid_salaries = dio.get_salaries(type_v, log=False)
# NOTE(review): this fragment appears whitespace-collapsed (many statements on
# one physical line), and its last statement -- the dio.join_features(...) call
# -- is cut off after `extra_features,`.  The code is therefore left untouched.
# What the visible statements do, in order:
#   * set min_samples_split to 2;
#   * choose the dataset names: "train_full"/"valid_full" when `submission` is
#     truthy, otherwise "train"/"valid";
#   * create TfidfVectorizer(sublinear_tf=True, max_df=0.5,
#     stop_words='english') and pass it to dio.make_counts together with the
#     id "tfidf_05df_stopwords" and the Title/FullDescription/LocationRaw
#     columns;
#   * call dio.get_le_features on five columns with "train_full", then
#     dio.get_features for the training and validation datasets;
#   * begin a dio.join_features call on "%s_<type_n>_<short_id>_matrix".
# The '#'-prefixed pieces are disabled variants (a different short_id, a
# three-column list, and count-vector matrix names).
min_samples_split = 2 if submission: type_n = "train_full" type_v = "valid_full" else: type_n = "train" type_v = "valid" vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english') #short_id = "tfidf_200f_l1" short_id = "tfidf_05df_stopwords" tfidf_columns = ["Title", "FullDescription", "LocationRaw"] dio.make_counts(vectorizer, short_id, tfidf_columns, type_n, type_v) #columns = ["Category", "ContractTime", "ContractType"] columns = ["Category", "ContractTime", "ContractType", "Company", "SourceName"] le_features = dio.get_le_features(columns, "train_full") extra_features = dio.get_features(columns, type_n, le_features) extra_valid_features = dio.get_features(columns, type_v, le_features) #features = dio.join_features("%s_" + type_n + "_count_vector_matrix_max_f_200", #["Title", "FullDescription", "LocationRaw"], #extra_features) #validation_features = dio.join_features("%s_" + type_v + "_count_vector_matrix_max_f_200", #["Title", "FullDescription", "LocationRaw"], #extra_valid_features) features = dio.join_features("%s_" + type_n + "_" + short_id + "_matrix", tfidf_columns, extra_features,
# NOTE(review): this fragment appears whitespace-collapsed (many statements on
# one physical line), and it ends on an `if run != "Y":` header whose body is
# not visible.  The code is therefore left untouched.
# It uses Python 2 syntax (print statement, raw_input).
# What the visible statements do, in order:
#   * import joblib, cloud and os;
#   * create DataIO("Settings.json") and a TfidfVectorizer(max_features=200,
#     norm='l1', smooth_idf=True, sublinear_tf=False, use_idf=True);
#   * call dio.make_counts with the id "tfidf_200f_l1" and the literal dataset
#     names "train"/"valid" (the same values held by type_n/type_v);
#   * call dio.get_le_features on three columns with "train_full", then
#     dio.get_features for the training and validation datasets;
#   * join "%s_<type>_<short_id>_matrix" with those extras for both datasets
#     and print the two resulting shapes;
#   * prompt with "OK (Y/N)?", echo the answer, and test whether it differs
#     from "Y".
import joblib import cloud import os tfidf_columns = ["Title", "FullDescription", "LocationRaw"] dio = DataIO("Settings.json") vectorizer = TfidfVectorizer(max_features=200, norm='l1', smooth_idf=True, sublinear_tf=False, use_idf=True) short_id = "tfidf_200f_l1" type_n = "train" type_v = "valid" dio.make_counts(vectorizer, short_id, tfidf_columns, "train", "valid") columns = ["Category", "ContractTime", "ContractType"] le_features = dio.get_le_features(columns, "train_full") extra_features = dio.get_features(columns, type_n, le_features) extra_valid_features = dio.get_features(columns, type_v, le_features) features = dio.join_features("%s_" + type_n + "_" + short_id + "_matrix", tfidf_columns, extra_features) validation_features = dio.join_features( "%s_" + type_v + "_" + short_id + "_matrix", tfidf_columns, extra_valid_features) print features.shape print validation_features.shape run = raw_input("OK (Y/N)?") print run if run != "Y":
import os

# Text columns whose vectorized matrices are joined into the feature set.
tfidf_columns = ["Title", "FullDescription", "LocationRaw"]
dio = DataIO("Settings.json")

vectorizer = TfidfVectorizer(
    max_features=200,
    norm='l1',
    smooth_idf=True,
    sublinear_tf=False,
    use_idf=True
)
# Identifier embedded in the matrix names requested from join_features below.
short_id = "tfidf_200f_l1"

# Dataset names.  Every call below goes through these two variables so the
# split can be changed in one place.
type_n = "train"
type_v = "valid"

# Previously this call hard-coded "train"/"valid"; it now uses the variables
# so it cannot drift from the get_features/join_features calls.
dio.make_counts(vectorizer, short_id, tfidf_columns, type_n, type_v)

# Extra columns fed through get_le_features / get_features.  The
# get_le_features call always uses "train_full", independent of type_n.
columns = ["Category", "ContractTime", "ContractType"]
le_features = dio.get_le_features(columns, "train_full")
extra_features = dio.get_features(columns, type_n, le_features)
extra_valid_features = dio.get_features(columns, type_v, le_features)

features = dio.join_features(
    "%s_" + type_n + "_" + short_id + "_matrix",
    tfidf_columns,
    extra_features
)
validation_features = dio.join_features(
    "%s_" + type_v + "_" + short_id + "_matrix",
    tfidf_columns,
    extra_valid_features
)

# Show both shapes so the operator can sanity-check them before continuing.
# A single-argument print(...) behaves the same as the Python 2 print
# statement.
print(features.shape)
print(validation_features.shape)

# Manual confirmation gate; the answer is echoed back.
run = raw_input("OK (Y/N)?")
print(run)