def main():
    # Train a random forest on the full data set and persist it for later use.
    X, y, train_X, train_y, test_X, test_y = ml_utils.load_data(my_dir, 0)
    estimator = RandomForestClassifier(n_estimators=40, max_depth=5)
    estimator.fit(X, y)
    keys_model = model.Model(estimator, my_dir)
    keys_model.save(os.path.join(my_dir, "keys_model.pkl"))

def main(data, train, test=None):
    classes_to_score = [0, 1, 2, 3]
    train_text, train_labels, score_text, score_ids = load_data(
        data, train, test, classes_to_score, scoring=True)
    train_text, train_labels = fix_data(train_text, train_labels)
    score(train_text, train_labels, score_text, score_ids)

def main(data, train, test=None):
    classes_to_score = [0, 1, 2, 3, 4]
    train_text, train_labels, score_text, score_ids, timestamps = load_data(
        data, train, test, id_key='__index__', text_key='text', scoring=True)
    train_text, train_labels = fix_data(train_text, train_labels)
    score(train_text, train_labels, score_text, score_ids, timestamps, classes_to_score)

def main(data, train_data, test_data):
    train_text, train_labels, test_text, test_labels = load_data(
        data, train_data, test_data, text_key='article',
        id_key=['source', 'source_index'])
    analyze(train_text, train_labels, test_text, test_labels, threshold=.6)

def main(data, train_data, test_data):
    train_text, train_labels, test_text, test_labels = load_data(
        data, train_data, test_data, text_key='article',
        id_key=['source', 'source_index'])
    train(train_text, train_labels, test_text, test_labels)

def main(data, train_data, test_data):
    classes_to_analyze = [0, 1, 2, 3, 4]
    train_text, train_labels, test_text, test_labels = load_data(
        data, train_data, test_data, id_key='__index__', text_key='text')
    train_text, train_labels = fix_data(train_text, train_labels)
    test_text, test_labels = fix_data(test_text, test_labels)
    analyze(train_text, train_labels, test_text, test_labels, classes_to_analyze)

def main(data, train_data, test_data):
    CLASSES_TO_TRAIN = [0, 1, 2, 3, 4]

    # load the data
    train_text, train_labels, test_text, test_labels = load_data(
        data, train_data, test_data, id_key='__index__', text_key='text')

    # fix the data
    train_text, train_labels = fix_data(train_text, train_labels)
    test_text, test_labels = fix_data(test_text, test_labels)

    # train on the data
    train(train_text, train_labels, test_text, test_labels, CLASSES_TO_TRAIN)

def main():
    if len(sys.argv) < 2:
        print("Usage: {} <task directory>".format(sys.argv[0]))
        return
    task_dir = sys.argv[1]

    X, y, train_X, train_y, test_X, test_y = ml_utils.load_data(task_dir, TEST_FRACTION)
    curr_results_dir = os.path.join(task_dir, "results")

    # # Boosted Decision Tree
    # num_estimators_param = Parameter("Max Number of Estimators", "n_estimators", range(10, 111, 10))
    # base_estimators = [DecisionTreeClassifier(max_depth=d) for d in range(2, 6)]
    # estimator_depth_param = Parameter("Max Tree Depth", "base_estimator", base_estimators, range(1, 6))
    # boosted_algo = Algorithm("Boosted Decision Tree", [num_estimators_param, estimator_depth_param], AdaBoostClassifier())
    # analyze_algorithm(boosted_algo, curr_results_dir, train_X, train_y, X, y)
    # print()

    # Random Forest
    num_estimators_param = Parameter("Number of Estimators", "n_estimators", range(1, 121, 10))
    max_depth = Parameter("Max Tree Depth", "max_depth", range(1, 7))
    rf_algo = Algorithm("Random Forest", [num_estimators_param, max_depth], RandomForestClassifier())
    analyze_algorithm(rf_algo, curr_results_dir, train_X, train_y, X, y)
    print()

    # Naive Bayes
    bayes_algo = Algorithm("Naive Bayes", [], GaussianNB())
    analyze_algorithm(bayes_algo, curr_results_dir, train_X, train_y, X, y)
    print()

    # Support Vector Machine (Linear)
    c_param = Parameter("C Value", "C", [1e-2, 1e-1, 1e0, 1e1, 1e2], log_scale=True)
    linear_svm_algo = Algorithm("Linear Support Vector Machine", [c_param], svm.SVC(kernel="linear"))
    analyze_algorithm(linear_svm_algo, curr_results_dir, train_X, train_y, X, y)
    print()

    # Linear Discriminant Analysis
    lda_algo = Algorithm("Linear Discriminant Analysis", [], LinearDiscriminantAnalysis())
    analyze_algorithm(lda_algo, curr_results_dir, train_X, train_y, X, y)
    print()

    # Neural Network
    hidden_layer_param = Parameter("Number of Hidden Layers", "hidden_layer_sizes",
                                   vary_num_hidden_layers(30, 5), range(1, 6))
    alpha_param = Parameter("Regularization Strength (alpha)", "alpha",
                            [1e0, 1e-2, 1e-4, 1e-6, 1e-8], log_scale=True)
    nn_algo = Algorithm("Neural Network", [hidden_layer_param, alpha_param], MLPClassifier(max_iter=5000))
    analyze_algorithm(nn_algo, curr_results_dir, train_X, train_y, X, y)
    print()

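# NOTE: Parameter, Algorithm, and analyze_algorithm are project-local helpers that are not
# shown in this section. The sketch below is only an illustration of what an implementation
# consistent with the call sites above might look like -- it is not the project's actual code.
# It sweeps each hyper-parameter with sklearn's validation_curve; results_dir, train_X, and
# train_y are accepted only to match the call signature (the real helper presumably also
# writes plots or result files into results_dir).
from sklearn.model_selection import cross_val_score, validation_curve


class Parameter:
    """One hyper-parameter sweep: display name, estimator kwarg, and the values to try."""

    def __init__(self, display_name, param_name, values, display_values=None, log_scale=False):
        self.display_name = display_name
        self.param_name = param_name
        self.values = list(values)
        self.display_values = list(display_values) if display_values is not None else self.values
        self.log_scale = log_scale


class Algorithm:
    """An estimator plus the list of Parameter sweeps to analyze for it."""

    def __init__(self, name, parameters, estimator):
        self.name = name
        self.parameters = parameters
        self.estimator = estimator


def analyze_algorithm(algo, results_dir, train_X, train_y, X, y, cv=5):
    # Baseline cross-validated score with default hyper-parameters.
    scores = cross_val_score(algo.estimator, X, y, cv=cv)
    print("{}: CV accuracy {:.3f} (+/- {:.3f})".format(algo.name, scores.mean(), scores.std()))

    # One validation curve per swept parameter.
    for param in algo.parameters:
        train_scores, valid_scores = validation_curve(
            algo.estimator, X, y,
            param_name=param.param_name, param_range=param.values, cv=cv)
        for value, t, v in zip(param.display_values,
                               train_scores.mean(axis=1),
                               valid_scores.mean(axis=1)):
            print("  {}={}: train={:.3f}, valid={:.3f}".format(
                param.display_name, value, t, v))
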
def train(model_class, domain=None):
    # target_year, const_selected_feature, and mse are defined elsewhere in the module.
    cache_path = './cache'
    cache_file = os.path.join(cache_path, "{}_boosted_tree.pkl".format(target_year))
    if os.path.exists(cache_file):
        df = pickle.load(open(cache_file, 'rb'))
    else:
        df = load_data(target_year)
        pickle.dump(df, open(cache_file, 'wb'))

    if domain is not None:
        print("Finding important features in {}".format(domain))
        domain2category = pickle.load(open("domain2category.pkl", 'rb'))
        df = df.loc[df['main_category'].isin(domain2category[domain])]

    # Copy so the appends below do not mutate the module-level feature list.
    selected_feature = list(const_selected_feature)
    X = df[selected_feature].values

    # Binary-encode the venue column with the pre-fitted encoder.
    bde = joblib.load('category_encoder.pkl')
    # bde = ce.BinaryEncoder(cols=categorical_feature, return_df=False)
    venue_embedding = bde.transform(df[['venue']]).values
    # pca = PCA(n_components=32)
    # venue_embedding = pca.fit_transform(venue_embedding)
    X = np.hstack((X, venue_embedding))
    venue_embedding_size = venue_embedding.shape[1]
    print(venue_embedding_size)
    for i in range(venue_embedding_size):
        selected_feature.append('venue_' + str(i))

    # Embed the summaries with the pre-fitted NLP pipeline.
    nlp_pipeline = joblib.load('nlp_pipeline.pkl')
    embeddings = nlp_pipeline.transform(df['summary'].values)
    embeddings_size = embeddings.shape[1]
    print(embeddings_size)
    for i in range(embeddings_size):
        selected_feature.append('title_' + str(i))
    X = np.hstack((X, embeddings))

    # Clip the citation-count target and evaluate with 5-fold cross-validation.
    y = df['citationCount'].values
    y = np.clip(y, 0, 23)

    kf = KFold(n_splits=5)
    accuracies = []
    print(model_class)
    importance_stats = {}
    for train_index, test_index in tqdm(kf.split(y)):
        model = model_class(verbose=0)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        mse_score = mse(y_pred, y_test)
        accuracies.append(mse_score)

        # Accumulate per-feature importance across folds.
        feature_importance = model.get_feature_importance(prettified=True)
        for feature_id, score in feature_importance:
            if selected_feature[int(feature_id)] not in importance_stats:
                importance_stats[selected_feature[int(feature_id)]] = score
            else:
                importance_stats[selected_feature[int(feature_id)]] += score

    # Print the 30 features with the highest accumulated importance.
    limit = 0
    for key, value in sorted(importance_stats.items(), key=lambda item: item[1], reverse=True):
        print(key, " score: ", value)
        limit += 1
        if limit >= 30:
            break

    print("MSE: {} ({})".format(np.mean(accuracies), np.std(accuracies)))
    print("Sample : ")
    print(y_pred[:10])
    print(y_test[:10])

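# The train() function above expects a model class whose instances accept verbose=0 and
# provide fit/predict/get_feature_importance(prettified=True); CatBoost models expose that
# interface (the exact return type of get_feature_importance(prettified=True) depends on the
# catboost version). A hedged usage sketch only: a regressor is one plausible choice given
# the MSE metric, and the domain string below is illustrative, not taken from the project.
from catboost import CatBoostRegressor

if __name__ == '__main__':
    train(CatBoostRegressor)                        # all domains
    train(CatBoostRegressor, domain='Mathematics')  # restrict to one domain's categories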