Code example #1
def register_model_with_scores(name, acc, precision, recall, f1, fn):
    mod_name.append(name)
    accuracy.append(acc)
    precision_score.append(precision)
    recall_score.append(recall)
    f1_score.append(f1)
    false_negative.append(fn)
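# A minimal sketch of the module-level containers this helper assumes, plus a
# hypothetical call with made-up scores (list names taken from the snippet above):
mod_name, accuracy, precision_score, recall_score, f1_score, false_negative = \
    [], [], [], [], [], []
register_model_with_scores('baseline_cnn', 0.91, 0.89, 0.93, 0.91, 12)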
Code example #2
def plot_metrics(report, acc):
    accuracy = []
    precision = []
    recall = []
    f1_score = []

    for mod in range(len(modulations)):
        plt.grid(True)
        plt.title('{} Performance'.format(modulations[mod]))
        plt.ylabel('Performance')
        plt.xlabel('SNR [dB]')
        plt.xticks(np.arange(len(snr_list)), [str(snr_values[i]) for i in range(len(snr_list))])
        
        for item in report:
            precision.append(item[str(mod)]['precision'])
            recall.append(item[str(mod)]['recall'])
            f1_score.append(item[str(mod)]['f1-score'])

        for item in acc[mod]:
            accuracy.append(item)

        plt.plot(accuracy, label='accuracy', linewidth=4)
        plt.plot(precision, label='precision', linewidth=2)
        plt.plot(recall, label='recall', linewidth=2, linestyle='--', color='r')
        plt.plot(f1_score, label='f1-score', linewidth=2)
        plt.legend(loc='best')
        #plt.show()
        plt.savefig('{}_software_performance.png'.format(modulations[mod]), bbox_inches='tight', dpi=300)
        plt.clf()
        accuracy.clear()
        precision.clear()
        recall.clear()
        f1_score.clear()
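# plot_metrics relies on module-level names that are not part of the excerpt.
# A hypothetical setup (placeholder values only) under which it can be exercised:
import numpy as np
import matplotlib.pyplot as plt

modulations = ['BPSK', 'QPSK']           # class names, indexed 0..N-1
snr_values = list(range(-20, 20, 2))     # SNR points in dB for the x-axis labels
snr_list = snr_values                    # one tick position per SNR point
# report: one classification_report(..., output_dict=True) dict per SNR point,
# keyed by class index as a string; acc: one accuracy sequence per modulation,
# i.e. acc[mod][snr_index].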
Code example #3
def participant_analysis(big_table, tmp_list):

    means = []
    f1_score = []
    for i in tmp_list:
        tmp_value = big_table['actual_use_' + i] - big_table['Predictions_' +
                                                             i]
        tmp_val_add = big_table['actual_use_' + i] + big_table['Predictions_' +
                                                               i]
        tmp_abs = tmp_value.abs()
        means.append(tmp_abs.mean())
        tp = tmp_val_add[tmp_val_add > 1]
        tp = len(tp.tolist())
        tn = tmp_val_add[tmp_val_add < 1]
        tn = len(tn.tolist())
        fn = tmp_value[tmp_value > 0]
        fn = len(fn.tolist())
        fp = tmp_value[tmp_value < 0].abs()
        fp = len(fp.tolist())

        # Standard definitions: precision = TP / (TP + FP), recall = TP / (TP + FN).
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)

        f1 = 2.0 * precision * recall / (precision + recall)
        f1_score.append(f1)
    final_value = 1 - np.mean(means)
    final_f1 = np.mean(f1_score)
    print(final_f1)
    return final_value
Code example #4
def run_all_models(dataset, data_file, embedding, is_oversample):
    x_text, labels = load_data(data_file)

    precision = []
    recall = []
    f1_score = []

    for i in range(4):
        results = train(x_text, labels, MODEL_TYPES[i], embedding,
                        is_oversample)
        precision.append(results[2][0])
        recall.append(results[2][1])
        f1_score.append(results[2][2])

    plot_graph(precision, recall, f1_score, dataset, embedding)
Code example #5
def evaluate_model(model, X_test, Y_test, category_names):
    """
    Testing the model with test data and printing out the average precision and f1-score
    for all categories.

    Parameters
    ----------
    model : CLASS
        full NLP pipeline.
    X_test : array
        messages column.
    Y_test : array
        36 category labels.
    category_names : list
        names of the categories.

    Returns
    -------
    None.

    """
    #predict test values
    Y_pred = model.predict(X_test)
    f1_score = []
    precision = []
    classif = []
    recall_score = []
    #check for each classification column the precision and f1-score
    for n in range(Y_pred.shape[1]):
        s = classification_report(Y_test[:, n], Y_pred[:, n], output_dict=True)
        classif.append(category_names[n])
        precision.append(s['1.0']['precision'])
        f1_score.append(s['1.0']['f1-score'])
        recall_score.append(s['1.0']['recall'])
    #print result
    print('avg_precision:' + str(sum(precision) / len(precision)) +
          ', avg_f1-score:' + str(sum(f1_score) / len(f1_score)) +
          ', avg_recall-score:' + str(sum(recall_score) / len(recall_score)))

    #form a dataframe with the results and export it as excel file
    results = pd.DataFrame(classif, columns=['classifier'])
    results['precision'] = precision
    results['recall'] = recall_score
    results['f1-score'] = f1_score
    results['model'] = "DecisionTreeClassifier"
    results.to_excel("results.xlsx", index=False)
Code example #6
def pca_svm_time_score_compare():  
    ac_score=[]  
    p_score=[]  
    r_score=[]  
    f1_score=[]  
    tt=[]  
    stand=MinMaxScaler((20,30))  
    steps=numpy.arange(10,410,10)  
    for n in steps:  
        ac,p,r,f1,t=pca_svm(pca_n=n)  
        p_score.append(p)  
        f1_score.append(f1)  
        r_score.append(r)  
        ac_score.append(ac)  
        tt.append(t)  
    p_score_stand=stand.fit_transform(numpy.array(p_score).reshape((-1,1)))  
    r_score_stand=stand.fit_transform(numpy.array(r_score).reshape((-1,1)))  
    f1_score_stand=stand.fit_transform(numpy.array(f1_score).reshape((-1,1)))  
    ac_score_stand=stand.fit_transform(numpy.array(ac_score).reshape((-1,1)))  
    figure=pyplot.figure()  
      
      
    pyplot.subplot(2,1,1)  
    pyplot.scatter(steps,f1_score,label='f1-score',color='red',s=p_score_stand,alpha=0.7)  
    pyplot.scatter(steps,r_score,label='recall-score',color='blue',s=r_score_stand,alpha=0.7)  
    pyplot.scatter(steps,p_score,label='precision-score',color='yellow',s=f1_score_stand,alpha=0.7)  
    pyplot.scatter(steps,ac_score,label='accuracy-score',color='purple',s=ac_score_stand,alpha=0.7)  
    pyplot.xlabel('n-components')  
    pyplot.ylabel('score')  
    pyplot.legend()  
    pyplot.title('The Score Of SVM After PCA To N_components')  
    pyplot.subplot(2,1,2)  
    pyplot.plot(steps,tt,label='cost-time',color='black',marker='o')  
    # for i in range(len(tt)):  
        # pyplot.text(steps[i],ac_score[i],str(round(tt[i],1))+'s',fontdict=dict(size=10,weight='normal'))  
        # pyplot.plot([steps[i],steps[i]],[0,ac_score[i]],'--b')  
    pyplot.legend()  
    pyplot.xlabel('n-components')  
    pyplot.ylabel('time')  
    pyplot.show() 
Code example #7
    def train(self, algorithm):
        """Main training function. This is the entry for a training process in this class

        Arguments:
            algorithm {object} -- This is the instantiated algorithm object to call .fit()
                                  onto in order to get the model
        """

        kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
        f1_score = []
        precision_score = []
        recall_score = []
        for train, test in kfold.split(self.data_training, self.data_target):
            model = algorithm.fit(self.data_training.iloc[train],
                                  self.data_target.iloc[train])
            scores = self.score_model(model, self.data_training.iloc[test],
                                      self.data_target.iloc[test])
            f1_score.append(scores[0])
            precision_score.append(scores[1])
            recall_score.append(scores[2])

        self.print_results(f1_score, precision_score, recall_score)
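    # score_model and print_results are not shown in this excerpt. A hypothetical
    # score_model that returns (f1, precision, recall) in the order the loop above
    # unpacks them might look like this (macro averaging is an assumption, not
    # taken from the source):
    def score_model(self, model, features, target):
        from sklearn.metrics import f1_score, precision_score, recall_score
        predictions = model.predict(features)
        return (f1_score(target, predictions, average='macro'),
                precision_score(target, predictions, average='macro'),
                recall_score(target, predictions, average='macro'))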
Code example #8
File: accuracy.py  Project: Kinnera323/TagPrediction
            else:
                confusion_matrix[1][1]+=1

    if(confusion_matrix[0][0]+confusion_matrix[1][0]==0):
        precision_for_label_j=0
    else:
        precision_for_label_j = confusion_matrix[0][0]/float(confusion_matrix[0][0]+confusion_matrix[1][0]) 
    if(confusion_matrix[0][0]+confusion_matrix[0][1]==0):
        recall_for_label_j=0
    else:
        recall_for_label_j = confusion_matrix[0][0]/float(confusion_matrix[0][0]+confusion_matrix[0][1])

    precision.append(precision_for_label_j)
    recall.append(recall_for_label_j)
    if(precision_for_label_j + recall_for_label_j==0):
        f1_score.append(0)
    else:
        f1_score.append(2*precision_for_label_j*recall_for_label_j/float(precision_for_label_j+recall_for_label_j))
    confusion_matrix=[[0,0],[0,0]]



for i in range(0, len(precision)):
    print(Top_tags[i], precision[i], recall[i], f1_score[i])
    print("\n")


print(mean(precision))
print(mean(recall))
print(mean(f1_score))
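# The per-label metrics computed by hand above can be cross-checked with
# scikit-learn; a small sketch with hypothetical labels for one tag
# (1 = tag assigned, 0 = not assigned), not part of the original script:
from sklearn.metrics import precision_recall_fscore_support

y_true = [1, 0, 1, 1, 0, 0, 1, 0]
y_pred = [1, 0, 0, 1, 0, 1, 1, 0]
# zero_division=0 mirrors the manual guards against empty denominators above.
p, r, f1, _ = precision_recall_fscore_support(y_true, y_pred, labels=[1],
                                              average=None, zero_division=0)
print(p[0], r[0], f1[0])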
Code example #9
    classifiers = []
    accuracy = []
    f1_score = []

    # reformat
    max_accuracy = -1
    max_accuracy_classifier = ""
    max_f1_score = -1
    max_f1_score_classifier = ""
    for classifier in results.keys():
        classifiers.append(classifier)
        accuracy.append(results[classifier]["accuracy"])
        if (results[classifier]["accuracy"] > max_accuracy):
            max_accuracy_classifier = classifier
            max_accuracy = results[classifier]["accuracy"]
        f1_score.append(results[classifier]["f1-score"])
        if (results[classifier]["f1-score"] > max_f1_score):
            max_f1_score_classifier = classifier
            max_f1_score = results[classifier]["f1-score"]

    #
    # Sort
    #

    sorted_indices = np.argsort(accuracy)
    classifiers = np.array(classifiers)[sorted_indices]
    accuracy = np.array(accuracy)[sorted_indices]
    f1_score = np.array(f1_score)[sorted_indices]

    #
    # Print
Code example #10
    'max_depth': [2, 4, 6, 8],  #tree depths to check
    'colsample_bytree': stats.uniform(0.3, 0.7)  #btwn .1 and 1.0    
}
rand_search = RandomizedSearchCV(model,
                                 param_distributions=param_grid,
                                 scoring='f1_micro',
                                 n_iter=3,
                                 n_jobs=-1,
                                 verbose=10,
                                 cv=kfold)
rand_result = rand_search.fit(X_train, y_train)
print("Best: %f using %s" %
      (rand_result.best_score_, rand_result.best_params_))
best_XGB_parameters = rand_result.best_estimator_
#INSERT CITY NAME FOR .DAT FILE
pickle.dump(best_XGB_parameters, open("xgb_RICHMOND.pickle.dat",
                                      'wb'))  #change pickle

# In[6]:

#test on test set
best_XGB_parameters.fit(X_train, y_train)
preds = best_XGB_parameters.predict(X_test)
f1score = f1_score(y_test, preds, average='micro')
#CSV append best score after test set
f1_score = []
f1_score.append(('Richmond', f1score))
export_df = pd.DataFrame(f1_score)
#change csv name
export_df.to_csv("Richmond_results.dat", index=False, header=False)
Code example #11
                             input_x,
                             input_y,
                             cv=5,
                             scoring=[
                                 'accuracy', 'precision_macro', 'recall_macro',
                                 'f1_macro', 'roc_auc', 'average_precision'
                             ],
                             return_train_score=True,
                             return_estimator=True)
    # append performance index
    auroc.append(outputs['test_roc_auc'])
    auprc.append(outputs['test_average_precision'])
    acc.append(outputs['test_accuracy'])
    precision.append(outputs['test_precision_macro'])
    recall.append(outputs['test_recall_macro'])
    f1_score.append(outputs['test_f1_macro'])

# Build a CSV of how many times each feature was selected
test = dict()
for f in vital:
    test[f] = 0
for i in list(test.keys()):
    for k in s:
        if k == i:
            test[i] += 1

for i in list(test.keys()):
    test[i] = [test[i]]

# Compute the mean odds ratio
odds_dict = dict()
Code example #12
model = XGBClassifier(nthread = n_threads) #or -1
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=random_seed)
param_grid = {'n_estimators': [120, 240, 360, 480], #random int btwn 100 and 500 - removed
              'learning_rate': stats.uniform(0.01, 0.08), #.01 + loc, range of .01+/-.08
              'max_depth': [2, 4, 6, 8], #tree depths to check
              'colsample_bytree': stats.uniform(0.3, 0.7) #btwn .1 and 1.0    
}
rand_search = RandomizedSearchCV(model, param_distributions = param_grid, scoring = 'f1_micro', n_iter = 3, n_jobs=-1, verbose = 10, cv=kfold)
rand_result = rand_search.fit(X_train, y_train)
print("Best: %f using %s" % (rand_result.best_score_, rand_result.best_params_))
best_XGB_parameters = rand_result.best_estimator_
#INSERT CITY NAME FOR .DAT FILE
pickle.dump(best_XGB_parameters, open("xgb_CENTRAL.pickle.dat", 'wb')) #change pickle


# In[6]:


#test on test set
best_XGB_parameters.fit(X_train, y_train)
preds = best_XGB_parameters.predict(X_test)
f1score = f1_score(y_test, preds, average = 'micro')
#CSV append best score after test set
f1_score = []
f1_score.append(('Central', f1score))
export_df = pd.DataFrame(f1_score)
#change csv name
export_df.to_csv("Central_results.dat", index = False, header = False)

Code example #13
	def crossValidationStrategy(self, reviews_df_preprocessed, do_pickle):
		print('\nK-Fold Cross Validation Strategy...\n')

		train_test_indices, X, y = self.kFoldSplit(reviews_df_preprocessed)

		accuracy = []
		precision = []
		recall = []
		f1 = []
		roc_auc = []
		cm = []

		for i in range(0, len(self.clf)):
			accuracy.append([])
			precision.append([])
			recall.append([])
			f1.append([])
			roc_auc.append([])
			cm.append(np.zeros((2,2), dtype = 'int32'))

		for train_idx, test_idx in train_test_indices:
			X_train, y_train = X[train_idx], y[train_idx]
			X_test, y_test = X[test_idx], y[test_idx]

			_, model = self.trainData(X_train, y_train, self.clf)
			prediction = self.predictData(X_test, model)
			clf_accuracy, clf_precision, clf_recall, clf_f1, clf_roc_auc, clf_cm, _ = self.evaluate(y_test, prediction)

			for j in range(0, len(self.clf)):
				accuracy[j].append(clf_accuracy[j])
				precision[j].append(clf_precision[j])
				recall[j].append(clf_recall[j])
				f1[j].append(clf_f1[j])
				roc_auc[j].append(clf_roc_auc[j])
				cm[j] += clf_cm[j]

		acc = []
		prec = []
		rec = []
		f1_score = []
		auc = []
		for i in range(0, len(self.clf)):
			if i == 0:
				print('======================================================\n')
			print('Evaluation metrics of Classifier ' + self.clf_names[i] + ':')
			print('Accuracy: {}'.format(np.mean(accuracy[i])))
			print('Precision: {}'.format(np.mean(precision[i])))
			print('Recall: {}'.format(np.mean(recall[i])))
			print('F1-score: {}'.format(np.mean(f1[i])))
			print('ROC AUC: {}'.format(np.mean(roc_auc[i])))
			print('Confusion Matrix: \n{}\n'.format(cm[i]))
			print('======================================================\n')
			acc.append(np.mean(accuracy[i]))
			prec.append(np.mean(precision[i]))
			rec.append(np.mean(recall[i]))
			f1_score.append(np.mean(f1[i]))
			auc.append(np.mean(roc_auc[i]))

		# Report the cross-fold means computed above rather than the last fold's scores.
		metrics_list = {
			'Classifier': self.clf_names,
			'Accuracy': acc,
			'Precision': prec,
			'Recall': rec,
			'F1-score': f1_score,
			'ROC AUC': auc
		}

		metrics_df = pd.DataFrame.from_dict(metrics_list)

		print('Comparison of different metrics for the various Classifiers used:\n')
		print(metrics_df)

		if do_pickle:
			with open('pickled/metrics_dataframe_kfold.pickle', 'wb') as df_kfold:
				pickle.dump(metrics_df, df_kfold)
Code example #14
    'max_depth': [2, 4, 6, 8],  #tree depths to check
    'colsample_bytree': stats.uniform(0.3, 0.7)  #btwn .1 and 1.0    
}
rand_search = RandomizedSearchCV(model,
                                 param_distributions=param_grid,
                                 scoring='f1_micro',
                                 n_iter=3,
                                 n_jobs=-1,
                                 verbose=10,
                                 cv=kfold)
rand_result = rand_search.fit(X_train, y_train)
print("Best: %f using %s" %
      (rand_result.best_score_, rand_result.best_params_))
best_XGB_parameters = rand_result.best_estimator_
#INSERT CITY NAME FOR .DAT FILE
pickle.dump(best_XGB_parameters, open("xgb_MISSION.pickle.dat",
                                      'wb'))  #change pickle

# In[6]:

#test on test set
best_XGB_parameters.fit(X_train, y_train)
preds = best_XGB_parameters.predict(X_test)
f1score = f1_score(y_test, preds, average='micro')
#CSV append best score after test set
f1_score = []
f1_score.append(('Mission', f1score))
export_df = pd.DataFrame(f1_score)
#change csv name
export_df.to_csv("Mission_results.dat", index=False, header=False)
Code example #15
 
     y_pred = model.predict(X)
     cm_train = confusion_matrix(y, y_pred)
     #print(names)
     report = classification_report(y, y_pred, output_dict=True)
     #sccurate = accuracy_score(y_train, y_pred)
     #print(sccurate)
     
     macro_precision =  report['macro avg']['precision'] 
     macro_recall = report['macro avg']['recall']    
     macro_f1 = report['macro avg']['f1-score']
     
     accuracy = report['accuracy']
     print('macro precision:', macro_precision)
     #f1_score1 = (f1_score(y, y_pred))
     f1_score.append(macro_f1)
     
     #precision_score1 = (precision_score(y, y_pred))
     precision_score.append(macro_precision)
     
     #recall_score1 = (recall_score(y, y_pred)) 
     recall_score.append(macro_recall)
     
     #acc = accuracy_score(y, y_pred)
     accuracy_score.append(accuracy)
     
     sensitivity1 =  cm_train[1,1]/(cm_train[1,0]+cm_train[1,1])								
     #print(sensitivity)
     #0.6666666666666666
     sensitivity.append(sensitivity1)
     
Code example #16
#INSERT CITY NAME FOR .DAT FILE
pickle.dump(best_XGB_parameters, open("xgb_CITY.pickle.dat",
                                      'wb'))  #change pickle

# In[109]:

#test on test set
best_XGB_parameters.fit(X_train, y_train)

# In[111]:

preds = best_XGB_parameters.predict(X_test)
f1score = f1_score(y_test, preds, average='micro')
#CSV append best score after test set
f1_score = []
f1_score.append(('City', f1score))
export_df = pd.DataFrame(f1_score)
#change csv name
export_df.to_csv("cityresults.dat", index=False, header=False)

# In[156]:

#add code for graphing each parameter from cv search...
#https://stackoverflow.com/questions/42793254/what-replaces-gridsearchcv-grid-scores-in-scikit

param_scores = rand_result.cv_results_['mean_test_score']

# In[165]:

param_list = rand_result.cv_results_['params']
Code example #17
    'max_depth': [2, 4, 6, 8],  #tree depths to check
    'colsample_bytree': stats.uniform(0.3, 0.7)  #btwn .1 and 1.0    
}
rand_search = RandomizedSearchCV(model,
                                 param_distributions=param_grid,
                                 scoring='f1_micro',
                                 n_iter=3,
                                 n_jobs=-1,
                                 verbose=10,
                                 cv=kfold)
rand_result = rand_search.fit(X_train, y_train)
print("Best: %f using %s" %
      (rand_result.best_score_, rand_result.best_params_))
best_XGB_parameters = rand_result.best_estimator_
#INSERT CITY NAME FOR .DAT FILE
pickle.dump(best_XGB_parameters, open("xgb_INGLESIDE.pickle.dat",
                                      'wb'))  #change pickle

# In[16]:

#test on test set
best_XGB_parameters.fit(X_train, y_train)
preds = best_XGB_parameters.predict(X_test)
f1score = f1_score(y_test, preds, average='micro')
#CSV append best score after test set
f1_score = []
f1_score.append(('Ingleside', f1score))
export_df = pd.DataFrame(f1_score)
#change csv name
export_df.to_csv("Ingleside_results.dat", index=False, header=False)
Code example #18
sns.set_style("whitegrid")
msplt = sns.barplot(x = "model", y = "f1_score", data=mscore_df)
_ = plt.xlabel('Model')
_ = plt.ylabel('f1 score')
_ = plt.savefig('model_selection_bar')
_ = plt.show()


# In[71]:


#test on test set here.
#XGBoost non one-hot-encoded was the best model but only very slightly, XGB one-hot-encoded alone without
#feature reduction performed nearly as well and was much more efficient in time computations, thus I will use
#XGB w/o feature selection and with one-hot-encoding for my final modelling on test set as well as in other
#districts and whole city
#train the best estimator on the train set again then test on test set for results
best_XGB_only_estimator.fit(X_train, y_train)
preds = best_XGB_only_estimator.predict(X_test)
f1score = f1_score(y_test, preds, average = 'micro')


# In[72]:


f1_score = []
f1_score.append(('Bayview', f1score))
export_df = pd.DataFrame(f1_score)
export_df.to_csv("bayviewresults.dat", index = False, header = False)

Code example #19
    'max_depth': [2, 4, 6, 8],  #tree depths to check
    'colsample_bytree': stats.uniform(0.3, 0.7)  #btwn .1 and 1.0    
}
rand_search = RandomizedSearchCV(model,
                                 param_distributions=param_grid,
                                 scoring='f1_micro',
                                 n_iter=3,
                                 n_jobs=-1,
                                 verbose=10,
                                 cv=kfold)
rand_result = rand_search.fit(X_train, y_train)
print("Best: %f using %s" %
      (rand_result.best_score_, rand_result.best_params_))
best_XGB_parameters = rand_result.best_estimator_
#INSERT CITY NAME FOR .DAT FILE
pickle.dump(best_XGB_parameters, open("xgb_NORTHERN.pickle.dat",
                                      'wb'))  #change pickle

# In[6]:

#test on test set
best_XGB_parameters.fit(X_train, y_train)
preds = best_XGB_parameters.predict(X_test)
f1score = f1_score(y_test, preds, average='micro')
#CSV append best score after test set
f1_score = []
f1_score.append(('Northern', f1score))
export_df = pd.DataFrame(f1_score)
#change csv name
export_df.to_csv("Northern_results.dat", index=False, header=False)
Code example #20
# Per-architecture histories for the training and test sets
# (the loop below appends to all four).
loss = []
f1_score = []
loss_test = []
f1_score_test = []

neural = []

for i in range(20):

    mlp = sklearn.neural_network.MLPClassifier(activation='logistic',
                                               hidden_layer_sizes=(i + 1),
                                               max_iter=2000)
    mlp.fit(x_train, y_train)

    loss.append(mlp.loss_)
    f1_score.append(
        sklearn.metrics.f1_score(y_train,
                                 mlp.predict(x_train),
                                 average='macro'))

    loss_test.append(mlp.loss_)
    f1_score_test.append(
        sklearn.metrics.f1_score(y_test, mlp.predict(x_test), average='macro'))

    neural.append(i + 1)

print('Loss', mlp.loss_)
print('F1',
      sklearn.metrics.f1_score(y_test, mlp.predict(x_test), average='macro'))

# In[7]:

loss = np.array(loss)
Code example #21
def predict_result(list_of_random_items):
    def xgboost_hot_cold(date, Data):

        pd.options.mode.chained_assignment = None  # default='warn'

        contents = pd.DataFrame(Data,
                                columns=[
                                    'PAYMENT', 'PROGRAM_TYPE', 'New_Contents',
                                    'genre_Label', 'target_age', 'playtime',
                                    'channel_Label', 'contentnumber',
                                    'episode_count', 'past_view'
                                ])
        x_train = contents[contents['New_Contents'] == 0]
        x_train.loc[:, 'PROGRAM_TYPE'] = round(x_train.loc[:, 'PROGRAM_TYPE'])
        x_train = x_train[x_train['PROGRAM_TYPE'] ==
                          x_train.PROGRAM_TYPE.unique()[0]]
        x_train.contentnumber = x_train.contentnumber.fillna(0)
        x_train.episode_count = x_train.episode_count.fillna(1)
        x_train = x_train.drop('New_Contents', axis=1)
        x_train = x_train.drop('PROGRAM_TYPE', axis=1)
        x_train = x_train.values
        x_train = x_train.astype('float32')
        scaler = MinMaxScaler(feature_range=(0, 1))
        x_train = scaler.fit_transform(x_train)

        x_test = contents[contents['New_Contents'] == 1]
        x_test = x_test[x_test['PROGRAM_TYPE'] == x_test.PROGRAM_TYPE.unique()
                        [0]]
        x_test.contentnumber = x_test.contentnumber.fillna(0)
        x_test.episode_count = x_test.episode_count.fillna(1)
        x_test = x_test.drop('New_Contents', axis=1)
        x_test = x_test.drop('PROGRAM_TYPE', axis=1)
        x_test = x_test.values
        x_test = x_test.astype('float32')
        x_test = scaler.transform(x_test)

        contents = pd.DataFrame(
            Data, columns=['PROGRAM_TYPE', 'New_Contents', 'ViewCount'])
        y_train = contents[contents['New_Contents'] == 0]
        y_train = y_train.drop('New_Contents', axis=1)
        y_train.loc[:, 'PROGRAM_TYPE'] = round(y_train.loc[:, 'PROGRAM_TYPE'])
        y_train = y_train[y_train['PROGRAM_TYPE'] ==
                          y_train.PROGRAM_TYPE.unique()[0]]
        y_train = y_train.drop('PROGRAM_TYPE', axis=1)
        y_train = y_train.values
        y_train = y_train.astype('float32')

        y_test = contents[contents['New_Contents'] == 1]
        y_test = y_test[y_test['PROGRAM_TYPE'] == y_test.PROGRAM_TYPE.unique()
                        [0]]
        y_test = y_test.drop('PROGRAM_TYPE', axis=1)
        y_test = y_test.drop('New_Contents', axis=1)
        y_test = y_test.values

        import xgboost as xgb

        xgb = xgb.XGBRegressor(colsample_bytree=1,
                               learning_rate=0.4,
                               n_estimators=1000,
                               max_depth=8,
                               min_child_weight=1,
                               max_delta_step=2.5,
                               gamma=1.0,
                               subsample=0.8,
                               objective='reg:linear',
                               n_jobs=8,
                               scale_pos_weight=1.8,
                               random_state=27,
                               base_score=0.5)
        xgb.fit(x_train, y_train)
        xgb_preds = xgb.predict(x_test)

        for i in range(xgb_preds.shape[0]):
            if xgb_preds[i] < 1:
                xgb_preds[i] = 1
            else:
                xgb_preds[i] = int(xgb_preds[i])

        for i in range(xgb_preds.shape[0]):
            xgb_preds[i] = round(xgb_preds[i])

        contents = pd.DataFrame(Data,
                                columns=[
                                    'EPISODE', 'PAYMENT', 'PROGRAM_TYPE',
                                    'ViewCount', 'New_Contents', 'genre_Label',
                                    'target_age', 'playtime', 'channel_Label',
                                    'contentnumber', 'episode_count',
                                    'past_view'
                                ])
        x_test = contents[contents['New_Contents'] == 1]
        x_test = x_test.drop('ViewCount', axis=1)
        x_test = x_test[x_test['PROGRAM_TYPE'] == x_test.PROGRAM_TYPE.unique()
                        [0]]

        xgb_preds = pd.DataFrame(xgb_preds, columns=['ViewCount'])

        xgb_preds.index = x_test.index

        H_C = pd.concat([x_test, xgb_preds], axis=1)

        old = contents[contents['New_Contents'] == 0]
        old = old[old['PROGRAM_TYPE'] == old.PROGRAM_TYPE.unique()[0]]

        H_C = H_C[[
            'EPISODE', 'PAYMENT', 'PROGRAM_TYPE', 'ViewCount', 'New_Contents',
            'genre_Label', 'target_age', 'playtime', 'channel_Label',
            'contentnumber', 'episode_count', 'past_view'
        ]]

        yhat = pd.concat([old, H_C], axis=0)

        yhat = yhat.sort_values(by=['ViewCount'], ascending=False)

        values = yhat.values

        new_index = yhat[yhat['New_Contents'] == 1].index
        Hot_index = round(yhat.shape[0] / 5)
        yhat.index = range(0, len(yhat))

        HC = []
        for i in range(yhat.shape[0]):
            if yhat.index[i] < (Hot_index + 1):
                HC.append('HOT')
            else:
                HC.append('COLD')

        HC = pd.DataFrame(HC, columns=['H&C'])
        yhat = pd.concat([yhat, HC], axis=1)

        yhat = yhat[yhat['New_Contents'] == 1]
        yhat.index = new_index
        yhat = yhat.sort_index(ascending=True)

        yhat = yhat['H&C'].values

        actual = pd.DataFrame(Data,
                              columns=[
                                  'PAYMENT', 'PROGRAM_TYPE', 'New_Contents',
                                  'H&C', 'genre_Label', 'target_age',
                                  'playtime', 'channel_Label', 'contentnumber',
                                  'past_view'
                              ])
        actual = actual[actual['New_Contents'] == 1]
        actual = actual[actual['PROGRAM_TYPE'] == actual.PROGRAM_TYPE.unique()
                        [0]]
        actual = actual['H&C'].values

        from sklearn.metrics import recall_score, precision_score, f1_score
        recall_scores = recall_score(actual,
                                     yhat,
                                     average='macro',
                                     labels=['HOT'])
        recall_score_cold = recall_score(actual,
                                         yhat,
                                         average='macro',
                                         labels=['COLD'])
        precision_scores = precision_score(actual,
                                           yhat,
                                           average='macro',
                                           labels=['HOT'])
        precision_score_cold = precision_score(actual,
                                               yhat,
                                               average='macro',
                                               labels=['COLD'])
        f_score = f1_score(actual, yhat, average='macro', labels=['HOT'])
        f_score_cold = f1_score(actual, yhat, average='macro', labels=['COLD'])

        return f_score, f_score_cold, recall_scores, precision_scores, recall_score_cold, precision_score_cold

    precision_cold = []
    precision = []
    recall_cold = []
    recall = []
    f1_score = []
    f1_score_colds = []

    for i in tqdm(list_of_random_items):

        db = pymysql.connect(
            host='gcsdbinstance.cu0nuaw6yxna.us-east-1.rds.amazonaws.com',
            port=3306,
            user='******',
            passwd='awsg1020*',
            db='gcs_database',
            charset='utf8')
        cursor = db.cursor()
        sql = "SELECT * FROM gcs_database." + str(i)
        cursor.execute(sql)
        Data = pd.read_sql(sql, db)
        print('Date: ', i)

        f_score, f_score_cold, recall_scores, precision_scores, recall_score_cold, precision_score_cold = xgboost_hot_cold(
            i, Data)
        print('Precision-Score: ', precision_scores)
        print('Recall-Score: ', recall_scores)
        print('F1-Score: ', f_score)

        precision_cold.append(precision_score_cold)
        precision.append(precision_scores)
        recall_cold.append(recall_score_cold)
        recall.append(recall_scores)
        f1_score.append(f_score)
        f1_score_colds.append(f_score_cold)

    print('View pattern classification Precision - New_HOT : ' +
          str(np.mean(precision)))
    print('View pattern classification Recall - New_HOT : ' +
          str(np.mean(recall)))
    print('View pattern classification F1-Score - New_HOT : ' +
          str(np.mean(f1_score)))
    print('View pattern classification Precision - New_COLD : ' +
          str(np.mean(precision_cold)))
    print('View pattern classification Recall - New_COLD : ' +
          str(np.mean(recall_cold)))
    print('View pattern classification F1-Score - New_COLD : ' +
          str(np.mean(f1_score_colds)))
    return 'F1-score : ' + str(np.mean(f1_score)), np.mean(f1_score)
Code example #22
        XGBClassifier_result,
        GaussianNB_result,
        KNN_result,
        Category_boost,
        voting]

# +
roc_score = []
f1_score = []

for i in model:
    
    roc_mean = pd.Series(i['test_ROC-AUC']).mean()
    f1_mean = pd.Series(i['test_f1']).mean()
    roc_score.append(roc_mean)
    f1_score.append(f1_mean)
    
print('roc_score:', roc_score)
print('f1_score:', f1_score)
    
    
# -

model_name = ['LGBMClassifier_result',
        'DecisionTreeClassifier_result',
        'LogisticRegression_result',
        'RandomForestClassifier_result',
        'GradientBoostingClassifier_result',
        'XGBClassifier_result',
        'GaussianNB_result',
        'KNN_result',
Code example #23
model = XGBClassifier(nthread = n_threads) #or -1
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=random_seed)
param_grid = {'n_estimators': [120, 240, 360, 480], #random int btwn 100 and 500 - removed
              'learning_rate': stats.uniform(0.01, 0.08), #.01 + loc, range of .01+/-.08
              'max_depth': [2, 4, 6, 8], #tree depths to check
              'colsample_bytree': stats.uniform(0.3, 0.7) #btwn .1 and 1.0    
}
rand_search = RandomizedSearchCV(model, param_distributions = param_grid, scoring = 'f1_micro', n_iter = 3, n_jobs=-1, verbose = 10, cv=kfold)
rand_result = rand_search.fit(X_train, y_train)
print("Best: %f using %s" % (rand_result.best_score_, rand_result.best_params_))
best_XGB_parameters = rand_result.best_estimator_
#INSERT CITY NAME FOR .DAT FILE
pickle.dump(best_XGB_parameters, open("xgb_SOUTHERN.pickle.dat", 'wb')) #change pickle


# In[6]:


#test on test set
best_XGB_parameters.fit(X_train, y_train)
preds = best_XGB_parameters.predict(X_test)
f1score = f1_score(y_test, preds, average = 'micro')
#CSV append best score after test set
f1_score = []
f1_score.append(('Southern', f1score))
export_df = pd.DataFrame(f1_score)
#change csv name
export_df.to_csv("Southern_results.dat", index = False, header = False)

Code example #24
    'max_depth': [2, 4, 6, 8],  #tree depths to check
    'colsample_bytree': stats.uniform(0.3, 0.7)  #btwn .1 and 1.0    
}
rand_search = RandomizedSearchCV(model,
                                 param_distributions=param_grid,
                                 scoring='f1_micro',
                                 n_iter=3,
                                 n_jobs=-1,
                                 verbose=10,
                                 cv=kfold)
rand_result = rand_search.fit(X_train, y_train)
print("Best: %f using %s" %
      (rand_result.best_score_, rand_result.best_params_))
best_XGB_parameters = rand_result.best_estimator_
#INSERT CITY NAME FOR .DAT FILE
pickle.dump(best_XGB_parameters, open("xgb_PARK.pickle.dat",
                                      'wb'))  #change pickle

# In[6]:

#test on test set
best_XGB_parameters.fit(X_train, y_train)
preds = best_XGB_parameters.predict(X_test)
f1score = f1_score(y_test, preds, average='micro')
#CSV append best score after test set
f1_score = []
f1_score.append(('Park', f1score))
export_df = pd.DataFrame(f1_score)
#change csv name
export_df.to_csv("Park_results.dat", index=False, header=False)
Code example #25
    'max_depth': [2, 4, 6, 8],  #tree depths to check
    'colsample_bytree': stats.uniform(0.3, 0.7)  #btwn .1 and 1.0    
}
rand_search = RandomizedSearchCV(model,
                                 param_distributions=param_grid,
                                 scoring='f1_micro',
                                 n_iter=3,
                                 n_jobs=-1,
                                 verbose=10,
                                 cv=kfold)
rand_result = rand_search.fit(X_train, y_train)
print("Best: %f using %s" %
      (rand_result.best_score_, rand_result.best_params_))
best_XGB_parameters = rand_result.best_estimator_
#INSERT CITY NAME FOR .DAT FILE
pickle.dump(best_XGB_parameters, open("xgb_TARAVAL.pickle.dat",
                                      'wb'))  #change pickle

# In[12]:

#test on test set
best_XGB_parameters.fit(X_train, y_train)
preds = best_XGB_parameters.predict(X_test)
f1score = f1_score(y_test, preds, average='micro')
#CSV append best score after test set
f1_score = []
f1_score.append(('Taraval', f1score))
export_df = pd.DataFrame(f1_score)
#change csv name
export_df.to_csv("Taraval_results.dat", index=False, header=False)
Code example #26
model = XGBClassifier(nthread = n_threads) #or -1
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=random_seed)
param_grid = {'n_estimators': [120, 240, 360, 480], #random int btwn 100 and 500 - removed
              'learning_rate': stats.uniform(0.01, 0.08), #.01 + loc, range of .01+/-.08
              'max_depth': [2, 4, 6, 8], #tree depths to check
              'colsample_bytree': stats.uniform(0.3, 0.7) #btwn .1 and 1.0    
}
rand_search = RandomizedSearchCV(model, param_distributions = param_grid, scoring = 'f1_micro', n_iter = 3, n_jobs=-1, verbose = 10, cv=kfold)
rand_result = rand_search.fit(X_train, y_train)
print("Best: %f using %s" % (rand_result.best_score_, rand_result.best_params_))
best_XGB_parameters = rand_result.best_estimator_
#INSERT CITY NAME FOR .DAT FILE
pickle.dump(best_XGB_parameters, open("xgb_TENDERLOIN.pickle.dat", 'wb')) #change pickle


# In[6]:


#test on test set
best_XGB_parameters.fit(X_train, y_train)
preds = best_XGB_parameters.predict(X_test)
f1score = f1_score(y_test, preds, average = 'micro')
#CSV append best score after test set
f1_score = []
f1_score.append(('Tenderloin', f1score))
export_df = pd.DataFrame(f1_score)
#change csv name
export_df.to_csv("Tenderloin_results.dat", index = False, header = False)