Code Example #1
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.metrics import (accuracy_score, confusion_matrix,
                             mean_absolute_error, mean_squared_error,
                             precision_recall_fscore_support)
from sklearn.model_selection import train_test_split

# model_generator, plot_curve, and plot_conf_matrix are project-local helpers.


def train(data, label, token, bin_size=None):
    if token == 'LinR':
        # Regression branch: hold out the last 60 samples as the test set.
        train_data = data[:-60]
        train_label = label[:-60]
        test_data = data[-60:]
        test_label = label[-60:]
        train_label = np.expand_dims(train_label, axis=1)
        test_label = np.expand_dims(test_label, axis=1)
        model = model_generator(token)
        model.fit(train_data, train_label)
        modeldir = os.path.join('models', 'LinR')
        if not os.path.exists(modeldir):
            os.makedirs(modeldir)
        with open(os.path.join(modeldir, 'model.pkl'), 'wb') as f:
            pickle.dump(model, f)
        predict = model.predict(test_data)
        figdir = os.path.join('fig', token)
        if not os.path.exists(figdir):
            os.makedirs(figdir)
        plot_curve(predict, test_label, token, os.path.join(figdir, 'curve.pdf'))
        MSE = mean_squared_error(test_label, predict)
        MAE = mean_absolute_error(test_label, predict)
        return MSE, MAE
    else:
        # Classification branch: discretize the continuous labels into
        # bin_size - 1 equal-width intervals (bin_size edges) spanning the
        # training-label range.
        train_data, test_data, train_label, test_label = train_test_split(
            data, label, test_size=0.2)
        min_val = min(train_label)
        max_val = max(train_label)
        bins = [min_val + idx * (max_val - min_val) / (bin_size - 1)
                for idx in range(bin_size)]
        labels = list(range(bin_size - 1))
        train_label = pd.cut(train_label, bins=bins, labels=labels)
        test_label = pd.cut(test_label, bins=bins, labels=labels)
        model = model_generator(token)
        # pd.cut marks values outside the bin edges (the training minimum
        # itself, and test values beyond the training range) as NaN; the
        # x != x test is true only for NaN, so map those entries to bin 0.
        for i in range(len(train_label)):
            if train_label[i] != train_label[i]:
                train_label[i] = 0
        for i in range(len(test_label)):
            if test_label[i] != test_label[i]:
                test_label[i] = 0
        model.fit(train_data, train_label)
        modeldir = os.path.join('models', token)
        if not os.path.exists(modeldir):
            os.makedirs(modeldir)
        with open(os.path.join(modeldir, 'bins-{}.pkl'.format(bin_size)), 'wb') as f:
            pickle.dump(model, f)
        predict = model.predict(test_data)
        conf_matrix = confusion_matrix(test_label, predict, labels=labels)
        figdir = os.path.join('fig', token)
        if not os.path.exists(figdir):
            os.makedirs(figdir)
        figpath = os.path.join(figdir, 'bins-{}.pdf'.format(bin_size))
        plot_conf_matrix(conf_matrix, labels, True, token, figpath)
        accuracy = accuracy_score(test_label, predict)
        precision, recall, f, _ = precision_recall_fscore_support(
            test_label, predict, average='weighted')
        return accuracy, precision, recall, f
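A rough usage sketch, assuming data and label are aligned NumPy arrays and that model_generator understands the token passed in; the 'RF' token and the synthetic arrays here are invented for illustration:

import numpy as np

data = np.random.rand(500, 4)        # 500 samples, 4 features (synthetic)
label = np.random.rand(500) * 100    # continuous target (synthetic)

mse, mae = train(data, label, 'LinR')                      # regression branch
acc, prec, rec, f1 = train(data, label, 'RF', bin_size=5)  # classification branch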
Code Example #2
import numpy as np
from sklearn.linear_model import LinearRegression

import plot  # project-local plotting helpers


def generate_samples(x1, x2, step=1):
    # Evenly sample x1 between roughly its 1st percentile and its maximum
    # (the 100th percentile is simply the max).
    x1_min = int(np.percentile(x1, 1))
    x1_max = int(np.percentile(x1, 100))
    sample_x1 = np.arange(x1_min, x1_max, step).reshape(-1, 1)

    # Fit a simple linear model x2 ~ x1.
    lm = LinearRegression()
    X = x1.reshape(-1, 1)
    model = lm.fit(X, x2)

    # Predict x2 at the sampled positions and plot against the raw data.
    predicted_x2 = model.predict(sample_x1)
    plot.plot_curve(sample_x1, predicted_x2, x1, x2)

    # Return the (x1, predicted x2) pairs as a two-column array.
    return np.hstack([sample_x1, predicted_x2.reshape(-1, 1)])
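A minimal usage sketch with synthetic data, assuming x1 and x2 are equal-length 1-D NumPy arrays:

import numpy as np

x1 = np.random.uniform(0, 100, size=200)               # synthetic feature
x2 = 3.0 * x1 + np.random.normal(scale=5.0, size=200)  # noisy linear target

samples = generate_samples(x1, x2, step=2)
print(samples.shape)  # (n, 2): sampled x1 values and their predicted x2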
Code Example #3
from google_trends import init_google_trends, searches_for
from plot import plot_curve

if __name__ == "__main__":
    pytrend = init_google_trends()

    # Hygiene products: disinfectant, soap, toilet paper, face mask.
    keywords = ['desinfektionsmittel', 'seife', 'klopapier', 'mundschutz']
    data = searches_for(pytrend, keywords)
    plot_curve(data, keywords)

    # General pandemic topics: corona, Bundesliga, curfew, emergency aid, Italy.
    keywords = [
        'corona', 'bundesliga', 'ausgangssperre', 'soforthilfe', 'italien'
    ]
    data = searches_for(pytrend, keywords)
    plot_curve(data, keywords)

    # Childcare: childcare, school, daycare, single parent, emergency care.
    keywords = [
        'kinderbetreuung', 'schule', 'kita', 'alleinerziehend', 'notbetreuung'
    ]
    data = searches_for(pytrend, keywords)
    plot_curve(data, keywords)

    # Mental health and domestic violence: depression, "my husband hits me",
    # domestic violence, pastoral counselling.
    keywords = [
        'depression', 'mein mann schlägt mich', 'häusliche gewalt', 'seelsorge'
    ]
    data = searches_for(pytrend, keywords)
    plot_curve(data, keywords)
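init_google_trends and searches_for are project-local wrappers; a plausible minimal sketch on top of the pytrends library, where the locale, geo, and timeframe parameters are assumptions:

from pytrends.request import TrendReq


def init_google_trends():
    # German-language interface, CET timezone offset (illustrative values).
    return TrendReq(hl='de-DE', tz=60)


def searches_for(pytrend, keywords):
    # Google Trends accepts at most five keywords per payload.
    pytrend.build_payload(kw_list=keywords, geo='DE', timeframe='today 3-m')
    return pytrend.interest_over_time()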
Code Example #4
import argparse

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# preprocessing, train_gbdt_classifier, train_random_forest_classifier,
# train_MLP_classifier, train_logit_classifier, plot_learning_curve,
# plot_feature_importance, and plot_curve are project-local helpers.


def main():
    parser = argparse.ArgumentParser(description='Classification task')
    parser.add_argument('--normalize', action='store_true')
    parser.add_argument('--plot-pca', action='store_true')
    parser.add_argument('--plot-corr', action='store_true')
    parser.add_argument('--plot-feat-dist', action='store_true')
    parser.add_argument('--plot-curve', action='store_true')
    parser.add_argument('--classifier')
    parser.add_argument('datafile')

    args = parser.parse_args()

    data, X, y = preprocessing(
        args.datafile,
        #feature_discret=True,
        plot_feat_dist=args.plot_feat_dist,
        plot_corr=args.plot_corr,
        plot_PCA=args.plot_pca)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        stratify=y,
                                                        test_size=0.4,
                                                        random_state=0)
    X_dev, X_test, y_dev, y_test = train_test_split(X_test,
                                                    y_test,
                                                    stratify=y_test,
                                                    test_size=0.5,
                                                    random_state=0)
    if args.normalize:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_dev = scaler.transform(X_dev)
        X_test = scaler.transform(X_test)
        """
        #for neural network
        scaler = MinMaxScaler((-1, 1)) 
        X_train = scaler.fit_transform(X_train)
        X_dev = scaler.transform(X_dev)
        X_test = scaler.transform(X_test)
        """

    print("DataSet summary:")
    print("train X: ", X_train.shape, "y: ", y_train.shape)
    print("dev X: ", X_dev.shape, "y: ", y_dev.shape)
    print("test X:", X_test.shape, "y: ", y_test.shape)

    # Weight the positive class by its rarity in the training labels.
    positive_weight = 1 - sum(y_train) / y_train.count()
    class_weight = {1: positive_weight, 0: 1 - positive_weight}
    print("Class weights: ", class_weight)

    if args.classifier == 'gbdt':
        clf = train_gbdt_classifier(X_train, y_train)
    elif args.classifier == 'rf':
        clf = train_random_forest_classifier(class_weight, X_train, y_train)
    elif args.classifier == 'nn':
        clf = train_MLP_classifier(X_train, y_train)
    else:
        clf = train_logit_classifier(class_weight, X_train, y_train)

    title = "Learning Curve"
    cv = StratifiedShuffleSplit(n_splits=10, test_size=0.33, random_state=0)
    # Caveat: with --normalize, X_train and X_dev are NumPy arrays after
    # scaling, which pd.concat cannot stack; np.concatenate would be needed.
    X = pd.concat([X_train, X_dev])
    y = pd.concat([y_train, y_dev])
    plot_learning_curve(clf, title, X, y, cv=cv, n_jobs=2)
    plt.show()

    plot_feature_importance(clf, X_train, y_train, data.columns)

    y_train_pred = clf.predict(X_train)
    y_train_prop = clf.predict_proba(X_train)
    y_dev_pred = clf.predict(X_dev)
    y_dev_prop = clf.predict_proba(X_dev)
    y_test_pred = clf.predict(X_test)
    y_test_prop = clf.predict_proba(X_test)
    y_train_score = y_train_prop[:, 1]
    y_dev_score = y_dev_prop[:, 1]
    y_test_score = y_test_prop[:, 1]

    print('Training data report')
    print(classification_report(y_train, y_train_pred))
    print('Dev data report')
    print(classification_report(y_dev, y_dev_pred))
    #print('Test data report')
    #print(classification_report(y_test, y_test_pred))

    if args.plot_curve:
        opthd = plot_curve(y_dev, y_dev_score, min_p=None, min_r=None)
        print("optimal threshold: ", opthd)

        # Apply the tuned threshold to the test scores.
        y_test_pred = (y_test_score > opthd).astype(int)
        print('Test data report (with optimal threshold)')
        print(classification_report(y_test, y_test_pred))
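A possible invocation, assuming the script is saved as classify.py with the usual `if __name__ == "__main__": main()` guard and that data.csv exists (both names are hypothetical); --classifier accepts gbdt, rf, or nn, with logistic regression as the fallback:

python classify.py --normalize --classifier rf --plot-curve data.csv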
Code Example #5
File: corona_main.py Project: killingo7/coronatrends
from google_trends import init_google_trends, searches_for
from plot import plot_curve, make_axis_for_pytrends, make_axis_for_wiki
from wikipedia import wiki_visits

if __name__ == "__main__":
    pytrend = init_google_trends()

    # Hygiene products: disinfectant, soap, toilet paper, face mask.
    keywords = ['desinfektionsmittel', 'seife', 'klopapier', 'mundschutz']
    data = searches_for(pytrend, keywords)
    data = make_axis_for_pytrends(data, keywords)
    plot_curve(data)

    # German Wikipedia articles on suicide by medication poisoning and by
    # jumping from a height; the timestamps below use the YYYYMMDDHH format
    # of the Wikimedia pageviews API.
    keywords = [
        "Suizid_durch_Vergiftung_mit_Medikamenten",
        "Suizid_durch_Sprung_aus_der_Höhe"
    ]
    plot_data = {}
    for keyword in keywords:
        data = wiki_visits(keyword, "2019100100", "2020042800")
        if data:
            data = make_axis_for_wiki(data, keyword)
            plot_data.update(data)

    plot_curve(plot_data)
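wiki_visits is project-local; it presumably calls the Wikimedia REST pageviews API, whose timestamps use the YYYYMMDDHH format seen above. A minimal sketch of such a helper, where the German-Wikipedia project and the return shape are assumptions:

import requests

URL = ("https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/"
       "de.wikipedia/all-access/all-agents/{article}/daily/{start}/{end}")


def wiki_visits(article, start, end):
    # Returns {timestamp: views} for the article, or None on failure.
    resp = requests.get(URL.format(article=article, start=start, end=end),
                        headers={"User-Agent": "coronatrends-example"})
    if resp.status_code != 200:
        return None
    return {item["timestamp"]: item["views"] for item in resp.json()["items"]}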
Code Example #6
import itertools

import matplotlib
# Use the non-interactive Agg backend so figures are saved without a display.
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from pylab import rcParams
from sklearn.metrics import confusion_matrix

import plot  # project-local plotting helpers

# config, model, test_data, test_label, and train_loss_list come from
# earlier cells of the original notebook this fragment was exported from.
epoch_list = [i for i in range(config.epoch)]
class_names = [i for i in range(3)]

pred_label = model.predict(test_data)
print(pred_label)
# Optional denormalization back to the original label range:
# for i in pred_label:
#     i = i * (max(new) - min(new)) + min(new)

# Flatten the nested prediction/label lists before computing the matrix.
pred_label = list(itertools.chain.from_iterable(pred_label))
test_label = list(itertools.chain.from_iterable(test_label))
print(pred_label)
print(test_label)

cnf_matrix = confusion_matrix(test_label, pred_label)
# np.set_printoptions(precision=2)
plot.plot_confusion_matrix(cnf_matrix, class_names, False, "Confusion matrix",
                           "/Users/nicole/Desktop/python/finalproject")
# Note: this draws the training-loss curve (accuracy=False) despite the title.
plot.plot_curve(epoch_list, train_loss_list, "Accuracy curve",
                "/Users/wei-jer-chang/Desktop/final project",
                training=True, accuracy=False)
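Because the Agg backend is selected before pyplot is imported, nothing is ever drawn on screen; a helper like the project-local plot.plot_curve presumably ends in a savefig call along these lines (a minimal sketch, not the project's actual implementation):

import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt


def save_line_plot(xs, ys, title, path):
    # Render the curve and write it straight to disk.
    fig, ax = plt.subplots()
    ax.plot(xs, ys)
    ax.set_title(title)
    fig.savefig(path)
    plt.close(fig)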
   

Code Example #7
File: die_new.py Project: jxmmy7777/Resume
# pred_label, test_label, epoch_list, train_loss_list, val_loss_list, and
# config come from earlier in the script; classification and plot are
# project-local modules.
pred_list = classification.classification(pred_label)
test_list = classification.classification(test_label)
# print(pred_list)
# print(test_list)

from sklearn.metrics import confusion_matrix
cnf_matrix = confusion_matrix(test_list, pred_list)
class_names = [
    "0-10", "10-20", "20-30", "30-40", "40-50", "50-60", "60-70", "70-80",
    "80-90", "90-100"
]
plot.plot_confusion_matrix(cnf_matrix, class_names, False, "Confusion matrix",
                           "")
plot.plot_curve(epoch_list,
                train_loss_list,
                "Clear Accuracy curve",
                "",
                training=True,
                accuracy=False)

# Write the per-epoch loss history to a CSV file.
all_value = [[i for i in range(1, config.epoch + 1)], train_loss_list,
             val_loss_list]
title = "epoch,train_loss_list,val_loss_list"
# zip(*...) transposes the column lists into per-epoch rows; list() is
# required because zip returns an iterator in Python 3.
row_value = list(zip(*all_value))
data = title + "\n"
for row in row_value:
    # Write all three columns (the original wrote only the first two,
    # leaving val_loss_list out of the file despite the header).
    data += "{},{},{}\n".format(row[0], row[1], row[2])

with open("clear_data.csv", "w") as f1:
    f1.write(data)