Exemple #1
0
def standrand_data():
    """Standardise every column of the LRFMC table and save the result.

    Reads the Excel file configured as ``LRFMC_data_path``, subtracts each
    column's mean and divides by each column's standard deviation, prefixes
    every column name with ``Z`` and writes the table, without its index, to
    the Excel file configured as ``zscored_data_path``.
    """
    source_path = get_path('section1', 'LRFMC_data_path')
    frame = pd.read_excel(source_path)

    column_means = frame.mean(axis=0)
    column_stds = frame.std(axis=0)
    zscored = (frame - column_means) / column_stds

    # Mark the standardised columns so they are not mistaken for raw values.
    zscored.columns = ['Z' + name for name in zscored.columns]

    target_path = get_path('section1', 'zscored_data_path')
    zscored.to_excel(target_path, index=False)
Exemple #2
0
def clean_data():
    """Filter the raw CSV and write the surviving rows to an Excel file.

    Two filters are applied in order:

    1. Drop rows where ``SUM_YR_1`` or ``SUM_YR_2`` is null.
    2. Keep rows where ``SUM_YR_1`` is non-zero, or ``SUM_YR_2`` is non-zero,
       or both ``SEG_KM_SUM`` and ``avg_discount`` are zero.

    The input is the CSV configured as ``source_data_path``; the output is
    the Excel file configured as ``clean_data_path`` (index included).
    """
    source_data_path = get_path('section1', 'source_data_path')
    clean_data_path = get_path('section1', 'clean_data_path')
    data = pd.read_csv(source_data_path, encoding='utf-8')

    # Combine the boolean masks with the logical operator `&` rather than
    # arithmetic `*`: same result, but it states the intent and matches the
    # `&` / `|` used for the masks below.
    both_present = data['SUM_YR_1'].notnull() & data['SUM_YR_2'].notnull()
    data = data[both_present]

    index1 = data['SUM_YR_1'] != 0
    index2 = data['SUM_YR_2'] != 0
    index3 = (data['SEG_KM_SUM'] == 0) & (data['avg_discount'] == 0)
    data = data[index1 | index2 | index3]

    data.to_excel(clean_data_path)
def largrange_interpolation():
    """Fill every missing cell of the input sheet and save the result.

    Reads the header-less Excel file configured as ``inputfile``, replaces
    each null cell with the value returned by ``ployinterp_column(column,
    row)`` and writes the table to the Excel file configured as
    ``outputfile`` without header or index.

    ``ployinterp_column`` is defined elsewhere; it is called with the current
    state of the column (earlier fills included) and the row label of the
    missing cell.
    """
    inputfile = get_path('section1', 'inputfile')
    outputfile = get_path('section2', 'outputfile')
    data = pd.read_excel(inputfile, header=None)

    for i in data.columns:
        # Compute the null mask once per column. Filling row j cannot change
        # whether a later row is null, so the mask stays valid.
        missing = data[i].isnull()
        for j, is_missing in enumerate(missing):
            if is_missing:
                # Write through .loc on the frame itself. The chained form
                # data[i][j] = ... may assign into a temporary copy, in which
                # case the interpolated value is silently discarded.
                data.loc[j, i] = ployinterp_column(data[i], j)

    data.to_excel(outputfile, header=False, index=False)
Exemple #4
0
def explore_data():
    """Write a per-column summary (null count, max, min) of the raw CSV.

    Reads the CSV configured as ``source_data_path``, runs
    ``DataFrame.describe`` over all columns, derives the number of null
    values per column as ``len(data) - count`` and saves the three statistics
    to the Excel file configured as ``result_data_path``.
    """
    source_data_path = get_path('section1', 'source_data_path')
    result_data_path = get_path('section1', 'result_data_path')
    data = pd.read_csv(source_data_path, encoding='utf-8')

    # Transpose so that each row describes one source column.
    explore = data.describe(percentiles=[], include='all').T
    explore['null'] = len(data) - explore['count']
    explore = explore[['null', 'max', 'min']]
    # Output headers, in order: null count, maximum, minimum.
    explore.columns = [u'空值数', u'最大值', u'最小值']

    explore.to_excel(result_data_path)
Exemple #5
0
def train_kmeans():
    """Fit a 5-cluster KMeans model on the z-scored data and persist it.

    Reads the Excel file configured as ``zscored_data_path``, fits the model
    on the whole table and dumps the fitted estimator with ``joblib`` to the
    path configured as ``kmeans_file``.  After fitting, the cluster centres
    and per-row labels are available on the estimator as
    ``cluster_centers_`` and ``labels_``.
    """
    zscored_data_path = get_path('section1', 'zscored_data_path')
    kmeans_file = get_path('section2', 'kmeans_file')
    data = pd.read_excel(zscored_data_path)

    # n_jobs is intentionally not passed: scikit-learn deprecated it for
    # KMeans in 0.23 and removed it in 1.0, where it raises TypeError. It
    # only controlled parallelism, never the clustering result.
    kmodel = KMeans(n_clusters=5)
    kmodel.fit(data)

    joblib.dump(kmodel, kmeans_file)
Exemple #6
0
def lm_model():
    """Show the confusion matrix of the saved network on the test split.

    Loads the model stored at the path configured as ``net_file``, predicts
    a class for every test row from its first three columns and passes the
    fourth column (true labels) together with the predictions to
    ``cm_plot``, whose result is displayed with ``.show()``.
    """
    # NOTE(review): train_lm_classification saves its network under
    # ('section1', 'lm_net_file') while this loads ('section2', 'net_file');
    # confirm both configuration keys resolve to the same file.
    net_file = get_path('section2', 'net_file')

    # Call get_data() once: every call re-reads the spreadsheet, reshuffles
    # it and prints both splits, and the training split is not needed here.
    _, test = get_data()

    net = load_model(net_file)
    # Flatten the predictions to one label per test row.
    predict_result = net.predict_classes(test[:, :3]).reshape(len(test))
    cm_plot(test[:, 3], predict_result).show()
Exemple #7
0
def get_data():
    """Load the modelling table, shuffle it and split it 80/20.

    Reads the Excel file configured as ``model_data_path``, converts it to a
    2-D array, shuffles it with ``shuffle`` and splits the rows into a
    training part (first 80 %) and a test part (remaining 20 %).  Both parts
    are printed before being returned.

    Returns:
        tuple: ``(train_data, test_data)``, two 2-D arrays holding all
        columns of the table.
    """
    model_data_path = get_path('section1', 'model_data_path')
    model_data = pd.read_excel(model_data_path)

    # DataFrame.as_matrix() was removed in pandas 1.0; .values is the
    # equivalent available in every version. The copy guarantees a writable
    # array that the in-place shuffle below can modify.
    model_matrix = model_data.values.copy()

    # NOTE(review): the origin of `shuffle` is not visible here. If it is
    # `random.shuffle`, be aware that it corrupts 2-D numpy arrays (rows get
    # duplicated); `numpy.random.shuffle` is the safe choice. Confirm the
    # import.
    shuffle(model_matrix)

    p = 0.8  # ratio of training
    split_at = int(len(model_matrix) * p)
    train_data = model_matrix[:split_at, :]
    test_data = model_matrix[split_at:, :]
    print(train_data, test_data)
    return train_data, test_data
Exemple #8
0
def train_lm_classification():
    """Train a small feed-forward binary classifier and save it.

    The network has one hidden layer (3 inputs -> 10 units, ReLU) and one
    output unit with a sigmoid activation.  It is compiled with binary
    cross-entropy loss and the Adam optimiser, trained on the training split
    returned by ``get_data()`` (first three columns as inputs, fourth column
    as target) for 1000 epochs with a batch size of 1, and saved to the path
    configured as ``lm_net_file``.
    """
    model_path = get_path('section1', 'lm_net_file')

    # Architecture: input -> hidden (ReLU) -> output (sigmoid).
    net = Sequential()
    net.add(Dense(input_dim=3, output_dim=10))
    net.add(Activation('relu'))
    net.add(Dense(input_dim=10, output_dim=1))
    net.add(Activation('sigmoid'))
    net.compile(loss='binary_crossentropy', optimizer='adam')

    training_rows, _ = get_data()
    features = training_rows[:, :3]
    targets = training_rows[:, 3]

    net.fit(features, targets, nb_epoch=1000, batch_size=1)
    net.save(model_path)
Exemple #9
0
def train_cart_classification():
    """Fit a default decision-tree classifier and persist it with joblib.

    Uses the training split returned by ``get_data()``: the first three
    columns are the inputs and the fourth column is the target.  The fitted
    tree is dumped to the path configured as ``tree_file``.
    """
    model_path = get_path('section1', 'tree_file')
    classifier = DecisionTreeClassifier()

    training_rows, _ = get_data()
    features, targets = training_rows[:, :3], training_rows[:, 3]

    classifier.fit(features, targets)
    joblib.dump(classifier, model_path)
Exemple #10
0
def cart_model():
    """Show the confusion matrix of the saved decision tree.

    Loads the tree stored at the path configured as ``tree_file``, predicts
    a class for every row of the training split from its first three columns
    and passes the fourth column (true labels) together with the predictions
    to ``cm_plot``, whose result is displayed with ``.show()``.
    """
    # NOTE(review): train_cart_classification dumps its tree under
    # ('section1', 'tree_file') while this loads ('section2', 'tree_file');
    # confirm both configuration keys resolve to the same file.
    tree_file = get_path('section2', 'tree_file')

    # Call get_data() once: every call re-reads the spreadsheet, reshuffles
    # it and prints both splits, and the test split is not used here.
    # NOTE(review): because get_data() shuffles on each call, this "train"
    # slice is presumably not the exact set of rows the tree was fitted on.
    train, _ = get_data()

    cart = joblib.load(tree_file)
    cm_plot(train[:, 3], cart.predict(train[:, :3])).show()