Example #1
def dTrees_predict():
    """决策树分类预测"""
    data_set = read_data(FILE_PATH)
    data_set = filter_data(data_set)
    data_set = fit_transform(data_set)

    test_set = read_data(TEST_PATH)
    test_set = filter_data(test_set)
    test_set = fit_transform(test_set)

    column_x = get_column_x(data_set)
    column_y = get_column_y(data_set)

    dtrees = DTrees(data_set, test_set, column_x, column_y)

    train_x, train_y = dtrees.get_train_x_y()
    test_x, test_y = dtrees.get_test_x_y()

    model = tree.DecisionTreeClassifier()
    model.fit(train_x, train_y)
    # dot_data = tree.export_graphviz(model, out_file=None,
    #                                 filled=True, rounded=True,
    #                                 special_characters=True)
    # graph = graphviz.Source(dot_data)
    #
    # graph.render('example.gv', directory='.\\', view=True)

    predicted = model.predict(test_x)
    print("决策树准确度:", accuracy_score(test_y, predicted))
Example #2
def bayes_predict():
    """贝叶斯分类预测"""
    data_set = read_data(FILE_PATH)
    data_set = filter_data(data_set)

    test_set = read_data(TEST_PATH)
    test_set = filter_data(test_set)

    column_x = get_column_x(data_set)
    column_y = get_column_y(data_set)

    bayes = Bayes(data_set, column_x, column_y)
    # column_x_value = bayes.set_test_x(Dates=23,DayOfWeek='Wednesday',PdDistrict='NORTHERN')
    # dict,result = bayes.predict(column_x_value)

    print('Starting...')
    p = bayes.predict_all(test_set)
    print(p)
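Bayes.predict_all itself is not shown here. For reference, a generic naive Bayes decision over categorical features picks, for each row, the class that maximizes the prior times the per-feature likelihoods; a minimal standalone sketch of that rule (all names below are illustrative, not the example's actual Bayes class):

import math

def naive_bayes_predict(row, priors, likelihoods):
    # row: {feature: value}; priors: {cls: P(cls)};
    # likelihoods: {cls: {feature: {value: P(value | cls)}}}
    best_cls, best_score = None, float('-inf')
    for cls, prior in priors.items():
        # Work in log space to avoid underflow; unseen values get a tiny floor.
        score = math.log(prior)
        for feature, value in row.items():
            score += math.log(likelihoods[cls][feature].get(value, 1e-9))
        if score > best_score:
            best_cls, best_score = cls, score
    return best_cls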
Example #3
    def __init__(self, train_data_path, output_test_path, max_iter=50, max_time=10, C=9, tolerance=0.0001, kernel=SMO.linear_kernel):
        self.data = read_data(train_data_path)
        self.output_test_data = read_data(output_test_path)

        # TODO change to submit format
        self.training_data, self.testing_data = split_data(self.data)
        self.train_X, self.train_Y = self.training_data[:, :-1], np.squeeze(self.training_data[:, -1:])
        self.test_X, self.test_Y = self.testing_data[:, :-1], np.squeeze(self.testing_data[:, -1:])

        # print(self.train_X.shape, self.train_Y.shape)

        # self.alphas = np.random.randn(len(self.train_X))
        self.alphas = np.zeros(len(self.train_X))
        self.b = 0.0
        self.m = len(self.train_X)

        self.max_iter = max_iter
        self.max_time = max_time
        self.kernel = kernel
        self.C = C
        self.tolerance = tolerance
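The fields set up above feed the usual SVM decision function f(x) = sum_i alpha_i * y_i * K(x_i, x) + b. SMO.linear_kernel itself is not shown, so here is a hedged sketch of a linear kernel and that decision function, reusing the attribute names from the constructor (the function bodies are assumptions):

import numpy as np

def linear_kernel(x1, x2):
    # Plain dot product; a typical choice for a linear SVM kernel (assumption).
    return np.dot(x1, x2)

def decision_function(smo, x):
    # f(x) = sum_i alpha_i * y_i * K(x_i, x) + b, using the fields set in __init__.
    return sum(a * y * smo.kernel(xi, x)
               for a, y, xi in zip(smo.alphas, smo.train_Y, smo.train_X)) + smo.b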
Example #4
def neual_network_predict():
    """神经网络分类预测"""
    data_set = read_data(FILE_PATH)
    data_set = filter_data(data_set)
    data_set = fit_transform(data_set)

    test_set = read_data(TEST_PATH)
    test_set = filter_data(test_set)
    test_set = fit_transform(test_set)

    train_x = get_data_set_x(data_set)
    train_y = get_data_set_y(data_set)
    test_x = get_data_set_x(test_set)
    test_y = get_data_set_y(test_set)

    labels_train = fit_bin_transform(train_y)
    print(labels_train)
    network = NeuralNetwork([3, 50, len(labels_train[0])])
    network.fit(train_x, labels_train, epochs=3000)
    a, b = network.predict_all(test_x, test_y)
    print(a, '\n', b)
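fit_bin_transform presumably one-hot encodes the class labels, since the network's output layer is sized with len(labels_train[0]). A minimal sketch of such a helper, assuming sklearn's LabelBinarizer (the body is an assumption, not the example's actual code):

from sklearn.preprocessing import LabelBinarizer

def fit_bin_transform(labels):
    # Hypothetical helper: one row of 0/1 indicators per sample, one column per
    # class, so each row's length matches the size of the output layer above.
    return LabelBinarizer().fit_transform(labels)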
Example #5
from __future__ import print_function

import collections
import math
import os
import random

import numpy as np
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf
from data_process import maybe_download, read_data, build_dataset, vocabulary_size

from tensorflow.contrib.tensorboard.plugins import projector

filename = maybe_download(31344016)
vocabulary = read_data(filename)
data, count, dictionary, reverse_dictionary = build_dataset(
    vocabulary, vocabulary_size)
print('Data size', len(vocabulary))
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])

data_index = 0
# Step 3: Function to generate a training batch for the skip-gram model.
def generate_batch(batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window

    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
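    # The example is cut off here. Roughly, the rest of generate_batch (as in the
    # standard TensorFlow word2vec tutorial this snippet appears to follow) slides a
    # window of size 2*skip_window+1 over `data`, emitting (center word, context word)
    # pairs. A sketch of that continuation, not necessarily the original byte for byte:
    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    buffer = collections.deque(maxlen=span)
    if data_index + span > len(data):
        data_index = 0
    buffer.extend(data[data_index:data_index + span])
    data_index += span
    for i in range(batch_size // num_skips):
        context_words = [w for w in range(span) if w != skip_window]
        words_to_use = random.sample(context_words, num_skips)
        for j, context_word in enumerate(words_to_use):
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[context_word]
        if data_index == len(data):
            buffer.extend(data[0:span])
            data_index = span
        else:
            buffer.append(data[data_index])
            data_index += 1
    # Back up a little so words at the end of the batch are not skipped on the next call.
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels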
Example #6
import data_process
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
import sys
from pickle import dump
from pickle import load
"""This module uses methods in data_process to process the dataset and trains the Naive Bayes classifier:
1. Tf-idf unigram model: using filtered unigram data
2. Tf-idf bigram model: using filtered bigram data
3. Tf-idf bigram pmi model: based on 2, only keep the bigrams with positive mutual information as feature set
"""
# split the data into training and test sets
targets, raw_docs = data_process.read_data("spam.csv")
docs = data_process.data_filter(raw_docs)
docs_train, docs_test, cate_train, cate_test = train_test_split(
    docs, targets, test_size=0.20, random_state=12)
# training and test sets for bigram model
docs_bigram_train = [data_process.get_bigram(doc) for doc in docs_train]
docs_bigram_test = [data_process.get_bigram(doc) for doc in docs_test]
# using bigram with positive mutual information as training set
fre_uni = data_process.frequency(docs_train)
fre_big = data_process.frequency(docs_bigram_train)
docs_bigram_train_pmi = [
    data_process.filter_pmi(doc, fre_uni, fre_big) for doc in docs_bigram_train
]


def dummy(doc):
    """Dummy tokenizer/preprocessor for CountVectorizer: the documents are
    already tokenized, so it simply returns them unchanged."""
    return doc
Example #7
import numpy as np
import phenome_classify as pc
import sub_string as sb
import data_process
import pre_process
import calc_target as ct

# Read the data from the files column by column
root_mono = "labels/mono"
root_full = "labels/full"
file_list_mono = sb.traverse_dir(root_mono)
file_list_full = sb.traverse_dir(root_full)
sb.read_files(file_list_full)
sb.read_files_time(file_list_full)
sb.read_mono(file_list_mono)

##### From the column-wise data, read the required row info (each note keeps its syllable nucleus)
data_process.read_data()

#####
dir = "res/note_lines.npy"
dir_time = "res/note_time.npy"
dir_mono = "res/note_mono_lines.npy"

##### Extract the required features from the per-syllable row info and save them to all_train.npy
pre_process.get_train_data(dir, dir_time)

##### Compute targets from all_train.npy using the score times and the mono times, keeping only rows whose target lies in [-15, 14]
##### The final shapes are those of target and data; size the model's input neurons accordingly
ct.get_targets("res/note_time.npy", "res/note_mono_lines.npy",
               "res/all_train.npy")
Example #8
        out = model(x)
        loss = F.binary_cross_entropy(out, target.float())
        losses.append(loss.item())
        targets += list(target.numpy())
        out = out.view(-1).detach().numpy()
        outs += list(np.int64(out > 0.5))
    acc = accuracy_score(targets, outs)
    return acc, sum(losses) / len(losses)


if __name__ == "__main__":
    args = set_args()
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    # Load the data
    data_path = './data/adult.data'
    data = read_data(data_path)
    train_data, test_data, deep_columns_idx, embedding_columns_dict = feature_engine(
        data)
    data_wide = train_data[0]
    train_data = (torch.from_numpy(train_data[0].values),
                  torch.from_numpy(train_data[1].values),
                  torch.from_numpy(train_data[2].values))
    train_data = MyDataSet(train_data)

    test_data = (torch.from_numpy(test_data[0].values),
                 torch.from_numpy(test_data[1].values),
                 torch.from_numpy(test_data[2].values))
    test_data = MyDataSet(test_data)
    trainloader = DataLoader(train_data,
                             batch_size=args.batch_size,
                             shuffle=True)
Example #9
"""
# -*- coding: utf-8 -*-
# @File    : predict.py
# @Time    : 2020/12/28 4:13 PM
# @Author  : xiaolu
# @Email   : [email protected]
# @Software: PyCharm
"""
import torch
from model import WideDeep
from data_process import read_data, feature_engine
from config import set_args

args = set_args()
path = './data/adult.data'
data = read_data(path)
train_data, test_data, deep_columns_idx, embedding_columns_dict = feature_engine(data)
data_wide = train_data[0]

# Input format for prediction; here a single sample is predicted
t = (torch.from_numpy(train_data[0].values[0].reshape(-1, train_data[0].values.shape[1])),
     torch.from_numpy(train_data[1].values[0].reshape(-1, train_data[1].values.shape[1])))

# parameters setting
deep_model_params = {
    'deep_columns_idx': deep_columns_idx,
    'embedding_columns_dict': embedding_columns_dict,
    'hidden_size_list': args.hidden_size_list,
    'dropouts': args.dropouts,
    'deep_output_dim': args.deep_out_dim}
wide_model_params = {
Example #10
    print "x size: ", len(x)
    print "y size: ", len(y)
    plt.scatter(x, y, c=color)
    plt.xlabel(xname)
    plt.ylabel(yname)
    # add legend
    classes = ['0', '1']
    class_colours = ['r', 'g']
    recs = []
    for i in range(len(class_colours)):
        recs.append(mpatches.Rectangle((0, 0), 1, 1, fc=class_colours[i]))
    plt.legend(recs, classes, loc='upper left')
    plt.show()


train, test, features, features_non_numeric = data_process.read_data()

train, test, features, features_non_numeric = data_process.process_data(
    train, test, features, features_non_numeric)

tsize = 0.001
dtrain, dtest = cross_validation.train_test_split(train, test_size=tsize)

#importance_feat(features)

#Correlation_Matrix_plot(train)

features = ['Customers', 'Sales', 'Promo']
data = dtest[features]

Scatter_plot(data)