import pandas as pd


def main():
    # In our main method we simply load and scale the data, one-hot
    # encode the labels, and then initialise our deep neural net.
    # Below we test our network with both activation functions,
    # 'ReLU' and 'Tanh'.
    train_data = pd.read_csv('mnist_train.csv')
    test_data = pd.read_csv('mnist_test.csv')

    train_labels = train_data.iloc[:, 0]
    test_labels = test_data.iloc[:, 0]
    train_labels = one_hot(train_labels, 10)

    # Drop the label column and scale pixel values into [0, 1].
    train_data = train_data.iloc[:, 1:] / 255.0
    test_data = test_data.iloc[:, 1:] / 255.0
    train_data = train_data.to_numpy()
    test_data = test_data.to_numpy()

    dnn_relu = DeepNeuralNet(0.001, 784, 2, 28, 10, 5, "relu", "MEGATRON!")
    dnn_relu.fit(train_data, train_labels)
    predictions = []
    for item in test_data:
        p = list(dnn_relu.predict(item))
        predictions.append(p.index(max(p)))  # argmax of the output layer
    dnn_relu.accuracy_scores(predictions, test_labels)

    print("\n\n=============\n\n")

    dnn_tanh = DeepNeuralNet(0.001, 784, 2, 28, 10, 5, "tanh", "MEGATRON!")
    dnn_tanh.fit(train_data, train_labels)
    predictions = []
    for item in test_data:
        p = list(dnn_tanh.predict(item))
        predictions.append(p.index(max(p)))
    dnn_tanh.accuracy_scores(predictions, test_labels)
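This snippet assumes a one_hot(labels, num_classes) helper from the surrounding project; a minimal NumPy sketch of such a helper (the project's actual implementation may differ) is:

import numpy as np

def one_hot(labels, num_classes):
    # Sketch only: index an identity matrix with the integer labels,
    # producing one row of length num_classes per label.
    return np.eye(num_classes)[np.asarray(labels, dtype=int)]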
def one_hot_encode(messages, dimension):
    data = []
    for msg in messages:
        # Encode each message against the given vocabulary dimension.
        temp = one_hot(msg, dimension)
        data.append(temp)
    return data
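If the one_hot called here is Keras's text helper (keras.preprocessing.text.one_hot), which hashes each word of a string to an integer in [1, n), then one_hot_encode turns a list of sentences into lists of word indices. A hypothetical usage under that assumption:

from keras.preprocessing.text import one_hot

messages = ["the cat sat", "the dog barked"]
encoded = one_hot_encode(messages, dimension=50)
print(encoded)  # e.g. [[12, 7, 33], [12, 41, 9]]; indices depend on the hash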
def getXY(input, algo, model, test=0):
    """
    input: the preprocessed corpus
    algo:  name of the feature-weighting scheme to use
    model: name of the model to use
    test = 0: record the words appearing in the files and build the
              vocabulary (training set)
    test = 1: do not build a vocabulary; reuse the one already built
              (test set)
    """
    global package
    corpus = preprocess(input, package, test)
    labelset = package["labelset"]
    voca = package["voca"]
    level = 2
    mod = 0
    if algo == "tf_idf":
        weights = tf_idf(corpus, test, package)
        mod = 1
    elif algo == "tf_dc":
        weights = tf_dc(corpus, test, package)
    elif algo == "tf_bdc":
        weights = tf_bdc(corpus, test, package)
    elif algo == "iqf_qf_icf":
        weights = iqf_qf_icf(corpus, test, package)
    elif algo == "tf_eccd":
        weights = tf_eccd(corpus, test, package)
    elif algo == "tf_ig":
        weights = tf_ig(corpus, test, package)
    elif algo == "tf_rf":
        weights = tf_rf(corpus, test, package)
        level = 3
    elif algo == "tf_chi":
        weights = tf_chi(corpus, test, package)
        level = 3

    X = []
    Y = []
    count = 0
    vocalen = len(voca)
    for doc in corpus:
        if count % 100 == 0:
            print(str(count) + "/" + str(len(corpus)))
        count += 1

        # Process the label: append it to the fixed labelset so its one-hot
        # column stays consistent, take the argmax, then trim it back off.
        labelset.append(doc["label"])
        Y.append(int(np.argmax(one_hot(labelset)[-1])))
        labelset = labelset[:-1]

        # Process the words: prepend the vocabulary to pin each word's
        # column position, then slice off the vocabulary rows.
        temvocalist = voca + doc["split_sentence"]
        tem_one_hot = one_hot(temvocalist)[vocalen:]
        for word in range(len(tem_one_hot)):
            temlabel = doc["label"]
            temword = doc["split_sentence"][word]
            temdoc = doc["document"]
            if level == 2:
                if mod == 0:
                    tem_one_hot[word] *= weights[temlabel][temword]
                else:
                    tem_one_hot[word] *= weights[temdoc][temword]
            else:
                tem_one_hot[word] *= weights[temlabel][temdoc][temword]
        # Collapse the word rows, keeping the maximum weight per column.
        tem_one_hot = np.max(tem_one_hot, axis=0)
        if model.lower() == "knn":
            tem_one_hot = preprocessing.normalize(
                np.array(tem_one_hot).reshape(1, -1), norm='l2')
        X.append(tem_one_hot)
    return np.squeeze(X), Y
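getXY relies on a positional one_hot(items) variant: one indicator row per item, with columns assigned in order of first occurrence, so prepending the vocabulary pins every column. A minimal self-contained sketch under that assumption (the project's own one_hot may differ):

import numpy as np

def one_hot(items):
    # Assign column ids in first-seen order, then emit one indicator
    # row per input item.
    index = {}
    for item in items:
        index.setdefault(item, len(index))
    mat = np.zeros((len(items), len(index)))
    for row, item in enumerate(items):
        mat[row, index[item]] = 1.0
    return mat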
# Neural networks
# First convert y to one-hot form, i.e. (len(y),) with
# np.unique(y).shape[0] classes ==> (30804, 3)
def expand(y, kind):
    res = []
    for y_i in y:
        y_array = np.zeros(kind)
        # Labels are in {-1, 0, 1}, so shift by +1 to index columns 0..2.
        y_array[y_i + 1] = 1
        res.append(y_array)
    return np.array(res)

y_train_softmax = expand(y_train, 3)
print(y_train_softmax.shape)

# theta1 (25, 101), theta2 (3, 26)
final_theta = NeuralNetworks.neural_network(x_train, y_train_softmax)
print('neural network accuracy: ',
      NeuralNetworks.accuracy(final_theta, x_test, y_test))

# Naive Bayes
word_vec = one_hot(sentiment)
x_train, x_test, y_train, y_test = train_test_split(
    word_vec, y, test_size=0.25, stratify=y)
p_class_vect, p_class = NativeBayes.train_bayes(x_train, y_train)
y_pred = NativeBayes.predict_bayes(p_class_vect, p_class, x_test, y_test)
y_pred = y_pred - 1  # map predictions back to {-1, 0, 1}
print('Naive Bayes accuracy: ', NativeBayes.accuracy(y_pred, y_test))
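A quick, deterministic check of expand() on the three sentiment labels {-1, 0, 1}, showing how the +1 shift maps them onto columns 0..2:

import numpy as np

y_demo = np.array([-1, 0, 1, 1])
print(expand(y_demo, 3))
# [[1. 0. 0.]
#  [0. 1. 0.]
#  [0. 0. 1.]
#  [0. 0. 1.]]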
#!/usr/bin/python3
from keras.datasets import cifar10

from model import Sequential
from layers.pool import MaxPool
from one_hot import one_hot

(x_train, y_train), (x_test, y_test) = cifar10.load_data()
x_train, x_test = x_train / 255, x_test / 255
y_train = one_hot(y_train)

model = Sequential(x_train, y_train)
model.add_Conv(32, (3, 3))
model.add_Activation()
model.add_Pool()
model.add_Conv(32, (3, 3))
model.add_Activation()
model.add_Pool()
model.add_Conv(64, (3, 3))
model.add_Activation()
model.add_Pool()
model.add_Dense(512)
model.out(10)
model.compile(1, 32)
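cifar10.load_data() returns labels with shape (N, 1), so the imported one_hot presumably flattens before encoding; a minimal sketch under that assumption (the real one_hot module may differ):

import numpy as np

def one_hot(labels, depth=10):
    # Sketch only: flatten (N, 1) -> (N,), then index an identity
    # matrix to get an (N, depth) one-hot matrix.
    flat = np.asarray(labels, dtype=int).reshape(-1)
    return np.eye(depth)[flat]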
def getXY(input, algo, model, test=0, k=25):
    """
    input: the preprocessed corpus
    algo:  name of the feature-weighting scheme to use
    model: name of the model to use
    test = 0: record the words appearing in the files and build the
              vocabulary (training set)
    test = 1: do not build a vocabulary; reuse the one already built
              (test set)
    """
    global package
    corpus = preprocess(input, package, test, k)
    labelset = package["labelset"]  # package is filled in by preprocess
    voca = package["voca"]
    level = 2
    mod = 0
    if algo == "tf_idf":
        weights = tf_idf(corpus, test, package)
        mod = 1
    elif algo == "tf_dc":
        weights = tf_dc(corpus, test, package)
    elif algo == "tf_bdc":
        weights = tf_bdc(corpus, test, package)
    elif algo == "iqf_qf_icf":
        weights = iqf_qf_icf(corpus, test, package)
    elif algo == "tf_eccd":
        weights = tf_eccd(corpus, test, package)
    elif algo == "tf_ig":
        weights = tf_ig(corpus, test, package)
    elif algo == "tf_rf":
        weights = tf_rf(corpus, test, package)
        level = 3
    elif algo == "tf_chi":
        weights = tf_chi(corpus, test, package)
        level = 3
    elif algo == "tf_mrf":
        weights = tf_mrf(corpus, test, package)
        level = 3
    elif algo == "tf_nrf":
        weights = tf_nrf(corpus, test, package)
        level = 3
    elif algo == "tf_vc":
        weights = tf_vc(corpus, test, package)

    X = []
    Y = []  # label set
    count = 0
    vocalen = len(voca)
    for doc in corpus:
        if count % 1000 == 0:
            print(str(count) + "/" + str(len(corpus)))
        count += 1

        # Process the label: append it to the fixed labelset so its one-hot
        # column stays consistent, take np.argmax (the index of the maximum),
        # then reset labelset by trimming the appended label back off.
        labelset.append(doc["label"])
        Y.append(int(np.argmax(one_hot(labelset)[-1])))
        labelset = labelset[:-1]

        # Process the words: as with the labels, prepend the vocabulary to
        # pin each word's column position, then slice the vocabulary rows off.
        temvocalist = list(voca) + list(doc["split_sentence"])
        tem_one_hot = one_hot(temvocalist)[vocalen:]
        for word in range(tem_one_hot.shape[0]):
            temlabel = doc["label"]        # e.g. "earn"
            temword = doc["split_sentence"][word]
            temdoc = doc["document"]       # e.g. "earn638"
            # Multiply the indicator row by term frequency * weight.
            if level == 2:
                if mod == 0:  # supervised weighting
                    tem_one_hot[word] *= weights[temlabel][temword]
                else:         # unsupervised weighting
                    tem_one_hot[word] *= weights[temdoc][temword]
            else:
                tem_one_hot[word] *= weights[temlabel][temdoc][temword]
        try:
            # Collapse the word rows, keeping only the maximum per column.
            tem_one_hot = np.max(tem_one_hot, axis=0)
        except ValueError:  # empty array: document contributed no rows
            pass
        if model.lower() == "knn":
            # L2-normalise so kNN distances are comparable across documents.
            tem_one_hot = preprocessing.normalize(
                np.array(tem_one_hot).reshape(1, -1), norm='l2')
        # np.squeeze compresses dimensions, e.g. the (1, n) matrix produced
        # above back down to a one-dimensional vector.
        X.append(np.squeeze(np.asarray(tem_one_hot)))
    return X, Y
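The heart of getXY is the weight-then-max-pool step: each word contributes a weighted indicator row, and the rows collapse into one document vector by taking the per-column maximum. A self-contained sketch of just that step, with a made-up vocabulary and hypothetical weights:

import numpy as np

vocab = ["price", "profit", "loss", "market"]
doc_words = ["profit", "market", "profit"]
weights = {"profit": 2.0, "market": 0.5}  # hypothetical tf*idf-style weights

rows = np.zeros((len(doc_words), len(vocab)))
for i, w in enumerate(doc_words):
    rows[i, vocab.index(w)] = weights[w]  # one weighted indicator row per word

doc_vector = rows.max(axis=0)  # keep the max weight per vocabulary column
print(doc_vector)              # [0.  2.  0.  0.5]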
import numpy

import one_hot

labels = numpy.array([0, 1, 1, 2, 2, 3])
res = one_hot.one_hot(labels, 4)
print(res)
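Assuming one_hot.one_hot(labels, depth) returns a (len(labels), depth) indicator matrix, the same result is a one-liner in plain NumPy:

import numpy

labels = numpy.array([0, 1, 1, 2, 2, 3])
print(numpy.eye(4)[labels])
# [[1. 0. 0. 0.]
#  [0. 1. 0. 0.]
#  [0. 1. 0. 0.]
#  [0. 0. 1. 0.]
#  [0. 0. 1. 0.]
#  [0. 0. 0. 1.]]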