def main():
    global solution
    # Create a Data instance from the input file.
    data = Data('src_data/data.txt')
    # Call the Data object's solution_judge method to decide whether
    # the system of congruences is solvable.
    if data.error_flag:
        if data.solution_judge():
            print("The system of congruences is solvable")
            # If it is solvable, there are two ways to solve it.
            # First call crt_judge to check whether the Chinese Remainder
            # Theorem (CRT) applies directly.
            if data.crt_judge():
                print("The system can be solved with the Chinese Remainder Theorem")
                sol = "Solution: x=%d(mod %d)" % Crt(data).crt_compute()
                print(sol)
                # Append the answer to the data file.
                with open("src_data/data.txt", "a") as f:
                    f.write("\n" + sol)
                return sol
            # If crt_judge returns 0, fall back to the general method (Nst).
            else:
                print("The Chinese Remainder Theorem does not apply directly; solving with the general method")
                sol = "Solution: x=%d(mod %d)" % Nst(data).nst_compute()
                print(sol)
                # Append the answer to the data file.
                with open("src_data/data.txt", "a") as f:
                    f.write("\n" + sol)
                return sol
        else:
            solution = "The system of congruences has no solution"
            print("The system of congruences has no solution")
            return solution
    else:
        return data.solution

def main():
    equation = []
    # `dir` is expected to be a module-level working-directory path
    # (it shadows the built-in dir()).
    if not os.path.exists(dir):
        os.mkdir(dir)
        os.chdir(dir)
        num = int(input("How many congruence equations are there: "))
        with open('data.txt', 'w') as f:
            for i in range(num):
                equation.append(input("Enter equation #%d: " % i) + "\n")
                f.write(equation[i])
    else:
        print("Directory already exists")
        os.chdir(dir)
    data = Data('./data.txt')
    if data.solution_judge():
        print("The system of congruences is solvable")
        if data.crt_judge():
            print("The system can be solved with the Chinese Remainder Theorem")
            print("Solution: x=%d(mod %d)" % Crt(data).crt_compute())
        else:
            print("The Chinese Remainder Theorem does not apply directly; solving with the general method")
            print("Solution: x=%d(mod %d)" % Nst(data).nst_compute())
    else:
        print("The system of congruences has no solution")
        return False

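# Illustrative sketch only (not the repo's Crt.crt_compute): a minimal,
# self-contained pairwise CRT combination based on the extended Euclidean
# algorithm. The helper names `ext_gcd` and `crt_pair` are hypothetical.
def ext_gcd(a, b):
    """Return (g, x, y) with a*x + b*y == g == gcd(a, b)."""
    if b == 0:
        return a, 1, 0
    g, x, y = ext_gcd(b, a % b)
    return g, y, x - (a // b) * y

def crt_pair(r1, m1, r2, m2):
    """Combine x ≡ r1 (mod m1) and x ≡ r2 (mod m2) for coprime m1, m2."""
    g, p, _ = ext_gcd(m1, m2)
    assert g == 1, "moduli must be pairwise coprime"
    m = m1 * m2
    # p is the inverse of m1 modulo m2, so r1 + m1*p*(r2 - r1) satisfies both congruences.
    return (r1 + m1 * p * (r2 - r1)) % m, m

# Example: x ≡ 2 (mod 3), x ≡ 3 (mod 5)  ->  x ≡ 8 (mod 15)
# print(crt_pair(2, 3, 3, 5))
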
def nst_compute(self):
    for i in range(self.num_equation):
        a, b = prime_Decomposition(self.mod[i])
        self.base2.extend(a)
        self.exponent.extend(b)
        for j in range(len(a)):
            self.remainder2.append(self.remainder[i])
            self.num_equation2 += 1
    # print("********************")
    # print(self.base2)
    # print(self.exponent)
    # print(self.num_equation)
    # print(self.num_equation2)
    # print("********************")
    for i in range(self.num_equation2):
        for j in range(i + 1, self.num_equation2):
            if self.base2[i] == self.base2[j]:
                flag = int(self.exponent[i] > self.exponent[j])
                c = [self.exponent[j], self.exponent[i]][self.exponent[j] > self.exponent[i]]
                b = [self.exponent[i], self.exponent[j]][self.exponent[j] > self.exponent[i]]
                if abs(self.remainder2[i] - self.remainder2[j]) % pow(self.base2[i], c) == 0:
                    if flag:
                        self.remainder3.append(self.remainder2[i])
                    else:
                        self.remainder3.append(self.remainder2[j])
                    self.base3.append(self.base2[i])
                    self.exponent3.append(b)
                    self.num_equation3 += 1
                else:
                    print("Contradiction: the system of congruences has no solution")
                    # Stop here: without this the loop would run on past a
                    # contradiction (the commented variant of this method calls exit()).
                    return None
    for index, value in enumerate(self.base2):
        if self.base2.count(value) == 1:
            self.remainder3.append(self.remainder2[index])
            self.base3.append(self.base2[index])
            self.exponent3.append(self.exponent[index])
            self.num_equation3 += 1
    # print("&&&&&&&&&&&&&&&&&&&&&&")
    # print(self.base3)
    # print(self.exponent3)
    # print(self.num_equation3)
    # print(self.remainder3)
    # print("&&&&&&&&&&&&&&&&&&&&&&")
    with open("transformed_data.txt", 'w') as f:
        for i in range(self.num_equation3):
            f.write("x=%d(mod%d)\n" % (self.remainder3[i], pow(self.base3[i], self.exponent3[i])))
    transformed_data = Data("./transformed_data.txt")
    if transformed_data.crt_judge():
        print("The system can be solved with the Chinese Remainder Theorem")
        tcrt = Crt(transformed_data)
        return tcrt.crt_compute()
    else:
        print("Solving recursively with the general method")
        return Nst(transformed_data).nst_compute()

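# Worked example of the merging rule above (illustrative values only):
# x ≡ 3 (mod 2**2) and x ≡ 7 (mod 2**3) share the base 2. They are compatible
# because 2**min(2, 3) = 4 divides |3 - 7| = 4, and the congruence with the
# larger exponent subsumes the other, so only x ≡ 7 (mod 2**3) is kept.
r_i, r_j = 3, 7
assert abs(r_i - r_j) % 2 ** min(2, 3) == 0   # compatible: keep x ≡ 7 (mod 8)
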
def _pre_process(self):
    self.model_file = os.path.join(self.model_dir, 'model.ckpt')
    self.meta_file = os.path.join(self.model_dir, 'model.ckpt.meta')
    var_file = os.path.join(self.model_dir, 'var.pkl')
    with open(var_file, 'rb') as f:
        self.var, self.config = pickle.load(f)
    basic_config = config.basic_config()
    basic_config.__dict__.update(self.config)
    self.config = basic_config
    vocab_file = './data/vocab.txt'
    self.data_tools = Data(vocab_file, None, basic_config, logging)
    self.tokenizer = Tokenizer(logging)

def __init__(self):
    config = Config()
    self.emb = UtteranceEmbed(config.word2vec_filename)
    train_dataset = Data(config.train_filename, config.test_filename).train_set
    random.shuffle(train_dataset)
    self.train_dataset = train_dataset[:361]
    self.test_dataset = train_dataset[361:]
    self.cate_mapping_dict = joblib.load(config.cate_mapping_dict)
    self.bow_embed = BowEmbed(self.train_dataset, self.train_dataset, 200)
    nb_hidden = 128
    # obs_size = self.emb.dim + self.bow_embed.get_vocab_size()
    obs_size = self.emb.dim
    self.memory_net = Memory_net(obs_size, nb_hidden)

def main(config):
    data = Data(config.train_filename, config.dev_filename, config.test_filename)
    train_data = data.train_set
    dev_data = data.dev_set
    test_data = data.test_set
    # load word2vec
    utter_embed = UtteranceEmbed(config.word2vec_filename)
    input_size = utter_embed.get_vector_size() * 2  # concat size
    num_neurons = [7500, 7500, 5000, 2500]
    model = Dnn(input_size, num_neurons, 3, utter_embed, config)
    model.build()
    model.train(train_data, dev_data, test_data)

def predict_score():
    data = Data()
    test_features = feature_extraction(data, "test")
    submission = pd.DataFrame.from_dict({'id': data.test['id']})
    for i in range(len(data.classes)):
        print("Processing " + data.classes[i])
        # Load the pickled classifier for this class (with-block closes the file).
        with open('../model/logistic_regression_%s.pkl' % data.classes[i], 'rb') as lr_model_pkl:
            lr_model = pickle.load(lr_model_pkl)
        print("Loaded Logistic Regression Model for class %s :: " % data.classes[i], lr_model)
        submission[data.classes[i]] = lr_model.predict_proba(test_features)[:, 1]
    print(submission.head(5))
    print("Saving output")
    submission.to_csv('../data/output.csv', index=False)
    print("Output saved")

def create_and_save():
    data = Data()
    train_features = feature_extraction(data, "train")
    scores = []
    # print(train_features.shape)
    # kbest = SelectKBest(chi2, k=1000)
    for i in range(len(data.classes)):
        print("Processing " + data.classes[i])
        train_target = data.train[data.classes[i]]
        # x_feature = kbest.fit_transform(train_features, train_target)
        # print(x_feature)
        classifier = LogisticRegression(solver='sag')
        cv_score = np.mean(
            cross_val_score(classifier, train_features, train_target, cv=3, scoring='roc_auc'))
        # cv_score = np.mean(cross_val_score(classifier, x_feature, train_target, cv=3, scoring='roc_auc'))
        scores.append(cv_score)
        print('CV score for class {} is {}'.format(data.classes[i], cv_score))
        # Calculate ROC_AUC
        roc_auc(train_features, np.array(train_target), data.classes[i])
        print("Creating model for class " + data.classes[i])
        classifier.fit(train_features, train_target)
        # classifier.fit(x_feature, train_target)
        print("Saving model logistic_regression_%s" % data.classes[i])
        lr_pkl_filename = '../model/logistic_regression_%s.pkl' % data.classes[i]
        with open(lr_pkl_filename, 'wb') as lr_model_pkl:
            pickle.dump(classifier, lr_model_pkl)
        print("Model saved")
    print('Total CV score is {}'.format(np.mean(scores)))
    print("Successfully created and saved all models!")

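# The `feature_extraction` helper used above is not shown in this section.
# A minimal sketch of what such a helper often looks like for this kind of
# per-class text-classification pipeline, assuming the Data object exposes
# `data.train['comment_text']` and `data.test['comment_text']` (an assumption,
# not confirmed by the code above):
from sklearn.feature_extraction.text import TfidfVectorizer

def feature_extraction_sketch(data, split):
    """Fit a word-level TF-IDF vectorizer on the train text and transform `split`."""
    vectorizer = TfidfVectorizer(max_features=50000, ngram_range=(1, 2))
    vectorizer.fit(data.train['comment_text'])
    text = data.train['comment_text'] if split == "train" else data.test['comment_text']
    return vectorizer.transform(text)
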
    # (tail of the test() function; the start of this print call is on an
    #  earlier line that is not part of this excerpt)
                                 total_loss / data_len, total_acc / data_len))
    np.save(result_src, total_y_pre)
    return y, total_y_pre


if __name__ == "__main__":
    # model path
    model_save_src = "data/model/2_layer_lstm_model"
    num_category = 9
    # vectorized data sets
    x_src = "data/vectorized_data/test/x.npy"
    y_src = "data/vectorized_data/test/y.npy"
    result_src = "data/results/rnn_pre.npy"
    vocab_src = "data/middle_result/vocab.npy"
    data = Data()
    vocab, _ = data.load_vocab(vocab_src)
    # model
    config = TRNNConfig()
    config.vocab_size = len(vocab)
    model = TextRNN(config)
    # testing
    print("Begin Testing")
    start_time = time.time()
    y, y_pre = test(x_src, y_src, result_src)
    print("the time is {}".format(get_time_dif(start_time)))
    # evaluation
    precision_score, recall_score, f1_val, accuracy = evaluate(y, y_pre)

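# The `evaluate` helper called above is not shown here. A minimal sketch of an
# equivalent, assuming `y` and `y_pre` are 1-D arrays of class indices (an
# assumption; the repo may pass one-hot arrays instead):
from sklearn import metrics

def evaluate_sketch(y_true, y_pred):
    """Return macro precision, recall, F1 and overall accuracy."""
    precision = metrics.precision_score(y_true, y_pred, average="macro")
    recall = metrics.recall_score(y_true, y_pred, average="macro")
    f1 = metrics.f1_score(y_true, y_pred, average="macro")
    accuracy = metrics.accuracy_score(y_true, y_pred)
    return precision, recall, f1, accuracy
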
def train():
    epochs = 50
    length = 10
    n_units = 128
    n_features = 6
    batch_size = 64
    data = Data(batch_size)
    num_batches = data.num_batches()

    xplaceholder = tf.placeholder(tf.float32, shape=[None, length, n_features])
    yplaceholder = tf.placeholder(tf.float32, shape=[None, 1])
    midPrice_means = tf.placeholder(tf.float32, shape=[None, 1])
    midPrice_stddevs = tf.placeholder(tf.float32, shape=[None, 1])
    # original (de-normalized) midPrice
    origin_midPrice = yplaceholder * midPrice_stddevs + midPrice_means

    pred = lstm_model(xplaceholder, n_units)
    # predicted (de-normalized) midPrice
    pred_midPrice = pred * midPrice_stddevs + midPrice_means

    loss = tf.losses.mean_squared_error(labels=yplaceholder, predictions=pred)
    tf.summary.scalar('loss', loss)
    accuracy = tf.sqrt(tf.losses.mean_squared_error(origin_midPrice, pred_midPrice))
    tf.summary.scalar('accuracy', accuracy)

    step = tf.Variable(0)
    learning_rate = 1e-4
    tf.summary.scalar('learning rate', learning_rate)
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss, global_step=step)

    merged = tf.summary.merge_all()
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        train_writer = tf.summary.FileWriter('log', graph=sess.graph)
        saver = tf.train.Saver(max_to_keep=3)
        step_val = None
        for epoch in range(epochs):
            data.reset_batch()
            total_loss = 0.0
            total_acc = 0.0
            for i in range(num_batches):
                batch_inputs, batch_labels, batch_means, batch_stddevs = data.next_batch()
                feed_dict = {
                    xplaceholder: batch_inputs,
                    yplaceholder: batch_labels,
                    midPrice_means: batch_means,
                    midPrice_stddevs: batch_stddevs
                }
                _, loss_val, acc_val, step_val, summary = sess.run(
                    [optimizer, loss, accuracy, step, merged], feed_dict=feed_dict)
                total_acc += acc_val
                total_loss += loss_val
                train_writer.add_summary(summary, global_step=step_val)
            print('Epoch', epoch, 'train_loss', total_loss / num_batches,
                  'train_acc', total_acc / num_batches)
            '''
            dev_inputs, dev_labels = data.get_dev_data()
            feed_dict = {xplaceholder: dev_inputs, yplaceholder: dev_labels}
            acc_val, loss_val = sess.run([accuracy, loss], feed_dict=feed_dict)
            print('dev_loss', loss_val, 'dev_acc', acc_val)
            '''

        outfile = open('outputs10.csv', 'w')
        outfile.write('midprice\n')
        test_inputs_list, test_means_list, test_stddevs_list = data.get_test_data()
        for i in range(data.test_num_half_day):
            test_means = []
            test_stddevs = []
            test_inputs = test_inputs_list[i]
            mean = test_means_list[i][0]
            stddev = test_stddevs_list[i][0]
            for j in range(len(test_inputs)):
                test_means.append(mean)
                test_stddevs.append(stddev)
            test_inputs = np.asarray(test_inputs)
            test_means = np.asarray(test_means).reshape([-1, 1])
            test_stddevs = np.asarray(test_stddevs).reshape([-1, 1])
            feed_dict = {
                xplaceholder: test_inputs,
                midPrice_means: test_means,
                midPrice_stddevs: test_stddevs
            }
            pred_val = sess.run(pred_midPrice, feed_dict=feed_dict)
            pred_val = np.asarray(pred_val)
            # print(pred_val.shape)
            for k in range(len(pred_val)):  # renamed from `i` to avoid shadowing the outer loop index
                outfile.write(str(pred_val[k][0]) + '\n')
        outfile.close()

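# `lstm_model` is not defined in this excerpt. A minimal TF 1.x sketch of a
# compatible graph function (a single LSTM layer plus a dense regression head
# on the last time step); the real repo's layer sizes and structure may differ:
def lstm_model_sketch(inputs, n_units):
    """inputs: [batch, time, features] -> prediction of shape [batch, 1]."""
    cell = tf.nn.rnn_cell.LSTMCell(n_units)
    outputs, _ = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)
    last_output = outputs[:, -1, :]          # take the final time step
    return tf.layers.dense(last_output, 1)   # linear regression output
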
            # (tail of the training loop in train())
            if total_batch - last_improved > require_improvement:
                # Validation accuracy has not improved for a long time; stop training early.
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break  # break out of the batch loop
        if flag:  # early stop was triggered above; also leave the epoch loop
            break
    print("the best acc on validation is {}".format(best_acc_val))


if __name__ == '__main__':
    train_dir = "data/vectorized_data/train"
    val_dir = "data/vectorized_data/validation"
    vocab_dir = "data/file_dict/train/vocab.npy"
    save_dir = 'data/model2'
    data_process = Data()
    config = CharCNNConfig()
    if not os.path.exists(vocab_dir):
        data_process.build_vocab(train_dir, vocab_dir)
    words, word_to_id = data_process.load_vocab(vocab_dir)
    config.vocab_size = len(words)
    model = CharCNN(config)
    train()

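# The counters referenced above (total_batch, last_improved, best_acc_val, flag)
# are maintained earlier in train(), outside this excerpt. A self-contained
# sketch of the bookkeeping they imply (illustrative only; accuracies made up):
require_improvement = 3
best_acc_val, last_improved, total_batch = 0.0, 0, 0
for acc_val in [0.61, 0.64, 0.64, 0.63, 0.62, 0.62]:
    if acc_val > best_acc_val:
        best_acc_val = acc_val
        last_improved = total_batch        # batch index of the last improvement
    if total_batch - last_improved > require_improvement:
        print("No optimization for a long time, auto-stopping...")
        break
    total_batch += 1
print("the best acc on validation is {}".format(best_acc_val))
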
class TextClassifier:
    def __init__(self, hparams, data_dir):
        self.hparams = hparams
        self.data_dir = data_dir
        # logger
        self._logger = logging.getLogger(__name__)
        # data_process
        self.data_process = Data(self.hparams, self.data_dir)
        (self.char_inputs, self.char_lengths), (self.inputs, self.labels, self.lengths) = \
            self.data_process.load_data()
        # char, id
        self.char2id = self.data_process.char2id
        self.id2char = self.data_process.id2char
        # word, id
        self.word2id = self.data_process.word2id  # dict()
        self.id2word = self.data_process.id2word  # vocabulary
        # label, id
        self.label2id = self.data_process.label2id
        self.id2label = self.data_process.id2label
        # pre-trained word2vec
        with np.load(os.path.join(self.hparams.glove_dir,
                                  "glove.6B.300d.trimmed.npz")) as pretrained_data:
            self.word_embeddings = pretrained_data["embeddings"]
            print(np.shape(self.word_embeddings))

    def _inference(self, inputs: tf.Tensor, lengths: tf.Tensor,
                   char_inputs: tf.Tensor, char_lengths: tf.Tensor):
        print("Building graph for model: Text Classifier")

        # Number of possible output categories.
        output_dim = len(self.id2label)  # output_dim -> 2
        char_vocab_size = len(self.id2char)

        char_embeddings = tf.get_variable(
            name="char_embeddings",
            shape=[char_vocab_size, self.hparams.char_embedding_dim],
            initializer=tf.initializers.variance_scaling(
                scale=2.0, mode="fan_in", distribution="uniform"
            )
        )
        char_embedded = tf.nn.embedding_lookup(char_embeddings, char_inputs)
        # default shape [batch_size, sentence_time, word_time, char_embedding(50)]
        # reshape to [batch_size * sentence_time, word_time, char_embedding(50)]
        char_embedded = tf.reshape(
            char_embedded, [-1, tf.shape(char_inputs)[-1], self.hparams.char_embedding_dim])
        # reshape to [batch_size * word_time]
        char_lengths = tf.reshape(char_lengths, [-1])

        with tf.variable_scope("char-bi-RNN"):
            char_rnn_cell_forward = rnn.GRUCell(self.hparams.char_embedding_dim)
            char_rnn_cell_backward = rnn.GRUCell(self.hparams.char_embedding_dim)
            if self.hparams.dropout_keep_prob < 1.0:
                char_rnn_cell_forward = rnn.DropoutWrapper(
                    char_rnn_cell_forward, output_keep_prob=self._dropout_keep_prob_ph)
                char_rnn_cell_backward = rnn.DropoutWrapper(
                    char_rnn_cell_backward, output_keep_prob=self._dropout_keep_prob_ph)
            _, (char_output_fw_states, char_output_bw_states) = \
                tf.nn.bidirectional_dynamic_rnn(
                    char_rnn_cell_forward,
                    char_rnn_cell_backward,
                    inputs=char_embedded,
                    sequence_length=char_lengths,
                    dtype=tf.float32
                )
            char_hiddens = tf.concat([char_output_fw_states, char_output_bw_states], axis=-1)
            char_hiddens = tf.reshape(
                char_hiddens,
                [tf.shape(char_inputs)[0], tf.shape(char_inputs)[1],
                 self.hparams.char_embedding_dim * 2])

        word_embeddings = tf.Variable(
            self.word_embeddings,
            name="word_embeddings",
            dtype=tf.float32,
            trainable=True
        )
        # shape = [batch_size, time, embed_dim]
        word_embedded = tf.nn.embedding_lookup(word_embeddings, inputs)
        char_word_inputs = tf.concat([word_embedded, char_hiddens], axis=-1)

        with tf.variable_scope("bi-RNN"):
            # Build RNN layers
            rnn_cell_forward = rnn.GRUCell(self.hparams.rnn_hidden_dim)
            rnn_cell_backward = rnn.GRUCell(self.hparams.rnn_hidden_dim)
            # Apply dropout to RNN
            if self.hparams.dropout_keep_prob < 1.0:
                rnn_cell_forward = tf.contrib.rnn.DropoutWrapper(
                    rnn_cell_forward, output_keep_prob=self._dropout_keep_prob_ph)
                rnn_cell_backward = tf.contrib.rnn.DropoutWrapper(
                    rnn_cell_backward, output_keep_prob=self._dropout_keep_prob_ph)
            _, (states_fw_final, states_bw_final) = \
                tf.nn.bidirectional_dynamic_rnn(
                    rnn_cell_forward,
                    rnn_cell_backward,
                    inputs=char_word_inputs,
                    sequence_length=lengths,
                    dtype=tf.float32
                )
            # shape = [batch_size, rnn_hidden_dim * 2]
            final_hiddens = tf.concat([states_fw_final, states_bw_final], axis=-1)

        with tf.variable_scope("layer_out"):
            layer_out = tf.layers.dense(
                inputs=final_hiddens,
                units=output_dim,
                activation=None,
                kernel_initializer=tf.initializers.variance_scaling(
                    scale=2.0, mode="fan_in", distribution="normal"
                )
            )
        return layer_out

    def make_placeholder(self):
        self.inputs_ph = tf.placeholder(tf.int32, shape=[None, None], name="train_input_ph")
        self.labels_ph = tf.placeholder(tf.int32, shape=[None], name="train_label_ph")
        self.lengths_ph = tf.placeholder(tf.int32, shape=[None], name="train_lengths_ph")
        # [batch_size, word_time, char_time]
        self.char_inputs_ph = tf.placeholder(tf.int32, shape=[None, None, None], name="char_input_ph")
        self.char_lengths_ph = tf.placeholder(tf.int32, shape=[None, None], name="char_lengths_ph")
        self._dropout_keep_prob_ph = tf.placeholder(tf.float32, shape=[], name="dropout_keep_prob")

    def make_feed_dict(self, batch_data):
        feed_dict = {}
        batch_inputs, batch_labels, batch_lengths, batch_char_inputs, batch_char_lengths = batch_data
        # word-level
        feed_dict[self.inputs_ph] = batch_inputs
        feed_dict[self.labels_ph] = batch_labels
        feed_dict[self.lengths_ph] = batch_lengths
        # char-level
        feed_dict[self.char_inputs_ph] = batch_char_inputs
        feed_dict[self.char_lengths_ph] = batch_char_lengths
        feed_dict[self._dropout_keep_prob_ph] = self.hparams.dropout_keep_prob
        return feed_dict

    def build_graph(self):
        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        # logits
        with tf.variable_scope("inference", reuse=False):
            logits = self._inference(self.inputs_ph, self.lengths_ph,
                                     self.char_inputs_ph, self.char_lengths_ph)
        with tf.name_scope("cross_entropy"):
            loss_op = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits, labels=self.labels_ph, name="cross_entropy")
            self.loss_op = tf.reduce_mean(loss_op, name='cross_entropy_mean')
        # Minimize the reduced (mean) loss rather than the per-example vector.
        self.train_op = tf.train.AdamOptimizer().minimize(
            self.loss_op, global_step=self.global_step)
        eval = tf.nn.in_top_k(logits, self.labels_ph, 1)
        correct_count = tf.reduce_sum(tf.cast(eval, tf.int32))
        with tf.name_scope("accuracy"):
            self.accuracy = tf.divide(correct_count, tf.shape(self.labels_ph)[0])

    def train(self):
        sess = tf.Session()
        with sess.as_default():
            global_step = tf.Variable(0, name='global_step', trainable=False)  # unused; build_graph defines self.global_step
            # build placeholders
            self.make_placeholder()
            # build train graph
            self.build_graph()
            # checkpoint file saver
            saver = tf.train.Saver()
            # get data
            inputs_id, labels_id, chars_id = \
                self.data_process.data_id(self.inputs, self.labels, self.char_inputs)
            total_batch = int(len(inputs_id) / self.hparams.batch_size) + 1
            tf.global_variables_initializer().run()

            for epochs_completed in range(self.hparams.num_epochs):
                for iter in range(total_batch):
                    batch_data = self.data_process.get_batch_data(
                        inputs_id, labels_id, self.lengths, chars_id, self.char_lengths,
                        iter, self.hparams.batch_size)
                    accuracy_val, loss_val, global_step_val, _ = sess.run(
                        [self.accuracy, self.loss_op, self.global_step, self.train_op],
                        feed_dict=self.make_feed_dict(batch_data)
                    )
                    if global_step_val % 10 == 0:
                        self._logger.info("[Step %d] loss: %.4f, accuracy: %.2f%%" % (
                            global_step_val, loss_val, accuracy_val * 100))
                self._logger.info("End of epoch %d." % (epochs_completed + 1))
                save_path = saver.save(
                    sess,
                    os.path.join(self.hparams.root_dir,
                                 "saves_%s/model.ckpt" % (self.hparams.model)),
                    global_step=global_step_val)
                self._logger.info("Model saved at: %s" % save_path)

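# Minimal usage sketch (not from the repo): the hparams fields listed here are
# the ones TextClassifier itself reads above; the Data class may require more,
# and the concrete values are placeholders.
from argparse import Namespace

hparams = Namespace(
    glove_dir="./glove",            # must contain glove.6B.300d.trimmed.npz
    char_embedding_dim=50,
    rnn_hidden_dim=128,
    dropout_keep_prob=0.8,
    batch_size=32,
    num_epochs=5,
    root_dir="./",
    model="text_classifier",
)
classifier = TextClassifier(hparams, data_dir="./data")
classifier.train()
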
from data_process import Data

if __name__ == '__main__':
    target_data = Data()
    path = ".././source/origin_source/C1_data.xlsx"
    # First restore the structure of my earlier version.
    # At least 5 rows of data must be read in, otherwise an error is raised.
    # Start by processing data on the order of 500 rows.
    target_data.classfing_all_data()
    '''
    nrows = 5000
    # Read in the data and set the basic parameters.
    target_data.reset()
    target_data.store_data_with_can_matrix_for_single_canid(path, nrows)
    target_data.set_data_length()
    # Classify the data.
    target_data.initial_classfy_data()
    target_data.process_classfy_data()
    # Greedy search for the best solution.
    target_data.greedy_find_solution()
    # Display the results.
    target_data.show_results()
    # target_data.check_sensor_or_counter()
    target_data.show_counter()
    target_data.show_sensor()
    '''

def nst_compute(self):
    # First factor every modulus of the system into primes (via the
    # prime_Decomposition helper). The standard factorizations of the moduli
    # yield a second, equivalent system of congruences.
    for i in range(self.num_equation):
        a, b = prime_Decomposition(self.mod[i])
        self.base2.extend(a)
        self.exponent.extend(b)
        for j in range(len(a)):
            self.remainder2.append(self.remainder[i])
            self.num_equation2 += 1

    # Double loop over the factorized system.
    # This step reshapes the system so that it satisfies the coprimality
    # condition of the Chinese Remainder Theorem.
    for i in range(self.num_equation2):
        for j in range(i + 1, self.num_equation2):
            if self.base2[i] == self.base2[j]:
                # flag marks which of the i-th and j-th exponents is larger.
                flag = int(self.exponent[i] > self.exponent[j])
                # c is the smaller exponent, b the larger one.
                c = [self.exponent[j], self.exponent[i]][self.exponent[j] >= self.exponent[i]]
                b = [self.exponent[i], self.exponent[j]][self.exponent[j] > self.exponent[i]]
                # Compatibility test: (base**c) must divide the difference of the remainders.
                if abs(self.remainder2[i] - self.remainder2[j]) % pow(self.base2[i], c) == 0:
                    # Keep the remainder belonging to the larger exponent.
                    if flag:
                        self.remainder3.append(self.remainder2[i])
                    else:
                        self.remainder3.append(self.remainder2[j])
                    # Extend the base and exponent lists and count one more equation.
                    self.base3.append(self.base2[i])
                    self.exponent3.append(b)
                    self.num_equation3 += 1
                else:
                    print("Contradiction: the system of congruences has no solution")
                    exit()

    # Add the congruences whose base appears only once to the final system.
    # enumerate plus list.count: if a base occurs exactly once, copy that entry
    # into the corresponding final lists.
    for index, value in enumerate(self.base2):
        if self.base2.count(value) == 1:
            self.remainder3.append(self.remainder2[index])
            self.base3.append(self.base2[index])
            self.exponent3.append(self.exponent[index])
            self.num_equation3 += 1

    # Write the final system to a new file so that a fresh Data object can be
    # built from that path. After all this work, the rebuilt system may now
    # satisfy the Chinese Remainder Theorem; if it does, the Crt class finishes the job.
    with open("src_data/transformed_data.txt", 'w') as f:
        for i in range(self.num_equation3):
            f.write("x=%d(mod%d)\n" % (self.remainder3[i],
                                       pow(self.base3[i], self.exponent3[i])))
    # Build the Data object.
    transformed_data = Data("src_data/transformed_data.txt")
    # If crt_judge returns 1, one round of transformation was enough.
    if transformed_data.crt_judge():
        tcrt = Crt(transformed_data)
        return tcrt.crt_compute()
    # If crt_judge returns 0, this round did not reach CRT form yet, but the
    # system is closer to it. Recurse: feed transformed_data back into Nst for
    # another round of the same three-step transformation.
    else:
        print("Solving recursively with the general method")
        return Nst(transformed_data).nst_compute()

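# `prime_Decomposition` is not shown in this excerpt. Judging by how its two
# return values are consumed above, it returns parallel lists of prime bases
# and exponents. A minimal trial-division sketch with that contract (an
# illustration, not the repo's implementation):
def prime_decomposition_sketch(n):
    """Return ([p1, p2, ...], [e1, e2, ...]) with n == p1**e1 * p2**e2 * ..."""
    bases, exponents = [], []
    d = 2
    while d * d <= n:
        if n % d == 0:
            count = 0
            while n % d == 0:
                n //= d
                count += 1
            bases.append(d)
            exponents.append(count)
        d += 1
    if n > 1:                     # whatever is left is itself prime
        bases.append(n)
        exponents.append(1)
    return bases, exponents

# Example: 360 = 2**3 * 3**2 * 5
# prime_decomposition_sketch(360) -> ([2, 3, 5], [3, 2, 1])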