def train():
    tensorboard_dir = './tensorboard/Lstm_CNN'
    save_dir = './checkpoints/Lstm_CNN'
    if not os.path.exists(tensorboard_dir):
        os.makedirs(tensorboard_dir)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    save_path = os.path.join(save_dir, 'best_validation')

    tf.summary.scalar('loss', model.loss)
    tf.summary.scalar('accuracy', model.accuracy)
    merged_summary = tf.summary.merge_all()
    writer = tf.summary.FileWriter(tensorboard_dir)
    saver = tf.train.Saver()
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    writer.add_graph(session.graph)

    print("Preparing the training data....")
    x_train, y_train = process(pm.train_filename, word_ids, cat_to_id, max_length=300)
    print("Preparing the testing data....")
    x_test, y_test = process(pm.test_filename, word_ids, cat_to_id, max_length=300)

    for epoch in range(pm.num_epochs):
        print('Epoch:', epoch + 1)
        num_batches = int((len(x_train) - 1) / pm.batch_size) + 1
        batch_train = batch_iter(x_train, y_train, batch_size=pm.batch_size)
        for x_batch, y_batch in batch_train:
            real_seq_len = seq_length(x_batch)
            feed_dict = model.feed_data(x_batch, y_batch, real_seq_len, pm.keep_prob)
            _, global_step, _summary, train_loss, train_accuracy = session.run(
                [model.optimizer, model.global_step, merged_summary, model.loss, model.accuracy],
                feed_dict=feed_dict)
            if global_step % 100 == 0:
                test_loss, test_accuracy = model.test(session, x_test, y_test)
                print('global_step:', global_step, 'train_loss:', train_loss,
                      'train_accuracy:', train_accuracy, 'test_loss:', test_loss,
                      'test_accuracy:', test_accuracy)
            if global_step % num_batches == 0:
                print('Saving Model...')
                saver.save(session, save_path, global_step=global_step)
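# Hedged sketch, not from the original repo: the training loop above assumes
# `batch_iter` and `seq_length` helpers from data_process. Assuming the inputs
# are zero-padded id sequences, minimal versions could look like this:
import numpy as np

def batch_iter(x, y, batch_size=64):
    """Yield shuffled (x, y) mini-batches covering one epoch."""
    x, y = np.asarray(x), np.asarray(y)
    indices = np.random.permutation(len(x))
    for start in range(0, len(x), batch_size):
        idx = indices[start:start + batch_size]
        yield x[idx], y[idx]

def seq_length(x_batch):
    """Real (unpadded) length of each sequence, counting non-zero ids."""
    return np.array([np.count_nonzero(row) for row in x_batch])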
def _load(self, use_user_info):
    """Load up to cache_size lines starting from the last position.

    :return: False when the file is exhausted, True otherwise.
    """
    if self.max_lines:
        lines = list(islice(self.file,
                            min([self.cache_size, self.max_lines - self.current_line])))
    else:
        lines = list(islice(self.file, self.cache_size))
    if self.drop_val and len(lines) < self.cache_size:
        # last chunk of the file: drop the trailing validation split
        lines = lines[:-val_size]
    if not lines:
        return False
    self.current_line += len(lines)
    if self.shuffle:
        permutation = np.random.permutation(len(lines))
        lines = [lines[i].strip().split('\x01') for i in permutation]
    else:
        lines = [line.strip().split('\x01') for line in lines]
    # val_size and batch_size are module-level constants defined elsewhere
    self.data = [process(lines[i:i + batch_size], self.token_embedding_level, use_user_info)
                 for i in range(0, len(lines), batch_size)]
    return True
def load_data(file_path, queue, offset, stride, cache_size, use_user_info,
              shuffle=True, drop_val=True):
    f = open(file_path, encoding="utf-8")
    # skip this worker's offset before reading the first chunk
    list(islice(f, offset * cache_size))
    while True:
        lines = list(islice(f, cache_size))
        if drop_val and len(lines) < cache_size:
            # last chunk of the file: drop the trailing validation split
            lines = lines[:-val_size]
        if not lines:
            # wrap around to the beginning of the file
            f.seek(0)
            list(islice(f, offset * cache_size))
            lines = list(islice(f, cache_size))
        if shuffle:
            permutation = np.random.permutation(len(lines))
            lines = [lines[i].strip().split('\x01') for i in permutation]
        else:
            lines = [line.strip().split('\x01') for line in lines]
        for i in range(0, len(lines), batch_size):
            queue.put(process(lines[i:i + batch_size], None, use_user_info))
        # skip the chunks handled by the other (stride - 1) workers
        list(islice(f, (stride - 1) * cache_size))
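# Hedged usage sketch (assumption): the offset/stride arguments suggest load_data is
# meant to run in several worker processes that share one queue, each worker reading
# every stride-th chunk of the file. Worker count and queue size here are illustrative.
from multiprocessing import Process, Queue

def start_loaders(file_path, n_workers=4, cache_size=10000, use_user_info=False):
    queue = Queue(maxsize=32)
    workers = [Process(target=load_data,
                       args=(file_path, queue, offset, n_workers, cache_size, use_user_info),
                       daemon=True)
               for offset in range(n_workers)]
    for w in workers:
        w.start()
    return queue, workers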
def get_positions():
    position_dic, position_counter, top_person, positions_set, frame = process()
    model = gensim.models.Word2Vec.load('output_word.model')
    new_positions_dict = {}
    category_list = ['software', 'product', 'consultant', 'analyst', 'accounting',
                     'manufacturing', 'sports', 'banking', 'fundraiser', 'fashion',
                     'information', 'operation', 'market', 'reporter', 'sales',
                     'finance', 'hr', 'public', 'technology', 'business']
    count = 0
    for word in positions_set:
        old_word = word
        new_position = ''
        word = re.sub('[^0-9a-zA-Z]+', ' ', word)
        word_list = word.split()
        max_similarity = 0
        for i in word_list:
            for j in category_list:
                simi = 0
                try:
                    simi = model.similarity(i.lower(), j)
                except KeyError:
                    # word not in the embedding vocabulary
                    continue
                if simi >= max_similarity:
                    max_similarity = simi
                    new_position = j
        if new_position not in category_list:
            new_position = 'other'
            count += 1
        new_positions_dict[old_word] = new_position
    # print(count)
    print(new_positions_dict)
    return new_positions_dict
def post_classification_result():
    app.logger.debug('/hitec/classify/concepts/seanmf/run called')

    timestamp = '{:%Y-%m-%d_%H%M%S-%f}'.format(datetime.datetime.now())

    # app.logger.debug(request.data.decode('utf-8'))
    content = json.loads(request.data.decode('utf-8'))
    app.logger.info(content)

    # save content
    dataset = content["dataset"]["documents"]
    with open('data/data_' + timestamp + ".txt", 'w') as out_file:
        for doc in dataset:
            out_file.write(doc["text"] + '\n')

    # get parameters
    params = content["params"]

    # start pre-processing
    data_process.process(timestamp, vocab_min_count=int(params["vocab_min_count"]))

    # start concept detection
    train.train(timestamp, content["method"], float(params["alpha"]),
                float(params["beta"]), int(params["n_topics"]),
                int(params["max_iter"]), float(params["max_err"]),
                params["fix_random"] == "true")

    # prepare results
    topics, doc_topic = results.prepare_results(timestamp)
    res = dict()
    res.update({"topics": topics})
    res.update({"doc_topic": doc_topic})

    # calculate metrics
    metrics = vis_topic.get_metrics(timestamp)
    res.update({"metrics": metrics})

    # cleanup
    utils.cleanup(timestamp)

    # send results back
    return jsonify(res)
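# Hedged request sketch, derived only from the fields the handler reads above;
# the document texts and parameter values are illustrative:
#
# POST /hitec/classify/concepts/seanmf/run
# {
#   "dataset": {"documents": [{"text": "first document"}, {"text": "second document"}]},
#   "method": "seanmf",
#   "params": {"vocab_min_count": "5", "alpha": "0.1", "beta": "0.0", "n_topics": "10",
#              "max_iter": "500", "max_err": "0.1", "fix_random": "false"}
# }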
def run(self) -> None:
    # process the queued tasks one by one
    num = len(self.tasks)  # number of tasks
    for n, (fp, sh) in enumerate(self.tasks):  # iterate over each task
        try:
            # emit a message telling what's going on
            self.sig_msg.emit(
                f'<p><span style="color: blue;">[{n + 1}/{num}]</span>, Normalising <b>{fp}</b></p>')
            # call the process function to normalise the raw file
            r = process(fp=fp, shiftInput=sh)
            # emit the results
            self.sig_result.emit(fp, r)
            # emit a success message
            self.sig_msg.emit(
                f'<p><span style="color: blue;">[{n + 1}/{num}]</span>,'
                f' <span style="color: green;">[SUCCEEDED]</span> to normalise {fp}</p>')
        except Exception:
            # catching here keeps one bad file from crashing the whole run
            self.sig_msg.emit(
                f'<p><span style="color: blue;">[{n + 1}/{num}]</span>,'
                f' <span style="color: red;">[FAILED]</span> to normalise {fp}</p>')
    self.sig_finished.emit()  # all tasks processed
def do_POST(self):
    content_length = int(self.headers['Content-Length'])  # size of the request body
    self.data_string = self.rfile.read(content_length)
    data = simplejson.loads(self.data_string)
    logging.info("POST request,\nPath: %s\nHeaders:\n%s\n",
                 str(self.path), str(self.headers))

    bpm = data["BPM"]
    audio = data["AudioData"]

    # pass the data to the processing script
    data_to_return = data_process.process(audio, bpm)

    self._set_response()
    json_string = json.dumps(data_to_return)
    self.wfile.write(json_string.encode(encoding='utf_8'))
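# Hedged sketch (assumption): _set_response is not shown above; in handlers built on
# http.server.BaseHTTPRequestHandler it typically just sends the status line and
# headers before the JSON body is written:
def _set_response(self):
    self.send_response(200)
    self.send_header('Content-Type', 'application/json')
    self.end_headers()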
def predict():
    pre_labels = []
    labels = []
    session = tf.Session()
    save_path = tf.train.latest_checkpoint('./checkpoints/Lstm_CNN')
    saver = tf.train.Saver()
    saver.restore(sess=session, save_path=save_path)

    val_x, val_y = process(pm.val_filename, word_to_ids, cat_to_id, max_length=pm.seq_length)
    batch_val = batch_iter(val_x, val_y, batch_size=64)
    for x_batch, y_batch in batch_val:
        real_seq_len = seq_length(x_batch)
        feed_dict = model.feed_data(x_batch, y_batch, real_seq_len, 1.0)
        predict_label = session.run(model.predict, feed_dict=feed_dict)
        pre_labels.extend(predict_label)
        labels.extend(y_batch)
    return pre_labels, labels
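# Hedged follow-up sketch (assumption): if model.predict yields class ids while the
# collected labels are one-hot rows, validation accuracy could be computed like this:
import numpy as np

pre_labels, labels = predict()
true_ids = np.argmax(np.asarray(labels), axis=1)
print('validation accuracy:', float(np.mean(np.asarray(pre_labels) == true_ids)))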
import os
import pandas as pd
import numpy as np
from data_process import process

INPUT_FOLDER = '/labs/colab/3DDD/kaggle_data/sample_images/'
patients = os.listdir(INPUT_FOLDER)
patients.sort()
labels = pd.read_csv('/labs/colab/3DDD/kaggle_data/stage1_labels.csv', index_col=0)

sample_data = []
for num, patientID in enumerate(patients):
    try:
        label = labels.get_value(patientID, 'cancer')
        rnnInput, label = process(INPUT_FOLDER + patientID, label, rsp=False)
        sample_data.append((rnnInput, label))
        print('Patient', num, 'processed.')
    except KeyError:
        print('Unlabeled patient:', num, 'passed.')

np.save('sample_data.npy', sample_data)
print('pre-processed data saved.')
import numpy as np
from data_process import process

# generate some data for training
process_data = process()
X = []
y = []
for i in range(60):
    dim = i + 1
    print('curr iter: %d' % i)
    # for N in range(10000, 110000, 10000):
    for N in range(50000, 150000, 10000):
        layer_dims = [np.random.randint(60, 80), 100]
        data = process_data.gen_data(dim, layer_dims, N).astype('float32')
        eigenvalues = process_data.get_eigenvalues(data)
        X.append(eigenvalues)
        y.append(dim)

X = np.array(X)
y = np.array(y)
np.savez('train_data.npz', X=X, y=y)
print('completed data generation')
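# Hedged usage sketch: reloading the archive saved above for a later fitting step
# (the file name matches the np.savez call; nothing else is assumed):
data = np.load('train_data.npz')
X, y = data['X'], data['y']
print(X.shape, y.shape)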
def main():
    nltk.download("stopwords")
    data_process = process()
    base_dir, result_dir = data_process.get_path()
    store_path, history_data = data_process.history_data(result_dir, nb_epoch)

    ### read training and testing data
    (Y_data, X_data, tag_list) = data_process.read_data(train_path, True)
    (_, X_test, _) = data_process.read_data(test_path, False)
    all_corpus_temp = X_data + X_test
    train_tag = data_process.to_multi_categorical(Y_data, tag_list)

    # #part3
    # train_tag_all = np.sum(train_tag, axis=0)
    # train_tag_all = np.array(train_tag_all, dtype="int")
    # for i in range(len(tag_list)):
    #     print(str(tag_list[i]) + ": " + str(train_tag_all[i]))

    ### RNN
    # -----------------------------------------------------------------------------
    ### tokenizer for all data
    tokenizer_temp = Tokenizer()
    tokenizer_temp.fit_on_texts(all_corpus_temp)
    word_index_temp = tokenizer_temp.word_index
    all_corpus = [w for w in word_index_temp if w not in stopwords.words("english")]

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_corpus)
    word_index = tokenizer.word_index

    tokenizer_file = os.path.join(store_path, "tokenizer.pickle")
    data_process.store_pickle(tokenizer, tokenizer_file)
    label_file = os.path.join(store_path, "label.pickle")
    data_process.store_pickle(tag_list, label_file)

    ### convert word sequences to index sequences
    train_sequences = tokenizer.texts_to_sequences(X_data)
    test_sequences = tokenizer.texts_to_sequences(X_test)

    ### padding to equal length
    train_sequences = pad_sequences(train_sequences, maxlen=max_article_length)
    test_sequences = pad_sequences(test_sequences, maxlen=max_article_length)

    ### split data into training set and validation set
    (X_train, Y_train), (X_val, Y_val) = data_process.split_data(
        train_sequences, train_tag, split_ratio)

    ### get embedding matrix from GloVe
    embedding_dict = data_process.get_embedding_dict('glove.6B.%dd.txt' % embedding_dim)
    num_words = len(word_index) + 1
    embedding_matrix = data_process.get_embedding_matrix(
        word_index, embedding_dict, num_words, embedding_dim)

    ### build model
    model = Sequential()
    model.add(Embedding(num_words,
                        embedding_dim,
                        weights=[embedding_matrix],
                        input_length=max_article_length,
                        trainable=False))
    model.add(GRU(128, activation='tanh', dropout=0.5))
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(len(tag_list), activation='sigmoid'))
    model.summary()

    # rms = RMSprop(lr=0.0005)
    model.compile(loss='categorical_crossentropy',
                  optimizer="adam",
                  metrics=[data_process.f1_score])

    earlystopping = EarlyStopping(monitor='val_f1_score', patience=10, verbose=1, mode='max')
    checkpoint = ModelCheckpoint(filepath='best.hdf5',
                                 verbose=1,
                                 save_best_only=True,
                                 save_weights_only=False,
                                 monitor='val_f1_score',
                                 mode='max')
    # csv_logger = CSVLogger('RNN.log')
    hist = model.fit(X_train, Y_train,
                     validation_data=(X_val, Y_val),
                     epochs=nb_epoch,
                     batch_size=batch_size,
                     callbacks=[history_data, earlystopping, checkpoint])
    # callbacks=[history_data, earlystopping, checkpoint, csv_logger])

    data_process.dump_history(store_path, history_data)
    plot_model(model, to_file=os.path.join(store_path, 'model.png'))
# tail of the run(...) helper: average reward/regret over replications and a 95% CI
mean_reward = total_reward * 1.0 / n_replications
var = var * 1.0 / n_replications - mean_reward * mean_reward
# 95% confidence interval under a normal approximation
CI = [mean_reward - 1.96 * np.sqrt(var / n_replications),
      mean_reward + 1.96 * np.sqrt(var / n_replications)]
mean_regret = total_regret * 1.0 / n_replications
return mean_regret, mean_reward, CI


if __name__ == '__main__':
    option = 2  # 1: generate new; 2: read from files
    n_categories = 20
    n_reviews = 20
    featurelist, mean, cov = data.process(option, n_categories, n_reviews)
    print('data processed')

    sigma = 0.8
    n_steps = 2275
    np.random.seed(1)
    # U = user.user(mean, cov, sigma)
    # S = server.server(mean, cov, sigma, featurelist, n_steps)
    # restaurant = S.recommand()
    # print(restaurant)
    print(server.ALG())

    n_replications = 100
    mean_regret, mean_reward, CI = run(n_replications, n_steps, mean, cov,
from data_process import Token, process, batch_iter
from bert_classify import *
import os

# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

best = 0.0
n = 0

print("Loading Training data...")
input_id, input_segment_, mask_, label = Token(pm.train_filename)
test_id, test_segment, test_mask, test_label = Token(pm.test_filename)
label = process(label)


def evaluate(sess, test_id, test_segment, test_mask, test_label):
    A = 1e-10
    for i in range(len(test_label)):
        pre_lab = sess.run(predict,
                           feed_dict={
                               input_x: [test_id[i]],
                               input_segment: [test_segment[i]],
                               mask: [test_mask[i]],
                               keep_pro: 1.0
                           })
        result = test_label[i]
        if int(pre_lab) == int(result):
            A += 1
import os
import sys
import numpy as np
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from data_process import process

test_path = sys.argv[1]
result_path = sys.argv[2]
max_article_length = 190
range_value = 0.4

data_process = process()
(_, X_test, _) = data_process.read_data(test_path, False)
tag_list = data_process.load_pickle("./model/label.pickle")
tokenizer = data_process.load_pickle("./model/tokenizer.pickle")

test_sequences = tokenizer.texts_to_sequences(X_test)
test_sequences = pad_sequences(test_sequences, maxlen=max_article_length)

model = load_model("./model/best.hdf5",
                   custom_objects={"f1_score": data_process.f1_score})
Y_pred = model.predict(test_sequences)

# normalise each prediction row by its infinity norm
linfnorm = np.linalg.norm(Y_pred, axis=1, ord=np.inf)
preds = Y_pred.astype(np.float64) / linfnorm[:, None]
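# Hedged continuation sketch (assumption): result_path and range_value are defined but
# unused above, so one plausible final step is thresholding the normalised predictions
# at range_value and writing the selected tags to result_path:
with open(result_path, 'w') as out_file:
    out_file.write('"id","tags"\n')
    for idx, row in enumerate(preds):
        tags = [tag_list[i] for i, score in enumerate(row) if score >= range_value]
        out_file.write('"%d","%s"\n' % (idx, " ".join(tags)))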
num_file = len(csvfile_list)

# list of dataset files
# filename_list = ['F:\业务分类\sourcecode\datasets\pcap\Game.pcap']
# files that store the processed datasets
# csvfile_list = ['game.csv']

# length of each record
DATA_LENGTH = 1500
# number of records in each dataset file
num_list = []
for index in range(num_file):
    # 'w' means write; newline="" keeps written rows from being separated by a blank line
    with open(csvfile_list[index], 'w', newline="") as file:
        # Truncate each record to 1500 bytes, pad the UDP header with 12 zeros to align
        # with TCP, convert each byte to a value between 0 and 255, normalise to 0-1,
        # then write the result to the corresponding file in csvfile_list.
        num = process(file, filename_list[index], DATA_LENGTH)
        num_list.append(num)

# find the dataset with the fewest records
min_num = min(num_list)
# compute how many records to drop from each dataset
drop_num = [x - min_num for x in num_list]
# randomly drop the surplus records so every dataset matches the smallest one
random_drop(csvfile_list, num_list, drop_num)
# concatenate the datasets, randomly split into training and test sets, and add labels
data_concat(csvfile_list, train_list, test_list)

train_model(DATA_LENGTH, num_file)

# import numpy as np
# print(x_train)
# print(np.shape(x_train))