def test_plot(self):
    """Smoke-test: read the sample LAS file and plot its second vs third curve."""
    input_file = "../../EAGE2018/Well-A_finished/HQLD_B_2C1_75-1_Well-A_ISF-BHC-MSFL-GR__COMPOSIT__1.LAS"
    las = read_data.read(input_file)
    curve_names = las.keys()
    plotting.plot_two_columns(las.df(), curve_names[1], curve_names[2])
def continous_train(self, smoothing=1e-2):
    """Fit a Gaussian (continuous) naive-Bayes model.

    Estimates a per-pixel mean/variance for each class plus class priors.
    `smoothing` is added to every variance to avoid zero-variance pixels.
    """
    self.gaussians = {}  # class -> {"mean": ..., "var": ...} per-pixel stats
    self.priors = {}  # P(c)
    eprint("continous mode!")
    """ read data"""
    # d is an indexable accessor: d(i) -> (label, image)
    d, data_num = read_data.read("training")
    # data_num=100
    eprint("train data_num: ", data_num)
    labels_list = np.array([d(i)[0] for i in range(data_num)])
    labels = set(labels_list)
    """ calcuate mean and var for each pixel and label!!"""
    for c in labels:
        # All images belonging to class c, stacked for vectorised stats.
        img_list = np.array(
            [d(i)[1] for i in range(data_num) if d(i)[0] == c])
        # print(img_list.mean(axis=0).shape )
        self.gaussians[c] = {
            "mean": img_list.mean(axis=0),
            "var": img_list.var(axis=0) + smoothing
        }
        # Prior = fraction of training samples labelled c.
        self.priors[c] = len([ele for ele in labels_list if ele == c
                              ]) / len(labels_list)
        # print( img_list.shape)
        # break
    pass
def interactive_plots():
    """Flask view: plot one LAS curve (chosen via ?feature_name=) and render the page.

    Saves the matplotlib figure under static/ and passes the filename plus the
    available curve names to the template.
    """
    current_feature_name = request.args.get("feature_name")
    if current_feature_name is None:  # was `== None`
        current_feature_name = "GR"
    input_file = DATA_DIR + "EAGE2018/Well-A_finished/HQLD_B_2C1_75-1_Well-A_ISF-BHC-MSFL-GR__COMPOSIT__1.LAS"
    result = read_data.read(input_file).df()
    result_small = result[current_feature_name]
    plot = result_small.plot()
    fig = plot.get_figure()
    # One image per feature so switching features does not serve a stale file.
    fname = "output{}.png".format(current_feature_name)
    fig.savefig('static/{}'.format(fname))
    mytext = "Hello with my text"
    # BUG FIX: the template previously always received current_feature_name='GR',
    # so the dropdown never reflected the selected feature.
    return render_template('interactive_plots.html',
                           mytext=mytext,
                           myimage=fname,
                           feature_names=result.columns,
                           current_feature_name=current_feature_name)
def discrete_train(self):
    """Fit a discrete naive-Bayes model.

    Gray values are right-shifted by 3 bits (256 -> 32 bins) and a per-pixel,
    per-class histogram of bin counts is accumulated.
    """
    self.bit_shift = 3  # 256 >> 3 -> 32 intensity bins
    d, data_num = read_data.read("training")
    """ init dicttionary"""
    count_d = {}

    def return_0_9_d(d):
        # One 32-bin histogram per digit class 0..9; the tiny initial value
        # avoids zero probabilities at prediction time.
        for i in range(10):
            # d[i]=[1 for j in range( 256/(2**self.bit_shift) )]
            d[i] = [0.00000001 for j in range(32)]
        return d

    # 28x28 grid of per-pixel histogram dictionaries.
    self.two_d_list_of_d = [[return_0_9_d({}) for ele in range(28)]
                            for ele in range(28)]
    self.label_count = [0 for i in range(10)]
    # data_num=1000
    eprint("loading data " + str(data_num) + ", it will be late!")
    for i in range(data_num):
        """ [0] in label, [1] is image """
        label = d(i)[0]
        self.label_count[label] += 1
        img = d(i)[1]
        # print("data:", i)
        # print("label:", label)
        for row, p in enumerate(img):
            # print([ ele>>self.bit_shift for ele in p])
            for col, v in enumerate([ele >> self.bit_shift for ele in p]):
                self.two_d_list_of_d[row][col][label][v] += 1
    print(self.label_count)
def get_data(input_file_name=constants.FILE_DATA,column_offset=0,source_type=constants.SOURCE_TYPE,to_index=True,to_append=False): selected_columns = get_features() #Only select these features log.info("Reading data") try: df = read_data.read(input_file_name,source_type,selected_columns) except IOError as e: log.error("Could not open file - %s" % input_file_name) raise if(DEBUG): df.to_csv('raw.csv') df = process_data(df) if(DEBUG): df.to_csv('preprocessed.csv') if(to_index): log.info("Connecting to db: %s" % constants.FILE_INDEX_DB) conn = sqlite3.connect(constants.FILE_INDEX_DB) outf = df[selected_columns[0]] #select only ids if(to_append): log.info("Appending to index") outf.to_sql(constants.TABLE_NAME,conn,if_exists='append',index_label=constants.COL_INDEX) else: log.info("Saving to index") outf.to_sql(constants.TABLE_NAME,conn,if_exists='replace',index_label=constants.COL_INDEX) conn.close() return df[selected_columns[column_offset:]]
def analyze_file(p, n, K):
    """Aggregate burst-size statistics (and, for large/critical systems, fractal-
    dimension histograms) over K simulation runs for parameters p and n, then
    write tau/survivor/fracdim data files.
    """
    freq_dict = {}  # burst size -> multiplicity across all runs
    frac_dim_size_lim = 50  # only do fractal analysis for n above this
    frac_p_lim = 0.465  # ... and p above this (near criticality)
    f_arr = np.zeros(shape=(10,))
    max_bin = 0
    for i in range(1, K + 1):
        burst_sizes, bonds = read('../data/p{p}n{n}/run{i}p{p}n{n}.data'.format(i=i, p=p, n=n), force_small=False)
        if bonds and n > frac_dim_size_lim and p > frac_p_lim:
            # bonds non-empty and the system is big/critical enough to analyse.
            # frac_spacing: presumably a module-level constant — TODO confirm.
            freq, n_bins = fracdim_anal(bonds, frac_spacing)
            f_arr = add_frequencies(f_arr, freq)
            if n_bins > max_bin:
                max_bin = n_bins
        for size in burst_sizes:
            if size in freq_dict:
                freq_dict[size] += 1
            else:
                freq_dict[size] = 1
        sys.stdout.write('\r ... completed file {i}'.format(i=i))
    burst_size = list(freq_dict.keys())
    multiplicity = list(freq_dict.values())
    pdf, surv = tau_b_anal(burst_size, multiplicity)
    print_data('./data/p{p}n{n}{type}'.format(p=p, n=n, type='tau'), *pdf)
    print_data('./data/p{p}n{n}{type}'.format(p=p, n=n, type='survivor'), *surv)
    if bonds and n > frac_dim_size_lim and p > frac_p_lim:
        # NOTE(review): 'bonds' here is the value from the LAST loop iteration —
        # this only gates whether any fractal data was collected; confirm intent.
        bins = plot_bins(exp_bins(1, base=frac_spacing, nums=max_bin))
        print_data('./data/p{p}n{n}{type}'.format(p=p, n=n, type='fracdim'), bins, f_arr)
def bokeh():
    """Flask view: render a Bokeh plot of the sample LAS file's second vs third curve."""
    las_path = DATA_DIR + "EAGE2018/Well-A_finished/HQLD_B_2C1_75-1_Well-A_ISF-BHC-MSFL-GR__COMPOSIT__1.LAS"
    las = read_data.read(las_path)
    frame = las.df()
    curves = las.keys()
    script, div, js_resources, css_resources = plotting.plot_bokeh(
        frame, curves[1], curves[2])
    # Hand the embeddable fragments straight to the template.
    return render_template('bokeh_index.html',
                           plot_script=script,
                           plot_div=div,
                           js_resources=js_resources,
                           css_resources=css_resources)
def classify(image):
    # Classify one handwritten-digit image with an SVM trained on the fly
    # (Python 2 module). When USEBW is set, both the training set and the
    # input are binarised via Otsu thresholding first.
    train_images_gray, train_labels = read_data.read(range(10), 'training')
    if USEBW:
        train_images_bw = convert_bw(train_images_gray)
        test = otsu.otsu(numpy.array(jtov.jtov(image)))
        clf = svm.SVC(kernel="poly", degree=1)
        # Train on the first 10k samples only to keep fitting fast.
        clf.fit(train_images_bw[:10000], train_labels[:10000])
        print clf.predict(test)
    else:
        test = numpy.array(jtov.jtov(image))
        clf = svm.SVC(kernel="poly", degree=2)
        clf.fit(train_images_gray[:10000], train_labels[:10000])
        print clf.predict(test)
def load_data(digits):
    # Write one-vs-rest training files in SVM-light format, one file per digit
    # (Python 2 module). Target digit samples get label 1, all others -1.
    images = []
    for i in xrange(len(digits)):
        images.append(read_data.read([digits[i]], 'training')[0])
    for i in xrange(len(digits)):
        target_images = images[i]
        rest_images = images[:i] + images[i + 1:]
        with open('svm_train' + str(digits[i]) + '.txt', 'w') as f:
            for image in target_images:
                f.write(to_svm_format(image, 1) + "\n")
            for num in rest_images:
                for image in num:
                    f.write(to_svm_format(image, -1) + "\n")
        print "Done " + str(digits[i])
def init(file):
    """Load the capture file and populate module globals.

    After this call: `packets` holds the numerically-encoded features (labels
    dropped), `pkt` a copy taken before IP encoding, and `label`/`attack_cat`
    the extracted target arrays.
    """
    global packets, pkt, label, attack_cat
    packets = rd.read(file)
    # Encode categorical columns in place.
    prep.proto_to_value(packets)
    prep.state_to_value(packets)
    prep.service_to_value(packets)
    pkt = packets.copy()  # snapshot before IPs are converted to numbers
    prep.ip_to_value(packets)
    label = packets['Label'].to_numpy()
    attack_cat = packets['attack_cat'].to_numpy()
    # Remove targets from the feature frame.
    del packets['Label']
    del packets['attack_cat']
def analyze_file(p, n, K, gamma_dict):
    """Tally burst-size multiplicities over K runs for (p, n) and store the
    resulting mean cluster size in gamma_dict[n][p]."""
    counts = {}
    for run in range(1, K + 1):
        burst_sizes, bonds = read(
            '../data/p{p}n{n}/run{i}p{p}n{n}.data'.format(i=run, p=p, n=n),
            force_small=True)
        for size in burst_sizes:
            counts[size] = counts.get(size, 0) + 1
        sys.stdout.write('\r ... completed file {i}'.format(i=run))
    sizes = list(counts.keys())
    multiplicities = list(counts.values())
    gamma_dict[n][p] = gamma_anal(sizes, multiplicities)
def continous_test(self):
    """Evaluate the Gaussian (continuous) naive-Bayes model on the test split
    and print the accuracy percentage."""
    """ read test data"""
    d, data_num = read_data.read("testing")
    eprint("test data_num: ", data_num)
    # data_num=100
    X = np.array([d(i)[1] for i in range(data_num)])
    Y = np.array([d(i)[0] for i in range(data_num)])
    correct = 0
    for i, ele in enumerate(X):
        P = (self.continous_predict(ele))
        if Y[i] == P:
            # print("OK!")
            correct += 1
    print("accuracy(%): ", 100 * correct / len(X))
    """ accuracy(%): 63.2 """
    pass
def discrete_test(self):
    """Evaluate the discrete naive-Bayes model on the test split and print
    the accuracy percentage (historically ~70.1%)."""
    d, data_num = read_data.read("testing")
    correct = 0
    # BUG FIX: the old messages said "start training!"/"train data_num" in a
    # *test* routine, and the result line misspelled "accuracy".
    eprint("start testing!")
    eprint("test data_num: ", data_num)
    # data_num=1000
    for i in range(data_num):
        t_label = d(i)[0]  # d(i) -> (label, image)
        t_img = d(i)[1]
        # predict() returns a sequence whose first element is the class.
        if t_label == self.predict(t_img)[0]:
            correct += 1
    print("accuracy(%):", 100 * correct / data_num)  # 70.1%
def init(file):
    """Load the capture file and populate module globals (same contract as the
    sibling init() variant).

    After this call: `packets` holds the numerically-encoded features (labels
    dropped), `pkt` a copy taken before IP encoding, and `label`/`attack_cat`
    the extracted target arrays.
    """
    """
    with open(file, newline='') as csvfile:
        global packets, pkt
        packets = pd.read_csv(csvfile)
        #print(type(packets))
    """
    # BUG FIX: `packets` and `pkt` were assigned locally and discarded when the
    # function returned; the sibling init() declares all four names global.
    global packets, pkt, label, attack_cat
    packets = rd.read(file)
    prep.proto_to_value(packets)
    prep.state_to_value(packets)
    prep.service_to_value(packets)
    pkt = packets.copy()  # snapshot before IPs are converted to numbers
    prep.ip_to_value(packets)
    label = packets['Label'].to_numpy()
    attack_cat = packets['attack_cat'].to_numpy()
    del packets['Label']
    del packets['attack_cat']
print('\033[92m' + text.replace('\n', '') + '\033[0m') # Save a review sentence to the dataset def save_sentence(destination, entry_id, text, polarity='', aspects=''): formatted_aspects = '%-20s' % ('[' + str(' '.join(word for word in aspects)) + ']') write_data( destination, entry_id + ' ' + polarity + formatted_aspects + SENTENCE_BEGIN_INDICATOR + ' ' + text) write_data(destination, '\n\n') products_data = read_data.read(DIR_read) current_time = datetime.now() import os if not os.path.exists(DIR_save): os.makedirs(DIR_save) for product_data in products_data: meta = product_data['meta'] data = product_data['data'] product_id = meta['ID']
    # Tail of the compression helper: keep the first point of each segment
    # plus the final endpoint, then return the de-duplicated list.
    noduplicate_points.append(presention[0])
    noduplicate_points.append(compressed_points[-1][1])
    return noduplicate_points


if __name__ == '__main__':
    # Read trajectory data.
    tradata = []
    # Directory paths for raw and compressed trajectories.
    raw_path = r"C:\Users\TJ\Desktop\Dataset\Geolife Trajectories 1.3\Data_4.0\9"
    new_path = r"C:\Users\TJ\Desktop\Dataset\Geolife Trajectories 1.3\OPERB\operb_data\9"
    file_list = os.listdir(raw_path)
    # Order files by the numeric part of the filename.
    file_list.sort(key=lambda x: x[10:-5])
    # Read each file's trajectory data.
    for i in range(len(file_list)):
        tradata.append(read(raw_path, file_list[i]))
    time_records = []
    compression_ratios = []
    error_bound = 10
    total_time = 0
    for j in range(len(tradata)):
        id = 0
        points = []
        for point in tradata[j]:
            p = Point(id, point[0], point[1], point[2])
            points.append(p)
            id = id + 1
        compressed_points = []
        # NOTE(review): time.clock() was removed in Python 3.8 — this script
        # needs Python <= 3.7, or a migration to time.perf_counter().
        start_time = time.clock()
        operb(error_bound)
        end_time = time.clock()
    # NOTE(review): tail of an enclosing function (roo/rco/... and their m*
    # counterparts are defined above this chunk). The c* series look like the
    # r* series centered on (subtracted by) or normalised by (divided by) the
    # corresponding m* aggregates — confirm against the full function.
    croo = roo - mroo
    crco = rco - mrco
    crcc_ = rcc_ - mrcc_
    croc_ = roc_ - mroc_
    crvp_ = rvp_ / mrvp_
    ctvl_ = tvl_ / mtvl_
    ind = ind[:, 3:]  # drop the first three indicator columns — TODO confirm meaning
    # Pack every series (raw and centered) back into the dict.
    d['rcc_'] = rcc_
    d['rcc'] = rcc
    d['rco'] = rco
    d['roc'] = roc
    d['roc_'] = roc_
    d['roo'] = roo
    d['ind'] = ind
    d['croo'] = croo
    d['crco'] = crco
    d['crcc_'] = crcc_
    d['croc_'] = croc_
    d['crvp_'] = crvp_
    d['ctvl_'] = ctvl_
    # return d
    pass


if __name__ == '__main__':
    d = read_data.read()
    print calc(d)
def test_reader(self):
    """The sample LAS file should parse into a lasio LASFile instance."""
    sample = "../../EAGE2018/Well-A_finished/HQLD_B_2C1_75-1_Well-A_ISF-BHC-MSFL-GR__COMPOSIT__1.LAS"
    parsed = read_data.read(sample)
    assert isinstance(parsed, lasio.las.LASFile)
    # Tail of a training function: plain SGD on `cost`, mini-batched epochs.
    entropy_cost = tf.train.GradientDescentOptimizer(0.0003).minimize(cost)
    batches = input_x.shape[0] // batch_size
    if input_x.shape[0] % batch_size != 0:
        batches += 1  # final partial batch
    with tf.Session() as sess:
        sess.run(tf.initialize_all_variables())  # pre-1.0 TensorFlow API
        for _ in range(ephocs):
            for batch in range(batches):
                start = batch * batch_size % input_x.shape[0]
                end = min(start + batch_size, input_x.shape[0])
                sess.run([entropy_cost],
                         feed_dict={x: input_x[start:end], y: input_y[start:end]})
            # Report the cost over the full dataset after each epoch.
            c = sess.run([cost], feed_dict={x: input_x, y: input_y})
            print(c)


data, label = read_data.read()
train(data, label)
def predict(df=None):
    """
    :param df: one or more HTTP requests as a DataFrame (None -> score the
        stored data/data.csv sample only)
    :return: result DataFrame; column 'pre' holds the prediction
        (1 = malicious, 0 = safe)
    """
    # Load the trained models.
    net = torch.load('model/LSTM_model.pkl')
    gs = joblib.load('model/gs.m')
    result = read_data.read('data/data.csv')
    # 'tmp' marks which rows to keep after the group features are computed:
    # when df is given, the stored sample only provides grouping context.
    if df is None:
        result['tmp'] = 0
    else:
        df = read_data.get_data(df)
        result['tmp'] = 1
        df['tmp'] = 0
        result = pd.concat([result, df], axis=0)
    result = cal_smilar.group_feature(result, 'user_agent')
    result = cal_smilar.group_feature(result, 'host')
    result = cal_smilar.group_feature(result, 'accept_language')
    result = cal_smilar.group_feature(result, 'accept_encoding')
    result = cal_smilar.group_feature(result, 'ip_dst')
    result = result[result['tmp'] == 0]
    del result['tmp']
    CPT = pd.read_csv('model/CPT.csv', sep='\a')
    train = pd.read_csv('data/test.csv')
    safe_host = pd.read_csv('data/host.csv', names=['host'])
    result['original_host'] = result['original_host'].apply(lambda x: get_host(x))
    # Rule-based filtering: whitelisted hosts are split off as safe up front.
    ans_host = result['original_host'].unique()
    safe_host = list(safe_host['host'].values)
    same_host = list(set(safe_host) & set(ans_host))
    result_safe = result[result['original_host'].isin(same_host)]
    result = result[~result['original_host'].isin(same_host)]
    print(len(result[result['y'] == 'malicious']))
    # Split remaining traffic by whether its /24 subnet was seen in training.
    test_ip_1 = train['ip_24'].unique()
    test_ip_2 = [ip for ip in result['ip_24'].unique() if ip not in test_ip_1]
    # result = result[(result['y'] == 'safe') | (result['y'] == 'malicious')]
    # CPT template distance for every remaining row.
    dis = []
    lens = len(result)
    with tqdm.tqdm(range(lens), 'calculate dis') as t:
        for i in t:
            # print(test_df.iloc[i,:])
            cpt, tmp = match.get_dis(pd.DataFrame(result.iloc[i, :]).T, CPT, gs)
            dis.append(tmp)
    result['dis'] = dis
    result_in_ip = result[result['ip_24'].isin(test_ip_1)]
    result = result[result['ip_24'].isin(test_ip_2)]
    # LSTM scores the hosts that are not covered by the templates.
    now_time = time.time()
    seq_length = 200
    pre = []
    for s in result['original_host']:
        val = tldextract.extract(s)
        strs = val.domain
        pre.append(net.predict(strs, seq_length))
    result['label'] = pre
    print("LSTM spent time {} s".format(time.time() - now_time))
    # Combine the three partitions: known-subnet rows use the distance rule,
    # whitelisted rows are safe, unknown-subnet rows use distance + LSTM label.
    result_in_ip['pre'] = result_in_ip['dis'].apply(lambda x: in_ip(x))
    result_safe['pre'] = 0
    if len(result) != 0:
        result['pre'] = result.apply(lambda x: not_in_ip(x['dis'], x['label']), axis=1)
    result = pd.concat([result, result_in_ip, result_safe])
    # (A large block of commented-out threshold diagnostics and confusion-matrix
    # scoring was removed here; see the test() function for the live version.)
    return result
# hello world 30 Jun 2020 import pandas as pd import read_data data = read_data.read() # print(data.df) # print(data.sname) # if __name__ == '__main__': # a = 1
def dice_coef_loss(y_true, y_pred):
    # Negated Dice coefficient so it can be minimised as a loss.
    return -dice_coef(y_true, y_pred)


smooth = 1.  # Dice smoothing term, read by dice_coef (defined elsewhere)

# Dataset / model selection knobs.
MASSACHUSETTS_PATH = "Massachusetts/"
TRAINING_SET = 1
MODEL_NAME = 'UNETV2'  # or 'UNET' or 'INCEPTION'
window = 28 * 8
path = MASSACHUSETTS_PATH + 'train/'
x_train, y_train = read_data.read(path, 110)
if 2 == TRAINING_SET:
    # Truncate to the smaller training set variant.
    index = 75 * 49
    x_train = x_train[0:index, :, :, :]
    y_train = y_train[0:index, :, :, :]
print("len train ", len(x_train))
path = MASSACHUSETTS_PATH + 'validation/'
x_valid, y_valid = read_data.read(path, 4)
print("len valid ", len(x_valid))
# Model dispatch continues beyond this chunk.
if 'UNET' == MODEL_NAME:
    model = unet.get_unet()
if 'INCEPTION' == MODEL_NAME:
import read_data
'''
Things to experiment with
-Use pure black/white based on thresholding http://en.wikipedia.org/wiki/Otsu%27s_Method
for SVMs:
-Kernels
-C
KNN:
-k
Decision Trees:
'''
# Python 2 script; svm/preprocessing/metrics are imported elsewhere in the file.
images, labels = read_data.read(range(10))
print "Read Raw Data"
sparse_images = preprocessing.scale(images)  # zero-mean / unit-variance scaling
# sparse_images = sparse.csr_matrix(sparse_images)
print "Made images sparse"
clf = svm.SVC(kernel='linear', cache_size=1000.)
print "initialized SVC"
# Fit on the first 1000 samples, evaluate on a disjoint 1000-sample slice.
clf.fit(sparse_images[:1000], labels[:1000])
print "Fit SVM"
guesses = clf.predict(sparse_images[10001:11001])
print metrics.classification_report(labels[10001:11001], guesses)
print guesses[:10]
print labels[10001:10011]
from read_data import read, show

# Read training and testing features/labels from the dataset.
features_labels, features_train = read(dataset="training")
labels_test, features_test = read(dataset="testing")

# Reshape features_train and features_test: sklearn's fit() requires a 2D array.
nsamples, nx, ny = features_train.shape
features_train = features_train.reshape((nsamples, nx * ny))
nsamples, nx, ny = features_test.shape
features_test = features_test.reshape((nsamples, nx * ny))

# Naive Bayes model
from sklearn.naive_bayes import GaussianNB
clf_nb = GaussianNB()
clf_nb.fit(features_train, features_labels)
# BUG FIX: predicted with undefined name `clf` (NameError) — use the fitted
# GaussianNB instance.
labels_pred_nb = clf_nb.predict(features_test)

# SVM
from sklearn.svm import SVC
clf_svm = SVC(kernel='rbf')
clf_svm.fit(features_train, features_labels)
# BUG FIX: same undefined-name problem — use the fitted SVC instance.
labels_pred_svm = clf_svm.predict(features_test)
def _process_data(self):
    """Validate the loaded config, read the input data, run the processing
    modules, and export tables + a config copy to Excel; shows a dialog and
    aborts on any missing config field."""
    # Read config file
    config = self._config
    if "input" not in config.keys():
        self._no_input_dialog()
        return
    if "output" not in config.keys():
        self._no_output_dialog()
        return
    fields = [
        'type',
        'modules',
        'lmax summary remove',
    ]
    for f in fields:
        if f not in config.keys():
            self._incomplete_config_dialog(f)
            return
    # Infer file type if config set to auto
    if config["type"] == "auto":
        file_type = infer_filetype.infer(config["input"][0])
        print("\nInferred file type: " + file_type)
    else:
        file_type = config["type"]
    # Read input data
    print("Reading data...")
    # Metadata row: frequency weighting first, then the requested percentiles.
    user_metadata = config["percentiles"].copy()
    user_metadata.insert(0, config["frequency weighting"])
    if 'columns' in config.keys():
        data, metadata = read(file_type, config["input"], user_metadata,
                              columns=config["columns"])
    else:
        data, metadata = read(file_type, config["input"], user_metadata)
    print("Data read successfully")
    # Run pre-processing modules
    data, _ = process_batch(data, config["modules"], metadata)
    # Generate output tables
    tables = outputs_ui.daily_table(data, metadata["Frequency Weighting"],
                                    config["lmax summary remove"],
                                    config["lmax summary override"])
    # Export to Excel (also export config duplicate)
    writer, config_out = outputs_ui.export_excel(data, metadata, tables, config)
    while True:
        try:
            # Export workbook
            writer.save()
            # Export config file
            with open(config_out, 'w') as file:
                file.write(dumps(config, indent=4))
            print("Export complete")
            startfile(config["output"][0] + ".xlsm")
            QCoreApplication.quit()
            break
        except PermissionError:
            # Workbook is open elsewhere (e.g. Excel): ask the user to retry or quit.
            action = self._file_open_dialog()
            # NOTE(review): 4194304 appears to be a Qt dialog button code — confirm.
            if action == 4194304:
                QCoreApplication.quit()
                break
# -*- coding: utf-8 -*- import read_data import GA import plots #made changes to this script to incorporate the curve, addition of plots after the print avg specificity is the change. [X, Y, features_no] = read_data.read() obj = GA.genetic_algo(population_no=1000, features=features_no, X=X, Y=Y, generations=1000, crossover_prob=.05, mutation_prob=.001, partition=0.70, cross_validation=False, fold_cv=1) sol = obj.build_solution2() #score=obj.build_solution() #obj.final_solution(score) precision = 0 recall = 0 f1score = 0 accuracy = 0 specificity = 0 for i in range(len(sol)): precision = precision + sol[i][0] recall = recall + sol[i][1] f1score = f1score + sol[i][2] accuracy = accuracy + sol[i][3]
#this file will serve to generate the initial model, and could be retooled #to serve as a model "updater", i.e. generating a new model any time #we gain access to a new labeled data set #It also saves the model to a file so that we don't have to regenerate #the model each time we want to use it to classify new data from read_data import read from split_data import split from sklearn import svm from joblib import dump from test_model import run_metrics, F1, read_model_from_file data = read("data_151.csv") data += read("data_130.csv") + read("data_53.csv") features_training, labels_training, features_testing, labels_testing = split( data) #create SVM model svm_model = svm.SVC(max_iter=10000, kernel='rbf') svm_model.fit(features_training, labels_training.ravel()) dump(svm_model, 'model.joblib') svm_model = read_model_from_file() TP, FP, TN, FN = run_metrics(svm_model, features_testing, labels_testing) f1 = F1(TP, FP, TN, FN) print( 'Coord_x|Coord_y|ISOS_z|ISOS_Size_x|ISOS_Size_y|COST_z|COST_Size_x|COST_Size_y ' ) #print(svm_model.coef_)
import read_data
import math
import cProfile
import pylab as pl
import otsu
import numpy
import Image
# See http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.cross_val_score.html
from sklearn import cross_validation
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier

# Read dataset (Python 2 script; all ten digit classes).
train_images_gray, train_labels = read_data.read(range(10), 'training')
test_images_gray, test_labels = read_data.read(range(10), 'testing')
print "Done reading data"


def generate_pngs():
    # Dump the first ten training images as inverted 28x28 grayscale PNGs
    # for the report.
    for x in xrange(10):
        test = train_images_gray[x]
        test2 = []
        for i in xrange(len(test)):
            test2.append(numpy.uint8(255 - test[i]))
        test2 = numpy.array(test2)
        test2.shape = (28, 28)
        print test2
        img = Image.fromarray(test2, 'L')
        img.save('report/' + str(x) + '.png')


def convert_bw(images):
# Author Taher Ahmadi import read_data import matplotlib.pyplot as plt from matplotlib import gridspec from sklearn import datasets, linear_model import numpy as np import random import methods data_set, labels = read_data.read('./data_set/Dataset1.csv', 8) # Split train and test data train_data = data_set[:400] train_labels = labels[:400] test_data = data_set[400:] test_labels = labels[400:] print('Train data size:', len(train_data)) print('Test data size:', len(test_data)) # a) Scatter plot each feature vs label fig1 = plt.figure('a') gs = gridspec.GridSpec(3, 3) counter = 0 for i in range(0, 3): for j in range(0, 3): counter += 1 if counter == 9: break ax_temp = fig1.add_subplot(gs[i, j])
def run_training():
    """Train for a number of steps."""
    # Get the sets of images and labels for training, validation, and
    # test on .
    filename = "adult.data.txt"
    vectors_data, labels_data = read_data.read(filename)
    print(vectors_data.shape)
    filename = "adult.test.txt"
    test_data, tlabels_data = read_data.read(filename)
    print(test_data.shape)
    # Tell TensorFlow that the model will be built into the default Graph.
    with tf.Graph().as_default() as data:
        # Generate placeholders for the images and labels.
        vectors_placeholder, labels_placeholder = placeholder_inputs(
            FLAGS.batch_size)
        # Build a Graph that computes predictions from the inference model.
        logits = mnist_tb.inference(vectors_placeholder, FLAGS.hidden1,
                                    FLAGS.hidden2)
        # Add to the Graph the Ops for loss calculation.
        loss = mnist_tb.loss(logits, labels_placeholder)
        #test_loss = mnist_tb.loss(test_logits, test_labels_placeholder)
        # Add to the Graph the Ops that calculate and apply gradients.
        train_op = mnist_tb.training(loss, FLAGS.learning_rate)
        # Add the Op to compare the logits to the labels during evaluation.
        eval_correct = mnist_tb.evaluation(logits, labels_placeholder)
        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()
        # Create a saver for writing training checkpoints.
        saver = tf.train.Saver()
        # Create a session for running Ops on the Graph.
        sess = tf.Session()
        # Run the Op to initialize the variables.
        init = tf.initialize_all_variables()
        sess.run(init)
        # Instantiate a SummaryWriter to output summaries and the Graph.
        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)
        # And then after everything is built, start the training loop.
        for step in xrange(FLAGS.max_steps):
            start_time = time.time()
            # Fill a feed dictionary with the actual set of images and labels
            # for this particular training step.
            feed_dict = fill_feed_dict(step, vectors_data, labels_data,
                                       vectors_placeholder, labels_placeholder,
                                       FLAGS.batch_size)
            # Run one step of the model.  The return values are the activations
            # from the `train_op` (which is discarded) and the `loss` Op.  To
            # inspect the values of your Ops or variables, you may include them
            # in the list passed to sess.run() and the value tensors will be
            # returned in the tuple from the call.
            _, loss_value = sess.run([train_op, loss], feed_dict=feed_dict)
            duration = time.time() - start_time
            # Write the summaries and print an overview fairly often.
            if step % 100 == 0:
                # Print status to stdout.
                print('Step %d: loss = %.2f (%.3f sec)' %
                      (step, loss_value, duration))
                # Update the events file.
                summary_str_train = sess.run(summary_op, feed_dict=feed_dict)
                summary_writer.add_summary(summary_str_train, step)
                saver.save(sess, FLAGS.train_dir, global_step=step)
            # Save a checkpoint and evaluate the model periodically.
            if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                saver.save(sess, FLAGS.train_dir, global_step=step)
                # Evaluate against the training set.
                print('Training Data Eval:')
                do_eval(sess, eval_correct, vectors_placeholder,
                        labels_placeholder, vectors_data, labels_data,
                        FLAGS.batch_size)
                # Evaluate against the validation set.
                # Evaluate against the test set.
                print('Test Data Eval:')
                do_eval(sess, eval_correct, vectors_placeholder,
                        labels_placeholder, test_data, tlabels_data,
                        FLAGS.batch_size)
def test():
    """Evaluate the CPT-distance + LSTM pipeline on held-out traffic and print
    threshold diagnostics plus confusion-matrix scores."""
    # Load the trained models.
    net = torch.load('model/LSTM_model.pkl')
    gs = joblib.load('model/gs.m')
    result = read_data.read('data/data.csv')
    result = cal_smilar.group_feature(result, 'user_agent')
    result = cal_smilar.group_feature(result, 'host')
    result = cal_smilar.group_feature(result, 'accept_language')
    result = cal_smilar.group_feature(result, 'accept_encoding')
    result = cal_smilar.group_feature(result, 'ip_dst')
    CPT = pd.read_csv('model/CPT.csv', sep='\a')
    valid = pd.read_csv('data/valid.csv')
    train = pd.read_csv('data/test.csv')
    safe_host = pd.read_csv('data/host.csv', names=['host'])
    result['original_host'] = result['original_host'].apply(lambda x: get_host(x))
    # Keep only labelled traffic from /24 subnets unseen in the validation split.
    test_ip_1 = valid['ip_24'].unique()
    test_ip_2 = [ip for ip in result['ip_24'].unique() if ip not in test_ip_1]
    result = result[result['ip_24'].isin(test_ip_2)]
    result = result[(result['y'] == 'safe') | (result['y'] == 'malicious')]
    # Drop whitelisted hosts outright.
    ans_host = result['original_host'].unique()
    safe_host = list(safe_host['host'].values)
    same_host = list(set(safe_host) & set(ans_host))
    result = result[~result['original_host'].isin(same_host)]
    print(len(result[result['y'] == 'malicious']))
    # CPT template distance for every remaining row.
    dis = []
    lens = len(result)
    with tqdm.tqdm(range(lens), 'calculate dis') as t:
        for i in t:
            # print(test_df.iloc[i,:])
            cpt, tmp = match.get_dis(pd.DataFrame(result.iloc[i, :]).T, CPT, gs)
            dis.append(tmp)
    result['dis'] = dis
    # LSTM scores the hosts not covered by the templates.
    now_time = time.time()
    seq_length = 200
    pre = []
    for s in result['original_host']:
        val = tldextract.extract(s)
        strs = val.domain
        pre.append(net.predict(strs, seq_length))
    result['label'] = pre
    print("LSTM spent time {} s".format(time.time() - now_time))
    result['pre'] = result.apply(lambda x: not_in_ip(x['dis'], x['label']), axis=1)
    # Threshold diagnostics at dis = x for both classes.
    x = 0.12
    print(len(result[(result['y'] == 'safe') | (result['y'] == 'malicious')]))
    print("safe dis > " + str(x) + " :" + str(len(result[(result['y'] == 'safe') & (result['dis'] > x)])))
    print("safe dis > " + str(x) + " label=1:" + str(
        len(result[(result['y'] == 'safe') & (result['dis'] > x) & (result['label'] == 1)])))
    print("safe dis <= " + str(x) + " :" + str(len(result[(result['y'] == 'safe') & (result['dis'] <= x)])))
    print("safe dis <= " + str(x) + " label=1:" + str(
        len(result[(result['y'] == 'safe') & (result['dis'] <= x) & (result['label'] == 1)])))
    print("malicious dis > " + str(x) + " :" + str(len(result[(result['y'] == 'malicious') & (result['dis'] > x)])))
    print("malicious dis > " + str(x) + " label=1:" + str(
        len(result[(result['y'] == 'malicious') & (result['dis'] > x) & (result['label'] == 1)])))
    print("malicious dis <= " + str(x) + " :" + str(len(result[(result['y'] == 'malicious') & (result['dis'] <= x)])))
    print("malicious dis <= " + str(x) + " label=1:" + str(
        len(result[(result['y'] == 'malicious') & (result['dis'] <= x) & (result['label'] == 1)])))
    # Confusion-matrix scores (precision/recall/F1/accuracy).
    TP = len(result[(result['pre'] == 1) & (result['y'] == 'malicious')])
    FN = len(result[(result['pre'] == 0) & (result['y'] == 'malicious')])
    FP = len(result[(result['pre'] == 1) & (result['y'] == 'safe')])
    TN = len(result[(result['pre'] == 0) & (result['y'] == 'safe')])
    P = TP / (TP + FP)
    R = TP / (TP + FN)
    F1 = 2 * TP / (2 * TP + FP + FN)
    auc = (TP + TN) / (TP + FN + FP + TN)
    print("精确率 = {} 召回率 = {} F1_Score = {} 准确率 = {}".format(P, R, F1, auc))
    print(TP, FN, FP, TN)
# Author Taher Ahmadi import read_data import numpy as np from matplotlib import gridspec import matplotlib.pyplot as plt from sklearn.linear_model import Lasso import pandas as pd data_set_1, label_1 = read_data.read('./data_set/Dataset2.csv', 6) # Split train and test data train_data_1 = data_set_1[:200] train_label_1 = label_1[:200] test_data_1 = data_set_1[200:] test_label_1 = label_1[200:] print('Train data size:', len(train_data_1)) print('Test data size:', len(test_data_1)) # Scatter plot each feature vs label fig = plt.figure() gs = gridspec.GridSpec(2, 3) counter = 0 for i in range(0, 2): for j in range(0, 3): counter += 1 ax_temp = fig.add_subplot(gs[i, j]) ax_temp.scatter(train_data_1.get(counter - 1), train_label_1) ax_temp.title.set_text(('Feature ' + str(counter))) plt.show()
def running(learning_rate, keep_prob, BATCH_SIZE, weight_decay):
    """Build the steering-angle regression graph and run the training loop,
    resuming from the './save/my-model-121000' checkpoint. Logs validation
    RMSE every 20 steps and checkpoints every 2000 steps."""
    # 140x320 RGB frames; y is the scalar steering angle per frame.
    x = tf.placeholder(tf.float32, [BATCH_SIZE, 140, 320, 3])
    y = tf.placeholder(tf.float32, [BATCH_SIZE])
    global_step = tf.Variable(0, trainable=False)
    ##### training queue inputs #####
    ## get input
    train_images, train_angles, valid_images, valid_angles = read_data.read(
        input_path, eval_path)
    num_train = len(train_angles)
    num_valid = len(valid_angles)
    train_per_epoch = int((num_train * 1.0) / BATCH_SIZE)
    valid_per_epoch = int((num_valid * 1.0) / BATCH_SIZE)
    # Decay the learning rate by 0.80 every 30000 steps.
    learning_rate_decay = tf.train.exponential_decay(learning_rate,
                                                     global_step, 30000, 0.80,
                                                     staircase=True)
    tf.scalar_summary('learning_rate', learning_rate_decay)
    ## pointer
    train_pointer = 0
    valid_pointer = 0
    ## inference build model
    prediction = pred_steer.inference(x, train_flag, drop_prob, wd)
    ## calculate loss
    loss = pred_steer.loss(prediction, y)
    ## build model per batch and update parameters
    train_op = pred_steer.train(loss, learning_rate_decay, global_step)
    ## build initialization peration
    init = tf.initialize_all_variables()
    ## merge all summaries and initialize writer
    #summary_op = tf.merge_all_summaries()
    #train_writer = tf.train.SummaryWriter("./tensorboard", graph = tf.get_default_graph())
    tf.scalar_summary('train_RMSE', tf.sqrt(loss))
    #tf.scalar_summary('l2_norm', l2)
    #tf.scalar_summary('train_pred', tf.reduce_mean(prediction))
    #tf.scalar_summary('eval_pred', tf.reduce_mean(eval_pred))
    #tf.scalar_summary('train_angle', tf.reduce_mean(angles))
    #tf.scalar_summary('eval_angle', tf.reduce_mean(tf.string_to_number(eval_angs, out_type = tf.float32)))
    sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
    merged = tf.merge_all_summaries()
    writer = tf.train.SummaryWriter("./tensor/", sess.graph)
    saver = tf.train.Saver()
    sess.run(init)
    epoch = 0
    ## start the queue runners
    #coord = tf.train.Coordinator()
    #enqueue_threads = tf.train.start_queue_runners(sess = sess, coord = coord)
    # Resume training from the saved checkpoint.
    saver.restore(sess, './save/my-model-121000')
    for step in range(1, 400000):
        start_time = time.time()
        images_array, angles_array = read_data.Train_Batch(
            train_images, train_angles, BATCH_SIZE)  #, train_pointer)
        _, summary = sess.run(
            [train_op, merged],
            feed_dict={
                x: images_array,
                y: angles_array,
                train_flag: True,
                drop_prob: keep_prob,
                wd: weight_decay
            })
        if step % 20 == 0:
            # Validation RMSE on one held-out batch (dropout disabled).
            #train_images_sub, train_angles_sub = read_data.Train_Batch(train_images, train_angles, BATCH_SIZE)
            eval_images_array, eval_angles_array = read_data.Valid_Batch(
                valid_images, valid_angles, BATCH_SIZE)  #, valid_pointer)
            #print("step: %d, eval_loss: %g"%(step, sess.run(loss, feed_dict = {
            #    x: eval_images_array, y:eval_angles_array, train_flag:False, drop_prob:1.0})))
            #train_out = sess.run(loss, feed_dict = {x: train_images_sub, y: train_angles_sub, train_flag:False, drop_prob:1.0, wd:0.0})
            out = sess.run(loss,
                           feed_dict={
                               x: eval_images_array,
                               y: eval_angles_array,
                               train_flag: False,
                               drop_prob: 1.0,
                               wd: 0.0
                           })
            print("step: " + str(step) + " loss: " + str(np.sqrt(out)))
        if step % 2000 == 0:
            # Periodic checkpoint.
            #checkpath = "./save/model.ckpt"
            filename = saver.save(sess, './save/my-model',
                                  global_step=global_step)
            #filename = saver.save(sess, checkpath)
            print("Model saved in file: %s" % filename)
        # _, summary = sess.run([train_op, summary_op])
        #train_writer.add_summary(summary, step)
        #duration = time.time() - start_time
        writer.add_summary(summary, step)
from datetime import datetime
from alexnet import AlexNet
from generator import Generator
import read_data as readData
import evaluation as eval
from sklearn import utils
from tqdm import *

# Hyperparameters (Python 2 script).
learning_rates = [0.0000001]
num_epochs = 10
batch_size = 450
path = 'data/dataset/'
print "####################### ConTagNet ############################"
print "loading the dataset: please wait for a while!"
X1_train, X2_train, Y_train = readData.read(path + 'train/')
X1_val, X2_val, Y_val = readData.read(path + 'validate/')
X1_test, X2_test, Y_test = readData.read(path + 'test/')
# NOTE(review): the X2_* arrays returned by read() are immediately overwritten
# by readCoTagNet() below — confirm the first assignments are intentional.
X2_train = readData.readCoTagNet(path + 'train/')
X2_val = readData.readCoTagNet(path + 'validate/')
X2_test = readData.readCoTagNet(path + 'test/')
X_train_generator = Generator(X1_train, X2_train, Y_train)
X_validate_generator = Generator(X1_val, X2_val, Y_val)
X_test_generator = Generator(X1_test, X2_test, Y_test)
# Number of whole batches per epoch for each split.
train_batches_per_epoch = np.floor(X_train_generator.num_samples /
                                   batch_size).astype(np.int16)
val_batches_per_epoch = np.floor(X_validate_generator.num_samples /
                                 batch_size).astype(np.int16)
    # Tail of an exp_bins-style helper: logarithmically spaced bin edges
    # (powers of `base`) spanning first_bin_edge .. max_num.
    first_bin_edge = 50
    min_exp = int(np.log(first_bin_edge) / np.log(base))
    max_exp = int(np.ceil(np.log(max_num) / np.log(base)))
    bins = []
    for i in range(min_exp, max_exp + 1):
        bins.append(base**i)
    return bins


# Fractal-dimension estimate: histogram bond radii over 10 runs, then fit a
# line to log(mass) vs log(radius).
fracdim = []
n_bins = 100
f_arr = np.zeros((n_bins,))
for i in range(1, 11):
    print(i)
    freq, bonds = read('../data/p0.49n10000/run' + str(i) + 'p0.49n10000.data')
    flatten = [bond for burst in bonds for bond in burst]
    radius = [r(*bond[0:2]) for bond in flatten]
    f, bins = np.histogram(radius, bins=n_bins)
    f_arr += f
    bins = bin_middles(bins)
M = 1 / 10000 * np.cumsum(f_arr)  # get mass as the integral of f_arr
logm = np.log(M)
logr = np.log(bins)
s = slice(0, -1)
logfit, logcov = opt.curve_fit(lin, logr[s], logm[s])  # filter end and start
plt.plot(logr, logm, 'o')
plt.plot(logr[s], [lin(x, *logfit) for x in logr[s]], '--')