def data_process(sql, normal, scale):
    with open(vec_dir, "rb") as f:
        word2vec = pickle.load(f)
        dictionary = word2vec["dictionary"]
        embeddings = word2vec["embeddings"]
        reverse_dictionary = word2vec["reverse_dictionary"]
    sql_data = []
    normal_data = []
    with open(sql, "r", encoding="utf-8") as f:
        reader = csv.DictReader((line.replace('\0', '') for line in f),
                                fieldnames=["payload"])
        for row in reader:
            payload = row["payload"]
            word = GeneSeg(payload)
            if len(word) <= maxlen:
                sql_data.append(word)
    with open(normal, "r", encoding="utf-8") as f:
        reader = csv.DictReader((line.replace('\0', '') for line in f),
                                fieldnames=["payload"])
        for row in reader:
            payload = row["payload"]
            word = GeneSeg(payload)
            if len(word) <= maxlen:
                normal_data.append(word)
    sql_num = len(sql_data)
    normal_num = len(normal_data)
    sql_labels = [1] * sql_num
    normal_labels = [0] * normal_num
    datas = sql_data + normal_data
    labels = sql_labels + normal_labels

    def to_index(data):
        d_index = []
        for word in data:
            if word in dictionary.keys():
                d_index.append(dictionary[word])
            else:
                d_index.append(dictionary["UNK"])
        return d_index

    datas_index = [to_index(data) for data in datas]
    datas_index = pad_sequences(datas_index, value=-1, maxlen=maxlen)
    rand = random.sample(range(len(datas_index)), len(datas_index))
    datas = [datas_index[index] for index in rand]
    labels = [labels[index] for index in rand]
    datas_embed = []
    dims = len(embeddings[0])
    for data in datas:
        data_embed = []
        for d in data:
            if d != -1:
                data_embed.extend(embeddings[d])
            else:
                data_embed.extend([0.0] * dims)
        datas_embed.append(data_embed)
    train_datas, test_datas, train_labels, test_labels = train_test_split(
        datas_embed, labels, test_size=scale)
    return train_datas, test_datas, train_labels, test_labels
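# --- Hedged usage sketch, not part of the original file ---
# data_process() depends on module-level names defined elsewhere in this code
# base: vec_dir, maxlen, GeneSeg, csv, pickle, random, pad_sequences (Keras)
# and train_test_split (scikit-learn). The CSV paths below are hypothetical
# placeholders for the real payload files.
if __name__ == "__main__":
    train_x, test_x, train_y, test_y = data_process(
        sql="./data/sql.csv",        # assumed SQL-injection payload file
        normal="./data/normal.csv",  # assumed benign payload file
        scale=0.3)                   # 30% of the shuffled samples go to the test set
    print("train samples:", len(train_x), "test samples:", len(test_x))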
def pre_process():
    with open(vec_dir, "rb") as f:
        word2vec = pickle.load(f)
        dictionary = word2vec["dictionary"]
        embeddings = word2vec["embeddings"]
        reverse_dictionary = word2vec["reverse_dictionary"]
    xssed_data = []
    normal_data = []
    with open("data\\xssed.csv", "r", encoding="utf-8") as f:
        reader = csv.DictReader(f, fieldnames=["payload"])
        for row in reader:
            payload = row["payload"]
            word = GeneSeg(payload)
            xssed_data.append(word)
    with open("data\\normal_examples2.csv", "r", encoding="utf-8") as f:
        reader = csv.DictReader(f, fieldnames=["payload"])
        for row in reader:
            payload = row["payload"]
            word = GeneSeg(payload)
            normal_data.append(word)
    xssed_num = len(xssed_data)
    normal_num = len(normal_data)
    xssed_labels = [1] * xssed_num
    normal_labels = [0] * normal_num
    datas = xssed_data + normal_data
    labels = xssed_labels + normal_labels

    def to_index(data):
        d_index = []
        for word in data:
            if word in dictionary.keys():
                d_index.append(dictionary[word])
            else:
                d_index.append(dictionary["UNK"])
        return d_index

    datas_index = [to_index(data) for data in datas]
    datas_index = pad_sequences(datas_index, value=-1, maxlen=maxlen)
    rand = random.sample(range(len(datas_index)), len(datas_index))
    datas = [datas_index[index] for index in rand]
    labels = [labels[index] for index in rand]
    datas_embed = []
    dims = len(embeddings["UNK"])
    n = 0
    for data in datas:
        data_embed = []
        for d in data:
            if d != -1:
                data_embed.extend(embeddings[reverse_dictionary[d]])
            else:
                data_embed.extend([0.0] * dims)
        datas_embed.append(data_embed)
        n += 1
        if n % 1000 == 0:
            print(n)
    train_datas, test_datas, train_labels, test_labels = train_test_split(
        datas_embed, labels, test_size=0.3)
    return train_datas, test_datas, train_labels, test_labels
def pre_process():
    with open(vec_dir, "rb") as f:
        word2vec = pickle.load(f)
        dictionary = word2vec["dictionary"]
        reverse_dictionary = word2vec["reverse_dictionary"]
        embeddings = word2vec["embeddings"]
    xssed_data = []
    normal_data = []
    with open("data\\xssed.csv", "r", encoding="utf-8") as f:
        reader = csv.DictReader(f, fieldnames=["payload"])
        for row in reader:
            payload = row["payload"]
            word = GeneSeg(payload)
            xssed_data.append(word)
    with open("data\\normal_examples.csv", "r", encoding="utf-8") as f:
        reader = csv.DictReader(f, fieldnames=["payload"])
        for row in reader:
            payload = row["payload"]
            word = GeneSeg(payload)
            normal_data.append(word)
    xssed_num = len(xssed_data)
    normal_num = len(normal_data)
    xssed_labels = [1] * xssed_num
    normal_labels = [0] * normal_num
    datas = xssed_data + normal_data
    labels = xssed_labels + normal_labels
    labels = to_categorical(labels)

    def to_index(data):
        d_index = []
        for word in data:
            if word in dictionary.keys():
                d_index.append(dictionary[word])
            else:
                d_index.append(dictionary["UNK"])
        return d_index

    datas_index = [to_index(data) for data in datas]
    datas_index = pad_sequences(datas_index, value=-1)
    rand = random.sample(range(len(datas_index)), len(datas_index))
    datas = [datas_index[index] for index in rand]
    labels = [labels[index] for index in rand]
    train_datas, test_datas, train_labels, test_labels = train_test_split(
        datas, labels, test_size=0.3)
    process_datas = word2vec
    process_datas["train_datas"] = train_datas
    process_datas["test_datas"] = test_datas
    process_datas["train_labels"] = train_labels
    process_datas["test_labels"] = test_labels
    with open(process_datas_dir, "wb") as f:
        pickle.dump(process_datas, f)
    print("Preprocessing data over!")
    print("Saved datas and labels to ", process_datas_dir)
skip_window = 5
num_sampled = 64
num_iter = 5
plot_only = 100
log_dir = "word2vec.log"
plt_dir = "./file/word2vec.png"
vec_dir = "./file/word2vec.pickle"

start = time.time()
words = []
datas = []
with open("./data/sql.csv", "r", encoding='UTF-8') as f:
    reader = f.readlines()
    for row in reader:
        #print row
        word = GeneSeg(row)
        datas.append(word)
        words += word
#print datas


# build the dataset
def build_dataset(datas, words):
    count = [["UNK", -1]]
    counter = Counter(words)
    count.extend(counter.most_common(vocabulary_size - 1))
    vocabulary = [c[0] for c in count]
    data_set = []
    for data in datas:
        d_set = []
        for word in data:
def pre_process():
    with open(vec_dir, "rb") as f:
        word2vec = pickle.load(f)
        dictionary = word2vec["dictionary"]
        reverse_dictionary = word2vec["reverse_dictionary"]
        embeddings = word2vec["embeddings"]
    xssed_data = []
    normal_data = []
    with open("data\\xssed.csv", "r", encoding="utf-8") as f:
        reader = csv.DictReader(f, fieldnames=["payload"])
        for row in reader:
            payload = row["payload"]
            word = GeneSeg(payload)
            xssed_data.append(word)
    with open("data\\normal_examples.csv", "r", encoding="utf-8") as f:
        reader = csv.DictReader(f, fieldnames=["payload"])
        for row in reader:
            payload = row["payload"]
            word = GeneSeg(payload)
            normal_data.append(word)
    xssed_num = len(xssed_data)
    normal_num = len(normal_data)
    xssed_labels = [1] * xssed_num
    normal_labels = [0] * normal_num
    datas = xssed_data + normal_data
    labels = xssed_labels + normal_labels
    labels = to_categorical(labels)

    def to_index(data):
        d_index = []
        for word in data:
            if word in dictionary.keys():
                d_index.append(dictionary[word])
            else:
                d_index.append(dictionary["UNK"])
        return d_index

    datas_index = [to_index(data) for data in datas]
    datas_index = pad_sequences(datas_index, value=-1)
    rand = random.sample(range(len(datas_index)), len(datas_index))
    datas = [datas_index[index] for index in rand]
    labels = [labels[index] for index in rand]
    train_datas, test_datas, train_labels, test_labels = train_test_split(
        datas, labels, test_size=0.3)
    train_size = len(train_labels)
    test_size = len(test_labels)
    input_num = len(train_datas[0])
    dims_num = embeddings["UNK"].shape[0]
    word2vec["train_size"] = train_size
    word2vec["test_size"] = test_size
    word2vec["input_num"] = input_num
    word2vec["dims_num"] = dims_num
    with open(vec_dir, "wb") as f:
        pickle.dump(word2vec, f)
    print("Saved word2vec to:", vec_dir)
    print("Write train datas to:", pre_datas_train)
    with open(pre_datas_train, "w") as f:
        for i in range(train_size):
            data_line = str(train_datas[i].tolist()) + "|" + str(
                train_labels[i].tolist()) + "\n"
            f.write(data_line)
    print("Write test datas to:", pre_datas_test)
    with open(pre_datas_test, "w") as f:
        for i in range(test_size):
            data_line = str(test_datas[i].tolist()) + "|" + str(
                test_labels[i].tolist()) + "\n"
            f.write(data_line)
    print("Write datas over!")
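# --- Hedged sketch of reading back the files written above (not in the original) ---
# Each line in pre_datas_train / pre_datas_test is "<index list>|<one-hot label>",
# e.g. "[-1, -1, 5, 12]|[1.0, 0.0]". ast.literal_eval turns both halves back into
# Python lists. load_pre_datas is a hypothetical helper name; the path variables
# are the same ones used above.
import ast

def load_pre_datas(path):
    xs, ys = [], []
    with open(path, "r") as f:
        for line in f:
            data_part, label_part = line.strip().split("|")
            xs.append(ast.literal_eval(data_part))
            ys.append(ast.literal_eval(label_part))
    return xs, ys

# Example (assuming pre_datas_train is defined as in the surrounding module):
# train_x, train_y = load_pre_datas(pre_datas_train)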
model_1 = load_model(model_dir1)
model_2 = load_model(model_dir2)
with open(vec_dir, "rb") as f:
    # print f.readlines()
    word2vec = pickle.load(f)
    # print type(word2vec)
    # print len(word2vec)
    # print word2vec
    dictionary = word2vec["dictionary"]
    reverse_dictionary = word2vec["reverse_dictionary"]
    embeddings = word2vec["embeddings"]
test_data = input()
data_test = []
data = GeneSeg(test_data)
print(u"split words result:" + str(data))
for word in data:
    # print word
    if word in dictionary.keys():
        data_test.append(dictionary[word])
    else:
        data_test.append(dictionary["UNK"])
#data_test=pad_sequences(data_test,value=-1)
#plot_model(model,to_file='./model.png',show_shapes=True,show_layer_names=True)
data_test2 = []
for i in range(0, 591 - len(data_test)):
    data_test2.append(-1)
data_test2 += data_test
data_embed = []
for d in data_test2:
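# --- Hedged sketch of the truncated embedding loop above (assumption, not the
# original code). Following the pattern used in the other pre-processing
# snippets: a padding index of -1 becomes a zero vector, any other index is
# looked up in the embedding table. The exact continuation in the original
# file is not shown, so treat this only as an illustration; whether embeddings
# is keyed by word (via reverse_dictionary) or by index must match the pickle.
#
# dims = len(embeddings[reverse_dictionary[0]])
# for d in data_test2:
#     if d != -1:
#         data_embed.extend(embeddings[reverse_dictionary[d]])
#     else:
#         data_embed.extend([0.0] * dims)
# result_1 = model_1.predict(np.array([data_embed]))
# result_2 = model_2.predict(np.array([data_embed]))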
skip_window = 5
num_sampled = 64
num_iter = 5
plot_only = 100
log_dir = "word2vec.log"
plt_dir = "file\\word2vec.png"
vec_dir = "file\\word2vec.pickle"

start = time.time()
words = []
datas = []
with open("data\\xssed.csv", "r", encoding="utf-8") as f:
    reader = csv.DictReader(f, fieldnames=["payload"])
    for row in reader:
        payload = row["payload"]
        word = GeneSeg(payload)
        datas.append(word)
        words += word


# build the dataset
def build_dataset(datas, words):
    count = [["UNK", -1]]
    counter = Counter(words)
    count.extend(counter.most_common(vocabulary_size - 1))
    # most_common lists the n most common elements and their counts, from the
    # most common to the least; if n is None, all element counts are listed.
    vocabulary = [c[0] for c in count]  # take the first element of every pair
    data_set = []
    for data in datas:
top_k = 8
num_sampled = 64
num_steps = 101
plot_only = 100
log_dir = "word2vec.log"
plt_dir = "./file/word2vec.png"
vec_dir = "./file/word2vec.pickle"

start = time.time()
words = []
print(os.listdir("./"))
with open("./data/sql.csv", "r") as f:
    reader = csv.DictReader(f, fieldnames=["payload"])
    for row in reader:
        payload = row["payload"]
        word = GeneSeg(unquote(payload))
        words += word
print("words size:", len(words))


# build the dataset
def build_dataset(words):
    count = [["UNK", -1]]
    counter = Counter(words)
    #print counter.most_common(vocabulary_size-1)
    count.extend(counter.most_common(vocabulary_size - 1))
    dictionary = {}
    for word, _ in count:
        dictionary[word] = len(dictionary)
        print(dictionary[word], word)
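# --- Hedged illustration (not in the original): what the dictionary built
# above looks like on a toy word list, assuming vocabulary_size >= 4.
# "UNK" always gets index 0; the remaining indices follow frequency order.
from collections import Counter

toy_words = ["select", "from", "select", "union", "select", "from"]
toy_count = [["UNK", -1]]
toy_count.extend(Counter(toy_words).most_common(3))
toy_dictionary = {word: idx for idx, (word, _) in enumerate(toy_count)}
print(toy_dictionary)  # {'UNK': 0, 'select': 1, 'from': 2, 'union': 3}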
def pre_process():
    with open(vec_dir, "rb") as f:
        #print(f.readlines())
        word2vec = pickle.load(f)
        print(type(word2vec))  #<class 'dict'>
        print(len(word2vec))  #7
        print(word2vec)
        dictionary = word2vec["dictionary"]
        reverse_dictionary = word2vec["reverse_dictionary"]
        embeddings = word2vec["embeddings"]
    xssed_data = []
    normal_data = []
    with open("data\\xssedtiny.csv", "r", encoding="utf-8") as f:
        reader = csv.DictReader(f, fieldnames=["payload"])
        for row in reader:
            payload = row["payload"]
            word = GeneSeg(payload)
            xssed_data.append(word)
    with open("data\\normal_less.csv", "r", encoding="utf-8") as f:
        reader = csv.DictReader(f, fieldnames=["payload"])
        for row in reader:
            payload = row["payload"]
            word = GeneSeg(payload)
            normal_data.append(word)
    xssed_num = len(xssed_data)
    normal_num = len(normal_data)
    xssed_labels = [1] * xssed_num  # labels
    normal_labels = [0] * normal_num
    datas = xssed_data + normal_data
    labels = xssed_labels + normal_labels
    labels = to_categorical(
        labels)  # converts a class vector (integers) to a binary class matrix

    def to_index(data):  # look up each word in the word2vec dictionary
        d_index = []
        for word in data:
            #print(word)
            if word in dictionary.keys():
                d_index.append(dictionary[word])
            else:
                d_index.append(dictionary["UNK"])
        #print(d_index)
        return d_index

    datas_index = [to_index(data) for data in datas[0:]]
    for i in datas_index:
        if len(i) > 100:
            print(1, len(i))
    datas_index = pad_sequences(datas_index, value=-1)
    #print(datas_index)
    rand = random.sample(range(len(datas_index)), len(datas_index))  # shuffle by resampling
    datas = [datas_index[index] for index in rand]
    labels = [labels[index] for index in rand]
    train_datas, test_datas, train_labels, test_labels = train_test_split(
        datas, labels, test_size=0.3)
    #print('train data:',train_datas)
    train_size = len(train_labels)
    #print(train_size)#15381
    test_size = len(test_labels)
    #print(test_size)#6593
    input_num = len(train_datas[0])
    #print(input_num) #258
    dims_num = embeddings["UNK"].shape[0]  # embedding dimensionality
    #print(dims_num) #128
    word2vec["train_size"] = train_size  #15381
    word2vec["test_size"] = test_size  #6593
    word2vec["input_num"] = input_num  #258
    word2vec["dims_num"] = dims_num  #128
    with open(vec_dir, "wb") as f:
        pickle.dump(word2vec, f)
    print("Saved word2vec to:", vec_dir)
    print("Write train datas to:", pre_datas_train)
    with open(pre_datas_train, "w") as f:
        for i in range(train_size):
            data_line = str(train_datas[i].tolist()) + "|" + str(
                train_labels[i].tolist()) + "\n"
            f.write(data_line)
    print("Write test datas to:", pre_datas_test)
    with open(pre_datas_test, "w") as f:
        for i in range(test_size):
            data_line = str(test_datas[i].tolist()) + "|" + str(
                test_labels[i].tolist()) + "\n"
            f.write(data_line)
    print("Write datas over!")
def pre_process():
    with open(vec_dir, "rb") as f:
        #print f.readlines()
        word2vec = pickle.load(f)
        #print type(word2vec)
        #print len(word2vec)
        #print word2vec
        dictionary = word2vec["dictionary"]
        reverse_dictionary = word2vec["reverse_dictionary"]
        embeddings = word2vec["embeddings"]
    sql_data = []
    normal_data = []
    with open("./data/sql.csv", "r", encoding='UTF-8') as f:
        reader = f.readlines()
        for row in reader:
            word = GeneSeg(row)
            sql_data.append(word)
    with open("./data/normal_less.csv", "r", encoding='UTF-8') as f:
        reader = f.readlines()
        for row in reader:
            word = GeneSeg(row)
            normal_data.append(word)
    sql_num = len(sql_data)
    normal_num = len(normal_data)
    #print sql_num,normal_num
    sql_labels = [1] * sql_num
    normal_labels = [0] * normal_num
    data = sql_data + normal_data
    labels = sql_labels + normal_labels
    labels = to_categorical(labels)

    def to_index(data):  # look up each word in the word2vec dictionary
        d_index = []
        for word in data:
            #print word
            if word in dictionary.keys():
                d_index.append(dictionary[word])
            else:
                d_index.append(dictionary["UNK"])
        #print d_index
        return d_index

    data_index = [to_index(d) for d in data[0:]]
    data_index = pad_sequences(data_index, value=-1)  # pad the variable-length sequences
    rand = random.sample(range(len(data_index)), len(data_index))  # shuffle by resampling
    data = [data_index[index] for index in rand]
    labels = [labels[index] for index in rand]
    train_data, test_data, train_labels, test_labels = train_test_split(
        data, labels, test_size=0.1)
    #print 'train data:',train_datas
    train_size = len(train_labels)
    test_size = len(test_labels)
    input_num = len(train_data[0])
    #print embeddings
    dims_num = embeddings["UNK"].shape[0]  # embedding dimensionality
    word2vec["train_size"] = train_size
    word2vec["test_size"] = test_size
    word2vec["input_num"] = input_num
    word2vec["dims_num"] = dims_num
    with open(vec_dir, "wb") as f:
        pickle.dump(word2vec, f)
    print("Saved word2vec to:", vec_dir)
    print("Write train data to:", data_train)
    with open(data_train, "w") as f:
        for i in range(train_size):
            data_line = str(train_data[i].tolist()) + "|" + str(
                train_labels[i].tolist()) + "\n"
            f.write(data_line)
    print("Write test data to:", data_test)
    with open(data_test, "w") as f:
        for i in range(test_size):
            data_line = str(test_data[i].tolist()) + "|" + str(
                test_labels[i].tolist()) + "\n"
            f.write(data_line)
    print("Write data over!")
def pre_process():
    with open(vec_dir, "rb") as f:
        word2vec = pickle.load(f)
        dictionary = word2vec["dictionary"]
        reverse_dictionary = word2vec["reverse_dictionary"]
        embeddings = word2vec["embeddings"]
    xssed_data = []
    normal_data = []
    with open("data/xssed.csv", "r", encoding="utf-8") as f:
        reader = csv.DictReader(f, fieldnames=["payload"])
        for row in reader:
            payload = row["payload"]
            word = GeneSeg(payload)
            xssed_data.append(word)
    with open("data/normal_examples.csv", "r", encoding="utf-8") as f:
        reader = csv.DictReader(f, fieldnames=["payload"])
        for row in reader:
            payload = row["payload"]
            word = GeneSeg(payload)
            normal_data.append(word)
    xssed_num = len(xssed_data)
    normal_num = len(normal_data)
    xssed_labels = [1] * xssed_num
    normal_labels = [0] * normal_num
    datas = xssed_data + normal_data
    labels = xssed_labels + normal_labels
    labels = to_categorical(labels)

    def to_index(data):
        d_index = []
        for word in data:
            if word in dictionary.keys():
                d_index.append(dictionary[word])
            else:
                d_index.append(dictionary["UNK"])
        return d_index

    datas_index = [to_index(data) for data in datas]
    datas_index = pad_sequences(datas_index, value=-1)
    rand = random.sample(range(len(datas_index)), len(datas_index))
    datas = [datas_index[index] for index in rand]
    labels = [labels[index] for index in rand]
    if sys.argv[1] == 'kfold':
        # random_state only has an effect when shuffle=True, so it is omitted here
        folder = KFold(n_splits=10, shuffle=False)
        number = 0
        for train, test in folder.split(datas, labels):
            train = np.random.choice(train,
                                     size=len(train) // int(sys.argv[3]),
                                     replace=False)
            test = np.random.choice(test,
                                    size=len(test) // int(sys.argv[3]),
                                    replace=False)
            train_datas = [datas[i] for i in train]
            test_datas = [datas[i] for i in test]
            train_labels = [labels[i] for i in train]
            test_labels = [labels[i] for i in test]
            train_size = len(train_labels)
            test_size = len(test_labels)
            input_num = len(train_datas[0])
            dims_num = embeddings["UNK"].shape[0]
            word2vec["train_size"] = train_size
            word2vec["test_size"] = test_size
            word2vec["input_num"] = input_num
            word2vec["dims_num"] = dims_num
            with open(pre_datas_trains[number], "w") as f:
                for i in range(train_size):
                    data_line = str(train_datas[i].tolist()) + "|" + str(
                        train_labels[i].tolist()) + "\n"
                    f.write(data_line)
            with open(pre_datas_tests[number], "w") as f:
                for i in range(test_size):
                    data_line = str(test_datas[i].tolist()) + "|" + str(
                        test_labels[i].tolist()) + "\n"
                    f.write(data_line)
            number = number + 1
    else:
        train_datas, test_datas, train_labels, test_labels = train_test_split(
            datas, labels, test_size=float(sys.argv[2]))
        train_size = len(train_labels)
        test_size = len(test_labels)
        input_num = len(train_datas[0])
        dims_num = embeddings["UNK"].shape[0]
        word2vec["train_size"] = train_size
        word2vec["test_size"] = test_size
        word2vec["input_num"] = input_num
        word2vec["dims_num"] = dims_num
        with open(pre_datas_trains[0], "w") as f:
            for i in range(train_size):
                data_line = str(train_datas[i].tolist()) + "|" + str(
                    train_labels[i].tolist()) + "\n"
                f.write(data_line)
        with open(pre_datas_tests[0], "w") as f:
            for i in range(test_size):
                data_line = str(test_datas[i].tolist()) + "|" + str(
                    test_labels[i].tolist()) + "\n"
                f.write(data_line)
    with open(vec_dir, "wb") as f:
        pickle.dump(word2vec, f)
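# --- Hedged usage note (not in the original): pre_process() reads its mode
# from sys.argv. The script name "pre_process.py" below is a hypothetical
# placeholder; the real entry point may differ.
#
#   python pre_process.py kfold <unused> 2    # 10-fold split, keep 1/2 of each fold
#   python pre_process.py split 0.3           # plain train/test split, 30% test
#
# In the 'kfold' branch only sys.argv[3] is consulted (a down-sampling divisor);
# in the else branch only sys.argv[2] is consulted (the test_size fraction).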
def pre_process():
    with open(vec_dir, "rb") as f:
        #print(f.readlines())
        word2vec = pickle.load(f)
        #print(type(word2vec))#<class 'dict'>
        #print(len(word2vec))#7
        #print(word2vec)
        dictionary = word2vec["dictionary"]
        reverse_dictionary = dict(word2vec["reverse_dictionary"])
        embeddings = word2vec["embeddings"]
    sql_data = []
    normal_data = []
    with open("data\\sql-original-noshort-norepeat-sqlfilter.txt",
              "r",
              encoding="utf-8") as f:
        reader = csv.DictReader((line.replace('\0', '') for line in f),
                                fieldnames=["payload"])
        for row in reader:
            payload = row["payload"]
            word = GeneSeg(payload)
            #print(len(word))
            if len(word) <= 400:
                sql_data.append(word)
    with open("data\\normal-original-noshort-norepeat-sqlfilter.txt",
              "r",
              encoding="utf-8") as f:
        reader = csv.DictReader((line.replace('\0', '') for line in f),
                                fieldnames=["payload"])
        for row in reader:
            payload = row["payload"]
            word = GeneSeg(payload)
            #print(len(word))
            if len(word) <= 400:
                normal_data.append(word)
    sql_num = len(sql_data)
    normal_num = len(normal_data)
    sql_labels = [1] * sql_num  # labels
    normal_labels = [0] * normal_num
    datas = sql_data + normal_data
    labels = sql_labels + normal_labels
    labels = to_categorical(
        labels)  # converts a class vector (integers) to a binary class matrix

    def to_index(data):  # look up each word in the word2vec dictionary
        d_index = []
        for word in data:
            #print(word)
            if word in dictionary.keys():
                d_index.append(dictionary[word])
            else:
                d_index.append(dictionary["UNK"])
        #print(d_index)
        return d_index

    datas_index = [to_index(data) for data in datas[0:]]
    #for i in datas_index:
    #    if len(i) > 100:
    #        print(1, len(i))
    # pad every sequence to the length of the longest one
    datas_index = pad_sequences(datas_index, value=-1)
    #print(datas_index)
    rand = random.sample(range(len(datas_index)), len(datas_index))  # shuffle by resampling
    datas = [datas_index[index] for index in rand]
    labels = [labels[index] for index in rand]
    train_datas, test_datas, train_labels, test_labels = train_test_split(
        datas, labels, test_size=0.1)
    #print('train data:',train_datas)
    train_size = len(train_labels)
    #print(train_size)#15381
    test_size = len(test_labels)
    #print(test_size)#6593
    input_num = len(train_datas[0])
    #print(input_num) #189
    dims_num = len(embeddings[0])
    #print(dims_num) #128
    word2vec["train_size"] = train_size
    word2vec["test_size"] = test_size
    word2vec["input_num"] = input_num
    word2vec["dims_num"] = dims_num
    with open(vec_dir, "wb") as f:
        pickle.dump(word2vec, f)
    print("Saved word2vec to:", vec_dir)
    print("Write train datas to:", pre_datas_train)
    with open(pre_datas_train, "w") as f:
        for i in range(train_size):
            data_line = str(train_datas[i].tolist()) + "|" + str(
                train_labels[i].tolist()) + "\n"
            f.write(data_line)
    print("Write test datas to:", pre_datas_test)
    with open(pre_datas_test, "w") as f:
        for i in range(test_size):
            data_line = str(test_datas[i].tolist()) + "|" + str(
                test_labels[i].tolist()) + "\n"
            f.write(data_line)
    print("Write datas over!")