def check(model, w_model, data):
    """Classify one request string as XSS (1) or benign (0).

    Args:
        model: trained Keras classifier exposing ``predict_classes``.
        w_model: word -> embedding-vector mapping (e.g. gensim KeyedVectors).
        data: raw URL-encoded request string, or ``None``.

    Returns:
        A length-1 prediction (as returned by ``predict_classes``), or the
        plain list ``[0]`` when the input is ``None``, blank, or contains
        no word known to ``w_model``.
    """
    # Guard clauses: missing or blank input is treated as benign.
    if data is None or not data.strip():
        return [0]

    # The padding length used at training time is persisted in this file.
    # Mirrors the original behaviour of keeping the *last* line's value.
    input_shape = 0
    with open('file/INPUT_SHAPE', 'r') as f:
        for line in f:
            input_shape = int(line)

    # Embed every token the word model knows; unknown tokens are skipped.
    xx = []
    for text in URLDECODE(data):
        try:
            xx.append(w_model[text])
        except KeyError:
            continue
    xx = np.array(xx, dtype='float')
    if not len(xx):
        return [0]

    # Batch of one sequence, padded/truncated to the training-time length.
    x = np.array([xx])
    x = keras.preprocessing.sequence.pad_sequences(
        x, dtype='float32', maxlen=input_shape)
    # NOTE(review): Sequential.predict_classes was removed in TF 2.6+;
    # on newer Keras use np.argmax(model.predict(x), axis=-1) instead.
    return model.predict_classes(x, batch_size=len(x))
def __iter__(self):
    """Yield one float vector sequence per non-blank line of ``self.filename``.

    Side effects: ``self.f_len`` is incremented once per yielded line, and
    ``self.max_len`` tracks the longest sequence seen so far.
    """
    for raw in open(self.filename, encoding="utf-8"):
        if not raw.strip():  # skip blank lines
            continue
        self.f_len += 1
        vectors = []
        for token in URLDECODE(raw):
            try:
                vectors.append(self.model[token])
            except KeyError:
                pass  # token absent from the embedding vocabulary
        seq = np.array(vectors, dtype='float')
        self.max_len = max(self.max_len, len(seq))
        yield seq
from utils import URLDECODE

# Smoke test: decode a sample URL-encoded query string and show its tokens.
sample = 'cc%3D-%26ct%3D-%26java%3D1%26lang%3D-%26pf%3D-%26scl%3D-%26scr%3D-%26tt%3D-%26tz%3D-8%26vs%3D3.3%26dm%3Dappbase.qzone.qq.com%26url%3D/qz/358/normal%26rdm%3Dqzs.qq.com%26rurl%3D/open/fusion/api_v115.htm%26flash%3D20.0%26pgv_pvid%3D7725784408%26sds%3D0.9023870290257037'
print(URLDECODE(sample))
# --- Word2Vec training hyper-parameters -------------------------------------
embedding_size = 128   # dimensionality of the learned word vectors
num_skips = 4
skip_window = 5        # context window size
num_sampled = 64       # negative samples per positive example
num_iter = 5           # training epochs
plot_only = 100        # number of words to visualize
plt_dir = "file\\word2vec.png"
start = time.time()

words = []   # flat token stream (for corpus-wide statistics)
datas = []   # one token list per payload: the Word2Vec training corpus
with open("data\\xssed.csv", "r", encoding="utf-8") as file:
    reader = csv.DictReader(file, fieldnames=["payload"])
    for row in reader:
        payload = row["payload"]
        word = URLDECODE(payload)
        datas.append(word)
        words += word

# BUG FIX: the corpus was never passed to Word2Vec, so no vocabulary was
# built and ``model.wv`` stayed empty. Train on the decoded payloads.
# (Parameter names ``size``/``iter`` follow the gensim 3.x API already
# used here; gensim 4.x renames them to vector_size/epochs.)
model = Word2Vec(datas, size=embedding_size, window=skip_window,
                 negative=num_sampled, iter=num_iter)
embeddings = model.wv


def plot_with_labels(low_dim_embs, labels, filename=plt_dir):
    """Scatter-plot 2-D embeddings, one point per label.

    NOTE(review): only the scatter points are drawn in the visible code —
    no annotation or savefig appears here; confirm the remainder of this
    routine exists elsewhere in the file.
    """
    plt.figure(figsize=(10, 10))
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
def __iter__(self):
    """Stream tokenized, URL-decoded lines from every file in ``self.dirname``.

    Blank lines are skipped; each non-blank line yields the token list
    produced by ``URLDECODE`` (the raw, unstripped line is decoded).
    """
    for name in os.listdir(self.dirname):
        path = os.path.join(self.dirname, name)
        for raw in open(path, encoding="utf-8"):
            stripped = raw.strip()
            if stripped:
                yield URLDECODE(raw)
def pre_process():
    """Convert raw payload CSVs into padded index sequences and persist splits.

    Loads the word2vec artefacts from ``vec_dir``, maps every xssed / normal
    payload to a list of dictionary indices (unknown words map to "UNK"),
    pads all sequences, shuffles, splits 70/30 into train/test, records
    sizing metadata back into ``vec_dir``, and writes "<data>|<label>" lines
    to ``pre_datas_train`` / ``pre_datas_test``.
    """
    with open(vec_dir, "rb") as f:
        word2vec = pickle.load(f)
    dictionary = word2vec["dictionary"]
    reverse_dictionary = word2vec["reverse_dictionary"]  # kept for parity; unused here
    embeddings = word2vec["embeddings"]

    xssed_data = []
    normal_data = []
    with open("data\\xssed.csv", "r", encoding="utf-8") as f:
        reader = csv.DictReader(f, fieldnames=["payload"])
        for row in reader:
            xssed_data.append(URLDECODE(row["payload"]))
    with open("data\\normal_examples.csv", "r", encoding="utf-8") as f:
        # BUG FIX: a stray ``csv.reader(f)`` was created here and immediately
        # shadowed by the DictReader below; the dead statement is removed.
        reader = csv.DictReader(f, fieldnames=["payload"])
        for row in reader:
            normal_data.append(URLDECODE(row["payload"]))

    # Label 1 = xssed, 0 = normal; one-hot encode for the classifier.
    labels = to_categorical([1] * len(xssed_data) + [0] * len(normal_data))
    datas = xssed_data + normal_data

    def to_index(data):
        # Map each word to its dictionary index, falling back to "UNK".
        return [dictionary.get(word, dictionary["UNK"]) for word in data]

    datas_index = pad_sequences([to_index(d) for d in datas], value=-1)

    # Shuffle once here; train_test_split below shuffles again, which is
    # redundant but preserved from the original pipeline.
    rand = random.sample(range(len(datas_index)), len(datas_index))
    datas = [datas_index[i] for i in rand]
    labels = [labels[i] for i in rand]
    train_datas, test_datas, train_labels, test_labels = train_test_split(
        datas, labels, test_size=0.3)

    # Persist sizing metadata alongside the word2vec artefacts so the
    # model-building stage can reconstruct its input layer.
    train_size = len(train_labels)
    test_size = len(test_labels)
    word2vec["train_size"] = train_size
    word2vec["test_size"] = test_size
    word2vec["input_num"] = len(train_datas[0])
    word2vec["dims_num"] = embeddings["UNK"].shape[0]
    with open(vec_dir, "wb") as f:
        pickle.dump(word2vec, f)
    print("Saved word2vec to:", vec_dir)

    print("Write train datas to:", pre_datas_train)  # typo "trian" fixed
    with open(pre_datas_train, "w") as f:
        for i in range(train_size):
            f.write(str(train_datas[i].tolist()) + "|" +
                    str(train_labels[i].tolist()) + "\n")
    print("Write test datas to:", pre_datas_test)
    with open(pre_datas_test, "w") as f:
        for i in range(test_size):
            f.write(str(test_datas[i].tolist()) + "|" +
                    str(test_labels[i].tolist()) + "\n")
    print("Write datas over!")