Example #1
import numpy as np
import keras
from utils import URLDECODE

def check(model, w_model, data):
    if data is None:
        return [0]
    xx = []
    # The trained input length is stored in this file, one integer per
    # line; the last line read wins.
    filepath = 'file/INPUT_SHAPE'
    input_shape = None
    with open(filepath, 'r') as f:
        for line in f:
            input_shape = int(line)
    if len(data.strip()):  # skip empty input
        for text in URLDECODE(data):
            try:
                xx.append(w_model[text])  # token -> word vector
            except KeyError:
                continue  # drop out-of-vocabulary tokens
        xx = np.array(xx, dtype='float')
    if not len(xx):
        return [0]
    x = np.array([xx])
    # Pad/truncate to the length the classifier was trained with.
    x = keras.preprocessing.sequence.pad_sequences(x, dtype='float32', maxlen=input_shape)
    return model.predict_classes(x, batch_size=len(x))  # old Keras Sequential API
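For context, check expects a trained Keras Sequential classifier, a word-vector lookup such as gensim KeyedVectors, and a raw payload string. A hypothetical wiring (all file paths below are illustrative assumptions, not from the project):

# Hypothetical usage; model and vector paths are assumptions.
from keras.models import load_model
from gensim.models import KeyedVectors

clf = load_model("file/model.h5")
wv = KeyedVectors.load("file/word2vec.kv")
print(check(clf, wv, "alert%28document.cookie%29"))  # e.g. [1] for XSS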
Example #2
    def __iter__(self):
        # Assumes numpy as np and utils.URLDECODE are imported at module level.
        for line in open(self.filename, encoding="utf-8"):
            if len(line.strip()):  # skip empty lines
                self.f_len += 1
                xx = []
                for text in URLDECODE(line):
                    try:
                        xx.append(self.model[text])  # token -> word vector
                    except KeyError:
                        continue  # drop out-of-vocabulary tokens
                xx = np.array(xx, dtype='float')
                if self.max_len < len(xx):
                    self.max_len = len(xx)  # track the longest sequence
                yield xx
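The method references attributes (filename, model, f_len, max_len) defined on a class the snippet omits. A hypothetical sketch of that wrapper, with assumed names:

class VectorizedCorpus:
    # Hypothetical enclosing class for the __iter__ above.
    def __init__(self, filename, model):
        self.filename = filename  # decoded-payload corpus, one payload per line
        self.model = model        # word -> vector lookup, e.g. gensim KeyedVectors
        self.f_len = 0            # count of non-empty lines yielded
        self.max_len = 0          # longest vector sequence seen so far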
Example #3
from utils import URLDECODE

a = 'cc%3D-%26ct%3D-%26java%3D1%26lang%3D-%26pf%3D-%26scl%3D-%26scr%3D-%26tt%3D-%26tz%3D-8%26vs%3D3.3%26dm%3Dappbase.qzone.qq.com%26url%3D/qz/358/normal%26rdm%3Dqzs.qq.com%26rurl%3D/open/fusion/api_v115.htm%26flash%3D20.0%26pgv_pvid%3D7725784408%26sds%3D0.9023870290257037'

print(URLDECODE(a))
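None of the snippets define URLDECODE itself. From its use here (decode a URL-encoded payload, then split it into tokens) a minimal stand-in might look as follows; the double-decode, lowercasing, and token pattern are assumptions for illustration, not the project's actual implementation:

import re
from urllib.parse import unquote

def URLDECODE(payload):
    # Hypothetical tokenizer: URL-decode twice (payloads are often
    # double-encoded), lowercase, then split into word-like tokens
    # and single punctuation characters.
    text = unquote(unquote(payload)).lower()
    return re.findall(r"[\w.]+|[<>()=/:;'\"]", text)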
Example #4
import csv
import time
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from utils import URLDECODE

embedding_size = 128
num_skips = 4  # unused by gensim; kept from the original snippet
skip_window = 5
num_sampled = 64
num_iter = 5
plot_only = 100
plt_dir = "file\\word2vec.png"

start = time.time()
words = []
datas = []
with open("data\\xssed.csv", "r", encoding="utf-8") as file:
    reader = csv.DictReader(file, fieldnames=["payload"])
    for row in reader:
        payload = row["payload"]
        word = URLDECODE(payload)  # tokenize the decoded payload
        datas.append(word)
        words += word

# Train on the tokenized payloads (gensim 3.x keyword names).
model = Word2Vec(datas,
                 size=embedding_size,
                 window=skip_window,
                 negative=num_sampled,
                 iter=num_iter)
embeddings = model.wv


def plot_with_labels(low_dim_embs, labels, filename=plt_dir):
    plt.figure(figsize=(10, 10))
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
        # Annotate each point with its token and save the figure.
        plt.annotate(label, xy=(x, y), xytext=(5, 2),
                     textcoords="offset points", ha="right", va="bottom")
    plt.savefig(filename)
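The otherwise unused plot_only constant suggests the usual t-SNE step: project the first 100 embeddings to 2-D and feed them to plot_with_labels. A sketch, assuming gensim 3.x KeyedVectors attribute names:

from sklearn.manifold import TSNE

# Project the first `plot_only` embeddings down to 2-D for plotting.
tsne = TSNE(n_components=2, init="pca")
low_dim_embs = tsne.fit_transform(embeddings.vectors[:plot_only])
labels = embeddings.index2word[:plot_only]
plot_with_labels(low_dim_embs, labels)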
Example #5
    def __iter__(self):
        for fname in os.listdir(self.dirname):
            for line in open(os.path.join(self.dirname, fname),
                             encoding="utf-8"):
                if len(line.strip()):  # skip empty lines
                    yield URLDECODE(line)
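This is the streaming-corpus pattern gensim expects: the model re-iterates the object once per training pass, so the corpus never has to fit in memory. A hypothetical enclosing class (name and constructor assumed):

import os
from gensim.models import Word2Vec

class PayloadCorpus:
    # Hypothetical enclosing class (name and constructor assumed).
    def __init__(self, dirname):
        self.dirname = dirname

    # ... the __iter__ shown above goes here ...

# gensim re-reads the iterable once per epoch (gensim 3.x keyword names):
# model = Word2Vec(PayloadCorpus("data/corpus"), size=128, window=5, iter=5)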
Example #6
import csv
import pickle
import random

from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from utils import URLDECODE

# vec_dir, pre_datas_train and pre_datas_test are module-level paths
# defined elsewhere in the project.


def pre_process():
    # Load the trained word2vec artifacts.
    with open(vec_dir, "rb") as f:
        word2vec = pickle.load(f)
        dictionary = word2vec["dictionary"]
        reverse_dictionary = word2vec["reverse_dictionary"]
        embeddings = word2vec["embeddings"]
    # Tokenize the malicious and benign payloads.
    xssed_data = []
    normal_data = []
    with open("data\\xssed.csv", "r", encoding="utf-8") as f:
        reader = csv.DictReader(f, fieldnames=["payload"])
        for row in reader:
            xssed_data.append(URLDECODE(row["payload"]))
    with open("data\\normal_examples.csv", "r", encoding="utf-8") as f:
        reader = csv.DictReader(f, fieldnames=["payload"])
        for row in reader:
            normal_data.append(URLDECODE(row["payload"]))
    # Label: 1 for XSS payloads, 0 for normal traffic.
    datas = xssed_data + normal_data
    labels = [1] * len(xssed_data) + [0] * len(normal_data)
    labels = to_categorical(labels)

    def to_index(data):
        # Map each token to its dictionary index, unknown tokens to "UNK".
        return [dictionary.get(word, dictionary["UNK"]) for word in data]

    datas_index = [to_index(data) for data in datas]
    datas_index = pad_sequences(datas_index, value=-1)
    # Shuffle before splitting (train_test_split shuffles again by default).
    rand = random.sample(range(len(datas_index)), len(datas_index))
    datas = [datas_index[index] for index in rand]
    labels = [labels[index] for index in rand]
    train_datas, test_datas, train_labels, test_labels = train_test_split(
        datas, labels, test_size=0.3)
    train_size = len(train_labels)
    test_size = len(test_labels)
    input_num = len(train_datas[0])
    dims_num = embeddings["UNK"].shape[0]
    # Record the dataset geometry alongside the word2vec artifacts.
    word2vec["train_size"] = train_size
    word2vec["test_size"] = test_size
    word2vec["input_num"] = input_num
    word2vec["dims_num"] = dims_num
    with open(vec_dir, "wb") as f:
        pickle.dump(word2vec, f)
    print("Saved word2vec to:", vec_dir)
    print("Write train data to:", pre_datas_train)
    with open(pre_datas_train, "w") as f:
        for i in range(train_size):
            f.write(str(train_datas[i].tolist()) + "|" +
                    str(train_labels[i].tolist()) + "\n")
    print("Write test data to:", pre_datas_test)
    with open(pre_datas_test, "w") as f:
        for i in range(test_size):
            f.write(str(test_datas[i].tolist()) + "|" +
                    str(test_labels[i].tolist()) + "\n")
    print("Write data over!")