Example #1
    def load_dataset(self):
        """ Load all 3 part of each dataset and building dictionary index """
        if learning_config['dataset_size'] == 'small':
            self.text_training = preprocess.load_from_file(
                learning_config['small_training_set_path'])
            self.text_validation = preprocess.load_from_file(
                learning_config['small_validation_set_path'])
            self.text_test = preprocess.load_from_file(
                learning_config['small_testing_set_path'])
        elif learning_config['dataset_size'] == 'medium':
            self.text_training = preprocess.load_from_file(
                learning_config['medium_training_set_path'])
            self.text_validation = preprocess.load_from_file(
                learning_config['medium_validation_set_path'])
            self.text_test = preprocess.load_from_file(
                learning_config['medium_testing_set_path'])
        elif learning_config['dataset_size'] == 'large':
            self.text_training = preprocess.load_from_file(
                learning_config['large_training_set_path'])
            self.text_validation = preprocess.load_from_file(
                learning_config['large_validation_set_path'])
            self.text_test = preprocess.load_from_file(
                learning_config['large_testing_set_path'])
        self.text_all = self.text_training + self.text_validation + self.text_test
        print('Total corpus length:', len(self.text_all))
        self.chars = sorted(list(set(self.text_all)))
        print('Total corpus chars:', len(self.chars))
        # print(chars)

        # Building dictionary index
        print('Building dictionary index ...')
        self.char_indices = dict((c, i) for i, c in enumerate(self.chars))
        # print(char_indices)
        self.indices_char = dict((i, c) for i, c in enumerate(self.chars))
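The char_indices and indices_char mappings built by load_dataset are what the later training code needs for one-hot encoding. A minimal sketch of that next step as a companion method, assuming numpy is imported as np (as in the other examples) and a hypothetical window length maxlen:

    def vectorize(self, maxlen=100, step=1):
        """ Sketch (assumption): one-hot encode semi-redundant character
        windows of the training text using the index built in load_dataset.
        maxlen and step are assumed hyper-parameters. """
        sentences = []   # input windows
        next_chars = []  # next-char labels
        for i in range(0, len(self.text_training) - maxlen, step):
            sentences.append(self.text_training[i:i + maxlen])
            next_chars.append(self.text_training[i + maxlen])

        x = np.zeros((len(sentences), maxlen, len(self.chars)), dtype=bool)
        y = np.zeros((len(sentences), len(self.chars)), dtype=bool)
        for i, sentence in enumerate(sentences):
            for t, char in enumerate(sentence):
                x[i, t, self.char_indices[char]] = 1
            y[i, self.char_indices[next_chars[i]]] = 1
        return x, y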
Example #2
    def __init__(self,
                 dim_x=32,
                 dim_y=32,
                 dim_z=32,
                 batch_size=32,
                 shuffle=True):
        'Initialization'
        self.dim_x = dim_x
        self.dim_y = dim_y
        self.dim_z = dim_z
        self.batch_size = batch_size
        self.shuffle = shuffle
        """ for using with data generator"""
        trainset_path = './trainset/pdfobjs.txt'
        trainset_path = './trainset/pdf_object_trainset_100_to_500_percent10.txt'
        self.text = poc.load_from_file(trainset_path)
        print('corpus length:', len(self.text))

        self.chars = sorted(list(set(self.text)))
        print('Total chars:', len(self.chars))

        # Vectorization
        print('Building dictionary index ...')
        self.char_indices = dict((c, i) for i, c in enumerate(self.chars))
        # print(char_indices)
        self.indices_char = dict((i, c) for i, c in enumerate(self.chars))
        # print(indices_char)

        # cut the text in semi-redundant sequences of maxlen characters
        self.maxlen = 100  # Good idea: use ave_object_len to determine this hyper-parameter
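The comment above suggests deriving maxlen from the average object length instead of hard-coding it. A minimal standalone sketch of that idea, assuming pdf_object_preprocess is the module aliased as poc and reusing get_list_of_object from the other examples; the cap of 100 is an arbitrary choice:

import pdf_object_preprocess as poc  # module alias assumed, as in the examples above

text = poc.load_from_file('./trainset/pdf_object_trainset_100_to_500_percent10.txt')
obj_list = poc.get_list_of_object(text, is_sort=False)
ave_object_len = sum(len(o) for o in obj_list) / max(len(obj_list), 1)
maxlen = min(int(ave_object_len), 100)  # cap at 100 is an arbitrary choice
print('average object length:', ave_object_len, '-> maxlen:', maxlen)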
Example #3
    def __init__(self,
                 host_id=None,
                 object_file_path=iu_config['baseline_object_path'],
                 stream_directory_path=iu_config['stream_directory_path']):
        """

        :param host_id: Name of host file without postfix, e.g. host1_max, host2_min or host3_avg
        :param object_file_path: See iu_config, new_objects_path
        :param stream_directory_path: See iu_config, stream_directory_path
        """
        self.host_id = host_id

        self.object_file_path = object_file_path
        self.obj_list = preprocess.get_list_of_object(seq=preprocess.load_from_file(self.object_file_path),
                                                      is_sort=False)

        self.stream_directory_path = '../' + stream_directory_path
        self.stream_filename_list = os.listdir(self.stream_directory_path)

        # Create a new directory each time the program runs and we want to generate new test data
        dt = datetime.datetime.now().strftime(self.host_id + '_date_%Y-%m-%d_%H-%M-%S')
        self.storage_dir_name = iu_config['new_pdfs_directory'] + self.host_id + '/' + dt + '/'
        if not os.path.exists(self.storage_dir_name):
            os.makedirs(self.storage_dir_name)
            print('New storage directory built.')

        self.obj_getter = self.obj_generator(self.obj_list)
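self.obj_generator is referenced here but not shown in this listing. A minimal sketch of what such a generator might look like, purely as an assumption about its behavior (yield objects one at a time, restarting when the list is exhausted):

    def obj_generator(self, obj_list):
        """ Sketch (assumption): yield PDF objects one at a time and start
        over when the list is exhausted, so next() never raises StopIteration. """
        while True:
            for obj in obj_list:
                yield obj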
Example #4
def dataset2npyfilse():
    """ for using with data generator"""
    trainset_path = './trainset/pdfobjs.txt'
    trainset_path = './trainset/pdf_object_trainset_100_to_500_percent10.txt'
    text = preprocess.load_from_file(trainset_path)
    print('corpus length:', len(text))

    chars = sorted(list(set(text)))
    print('Total chars:', len(chars))
    # print(chars)

    # Vectorization
    print('Building dictionary index ...')
    char_indices = dict((c, i) for i, c in enumerate(chars))
    # print(char_indices)
    indices_char = dict((i, c) for i, c in enumerate(chars))
    # print(indices_char)

    # cut the text in semi-redundant sequences of maxlen characters
    maxlen = 100  # Good idea: use ave_object_len to determine this hyper-parameter
    step = 1  # should set to 1 for best result
    epochs = 10  # number of epochs for training
    sentences = []  # list of all sentence as input
    next_chars = []  # list of all next chars as labels
    for i in range(0, len(text) - maxlen, step):  # stop maxlen before the end so a next-char label always exists
        sentences.append(text[i:i + maxlen])
        preprocess.save_to_file('./npysamples/IDs/id-' + str(i),
                                text[i:i + maxlen])
        next_chars.append(text[i + maxlen])
        preprocess.save_to_file('./npysamples/Labels/id-' + str(i),
                                text[i + maxlen])

    print('semi sequences:', len(sentences))

    print('end...')
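The files written by dataset2npyfilse() pair each input window in ./npysamples/IDs/ with its next-char label in ./npysamples/Labels/. A minimal sketch of how a data generator could rebuild its partition and label index from those files; the dictionary layout is an assumption, not the repository's actual code:

import os
import pdf_object_preprocess as preprocess  # same alias as above

ids = sorted(os.listdir('./npysamples/IDs/'))
partition = {'train': ids}  # assumed layout for a Keras-style data generator
labels = {sample_id: preprocess.load_from_file('./npysamples/Labels/' + sample_id)
          for sample_id in ids}
print('indexed samples:', len(partition['train']))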
Example #5
def get_one_object():
    """ provide one pdf data object whether an existing object in corpus or
    an online new generated object from learnt model
    this function is not complete yet!
    """
    object_file_path = '../trainset/pdf_object_trainset_100_to_500_percent33.txt'

    seq = poc.load_from_file(object_file_path)
    obj_list = poc.get_list_of_object(seq, is_sort=False)
    random_object_index = random.randint(50, len(obj_list) - 1)
    obj = obj_list[random_object_index]
    return obj
Example #6
    def __data_generation(self, labels, list_IDs_temp):
        'Generates data of batch_size samples'  # X : (n_samples, maxlen, n_chars)
        # Initialization: one-hot encoded input windows and next-char labels
        X = np.zeros((self.batch_size, self.maxlen, len(self.chars)), dtype=np.bool)
        y = np.zeros((self.batch_size, len(self.chars)), dtype=np.bool)

        # Generate data
        for i, ID in enumerate(list_IDs_temp):
            # Load one maxlen-character window and its next-char label
            text = poc.load_from_file('./npysamples/IDs/' + ID)
            label = poc.load_from_file('./npysamples/Labels/' + ID)

            # One-hot encode the window (input) and its next char (label);
            # the `labels` argument is unused because labels are read from file
            for t, char in enumerate(text):
                X[i, t, self.char_indices[char]] = 1
            y[i, self.char_indices[label]] = 1

        return X, y
Example #7
    def __get_objects_sequence(self):
        seq = ''
        for filename in os.listdir(iu_config['new_objects_path']):
            try:
                seq += preprocess.load_from_file(
                    iu_config['new_objects_path'] + filename)
            except Exception as e:
                print('Extraction from %s failed:' % filename, file=sys.stderr)
                print(str(e), file=sys.stderr)
            # finally:
        obj_list = preprocess.get_list_of_object(seq=seq, is_sort=False)
        print('obj_list len', len(obj_list))
        print(obj_list)
        # for o in obj_list:
        #     print(o, '\n', '#'*50)
        # input()
        return obj_list
Example #8
"""
from __future__ import print_function

from keras.models import Model
from keras.layers import Input, LSTM, Dense
import numpy as np

import pdf_object_preprocess as preprocess
from config import learning_config

batch_size = 128  # Batch size for training.
epochs = 100  # Number of epochs to train for.
latent_dim = 128  # Latent dimensionality of the encoding space.
num_samples = 10000  # Number of samples to train on.
# Path to the data txt file on disk.
text_training = preprocess.load_from_file(learning_config['small_training_set_path']) \
                + preprocess.load_from_file(learning_config['small_validation_set_path'])

d = 50
step = 1

# Vectorize the data.
input_texts = []
target_texts = []
input_characters = set()
target_characters = set()

for i in range(0, len(text_training) - d, step):
    input_text = text_training[i:i + d]
    target_text = text_training[i + 1:i + d + 1]
    # We use "tab" as the "start sequence" character
Example #9
def attach_new_object():
    """ incremental update pdf file """
    host_names = ['host1', 'host2', 'host3']
    with open(host_directory + host_names[0] + '.pdf', 'br') as f:
        data = f.read()
        print(len(data))

    # find last trailer in a pdf file
    trailer_index = 0
    while data.find(b'trailer', trailer_index + 7) != -1:
        trailer_index = data.find(b'trailer', trailer_index + 7)
    print('trailer_index', trailer_index)

    trailer_index_dic_endof = data.find(b'>>', trailer_index)
    print('trailer_index_dic_endof', trailer_index_dic_endof)

    trailer_content = data[trailer_index:trailer_index_dic_endof + 2]
    print('trailer_content', trailer_content)

    # find last startxref offset in a pdf file
    startxref_index = trailer_index
    while data.find(b'startxref', startxref_index + 9) != -1:
        startxref_index = data.find(b'startxref', startxref_index + 9)
    # print('index ===', index_startxref)
    index_eof = data.find(b'%%EOF', startxref_index)
    # print('index 2===', index_eof)
    # note: slicing (not indexing) keeps the bytes type for the comparison
    if data[startxref_index + 9:startxref_index + 10] in (b'\n', b'\r'):
        # print('yes', data[startxref_index + 9])
        startxref_index += 10
    if data[index_eof - 1:index_eof] in (b'\n', b'\r'):
        index_eof -= 1
    startxref_offset = int(data[startxref_index:index_eof])
    print('startxref_offset', startxref_offset)

    # print(type(trailer_content))
    trailer_content_new = trailer_content[:-2] + b'   /Prev ' \
                          + bytes(str(startxref_offset), 'ascii') + b' \n>>'
    print('trailer_content_new', trailer_content_new)

    # print(bytes(str(startxref_offset), 'ascii'))

    # load the PDF objects from file
    seq = poc.load_from_file(
        host_directory + 'gen_objs_20180221_142612_epochs10_div1.5_step1.txt')
    obj_list = poc.get_list_of_object(seq)
    random_object_index = random.randint(0, len(obj_list) - 1)
    obj = obj_list[random_object_index]

    last_object_id = str(get_last_object_id(host_names[0]))

    random_rewrite_object = str(random.randint(1, int(last_object_id)))

    print('len object', len(obj))
    startxref_offset_new = len(data) + 1 + len(
        random_rewrite_object) + 3 + len(obj)  # if we attach just one obj
    print('startxref_offset_new', startxref_offset_new)

    attach_content = bytes(str(random_rewrite_object + ' 0 ' + obj + '\nxref\n0 1\n0000000000 65535 f\n' +\
                               random_rewrite_object + ' 1\n' + str(len(data)).zfill(10) + ' 00000 n\n'), 'ascii') +\
                     trailer_content_new + b'\nstartxref\n' + \
                     bytes(str(startxref_offset_new), 'ascii') + b'\n%%EOF\n'

    print('attach_content\n', attach_content)

    new_pdf_file = data + attach_content
    with open(host_directory + host_names[0] + 'iu_auto7.pdf', 'bw') as f:
        f.write(new_pdf_file)
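A quick way to sanity-check the appended incremental update is to re-read the written file and confirm that the last trailer now carries a /Prev entry pointing back at the original cross-reference table. A minimal sketch, assuming the same host_directory and output name used above:

def check_incremental_update():
    """ Sketch (assumption): verify that the rewritten PDF ends with a new
    trailer containing /Prev and a final startxref/%%EOF pair. """
    with open(host_directory + 'host1' + 'iu_auto7.pdf', 'br') as f:
        new_data = f.read()
    last_trailer = new_data.rfind(b'trailer')
    last_startxref = new_data.rfind(b'startxref')
    print('last trailer has /Prev:', b'/Prev' in new_data[last_trailer:last_startxref])
    print('file ends with %%EOF:', new_data.rstrip().endswith(b'%%EOF'))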
def train():
    trainset_path = './trainset/pdfobjs.txt'
    trainset_path = './trainset/pdf_object_trainset_100_to_500_percent01.txt'
    text = poc.load_from_file(trainset_path)
    print('corpus length:', len(text))

    chars = sorted(list(set(text)))
    print('Total chars:', len(chars))
    # print(chars)

    # Vectorization
    print('Building dictionary index ...')
    char_indices = dict((c, i) for i, c in enumerate(chars))
    # print(char_indices)
    indices_char = dict((i, c) for i, c in enumerate(chars))
    # print(indices_char)

    # cut the text in semi-redundant sequences of maxlen characters
    maxlen = 50  # Good idea: use ave_object_len to determine this hyper-parameter
    step = 1  # should set to 1 for best result
    epochs = 10  # number of epochs for training
    sentences = []  # list of all sentence as input
    next_chars = []  # list of all next chars as labels
    for i in range(0, len(text) - maxlen, step):  # stop maxlen before the end so a next-char label always exists
        sentences.append(text[i:i + maxlen])
        # print(sentences)
        next_chars.append(text[i + maxlen])
        # print(next_chars)

    print('semi sequences:', len(sentences))

    print('One-Hot vectorization...')
    x = np.zeros((len(sentences), maxlen, len(chars)),
                 dtype=np.bool)  # input x
    y = np.zeros((len(sentences), len(chars)), dtype=np.bool)  # output label y
    for i, sentence in enumerate(sentences):
        for t, char in enumerate(sentence):
            x[i, t, char_indices[char]] = 1
        y[i, char_indices[next_chars[i]]] = 1

    # build the model: two stacked LSTM layers (could be made deeper)
    print('Build model...')
    model = Sequential()
    # model.add(LSTM(128, input_shape=(maxlen, len(chars))))
    model.add(
        LSTM(128, input_shape=(maxlen, len(chars)), return_sequences=True))
    # model.add(LSTM(128, input_shape=(maxlen, len(chars)),
    #                activation='relu', return_sequences=True, dropout=0.2))
    model.add(LSTM(128, input_shape=(maxlen, len(chars))))
    # model.add(LSTM(128, activation='relu', dropout=0.2))
    model.add(Dense(len(chars)))
    model.add(Activation('softmax'))

    optimizer = RMSprop(lr=0.01)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer)

    input()  # debug pause before training starts
    # sys.exit()

    model.fit(x, y, batch_size=128, epochs=epochs,
              validation_split=0.2)
    save(model, epochs)

    # del model
    # model = load_model('./modelh5/lstm_text_generation_pdf_objs_1_20180214_235713_epochs10.h5')
    """ sampling the model and generate new object """
    diversities = [0.2, 0.5, 1.0, 1.2, 1.5, 1.8]
    # diversities = [0.1, 0.2, 0.3, 0.5, 0.7, 1, 1.2, 1.5, 1.7, 2]
    generated_obj_max_number = 5
    generated_obj_max_allowed_len = 500
    t_fuzz = 0.9
    p_t = 0.9  # 0.9 for format fuzzing, 0.5 or lower for data fuzzing; currently format fuzzing

    list_of_objects = poc.get_list_of_object(text)
    list_of_objects_with_maxlen = []
    for o in list_of_objects:
        if len(o) > maxlen:
            list_of_objects_with_maxlen.append(o)

    for diversity in diversities:
        obj_index = random.randint(0, len(list_of_objects_with_maxlen) - 1)
        generated_obj_counter = 0
        generated_obj_len_index = 0

        stop_condition = False
        print()
        print('-- Diversity:', diversity)

        # generated = ''
        obj_prefix = str(list_of_objects_with_maxlen[obj_index])[
            0:maxlen]  # len(obj_prefix) equals maxlen here

        generated = obj_prefix
        prob_vals = '100\n' * maxlen
        learnt_grammar = obj_prefix

        print('--- Generating text with seed:\n "' + obj_prefix + '"')
        sys.stdout.write(generated)

        if generated.endswith('endobj'):
            generated_obj_counter += 1

        if generated_obj_counter > generated_obj_max_number:
            stop_condition = True

        while not stop_condition:
            x_pred = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(obj_prefix):
                x_pred[0, t, char_indices[char]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_index, prob, preds2 = sample(preds, diversity)
            p_fuzz = random.random()
            if p_fuzz > t_fuzz and preds2[next_index] > p_t:
                next_index = np.argmin(preds2)
                print('FUZZ DONE!')

            next_char = indices_char[next_index]

            # print()
            # print(preds2)
            # print(np.argmax(preds))
            # print(preds[np.argmax(preds)])

            # print(prob)
            # print(np.argmax(prob))
            # print('====>',next_index)
            # print(prob[0, next_index])

            # prob_vals += str(preds2[next_index]) + '\n'
            # if preds2[next_index] > 0.9980:
            #     learnt_grammar += next_char
            # else:
            #     learnt_grammar += '.'
            # input()

            obj_prefix = obj_prefix[1:] + next_char
            generated += next_char
            generated_obj_len_index += 1

            if generated.endswith('endobj'):
                generated_obj_counter += 1
                generated_obj_len_index = 0
            elif generated_obj_len_index > generated_obj_max_allowed_len:
                generated += '\nendobj\n'
                generated_obj_counter += 1
                generated_obj_len_index = 0

            if generated_obj_counter > generated_obj_max_number:
                stop_condition = True

            sys.stdout.write(next_char)
            sys.stdout.flush()

        # save the generated text to a file
        dt = datetime.datetime.now().strftime('_%Y%m%d_%H%M%S_')
        dir_name = './generated_results/pdfobjs_new/'
        file_name = 'gen_objs' + dt + 'epochs' + repr(epochs) + '_div' \
                    + repr(diversity) + '_step' + repr(step) + '.txt'
        poc.save_to_file(dir_name + file_name, generated)
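The sample helper called in train() is not shown in this listing. A minimal sketch of a temperature-based sampler in the style of the standard Keras text-generation example, returning the sampled index, the multinomial draw, and the re-weighted distribution so it matches the three values unpacked above; the repository's actual implementation may differ:

def sample(preds, temperature=1.0):
    """ Sketch (assumption): temperature-based sampling over a softmax output.
    Returns (index, multinomial draw, re-weighted probabilities) to match the
    three values unpacked in train(). """
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds + 1e-12) / temperature  # epsilon avoids log(0)
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    prob = np.random.multinomial(1, preds, 1)
    return np.argmax(prob), prob, preds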