Example #1
def dataset2npyfilse():
    """ for using with data generator"""
    # trainset_path = './trainset/pdfobjs.txt'  # unused alternative corpus
    trainset_path = './trainset/pdf_object_trainset_100_to_500_percent10.txt'
    text = preprocess.load_from_file(trainset_path)
    print('corpus length:', len(text))

    chars = sorted(list(set(text)))
    print('Total chars:', len(chars))
    # print(chars)

    # Vectorization
    print('Building dictionary index ...')
    char_indices = dict((c, i) for i, c in enumerate(chars))
    # print(char_indices)
    indices_char = dict((i, c) for i, c in enumerate(chars))
    # print(indices_char)

    # cut the text in semi-redundant sequences of maxlen characters
    maxlen = 100  # Good idea: use ave_object_len to determine this hyper-parameter
    step = 1  # should set to 1 for best result
    epochs = 10  # number of epochs for training
    sentences = []  # list of all sentence as input
    next_chars = []  # list of all next chars as labels
    for i in range(0, len(text) - maxlen, step):  # stop at len(text) - maxlen so every window has a next char as its label
        sentences.append(text[i:i + maxlen])
        preprocess.save_to_file('./npysamples/IDs/id-' + str(i),
                                text[i:i + maxlen])
        next_chars.append(text[i + maxlen])
        preprocess.save_to_file('./npysamples/Labels/id-' + str(i),
                                text[i + maxlen])

    print('semi-redundant sequences:', len(sentences))

    print('end...')
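
The example above relies on preprocess.load_from_file and preprocess.save_to_file, which are not part of this listing. The following is only a minimal sketch of what such helpers could look like, inferred from how the example calls them; the project's actual preprocess module may differ (encoding, error handling, file format).

import os


def load_from_file(path):
    """Read a whole text corpus from disk as a single string."""
    with open(path, 'r', encoding='latin-1') as f:  # PDF objects may contain non-UTF-8 bytes
        return f.read()


def save_to_file(path, content):
    """Write a string to disk, creating parent directories if needed."""
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, 'w', encoding='latin-1') as f:
        f.write(content)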
Example #2
    def generate_and_fuzz_new_samples(self,
                                      model=None,
                                      model_name='model_1',
                                      epochs=1,
                                      current_epoch=1,
                                      dir_name=None):
        """
        sampling the model and generate new object
        :param model: The model which is training.
        :param model_name: Name of model (base on hyperparameters config in deep_model.py file) e.g. [model_1, model_2,
        ...]
        :param epochs: Number of total epochs of training, e.g. 10,20,30,40,50 or 60
        :param current_epoch: Number of current epoch
        :param dir_name: root directory for this running.
        :return: Nothing
        """

        # Timestamp for the end of the current epoch (used to name the output directory)
        dt = datetime.datetime.now().strftime('_date_%Y-%m-%d_%H-%M-%S')
        dir_name = dir_name + 'epoch_' + str(current_epoch) + dt + '/'
        if not os.path.exists(dir_name):
            os.makedirs(dir_name)

        # Fuzzing hyper-parameters

        # diversities = [i * 0.10 for i in range(1, 20, 2)]
        # diversities = [0.2, 0.5, 1.0, 1.2, 1.5, 1.8]
        # diversities = [0.5, 1.0, 1.5]  # for sou and for mou
        diversities = [1.0]

        generated_obj_total = 1100  # [5, 10, 100, 1000, 3000] {1000-1100 for sou and 3000-3100 for mou}
        generated_obj_with_same_prefix = 1  # [1, 5, 10, 20, 40] {10 for sou and 20 for mou}
        generated_obj_max_allowed_len = 400  # Choose max allowed len for object randomly
        exclude_from_fuzzing_set = {'s', 't', 'r', 'e', 'a', 'm'}

        # Learn and fuzz paper hyper-parameters
        t_fuzz = 0.9  # Compared against p_fuzz, where p_fuzz is a random number (fuzz only if p_fuzz > t_fuzz)
        p_t = 0.9  # 0.9 or more for format fuzzing; 0.5 or less for data fuzzing. Currently format fuzzing.

        # End of fuzzing hyper-parameters

        testset_objects_list = preprocess.get_list_of_object(self.text_test)
        testset_object_gt_maxlen_list = []
        for obj in testset_objects_list:
            if len(obj) > self.maxlen + len(' endobj'):
                testset_object_gt_maxlen_list.append(obj)
        print('len filtered test-set: ', len(testset_object_gt_maxlen_list))
        generated_total = ''
        for diversity in diversities:
            generated_total = ''
            for q in range(round(generated_obj_total / generated_obj_with_same_prefix)):

                obj_index = random.randint(0, len(testset_object_gt_maxlen_list) - 1)
                # obj_index = 0
                generated_obj_counter = 0
                generated_obj_len = 0
                generated = ''
                stop_condition = False
                endobj_attach_manually = False
                # print()
                print('-- Diversity:', diversity)

                obj_prefix = str(testset_object_gt_maxlen_list[obj_index])[0:self.maxlen]
                generated += obj_prefix
                # prob_vals = '1 ' * self.maxlen
                # learnt_grammar = obj_prefix

                # print('--- Generating ts_text with seed:\n "' + obj_prefix + '"')
                # sys.stdout.write(generated)

                if generated.endswith('endobj'):
                    generated_obj_counter += 1

                if generated_obj_counter > generated_obj_with_same_prefix:
                    stop_condition = True

                while not stop_condition:
                    x_pred = np.zeros((1, self.maxlen, len(self.chars)))
                    for t, char in enumerate(obj_prefix):
                        x_pred[0, t, self.char_indices[char]] = 1.

                    preds = model.predict(x_pred, verbose=0)[0]
                    next_index, prob, preds2 = self.sample(preds, diversity)
                    next_char = self.indices_char[next_index]
                    next_char_for_prefix = next_char

                    ###### Fuzzing section; not needed yet
                    # if next_char not in exclude_from_fuzzing_set:
                    #     p_fuzz = random.random()
                    #     if p_fuzz > t_fuzz and preds2[next_index] > p_t:
                    #         next_index = np.argmin(preds2)
                    #         print('((Fuzz!))')
                    #     next_char = self.indices_char[next_index]
                    ###### End of fuzzing section

                    # print()
                    # print(preds2)
                    # print(np.argmax(preds))
                    # print(preds[np.argmax(preds)])

                    # print(prob)
                    # print(np.argmax(prob))
                    # print('====>',next_index)
                    # print(prob[0, next_index])

                    # prob_vals += str(preds2[next_index]) + '\n'
                    # if preds2[next_index] > 0.9980:
                    #     learnt_grammar += next_char
                    # else:
                    #     learnt_grammar += '.'
                    # input()

                    obj_prefix = obj_prefix[1:] + next_char_for_prefix
                    generated += next_char_for_prefix  # next_char
                    generated_obj_len += 1

                    if generated.endswith('endobj'):
                        generated_obj_counter += 1
                        generated_obj_len = 0
                    elif generated_obj_len > generated_obj_max_allowed_len:
                        # Attach '\nendobj\n' manually, and reset obj_prefix
                        generated += '\nendobj\n'
                        generated_obj_counter += 1
                        generated_obj_len = 0
                        endobj_attach_manually = True

                    if generated_obj_counter >= generated_obj_with_same_prefix:  # Fix: Change > to >= (13970315)
                        stop_condition = True
                    elif endobj_attach_manually:
                        # Reset prefix:
                        # We need to modify obj_prefix because we manually changed the generated object.
                        # One option is to shift the window forward:
                        # obj_prefix = obj_prefix[len('\nendobj\n'):] + '\nendobj\n'
                        # Instead of modifying obj_prefix, we simply reset the prefix whenever 'endobj' was not
                        # generated automatically. That seems to be the better option, so we do this:
                        # obj_index = random.randint(0, len(testset_object_gt_maxlen_list) - 1)
                        obj_index = 0
                        obj_prefix = str(testset_object_gt_maxlen_list[obj_index])[0:self.maxlen]
                        generated += obj_prefix
                        endobj_attach_manually = False

                    # sys.stdout.write(next_char)
                    # sys.stdout.flush()
                    # print()
                generated_total += generated + '\n'
            # save generated_result to file inside program

            file_name = model_name \
                        + '_diversity_' + repr(diversity) \
                        + '_epochs_' + repr(epochs) \
                        + '_step_' + repr(self.step) \
                        + '.txt'
            preprocess.save_to_file(dir_name + file_name, generated_total)
            # preprocess.save_to_file(dir_name + file_name + 'probabilities.txt', prob_vals)
            # preprocess.save_to_file(dir_name + file_name + 'learntgrammar.txt',learnt_grammar)
            print('Diversity %s saved to file successfully.' % diversity)

        print('End of generation method.')
        print('Starting new epoch ...')
        return generated_total
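
# Note: both the method above (self.sample) and the train() function below (sample) depend on a
# sample() helper that is not shown in this listing. The sketch below is an assumption: it follows
# the standard Keras character-LSTM temperature sampling and returns the three values the callers
# unpack (next_index, prob, preds2). The project's real implementation may differ.
def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature          # rescale log-probabilities by the diversity
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)        # renormalize to a probability distribution
    probas = np.random.multinomial(1, preds, 1)  # draw one sample from the distribution
    return np.argmax(probas), probas, preds      # next_index, prob (shape (1, n)), preds2
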
def train():
    # trainset_path = './trainset/pdfobjs.txt'  # unused alternative corpus
    trainset_path = './trainset/pdf_object_trainset_100_to_500_percent01.txt'
    text = poc.load_from_file(trainset_path)
    print('corpus length:', len(text))

    chars = sorted(list(set(text)))
    print('Total chars:', len(chars))
    # print(chars)

    # Vectorization
    print('Building dictionary index ...')
    char_indices = dict((c, i) for i, c in enumerate(chars))
    # print(char_indices)
    indices_char = dict((i, c) for i, c in enumerate(chars))
    # print(indices_char)

    # cut the text in semi-redundant sequences of maxlen characters
    maxlen = 50  # Good idea: use ave_object_len to determine this hyper-parameter
    step = 1  # should set to 1 for best result
    epochs = 10  # number of epochs for training
    sentences = []  # list of all sentence as input
    next_chars = []  # list of all next chars as labels
    for i in range(0, len(text) - maxlen, step):  # stop at len(text) - maxlen so every window has a next char as its label
        sentences.append(text[i:i + maxlen])
        # print(sentences)
        next_chars.append(text[i + maxlen])
        # print(next_chars)

    print('semi-redundant sequences:', len(sentences))

    print('One-Hot vectorization...')
    x = np.zeros((len(sentences), maxlen, len(chars)),
                 dtype=bool)  # input x (np.bool is deprecated in newer NumPy; use the builtin bool)
    y = np.zeros((len(sentences), len(chars)), dtype=bool)  # output label y
    for i, sentence in enumerate(sentences):
        for t, char in enumerate(sentence):
            x[i, t, char_indices[char]] = 1
        y[i, char_indices[next_chars[i]]] = 1

    # build the model: a single LSTM layer # we need to deep it
    print('Build model...')
    model = Sequential()
    # model.add(LSTM(128, input_shape=(maxlen, len(chars))))
    model.add(
        LSTM(128, input_shape=(maxlen, len(chars)), return_sequences=True))
    # model.add(LSTM(128, input_shape=(maxlen, len(chars)),
    #                activation='relu', return_sequences=True, dropout=0.2))
    model.add(LSTM(128, input_shape=(maxlen, len(chars))))
    # model.add(LSTM(128, activation='relu', dropout=0.2))
    model.add(Dense(len(chars)))
    model.add(Activation('softmax'))

    optimizer = RMSprop(lr=0.01)  # on newer Keras versions use learning_rate=0.01
    model.compile(loss='categorical_crossentropy', optimizer=optimizer)

    input()  # pause before training starts; press Enter to continue
    # sys.exit()

    model.fit(x, y, batch_size=128, epochs=epochs,
              validation_split=0.2)
    save(model, epochs)

    # del model
    # model = load_model('./modelh5/lstm_text_generation_pdf_objs_1_20180214_235713_epochs10.h5')
    """ sampling the model and generate new object """
    diversities = [0.2, 0.5, 1.0, 1.2, 1.5, 1.8]
    # diversities = [0.1, 0.2, 0.3, 0.5, 0.7, 1, 1.2, 1.5, 1.7, 2]
    generated_obj_max_number = 5
    generated_obj_max_allowed_len = 500
    t_fuzz = 0.9
    p_t = 0.9  # 0.9 for format fuzzing and 0.5 or less for data fuzzing. Currently format fuzzing.

    list_of_objects = poc.get_list_of_object(text)
    list_of_objects_with_maxlen = []
    for o in list_of_objects:
        if len(o) > maxlen:
            list_of_objects_with_maxlen.append(o)

    for diversity in diversities:
        obj_index = random.randint(0, len(list_of_objects_with_maxlen) - 1)
        generated_obj_counter = 0
        generated_obj_len_index = 0

        stop_condition = False
        print()
        print('-- Diversity:', diversity)

        # generated = ''
        obj_prefix = str(list_of_objects_with_maxlen[obj_index])[0:maxlen]  # len(obj_prefix) equals maxlen here

        generated = obj_prefix
        prob_vals = '100\n' * maxlen
        learnt_grammar = obj_prefix

        print('--- Generating text with seed:\n "' + obj_prefix + '"')
        sys.stdout.write(generated)

        if generated.endswith('endobj'):
            generated_obj_counter += 1

        if generated_obj_counter > generated_obj_max_number:
            stop_condition = True

        while not stop_condition:
            x_pred = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(obj_prefix):
                x_pred[0, t, char_indices[char]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_index, prob, preds2 = sample(preds, diversity)
            p_fuzz = random.random()
            if p_fuzz > t_fuzz and preds2[next_index] > p_t:
                next_index = np.argmin(preds2)
                print('FUZZ DONE!')

            next_char = indices_char[next_index]

            # print()
            # print(preds2)
            # print(np.argmax(preds))
            # print(preds[np.argmax(preds)])

            # print(prob)
            # print(np.argmax(prob))
            # print('====>',next_index)
            # print(prob[0, next_index])

            # prob_vals += str(preds2[next_index]) + '\n'
            # if preds2[next_index] > 0.9980:
            #     learnt_grammar += next_char
            # else:
            #     learnt_grammar += '.'
            # input()

            obj_prefix = obj_prefix[1:] + next_char
            generated += next_char
            generated_obj_len_index += 1

            if generated.endswith('endobj'):
                generated_obj_counter += 1
                generated_obj_len_index = 0
            elif generated_obj_len_index > generated_obj_max_allowed_len:
                generated += '\nendobj\n'
                generated_obj_counter += 1
                generated_obj_len_index = 0

            if generated_obj_counter > generated_obj_max_number:
                stop_condition = True

            sys.stdout.write(next_char)
            sys.stdout.flush()

        # save generated text to file inside program
        dt = datetime.datetime.now().strftime('_%Y%m%d_%H%M%S_')
        dir_name = './generated_results/pdfobjs_new/'
        file_name = 'gen_objs' + dt + 'epochs' + repr(epochs) + '_div' \
                    + repr(diversity) + '_step' + repr(step) + '.txt'
        poc.save_to_file(dir_name + file_name, generated)
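
Both examples also assume a get_list_of_object helper (preprocess.get_list_of_object in the class version, poc.get_list_of_object in train) that splits the training corpus into individual PDF objects. It is not included in the listing; the sketch below is only a guess at its behavior, keyed to the fact that the callers treat every element as a string ending with 'endobj'.

def get_list_of_object(text):
    """Split a corpus of concatenated PDF objects into a list of single objects.

    Assumes each object in the corpus is terminated by the 'endobj' keyword and
    keeps that keyword at the end of every returned element.
    """
    objects = []
    for chunk in text.split('endobj'):
        chunk = chunk.strip()
        if chunk:
            objects.append(chunk + '\nendobj')
    return objects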