Example #1
def main(args):
    print('Settings:')
    print(str(args)[10:-1])

    transform_size = 1000  # Batch size to be used during transformation.

    num_features = args.features
    print('Loading data')
    train_x, train_y, dev_x, dev_y, dev_pid, test_x, test_y, test_pid = load_data(
        args.data, bytes=args.byte, preprocess=args.preprocessed)

    if args.no_recalc:  # Fit vectorizer and transform source codes to n-gram bag vectors.
        print('Calculating vectors.')
        vec = train_vectorizer(train_x, args.mode, args.ngram, num_features)
        dump(vec, 'vectorizer.joblib')  # Saves the vectorizer to be used in n-gram error analysis.
        batch_transform(vec, train_x, 'train', num_features, transform_size)
        batch_transform(vec, dev_x, 'dev', num_features, transform_size)
        batch_transform(vec, test_x, 'test', num_features, transform_size)
        if args.mode == 'c':
            print("Rescaling count values.")
            rescale(num_features)
    else:
        print('Vector calculation skipped, loading from pre-calculated files.')
    predictions_dev, predictions_test, history = run_model(
        args.batch, num_features, train_y, dev_y, args.skip, args.fullpredict)
    if not args.skip:
        plot_history(history)  # Plot training and validation accuracy and loss per epoch.
    if args.results:
        write_predictions(predictions_dev, dev_pid, 'dev_predictions')
        write_predictions(predictions_test, test_pid, 'test_predictions')
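Example #1 calls helpers such as train_vectorizer, batch_transform and rescale that are defined elsewhere in the project. A minimal sketch of what batch_transform might look like, assuming it transforms the texts in fixed-size batches and writes the rows into a float32 memmap named after the split (the 'vectors/<name>.mm' path and layout are assumptions, mirroring the memmaps used in Example #3):

import numpy as np

def batch_transform(vec, texts, name, num_features, batch_size):
    # Transform with an already-fitted vectorizer in batches so the dense
    # rows never have to fit in memory all at once (sketch, not the original).
    out = np.memmap('vectors/{}.mm'.format(name), dtype='float32',
                    mode='w+', shape=(len(texts), num_features))
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        out[start:start + len(batch)] = vec.transform(batch).toarray()
    del out  # flush the memmap to disk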
Example #2
def main(args):
    print('Settings:')
    print(str(args)[10:-1])
    train_x, train_y, dev_x, dev_y, _, _, _, _ = load_data(args.data, bytes=args.byte, preprocess=args.preprocessed)
    dev_orig = dev_x.copy()  # Append [:2500] to run on a shorter dev set.
    print("Dataset loaded.")
    ngram_sizes = [int(n) for n in args.ngrams.split()]
    profile_lens = [int(n) for n in args.features.split()]
    print(ngram_sizes)
    print(profile_lens)
    # ngram_sizes = [2, 3]
    # profile_lens = [-1, 200]

    for ngram_size in ngram_sizes:
        author_profiles = {}
        dev_x = dev_orig.copy()
        for i in range(len(train_x)):
            single_profile = generate_profile(train_x[i], ngram_size)

            if train_y[i] in author_profiles:
                author_profiles[train_y[i]] = append_profile(author_profiles[train_y[i]], single_profile)
            else:
                author_profiles[train_y[i]] = single_profile
        author_profiles_backup = author_profiles.copy()
        # for author in author_profiles:
        #     author_profiles[author] = dictionary_to_list(author_profiles[author])

        dev_x = [dictionary_to_list(generate_profile(x, ngram_size)) for x in dev_x]

        #############################

        for profile_len in profile_lens:
            # Restore the full author profiles, then trim each to the requested length.
            author_profiles = author_profiles_backup.copy()
            for author in author_profiles:
                if profile_len >= 0:
                    author_profiles[author] = set(dictionary_to_list(author_profiles[author])[:profile_len])
                # Each author profile is now a set of the top profile_len number of features.
                elif profile_len == -1:
                    # print("HYPER")
                    auth_dict = author_profiles[author]  # count dictionary for author
                    keys = list(auth_dict)  # list of ngrams
                    for key in keys:  # if a key only appears once, remove it
                        if auth_dict[key] == 1:
                            del auth_dict[key]
                    author_profiles[author] = set(dictionary_to_list(author_profiles[author]))

            print("Running {}@{}".format(ngram_size, profile_len))
            count_total = 0
            count_success = 0

            for i in range(len(dev_x)):
                actual = dev_y[i]
                result = compare_to_profiles(dev_x[i], author_profiles)
                count_total += 1
                if actual == result:
                    count_success += 1
            print('Total Guesses: {}'.format(count_total))
            print('Correct Guesses: {}'.format(count_success))
            print('Guess accuracy: {}'.format(count_success / count_total))
            print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
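The profile helpers used above (generate_profile, append_profile, dictionary_to_list) are not part of this listing. A minimal sketch consistent with how they are called here, assuming character n-gram count dictionaries ordered by frequency:

from collections import Counter

def generate_profile(text, ngram_size):
    # Count dictionary of character n-grams for a single source file.
    return Counter(text[i:i + ngram_size]
                   for i in range(len(text) - ngram_size + 1))

def append_profile(profile, single_profile):
    # Merge one file's counts into the author's running profile.
    profile.update(single_profile)
    return profile

def dictionary_to_list(profile):
    # N-grams ordered by descending frequency, most frequent first.
    return [ngram for ngram, _ in
            sorted(profile.items(), key=lambda kv: kv[1], reverse=True)]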
Example #3
def main(args):
    train_x, train_y, dev_x, dev_y, _, test_x, _, _ = load_data('../data_dir')
    features = 60770  # Number of unique words in training set
    if args.no_recalc:
        vectorizer = CountVectorizer(binary=True)
        # vectorizer = TfidfVectorizer()

        # Convert train, dev and test to 60770-D vectors
        train_x = vectorizer.fit_transform(train_x).astype('float32').toarray()  # Convert train set to vector
        features = train_x.shape[1]
        t = np.memmap('vectors/train.mm', dtype='float32', mode='w+', shape=(50000, features))
        t[:] = train_x[:]
        del t, train_x
        dev_x = vectorizer.transform(dev_x).astype('float32').toarray()
        d = np.memmap('vectors/dev.mm', dtype='float32', mode='w+', shape=(25000, features))
        d[:] = dev_x[:]
        del d, dev_x
        test_x = vectorizer.transform(test_x).astype('float32').toarray()
        te = np.memmap('vectors/test.mm', dtype='float32', mode='w+', shape=(25000, features))
        te[:] = test_x[:]
        del te, test_x
    t = np.memmap('vectors/train.mm', dtype='float32', mode='r', shape=(50000, features))
    d = np.memmap('vectors/dev.mm', dtype='float32', mode='r', shape=(25000, features))
    te = np.memmap('vectors/test.mm', dtype='float32', mode='r', shape=(25000, features))
    # Setup generators
    train = Generator(t, train_y, 128)
    dev = Generator(d, dev_y, 128)
    test = GeneratorX(te, 128)
    #  Model
    callback_list = [EarlyStopping(monitor='val_acc', patience=5),
                     ModelCheckpoint(filepath='word_model.h5', monitor='val_acc', save_best_only=True),
                     ReduceLROnPlateau(monitor='val_acc', factor=0.1, patience=3)]

    opt = RMSprop(learning_rate=0.001)
    model = Sequential()
    model.add(Dense(500, activation='relu', input_shape=(features,)))
    model.add(Dropout(0.5))
    model.add(Dense(1000, activation='softmax'))
    model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['acc'])
    model.summary()
    model.fit(train, epochs=1000, validation_data=dev, callbacks=callback_list)
    model.load_weights('word_model.h5')
    model.evaluate(dev)
    # Write dev and test predictions to file.
    predict_vec = np.memmap('vectors/dev_word.mm', dtype='float32', mode='w+', shape=(25000, 1000))
    predict_vec[:] = model.predict(dev)[:]
    del predict_vec
    predict_vec2 = np.memmap('vectors/test_word.mm', dtype='float32', mode='w+', shape=(25000, 1000))
    predict_vec2[:] = model.predict(test)[:]
    del predict_vec2
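Generator and GeneratorX are not defined in this snippet. A plausible sketch, assuming they are keras.utils.Sequence wrappers that serve fixed-size batches straight from the memmapped vectors (with and without labels, respectively):

import numpy as np
from tensorflow.keras.utils import Sequence

class Generator(Sequence):
    # Yields (X, y) batches from a memmap for fit()/evaluate().
    def __init__(self, x, y, batch_size):
        self.x, self.y, self.batch_size = x, np.array(y), batch_size

    def __len__(self):
        return int(np.ceil(len(self.x) / self.batch_size))

    def __getitem__(self, idx):
        sl = slice(idx * self.batch_size, (idx + 1) * self.batch_size)
        return self.x[sl], self.y[sl]

class GeneratorX(Sequence):
    # Yields X-only batches for predict().
    def __init__(self, x, batch_size):
        self.x, self.batch_size = x, batch_size

    def __len__(self):
        return int(np.ceil(len(self.x) / self.batch_size))

    def __getitem__(self, idx):
        return self.x[idx * self.batch_size:(idx + 1) * self.batch_size]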
Example #4
    def __data_generation(self, filenames):
        # Load a batch of data
        X, y = datatools.load_data(
            path_to_dataset=self.path_to_dataset,
            data_list=filenames,
            input_shape=self.dim,
            standardization_mode=self.standardization_mode,
            border=self.border)

        # Debugging
        #        if self.val == False:
        #            f = open('traingen.log', 'a+')
        #            f.write('-------------------->New Epoch\n')
        #            for i in range(len(filenames)):
        #                f.write(filenames[i]+'\n')
        #            f.close()
        #        else:
        #            f = open('valgen.log', 'a+')
        #            f.write('-------------------->New Epoch\n')
        #            for i in range(len(filenames)):
        #                f.write(filenames[i]+'\n')
        #            f.close()

        #        if self.standardization_mode != None:
        #            standardize = True
        #            #print('Datagen performing standardization...')
        #        else:
        #            standardize = False
        #            #print('Datagen without standardization...')
        #        X, y = datatools.load_data2(path_to_dataset=self.path_to_dataset,
        #                                   data_list=filenames, input_shape=self.dim,
        #                                   standardize=standardize,
        #                                   border=self.border)

        # Scale the data
        y = y * self.linear_output_scaling_factor

        # Expand the dimension for channels
        X = X[:, :, :, :, np.newaxis]
        y = y[:, :, :, :, np.newaxis]

        return X, y
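Example #4 only shows the private __data_generation method. For context, a minimal sketch of a surrounding generator class it could belong to, assuming a keras.utils.Sequence that hands one batch of filenames per index (the constructor arguments are inferred from the attributes used above; everything else is an assumption):

import numpy as np
from tensorflow.keras.utils import Sequence

class DataGenerator(Sequence):  # hypothetical wrapper, not the project's actual class
    def __init__(self, filenames, batch_size, path_to_dataset, dim,
                 standardization_mode, border, linear_output_scaling_factor):
        self.filenames = filenames
        self.batch_size = batch_size
        self.path_to_dataset = path_to_dataset
        self.dim = dim
        self.standardization_mode = standardization_mode
        self.border = border
        self.linear_output_scaling_factor = linear_output_scaling_factor

    def __len__(self):
        return int(np.floor(len(self.filenames) / self.batch_size))

    def __getitem__(self, index):
        batch = self.filenames[index * self.batch_size:(index + 1) * self.batch_size]
        return self.__data_generation(batch)

    def __data_generation(self, filenames):
        raise NotImplementedError  # the method shown in the example above goes here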
Example #5
def main():
    train_x, _, dev_x, _, _, test_x, _, _ = load_data(r'../data_dir/',
                                                      bytes=False,
                                                      preprocess=True)
    del _

    print("Tokenizing.")
    length = 100
    start = time.time()
    processes = [
        Process(target=tokenize, args=(train_x, 'train', length)),
        Process(target=tokenize, args=(dev_x, 'dev', length)),
        Process(target=tokenize, args=(test_x, 'test', length))
    ]
    for p in processes:
        p.start()
    for p in processes:
        p.join()
    print("Finalising writing tokens to file.")
    print(time.time() - start)
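tokenize is defined elsewhere; because each call runs in its own Process, it has to persist its output to disk instead of returning it. A minimal sketch, assuming a simple whitespace tokenizer that keeps only the leading tokens of each file (up to length) and pickles them per split (the file name and tokenization scheme are assumptions):

import pickle

def tokenize(texts, name, length):
    # Hypothetical worker: truncate each source file to the first `length`
    # whitespace tokens and write the result to '<name>_tokens.pkl'.
    tokens = [text.split()[:length] for text in texts]
    with open('{}_tokens.pkl'.format(name), 'wb') as f:
        pickle.dump(tokens, f)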
Example #6
import sys

sys.path.append('..')
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
import numpy as np
from tensorflow.keras.layers import Embedding, Dense, LSTM, Conv1D, MaxPool1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import plot_model
from tools.datatools import load_data

train_x, train_y, dev_x, dev_y, _, test_x, _, _ = load_data('../data_dir')

# Word CNN model.

tokenizer = Tokenizer(num_words=2048)  # Tokenize source codes using the top 2048 words
tokenizer.fit_on_texts(train_x)
train_y = np.array(train_y)
dev_y = np.array(dev_y)
train_x = tokenizer.texts_to_sequences(train_x)
dev_x = tokenizer.texts_to_sequences(dev_x)

train_x = pad_sequences(train_x, maxlen=512)  # Pad sequences to uniform length
dev_x = pad_sequences(dev_x, maxlen=512)
callback_list = [
    EarlyStopping(monitor='val_acc', patience=5),
    ModelCheckpoint(filepath='word_model2.h5',
                    monitor='val_acc',
                    save_best_only=True),
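    # The listing above is truncated inside callback_list. What follows is a
    # hedged sketch of how it might continue, based only on the layers this
    # snippet imports (Embedding, Conv1D, MaxPool1D, LSTM, Dense); the third
    # callback, layer sizes and training settings are assumptions, not the
    # original code.
    ReduceLROnPlateau(monitor='val_acc', factor=0.1, patience=3),
]

model = Sequential()
model.add(Embedding(2048, 64))  # 2048-word vocabulary, 64-d embeddings
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPool1D(2))
model.add(LSTM(128))
model.add(Dense(1000, activation='softmax'))  # one output per candidate author
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['acc'])
model.fit(train_x, train_y, epochs=100, batch_size=128,
          validation_data=(dev_x, dev_y), callbacks=callback_list)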
Example #7
def main(args):
    start = time.time()
    print('Settings:')
    print(str(args)[10:-1])
    ngram_size = args.ngram
    profile_len = args.features
    author_profiles = {}  # author n-gram profiles 0-999
    # _, train_x, train_y = prep_inputs('train', bytes=True)
    # _, dev_x, dev_y = prep_inputs('dev', bytes=True)
    train_x, train_y, dev_x, dev_y, _, _, _, _ = load_data(
        '../data_dir/', bytes=args.byte, preprocess=args.preprocessed)

    print("Dataset loaded.")
    for i in range(len(train_x)):
        single_profile = generate_profile(
            train_x[i], ngram_size)  # creates profile for each code file

        # Append to the existing author profile, or create a new one if it
        # doesn't already exist.
        if train_y[i] in author_profiles:
            author_profiles[train_y[i]] = append_profile(
                author_profiles[train_y[i]], single_profile)
        else:
            author_profiles[train_y[i]] = single_profile
    for author in author_profiles:
        if profile_len >= 0:
            author_profiles[author] = set(
                dictionary_to_list(author_profiles[author])[:profile_len])
        # Each author profile is now a set of the top profile_len number of features.
        elif profile_len == -1:
            print("HYPER")
            auth_dict = author_profiles[author]  # count dictionary for author
            keys = list(auth_dict)  # list of ngrams
            for key in keys:  # if a key only appears once, remove it
                if auth_dict[key] == 1:
                    del auth_dict[key]
            author_profiles[author] = set(
                dictionary_to_list(author_profiles[author]))

    print("Author profiles ready.")

    count_total = 0
    count_success = 0

    dev_x = [
        dictionary_to_list(generate_profile(x, ngram_size)) for x in dev_x
    ]
    print('Dev ready for comparisons.')
    start_time = time.time()
    for i in range(len(dev_x)):

        actual = dev_y[i]
        result = compare_to_profiles(dev_x[i], author_profiles)
        count_total += 1
        if actual == result:
            count_success += 1
        if ((i + 1) % 250) == 0:
            percent = int((i + 1) / 250)
            print("Progress: {}%".format(percent))
            print("Accuracy so far: {}".format(count_success / count_total))
            time_secs = (
                (time.time() - start_time) / percent) * (100 - percent)
            time_mins = int(time_secs // 60)
            time_secs = str(int(time_secs % 60)).zfill(2)
            print("Time remaining: {}:{}".format(time_mins, time_secs))
    print('Total Guesses: {}'.format(count_total))
    print('Correct Guesses: {}'.format(count_success))
    print('Guess accuracy: {}'.format(count_success / count_total))
    print('n-grams: {}'.format(ngram_size))
    print('Profile length: {}'.format(profile_len))
    print(time.time() - start)
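compare_to_profiles does the actual attribution in Examples #2 and #7 but is not shown in this listing. A minimal sketch, assuming the usual common-n-gram rule of picking the author whose profile set shares the most n-grams with the document:

def compare_to_profiles(doc_ngrams, author_profiles):
    # Return the author whose profile overlaps most with the document's
    # n-grams (scoring rule assumed; the project may weight matches differently).
    doc_set = set(doc_ngrams)
    best_author, best_score = None, -1
    for author, profile in author_profiles.items():
        score = len(doc_set & profile)
        if score > best_score:
            best_author, best_score = author, score
    return best_author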
Example #8
linear_output_scaling_factor = 409600000000
path_to_dataset = os.path.join('..', '..', '..', 'Daten',
                               'dataset_size32_stride16_split')
data_list = datatools.get_balanced_dataset(path_to_dataset=path_to_dataset,
                                           clip=5000)

# Shuffle the dataset
np.random.shuffle(data_list)

train_list = data_list[0:10000]
val_list = data_list[10000:15000]
test_list = data_list[15000:200000]

X_train, y_train = datatools.load_data(path_to_dataset=path_to_dataset,
                                       data_list=train_list,
                                       input_shape=(32, 32, 32),
                                       standardization_mode='per_sample',
                                       border=None)
X_val, y_val = datatools.load_data(path_to_dataset=path_to_dataset,
                                   data_list=val_list,
                                   input_shape=(32, 32, 32),
                                   standardization_mode='per_sample',
                                   border=None)
X_test, y_test = datatools.load_data(path_to_dataset=path_to_dataset,
                                     data_list=test_list,
                                     input_shape=(32, 32, 32),
                                     standardization_mode=None,
                                     border=None)

# Expand the dimensions for channels
X_train = X_train[:, :, :, :, np.newaxis]
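datatools.load_data standardizes the train and validation volumes with standardization_mode='per_sample' but leaves the test split unstandardized. A sketch of what per-sample standardization usually means in this setting, assuming each 32x32x32 volume is centred and scaled by its own statistics:

import numpy as np

def standardize_per_sample(volumes, eps=1e-8):
    # Zero-mean, unit-variance scaling applied independently to every sample
    # (a guess at what standardization_mode='per_sample' does inside datatools).
    out = np.empty_like(volumes, dtype='float32')
    for i, v in enumerate(volumes):
        out[i] = (v - v.mean()) / (v.std() + eps)
    return out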
Example #9
def main(args):
    start = time.time()
    print('Settings:')
    print(str(args)[10:-1])
    ngram_size = args.ngram

    profile_len = args.features
    author_profiles = {}  # author n-gram profiles 0-999
    # _, train_x, train_y = prep_inputs('train', bytes=True)
    # _, dev_x, dev_y = prep_inputs('dev', bytes=True)
    train_x, train_y, dev_x, dev_y, _, _, _, _ = load_data(
        '../data_dir/', bytes=args.byte, preprocess=args.preprocessed)

    print("Dataset loaded.")
    for i in range(len(train_x)):
        single_profile = generate_profile(
            train_x[i], ngram_size)  # creates profile for each code file

        # Append to the existing author profile, or create a new one if it
        # doesn't already exist.
        if train_y[i] in author_profiles:
            author_profiles[train_y[i]] = append_profile(
                author_profiles[train_y[i]], single_profile)
        else:
            author_profiles[train_y[i]] = single_profile

    # for author in author_profiles:
    #     author_profiles[author] = set(dictionary_to_list(author_profiles[author])[:profile_len])

    for author in author_profiles:
        if profile_len >= 0:
            author_profiles[author] = set(
                dictionary_to_list(author_profiles[author])[:profile_len])
        # Each author profile is now a set of the top profile_len number of features.
        elif profile_len == -1:
            auth_dict = author_profiles[author]  # count dictionary for author
            keys = list(auth_dict)  # list of ngrams
            for key in keys:  # if a key only appears once, remove it
                if auth_dict[key] == 1:
                    del auth_dict[key]
            author_profiles[author] = set(
                dictionary_to_list(author_profiles[author]))
    # lowest=999999999
    # highest=0
    # cx=0
    # for a in author_profiles:
    #     cx+=len(author_profiles[a])
    #     lowest=min(lowest,len(author_profiles[a]))
    #     highest = max(highest, len(author_profiles[a]))
    # print(cx)
    # print(lowest)
    # print(highest)
    # exit()

    print("Author profiles ready.")

    dev_x = [
        dictionary_to_list(generate_profile(x, ngram_size)) for x in dev_x
    ]
    print('Dev ready for comparisons.')
    start_time = time.time()

    size = int(25000 / cpu_count())

    smaller_chunks = [dev_x[x:x + size] for x in range(0, len(dev_x), size)]
    labels = [dev_y[x:x + size] for x in range(0, len(dev_y), size)]
    with ProcessPoolExecutor() as executor:
        results = [
            executor.submit(calculate_profiles, smaller_chunks[i], labels[i],
                            author_profiles, i)
            for i in range(len(smaller_chunks))
        ]

        total = 0
        success = 0
        for r in as_completed(results):
            r = r.result()
            success += r[0]
            total += r[1]
    print(success)
    print(total)
    print(success / total)
    print(time.time() - start_time)
Example #10
    standardization_mode=standardization_mode,
    linear_output_scaling_factor=linear_output_scaling_factor,
    border=border)

history = cnn.fit_generator(epochs=epochs,
                            train_generator=train_generator,
                            val_generator=val_generator,
                            callbacks=callbacks)

#%%############################################################################
# Evaluate the model
###############################################################################
# Load unstandardized test data
X_test_data, y_test_data = datatools.load_data(path_to_dataset=path_to_dataset,
                                               data_list=test_list,
                                               input_shape=data_shape,
                                               standardization_mode=None,
                                               border=border)
if evaluate:
    test_loss = cnn.evaluate_model(X_test=np.expand_dims(X_test_data, axis=4),
                                   y_test=np.expand_dims(y_test_data, axis=4),
                                   batch_size=batch_size)
    print(test_loss)

#%%############################################################################
# Save the model
###############################################################################

cnn.save_model_json(model_export_path, 'model_json')
cnn.save_model_weights(model_export_path, 'model_weights')
cnn.save_model_single_file(model_export_path, 'model_single')
Example #11
def main(args):
    print('Settings:')
    print(str(args)[10:-1])
    train_x, train_y, dev_x, dev_y, _, _, _, _ = load_data(
        args.data, bytes=args.byte, preprocess=args.preprocessed)
    dev_orig = dev_x.copy()  # Append [:2500] to run on a shorter dev set.
    print("Dataset loaded.")
    ngram_sizes = [int(n) for n in args.ngrams.split()]
    profile_lens = [int(n) for n in args.features.split()]
    print(ngram_sizes)
    print(profile_lens)

    for ngram_size in ngram_sizes:
        author_profiles = {}
        dev_x = dev_orig.copy()
        for i in range(len(train_x)):
            single_profile = generate_profile(train_x[i], ngram_size)

            if train_y[i] in author_profiles:
                author_profiles[train_y[i]] = append_profile(
                    author_profiles[train_y[i]], single_profile)
            else:
                author_profiles[train_y[i]] = single_profile
        author_profiles_backup = author_profiles.copy()

        dev_x = [
            dictionary_to_list(generate_profile(x, ngram_size)) for x in dev_x
        ]

        #############################

        for profile_len in profile_lens:
            # Restore the full author profiles, then trim each to the requested length.
            author_profiles = author_profiles_backup.copy()
            for author in author_profiles:
                if profile_len >= 0:
                    author_profiles[author] = set(
                        dictionary_to_list(
                            author_profiles[author])[:profile_len])
                # Each author profile is now a set of the top profile_len number of features.
                elif profile_len == -1:
                    # print("HYPER")
                    auth_dict = author_profiles[author]  # count dictionary for author
                    keys = list(auth_dict)  # list of ngrams
                    for key in keys:  # if a key only appears once, remove it
                        if auth_dict[key] == 1:
                            del auth_dict[key]
                    author_profiles[author] = set(
                        dictionary_to_list(author_profiles[author]))
            print("Running {}@{}".format(ngram_size, profile_len))

            start_time = time.time()
            processes = cpu_count()
            size = int(25000 / processes)

            smaller_chunks = [
                dev_x[x:x + size] for x in range(0, len(dev_x), size)
            ]
            labels = [dev_y[x:x + size] for x in range(0, len(dev_y), size)]
            with ProcessPoolExecutor() as executor:
                results = [
                    executor.submit(calculate_profiles, smaller_chunks[i],
                                    labels[i], author_profiles)
                    for i in range(len(smaller_chunks))
                ]

                total = 0
                success = 0
                for r in as_completed(results):
                    r = r.result()
                    success += r[0]
                    total += r[1]
                executor.shutdown(wait=True)

            # print(success)
            # print(total)
            # print(success / total)
            print(time.time() - start_time)
            print('Total Guesses: {}'.format(total))
            print('Correct Guesses: {}'.format(success))
            print('Guess accuracy: {}'.format(success / total))
            print(
                '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
            )
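calculate_profiles is the worker that Examples #9 and #11 hand to the ProcessPoolExecutor; it is not shown in this listing. A minimal sketch consistent with how its return value is consumed (correct count first, total second), assuming it reuses the compare_to_profiles helper sketched after Example #7:

def calculate_profiles(dev_chunk, label_chunk, author_profiles, worker_id=None):
    # Score one chunk of dev documents inside a worker process and return
    # (correct_guesses, total_guesses). The extra argument Example #9 passes
    # is assumed to be a worker id for progress reporting.
    success = 0
    for doc_ngrams, actual in zip(dev_chunk, label_chunk):
        if compare_to_profiles(doc_ngrams, author_profiles) == actual:
            success += 1
    return success, len(dev_chunk)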
Example #12
def main(args):
    print('Settings:')
    print(str(args)[10:-1])

    length = 136
    print('Loading data...')

    if args.no_recalc:
        # This model makes use of both raw and preprocessed source codes.
        train_x, _, dev_x, _, _, test_x, _, _ = load_data('../data_dir')
        train_x2, _, dev_x2, _, _, test_x2, _, _ = load_data('../data_dir',
                                                             preprocess=True)
        print("Extracting stylometric features...")
        # Runs the stylometry vectorizer from vectorizer.py so that character
        # features can be extracted at the same time.
        vec = Vectorizer('lexical')
        train_x = vec.vectorize(train_x, train_x2)  # Vectorize all 3 subsets
        dev_x = vec.vectorize(dev_x, dev_x2)
        test_x = vec.vectorize(test_x, test_x2)
        del train_x2, dev_x2, test_x2
        scaler = MinMaxScaler()  # Rescale values between 0 and 1.
        print("Rescaling...")
        train_x = scaler.fit_transform(train_x)
        dev_x = scaler.transform(dev_x)
        test_x = scaler.transform(test_x)
        length = len(train_x[0])
        print(length)
        trainmm = np.memmap('vectors/train.mm',
                            dtype='float32',
                            mode='w+',
                            shape=(50000, length))
        trainmm[:] = train_x[:]
        devmm = np.memmap('vectors/dev.mm',
                          dtype='float32',
                          mode='w+',
                          shape=(25000, length))
        devmm[:] = dev_x[:]
        testmm = np.memmap('vectors/test.mm',
                           dtype='float32',
                           mode='w+',
                           shape=(25000, length))
        testmm[:] = test_x[:]
        del trainmm, devmm, testmm, train_x, dev_x, test_x  # Save and flush all vectors.
        print("Finished building vectors.")
    # Load data from file.
    train_y, dev_y, _ = load_all_labels('../data_dir')
    dev = np.array(
        np.memmap('vectors/dev.mm',
                  dtype='float32',
                  mode='r',
                  shape=(25000, length)))
    test = np.array(
        np.memmap('vectors/test.mm',
                  dtype='float32',
                  mode='r',
                  shape=(25000, length)))

    train = np.array(
        np.memmap('vectors/train.mm',
                  dtype='float32',
                  mode='r',
                  shape=(50000, length)))
    # Model.
    callback_list = [
        EarlyStopping(monitor='val_acc', patience=10),
        ModelCheckpoint(filepath='style_model.h5',
                        monitor='val_acc',
                        save_best_only=True),
        ReduceLROnPlateau(monitor='val_acc', factor=0.1, patience=5)
    ]
    model = Sequential()
    model.add(Dense(500, activation='relu', input_shape=(length, )))  # match the computed feature count
    model.add(Dropout(0.3))
    model.add(Dense(500, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(1000, activation='softmax'))
    opt = RMSprop(learning_rate=0.001)
    model.compile(optimizer=opt,
                  loss='sparse_categorical_crossentropy',
                  metrics=['acc'])
    model.summary()
    model.fit(train,
              train_y,
              epochs=1000,
              batch_size=250,
              validation_data=(dev, dev_y),
              shuffle=True,
              callbacks=callback_list)
    model = load_model('style_model.h5')
    print(model.evaluate(dev, dev_y))
    # Generate predictions.
    predict_vec = np.memmap('vectors/dev_style.mm',
                            dtype='float32',
                            mode='w+',
                            shape=(25000, 1000))
    predict_vec[:] = model.predict(dev)[:]
    del predict_vec

    predict_vec2 = np.memmap('vectors/test_style.mm',
                             dtype='float32',
                             mode='w+',
                             shape=(25000, 1000))
    predict_vec2[:] = model.predict(test)[:]
    del predict_vec2
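The Vectorizer('lexical') class used in Example #12 lives in the project's vectorizer.py and is not reproduced here. A heavily simplified sketch of what such a stylometric vectorizer could look like, assuming it derives numeric style features from the raw and preprocessed versions of each source file (the real 'lexical' mode produces roughly 136 features; the ones below are illustrative only):

import numpy as np

class Vectorizer:  # hypothetical stand-in for the project's vectorizer.py
    def __init__(self, mode):
        self.mode = mode

    def vectorize(self, raw_codes, preprocessed_codes):
        # A handful of simple lexical/stylometric features per file.
        feats = []
        for raw, pre in zip(raw_codes, preprocessed_codes):
            lines = raw.splitlines() or ['']
            feats.append([
                len(raw),                                 # file length in characters
                len(lines),                               # number of lines
                float(np.mean([len(l) for l in lines])),  # average line length
                raw.count('\t') / max(len(raw), 1),       # tab density
                raw.count(' ') / max(len(raw), 1),        # space density
                len(pre.split()) / max(len(lines), 1),    # preprocessed tokens per line
            ])
        return np.array(feats, dtype='float32')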