Example #1
def prepare_data(args):
    '''
    You may need to modify these directory paths.
    input_dir points to the original data, which may not yet be split.
    We recommend writing your own preprocess_data; its job is to split the
    original data into train and val sets. The resulting directory tree may
    look like the following:
    data/
       data_dir/
           train/
              class1/
              class2/
              ...
              classn/
           val/
              class1/
              class2/
              ...
              classn/

    '''
    input_dir = args.input_dir
    output_dir = args.output_dir
    ratio = args.train_test_ratio
    preprocess_data(input_dir, output_dir, ratio)
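preprocess_data itself is left to the user here. A minimal sketch of what such a helper could look like, assuming ratio is the train fraction and that each class lives in its own folder under input_dir (this is an illustration, not the original implementation):

import os
import random
import shutil


def preprocess_data(input_dir, output_dir, ratio, seed=0):
    # Illustrative split: copy each class folder into train/ and val/
    # under output_dir, keeping `ratio` of the files for training.
    rng = random.Random(seed)
    for class_name in sorted(os.listdir(input_dir)):
        class_dir = os.path.join(input_dir, class_name)
        if not os.path.isdir(class_dir):
            continue
        files = sorted(os.listdir(class_dir))
        rng.shuffle(files)
        n_train = int(len(files) * ratio)
        splits = (("train", files[:n_train]), ("val", files[n_train:]))
        for split, names in splits:
            dest = os.path.join(output_dir, split, class_name)
            os.makedirs(dest, exist_ok=True)
            for name in names:
                shutil.copy(os.path.join(class_dir, name),
                            os.path.join(dest, name))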
Example #2
def get_data(f):
    file_name = f.split("/")[-1].split("\\")[-1].split('.')[0]
    dir_path = f.split(file_name)[0]  # directory part of the input path
    input_color = get_color_array(f)
    height = input_color.shape[0]
    width = input_color.shape[1]
    input_shape = (height, width, 7)
    features_path = dir_path + file_name + '_features.txt'
    input_features = get_features_array(input_shape, features_path)
    grad = get_grad_array(dir_path, file_name)
    # preprocess each color channel with its matching gradient planes
    # (grad channels 0-2 hold x-gradients, 3-5 hold y-gradients for R, G, B)
    r = preprocess_data(
        input_color[:, :, :1], input_features,
        np.concatenate((grad[:, :, :1], grad[:, :, 3:4]), axis=-1),
        input_shape)
    g = preprocess_data(
        input_color[:, :, 1:2], input_features,
        np.concatenate((grad[:, :, 1:2], grad[:, :, 4:5]), axis=-1),
        input_shape)
    b = preprocess_data(
        input_color[:, :, 2:3], input_features,
        np.concatenate((grad[:, :, 2:3], grad[:, :, 5:6]), axis=-1),
        input_shape)

    return r, g, b
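The three preprocess_data calls above differ only in which color plane and gradient planes they slice out (channels 0-2 of grad are the x-gradients and 3-5 the y-gradients, as Example #3 suggests). Purely as an optional refactor sketch, the same logic can be written as a loop; it reuses the same preprocess_data:

import numpy as np


def preprocess_channels(input_color, input_features, grad, input_shape):
    # Loop over R, G, B: pair color plane c with gradient planes c and c + 3.
    channels = []
    for c in range(3):
        channel_grad = np.concatenate(
            (grad[:, :, c:c + 1], grad[:, :, c + 3:c + 4]), axis=-1)
        channels.append(preprocess_data(
            input_color[:, :, c:c + 1], input_features,
            channel_grad, input_shape))
    return tuple(channels)  # (r, g, b)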
Example #3
def get_testdata():
    test_data_list = []
    dir_list = []
    colorfiles = glob(os.path.join(data_dir, '*.exr'))
    for f in colorfiles:

        scene = f.split("/")[-1].split("\\")[-1].split('.')[0]
        print(scene)

        input_color = get_color_array(f)
        height = input_color.shape[0]
        width = input_color.shape[1]
        input_shape = (height, width, 7)

        features_path = os.path.join(data_dir, scene + '_features.txt')
        input_features = get_features_array(input_shape, features_path)

        grad = get_grad_array(data_dir, scene)
        # preprocessing the data
        data = preprocess_data(input_color, input_features, grad, input_shape)
        outputDir = 'data/test/res/%s/' % scene

        if not os.path.exists(outputDir):
            os.makedirs(outputDir)
            writeEXR(input_color, (outputDir + 'origin.exr'))
            writeEXR(input_features[:, :, 0:3], (outputDir + 'normal.exr'))
            writeEXR(input_features[:, :, 4:7], (outputDir + 'albedo.exr'))
            writeEXR(grad[:, :, 0:3], (outputDir + 'gx.exr'))
            writeEXR(grad[:, :, 3:6], (outputDir + 'gy.exr'))

        test_data_list.append(data)
        # test_grad_list.append(grad)
        dir_list.append(outputDir)

    return test_data_list, dir_list
Example #4
def run(input_file, test_file, k):
    clf = RandomForestClassifier(n_estimators=k)
    df = preprocess_data(input_file)
    X, label_dict, feature_dict = extract_features(df)  # avoid shadowing the built-in dict
    r, c = X.shape
    dft = preprocess_testdata(test_file)
    Xt, yt = extract_testfeatures(dft, label_dict, feature_dict)
    clf.fit(X[:, 0:c - 1], X[:, c - 1])  # the last column of X holds the label
    z = clf.predict(Xt)
    print(accuracy_score(yt, z))
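run() prints only the overall accuracy. As an optional addition (not part of the original), the same predictions can be summarized in more detail with scikit-learn's report helpers:

from sklearn.metrics import classification_report, confusion_matrix


def report(yt, z):
    # Per-class precision/recall/F1 plus the confusion matrix for the
    # predictions z against the ground-truth labels yt computed in run().
    print(classification_report(yt, z))
    print(confusion_matrix(yt, z))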
Example #5
def test_tokenizer():
    import util

    f = open("nltk_tokens.txt", "w+")

    polluter_tweets = "content_polluters_tweets.txt"
    tweets = util.load_data(fn=polluter_tweets, delimiter="\t", usecols=2)
    sentences = util.preprocess_data(tweets)

    dicti = {}
    for tokens in sentences:
        for token in tokens:
            if not token in dicti:
                dicti[token]=True

    for key in dicti.keys():
        f.write(key.encode('utf-8') + "\n")
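util.preprocess_data is not shown in this example; it only has to turn the raw tweets into a list of token lists. A rough stand-in using NLTK's tokenizer might look like this (the tokenizer choice and lower-casing are assumptions):

import nltk


def preprocess_data(tweets):
    # Hypothetical stand-in: tokenize each tweet with NLTK.
    # nltk.download('punkt') may be required the first time this runs.
    return [nltk.word_tokenize(str(tweet).lower()) for tweet in tweets]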
Example #6
def main(country_code_file_path, twitter_data_path):
    """
    :param country_code_file_path: the path of country_code_file
    :param twitter_data_path: the path of twitter_data
    """
    program_start = time.time()
    # initialize communicator
    comm = MPI.COMM_WORLD
    comm_rank = comm.Get_rank()
    comm_size = comm.Get_size()

    language_code_dict = None

    # read the country-code mapping in the master process
    if comm_rank == 0:
        dump_num_processor(comm_size)
        # the mapping is only needed in the master process for the final output
        language_code_dict = read_language_code_dict(country_code_file_path)

    # counting hash_tag
    hash_tag_count = Counter()
    language_code_count = Counter()

    # **********************************************************************************************
    #                      Algorithm 1: parallel twitter file reading & processing
    # **********************************************************************************************

    # calculating number of lines of data to be processed, line to start, line to end
    n_lines = comm.bcast(read_n_lines(twitter_data_path), root=0)
    lines_per_core = n_lines // comm_size
    # end bound for the last processor, covering all remaining lines
    lines_to_end = n_lines + 1  # line 0 (the header) is skipped
    # the index of the first line to be processed by this processor
    line_to_start = 1 + lines_per_core * comm_rank  # line 0 (the header) is skipped
    # the exclusive end index of this processor's line range
    line_to_end = line_to_start + lines_per_core
    if comm_rank == comm_size - 1:  # last core to finish all remaining lines
        line_to_end = lines_to_end

    # process lines in the range line_to_start <= line_number < line_to_end
    for line_number, line in enumerate(
            read_data_line_by_line(twitter_data_path)):  # line 0 is the header
        if line_number == line_to_end:
            break
        if line_number >= line_to_start:
            preprocessed_line = preprocess_data(line)
            if preprocessed_line:
                processing_data(preprocessed_line, hash_tag_count,
                                language_code_count)

    # **********************************************************************************************
    #                      Algorithm 2: parallel top-n calculation
    # **********************************************************************************************
    n = 10
    # a) concurrent calculating top n hash_tags, languages used
    if comm_size > 1:
        # 1) merge Counter from each processor
        reduced_language_code_count = comm.reduce(language_code_count,
                                                  root=0,
                                                  op=operator.add)
        reduced_hash_tag_count = comm.reduce(hash_tag_count,
                                             root=0,
                                             op=operator.add)
        # 2) split merged to each processor
        if comm_rank == 0:
            split_language_code_np_array = np.array_split(
                list(reduced_language_code_count.items()), comm_size)
            split_hash_tag_np_array = np.array_split(
                list(reduced_hash_tag_count.items()), comm_size)
        else:
            split_language_code_np_array = None
            split_hash_tag_np_array = None

        # 3) scatter merged to each processor
        local_language_code = list(
            map(lambda x: (x[0], int(x[1])),
                comm.scatter(split_language_code_np_array, root=0)))
        local_hash_tag = list(
            map(lambda x: (x[0], int(x[1])),
                comm.scatter(split_hash_tag_np_array, root=0)))

        # 4) merge each processor's top-n results
        reduced_language_code_count = comm.reduce(
            heapq.nlargest(n, local_language_code, key=lambda x: x[1]),
            root=0, op=merge_list)
        reduced_hash_tag_count = comm.reduce(
            heapq.nlargest(n, local_hash_tag, key=lambda x: x[1]),
            root=0, op=merge_list)
    # b) single processor calculating top n
    else:
        reduced_hash_tag_count = hash_tag_count.most_common(n)
        reduced_language_code_count = language_code_count.most_common(n)

    # output summary in root process
    if comm_rank == 0:
        dump_hash_tag_output(reduced_hash_tag_count)
        dump_country_code_output(reduced_language_code_count,
                                 language_code_dict)

        program_run_time = time.time() - program_start
        print("Programs runs {}(s)".format(program_run_time))
Example #7
util.mkdirp('MODELS')

# Load dataset and prepare data

print('Loading dataset...')
X_train, X_test, Y_train, Y_test = util.load_ABCDE_datasets(
    args.path, args.cv, args.lnoise, args.anoise)

nb_classes = len(np.unique(Y_train))
img_rows = img_cols = int(math.sqrt(X_train.shape[1] / args.channels))

Yc_train = np_utils.to_categorical(Y_train, nb_classes)
Yc_test = np_utils.to_categorical(Y_test, nb_classes)

if args.pre is not None and args.pre != 'None':
    X_train, scaler = util.preprocess_data(args.pre, X_train)
    X_test, scaler = util.preprocess_data(args.pre, X_test, scaler)

X_train, X_test, input_shape = reshape(X_train, X_test, args.flat,
                                       args.channels, img_rows, img_cols)

print('dbname:', dbname)
print('full dbname:', fulldbname)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
print('nb_classes:', nb_classes)
print('channels:', args.channels)
print('input:', input_shape)
print('preprocess:', args.pre)
print('Label noise:', args.lnoise)
print('Attribute noise:', args.anoise)
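The reshape helper is defined elsewhere in this project. A minimal sketch of what it might do, assuming X_train and X_test arrive as flat (n_samples, rows * cols * channels) arrays and that flat means "keep them flat":

def reshape(X_train, X_test, flat, channels, img_rows, img_cols):
    # Sketch only: the real helper may order dimensions differently.
    if flat:
        input_shape = (img_rows * img_cols * channels,)
        return X_train, X_test, input_shape
    input_shape = (img_rows, img_cols, channels)
    return (X_train.reshape((-1,) + input_shape),
            X_test.reshape((-1,) + input_shape),
            input_shape)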
Example #8
               " Epochs {epochs}\n"
               " Base Learning Rate 2^{base_learning_rate}\n"
               "  Learning Rate {learning_rate}\n"
               " Loss Scaling {loss_scaling}\n"
               " Weight Decay {weight_decay}\n")
    if not opts.no_validation:
        log_str += ("Validation Graph.\n"
                    " Dataset {validation_data}\n"
                    " Batch Size {validation_batch_size}\n")
    log_str += "Checkpoint Path {checkpoint_path}\n"

    print(log_str.format(**vars(opts)))

    # If the data is not already preprocessed, preprocess it
    if not opts.use_synthetic_data and not util.is_preprocessed(opts.datafolder):
        util.preprocess_data(opts.datafolder)

    print("Loading training data")
    opts.training_data = MLPData(opts, data_path=opts.training_data)
    print(f"Rows: {opts.training_data._size}")

    print("Loading evaluation data")
    opts.validation_data = MLPData(opts, data_path=opts.validation_data)
    print(f"Rows: {opts.validation_data._size}")

    # If using synthetic data, set the environment variable required
    if opts.use_synthetic_data:
        if 'TF_POPLAR_FLAGS' in os.environ:
            os.environ['TF_POPLAR_FLAGS'] += ' --use_synthetic_data --synthetic_data_initializer=random'
        else:
            os.environ['TF_POPLAR_FLAGS'] = '--use_synthetic_data --synthetic_data_initializer=random'
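util.is_preprocessed and util.preprocess_data are external to this snippet. One common way to implement the guard above is a marker file in the data folder, sketched here as an illustration (the marker name is made up):

import os


def is_preprocessed(datafolder):
    # Assumed convention: preprocess_data() drops a ".preprocessed" marker
    # file into the folder once it has finished.
    return os.path.exists(os.path.join(datafolder, ".preprocessed"))


def mark_preprocessed(datafolder):
    open(os.path.join(datafolder, ".preprocessed"), "w").close()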
Example #9
import sys
import os
import pickle
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer

from alphagan_class import AlphaGAN
from util import dump_column_transformers, load_column_transformers, split_data, preprocess_data

os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'


if __name__ == '__main__':
    if len(sys.argv) > 1:
        train_df = pd.read_csv(sys.argv[1])
        preprocess_data(train_df, './data/ranges.csv')

        X_train = train_df.to_numpy()

        ag = AlphaGAN()
        ag.train(X_train=X_train, epochs=4000, batch_size=32)
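preprocess_data comes from the project's util module and is not shown. Purely as a guess at its shape, an in-place min-max normalization against precomputed per-column ranges could look like this (the ranges.csv layout with name/min/max columns is an assumption):

import pandas as pd


def preprocess_data(df, ranges_csv):
    # Sketch: scale each listed column of df in place to [0, 1] using the
    # min/max recorded in ranges_csv.
    ranges = pd.read_csv(ranges_csv)
    for _, row in ranges.iterrows():
        col, lo, hi = row["name"], row["min"], row["max"]
        if col in df.columns and hi > lo:
            df[col] = (df[col] - lo) / (hi - lo)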

Example #10
import util
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from baseline import BaselinePredictor
from sklearn.svm import SVC

data = util.load_data()

preprocessed_data = util.preprocess_data(data)

X, Y = util.splitFeaturesAndLabel(preprocessed_data, 'Empathy')

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)

baseline_predictor = BaselinePredictor()

baseline_preds = util.trainAndPredict(X_train, Y_train, baseline_predictor,
                                      X_test)

print("Baseline accuracy and classification report")

util.printAccuracyAndClassficationReport(baseline_preds,
                                         Y_test,
                                         classes=['1', '2', '3', '4', '5'])

X_train, X_test = util.getBestFeatures(X_train, Y_train, X_test)

model = SVC(kernel='rbf')

params = {
    'C': [i for i in range(1, 11)],
Example #11
from keras.losses import MeanAbsoluteError

import keras.backend as K
import tensorflow as tf
import numpy as np

if __name__ == '__main__':
    session = K.get_session()
    init = tf.global_variables_initializer()
    session.run(init)

    ag = AlphaGAN()
    ag.load_pretrained_models('./snapshots/3900_')

    test_normal_df = pd.read_csv('./data/test_set_normal.csv')
    preprocess_data(test_normal_df, './data/ranges.csv')

    test_abnomal_df = pd.read_csv('./data/test_set_abnomal.csv')
    preprocess_data(test_abnomal_df, './data/ranges.csv')

    X_1 = test_normal_df.to_numpy()
    X_2 = test_abnomal_df.to_numpy()

    Z_hat_1 = ag.encoder.predict(X_1)
    X_hat_1 = ag.generator.predict(Z_hat_1)

    Z_hat_2 = ag.encoder.predict(X_2)
    X_hat_2 = ag.generator.predict(Z_hat_2)

    rec_losses_normal = np.linalg.norm(np.subtract(X_1, X_hat_1), axis=1)
    rec_losses_fraud = np.linalg.norm(np.subtract(X_2, X_hat_2), axis=1)
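The snippet stops at the per-sample reconstruction losses. An optional follow-up, not part of the original code, is to check how well those losses separate the normal and abnormal sets, for example with a ROC AUC:

    # Treat the reconstruction loss as an anomaly score (0 = normal, 1 = abnormal).
    from sklearn.metrics import roc_auc_score

    scores = np.concatenate([rec_losses_normal, rec_losses_fraud])
    labels = np.concatenate([np.zeros(len(rec_losses_normal)),
                             np.ones(len(rec_losses_fraud))])
    print("ROC AUC:", roc_auc_score(labels, scores))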
Example #12
import numpy as np
import lbann
import lbann.modules
from util import preprocess_data

# Data path: the directory where the patches are located
data_dir = 'data'
samples = preprocess_data(data_dir)

dims = len(samples[0])

num_classes = 3
num_channels = 14


# Sample access functions
def get_sample(index):
    sample = samples[index]
    return sample


def num_samples():
    return samples.shape[0]


def sample_dims():
    return [dims]


def str_list(l):
    return ' '.join([str(i) for i in l])
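get_sample, num_samples and sample_dims follow the sample-access convention that LBANN's Python data readers call by name. Purely as an illustration (this is not LBANN's actual reader code), a framework-agnostic consumer of these callbacks could assemble a mini-batch like this:

import numpy as np


def make_batch(batch_size, start=0):
    # Gather up to batch_size consecutive samples into an (n, dims) array.
    stop = min(start + batch_size, num_samples())
    rows = [get_sample(i) for i in range(start, stop)]
    return np.asarray(rows).reshape(len(rows), *sample_dims())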
Example #13
def main():
    tf.flags.DEFINE_string(
        "output_dir",
        "/Users/iiskin/Documents/workspace/PreProduction/sentiment_beam/output",
        "Directory to export the model run results")
    tf.flags.DEFINE_string(
        "input_data_dir",
        "/Users/iiskin/Documents/workspace/PreProduction/sentiment_beam/data",
        "Path to directory containing training and testing data")
    tf.flags.DEFINE_string("version", "default", "Version of your model")
    tf.flags.DEFINE_boolean("transform_data", False, "Preprocess raw data")

    tf.flags.DEFINE_integer("vocab_size", 20000, "Vocabulary size")
    tf.flags.DEFINE_integer("train_batch_size", 1000,
                            "Batch size for training")
    tf.flags.DEFINE_integer("train_num_epochs", 10000,
                            "Number of epochs for training")
    tf.flags.DEFINE_integer("num_train_instances", 2000,
                            "Number of training instances")
    tf.flags.DEFINE_integer("num_test_instances", 2000,
                            "Number of test instances")
    tf.flags.DEFINE_string("delimiters", ".,!?() ",
                           "Delimiters to be used in splitting text")

    FLAGS = tf.flags.FLAGS
    FLAGS._parse_flags()

    raw_file_dir = os.path.join(FLAGS.input_data_dir, 'raw')
    raw_metadata_dir = os.path.join(raw_file_dir, 'metadata')
    train_neg_file_pattern = os.path.join(raw_file_dir, 'train/negative/*')
    train_pos_file_pattern = os.path.join(raw_file_dir, 'train/positive/*')
    test_neg_file_pattern = os.path.join(raw_file_dir, 'test/negative/*')
    test_pos_file_pattern = os.path.join(raw_file_dir, 'test/positive/*')

    transformed_file_dir = os.path.join(FLAGS.input_data_dir, 'transformed')
    transformed_metadata_dir = os.path.join(transformed_file_dir, 'metadata')
    transformed_train_file_pattern = os.path.join(transformed_file_dir,
                                                  'train/*')
    transformed_test_file_pattern = os.path.join(transformed_file_dir,
                                                 'test/*')

    temp_dir = os.path.join(FLAGS.output_dir, "tmp")
    #model_run_dir = os.path.join(FLAGS.output_dir, datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S"))
    model_run_dir = os.path.join(FLAGS.output_dir, FLAGS.version)

    if not FLAGS.transform_data:
        if not os.path.exists(transformed_file_dir):
            raise Exception(
                "It doesn't look like the raw data has been transformed yet. Use transform_data flag to transform the raw data."
            )
    else:
        shutil.rmtree(transformed_file_dir, ignore_errors=True)
        util.preprocess_data(
            train_neg_file_pattern=train_neg_file_pattern,
            train_pos_file_pattern=train_pos_file_pattern,
            test_neg_file_pattern=test_neg_file_pattern,
            test_pos_file_pattern=test_pos_file_pattern,
            transformed_train_file_pattern=transformed_train_file_pattern,
            transformed_test_file_pattern=transformed_test_file_pattern,
            transformed_metadata_dir=transformed_metadata_dir,
            raw_metadata_dir=raw_metadata_dir,
            transform_func_dir=model_run_dir,
            temp_dir=temp_dir,
            vocab_size=FLAGS.vocab_size,
            delimiters=FLAGS.delimiters)

    print(
        "\nRun \"tensorboard --logdir {}\" to see the results on Tensorboard\n\n"
        .format(FLAGS.output_dir))
    learn_runner.run(experiment_fn=model.generate_experiment_fn(
        transformed_train_file_pattern=transformed_train_file_pattern,
        transformed_test_file_pattern=transformed_test_file_pattern,
        transformed_metadata_dir=transformed_metadata_dir,
        raw_metadata_dir=raw_metadata_dir,
        vocab_size=FLAGS.vocab_size,
        train_batch_size=FLAGS.train_batch_size,
        train_num_epochs=FLAGS.train_num_epochs,
        num_train_instances=FLAGS.num_train_instances,
        num_test_instances=FLAGS.num_test_instances),
                     output_dir=model_run_dir)
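tf.flags and FLAGS._parse_flags() are part of the TensorFlow 1.x API. If the flag handling were ported to plain argparse, the definitions above would translate roughly as follows (the two directory defaults are placeholders here, not the original paths):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--output_dir", default="output",
                    help="Directory to export the model run results")
parser.add_argument("--input_data_dir", default="data",
                    help="Path to directory containing training and testing data")
parser.add_argument("--version", default="default", help="Version of your model")
parser.add_argument("--transform_data", action="store_true",
                    help="Preprocess raw data")
parser.add_argument("--vocab_size", type=int, default=20000, help="Vocabulary size")
parser.add_argument("--train_batch_size", type=int, default=1000,
                    help="Batch size for training")
parser.add_argument("--train_num_epochs", type=int, default=10000,
                    help="Number of epochs for training")
parser.add_argument("--num_train_instances", type=int, default=2000,
                    help="Number of training instances")
parser.add_argument("--num_test_instances", type=int, default=2000,
                    help="Number of test instances")
parser.add_argument("--delimiters", default=".,!?() ",
                    help="Delimiters to be used in splitting text")
FLAGS = parser.parse_args()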
Example #14
def train_model():
    file = 'train.data'
    util.preprocess_data()


def construct_model(input_shape):
    # assumed: the first layers are truncated in this snippet; a Sequential
    # model with a 32-filter 3x3 Conv2D is used as a stand-in
    model = Sequential()
    model.add(Conv2D(32, kernel_size=(3, 3), activation='relu',
                     input_shape=input_shape))
    model.add(Conv2D(64, kernel_size=(3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))

    return model


if __name__ == "__main__":
    os.chdir('..')
    path = os.path.join(os.getcwd(), 'dataset/Epileptic Seizure Recognition.csv')
    X, Y = preprocess_data(path)
    x_train, x_test, y_train, y_test = train_test_split(X,
                                                        Y,
                                                        test_size=0.2,
                                                        random_state=0)

    # the image dimensions must be defined before the reshape calls below
    imheight, imwidth = (36, 54)
    input_shape = (imheight, imwidth, 1)

    x_train = x_train.reshape(x_train.shape[0], imheight, imwidth, 1)
    x_test = x_test.reshape(x_test.shape[0], imheight, imwidth, 1)

    model = construct_model(input_shape)

    model.compile(loss="binary_crossentropy",
                  optimizer="adam",
                  metrics=['accuracy'])
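
The snippet ends after compiling the model. For completeness only, training and evaluation would typically follow along these lines inside the same main block (batch size and epoch count are placeholder values):

    model.fit(x_train, y_train, batch_size=32, epochs=10,
              validation_data=(x_test, y_test))
    loss, acc = model.evaluate(x_test, y_test)
    print("test accuracy:", acc)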