Example #1
def test(model: keras.Model):
    model.compile(loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    x_test, y_test = data.load_test('cifar10', channel_first=False)
    test_iter = data.get_test_iterator(x_test, y_test)
    model.evaluate(test_iter,
                   batch_size=common.batch_size,
                   steps=len(x_test) // common.batch_size)
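
A minimal sketch of how this helper might be driven; the checkpoint path and the use of keras.models.load_model are assumptions for illustration, not part of the original snippet:

# Hypothetical driver for the test() helper above; 'model.h5' is a
# made-up checkpoint path.
import keras

model = keras.models.load_model('model.h5')
test(model)  # compiles the model and evaluates it on the CIFAR-10 test set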
Example #2
def __init__(self, vocab_size):
    # Length of the input sequence (sentence)
    self.sequence_length = FLAGS.sequence_length
    # Number of training epochs
    self.num_epochs = FLAGS.num_epochs
    # Batch size
    self.batch_size = FLAGS.batch_size
    # Vocabulary size
    self.vocab_size = vocab_size
    # Word embedding size
    self.embedding_size = FLAGS.embedding_size
    # Different filter sizes, corresponding to 1-gram, 2-gram, 3-gram and 5-gram
    self.filter_sizes = [1, 2, 3, 5]
    # Hidden layer size
    self.hidden_size = FLAGS.hidden_size
    # Number of filters per filter size
    self.num_filters = FLAGS.num_filters
    # The paper uses 0.0001
    self.l2_reg_lambda = FLAGS.l2_reg_lambda
    # Dropout keep probability
    self.keep_prob = FLAGS.keep_prob
    # Learning rate
    # The paper uses 0.01
    self.lr = FLAGS.lr
    # Margin
    # The paper uses 0.009
    self.m = FLAGS.margin
    # Configure GPU behaviour: allow ops that cannot run on the GPU to
    # fall back to the CPU, and control device-placement logging
    self.cf = tf.ConfigProto(allow_soft_placement=True,
                             log_device_placement=False)
    '''
    GPU memory usage policy
    '''
    # Grow GPU memory allocation as needed
    self.cf.gpu_options.allow_growth = True
    # Use only 20% of the GPU memory
    # self.cf.gpu_options.per_process_gpu_memory_fraction = 0.2
    self.test_data = data.load_test(self.sequence_length,
                                    self.sequence_length)
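
The class this __init__ belongs to is not shown in the excerpt; a minimal usage sketch, assuming it is a configuration class (called Config here) and that FLAGS has already been parsed:

# Hypothetical usage of the configuration above; the class name Config
# and the vocabulary size are assumptions made only for this sketch.
config = Config(vocab_size=10000)
with tf.Session(config=config.cf) as sess:
    sess.run(tf.global_variables_initializer())
    # build and run the model here using config.batch_size, config.lr, ...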
Example #3
print "m_paths"
print m_paths
metadata_path_all = glob.glob(m_paths)
print "length of metadata_path_all"
print(len(metadata_path_all))

if len(sys.argv) >= 4:
    subset = sys.argv[3]
    assert subset in ['train', 'valid', 'test', 'train_valid']
else:
    subset = "test"

num_seq = 64

if subset == "test":
    xb_test, _, xs_test, _ = data.load_test(CVsplit)
    dat = utils.add_dims_seq([xb_test, xs_test])
    xb_test = dat[0]
    xs_test = dat[1]
elif subset == "train":
    sys.exit(subset + ": not implemented yet")
elif subset == "train_valid":
    sys.exit(subset + ": not implemented yet")
else:
    sys.exit(subset + ": not implemented yet")

X = np.vstack((xb_test, xs_test))
n = np.size(X, axis=0)

print("X shape:")
print(X.shape)
Example #4
    norm = dfp.is_op('concatenate')(dfp.is_tuple(
        (dfp.wildcard(), dfp.wildcard(), dfp.wildcard(), dfp.wildcard(),
         dfp.wildcard(), dfp.wildcard())))
    red = dfp.is_op('concatenate')(dfp.is_tuple(
        (dfp.wildcard(), dfp.wildcard(), dfp.wildcard(), dfp.wildcard())))
    return [norm, red]


if __name__ == '__main__':
    from nasnet import get_model
    from resnet import ConvBnSubst
    import graph
    import work
    import data

    x_test, y_test = data.load_test('cifar10', channel_first=True)
    test_gen = data.TvmDataGen(x_test, y_test)
    nasnet = get_model(6, load_weights=True)
    wl_1 = work.Workload.from_keras(nasnet, dtype='float16')
    wl_1.mod = avg_include_pool(wl_1.mod)
    wl_2 = graph.SubstPass(ConvBnSubst)(wl_1)
    wl_3 = graph.SubstPass(ConvAddSubst)(wl_2)
    wl_4 = graph.SubstPass(AvgAddSubst)(wl_3)
    graph.visualize(wl_2, name='nasnet_cb', path='logs')
    # wl_1.evaluate(test_gen)
    # wl_2.evaluate(test_gen)
    # wl_3.evaluate(test_gen)
    # wl_4.evaluate(test_gen)
    # pat_list = _get_breakpoint_patterns()
    # rcd_1 = work.BreakpointRecord(wl_1, pat_list)
    # rcd_2 = work.BreakpointRecord(wl_2, pat_list)
Example #5
def trans_ans(y):
    ans = []
    for i in y:
        if i[1] >= i[0]:
            ans.append(1)
        else:
            ans.append(0)
    return ans


model = resnet.ResnetBuilder.build_resnet_50(
    (config.img_channels, config.img_rows, config.img_cols), config.nb_classes)

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.load_weights(config.model_path)

x_test = data.load_test(config.test_path)

y = model.predict(x_test)

y = trans_ans(y)

ids = list(range(4000, 7550))

submit = pd.DataFrame({'id': ids, 'y': y})

submit.to_csv(config.submitfile_path, index=False)
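
For the two-column probabilities returned by model.predict, trans_ans above can also be written as a single vectorised comparison; a sketch, assuming y is the raw (N, 2) prediction array:

# Equivalent to trans_ans(y): label 1 when the second column is greater
# than or equal to the first, 0 otherwise.
y_labels = (y[:, 1] >= y[:, 0]).astype(int).tolist()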
Example #6
import theano
import theano.tensor as T
import numpy as np
import sys

import data
import model
from theano_toolkit.parameters import Parameters
from theano_toolkit import updates


if __name__ == '__main__':
    model_filename = sys.argv[1]
    test_filename = sys.argv[2]
    train_filename = sys.argv[3]
    P = Parameters()
    data_X, df = data.load_test(test_filename, train_filename)
    f = model.build(P,
        input_size=data_X.shape[1],
        hidden_sizes=[256, 128, 64, 32]
    )
    X = T.matrix('X')
    predict = theano.function(
        inputs=[X],
        outputs=f(X, test=True) > 0.5,
    )
    P.load(model_filename)
    output = predict(data_X)
    print(data_X.shape)
    print(output.shape)
    print(df.values.shape)
    df['probs'] = output
Example #7
            writer.writerow(review.__dict__)


def remove_diacritic(input):
    """
    Accept a unicode string, and return a normal string without any diacritical marks.
    input arguments:
        input: the string to strip accents from
    output arguments:
        the stripped input
    """
    return unicodedata.normalize('NFKD', input).encode('ASCII', 'ignore')


if __name__ == "__main__":
    dataset = sys.argv[1]
    if dataset == 'train':
        reviews = data.load_train()
    elif dataset == 'test':
        reviews = data.load_test()
    else:
        raise ValueError('No dataset ' + dataset + ' found!')
    print "reviews loaded"
    reviews_dict_languages = split_by_language(reviews)

    for k, v in reviews_dict_languages.items():
        print(k)
        review_list = correct_spelling_and_stem(k, v)
        print("corrected and stemmed")
        save_reviews_to_csv(k, review_list, dataset)
        print("saved to csv")
Example #8
def main():
    logs = {
        'start-time': now(),
        'num_workers': PARTITIONS,
        'reg_lambda': REG_LAMBDA,
        'epochs': EPOCHS,
        'batch': BATCH,
        'learning_rate': LEARNING_RATE
    }
    # Logging configuration
    logging.basicConfig(filename='/data/logs/tmp_logs.txt',
                        level=logging.WARNING)

    logging.warning("{}:Loading Training Data...".format(now()))
    # Load data
    val_df, train_df = data.load_train(spark)

    # Collect validation for loss computation
    val_collected = val_df.collect()

    # Create initial weight vector
    dimensions = train_df.rdd \
                         .map(lambda row: max(row.features.keys())).max() + 1
    w = [0.0] * dimensions

    # Create the partitions of the train dataset
    partitions = train_df.rdd.zipWithIndex() \
                             .map(lambda x: (x[1], x[0])) \
                             .partitionBy(PARTITIONS)

    persistence = [0.0] * PERSISTENCE
    smallest_val_loss = float('inf')

    logs['start-compute-time'] = now()
    logging.warning("{}:Starting SGD...".format(logs['start-compute-time']))
    logs['epochs-stats'] = []
    for epoch in range(EPOCHS):
        epoch_stat = {'epoch_number': epoch, 'epoch_start': now()}
        logging.warning("{}:EPOCH:{}".format(now(), epoch))
        # Broadcast w to make it available for each worker
        w_b = sc.broadcast(w)
        # Calculate Mini Batch Gradient Descent for each partition
        partition_deltas_w = \
            partitions.mapPartitions(lambda x: sgd(x, w_b)).collect()
        # Collect total update weights for all workers in one epoch
        total_delta_w = {}
        for delta_w in partition_deltas_w:
            for k, v in delta_w.items():
                if k in total_delta_w:
                    total_delta_w[k] += v
                else:
                    total_delta_w[k] = v

        # Update weights
        for k, v in total_delta_w.items():
            w[k] += LEARNING_RATE * v

        val_loss = loss(val_collected, w)
        epoch_stat['val_loss'] = val_loss
        epoch_stat['epoch_end'] = now()
        logs['epochs-stats'].append(epoch_stat)
        logging.warning("{}:VAL. LOSS:{}".format(now(), val_loss))

        # Early stopping criteria
        persistence[epoch % PERSISTENCE] = val_loss
        if smallest_val_loss < min(persistence):
            # Early stop
            logging.warning("{}:EARLY STOP!".format(now()))
            break
        else:
            smallest_val_loss = min(smallest_val_loss, val_loss)

    logs['end-compute-time'] = now()

    logging.warning("{}:Calculating Train Accuracy".format(now()))
    train_accuracy = accuracy(train_df, w)
    logs['train_accuracy'] = train_accuracy

    logging.warning("{}:TRAIN ACC:{}".format(now(), train_accuracy))

    logging.warning("{}:Calculating Test Accuracy".format(now()))
    test_df = data.load_test(spark)
    test_accuracy = accuracy(test_df, w)
    logs['test_accuracy'] = test_accuracy

    logging.warning("{}:TEST ACC:{}".format(now(), test_accuracy))

    spark.stop()

    logs['end_time'] = now()
    with open(
            '/data/logs/logs.workers_{}.batch_{}.epochs_{}.time_{}.json'.
            format(PARTITIONS, BATCH, EPOCHS, logs['start-time']), 'w') as f:
        json.dump([logs], f)
Example #9
    model = MoCo(dim=args.moco_dim,
                 K=args.moco_k,
                 m=args.moco_m,
                 T=args.moco_t,
                 ver=args.version,
                 arch=args.arch,
                 bn_splits=args.bn_splits,
                 symmetric=args.symmetric,
                 v3_encoder=vit).cuda()

    print(model)
    # exit(0)

    train_data, train_loader = load_train(args)
    memory_data, memory_loader = load_memory(args)
    test_data, test_loader = load_test(args)

    # define optimizer
    if args.version == 3:
        optimizer = torch.optim.AdamW(model.parameters(),
                                      lr=args.lr,
                                      weight_decay=args.wd)
    else:
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=args.lr,
                                    weight_decay=args.wd,
                                    momentum=0.9)

    # load model if resume
    epoch_start = 1
    if args.resume != '':
Example #10
    print('Loading word2vec...')
    embeddings = gensim.models.KeyedVectors.load_word2vec_format(WORD2VEC,
                                                                 binary=True)

    print('Loading training data...')
    train = data.load_train()  #[:LIMIT]
    dic = data.load_dic()

    print('Computing tweet averages...')
    X = np.zeros(shape=(len(train), 300))
    y = np.zeros(shape=(len(train), ), dtype=int)
    for i, tweet in enumerate(train):
        X[i] = tweet_embedding_by_average(tweet[0], dic, embeddings)
        y[i] = tweet[1]

    print('Training the model...')
    clf = RandomForestClassifier(n_estimators=200, max_depth=10)
    clf.fit(X, y)

    print('Loading test data...')
    test = data.load_test()  #[:LIMIT]
    T = np.zeros(shape=(len(test), 300))
    ids = np.zeros(shape=(len(test), ))
    for i, tweet in enumerate(test):
        T[i] = tweet_embedding_by_average(tweet[0], dic, embeddings)
        ids[i] = tweet[1]

    print('Predicting...')
    Yhat = clf.predict(T)
    data.generate_submission(Yhat, ids)
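
tweet_embedding_by_average is not defined in this excerpt; a plausible reconstruction, assuming it simply averages the word2vec vectors of the tweet's in-vocabulary tokens (the signature mirrors the calls above, and dic is unused in this sketch):

def tweet_embedding_by_average(text, dic, embeddings, dim=300):
    # Hypothetical sketch: average the embeddings of tokens known to the
    # word2vec model; return a zero vector if none are known.
    vectors = [embeddings[word] for word in text.split() if word in embeddings]
    if not vectors:
        return np.zeros(dim)
    return np.mean(vectors, axis=0)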
Example #11
import pickle
from collections import defaultdict

import data
data.load_all_train()
data.load_test()

FLEXIBLE_PATTERN_RATIO = 1e-4


def main():
    print("counting all words from the whole dataset...")
    word_freq = defaultdict(int)
    total_num_words = 0

    for tweet in data.ALL_TRAIN + data.TEST:
        for word in tweet.normalised_text.lower().split():
            word_freq[word] += 1
            total_num_words += 1
    print("counted!")

    print("defining high frequency words...")
    hfw_threshold = total_num_words * FLEXIBLE_PATTERN_RATIO
    hfw = set()

    for word in word_freq:
        if word_freq[word] > hfw_threshold:
            hfw.add(word)

    # save to file
    print("saving to ../data/hfws.pickle...")
Example #12
            predictions = predictions + np.load(predictions_path)
#    print "shape of predictions and max"
#    print(predictions.shape)
#    print(predictions.max())
    predictions = predictions / len(predictions_path_all)  # evening it out
    #    print(predictions.max())
    import data

    if len(sys.argv) == 4:
        subset = sys.argv[3]
        assert subset in ['train', 'valid', 'test', 'train_valid']
    else:
        subset = "test"

    if subset == "test":
        xb_test, tb_test, _, ts_test = data.load_test(CVsplit)
    elif subset == "train":
        sys.exit(subset + ": not implemented yet")
    elif subset == "train_valid":
        sys.exit(subset + ": not implemented yet")
    else:
        sys.exit(subset + ": not implemented yet")

    t = np.vstack((tb_test, ts_test))
    n = np.size(t, axis=0)

    import utils
    AUC = utils.auc(predictions, t)
    total_AUC += AUC

    predictions = predictions > prob
Example #13
            predictions = predictions + np.load(predictions_path)
    #    print "shape of predictions and max"
    #    print(predictions.shape)
    #    print(predictions.max())
    predictions = predictions / len(predictions_path_all)  # evening it out
    #    print(predictions.max())
    import data

    if len(sys.argv) == 4:
        subset = sys.argv[3]
        assert subset in ["train", "valid", "test", "train_valid"]
    else:
        subset = "test"

    if subset == "test":
        xb_test, tb_test, _, ts_test = data.load_test(CVsplit)
    elif subset == "train":
        sys.exit(subset + ": not implemented yet")
    elif subset == "train_valid":
        sys.exit(subset + ": not implemented yet")
    else:
        sys.exit(subset + ": not implemented yet")

    t = np.vstack((tb_test, ts_test))
    n = np.size(t, axis=0)

    import utils

    AUC = utils.auc(predictions, t)
    total_AUC += AUC
Example #14
def main():
    logs = {
        'start-time': now(),
        'lock': LOCK,
        'num_workers': WORKERS,
        'reg_lambda': REG_LAMBDA,
        'epochs': EPOCHS,
        'learning_rate': LEARNING_RATE
    }
    # Logging configuration
    logging.basicConfig(filename='logs/tmp_logs.txt', level=logging.WARNING)

    with Manager() as manager:
        logging.warning("{}:Loading Training Data...".format(now()))
        logging.warning("{}:FULL TEST {}".format(now(), FULL_TEST))
        logging.warning("{}:WORKERS {}".format(now(), WORKERS))
        logging.warning("{}:LOCK {}".format(now(), LOCK))

        val, train = data.load_train()
        train = manager.dict(train)
        dim = max([max(k) for k in train['features']]) + 1
        init_w = [0.0] * dim

        if LOCK:
            lock = Lock()
            w = Array(c_double, init_w, lock=lock)
        else:
            w = RawArray(c_double, init_w)

        logs['start-compute-time'] = now()
        start_time = time()
        logging.warning("{}:Starting SGD...".format(
            logs['start-compute-time']))

        val_queue = Queue()
        workers = []
        for worker in range(WORKERS):
            p = Process(target=sgd, args=(worker, train, w, val_queue))
            p.start()
            workers.append(p)

        logs['epochs-stats'] = []

        # Initial early stopping variables
        persistence = [0.0] * PERSISTENCE
        smallest_val_loss = float('inf')
        workers_done = [False] * WORKERS
        while True:
            workers_alive = any([p.is_alive() for p in workers])
            if not workers_alive:
                logging.warning("{}:WORKERS DONE!".format(now()))
                logs['end-compute-time'] = now()
                logging.warning("{}:END TIME {}".format(
                    now(),
                    time() - start_time))
            if not workers_alive and val_queue.empty():
                logging.warning("{}:WORKERS DONE AND QUEUE EMPTY!".format(
                    now()))
                final_weights = w[:]
                break
            # Block until getting a message
            val_queue_item = val_queue.get()
            worker = val_queue_item['worker']
            epoch = val_queue_item['epoch']
            weights = val_queue_item['weights']

            val_loss = loss(val, weights)

            logging.warning("{}:EPOCH:{}".format(now(), epoch))
            logging.warning("{}:VAL. LOSS:{}".format(now(), val_loss))
            logs['epochs-stats'].append({
                'epoch_number': epoch,
                'val_loss': val_loss
            })

            # Early stopping criteria
            persistence[epoch % PERSISTENCE] = val_loss
            if smallest_val_loss < min(persistence):
                # Early stop
                logging.warning("{}:EARLY STOP!".format(now()))
                # Terminate all workers, but save the weights before
                # because a worker could have a lock on them. Terminating
                # a worker doesn't release its lock.
                final_weights = w[:]
                for p in workers:
                    p.terminate()
                logs['end-compute-time'] = now()
                logging.warning("{}:END TIME {}".format(
                    now(),
                    time() - start_time))
                break
            else:
                smallest_val_loss = min(smallest_val_loss, val_loss)

        # Close queue
        val_queue.close()
        val_queue.join_thread()

        logging.warning("{}:Calculating Train Accuracy".format(now()))
        train_accuracy = accuracy(train, final_weights)
        logs['train_accuracy'] = train_accuracy
        logging.warning("{}:TRAIN ACC:{}".format(now(), train_accuracy))

        # Calculate test accuracy
        logging.warning("{}:Calculating Test Accuracy".format(now()))
        test = data.load_test(FULL_TEST)
        test_accuracy = accuracy(test, final_weights)
        logs['test_accuracy'] = test_accuracy
        logging.warning("{}:TEST ACC:{}".format(now(), test_accuracy))

        logs['end_time'] = now()
        with open(
                'logs/logs.w_{}.l_{}.e_{}.time_{}.json'.format(
                    WORKERS, LOCK, EPOCHS, logs['start-time']), 'w') as f:
            json.dump([logs], f)
Example #15
def load_data(options):
    data = load_test()
    if options.attack is not None:
        attack_classes = get_attack_classes(options.attack)
        data = data[data.attack_class.isin(['Normal', *attack_classes])]
    return preprocess(data, normalize=options.normalize)
Example #16
# Imports assumed for this excerpt: load_model/plot_model from Keras and
# train_test_split from scikit-learn are not shown in the original snippet.
from keras.models import load_model
from keras.utils import plot_model
from sklearn.model_selection import train_test_split

from data import load_train, load_test, predictions_to_csv

model_name = 'VGG9'
model = load_model(f'./project3/trained_models/{model_name}.h5')

print(model.summary())
plot_model(model,
           to_file=f'./project3/figures/{model_name}_arch.png',
           show_shapes=True)

num_classes = 10
img_x, img_y = 64, 64

# Load data
train_images, train_labels = load_train()
x_test = load_test()

_, x_valid, _, y_valid = train_test_split(train_images,
                                          train_labels,
                                          test_size=0.2,
                                          random_state=42,
                                          stratify=train_labels)

# Reshape and normalize
x_valid = x_valid.reshape(x_valid.shape[0], img_x, img_y, 1)
x_valid = x_valid.astype('float32')
x_valid /= 255.
print('Validation dim: ', x_valid.shape)

x_test = x_test.reshape(x_test.shape[0], img_x, img_y, 1)
x_test = x_test.astype('float32')
Example #17
import time

from data import load_test, predictions_to_csv, load_model

if __name__ == '__main__':
    print('Loading data...')
    start = time.time()
    data_test = load_test()
    print(f'Time to load data: {time.time()-start}')

    # Replace with name of model you want load
    pipeline = load_model('sgd_bigram_tfidf.joblib')

    # Generate predictions
    pred = pipeline.predict(data_test)

    # Name of csv to save predictions in
    predictions_to_csv(pred, 'sgd_bigram_tfidf.csv')
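
The sgd_bigram_tfidf.joblib file loaded above is not produced in this excerpt; a minimal sketch of how such a pipeline might be built and saved, where the estimator, its parameters and the fitting call are assumptions based only on the file name:

# Hypothetical training-side counterpart of the script above.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
import joblib

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),  # "bigram_tfidf"
    ('clf', SGDClassifier()),                        # "sgd"
])
# pipeline.fit(train_texts, train_labels)  # training data not shown here
joblib.dump(pipeline, 'sgd_bigram_tfidf.joblib')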
Example #18
def main(
    data_path,
    train_data_path,
    val_data_path,
    test_data_path,
    output_path,
    prediction_name='suggestion.json',
    cache_dir=None,
    model_type='lda',
):
    '''
    train a model and make a prediction

    Args:
        data_path: path to the data json file
        train_data_path: path to the train data
        val_data_path: path to the val data
        test_data_path: path to the test data
        output_path: path to the output dir
        prediction_name: the name of prediction output file
        cache_dir: where to save cache
        model_type: which model to use

    Returns:
        None
    '''
    # load data
    print('Loading data')
    documents, titles = data.load_doc_title(
        data_path,
        cache_path=os.path.join(cache_dir, 'preproccessed')
        if cache_dir is not None else None,
    )
    train_data = data.load_train(train_data_path)
    val_data = data.load_val(val_data_path)
    test_data = data.load_test(test_data_path)

    # convert to corpus if needed
    if model_type in ('lda', ):
        print('Preparing corpus')
        dictionary = utils.make_dictionary(
            documents.content,
            cache_path=os.path.join(cache_dir, 'dictionary')
            if cache_dir is not None else None,
            filter_=False,
        )
        documents['bow'] = utils.make_corpus(documents.content, dictionary)
        titles['bow'] = utils.make_corpus(titles.content, dictionary)

    # train
    print('Training model')
    if model_type == 'lda':
        model = engine.CustomLDA(documents, titles, dictionary)
        model = model.train(train_data, val_data, output_path)
    elif model_type == 'doc2vec':
        model = engine.CustomDoc2vec(documents, titles)
        model = model.train(train_data, val_data, output_path)
    else:
        raise ValueError(model_type)

    # inference
    prediction = model.predict(test_data)
    prediction_output = os.path.join(output_path, prediction_name)
    data.dump_prediction(prediction, prediction_output)
    return
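
A call sketch for main above; every path and the output directory below are made-up placeholders:

# Hypothetical invocation of main(); all paths are placeholders.
main(
    data_path='data/data.json',
    train_data_path='data/train.json',
    val_data_path='data/val.json',
    test_data_path='data/test.json',
    output_path='output',
    cache_dir='cache',
    model_type='lda',
)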