Example #1
def score_regressor():
    from solutions.pos_regressor import PositionRegressor
    regressor = PositionRegressor()
    data = load_data('./data/regression_data.pkl')
    regressor.train(data)
    positions = np.asarray([info['agent_pos'] for info in data['info']])
    pred_pos = regressor.predict(data['obs'])
    mse = np.mean(np.sum(np.square(positions - pred_pos), axis=1))
    target_mse = 1.1e-5
    if mse < target_mse:
        mse = 0.
    mse_dist = (mse - target_mse)
    score = 1. - mse_dist
    score = min(max(0., score), 1.)

    test_data = load_data('./data/reg_test_data.pkl')
    test_positions = np.asarray(
        [info['agent_pos'] for info in test_data['info']])[:10, ]
    test_pred_pos = regressor.predict(test_data['obs'][:10])
    test_mse = np.mean(
        np.sum(np.square(test_positions - test_pred_pos), axis=1))
    target_mse = 0.038
    if test_mse < target_mse:
        test_mse = 0.
    test_mse_dist = (test_mse - target_mse)
    test_score = 1. - test_mse_dist
    test_score = min(max(0., test_score), 1.)
    # print('train_mse:', mse)
    # print('test_mse:', test_mse)
    score = 0.5 * score + 0.5 * test_score
    return score
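
The scoring rule above maps an MSE below the target to a full sub-score and otherwise subtracts the gap to the target before clamping into [0, 1]. A small standalone sketch of that mapping (the helper name mse_to_score is illustrative, not part of the snippet):

def mse_to_score(mse, target_mse):
    # Full sub-score when the target is beaten; otherwise 1 minus the gap,
    # clamped into [0, 1]. Mirrors the train/test branches above.
    if mse < target_mse:
        mse = 0.
    return min(max(0., 1. - (mse - target_mse)), 1.)

# e.g. mse_to_score(1.0e-5, 1.1e-5) == 1.0, while mse_to_score(0.5, 0.038) is about 0.538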
Example #2
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--train', action='store_true')
    parser.add_argument('--test', action='store_true')
    parser.add_argument('--model')
    args = parser.parse_args()

    if args.train and args.model:
        nn = model.generate_model()
        training_input = data_utils.load_data(training_input_dir)
        training_output = data_utils.load_data(training_output_dir)
        nn.fit(training_input, training_output, batch_size=128, epochs=50)
        model.saveModel(nn, args.model)
        test = input("Do you want to test with the test images too? ")
        if test == 'yes':
            test_input = data_utils.load_data(test_input_dir)
            test_output = nn.predict(test_input)
            print(test_output.shape)
            data_utils.save_images(test_output_dir, test_input_dir,
                                   test_output)
    elif args.test and args.model:
        nn = model.loadModel(args.model)
        test_input = data_utils.load_data(test_input_dir)
        test_output = nn.predict(test_input)
        print(test_output.shape)
        data_utils.save_images(test_output_dir, test_input_dir, test_output)
Example #3
def main():

    word2id = load_data(FLAGS.voc_file)
    label2id = load_data(FLAGS.tag_voc)
    train_data = data_corpus(FLAGS.src_file, word2id, label2id)
    dev_data = data_corpus(FLAGS.src_file_dev, word2id, label2id)
    nermodel = NERModel(FLAGS, config, word2id, label2id)
    nermodel.build_model()
    nermodel.train(train_data, dev_data)
Example #4
def test(model_name, model_ckpt, dataset_name, data_folder):
    # model definition
    if model_name == 'lenet':
        from model.lenet import LeNet
        model = LeNet()
    else:
        from model.modelzoo import create_model
        model, input_size = create_model(model_name, n_classes=120)
    model = apply_cuda(model)

    # load weights
    ckpt = torch.load(model_ckpt)
    model.load_state_dict(ckpt['state_dict'])

    # data source
    batch_size = 200
    if dataset_name == 'mnist':
        test_loader = load_data('test', batch_size, data_folder, dataset_name)
    else:
        test_loader = load_data('test', batch_size, data_folder, dataset_name,
                                input_size)
    n_batches_test = len(test_loader)

    print('==== test phase ====')
    avg_acc = float(0)
    model.eval()
    images_export, labels_export = None, None
    for i, (images, labels) in enumerate(test_loader):
        if images_export is None or labels_export is None:
            images_export = images.data.numpy()
            labels_export = labels.data.numpy()
        else:
            images_export = np.concatenate(
                (images_export, images.data.numpy()), axis=0)
            labels_export = np.concatenate(
                (labels_export, labels.data.numpy()), axis=0)
        images, labels = apply_cuda(images), apply_cuda(labels)
        logits = model(images)
        _, pred = torch.max(logits.data, 1)
        if i == 0:
            print(images[0])
            print(logits[0], pred[0], labels[0])
        bs_ = labels.data.size()[0]
        match_count = (pred == labels.data).sum()
        accuracy = float(match_count) / float(bs_)
        print(
            datetime.now(),
            'batch {}/{} with shape={}, accuracy={:.4f}'.format(
                i + 1, n_batches_test, images.shape, accuracy))
        avg_acc += accuracy / float(n_batches_test)
    print(datetime.now(), 'test results: acc={:.4f}'.format(avg_acc))
    print(
        datetime.now(),
        'total batch to be exported with shape={}'.format(images_export.shape))
    export_test_data_to_numpy(images_export, labels_export, data_folder)
    def read_all_data(self, actions, data_dir, one_hot=False):
        """
        Loads data for training/testing and normalizes it.
        
        Args
        actions: list of strings (actions) to load
        seq_length_in: number of frames to use in the burn-in sequence
        seq_length_out: number of frames to use in the output sequence
        data_dir: directory to load the data from
        one_hot: whether to use one-hot encoding per action
        Returns
        train_set: dictionary with normalized training data
        test_set: dictionary with test data
        data_mean: d-long vector with the mean of the training data
        data_std: d-long vector with the standard dev of the training data
        dim_to_ignore: dimensions that are not used becaused stdev is too small
        dim_to_use: dimensions that we are actually using in the model
        """

        train_subject_ids = [1, 6, 7, 8, 9, 11]
        test_subject_ids = [5]

        train_set, complete_train = data_utils.load_data(
            data_dir, train_subject_ids, actions, one_hot)
        test_set, complete_test = data_utils.load_data(data_dir,
                                                       test_subject_ids,
                                                       actions, one_hot)

        # Compute normalization stats
        data_mean, data_std, dim_to_ignore, dim_to_use = data_utils.normalization_stats(
            complete_train)

        # Normalize -- subtract mean, divide by stdev
        train_set = data_utils.normalize_data(train_set, data_mean, data_std,
                                              dim_to_use, actions, one_hot)
        test_set = data_utils.normalize_data(test_set, data_mean, data_std,
                                             dim_to_use, actions, one_hot)
        print("done reading data.")

        self.train_set = train_set
        self.test_set = test_set

        self.data_mean = data_mean
        self.data_std = data_std

        self.dim_to_ignore = dim_to_ignore
        self.dim_to_use = dim_to_use

        self.train_keys = list(self.train_set.keys())
Example #6
def _get_buckets():
    # test set
    test_buckets = data_utils.load_data("test_ids.enc", "test_ids.dec")
    # training set
    data_buckets = data_utils.load_data("train_ids.enc", "train_ids.dec")
    # Count the number of conversation pairs for each bucket.
    train_bucket_sizes = [len(data_buckets[b]) for b in range(len(config.BUCKETS))]
    # print("Number of samples in each bucket:\n", train_bucket_sizes)
    # Total number of conversation pairs.
    train_total_size = sum(train_bucket_sizes)
    # list of increasing numbers from 0 to 1 that we'll use to select a bucket.
    train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                           for i in range(len(train_bucket_sizes))]
    # print("Bucket scale:\n", train_buckets_scale)
    return test_buckets, data_buckets, train_buckets_scale
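
The cumulative train_buckets_scale returned above is normally used to sample a training bucket in proportion to its size. A minimal sketch of that selection step (the pick_bucket helper below is an assumption, not part of the snippet):

import random

def pick_bucket(train_buckets_scale):
    # Draw r in [0, 1) and return the first bucket whose cumulative share
    # of conversation pairs exceeds r, so larger buckets are chosen more often.
    r = random.random()
    return min(i for i, scale in enumerate(train_buckets_scale) if scale > r)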
Example #7
def main(args):
    """
    """
    print("Loading dataset...")
    X, y = load_data(args.cw_traces, max_length=args.length, max_instances=105)
    y = np.zeros(y.shape)
    classes = len(np.unique(y))
    X_ow, _ = load_data(args.ow_traces,
                        max_length=args.length,
                        fname_pattern=r"(\d+)",
                        max_instances=10000,
                        open_world=True)
    y_ow = np.ones((X_ow.shape[0], )) * classes
    print(X.shape, X_ow.shape, y.shape, y_ow.shape)
    X, y = np.concatenate([X, X_ow]), np.concatenate([y, y_ow])
    unmon_class = classes
    classes += 1

    # get split
    X_tr, y_tr, X_te, y_te, X_va, y_va = make_split(X, y, 0.8, 0.1)

    # consider them as float
    X_tr = X_tr.astype('float32')
    X_va = X_va.astype('float32')
    y_tr = y_tr.astype('float32')
    y_va = y_va.astype('float32')

    print(X_tr.shape[0], 'training samples')
    print(X_va.shape[0], 'validation samples')

    # convert class vectors to binary class matrices
    y_tr = np_utils.to_categorical(y_tr, classes)
    y_va = np_utils.to_categorical(y_va, classes)

    # train OW model
    model = train_model(X_tr, y_tr, X_va, y_va, classes, args.length,
                        args.model_path)

    # prepare OW testing data
    unmon_mask = np.equal(y_te, np.ones((y_te.shape[0], )) * unmon_class)
    #y_te = np_utils.to_categorical(y_te, classes)

    X_te_unmon = X_te[unmon_mask]
    y_te_unmon = y_te[unmon_mask]
    X_te_mon = X_te[~unmon_mask]
    y_te_mon = y_te[~unmon_mask]

    do_ow(model, X_te_mon, y_te_mon, X_te_unmon, y_te_unmon, args.log_file)
def read_all_data(actions=walking_lst,
                  seq_length_in=50,
                  seq_length_out=25,
                  data_dir="./data/h3.6m/dataset",
                  one_hot=True):
    """
  Loads data for training/testing and normalizes it.

  Args
    actions: list of strings (actions) to load
    seq_length_in: number of frames to use in the burn-in sequence
    seq_length_out: number of frames to use in the output sequence
    data_dir: directory to load the data from
    one_hot: whether to use one-hot encoding per action
  Returns
    train_set: dictionary with normalized training data
    test_set: dictionary with test data
    data_mean: d-long vector with the mean of the training data
    data_std: d-long vector with the standard dev of the training data
    dim_to_ignore: dimensions that are not used because stdev is too small
    dim_to_use: dimensions that we are actually using in the model
  """

    # === Read training data ===
    print("Reading training data (seq_len_in: {0}, seq_len_out {1}).".format(
        seq_length_in, seq_length_out))

    train_subject_ids = [1, 6, 7, 8, 9, 11]
    test_subject_ids = [5]

    train_set, complete_train = data_utils.load_data(data_dir,
                                                     train_subject_ids,
                                                     actions, one_hot)
    test_set, complete_test = data_utils.load_data(data_dir, test_subject_ids,
                                                   actions, one_hot)

    # Compute normalization stats
    data_mean, data_std, dim_to_ignore, dim_to_use = data_utils.normalization_stats(
        complete_train)

    # Normalize -- subtract mean, divide by stdev
    train_set = data_utils.normalize_data(train_set, data_mean, data_std,
                                          dim_to_use, actions, one_hot)
    test_set = data_utils.normalize_data(test_set, data_mean, data_std,
                                         dim_to_use, actions, one_hot)
    print("done reading data.")

    return train_set, test_set, data_mean, data_std, dim_to_ignore, dim_to_use
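
The docstring above says data_utils.normalization_stats flags dimensions whose stdev is too small. A minimal NumPy sketch of that idea (the 1e-4 threshold and the function name are assumptions, not taken from the snippet):

import numpy as np

def normalization_stats_sketch(complete_train, eps=1e-4):
    # complete_train: (n_frames, d) array of concatenated training sequences.
    data_mean = complete_train.mean(axis=0)
    data_std = complete_train.std(axis=0)
    dim_to_ignore = np.where(data_std < eps)[0]   # near-constant dimensions
    dim_to_use = np.where(data_std >= eps)[0]     # dimensions the model keeps
    data_std[dim_to_ignore] = 1.0                 # avoid division by zero when normalizing
    return data_mean, data_std, dim_to_ignore, dim_to_use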
def main():
    collection = sys.argv[1]
    source = 'sampled_candidates'
    #target = 'annotation_task_input'
    data_dict = load_data(collection, source)
    property_lemma_dict = dict()
    overview_dict_list = []

    for property, concept_dict_list in data_dict.items():
        #print(property)
        final_lemma_dict_list = find_label_certainty(concept_dict_list)
        property_lemma_dict[property] = final_lemma_dict_list
        #prop_counter[property] = len(final_lemma_dict_list)
        overview_dict = dict()

        overview_dict['property'] = property
        overview_dict['pos'] = len(
            [d for d in final_lemma_dict_list if d['label'] == 'pos'])
        overview_dict['neg'] = len(
            [d for d in final_lemma_dict_list if d['label'] == 'neg'])
        overview_dict['pos/neg'] = len(
            [d for d in final_lemma_dict_list if '/' in d['label']])
        overview_dict['total'] = len(final_lemma_dict_list)
        overview_dict_list.append(overview_dict)

    candidates_to_file(property_lemma_dict, collection)
    overview_to_file(overview_dict_list, collection)
Example #10
def main():
    #This sets up the descriptions for each feature. It is a class defined in data_utils
    descriptions = data_utils.load_descriptions()
    #This gets all the standardized data into a dataframe. There is a 'fips_code' column, and each data feature has a corresponding column.
    #You can find the feature names in Data sources/data_values.csv
    #To use a specific column, you use pandas way of calling the feature, i.e. data['CRM250207D']
    data = data_utils.load_data()
    #pca = PCA(n_components=2)

    #Your code here to experiment, test, etc
    #print 'Got datastream'
    start = time.time()
    #This gets a subset of features from the dataframe. So the user provides a set of features (I'll handle how that gets done)
    #For your work, just add or replace elements in the subset_features list below
    #this will get you a dataframe that has columns: 'fips_code', 'AGE040205D','BNK010205D', etc
    query_feature = ['EAN300205D']
    subset_features = query_feature + ['AGE040205D','BNK010205D','BPS030205D','CRM250207D']
    rank_features = subset_features[1:]
    #i.e. subset_features = ['EAN300205D', 'AGE040205D','BNK010205D','BPS030205D','CRM250207D']
    #This is the function that actually gets you the frame with the features you are focusing on, therefore focus_frame
    focus_frame = data_utils.get_features(subset_features,data)

    svm = SVMRank(focus_frame, 50)
    weights_k = svm.train_top_k()
    df = pandas.DataFrame(columns=rank_features)
    df.loc[0] = weights_k
    print "----------------------------------------------------------"
    print "Resulting weights of the ranking features:"
    print df.to_string(index=False)
    frame_ranks_k = svm.get_full_rank()
    print
    print "Top 10 ranked counties by the query feature (first column is the fips code, second column is query feature, third column is resulting ranking score):"
    print frame_ranks_k.ix[:10].to_string(index=False)
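
The comments above describe data_utils.get_features as returning a frame with the 'fips_code' column plus the requested feature columns. A minimal pandas sketch of that behaviour (an assumption about the helper, not its actual source):

import pandas

def get_features_sketch(subset_features, data):
    # Keep the county identifier plus the requested feature columns.
    return data[['fips_code'] + list(subset_features)].copy()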
Example #11
def main(unused_argv):
    global colourizer

    sess = tf.Session()

    print("Setting up model graph...", end='')
    colourizer = colourizer.Colourizer()
    colourizer.setup()
    print("done")

    print("Loading saver...", end='')
    saver = tf.train.Saver(max_to_keep=5)
    ckpt = tf.train.get_checkpoint_state(FLAGS.save_dir)
    print("done")

    print("Restoring model...", end='')
    logging.info('Restoring model from %s', ckpt.model_checkpoint_path)
    saver.restore(sess, ckpt.model_checkpoint_path)
    print("done")

    print("Loading data...", end='')
    _, (test_l, test_ab, _) = data_utils.load_data()
    print("done")

    while True:
        print("Choose an option (E.g., Enter '1'):")
        print("\t(1) Colourize 5 random novel images")

        user_input = raw_input()

        if user_input == "1":
            display_5_images(sess, colourizer, test_l, test_ab)
Example #12
def main():
    config = Config()
    vocab = Vocab(config.dict_file)
    dev_q, dev_c, dev_s, dev_spans, dev_s_idx, dev_answerable = load_data(
        config.dev_file, vocab, config.debug)
    dev_data = list(
        zip(dev_q, dev_c, dev_s, dev_s_idx, dev_answerable, dev_spans))
    ssnet = SSQANet(config)
    ssnet.build_model()
    ssnet.restore_session(config.dir_model)
    batches = batch_loader(dev_data, config.batch_size, shuffle=False)
    acc_history = []
    em_history = []
    for batch in batches:
        batch_q, batch_c, batch_s, batch_s_idx, batch_ans, batch_spans = zip(
            *batch)
        question_lengths, padded_q = zero_padding(batch_q, level=1)
        context_lengths, padded_c = zero_padding(batch_c, level=1)
        sequence_lengths, sentence_lengths, padded_s = zero_padding(batch_s,
                                                                    level=2)

        batch_acc, batch_em, batch_loss = ssnet.eval(
            padded_q, question_lengths, padded_c, context_lengths, padded_s,
            sequence_lengths, sentence_lengths, batch_s_idx, batch_ans,
            batch_spans)
        acc_history.append(batch_acc)
        em_history.append(batch_em)

    dev_acc = np.mean(acc_history)
    dev_em = np.mean(em_history)
    print("classification acc :{}".format(dev_acc))
    print("EM :{}".format(dev_em))
def main():
    print 'loading train and test datas...'
    train, test, _ = data_utils.load_data()
    longitude_latitude = data_utils.load_longitude_latitude_data()
    print 'train:', train.shape, ', test:', test.shape

    train_id = train['id']
    train_price_doc = train['price_doc']
    train.drop(['id', 'price_doc'], axis=1, inplace=True)
    test_id = test['id']
    test.drop(['id'], axis=1, inplace=True)

    # Merge the train and test sets
    conbined_data = pd.concat([train[test.columns.values], test])
    conbined_data.columns = test.columns.values

    conbined_data = generate_distance_features(conbined_data, longitude_latitude)

    train = conbined_data.iloc[:train.shape[0], :]
    test = conbined_data.iloc[train.shape[0]:, :]

    train['id'] = train_id
    train['price_doc'] = train_price_doc
    test['id'] = test_id.values
    print 'train:', train.shape, ', test:', test.shape
    print("Save data...")
    data_utils.save_data(train, test, _)
Example #14
def prepare_data(data_path, test_size, seed, batch_size):
    """
    Prepare the dataloaders for the training. These are also passed to the MI
    constructor and unpacked in the MI class to compute the mutual information
    Args:
        data_path: path to load the dataset from.
        test_size: percentage of data to use as test (or number of samples).
        seed: rng seed.
        batch_size: batch size to use.

    Returns:
        train_loader, test_loader, act_full_loader: dataloaders for the
        training split, the test split and the full (unshuffled) dataset.
    """
    X_train, X_test, y_train, y_test = data_utils.load_data(data_path, test_size, seed)

    # Prepare data for pytorch
    if batch_size != "full":
        train_loader = data_utils.create_dataloader(X_train, y_train, batch_size, seed)
        test_loader = data_utils.create_dataloader(X_test, y_test, batch_size, seed)
    else:
        train_loader = data_utils.create_dataloader(X_train, y_train, len(X_train), seed)
        test_loader = data_utils.create_dataloader(X_test, y_test, len(X_test), seed)

    # Activity loader over the full (train + test) dataset, unshuffled
    full_X, full_y = np.concatenate((X_train, X_test)), np.concatenate((y_train, y_test))
    act_full_loader = data_utils.create_dataloader(full_X, full_y, len(full_X), seed, shuffle=False)

    return train_loader, test_loader, act_full_loader
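
A hypothetical call site for prepare_data; the path, split and batch size below are illustrative values, not taken from the snippet:

train_loader, test_loader, act_full_loader = prepare_data(
    data_path="data/dataset.npz", test_size=0.2, seed=42, batch_size=64)

for x_batch, y_batch in train_loader:
    # assuming each batch yields an (inputs, targets) pair for the training loop
    break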
def model_01():
    """
    猫图像识别,两层网络结构
    :return:
    """
    X_train, Y_train, X_test, Y_test, classes = load_data()  # cat image data

    # Model parameter settings
    layers_dims = [X_train.shape[0], 7, 1]
    num_iter = 2500
    learning_rate = 0.0075
    print_cost = True
    initialization = "sqrt_n"

    parameters, costs = basic_model(X_train,
                                    Y_train,
                                    layers_dims=layers_dims,
                                    num_iter=num_iter,
                                    lr=learning_rate,
                                    print_cost=print_cost,
                                    initialization=initialization)

    # Prediction and evaluation
    prediction_train = predict(parameters, X_train)
    prediction_test = predict(parameters, X_test)

    print("Train准确率: {}".format(evaluate(prediction_train, Y_train)))
    print("test准确率: {}".format(evaluate(prediction_test, Y_test)))

    costs_draw(costs, learning_rate=learning_rate)
Example #16
def train_classification(dataset_params, agent_params, training_params):
    dataset = load_data(dataset_params['data_path'])
    trainloader = return_trainloader(dataset['trainX'],
                                     dataset['trainY'],
                                     category='classification')
    valloader = return_trainloader(dataset['valX'],
                                   dataset['valY'],
                                   category='classification')
    data_dict = {
        'trainloader': trainloader,
        'valloader': valloader
        # 'y_handtype_indexes':y_handtype_indexes
    }
    if dataset_params['datatype'] == f'{dt.DataTypes.HANDRANKSFIVE}':
        category_weights = generate_category_weights()
        data_dict['category_weights'] = category_weights
    print('Data shapes', dataset['trainX'].shape, dataset['trainY'].shape,
          dataset['valX'].shape, dataset['valY'].shape)
    # dataset['trainY'] = dataset['trainY'].long()
    # dataset['valY'] = dataset['valY'].long()
    # target = dt.Globals.TARGET_SET[dataset_params['datatype']]
    # y_handtype_indexes = return_ylabel_dict(dataset['valX'],dataset['valY'],target)

    # print('Target values',np.unique(dataset['trainY'],return_counts=True),np.unique(dataset['valY'],return_counts=True))
    train_network(data_dict, agent_params, training_params)
def main():
    print 'loading train and test datas...'
    train, test, _ = data_utils.load_data()
    print 'train:', train.shape, ', test:', test.shape

    train_price_doc = train['price_doc']
    train.drop(['price_doc'], axis=1, inplace=True)

    # Merge the train and test sets
    conbined_data = pd.concat([train[test.columns.values], test])
    conbined_data.columns = test.columns.values
    conbined_data.index = range(conbined_data.shape[0])

    # Time window sizes (in days)
    timewindow_days = [30 * 6, 30 * 4, 30 * 2, 30, 20, 10]
    conbined_data = perform_time_window(conbined_data, timewindow_days)
    conbined_data = perform_groupby_time_window(conbined_data, timewindow_days)

    train = conbined_data.iloc[:train.shape[0], :]
    test = conbined_data.iloc[train.shape[0]:, :]

    train['price_doc'] = train_price_doc
    print 'train:', train.shape, ', test:', test.shape
    print("Save data...")
    data_utils.save_data(train, test, _)
Example #18
def load_dataset(arg, data_path, in_vocab, slot_vocab, intent_vocab):
    """Returns the dataset that is loaded from the disk.

    Args:
    arg: The output of the parser.
    data_path: The path of the dataset to be loaded.
    in_vocab: The vocabulary of the input sentences.
    slot_vocab: The vocabulary of slot labels.
    intent_vocab: The vocabulary of intent labels.

    Returns:
    The input data, slot data and the intent data as numpy arrays.
    """

    full_path = os.path.join('./data', arg.dataset, data_path)

    input_path = os.path.join(full_path, arg.input_file)
    slot_path = os.path.join(full_path, arg.slot_file)
    intent_path = os.path.join(full_path, arg.intent_file)

    in_data, slot_data, intent_data = load_data(input_path, slot_path,
                                                intent_path, in_vocab,
                                                slot_vocab, intent_vocab,
                                                arg.max_seq_len)

    return in_data, slot_data, intent_data
Example #19
def main(args):
    # hyper param
    root = args.root_dir
    assert os.path.exists(root)

    tokenizer = AutoTokenizer.from_pretrained(args.model,
                                              cache_dir=args.transformer_cache)

    mt_dnn_root = os.path.join(root, args.model)
    if not os.path.isdir(mt_dnn_root):
        os.makedirs(mt_dnn_root)

    task_defs = TaskDefs(args.task_def)

    for task in task_defs.get_task_names():
        task_def = task_defs.get_task_def(task)
        logger.info("Task %s" % task)
        for split_name in task_def.split_names:
            file_path = os.path.join(root, "%s_%s.tsv" % (task, split_name))
            if not os.path.exists(file_path):
                logger.warning("File %s doesnot exit")
                sys.exit(1)
            rows = load_data(file_path, task_def)
            dump_path = os.path.join(mt_dnn_root,
                                     "%s_%s.json" % (task, split_name))
            logger.info(dump_path)
            build_data(
                rows,
                dump_path,
                tokenizer,
                task_def.data_type,
                lab_dict=task_def.label_vocab,
                workers=args.workers,
            )
Example #20
def plot_data():
    """Plot prices and items over time"""
    
    # Load data
    prices,items,_ = load_data()
    
    # Price trends for different time periods
    prices['year'] = prices.ts.dt.year
    prices['month'] = prices.ts.dt.month
    prices['dayofweek'] = prices.ts.dt.dayofweek
    prices['hour'] = prices.ts.dt.hour
    
    prices.groupby('year',as_index=False)['price'].aggregate({"mean_price":np.mean, 
                                                              "median_price":np.median}).plot(kind='line', x="year")
    prices.groupby('month',as_index=False)['price'].aggregate({"mean_price":np.mean, 
                                                               "median_price":np.median}).plot(kind='line', x="month")
    prices.groupby('dayofweek',as_index=False)['price'].aggregate({"mean_price":np.mean, 
                                                                   "median_price":np.median}).plot(kind='line', x="dayofweek")
    prices.groupby('hour',as_index=False)['price'].aggregate({"mean_price":np.mean, 
                                                              "median_price":np.median}).plot(kind='line', x="hour")
    sns.FacetGrid(prices.groupby(["dayofweek","hour"])["price"] \
        .aggregate({"mean_price":np.mean, "median_price":np.median}).reset_index(), row="dayofweek") \
        .map(plt.plot, "hour", "median_price") \
        .set(xlim=(0, 23))
        
    # Grades
    item_order = items.loc[items.consumption.argsort()[::-1], 'item']
    sns.factorplot(x="item", y="consumption", data=items, kind="bar", order=item_order)
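
The dict-of-functions form of Series.aggregate used in plot_data was deprecated in later pandas releases; with named aggregation, the hourly statistics inside plot_data could instead be written as:

hourly = prices.groupby('hour')['price'].agg(mean_price='mean',
                                             median_price='median').reset_index()
hourly.plot(kind='line', x='hour')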
    
    
def main():
    print 'loading train and test datas...'
    train, test, _ = data_utils.load_data()
    print 'train:', train.shape, ', test:', test.shape

    train_price_doc = train['price_doc']

    num_columns = train.select_dtypes(exclude=['object']).columns.values
    num_columns = num_columns.tolist()
    num_columns.remove('id')
    num_columns.remove('timestamp')

    print 'performing feature selection on %d numerical features...' % train[num_columns].shape[1]
    keep_features = feature_select(train[num_columns], keep_top=0.98)
    print 'after feature selection numerical features', len(keep_features)
    keep_features.append('id')
    keep_features.append('timestamp')

    train = train[keep_features]
    test = test[keep_features]

    train['price_doc'] = train_price_doc

    print 'train:', train.shape, ', test:', test.shape
    print("Save data...")
    data_utils.save_data(train, test, _)
Example #22
def generate():
    random.seed(SEED)
    np.random.seed(SEED)
    vocab_dict, vocab_res = data_utils.load_vocab('./vocab.txt')
    data = data_utils.load_data('data.pkl')

    vocab_size = len(vocab_dict)
    SEQ_LENGTH = data.shape[1]

    generator = Generator(vocab_size, BATCH_SIZE, EMB_DIM, HIDDEN_DIM,
                          SEQ_LENGTH, START_TOKEN)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())

    samples = generator.generate(sess)
    for i in range(int(1)):
        if i > len(samples):
            break
        arr = samples[i]
        poem = ''
        for index in arr:
            if index != data_utils.EOS_ID:
                poem += vocab_res[index]
        print(poem)
Example #23
def main():

    args, settings = parse_args_and_settings()
    logger = output_utils.Logger(args)
    logger.shout('python main.py ' + ' '.join(sys.argv[1:]))

    num_threads = 5
    torch.set_num_threads(num_threads)

    # Load entity map (needed for all phases; primarily for loading data):
    entity_idx_to_name, entity_name_to_idx = data_utils.load_entity_map(settings.data.entity_map)

    if args.phase == 'train' or args.phase == 'deploy':

        # Loading train data is needed for train, but also for deploy, namely if vocabulary doesn't exist yet:
        train_data = None
        if args.phase == 'train' or not os.path.exists(settings.data.vocabulary):
            train_data = data_utils.load_data(settings.data.dataset, entity_name_to_idx, with_keys=True, logger=logger)

        # Load vocabulary (and extract from train_data if vocabulary doesn't exist yet)
        vocabulary_idx_to_word, vocabulary_word_to_idx = data_utils.get_vocabulary(settings.data.vocabulary,
                                                                                   extract_from=train_data,
                                                                                   logger=logger)

        # Avoid loading/generating google news embeddings in deploy phase:
        if args.phase == 'deploy' and settings.model.token_emb == config_utils.data_paths["embeddings"]["google_news"]:
            settings.model.token_emb = 300
            # Appropriate embeddings will be loaded anyway from saved .pt model file.
            # TODO: This won't generalize when using other embeddings.

        # Load embeddings if needed:
        if isinstance(settings.model.token_emb, str):
            settings.model.token_emb = embedding_loader.load_word_embeddings(settings.model.token_emb,
                                                                             settings.data.dataset, train_data, logger)
        if isinstance(settings.model.speaker_emb, str):
            settings.model.speaker_emb = embedding_loader.load_entity_embeddings(settings.model.speaker_emb,
                                                                                 settings.data.entity_map, logger)
        # convenient to compute and store some dependent parameters:
        settings.model.vocabulary_size = len(vocabulary_idx_to_word)
        settings.model.num_entities = len(entity_idx_to_name)

    if args.phase == 'train':
        logger.save_config(settings.orig)
        logger.say(output_utils.bcolors.BOLD + 'Training on ' + settings.data.dataset)
        run_training(settings, train_data, vocabulary_idx_to_word, vocabulary_word_to_idx, logger, not args.no_cuda)

    if args.phase == 'deploy':
        logger.say(output_utils.bcolors.BOLD + 'Deploying ' + str(len(args.model)) + ' models (' + (
            args.run_name if len(args.model) > 1 else args.model[0]) + ')...\n   ...on ' + ('folds of ' if not args.no_cv else '') + args.deploy_data)
        args.answer_file, with_keys = run_deploy(args.model, settings, args.deploy_data, vocabulary_idx_to_word, vocabulary_word_to_idx, entity_name_to_idx, args.answers_per_fold, args.no_cv, logger, not args.no_cuda)
        # After deploying, evaluate (unless not desired or data does not contain reference keys):
        if not args.no_eval:
            if with_keys is True:
                args.phase = 'evaluate'
            else:
                logger.shout('Warning: Model predictions will not be evaluated, since given data does not contain reference labels. ')

    if args.phase == 'evaluate':
        logger.say(output_utils.bcolors.BOLD + 'Evaluating ' + ('(not SemEval style) ' if args.no_semeval else '(SemEval style) ') + 'predictions of ' + args.answer_file)
        run_evaluate(args.answer_file, args.deploy_data, entity_name_to_idx, entity_idx_to_name, args.no_semeval, logger)
def run_model(model):
    '''Solve the optimization problem, using the given model (e.g. MedianForecaster) for forecasting.'''

    # Load data
    prices, items, schedule = load_data()

    # Split to train and test sets based on realized schedule
    X_train, y_train, X_test, y_test = split_prices(prices)

    # Fit and predict with model
    y_pred = model.fit(X_train, y_train).predict(X_test)

    # Evaluate model
    mse = MSE(y_test, y_pred)
    print('MSE: %f' % mse)

    # Solve optimization problem with forecasted electricity prices
    df_pred = format_forecast_results(X_test, y_pred, prices, schedule)
    df_pred.plot()
    itemblocks_to_produce,blocks_available,forecasted_block_prices, \
        actual_block_prices,item_consumptions,block_order = prepare_optimization(items,schedule,df_pred)
    prob = create_optimization_problem(itemblocks_to_produce,
                                       blocks_available,
                                       forecasted_block_prices,
                                       item_consumptions,
                                       name=model.name)
    solution_schedule = solve_problem(prob, actual_block_prices,
                                      item_consumptions, block_order)

    plot_forecasted_schedule(solution_schedule,
                             title='Schedule using %s' % model.name)
def main():
    collection = sys.argv[1]
    #target_property = sys.argv[2]
    source = 'data_extracted_sorted'
    target = 'data_for_concept_selection_cosine_centroid'

    dsm_path = '../../../Data/dsm/wikipedia_full/sgns_pinit1/sgns_pinit1/sgns_rand_pinit1'
    model = load_model(dsm_path, 'sgns')
    model_creation_type, matrix, wi_dict, dsm_vocab = model

    data_dict = load_data(collection, source)
    extended_data_dict = dict()

    overview_dict_list = []

    for property, concept_dict_list in data_dict.items():
        #if property == target_property:
        print(property)
        clean_concept_dict_list = remove_oov(dsm_vocab, concept_dict_list)
        new_concept_dict_list = []

        n = 200
        # get neighbors of positive concept centroid:
        concepts_pos = [cd['concept'] for cd in clean_concept_dict_list if cd['label'] == 'pos']
        concepts = [cd['concept'] for cd in clean_concept_dict_list]
        neighbors, centroid = get_get_candidate_neighbors(model, concepts_pos, n = n)
        concept_cosine_dict = get_cosines_to_centroid(model, concepts, centroid)
        #print(len(concept_cosine_dict.keys()))
        # get cosines to centroid of all concepts

        # use nltk pos tagging to select nouns only:
        if neighbors:
            neighbors_clean = [(c, n) for c, n in neighbors if n not in concepts]
            #neighbors_nouns = select_nouns_pos(neighbors_clean)
            for cosine, neighbor in neighbors_clean:
                neighbor_dict = dict()
                neighbor_dict['concept'] = neighbor
                neighbor_dict['label'] = 'neg/pos'
                neighbor_dict['certainty'] = 'not_certain'
                neighbor_dict['sources_str'] = 'wikipedia_sgns_model'
                neighbor_dict['categories_str'] = 'neighbors'+'-'+str(n)
                neighbor_dict['cosine_centroid'] = str(cosine)
                new_concept_dict_list.append(neighbor_dict)

        for cd in clean_concept_dict_list:
            cos = concept_cosine_dict[cd['concept']]
            cd['cosine_centroid'] = str(cos)
            print(cd['concept'], cd['cosine_centroid'])
            new_concept_dict_list.append(cd)


        sorted_concept_dict_list = sorted([(cd['cosine_centroid'], cd) for cd \
                                    in new_concept_dict_list if cd['cosine_centroid'] != '-'])
        if sorted_concept_dict_list:
            extended_data_dict[property] = [cd for cosine, cd in sorted_concept_dict_list]
            print(property, len(extended_data_dict[property]))
        else:
            print('no concepts in the model vocab: ', property)

        data_to_file(collection, extended_data_dict, target)
Example #26
def train_model(args):
    """Load the data, train the model, test the model, export / save the model
    """
    torch.manual_seed(args.seed)

    # Open our dataset
    train_loader, test_loader = data_utils.load_data(args.test_split,
                                                     args.batch_size)

    # Create the model
    net = model.SonarDNN().double()
    optimizer = optim.SGD(net.parameters(), lr=args.lr,
                          momentum=args.momentum, nesterov=False)

    # Train / Test the model
    for epoch in range(1, args.epochs + 1):
        train(net, train_loader, optimizer, epoch)
        test(net, test_loader)

    # Export the trained model
    torch.save(net.state_dict(), args.model_name)

    if args.model_dir:
        # Save the model to GCS
        data_utils.save_model(args.model_dir, args.model_name)
Example #27
def test(batch_size):
    # Note: load_data returns the full images (y) first and the blurred images (x) second
    y_test, x_test = data_utils.load_data(data_type='test')
    g = generator_model()
    g.load_weights('weight/generator_weights.h5')
    generated_images = g.predict(x=x_test, batch_size=batch_size)
    data_utils.generate_image(y_test, x_test, generated_images, 'result/finally/')
def train_model(args):
    """Load the data, train the model, test the model, export / save the model
    """
    torch.manual_seed(args.seed)

    # Open our dataset
    train_loader, test_loader = data_utils.load_data(args.test_split,
                                                     args.batch_size)

    # Create the model
    net = model.SonarDNN().double()
    optimizer = optim.SGD(net.parameters(),
                          lr=args.lr,
                          momentum=args.momentum,
                          nesterov=False)

    # Train / Test the model
    for epoch in range(1, args.epochs + 1):
        train(net, train_loader, optimizer, epoch)
        test(net, test_loader)

    # Export the trained model
    torch.save(net.state_dict(), args.model_name)

    if args.model_dir:
        # Save the model to GCS
        data_utils.save_model(args.model_dir, args.model_name)
def run_single_experiment(cfg, diffinit, seed, replace_index):
    t0 = time()
    # how we convert the cfg into a path and such is defined in ExperimentIdentifier
    exp = ExperimentIdentifier(seed=seed,
                               replace_index=replace_index,
                               diffinit=diffinit)
    exp.init_from_cfg(cfg)
    exp.ensure_directory_exists(verbose=True)
    path_stub = exp.path_stub()
    print('Running experiment with path', path_stub)
    # load data
    x_train, y_train, x_vali, y_vali, x_test, y_test = load_data(
        options=cfg['data'], replace_index=replace_index)
    # define model
    init_path = get_model_init_path(cfg, diffinit)
    model = build_model(**cfg['model'], init_path=init_path)
    # prep model for training
    prep_for_training(
        model,
        seed=seed,
        optimizer_settings=cfg['training']['optimization_algorithm'],
        task_type=cfg['model']['task_type'])
    # now train
    train_model(model,
                cfg['training'],
                cfg['logging'],
                x_train,
                y_train,
                x_vali,
                y_vali,
                path_stub=path_stub)
    # clean up
    del model
    clear_session()
    print('Finished after', time() - t0, 'seconds')
Example #30
def main():
    print 'loading train and test datas...'
    train, test, _ = data_utils.load_data()
    print 'train:', train.shape, ', test:', test.shape

    train_id = train['id']
    train_price_doc = train['price_doc']
    train.drop(['id', 'price_doc'], axis=1, inplace=True)
    test_id = test['id']
    test.drop(['id'], axis=1, inplace=True)

    # Merge the train and test sets
    conbined_data = pd.concat([train[test.columns.values], test])
    conbined_data.columns = test.columns.values

    # conbined_data = feature_distribute_scale(conbined_data)
    conbined_data = feature_discretization(conbined_data)

    train = conbined_data.iloc[:train.shape[0], :]
    test = conbined_data.iloc[train.shape[0]:, :]

    train['id'] = train_id
    train['price_doc'] = train_price_doc
    test['id'] = test_id
    print 'train:', train.shape, ', test:', test.shape
    print("Save data...")
    data_utils.save_data(train, test, _)
def main():
    print 'loading train and test datas...'
    train, test, _ = data_utils.load_data()
    print 'train:', train.shape, ', test:', test.shape

    train_id = train['id']
    train_price_doc = train['price_doc']
    train.drop(['id', 'price_doc'], axis=1, inplace=True)
    test_id = test['id']
    test.drop(['id'], axis=1, inplace=True)

    # Merge the train and test sets
    conbined_data = pd.concat([train[test.columns.values], test])
    conbined_data.columns = test.columns.values

    conbined_data = gen_area_features(conbined_data)
    conbined_data = gen_school_features(conbined_data)
    conbined_data = generate_hospital_features(conbined_data)
    conbined_data = generate_population_features(conbined_data)
    conbined_data = generate_population_age_features(conbined_data)
    conbined_data = generate_build_features(conbined_data)

    train = conbined_data.iloc[:train.shape[0], :]
    test = conbined_data.iloc[train.shape[0]:, :]

    train['id'] = train_id
    train['price_doc'] = train_price_doc
    test['id'] = test_id
    print 'train:', train.shape, ', test:', test.shape
    print("Save data...")
    data_utils.save_data(train, test, _)
def load_data(dataset_directory , dataset_name):
    print "Loading datasets ..."
    import os
    repo = os.environ.get('MLPYTHON_DATASET_REPO')
    if repo is None:
        raise ValueError('environment variable MLPYTHON_DATASET_REPO is not defined')
    dataset_dir = os.path.join(repo, dataset_directory, dataset_name)
    
    input_size = 6 
    spatial_dimensions = 1
    all_data = data_utils.load_data(dir_path=dataset_dir, input_size=input_size, train_filename=None, test_filename=None, background_filename=None,load_to_memory=False)

    train_data, train_metadata = all_data['train']
    valid_data, valid_metadata = all_data['valid']
    finaltrain_data, finaltrain_metadata = all_data['finaltrain']
    test_data, test_metadata = all_data['test']
    lbl = np.array([int(data[1]) for data in test_data])

    def reduce_dimensionality(mlproblem_data, mlproblem_metadata):
        mlproblem_metadata['input_size'] = 3  # we need to change the input size from 6 to 3. 
        return [mlproblem_data[0][:3] , mlproblem_data[1]]

    if spatial_dimensions ==1:      
        import mlpython.mlproblems.classification as mlpb
        trainset = mlpb.ClassificationProblem(train_data, train_metadata)
        validset = trainset.apply_on(valid_data,valid_metadata)
        finaltrainset = trainset.apply_on(finaltrain_data,finaltrain_metadata)
        testset = trainset.apply_on(test_data,test_metadata)

    elif spatial_dimensions ==0:
        import mlpython.mlproblems.generic as mlpg
        trainset = mlpg.PreprocessedProblem(data = train_data , metadata = train_metadata , preprocess = reduce_dimensionality)
        validset = trainset.apply_on(valid_data, valid_metadata)
        testset = trainset.apply_on(test_data, test_metadata)
        finaltrainset = trainset.apply_on(finaltrain_data, finaltrain_metadata)
        import mlpython.mlproblems.classification as mlpb
        trainset = mlpb.ClassificationProblem(trainset, trainset.metadata)
        validset = trainset.apply_on(validset,validset.metadata)
        finaltrainset = trainset.apply_on(finaltrainset,finaltrainset.metadata)
        testset = trainset.apply_on(testset,testset.metadata)

    return {'finaltrainset':finaltrainset, 'testset':testset ,'ground_truth':lbl}  
Example #33
def train_model(args):
    """Load the data, train the model, test the model, export / save the model
    """
    torch.manual_seed(args.seed)

    # Open our dataset
    train_loader, test_loader = data_utils.load_data(
        args.test_split, args.seed, args.batch_size)

    # Create the model
    net = model.SonarDNN().double()
    optimizer = optim.SGD(net.parameters(), lr=args.lr,
                          momentum=args.momentum, nesterov=False)

    # Train / Test the model
    latest_accuracy = 0.0
    for epoch in range(1, args.epochs + 1):
        train(net, train_loader, optimizer)
        latest_accuracy = test(net, test_loader)

    # The default name of the metric is training/hptuning/metric.
    # We recommend that you assign a custom name. The only functional
    # difference is that if you use a custom name, you must set the
    # hyperparameterMetricTag value in the HyperparameterSpec object in your
    # job request to match your chosen name.
    # https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#HyperparameterSpec
    hpt = hypertune.HyperTune()
    hpt.report_hyperparameter_tuning_metric(
        hyperparameter_metric_tag='my_accuracy_tag',
        metric_value=latest_accuracy,
        global_step=args.epochs)

    # Export the trained model
    torch.save(net.state_dict(), args.model_name)

    if args.job_dir:
        # Save the model to GCS
        data_utils.save_model(args.job_dir, args.model_name)
    else:
        print('Accuracy: {:.0f}%'.format(latest_accuracy))
    def __init__(self,
                 alpha,
                 batch_size,
                 n_epochs,
                 wordVecLen,
                 flag_dropout,
                 datapath,
                 random_seed,
                 dropoutRates,
                 optimizer,
                 dispFreq,
                 beam_size,
                 flag_random_lookup_table,
                 flag_toy_data,
                 size_hidden_layer,
                 dataset,
                 result_path,
                 sentence_modeling,
                 CNN_filter_length,
                 LSTM_go_backwards
                 ):
        model_options = locals().copy()
        model_options['rng'] = np.random.RandomState(random_seed)
        print 'Loading data'
        src_train,src_valid,src_test,dic_w2idx, dic_idx2w, dic_w2embed, dic_idx2embed, embedding = load_data(path=datapath)
        if flag_toy_data == True:
            src_valid = src_valid[:10]
            src_test = src_test[:10] 
            #src_train = copy.copy(src_valid)
            src_train = src_train[:10]
        elif flag_toy_data != False:
            valid_l = len(src_valid) * flag_toy_data
            test_l = len(src_test) * flag_toy_data
            train_l = len(src_train) * flag_toy_data
            src_valid = src_valid[:int(valid_l)]
            src_test = src_test[:int(test_l)] 
            src_train = src_train[:int(train_l)]
            
        train,pairdict_train = prepare_data(src_train)
        valid,pairdict_valid = prepare_data(src_valid)
        test,pairdict_test = prepare_data(src_test)
        model_options['embedding'] = embedding
        
        (sentence1,sentence1_mask,sentence2,sentence2_mask,y,cost,f_pred,tparams,f_debug) = build_model(model_options)
        #f_cost = theano.function([sentence1,sentence1_mask,sentence2,sentence2_mask,y], cost, name='f_cost')
    
        #grads = tensor.grad(theano.gradient.grad_clip(cost, -2.0, 2.0), wrt=tparams.values())
        grads = tensor.grad(theano.gradient.grad_clip(cost, -2.0, 2.0), wrt=tparams)
        # grads = tensor.grad(cost, wrt=tparams.values())
        #f_grad = theano.function([sentence1,sentence1_mask,sentence2,sentence2_mask,y], grads, name='f_grad')
    
        lr = tensor.scalar(name='lr')
        if model_options['optimizer'] == 'sgd': optimizer = sgd
        elif model_options['optimizer'] == 'rmsprop': optimizer = rmsprop
        else: optimizer = adadelta
        f_grad_shared, f_update = optimizer(lr, tparams, grads, sentence1,sentence1_mask,sentence2,sentence2_mask,y, cost)
        
        
        print 'Optimization'

        kf_valid = get_minibatches_idx(len(valid), model_options['batch_size'])
        kf_test = get_minibatches_idx(len(test), model_options['batch_size'])
    
        print "%d train examples" % len(train)
        print "%d valid examples" % len(valid)
        print "%d test examples" % len(test)
        sys.stdout.flush()
        
        
        best_validation_score = -np.inf
        best_iter = 0
        uidx = 0  # the number of update done
        for epoch in xrange(model_options['n_epochs']):
            print ('Training on %d epoch' % epoch)
            sys.stdout.flush()
            kf = get_minibatches_idx(len(train), batch_size, shuffle=True)
            start_time = time.time()
            samples_seen = 0
            for _, train_index in kf:
                uidx += 1
                batch_samples = [train[t] for t in train_index]
                samples_seen += len(batch_samples)
                #print batch_samples
                sentence1,sentence1_mask,sentence2,sentence2_mask,y = data_padding(batch_samples)
                #print sentence1,sentence1_mask,sentence2,sentence2_mask,y
                #print sentence1.shape,sentence1_mask.shape,sentence2.shape,sentence2_mask.shape,y.shape
                #o = f_debug(sentence1,sentence1_mask,sentence2,sentence2_mask,y)
                #print o
                #print o[0].shape,o[1].shape,o[2].shape,o[3].shape
                cost = f_grad_shared(sentence1,sentence1_mask,sentence2,sentence2_mask,y)
                f_update(model_options['alpha'])
                if np.isnan(cost) or np.isinf(cost):
                    print 'NaN detected'
                    return 1., 1., 1.

                if np.mod(uidx, dispFreq) == 0:
                    print 'Epoch ', epoch, 'Update ', uidx, 'Cost ', cost, 'Samples_seen ', samples_seen
                    sys.stdout.flush()
            print 'Epoch ', epoch, 'Update ', uidx, 'Cost ', cost, 'Samples_seen ', samples_seen
            sys.stdout.flush()
            '''
            if epoch % 5 == 0:
                kf_train = get_minibatches_idx(len(train), batch_size)
                print ('Train_score:')
                self.eva(f_pred, src_train, train, pairdict_train, kf_train, model_options)
                sys.stdout.flush()
            '''
            print ('Valid_score:')
            top1_res = self.eva(f_pred, src_valid, valid, pairdict_valid, kf_valid, model_options)
            self.save_result(model_options['result_path'] + 'dev.on.' + str(epoch) +'th_epoch_' + model_options['dataset'],top1_res)
            sys.stdout.flush()
            print ('Test_score:')
            top1_res = self.eva(f_pred, src_test, test, pairdict_test, kf_test, model_options)
            self.save_result(model_options['result_path'] + 'test.on.' + str(epoch) +'th_epoch_' + model_options['dataset'],top1_res)
            sys.stdout.flush()
            
            print ('%d epoch completed.' % epoch)
            sys.stdout.flush()
            '''
            if(best_validation_score < valid_score):
                best_iter = epoch
                best_validation_score = valid_score
            print ('Current best_dev_F is %.2f, at %d epoch'%(best_validation_score,best_iter))
            '''
        
            end_time = time.time()
            minu = int((end_time - start_time)/60)
            sec = (end_time - start_time) - 60 * minu
            print ('Time: %d min %.2f sec' % (minu, sec))
            sys.stdout.flush()
        print('Training completed!')
        sys.stdout.flush()
       
        
        paragraph = np.asarray(paragraph) - np.min(paragraph)
        paragraph = list(paragraph)
        for sentence in paragraph:
            fw.write(str(sentence))
            fw.write(' ')
        fw.write('#')
        for category in cur_categories:
            fw.write(category)
            fw.write(' ')
        fw.write('\n')
    fw.close()
    
    
dataset = 'cs'
datapath = '../data/%s.pkl.gz'%dataset
src_train,src_valid,src_test,dic_w2idx, dic_idx2w, dic_w2embed, dic_idx2embed, embedding = load_data(path=datapath)

res_order = []
res_eva = []
for paragraph, cur_categories in src_test:
    n = len(paragraph)
    candidates = [x for x in xrange(n)]
    guess_order = []
    for i in xrange(n):
        idx = np.random.randint(n - i)
        guess_order.append(candidates[idx])
        candidates.remove(candidates[idx])
    res_order.append((guess_order, cur_categories))
    partial_correct, total_correct = score_rank(guess_order)
    res_eva.append(np.asarray([partial_correct, total_correct]))
res_eva = np.asarray(res_eva)
str_ParamOptionValue = ""

dataset_dir = None
if dataset_dir is None:
    # Try to find dataset in MLPYTHON_DATASET_REPO
    import os
    repo = os.environ.get('MLPYTHON_DATASET_REPO')
    if repo is None:
        raise ValueError('environment variable MLPYTHON_DATASET_REPO is not defined')
    dataset_dir = os.path.join(os.environ.get('MLPYTHON_DATASET_REPO') + '/' + dataset_directory, dataset_name)


# Load data
start_time = time.clock()

all_data = data_utils.load_data(dir_path=dataset_dir, input_size=input_size, train_filename=train_filename, test_filename=test_filename, background_filename=background_filename,load_to_memory=False)

train_data, train_metadata = all_data['train']
valid_data, valid_metadata = all_data['valid']
finaltrain_data, finaltrain_metadata = all_data['finaltrain']
test_data, test_metadata = all_data['test']

import mlpython.mlproblems.classification as mlpb
trainset = mlpb.ClassificationProblem(train_data, train_metadata)
validset = trainset.apply_on(valid_data,valid_metadata)
finaltrainset = trainset.apply_on(finaltrain_data,finaltrain_metadata)
testset = trainset.apply_on(test_data,test_metadata)

def compute_error_mean_and_sterror(costs):
    classif_errors = np.hstack(costs)
    classif_mean = classif_errors.mean()
    kwargs = dict(
        dim=dim,
        fappend=fappend,
        df_min=df_min,
        df_max=df_max,
        confusion=confusion,
    )

    ############################################################
    # Extract method specific options.
    ############################################################
    # TODO: SVMs only.
    ############################################################

    # Load data.
    data = load_data(dataset)

    # Create object file names.
    fname_args = []
    if dim:
        fname_args.append(str(dim))
    fname_args.append(fappend)
    mdl_fname = make_fname(METHOD, model, dataset, 'mdl', 'pk', *fname_args)
    vec_fname = make_fname(METHOD, model, dataset, 'vec', 'pk', *fname_args)
    dim_fname = make_fname(METHOD, model, dataset, 'dim', 'pk', *fname_args)
    mdl_path = os.path.join(MODEL_HOME,mdl_fname)
    vec_path = os.path.join(MODEL_HOME,vec_fname)
    dim_path = os.path.join(MODEL_HOME,dim_fname)

    model_files_present = os.path.isfile(mdl_path) and os.path.isfile(vec_path)
    if dim:
Example #38
results_path = output_folder + '/libsvm_results/'
if not os.path.exists(results_path):
    os.makedirs(results_path)

#Factors = [1,1.5,2,2.5,3,3.5,4,4.5,5,6,7,8]
#Factors = [9,10,11,12,13,14,15,16,17,18,19,20]
Factors = [2000,1500,1000,900,800,700,600,500,400,300,200,100,75,50,40,30,20,10]
# measure the sensitivity of gamma for the selected brains and save the text file

brain_names = brain_list.keys()
results_file_c = 'libsvm_measures_factor.txt'
results_file_metadata = 'libsvm_measures_factor_metadata.txt'

for brain in brain_names[0:2]:
    dataset_dir = os.path.join(os.environ.get('MLPYTHON_DATASET_REPO') + '/' + dataset_directory, brain)
    all_data = data_utils.load_data(dataset_dir)
    test = all_data['test']
    fulltrain_backup = all_data['finaltrain']    
    resultc1, resultc2 = '' ,''           
    brain_str = brain + ' \n'
    
    for factor in Factors:
        resultc3 = ''
        dice_t = np.zeros((10))
        processed_timet = np.zeros((10))
        for nb in range(10):
            resultc3 = ''
            all_data = data_utils.data_reduction(fulltrain_backup , factor)
            all_data['test'] = test
            print len(fulltrain_backup[0])
            print len(all_data['finaltrain'][0])
def load_data(use_buckets=True):
    print(FLAGS.data_dir)
    if use_buckets:
        return data_utils.load_data(FLAGS, _buckets)
    else:
        return data_utils.load_data(FLAGS, None)
def train(**kwargs):
    """
    Train model

    Load the whole train data in memory for faster operations

    args: **kwargs (dict) keyword arguments that specify the model hyperparameters
    """

    # Roll out the parameters
    batch_size = kwargs["batch_size"]
    n_batch_per_epoch = kwargs["n_batch_per_epoch"]
    nb_epoch = kwargs["nb_epoch"]
    model_name = kwargs["model_name"]
    generator = kwargs["generator"]
    image_dim_ordering = kwargs["image_dim_ordering"]
    img_dim = kwargs["img_dim"]
    patch_size = kwargs["patch_size"]
    bn_mode = kwargs["bn_mode"]
    label_smoothing = kwargs["use_label_smoothing"]
    label_flipping = kwargs["label_flipping"]
    dset = kwargs["dset"]
    use_mbd = kwargs["use_mbd"]

    epoch_size = n_batch_per_epoch * batch_size

    # Setup environment (logging directory etc)
    general_utils.setup_logging(model_name)

    # Load and rescale data
    X_full_train, X_sketch_train, X_full_val, X_sketch_val = data_utils.load_data(dset, image_dim_ordering)
    img_dim = X_full_train.shape[-3:]

    # Get the number of non overlapping patch and the size of input image to the discriminator
    nb_patch, img_dim_disc = data_utils.get_nb_patch(img_dim, patch_size, image_dim_ordering)

    try:

        # Create optimizers
        opt_dcgan = Adam(lr=1E-3, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
        # opt_discriminator = SGD(lr=1E-3, momentum=0.9, nesterov=True)
        opt_discriminator = Adam(lr=1E-3, beta_1=0.9, beta_2=0.999, epsilon=1e-08)

        # Load generator model
        generator_model = models.load("generator_unet_%s" % generator,
                                      img_dim,
                                      nb_patch,
                                      bn_mode,
                                      use_mbd,
                                      batch_size)
        # Load discriminator model
        discriminator_model = models.load("DCGAN_discriminator",
                                          img_dim_disc,
                                          nb_patch,
                                          bn_mode,
                                          use_mbd,
                                          batch_size)

        generator_model.compile(loss='mae', optimizer=opt_discriminator)
        discriminator_model.trainable = False

        DCGAN_model = models.DCGAN(generator_model,
                                   discriminator_model,
                                   img_dim,
                                   patch_size,
                                   image_dim_ordering)

        loss = [l1_loss, 'binary_crossentropy']
        loss_weights = [1E1, 1]
        DCGAN_model.compile(loss=loss, loss_weights=loss_weights, optimizer=opt_dcgan)

        discriminator_model.trainable = True
        discriminator_model.compile(loss='binary_crossentropy', optimizer=opt_discriminator)

        gen_loss = 100
        disc_loss = 100

        # Start training
        print("Start training")
        for e in range(nb_epoch):
            # Initialize progbar and batch counter
            progbar = generic_utils.Progbar(epoch_size)
            batch_counter = 1
            start = time.time()

            for X_full_batch, X_sketch_batch in data_utils.gen_batch(X_full_train, X_sketch_train, batch_size):

                # Create a batch to feed the discriminator model
                X_disc, y_disc = data_utils.get_disc_batch(X_full_batch,
                                                           X_sketch_batch,
                                                           generator_model,
                                                           batch_counter,
                                                           patch_size,
                                                           image_dim_ordering,
                                                           label_smoothing=label_smoothing,
                                                           label_flipping=label_flipping)

                # Update the discriminator
                disc_loss = discriminator_model.train_on_batch(X_disc, y_disc)

                # Create a batch to feed the generator model
                X_gen_target, X_gen = next(data_utils.gen_batch(X_full_train, X_sketch_train, batch_size))
                y_gen = np.zeros((X_gen.shape[0], 2), dtype=np.uint8)
                y_gen[:, 1] = 1

                # Freeze the discriminator
                discriminator_model.trainable = False
                gen_loss = DCGAN_model.train_on_batch(X_gen, [X_gen_target, y_gen])
                # Unfreeze the discriminator
                discriminator_model.trainable = True

                batch_counter += 1
                progbar.add(batch_size, values=[("D logloss", disc_loss),
                                                ("G tot", gen_loss[0]),
                                                ("G L1", gen_loss[1]),
                                                ("G logloss", gen_loss[2])])

                # Save images for visualization
                if batch_counter % (n_batch_per_epoch / 2) == 0:
                    # Get new images from validation
                    data_utils.plot_generated_batch(X_full_batch, X_sketch_batch, generator_model,
                                                    batch_size, image_dim_ordering, "training")
                    X_full_batch, X_sketch_batch = next(data_utils.gen_batch(X_full_val, X_sketch_val, batch_size))
                    data_utils.plot_generated_batch(X_full_batch, X_sketch_batch, generator_model,
                                                    batch_size, image_dim_ordering, "validation")

                if batch_counter >= n_batch_per_epoch:
                    break

            print("")
            print('Epoch %s/%s, Time: %s' % (e + 1, nb_epoch, time.time() - start))

            if e % 5 == 0:
                gen_weights_path = os.path.join('../../models/%s/gen_weights_epoch%s.h5' % (model_name, e))
                generator_model.save_weights(gen_weights_path, overwrite=True)

                disc_weights_path = os.path.join('../../models/%s/disc_weights_epoch%s.h5' % (model_name, e))
                discriminator_model.save_weights(disc_weights_path, overwrite=True)

                DCGAN_weights_path = os.path.join('../../models/%s/DCGAN_weights_epoch%s.h5' % (model_name, e))
                DCGAN_model.save_weights(DCGAN_weights_path, overwrite=True)

    except KeyboardInterrupt:
        pass