Example #1
def train(args):

    device = args.device
    load_path = args.load_path
    # load data
    train_data = load_data('train')
    val_data = load_data('validation')

    # load model
    with tf.device('/gpu:%d' % device):
        model = get_model('policy')

    # trainer init
    optimizer = Config.optimizer
    train_step = optimizer.minimize(model.loss)

    # init session and server
    sess = tf.InteractiveSession()
    saver = tf.train.Saver()
    if load_path is None:
        sess.run(tf.global_variables_initializer())
    else:
        saver.restore(sess, load_path)
        print("Model restored from %s" % load_path)

    # accuracy
    pred = tf.reshape(model.pred, [-1, 9*10*16])
    label = tf.reshape(model.label, [-1, 9*10*16])
    correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(label,1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    logging.basicConfig(filename='log.txt', level=logging.DEBUG)
    # train steps
    for i in range(Config.n_epoch):

        # training step
        batch_data, batch_label = train_data.next_batch(Config.minibatch_size)

        input_dict = {model.label:batch_label}
        for var, data in zip(model.inputs, batch_data):
            input_dict[var]=data

        #from IPython import embed;embed()
        sess.run(train_step, feed_dict=input_dict)

        # evaluation step
        if (i+1)%Config.evalue_point == 0:
            batch_data, batch_label = val_data.next_batch(Config.minibatch_size)
            val_dict = {model.label:batch_label}
            for var, data in zip(model.inputs, batch_data):
                val_dict[var]=data
            score = accuracy.eval(feed_dict=val_dict)
            print("epoch %d, accuracy is %.2f" % (i,score))
            logging.info("epoch %d, accuracy is %.2f" % (i,score))

        # save step
        if (i+1)%Config.check_point == 0:
            save_path = saver.save(sess, "%s/epoch-%d" %(Config.save_path, i))
            print("Model saved in file: %s" % save_path)
            logging.info("Model saved in file: %s" % save_path)
Example #2
def main(args):
    model = utils.get_models(bert_config=args.bert_config,
                             pred_n_labels=args.pred_n_labels,
                             arg_n_labels=args.arg_n_labels,
                             n_arg_heads=args.n_arg_heads,
                             n_arg_layers=args.n_arg_layers,
                             pos_emb_dim=args.pos_emb_dim,
                             use_lstm=args.use_lstm,
                             device=args.device)
    if torch.cuda.is_available():
        map_location = lambda storage, loc: storage.cuda()
    else:
        map_location = 'cpu'
    model.load_state_dict(
        torch.load(args.model_path, map_location=map_location))
    model.zero_grad()
    model.eval()

    loader = load_data(data_path=args.test_data_path,
                       batch_size=args.batch_size,
                       tokenizer_config=args.bert_config,
                       train=False)
    start = time.time()
    extract(args, model, loader, args.save_path)
    print("TIME: ", time.time() - start)
    test_results = do_eval(args.save_path, args.test_gold_path)
    utils.print_results("TEST RESULT", test_results,
                        ["F1  ", "PREC", "REC ", "AUC "])
def main():
	'''
	# Use these three to get the data loaded, targets loaded, and the accessions stripped (Otherwise use dataset.py load_data())
	# get classifications
	type_dict = get_targets()

	# load data
	scores = get_data()
	
	# get arrays of scores and targets
	data, targets = get_arrays(type_dict, scores)
	'''

	data, targets = load_data()

	# tune model parameters
	#tune_model_parameters(data,targets)

	# get ROC curves
	#get_roc(data, targets)

	# get confusion matrix
	get_conf_matrix(data, targets)

	'''I WANT TO RE-RUN the ROC curves and the Confusion matrix data using predictions from a cross-validation rather than train/test_split'''
Example #4
    def _load_data(self, opts, seed):
        if opts['dataset'].lower() in ('mnist', 'fashion', 'cifar10', 'svhn'):
            (self.data,
             self.labels), (self.test_data,
                            self.test_labels) = load_data(opts['dataset'],
                                                          seed,
                                                          imbalance=True)
            if 'augment_x' in opts and opts['augment_x']:
                self.data, self.labels = self.oversampling(
                    opts, self.data, self.labels, seed)
            self.num_points = len(self.data)
        else:
            raise ValueError('Unknown %s' % opts['dataset'])

        self.class_counts = [
            np.count_nonzero(self.labels == c)
            for c in range(opts['n_classes'])
        ]
        print("[ statistic ]")
        print("Total train: ", self.num_points)
        print(self.class_counts)
        print("Total test: ", len(self.test_labels))
        print([
            np.count_nonzero(self.test_labels == c)
            for c in range(opts['n_classes'])
        ])
Example #5
def main(args):
    data_train, data_eval, concept2idx, tag2idx, word2concept = dataset.load_data(
    )
    m = args[0]
    model_map = {
        "baseline": models.Baseline,
        "word": models.Word,
        "tag": models.Tag,
        "concept": models.Concept,
        "tag_word": models.TagWord,
        "concept_tag": models.ConceptTag,
        "concept_word": models.ConceptWord,
        "concept_tag_word": models.ConceptTagWord
    }

    if m in model_map:
        lstm_dim, hidden_dim, epoch, batch, early_stop, patience = args[1:]
        model = model_map[m](concept2idx=concept2idx,
                             tag2idx=tag2idx,
                             word2concept=word2concept,
                             lstm_dim=int(lstm_dim),
                             hidden_dim=int(hidden_dim))
        model.train(data_train, int(epoch), int(batch), early_stop == "1",
                    int(patience))
        accuracy, preds = model.evaluate(data_eval)
        logging.info("Accuracy: {}".format(accuracy))
        log_result(
            data_eval, preds,
            "out/{}_{}_{}_{}.txt".format(m, lstm_dim, hidden_dim, accuracy))
Example #6
def train(dataset_dir, emb_file, epoch, batch_size):
    (train_data, test_data, text_field, label_field) = dataset.load_data(dataset_dir, emb_file)

    class_size = len(label_field.vocab)

    nbow = nbow_model.NBoW(class_size, text_field.vocab.vectors)
    nbow.train()

    optimizer = torch.optim.Adam(nbow.parameters())

    train_iter = Iterator(train_data, batch_size)
    for n in range(epoch):
        for batch in train_iter:
            optimizer.zero_grad()

            logit = nbow(batch.text.t())
            loss = F.cross_entropy(logit, batch.label)
            loss.backward()

            optimizer.step()

        nbow.eval()

        (accuracy, num_correct) = compute_accuracy(nbow, test_data)
        print('Epoch: {} Accuracy: {:.2f}% ({}/{})'.format(n + 1, accuracy * 100, num_correct, len(test_data)))

        nbow.train()
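Example #6 calls a compute_accuracy(nbow, test_data) helper that is not shown. A plausible sketch, assuming the same torchtext-style Iterator used above and batches that expose text and label tensors; this is an assumption, not the original helper:

import torch

def compute_accuracy(model, test_data, batch_size=64):
    # Hypothetical helper: count correct predictions over the whole test set.
    test_iter = Iterator(test_data, batch_size, train=False, sort=False)
    num_correct = 0
    with torch.no_grad():
        for batch in test_iter:
            logit = model(batch.text.t())
            num_correct += (logit.argmax(dim=1) == batch.label).sum().item()
    return num_correct / len(test_data), num_correct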
Example #7
def test():
    for epoch in range(args.epochs):
        # load the model
        output_path = args.output_path + str(epoch)
        args.model_path = os.path.join(output_path, "pytorch_model.bin")
        tokenizer, model = load_model(args)
        # load the test-set data
        data_path = args.data_path + str(epoch)
        test_data_path = os.path.join(data_path, 'test.csv')
        test_loader = load_data(tokenizer, args, test_data_path,
                                "test")  # 3263
        logger.info("Testing data has been loaded!")
        # get the test results
        running = Running(model, args)
        outputs = running.test(test_loader)
        # write out the predictions
        outputs_df = pd.read_csv(
            os.path.join(args.raw_data_path, "sample_submission.csv"))
        outputs_df['target_0'] = outputs[:, 0]
        outputs_df['target_1'] = outputs[:, 1]
        outputs_df['target_2'] = outputs[:, 2]
        outputs_df[['id', 'target_0', 'target_1',
                    'target_2']].to_csv(os.path.join(output_path, "sub.csv"),
                                        index=False)
        logger.info('sub ' + str(epoch) + ' has been written.')
Example #8
def reload_state(checkpoint, training_state, config, args):
    """
    Reload state when resuming training.
    """
    model, id_to_token, id_to_char = BidafModel.from_checkpoint(
        config['bidaf'], checkpoint)
    if torch.cuda.is_available() and args.cuda:
        model.cuda()
    model.train()

    optimizer = get_optimizer(model, config, training_state)

    token_to_id = {tok: id_ for id_, tok in id_to_token.items()}
    char_to_id = {char: id_ for id_, char in id_to_char.items()}

    len_tok_voc = len(token_to_id)
    len_char_voc = len(char_to_id)

    with open(args.data) as f_o:
        data, _ = load_data(json.load(f_o),
                            span_only=True, answered_only=True)
    limit_passage = config.get('training', {}).get('limit')
    data = tokenize_data(data, token_to_id, char_to_id, limit_passage)

    data = get_loader(data, config)

    assert len(token_to_id) == len_tok_voc
    assert len(char_to_id) == len_char_voc

    return model, id_to_token, id_to_char, optimizer, data
Example #9
def train_main(args):
    global loader_train, loader_val
    loader_train, loader_val, loader_test = load_data(train_bath_size=BATCH_SIZE, args=args,RANDOM_SEED=RANDOM_SEED, val_batch_size=BATCH_SIZE)

    device = set_device()
    setup_seed(RANDOM_SEED)  # random seed

    #model = googleNet()
    model = resnet18()
    #model = load_model(model, args.pretrained_model_path, device=device)
    model = nn.DataParallel(model)  # multi-GPU

    criterion = nn.CrossEntropyLoss()

    params = net_lr(model, FC_LR, NET_LR)

    if OPTIMIZER == 'adam':
        optimizer = torch.optim.Adam(params, betas=(0.9, 0.999), weight_decay=0, eps=1e-08)
    else:
        optimizer = torch.optim.SGD(params, momentum=MOMENTUM, nesterov=True,
                                    weight_decay=WEIGHT_DECAY)

    print(model)
    start_epoch = 0
    if Load_model:
        start_epoch = 25
        filepath = 'load_model_path'
        model = load_model(model, filepath, device=device)
        model = model.to(device=device)
        optimizer = load_optimizer(optimizer, filepath, device=device)

    train(model, optimizer, criterion, device=device, epochs=EPOCH, start=start_epoch)
Example #10
def main(_):
  # Import data
  data = load_data(FLAGS.dataset, one_hot=True, validation_size=10000)

  ####################################### DEFINE MODEL #######################################
  
  # x is a value that we'll input when TensorFlow is asked to run a computation
  # None means the dimension can be of any length (any number of MNIST images)
  x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS])
  
  # Variables are modifiable tensors that live in the graph of interacting operations
  # Typical to use this type for model parameters - initially set to 0 as they'll be learnt
  W = tf.Variable(tf.zeros([IMAGE_PIXELS, NUM_CLASSES]))
  b = tf.Variable(tf.zeros([NUM_CLASSES]))
  
  # First multiply x by W and then add bias before applying the softmax layer of a NN
  # y = tf.nn.softmax(tf.matmul(x, W) + b)
  
  # Remove softmax layer due to later instability of raw cross-entropy formulation
  y = tf.matmul(x, W) + b

  # Placeholder to input the correct answers
  y_ = tf.placeholder(tf.float32, [None, NUM_CLASSES])
  
  ######################################### TRAINING #########################################

  #### DEFINE LOSS AND OPTIMISER ####
  # The raw formulation of cross-entropy,
  #
  #   tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.nn.softmax(y)), reduction_indices=[1]))
  #
  # can be numerically unstable.
  #
  # So here we use tf.nn.softmax_cross_entropy_with_logits on the raw
  # outputs of 'y', and then average across the batch.
  # Internally computes the softmax activation
  cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=y_, logits=y))
  
  #### APPLY OPTIMISATION ####
  # In one line, compute gradients, compute parameter update steps and apply update steps to parameters
  train_step = tf.train.GradientDescentOptimizer(FLAGS.learn_rate).minimize(cross_entropy)

  # Launch model in an interactive session
  sess = tf.InteractiveSession()
  tf.global_variables_initializer().run()
  
  #### TRAINING ####
  # Stochastic GD as it's less expensive than using all available data for every training step
  for _ in range(1000):
    # Get a 'batch' of `batch_size` random data points from training set each loop iteration
    batch_xs, batch_ys = data.train.next_batch(FLAGS.batch_size)
    # Run train_step, feeding in the batch data to replace placeholders
    sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})

  ######################################### TESTING #########################################
  
  correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
  accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
  print(sess.run(accuracy, feed_dict={x: data.test.images, y_: data.test.labels}))
Example #11
def main():
    print('Loading data...')
    train, valid, test = load_data(args.dataset_path, valid_portion=0)

    train_data = RecSysDataset(train)
    valid_data = RecSysDataset(valid)
    test_data = RecSysDataset(test)
    train_loader = DataLoader(train_data,
                              batch_size=args.batch_size,
                              shuffle=True,
                              collate_fn=collate_fn)
    valid_loader = DataLoader(valid_data,
                              batch_size=args.batch_size,
                              shuffle=False,
                              collate_fn=collate_fn)
    test_loader = DataLoader(test_data,
                             batch_size=args.batch_size,
                             shuffle=False,
                             collate_fn=collate_fn)

    ## Load Transition Matrix
    M2 = pd.read_csv('datasets/transition/final2_transition_gowalla_narm.csv')
    M2 = M2.T[1:].T
    M2.index = M2.columns

    n_items = 38575  #  38575, 3271, 8487
    model = NARM(n_items, M2, args.hidden_size, args.embed_dim,
                 args.batch_size).to(device)

    optimizer = optim.Adam(model.parameters(), args.lr)
    criterion = nn.CrossEntropyLoss()
    scheduler = StepLR(optimizer, step_size=args.lr_dc_step, gamma=args.lr_dc)

    for epoch in tqdm(range(args.epoch)):
        # train for one epoch
        scheduler.step(epoch=epoch)
        trainForEpoch(train_loader,
                      model,
                      optimizer,
                      epoch,
                      args.epoch,
                      criterion,
                      log_aggr=512)

        recall10, mrr10, recall20, mrr20, recall50, mrr50 = validate(
            test_loader, model)
        print(
            'Epoch {} validation: Recall@{}: {:.4f}, MRR@{}: {:.4f}, Recall@{}: {:.4f}, MRR@{}: {:.4f}, Recall@{}: {:.4f}, MRR@{}: {:.4f}  \n'
            .format(epoch, 10, recall10, 10, mrr10, 20, recall20, 20, mrr20,
                    50, recall50, 50, mrr50))

        # store best loss and save a model checkpoint
        ckpt_dict = {
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }
        if epoch % 10 == 0:
            torch.save(ckpt_dict, 'latest_checkpoint.pth.tar')
Example #12
    def _kmeans(self, n, k, init='random', iteration=2, a=-9, b=9):

        _iter = 0

        #load data points
        _points = load_data(n)
        """
		Select the method for generating the k initial centers
		"""
        if init == 'geo_init':
            #generate centers by geometric initialisation method
            _centers = geometric_init(_points, k)
            print(_centers)
        else:
            #generate centers randomly
            _centers = generate_centers(a, b, n, k)
            print(_centers)

        #compute euclidean distance between data points and centers
        _distances = cdist(_points, _centers, 'euclidean')
        print(_distances)

        while _iter < iteration:
            _iter += 1
            intra = [0]  #variable for intra cluster criterion
            #create an ndarray to classify clusters
            _clusters = np.ones((k, n)) * 100

            for i in range(n):
                index_min = np.argmin(_distances[i])
                _clusters.put((index_min, i), index_min)
                _distances.put((i, index_min), 100)
            #compute the intra cluster criterion
            squared_distances = np.square(_distances)

            for i in range(k):
                # sum the squared distances of the points assigned to cluster i
                s = 0
                for j in range(n):
                    if _clusters[i][j] < 100:
                        s += squared_distances[j][i]
                intra.append(s)
            inter = np.sum(intra)

            #recompute centers
            _centers = [[]]
            for i in range(k):
                temp_list = [[]]
                for j in range(n):
                    if _clusters[i][j] < 100:
                        temp_list.append(_points[j])
                del temp_list[0]
                _centers.append(np.median(temp_list, axis=0))
            del _centers[0]

            #_iter+=1
            _distances = cdist(_points, _centers, 'euclidean')

        return inter, intra, _clusters
Example #13
def make_model():
    model = RandomForestClassifier(n_estimators=185)  # label_propagation.LabelSpreading(gamma=0.25, max_iter=5)
    sensor_data = dataset.load_data()
    X, y = sensor_data.data[:200], sensor_data.target[:200]
    model.fit(X, y)
    np.savetxt("X.csv", X, delimiter=",", fmt='%10.5f')
    np.savetxt("y_train.csv", y, delimiter=",", fmt='%10.1f')
    return model
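A short usage sketch for Example #13, assuming the same dataset.load_data() sensor data; the held-out slice and the evaluation below are illustrative, not part of the original:

# Hypothetical usage: fit on the first 200 samples (as in make_model), score the rest.
model = make_model()
sensor_data = dataset.load_data()
X_rest, y_rest = sensor_data.data[200:], sensor_data.target[200:]
print("held-out accuracy:", model.score(X_rest, y_rest))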
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('dir', help='Path to the dataset')

    args = parser.parse_args()

    X_train, Y_train, X_test, Y_test = dataset.load_data(args.dir)
    dataset.save_data(args.dir, X_train, Y_train, X_test, Y_test)
Example #15
def make_model3():
    model =  label_propagation.LabelSpreading(kernel='knn', n_neighbors=15)
    sensor_data = dataset.load_data()
    X, y = sensor_data.data[:200], sensor_data.target[:200]
    model.fit(X, y)
    np.savetxt("X.csv", X, delimiter=",", fmt='%10.5f')
    np.savetxt("y_train.csv", y, delimiter=",", fmt='%10.1f')
    return model
Example #16
def prepro_bow(name, filename, ngram, encoding='utf-8'):
    print('Process {} dataset...'.format(name))
    word_count = {}

    def link_words(samples):
        '''link sentence back
        return x, y
        '''
        x, y = list(), list()
        # line: (sentence, label)
        for line in samples:
            sentence = line[0]
            for w in sentence:
                if w in word_count:
                    word_count[w] += 1
                else:
                    word_count[w] = 0
        for line in samples:
            sentence = []
            for w in line[0]:
                if word_count[w] >= 20:
                    sentence.append(w)
            x.append(' '.join(sentence))
            y.append(line[1])
        vectorizer = CountVectorizer(ngram_range=(1, ngram), min_df=20)
        print(sum([1 if word_count[i] >= 20 else 0 for i in word_count]))
        x = vectorizer.fit_transform(x).toarray()
        print(len(vectorizer.get_feature_names()))
        return x, y

    # load dataset
    train_set, _ = load_data(os.path.join('dataset', 'raw', name,
                                          filename + '_train.txt'),
                             encoding=encoding)
    #dev_set, _ = load_data(os.path.join('dataset', 'raw', 'books', 'books.task.dev'))
    test_set, _ = load_data(os.path.join('dataset', 'raw', name,
                                         filename + '_test.txt'),
                            encoding=encoding)
    #train_set.extend(dev_set)
    train_len = len(train_set)
    train_set.extend(test_set)
    x_train, y_train = link_words(train_set)
    x_test, y_test = x_train[train_len:], y_train[train_len:]
    x_train, y_train = x_train[:train_len], y_train[:train_len]
    print('Process {} dataset done'.format(name))
    return x_train, y_train, x_test, y_test
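Example #16 returns dense bag-of-words arrays plus label lists; a hedged usage sketch feeding them to a linear classifier (the dataset name 'imdb' and matching file prefix are placeholders, not from the original):

from sklearn.linear_model import LogisticRegression

# Hypothetical call; prepro_bow expects dataset/raw/<name>/<filename>_{train,test}.txt to exist.
x_train, y_train, x_test, y_test = prepro_bow('imdb', 'imdb', ngram=2)
clf = LogisticRegression(max_iter=1000).fit(x_train, y_train)
print('test accuracy:', clf.score(x_test, y_test))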
Example #17
def main(_):
    model_path = 'models/' + FLAGS.name
    data = load_data(FLAGS.dataset, one_hot=True, validation_size=10000)

    # Define and instantiate VAE model
    if FLAGS.vae_type == 'vae':
        vae = VAE(network_architecture=network_architecture(
            FLAGS.vae_type, FLAGS.latent_dim),
                  batch_size=FLAGS.batch_size,
                  learn_rate=FLAGS.learn_rate)
    elif FLAGS.vae_type == 'conv':
        vae = ConvVAE(network_architecture=network_architecture(
            FLAGS.vae_type, FLAGS.latent_dim),
                      batch_size=FLAGS.batch_size,
                      learn_rate=FLAGS.learn_rate)
    else:
        raise ValueError(
            "Autoencoder type should be either conv or vae. Received: {}.".
            format(FLAGS.vae_type))

    with tf.Session() as sess:
        np.random.seed(FLAGS.seed)
        tf.set_random_seed(FLAGS.seed)

        saver = tf.train.Saver()
        saver.restore(sess, model_path)
        print("Model restored from: %s" % model_path)

        # Sample a test input and see how well the VAE can reconstruct these samples
        x_sample = data.test.next_batch(FLAGS.batch_size)[0]
        x_reconstruct = vae.reconstruct(sess, x_sample)

        plt.figure(figsize=(8, 12))
        for i in range(5):
            plt.subplot(5, 2, 2 * i + 1)
            plt.imshow(x_sample[i].reshape(IMAGE_SIZE, IMAGE_SIZE),
                       vmin=0,
                       vmax=1,
                       cmap='gray')
            plt.title("Test input")
            plt.colorbar()

            plt.subplot(5, 2, 2 * i + 2)
            plt.imshow(x_reconstruct[i].reshape(IMAGE_SIZE, IMAGE_SIZE),
                       vmin=0,
                       vmax=1,
                       cmap='gray')
            plt.title("Reconstruction")
            plt.colorbar()

        plt.tight_layout()
        plt.show()

        visualise_latent_space(sess, vae, data.test)

        if FLAGS.latent_dim == 2:
            plot_reconstructions(sess, vae, FLAGS.batch_size)
Example #18
def load_data(params: NakdimonParams):
    data = {}
    for stage_name, stage_dataset_filenames in params.corpus.items():
        np.random.seed(2)
        data[stage_name] = dataset.load_data(
            dataset.read_corpora(stage_dataset_filenames),
            validation_rate=params.validation_rate,
            maxlen=params.maxlen)
    return data
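A minimal usage sketch for Example #18. The function only needs an object exposing corpus, validation_rate and maxlen attributes, so a SimpleNamespace stands in here; the corpus paths and values are placeholders, not taken from the original:

from types import SimpleNamespace

# Hypothetical params; attribute names follow the accesses inside load_data above.
params = SimpleNamespace(
    corpus={'modern': ['hebrew_diacritized/modern']},  # placeholder corpus file list
    validation_rate=0.1,
    maxlen=80,
)
data = load_data(params)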
Example #19
def evaluate(split):
    print('Loading {} set...'.format(split))
    x, y_true = load_data(split)
    x = preprocess_input(x)
    y_pred = model.predict(x)
    y_pred = y_pred.argmax(axis=1)
    print("{} set statistics:".format(split))
    print("Top-1-accuracy: {:.4f}".format(np.mean(y_true == y_pred)))
    print(metrics.classification_report(y_true, y_pred))
Example #20
    def __init__(self, source_type="keras", name="mnist", **kwargs):

        # if source_type != "keras" and souce_type:
        #     raise NotImplementedError("temporarily only keras datasets available")
        # load train data

        flatten = config.global_config["network_type"] == "dense"
        self.X, self.y = load_data(source_type, name, flatten=flatten, **kwargs)

        self.input_shape = self.X[0].shape
        self.noutputs = self.y.shape[1]
Example #21
def train():
    # train for args.epochs rounds
    for epoch in range(args.epochs):
        # load the model
        tokenizer, model = load_model(args)
        # read the data
        data_path = args.data_path + str(epoch)
        train_data_path = os.path.join(data_path, 'train.csv')
        train_loader = load_data(tokenizer, args, train_data_path, "train")
        evaluate_data_path = os.path.join(data_path, 'dev.csv')
        evaluate_loader = load_data(tokenizer, args, evaluate_data_path,
                                    "evaluate")
        logger.info("Training data has been loaded!")
        # train
        running = Running(model, args)
        running.train(train_loader, evaluate_loader, epoch)
        # free GPU memory
        torch.cuda.empty_cache()
        # garbage collection
        gc.collect()
Example #22
def ts_method(model):
    sensor_data = dataset.load_data()
    X, y = sensor_data.data[4000:], sensor_data.target[4000:]

    predicted_labels = model.predict(X)
    # predicted_labels = [1 if x > 0.5 else 0 for x in p[:, 1]]
    cm = confusion_matrix(y, predicted_labels,
                          labels=model.classes_)

    print('&' * 70)
    print(cm)
    print(classification_report(y, predicted_labels))
Example #23
def train_script(input_path, output_dir):
    output_path = output_dir + "train_intermediate.csv"
    gather_translations(input_path, output_path)
    df, encode_dict, nb_class, weight_list = load_data(input_path=output_path,
                                                       weight=True)
    json.dump(encode_dict, open(output_dir + "mapping.json", "w"))
    train(df,
          nb_class,
          validation=True,
          output_model_file=output_dir + 'pytorch_beto_news.bin',
          output_vocab_file=output_dir + 'vocab_beto_news.bin',
          weight_list=weight_list)
Example #24
    def __init__(self, config, **opt):
        # Load config used for training and merge with testing options
        self.config = yaml.load(open(config, "r"))
        self.config = Namespace(**{**self.config, **opt})

        # Load training data.pkl for src and tgt vocabs
        self.data = load_data(self.config)

        # Load trained model checkpoints
        device, devices_ids = misc_utils.set_cuda(self.config)
        self.model, _ = build_model(None, self.config, device)
        self.model.eval()
Example #25
def Main(args):
    if not os.path.exists(args.data_dir):
        os.makedirs(args.data_dir)

    if args.do_train or args.do_eval or args.split_dataset or args.create_submission_file:
        articles, ref_articles_id, ref_span_starts, ref_span_ends, labels = load_data(
            args.train_data_folder, args.labels_path)
        train_file_path = os.path.join(args.data_dir, args.train_file)
        dev_file_path = os.path.join(args.data_dir, args.dev_file)
        if not os.path.exists(train_file_path) or not os.path.exists(
                dev_file_path) or args.overwrite_cache:
            logger.info("Creating train/dev files: %s, %s", train_file_path,
                        dev_file_path)
            get_train_dev_files(articles, ref_articles_id, ref_span_starts,
                                ref_span_ends, labels, train_file_path,
                                dev_file_path, args.split_by_ids,
                                args.dev_size, args.random_state, args.balance,
                                args.shuffle)

    if args.do_predict or args.create_submission_file or args.eval_submission:
        test_file_path = os.path.join(args.data_dir, args.test_file)
        test_articles, test_articles_id, test_span_starts, test_span_ends, test_labels = load_data(
            args.test_data_folder, args.test_template_labels_path)
        if not os.path.exists(test_file_path) or args.overwrite_cache:
            logger.info("Creating roberta-type test file: %s", test_file_path)
            get_test_file(test_articles, test_articles_id, test_span_starts,
                          test_span_ends, test_labels, test_file_path)

    if args.do_train or args.do_eval or args.do_predict:
        transformers_clf(args)

    if args.create_submission_file:
        if not os.path.exists('results'):
            os.makedirs('results')
        output_file = os.path.join('results', args.output_file)
        logger.info("Creating the submission file: %s", output_file)
        create_submission_file(args.predicted_logits_files, train_file_path,
                               dev_file_path, test_file_path, test_articles_id,
                               test_span_starts, test_span_ends, output_file,
                               args.weights, args.data_dir)

    if args.eval_submission:
        output_file = os.path.join('results', args.output_file)
        logger.info("Evaluating the submission file: %s", output_file)
        if args.test_labels_path is None:
            acc, f1 = eval_submission(output_file, test_file_path)
            logger.info('accuracy: %f', acc)
            print('f1-macro:', f1)
        else:
            cmd = "python tools/task-TC_scorer.py -s {} -r {} -p {}".format(
                output_file, args.test_labels_path,
                args.propaganda_techniques_file)
            subprocess.run(cmd, shell=True)
Example #26
    def __init__(self, model_folder, checkpoint_file):
        sys.path.append(model_folder)

        from model import get_model
        from dataset import load_data

        self.dataset = load_data('validation')

        self.sess = tf.InteractiveSession()
        self.model = get_model('policy')

        saver = tf.train.Saver()
        saver.restore(self.sess, checkpoint_file)
Example #27
def init_state(logger, config, args):
    logger.log('Loading data...')

    with open(args.data) as f_o:
        data, _ = load_data(args.data)
    
    limit_passage = config.get('training', {}).get('limit')
    vocab_size = config.get('training', {}).get('vocab_size', None)

    logger.log('Tokenizing data...')
    data, token_to_id, char_to_id = tokenize_data(logger, data, vocab_size, True, limit_passage)
    data = get_loader(data, config)

    id_to_token = {id_: tok for tok, id_ in token_to_id.items()}
    id_to_char = {id_: char for char, id_ in char_to_id.items()}

    assert(token_to_id[C.SOS_TOKEN] == C.SOS_INDEX)
    assert(token_to_id[C.UNK_TOKEN] == C.UNK_INDEX)
    assert(token_to_id[C.EOS_TOKEN] == C.EOS_INDEX)
    assert(token_to_id[C.PAD_TOKEN] == C.PAD_INDEX)

    logger.log('Creating model...')
    model = BidafModel.from_config(config['bidaf'], id_to_token, id_to_char)

    if args.word_rep:
        logger.log('Loading pre-trained embeddings...')
        with open(args.word_rep) as f_o:
            pre_trained = SymbolEmbSourceText(
                    f_o,
                    set(tok for id_, tok in id_to_token.items() if id_ != 0))
        mean, cov = pre_trained.get_norm_stats(args.use_covariance)
        rng = np.random.RandomState(2)
        oovs = SymbolEmbSourceNorm(mean, cov, rng, args.use_covariance)

        model.embedder.embeddings[0].embeddings.weight.data = torch.from_numpy(
            symbol_injection(
                id_to_token, 0,
                model.embedder.embeddings[0].embeddings.weight.data.numpy(),
                pre_trained, oovs))
    else:
        pass  # No pretraining, just keep the random values.

    # Char embeddings are already random, so we don't need to update them.

    if torch.cuda.is_available() and args.cuda:
        model.cuda()

    model.train()

    optimizer = get_optimizer(model, config, state=None)
    return model, id_to_token, id_to_char, optimizer, data
Example #28
    def __init__(self, data_src, seed, batch_size=32, dataset='MNIST'):
        self.batch_size = batch_size
        self.data_src = data_src

        # Load data
        ((x, y), (x_test, y_test)) = load_data(dataset,
                                               seed=seed,
                                               imbalance=True)  # tf.keras.datasets.cifar10.load_data()

        if self.data_src == self.TRAIN:
            self.dataset_x = x
            self.dataset_y = y
        else:
            self.dataset_x = x_test
            self.dataset_y = y_test

        # Arrange x: channel first
        self.dataset_x = np.transpose(self.dataset_x, axes=(0, 3, 1, 2))

        # Normalize between -1 and 1
        # self.dataset_x = self.dataset_x / 255 - 0.5

        # Y 1D format
        # self.dataset_y = self.dataset_y[:, 0]

        assert (self.dataset_x.shape[0] == self.dataset_y.shape[0])

        # Compute per class instance count.
        classes = np.unique(self.dataset_y)
        self.classes = classes
        per_class_count = list()
        for c in classes:
            per_class_count.append(np.sum(np.array(self.dataset_y == c)))

        # Recount after pruning
        per_class_count = list()
        for c in classes:
            per_class_count.append(np.sum(np.array(self.dataset_y == c)))
        self.per_class_count = per_class_count

        # List of labels
        self.label_table = [str(c) for c in range(len(self.classes))]

        # Preload all the labels.
        self.labels = self.dataset_y[:]

        # per class ids
        self.per_class_ids = dict()
        ids = np.array(range(len(self.dataset_x)))
        for c in classes:
            self.per_class_ids[c] = ids[self.labels == c]
def main():
    model = Net()

    # print(sum(p.numel() for p in model.parameters() if p.requires_grad))

    image_fnames, data_fnames = dataset.find_images()
    images, landmarks_2d, landmarks_3d = dataset.load_data(
        image_fnames, data_fnames)
    dataset.augment_flip(images, landmarks_2d, landmarks_3d)
    images = np.array(images)
    landmarks_2d = np.array(landmarks_2d)
    landmarks_3d = np.array(landmarks_3d)

    X_train, X_val, Y_train, Y_val = train_test_split(images,
                                                      landmarks_2d,
                                                      train_size=0.8,
                                                      test_size=0.2)

    from torch.utils.data import DataLoader, TensorDataset

    BATCH_SIZE = 20

    train_dataset = TensorDataset(torch.tensor(X_train), torch.tensor(Y_train))
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=BATCH_SIZE,
                                  shuffle=True)

    valid_dataset = TensorDataset(torch.tensor(X_val), torch.tensor(Y_val))
    valid_dataloader = DataLoader(valid_dataset,
                                  batch_size=BATCH_SIZE,
                                  shuffle=False)

    # defining the optimizer
    optimizer = optim.Adam(model.parameters(), lr=0.07)
    # defining the loss function
    criterion = nn.MSELoss()
    # checking if GPU is available
    # if torch.cuda.is_available():
    #     model = model.cuda()
    #     criterion = criterion.cuda()

    n_epochs = 5
    # empty list to store training losses
    train_losses = []
    # empty list to store validation losses
    val_losses = []
    # training the model
    for epoch in range(n_epochs):
        train(model, optimizer, criterion, epoch, train_dataloader,
              valid_dataloader, train_losses, val_losses)
Example #30
def load_data(partition, xtokenizer, char_vocab, tag_vocab, feats_vocab):
    logging.info('Loading data')
    tensor_data = {}
    data = dataset.load_data(partition, xtokenizer, char_vocab, tag_vocab, feats_vocab)
    for part in partition:
        token_data, token_char_data, form_token_char_data, lemma_token_char_data, morph_token_data = data[part]
        token_data = torch.tensor(token_data, dtype=torch.long)
        token_char_data = torch.tensor(token_char_data, dtype=torch.long)
        token_form_char_data = torch.tensor(form_token_char_data, dtype=torch.long)
        token_lemma_char_data = torch.tensor(lemma_token_char_data, dtype=torch.long)
        token_morph_data = torch.tensor(morph_token_data, dtype=torch.long)
        ds = TensorDataset(token_data, token_char_data, token_form_char_data, token_lemma_char_data, token_morph_data)
        tensor_data[part] = ds
    return tensor_data
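Example #30 returns a dict of TensorDatasets keyed by partition name. A common follow-up step (an assumption, not shown in the original) is to wrap each partition in a DataLoader; the partition spec, tokenizer and vocabularies are assumed to come from the surrounding project:

from torch.utils.data import DataLoader

# Hypothetical follow-up: one loader per partition, shuffling only the training split.
tensor_data = load_data(partition, xtokenizer, char_vocab, tag_vocab, feats_vocab)
loaders = {part: DataLoader(ds, batch_size=32, shuffle=(part == 'train'))
           for part, ds in tensor_data.items()}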
Example #31
def main():
    print('Loading data...')
    train, valid, test = load_data(args.dataset_path, valid_portion=args.valid_portion)

    train_data = RecSysDataset(train)
    valid_data = RecSysDataset(valid)
    test_data = RecSysDataset(test)
    train_loader = DataLoader(train_data, batch_size=args.batch_size, shuffle=True, collate_fn=collate_fn)
    valid_loader = DataLoader(valid_data, batch_size=args.batch_size, shuffle=False, collate_fn=collate_fn)
    test_loader = DataLoader(test_data, batch_size=args.batch_size, shuffle=False, collate_fn=collate_fn)

    if args.dataset_path.split('/')[-2] == 'diginetica':
        n_items = 43098
    elif args.dataset_path.split('/')[-2] in ['yoochoose1_64', 'yoochoose1_4']:
        n_items = 37484
    else:
        raise Exception('Unknown Dataset!')
    model = DPAN(n_items, args.hidden_size, args.embed_dim, args.batch_size, args.alpha_pool, args.beta_pool).to(device)

    if args.test:
        ckpt = torch.load('latest_checkpoint.pth.tar')
        model.load_state_dict(ckpt['state_dict'])
        recall, mrr = validate(test_loader, model)
        print("Test: Recall@{}: {:.4f}, MRR@{}: {:.4f}".format(args.topk, recall, args.topk, mrr))
        return

    optimizer = optim.Adam(model.parameters(), args.lr)
    criterion = nn.CrossEntropyLoss()
    scheduler = StepLR(optimizer, step_size=args.lr_dc_step, gamma=args.lr_dc)
    for epoch in tqdm(range(args.epoch)):
        # train for one epoch
        scheduler.step(epoch=epoch)
        begin_time = time.time()
        trainForEpoch(train_loader, model, optimizer, epoch, args.epoch, criterion, log_aggr=200)
        end_time = time.time()
        run_time = end_time - begin_time
        print('Epoch {} running time: {:.4f}s\n'.format(epoch, run_time))
        recall, mrr = validate(valid_loader, model)
        print('Epoch {} validation: Recall@{}: {:.4f}, MRR@{}: {:.4f} \n'.format(epoch, args.topk, recall, args.topk,
                                                                                 mrr))

        # store best loss and save a model checkpoint
        ckpt_dict = {
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }

        torch.save(ckpt_dict, 'latest_checkpoint.pth.tar')
Example #32
def model():
	"""TODO MLmodel
	"""
	from sklearn.linear_model import LogisticRegression
	from sklearn.metrics import accuracy_score


	data = dataset.load_data()
	print("all samples= ",data.shape)

	print("dataY contains:", np.unique(data[:,1]))

	data = pd.DataFrame(data, columns=['domain', 'label'])
	data = data.drop_duplicates(subset='domain')
	data = np.array(data)

	trainX = data[:30000,0]
	trainY = data[:30000,1].astype(int) 
	testX = data[30000:30500, 0]
	testY = data[30000:30500,1].astype(int)

	#print(trainX)
	print("trainY contains: ", np.unique(trainY))
	#print(testX)
	print("testY contains: ", np.unique(testY))

	feature_table = get_feature(trainX)

	LR = LogisticRegression()
	LR = LR.fit(feature_table,trainY)
	pred = LR.predict(get_feature(testX))
	acc = accuracy_score(testY, pred)
	print("acc stage 1: ", acc)

	joblib.dump(LR, './models/LR.pkl')
	algorithm_domains = dataset.load_simple_data()
	algorithm_domains = list(set(algorithm_domains))
	algorithm_y = [0]*len(algorithm_domains)

	pred_feature = get_feature(algorithm_domains)
	pred = LR.predict(pred_feature)

	acc = accuracy_score(algorithm_y, pred)
	print("acc stage 2: ", acc)



#if __name__ == '__main__':
#	model()
Example #33
def make_model2():
    sensor_data = dataset.load_data()
    rng = np.random.RandomState(0)
    indices = np.arange(len(sensor_data.data))
    rng.shuffle(indices)
    print(len(sensor_data.data))
    sm = SMOTE(random_state=42)
    X, y  = sm.fit_sample(sensor_data.data[indices[:2000]], sensor_data.target[indices[:2000]])

    n_total_samples = len(y)
    print(len(y))
    n_labeled_points = 200
    max_iterations = 50
    unlabeled_indices = np.arange(n_total_samples)[n_labeled_points:]
    lp_model = label_propagation.LabelSpreading(kernel='knn', n_neighbors=15)

    for i in range(max_iterations):
        if len(unlabeled_indices) == 0:
            print("No unlabeled items left to label.")
            break
        y_train = np.copy(y)
        y_train[unlabeled_indices] = -1
        lp_model.fit(X, y_train)
        p = lp_model.predict_proba(X[unlabeled_indices])
        # predicted_labels = [1 if x > 0.57 else 0 for x in p[:, 1]]
        predicted_labels = lp_model.predict(X[unlabeled_indices])

        true_labels = y[unlabeled_indices]
        # print("#"*20 + "Iteration :: " + str(i) + "#"*20)
        # print(classification_report(true_labels, predicted_labels))

        pred_entropies = stats.distributions.entropy(
            lp_model.label_distributions_.T)
        uncertainty_index = np.argsort(pred_entropies)[::-1]
        uncertainty_index = uncertainty_index[
                                np.in1d(uncertainty_index, unlabeled_indices)][:40]
        delete_indices = np.array([])
        for index in uncertainty_index:
            delete_index, = np.where(unlabeled_indices == index)
            delete_indices = np.concatenate((delete_indices, delete_index))
        unlabeled_indices = np.delete(unlabeled_indices, delete_indices)
        n_labeled_points += len(uncertainty_index)




    np.savetxt("X.csv", X, delimiter=",", fmt='%10.5f')
    np.savetxt("y_train.csv", y_train, delimiter=",", fmt='%10.1f')
    return lp_model
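Example #33's active-learning loop returns the fitted LabelSpreading model. A hedged usage sketch scoring fresh sensor readings with it (the slice indices below are illustrative):

# Hypothetical usage: label new sensor readings with the semi-supervised model.
lp_model = make_model2()
sensor_data = dataset.load_data()
X_new = sensor_data.data[2000:2500]
print(lp_model.predict(X_new)[:20])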
Example #34
def load_and_run_all_models(config, emails_list_file, emails_data_base_dir):
    emails = get_emails(emails_list_file)
    total_err = 0.0
    for mail in emails:
        X_data, Y_data     = dataset.load_data(mail, emails_data_base_dir)
        model_path         = os.path.join(config.save_model_dir, mail, "full")
        model_object_path  = os.path.join(model_path, "model.json")
        model_weights_path = os.path.join(model_path, "weights.h5")
        threshold          = get_threshold(config.thresholds_dir, mail)

        err = load_and_run_model(config, model_object_path, model_weights_path,
                                            threshold, X_data, Y_data)
        print("Error for %s with threshold %f: %f" % (mail, threshold, err))
        total_err = (total_err + err) / 2

    print("Total error:", total_err)
Example #35
def init_state(config, args):
    token_to_id = {'': 0}
    char_to_id = {'': 0}
    print('Loading data...')
    with open(args.data) as f_o:
        data, _ = load_data(json.load(f_o), span_only=True, answered_only=True)
    print('Tokenizing data...')
    data = tokenize_data(data, token_to_id, char_to_id)
    data = get_loader(data, config)

    id_to_token = {id_: tok for tok, id_ in token_to_id.items()}
    id_to_char = {id_: char for char, id_ in char_to_id.items()}

    print('Creating model...')
    model = BidafModel.from_config(config['bidaf'], id_to_token, id_to_char)

    if args.word_rep:
        print('Loading pre-trained embeddings...')
        with open(args.word_rep) as f_o:
            pre_trained = SymbolEmbSourceText(
                    f_o,
                    set(tok for id_, tok in id_to_token.items() if id_ != 0))
        mean, cov = pre_trained.get_norm_stats(args.use_covariance)
        rng = np.random.RandomState(2)
        oovs = SymbolEmbSourceNorm(mean, cov, rng, args.use_covariance)

        model.embedder.embeddings[0].embeddings.weight.data = torch.from_numpy(
            symbol_injection(
                id_to_token, 0,
                model.embedder.embeddings[0].embeddings.weight.data.numpy(),
                pre_trained, oovs))
    else:
        pass  # No pretraining, just keep the random values.

    # Char embeddings are already random, so we don't need to update them.

    if torch.cuda.is_available() and args.cuda:
        model.cuda()
    model.train()

    optimizer = get_optimizer(model, config, state=None)
    return model, id_to_token, id_to_char, optimizer, data
Example #36
    def fit(self):
        """
        Computes the holdout and enqueues the job to the Cluster queue
        :param X:
        :param y:
        :return:
        """
        X, y = load_data(dataset=self.dataset, data_home=self.data_home)
        holdout = check_holdout(self.holdout, X, y, classifier=True)
        params = {'dataset': self.dataset,
                  'data_home': self.data_home,
                  'holdout': holdout,
                  'estimator': self.estimator,
                  'parameters': ParameterGrid(self.param_grid),
                  'scorer': self.scorer,
                  'verbose': self.verbose,
                  'fit_params': self.fit_params,
                  'error_score': self.error_score,
                  'split': holdout}
        self.queue.enqueue(params, 1)
Example #37
def train_and_evaluate(config, emails_list_file, emails_data_base_dir):
    emails = get_emails(emails_list_file)
    totalFP = 0
    totalFN = 0
    totalTP = 0
    totalTN = 0
    with open(config.results_file, "w+") as f:
        f.write("")
    for mail in emails:

        print('Loading data...')
        X_data, Y_data = dataset.load_data(mail, emails_data_base_dir)

        falsePositives = 0
        falseNegatives = 0
        truePositives  = 0
        trueNegatives  = 0
        predictions = []
        shouldBe = []
        for i in range(len(X_data)):
            tmp       = X_data[i]
            X_data[i] = X_data[0]
            X_data[0] = tmp

            tmp       = Y_data[i]
            Y_data[i] = Y_data[0]
            Y_data[0] = tmp

            X_train = X_data[1:]
            y_train = Y_data[1:]
            X_test  = X_data[0:1]
            y_test  = Y_data[0:1]

            print(len(X_train), 'train sequences')

            print("Pad sequences (samples x time)")
            X_train = sequence.pad_sequences(X_train, maxlen=config.max_seq_len)
            X_test = sequence.pad_sequences(X_test, maxlen=config.max_seq_len)
            X_train, X_test = config.additional_data_transform(X_train, X_test)
            max_value = max(X_train.max(), X_test.max()) + 1
            print('X_train shape:', X_train.shape)
            print('X_test shape:', X_test.shape)

            print('Build model...')
            model = config.build_model(max_value)
            model.compile(loss=config.loss_function,
                          optimizer=config.optimizer,
                          class_mode=config.class_mode)

            print("Train...")
            model.fit(X_train, y_train, batch_size=config.batch_size, nb_epoch=config.epochs,
                      validation_data=(X_test, y_test), show_accuracy=True)
            score, acc = model.evaluate(X_test, y_test,
                                        batch_size=config.batch_size,
                                        show_accuracy=True)

            prediction = model.predict(X_test)
            predicted_class = round(abs(float(prediction)))
            print("Model prediction:", prediction, "Should be:", y_test)
            print('Test score:', score)
            print('Test accuracy:', acc)

            predictions.append(float(prediction))
            shouldBe.append(y_test[0])

            if(predicted_class == 0 and y_test[0] == 1):
                falseNegatives += 1
            elif(predicted_class == 1 and y_test[0] == 0):
                falsePositives += 1
            elif(predicted_class == 1 and y_test[0] == 1):
                truePositives += 1
            elif(predicted_class == 0 and y_test[0] == 0):
                trueNegatives += 1

            tmp       = X_data[i]
            X_data[i] = X_data[0]
            X_data[0] = tmp

            tmp       = Y_data[i]
            Y_data[i] = Y_data[0]
            Y_data[0] = tmp

            totalFP += falsePositives
            totalFN += falseNegatives
            totalTP += truePositives
            totalTN += trueNegatives

            model_path         = os.path.join(config.save_model_dir, mail, str(i))
            model_object_path  = os.path.join(model_path, "model.json")
            model_weights_path = os.path.join(model_path, "weights.h5")

            save_model(model, model_object_path, model_weights_path)

        train_on_full_data(config, mail, X_data, Y_data)

        result_string = "\n".join(["For: %s FP: %d FN: %d TP: %d TN: %d",
                                    "Predictions: %s", "Should be: %s"])
        result_string = result_string % (mail, falsePositives, falseNegatives,
                        truePositives, trueNegatives, str(predictions),
                        str(shouldBe))
        appendResults(result_string, config.results_file)
        print(result_string)


    result_string = "TotalFP: " + str(totalFP) + " TotalFN: " + str(totalFN) + \
                    " TotalTP: " + str(totalTP) + " TotalTN: " + str(totalTN)
    appendResults(result_string, config.results_file)
    print(result_string)
    gather_results.from_file(config.results_file, config.out_results_dir)
    print("Looking for FP-free threshold...")
    compute_thresholds(config.out_results_dir, config.thresholds_dir, emails_list_file)
Example #38
    for k in range(len(args)):
        mat[:,:,k] = mat_temp[:,:,args[k]]
    return mat

def shuffle_args():
    args = []
    for i in range(5):
        a = [0,1]
        random.shuffle(a)
        args.append(a)
    seq = [0,1,2,3,4]
    random.shuffle(seq)
    args.append(seq)
    return args


if __name__ == '__main__':

    from dataset import load_data
    dataset = load_data('validation')
    data, label= dataset.next_batch(1000)

    for selfpos, emypos, selfmove, emymove, selfprot, emyprot in zip(*data):
        new_selfmove, new_emymove, new_selfprot, new_emyprot = gentensor(selfpos, emypos)
        #from IPython import embed; embed()
        assert all( (selfmove.sum(axis=2) == new_selfmove.sum(axis=2)).reshape(-1))
        assert all( (emymove.sum(axis=2) == new_emymove.sum(axis=2)).reshape(-1))
        assert all( (selfprot.sum(axis=2) == new_selfprot.sum(axis=2)).reshape(-1))
        assert all( (emyprot.sum(axis=2) == new_emyprot.sum(axis=2)).reshape(-1))

from sklearn.externals import joblib
from HOG import HOG
import dataset
import argparse

''' Set up the argument parser which will get the CSV file and location where model 
is to be stored'''

argparser = argparse.ArgumentParser()
argparser.add_argument("-d", "--dataset", required = True,
        help = "path to the dataset file")
argparser.add_argument("-m", "--model", required = True,
        help = "path to where the model will be stored")
args = vars(argparser.parse_args())

(digits, labels) = dataset.load_data(args["dataset"])

hog = HOG(orientations = 18, pixelsPerCell = (10, 10), cellsPerBlock = (1, 1), normalise = True)

data = []
# Add histogram for each digit in a list 
for digit in digits:
    digit = dataset.deskew(digit) 
    hist = hog.describe(digit.reshape((28,28)))
    data.append(hist)

# Set up and train the model
SVC_model = LinearSVC()
SVC_model.fit(data, labels)

# Save the model to file
joblib.dump(SVC_model, args["model"])
from sklearn import datasets

from sklearn.cross_validation import train_test_split

from sklearn.grid_search import GridSearchCV

from sklearn.metrics import classification_report

import matplotlib.pyplot as plt

import numpy as np
from sklearn.utils import shuffle

#This is where your clf will be (model)

scores, targets, acc = load_data()

'''

# Loading the Digits dataset

digits = datasets.load_digits()



# To apply an classifier on this data, we need to flatten the image, to

# turn the data in a (samples, feature) matrix:

n_samples = len(digits.images)
Example #41
    stats.register("std", np.std)
    stats.register("min", np.min)
    stats.register("max", np.max)
    
    pop, log = alg.myEAMuCommaLambda(pop, start_gen, toolbox, Config.MU, Config.LAMBDA,
                                     cxpb=0.6, mutpb=0.2, ngen=Config.ngen,
                                     stats=stats, halloffame=hof, logbook=logbook, verbose=True,
                                     id=id)

    return pop, log, hof


if __name__ == "__main__":

    # load the whole data 
    X_train, y_train = load_data("data/"+trainset_name)
    X_test, y_test = load_data("data/"+testset_name)
    
    # set cfg
    Config.input_shape = X_train[0].shape 
    Config.noutputs = y_train.shape[1]
    #    print(Config.input_shape, Config.noutputs)
    
    if checkpoint_file is None:
        pop, log, hof = main(id)
    else:
        pop, log, hof = main(id, checkpoint_file)
    
    network = hof[0].createNetwork()
    network.summary()
    print( hof[0] )
Example #42
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding

NFOLDS = 10
BATCH = 32
EPOCHS = 5
CLASSES = 2
EMBDIMS = 100
MAXLEN = 100
MAXFEATURES = 10000

if __name__ == "__main__":

  np.random.seed(1337) 
  dataset = dataset.Dataset(MAXFEATURES)
  x, y = dataset.load_data()
  labels_one_hot = k.utils.np_utils.to_categorical(np.array(y), CLASSES)  

  x = sequence.pad_sequences(x, maxlen=MAXLEN)

  scores = []
  folds = sk.cross_validation.KFold(len(y), n_folds=NFOLDS)

  for train_indices, test_indices in folds:
    print('starting new fold...')
    train_x = x[train_indices]
    train_y = labels_one_hot[train_indices]
    test_x = x[test_indices]
    test_y = labels_one_hot[test_indices]
    
    model = k.models.Sequential()