def getAccSentiment(model, words, f):
    with open(f, 'r') as fin:
        lines = fin.readlines()
    preds = []
    golds = []
    seq1 = []
    ct = 0
    for i in lines:
        i = i.split("\t")
        p1 = i[0]
        score = i[1]
        X1 = getSeq(p1, words)
        seq1.append(X1)
        ct += 1
        if ct % 100 == 0:
            x1, m1 = utils.prepare_data(seq1)
            scores = model.scoring_function(x1, m1)
            scores = np.squeeze(scores)
            preds.extend(scores.tolist())
            seq1 = []
        golds.append(score)
    if len(seq1) > 0:
        x1, m1 = utils.prepare_data(seq1)
        scores = model.scoring_function(x1, m1)
        scores = np.squeeze(scores)
        preds.extend(scores.tolist())
    return accSentiment(preds, golds)
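Nearly all of the snippets in this listing call a project-specific utils.prepare_data that pads a batch of variable-length index sequences into a matrix and returns it together with a matching 0/1 mask (some variants return extra values such as lengths). The original implementations are not shown here; a minimal, self-contained sketch of that padding-plus-mask pattern (an assumption for illustration, not the original code) is:

import numpy as np

def prepare_data_sketch(seqs):
    # Pad variable-length integer sequences into an (n_samples, max_len)
    # matrix and build a float mask marking the real (non-padded) positions.
    lengths = [len(s) for s in seqs]
    max_len = max(lengths)
    x = np.zeros((len(seqs), max_len), dtype=np.int64)
    mask = np.zeros((len(seqs), max_len), dtype=np.float32)
    for i, s in enumerate(seqs):
        x[i, :lengths[i]] = s
        mask[i, :lengths[i]] = 1.0
    return x, mask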
Example #2
def _standard_vs_dense(train_X, test_X, standard_params=None, dense_params=None, gt_AB=None):
    
    t = Timer()
    
    train_X, train_lengths, train_unique = prepare_data(train_X)
    test_X, test_lengths, test_unique = prepare_data(test_X)
    
    standard_hmms = []
    if standard_params is None:
        standard_hmms.append(StandardHMM())
    elif isinstance(standard_params, (list, tuple)):
        for params in standard_params:
            standard_hmms.append(StandardHMM(**params))
    else:
        standard_params = dict(standard_params)
        standard_hmms.append(StandardHMM(**standard_params))
    
    dense_params = {} if dense_params is None else dict(dense_params)
    dense_hmm = DenseHMM(**dense_params)
    
    opt_schemes = dict_get(dense_params, 'opt_schemes', default=('em', 'cooc'))
    if 'em' in opt_schemes:
        t.tic("Fitting dense HMM in mode 'em' ...")
        dense_hmm.fit(train_X, train_lengths, test_X, test_lengths)
        t.toc("Fitting finished.")
    if 'cooc' in opt_schemes:
        t.tic("Fitting dense HMM in mode 'cooc' ...")
        dense_hmm.fit_coocs(train_X, train_lengths, test_X, test_lengths, gt_AB)
        t.toc("Fitting finished.")
        
    for i, standard_hmm in enumerate(standard_hmms):
        t.tic("Fitting standard hmm %d/%d" % (i+1, len(standard_hmms)))
        standard_hmm.fit(train_X, train_lengths, test_X, test_lengths)
        t.toc("Fitting finished.")
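Example #2 above also relies on two small project helpers that are not shown: a Timer with tic/toc logging and a dict_get lookup with a default. As a rough sketch of the assumed dict_get behaviour (an illustration, not the original code):

def dict_get_sketch(d, key, default=None):
    # Assumed behaviour: plain dictionary lookup that falls back to a default,
    # mirroring dict.get but tolerating a None dictionary.
    if d is None:
        return default
    return d.get(key, default)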
Example #3
def getpairs(model, batch, params):
    g1 = []
    g2 = []

    for i in batch:
        g1.append(i[0].embeddings)
        g2.append(i[1].embeddings)

    g1x, g1mask = utils.prepare_data(g1)
    g2x, g2mask = utils.prepare_data(g2)

    embg1 = model.feedforward_function(g1x, g1mask)
    embg2 = model.feedforward_function(g2x, g2mask)

    for idx, i in enumerate(batch):
        i[0].representation = embg1[idx, :]
        i[1].representation = embg2[idx, :]

    pairs = getPairsFast(batch, params.type)
    p1 = []
    p2 = []
    for i in pairs:
        p1.append(i[0].embeddings)
        p2.append(i[1].embeddings)

    p1x, p1mask = utils.prepare_data(p1)
    p2x, p2mask = utils.prepare_data(p2)

    return (g1x, g1mask, g2x, g2mask, p1x, p1mask, p2x, p2mask)
Example #4
def main():
    isconcat = True
    #isconcat =False
    #modelname = 'lambdanet-b512-model.h5'
    modelname = 'lambdanet-b512-l01-model.h5'
    #modelname = 'lambdanet-b512-windows-model.h5'

    #predfilename='lamdbanet-b512-pred.dat' #acc1:76 lambdanet max pooling result
    #predfilename = 'lambdanet-b512-pred.dat'  # acc1:60 lambdanet avg pooling result
    #predfilename = 'rank-pred.dat'  # acc1:65 svm result
    predfilename = 'lambdanet-b512-l01-pred.dat'  #acc1:74 lambdanet max pooling result
    #predfilename = 'lambdanet-b512-windows-pred.dat'

    stage = 3

    if stage <= 0:
        utils.prepare_data(vocab_size)
    if stage <= 1:
        utils.split_data(n=5)
        #utils.split_data(n=10)
    if stage <= 2:
        #train(n=5,isconcat=isconcat,modelname=modelname)
        train_lambda(n=5, isconcat=isconcat, modelname=modelname)
        #train_lambda(n=10, isconcat=isconcat, modelname=modelname)
    if stage <= 3:
        predict(n=5,
                isconcat=isconcat,
                modelname=modelname,
                predfilename=predfilename)
    if stage <= 4:
        utils.calc_metric(n=5, predfilename=predfilename)
        utils.calc_metric_method(n=5, predfilename=predfilename)
Example #5
    def build_model(self,):
        utils.prepare_data(data_file=self.data_file)
        self.lap_list, self.feature = utils.load_gcn_data(self.graph_file, self.num_support)
        self.num_feature = self.feature.shape[1]
        self.x = tf.placeholder(tf.float32, [None, self.d_input_step, self.d_input_size])
        self.z = tf.placeholder(tf.float32, [None, self.g_input_step, self.g_input_size])
        self.z_t = tf.placeholder(tf.float32, [None, self.g_input_step, self.g_input_size])
        self.lap = tf.placeholder(tf.float32, [self.num_support, self.d_input_size, self.d_input_size])
        self.fea = tf.placeholder(tf.float32, [self.d_input_size, self.num_feature])

        self.x_ = self.generator(self.z, self.g_input_step, self.g_input_size, self.g_hidden_size, self.g_batch_size)
        self.D = self.discriminator(self.x, self.d_input_step, self.d_input_size, self.d_hidden_size, 1, self.g_batch_size)
        self.D_ = self.discriminator(self.x_, self.d_input_step, self.d_input_size, self.d_hidden_size, 1, self.g_batch_size, reuse=True)

        if self.wgan == 1:
            self.d_loss_real = tf.reduce_mean(self.D)
            self.d_loss_fake = tf.reduce_mean(self.D_)
            self.g_loss = self.d_loss_fake
            self.d_loss = self.d_loss_real - self.d_loss_fake

        else:
            self.d_loss_real = utils.compute_loss(self.D, tf.ones_like(self.D))
            self.d_loss_fake = utils.compute_loss(self.D_, tf.zeros_like(self.D_))
            self.g_loss = utils.compute_loss(self.D_, tf.ones_like(self.D_))
            self.d_loss = self.d_loss_real + self.d_loss_fake

        self.accuracy = utils.compute_accuracy(self.z_t, self.z_)
Example #6
def main(args):

    # train mode
    if args.mode == "train":
        # prepare_data to get word_dict, train_reader
        word_dict, train_reader = utils.prepare_data(args.train_data_path,
                                                     args.word_dict_path,
                                                     args.batch_size,
                                                     args.mode)

        train_net(train_reader, word_dict, args.model_type, args.use_gpu,
                  args.is_parallel, args.model_path, args.lr, args.batch_size,
                  args.num_passes)

    # eval mode
    elif args.mode == "eval":
        # prepare_data to get word_dict, test_reader
        word_dict, test_reader = utils.prepare_data(args.test_data_path,
                                                    args.word_dict_path,
                                                    args.batch_size, args.mode)
        eval_net(test_reader, args.use_gpu, args.model_path)

    # infer mode
    elif args.mode == "infer":
        # prepare_data to get word_dict, test_reader
        word_dict, test_reader = utils.prepare_data(args.test_data_path,
                                                    args.word_dict_path,
                                                    args.batch_size, args.mode)
        # NOTE: assumes infer_net returns the list of per-example scores.
        score_li = infer_net(test_reader, args.use_gpu, args.model_path)
        id_list = [i for i in range(len(score_li))]
        df = pd.DataFrame({'input_id': id_list, 'senta_score': score_li})
        pickle.dump(file=open('./TokSentLeo/Senta/SentaDL_output.pkl', 'wb'),
                    obj=df)
        print("data saved.")
        return df
Example #8
def prepare_loss_data(args, dataset):
    file_delta_p = os.path.join(args.path_temp, 'delta_p.p')
    if os.path.isfile(file_delta_p):
        mondict = dataset.load(file_delta_p)
        dataset.list_rpe = mondict['list_rpe']
        dataset.list_rpe_validation = mondict['list_rpe_validation']
        return

    # prepare delta_p_gt
    list_rpe = {}
    for dataset_name, Ns in dataset.datasets_train_filter.items():
        t, ang_gt, p_gt, v_gt, u = prepare_data(args, dataset, dataset_name, 0)
        p_gt = p_gt.double()
        Rot_gt = TORCHIEKF.from_rpy(ang_gt[:Ns[1]]).double()
        list_rpe[dataset_name] = compute_delta_p(Rot_gt[:Ns[1]], p_gt[:Ns[1]])

    list_rpe_validation = {}
    for dataset_name, Ns in dataset.datasets_validatation_filter.items():
        t, ang_gt, p_gt, v_gt, u = prepare_data(args, dataset, dataset_name, 0)
        p_gt = p_gt.double()
        Rot_gt = TORCHIEKF.from_rpy(ang_gt[:Ns[1]]).double()
        list_rpe_validation[dataset_name] = compute_delta_p(
            Rot_gt[:Ns[1]], p_gt[:Ns[1]])
    dataset.list_rpe = list_rpe
    dataset.list_rpe_validation = list_rpe_validation
    mondict = {
        'list_rpe': list_rpe,
        'list_rpe_validation': list_rpe_validation,
    }
    dataset.dump(mondict, file_delta_p)
Example #9
def main(args):

    # train mode
    if args.mode == "train":
        # prepare_data to get word_dict, train_reader
        word_dict, train_reader = utils.prepare_data(args.train_data_path,
                                                     args.word_dict_path,
                                                     args.batch_size,
                                                     args.mode)

        train_net(train_reader, word_dict, args.model_type, args.use_gpu,
                  args.is_parallel, args.model_path, args.lr, args.batch_size,
                  args.num_passes)

    # eval mode
    elif args.mode == "eval":
        # prepare_data to get word_dict, test_reader
        word_dict, test_reader = utils.prepare_data(args.test_data_path,
                                                    args.word_dict_path,
                                                    args.batch_size, args.mode)
        eval_net(test_reader, args.use_gpu, args.model_path)

    # infer mode
    elif args.mode == "infer":
        # prepare_data to get word_dict, test_reader
        word_dict, test_reader = utils.prepare_data(args.test_data_path,
                                                    args.word_dict_path,
                                                    args.batch_size, args.mode)
        infer_net(test_reader, args.use_gpu, args.model_path)
Example #10
 def PredictProbaFn(X):
     preds = []
     seq1 = []
     ct = 0
     for i in X:
         p1 = i.strip()
         p1 = ' '.join(re.split(r'(\W+)', p1))
         X1 = evaluate.getSeq(p1,words)
         seq1.append(X1)
         ct += 1
         if ct % 100 == 0:
             x1,m1 = utils.prepare_data(seq1)
             scores = model.predict_proba(x1,m1)
             if scores.shape[0] > 1:
                 scores = np.squeeze(scores)
             preds.extend(scores.tolist())
             seq1 = []
     if len(seq1) > 0:
         x1,m1 = utils.prepare_data(seq1)
         scores = model.predict_proba(x1,m1)
         if scores.shape[0] > 1:
             scores = np.squeeze(scores)
         preds.extend(scores.tolist())
     preds = np.array(preds).reshape(-1, 1)
     return np.hstack((1 - preds, preds))
Example #11
def getAcc(model, words, f):
    with open(f, 'r') as fin:
        lines = fin.readlines()
    preds = []
    golds = []
    seq1 = []
    seq2 = []
    ct = 0
    for i in lines:
        i = i.split("\t")
        p1 = i[0]
        p2 = i[1]
        score = i[2]
        X1, X2 = getSeqs(p1, p2, words)
        seq1.append(X1)
        seq2.append(X2)
        ct += 1
        if ct % 100 == 0:
            x1, m1 = utils.prepare_data(seq1)
            x2, m2 = utils.prepare_data(seq2)
            scores = model.scoring_function(x1, x2, m1, m2)
            scores = np.squeeze(scores)
            preds.extend(scores.tolist())
            seq1 = []
            seq2 = []
        golds.append(score)
    if len(seq1) > 0:
        x1, m1 = utils.prepare_data(seq1)
        x2, m2 = utils.prepare_data(seq2)
        scores = model.scoring_function(x1, x2, m1, m2)
        scores = np.squeeze(scores)
        preds.extend(scores.tolist())
    return acc(preds, golds)
Example #12
def getAcc(model,words,f):
    with open(f, 'r') as fin:
        lines = fin.readlines()
    preds = []
    golds = []
    seq1 = []
    seq2 = []
    ct = 0
    for i in lines:
        i = i.split("\t")
        p1 = i[0]; p2 = i[1]; score = i[2]
        X1, X2 = getSeqs(p1,p2,words)
        seq1.append(X1)
        seq2.append(X2)
        ct += 1
        if ct % 100 == 0:
            x1,m1 = utils.prepare_data(seq1)
            x2,m2 = utils.prepare_data(seq2)
            scores = model.scoring_function(x1,x2,m1,m2)
            scores = np.squeeze(scores)
            preds.extend(scores.tolist())
            seq1 = []
            seq2 = []
        golds.append(score)
    if len(seq1) > 0:
        x1,m1 = utils.prepare_data(seq1)
        x2,m2 = utils.prepare_data(seq2)
        scores = model.scoring_function(x1,x2,m1,m2)
        scores = np.squeeze(scores)
        preds.extend(scores.tolist())
    return acc(preds,golds)
Example #13
def main(args):
    transform = tv.transforms.Compose([
        tv.transforms.Grayscale(num_output_channels=1),
        tv.transforms.ToTensor()
    ])
    trainset = tv.datasets.MNIST(root='~/torch/data/MNIST',
                                 train=True,
                                 download=True,
                                 transform=transform)
    prior = utils.StandardLogistic()
    device = torch.device("cuda:0")
    flow = nice.NICE(prior=utils.StandardLogistic(),
                     coupling=4,
                     in_out_dim=kMNISTInputDim,
                     mid_dim=1000,
                     hidden=5,
                     mask_config=1).to(device)

    # mask, x = PrepareMNISTData(trainset)
    # ShowImagesInGrid(x, 10, 10, save_path="original.png")
    scaling_weights = torch.load(
        args.model_path)['model_state_dict']['scaling.scale']
    flow.load_state_dict(torch.load(args.model_path)['model_state_dict'])

    # Sort the scales
    mask = GetScaleMask(scaling_weights, ktop=args.ktop, reverse=True)
    samples = flow.sample(args.sample_size, mask=mask.cuda()).cpu()
    mean = torch.load('./statistics/mnist_mean.pt')
    result = utils.prepare_data(samples,
                                'mnist',
                                zca=None,
                                mean=mean,
                                reverse=True)
    tv.utils.save_image(
        tv.utils.make_grid(result),
        './samples_masked/result_true_' + str(args.ktop) + '.png')

    mask = GetScaleMask(scaling_weights, ktop=args.ktop, reverse=False)
    samples = flow.sample(args.sample_size).cpu()
    mean = torch.load('./statistics/mnist_mean.pt')
    result = utils.prepare_data(samples,
                                'mnist',
                                zca=None,
                                mean=mean,
                                reverse=True)
    tv.utils.save_image(
        tv.utils.make_grid(result),
        './samples_masked/result_false_' + str(args.ktop) + '.png')

    samples = flow.sample(args.sample_size).cpu()
    mean = torch.load('./statistics/mnist_mean.pt')
    result = utils.prepare_data(samples,
                                'mnist',
                                zca=None,
                                mean=mean,
                                reverse=True)
    tv.utils.save_image(
        tv.utils.make_grid(result),
        './samples_masked/result_original_' + str(args.ktop) + '.png')
Example #14
    def test_scaleX(self):
        c = get_preproc_config(use_exog=True)
        d = prepare_data(c)
        d = prepare_data(c, dim="3d")

        # train
        self.assertAlmostEqual(0.0, d.trainX[0][0][0], 2)
        self.assertAlmostEqual(1.0, d.trainX[-1][-1][-1], 2)
Example #15
def main():
    train_data = utils.get_data()
    test_data = utils.get_data(test=True, sample=False)
    y = train_data['radiant_win']
    X_train = train_data.loc[:, train_data.columns != 'radiant_win']
    X_train = utils.replace_with_bag_of_words(utils.prepare_data(X_train))
    X_test = utils.replace_with_bag_of_words(utils.prepare_data(test_data))
    make_coursera_testing(X_train, y)
Example #16
def main(state, freq):
    """Where the magic happens"""
    print(state, freq)
    if FULL_TRIAL:
        labels = np.concatenate((np.ones(18), np.zeros(18)))
        groups = range(36)
    elif SUBSAMPLE:
        info_data = pd.read_csv(SAVE_PATH.parent / "info_data.csv")[STATE_LIST]
        n_trials = info_data.min().min()
        n_subs = len(info_data) - 1
        groups = [i for i in range(n_subs) for _ in range(n_trials)]
        n_total = n_trials * n_subs
        labels = [0 if i < n_total / 2 else 1 for i in range(n_total)]
    else:
        labels = loadmat(LABEL_PATH / state + "_labels.mat")["y"].ravel()
        labels, groups = create_groups(labels)

    file_path = (SAVE_PATH / "results" / PREFIX + NAME +
                 "_{}_{}_{}_{:.2f}.mat".format(state, freq, WINDOW, OVERLAP))

    if not file_path.isfile():
        file_name = NAME + "_{}_{}_{}_{:.2f}.mat".format(
            state, freq, WINDOW, OVERLAP)
        data_file_path = SAVE_PATH / file_name

        if data_file_path.isfile():
            final_save = {}

            random_seed = 0
            data = loadmat(data_file_path)
            if FULL_TRIAL:
                data = data["data"]
            elif SUBSAMPLE:
                data = prepare_data(data,
                                    n_trials=n_trials,
                                    random_state=random_seed)
            else:
                data = prepare_data(data)

            sl2go = StratifiedLeave2GroupsOut()
            lda = LDA()
            clf = TSclassifier(clf=lda)
            best_combin, best_score = backward_selection(
                clf, data, labels, sl2go, groups)

            final_save = {
                "best_combin_index": best_combin,
                "best_combin": CHANNEL_NAMES[best_combin],
                "score": best_score,
            }
            savemat(file_path, final_save)

            print(
                f"Best combin: {CHANNEL_NAMES[best_combin]}, score: {best_score}"
            )

        else:
            print(data_file_path.NAME + " Not found")
Example #17
def make_kaggle_prediction(X_test, X_train, y):
    X_train_ = utils.replace_with_bag_of_words(utils.prepare_data(X_train))
    X_test_ = utils.replace_with_bag_of_words(utils.prepare_data(X_test))
    clf = LogisticRegression(C=0.01, random_state=241)
    clf.fit(X_train_, y)
    pred = clf.predict_proba(X_test_)[:, 1]
    result = pandas.DataFrame({'radiant_win': pred}, index=X_test_.index)
    result.to_csv('result.csv')
Example #18
def ontology_alignment(model, ontoTerms_a, ontoTerms_b, words, ceil = 0.5):

    with open(ontoTerms_a) as f:
        ontoText_a = f.readlines()
    with open(ontoTerms_b) as f:
        ontoText_b = f.readlines()
    # Remove whitespace characters like `\n` at the end of each line.
    ontoText_a = [x.strip() for x in ontoText_a] 
    ontoText_b = [x.strip() for x in ontoText_b]

    whole = []
    for text_a in ontoText_a:
        for text_b in ontoText_b:
            txt_a = re.sub(' +',' ',text_a)
            txt_b = re.sub(' +',' ',text_b)
            if txt_a == txt_b:
                whole.append([text_a, text_b, 0.0])
                try:
                    ontoText_a.remove(text_a)
                except ValueError:
                    pass
                    #print(text_a)
                try:
                    ontoText_b.remove(text_b)
                except ValueError:
                    pass
                    #print(text_b)
    # Transform to Word & Mask vectors to apply "feedforward_function"
    ontoData_a, ontoData_b = [], []
    for sentence in ontoText_a:
        ontoData_a.append(getSeq(sentence, words))
    for sentence in ontoText_b:
        ontoData_b.append(getSeq(sentence, words))
    x1,m1 = utils.prepare_data(ontoData_a)
    x2,m2 = utils.prepare_data(ontoData_b)
    OntoEmbg_a = model.feedforward_function(x1,m1)
    OntoEmbg_b = model.feedforward_function(x2,m2)
    # Compute the Cosine Distances:
    dist = cosine_distances(OntoEmbg_a,OntoEmbg_b)
    disT = np.transpose(dist)

    
    males    = preferances(dist)
    females  = preferances(disT)
    del(disT)
    match = Matcher(males, females)
    marriage = match()
    del(males); del(females)

    for key, value in marriage.items():
        man         = ontoText_a[value]
        woman       = ontoText_b[key]
        value       = dist[value][key]
        if value < ceil:
            whole.append([man, woman, value])
    return whole
Example #19
def train(preproc_dir, n_classes, max_length, hidden_units, dropout,
          batch_size, epochs, output_dir):
    """
    Train the ESIM model on some dataset and save the learned weights.

    Args:
        preproc_dir: The directory where the preprocessed data is saved.
        n_classes: The number of classes in the problem.
        max_length: The maximum length of the sentences in the premises and
                    hypotheses of the dataset.
        hidden_units: The number of hidden units to use in the various layers
                      of the model.
        dropout: The dropout rate to use in the model.
        batch_size: The size of the batches to use for training.
        epochs: The number of epochs to apply during training.
        output_dir: The path to the directory where the weights learned during
                    training must be saved.
    """
    print("Loading training and validation data...")
    train_premises, train_hyps, train_labels = prepare_data(
        preproc_dir, 'train', n_classes, max_length)
    valid_premises, valid_hyps, valid_labels = prepare_data(
        preproc_dir, 'dev', n_classes, max_length)
    # train_premises looks like this (zero-padded word-index sequences):
    # [[5, 6, 7, 8, 9, 3, 10, 11, 12, 13, 14, 2, 15, 16, 3,0,0,0,0],
    #  [17, 18, 19, 20, 21, 22, 4, 23, 2, 24,0,0,0,0,0,0,0,0,0],
    #  [25, 26, 27,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]]

    print("Loading embedding weights...")
    embedding_weights = load_embeddings(
        os.path.join(preproc_dir, "embedding_weights.pkl"))

    # Build the model.
    esim = ESIM(n_classes, embedding_weights, max_length, hidden_units,
                dropout)
    model = esim.build_model()

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    filepath = os.path.join(output_dir,
                            "weights-{epoch:02d}-{val_acc:.2f}.hdf5")
    checkpoint = ModelCheckpoint(filepath,
                                 monitor='val_acc',
                                 verbose=1,
                                 save_best_only=True,
                                 mode='max')

    model.fit(x=[train_premises, train_hyps],
              y=train_labels,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=([valid_premises, valid_hyps], valid_labels),
              callbacks=[checkpoint],
              shuffle=True)
Example #20
 def test_prepare_data_output_fon_nan(self):
     data_nan = [[1523889000000, np.nan], [1523889000001, np.nan],
                 [1523889000002, np.nan]]
     data_none = [[1523889000000, None], [1523889000001, None],
                  [1523889000002, None]]
     return_data_nan = prepare_data(data_nan)
     return_data_none = prepare_data(data_none)
     for item in return_data_nan.value:
         self.assertTrue(np.isnan(item))
     for item in return_data_none.value:
         self.assertTrue(np.isnan(item))
Example #21
 def test_scaleYref(self):
     # assert original Y's are not changed after scaling
     c = get_preproc_config(use_exog=True, horizon=1)
     d = prepare_data(c)
     d = prepare_data(c, dim="3d")
     self.assertEqual(52.0, d.trainYref[0])
     self.assertEqual(242.0, d.trainYref[-1])
     self.assertEqual(292.0, d.valYref[0])
     self.assertEqual(322.0, d.valYref[-1])
     self.assertEqual(372.0, d.testYref[0])
     self.assertEqual(402.0, d.testYref[-1])
Example #22
 def test_sets(self):
     c = get_preproc_config(lags=3)
     d_ml = prepare_data(c)
     d_lstm = prepare_data(c, dim="3d")
     # same shapes
     self.assertEqual(d_ml.trainY.shape[0], d_lstm.trainY.shape[0])
     self.assertEqual(d_ml.valY.shape[0], d_lstm.valY.shape[0])
     self.assertEqual(d_ml.testY.shape[0], d_lstm.testY.shape[0])
     # same values
     self.assertEqual(d_ml.trainY[0], d_lstm.trainY[0])
     self.assertEqual(d_ml.valY[0], d_lstm.valY[0])
     self.assertEqual(d_ml.testY[0], d_lstm.testY[0])
Example #23
def net_prediction(img, comask, fomask, model):
    utils.prepare_data(img, comask, fomask)
    normalize = transforms.Normalize(mean=saliency.mean, std=saliency.std)
    transform = transforms.Compose([
        transforms.ToTensor(),
        normalize,
    ])
    img, comask, cont, fomask = utils.test_data_loader(transform)
    inputs = torch.cat((img, comask, cont, fomask), 0)
    inputs = torch.unsqueeze(inputs, dim=0)
    inputs = Variable(inputs.cuda(), volatile=True)
    mask = model(inputs)
    return mask
Example #24
def week_2_task_3():
    train_y, train_X = prepare_data('perceptron-train.csv')
    test_y, test_X = prepare_data('perceptron-test.csv')
    before_scale = count_accuracy(train_y, train_X, test_y, test_X)
    print 'Before scaling: ', '%2.3f' % before_scale

    scaler = StandardScaler()
    scaled_train_X = scaler.fit_transform(train_X)
    scaled_test_X = scaler.transform(test_X)
    after_scale = count_accuracy(train_y, scaled_train_X, test_y,
                                 scaled_test_X)
    print 'After scaling: ', '%2.3f' % after_scale

    print 'Difference: ', '%2.3f' % (after_scale - before_scale)
Example #25
def main():
    args = utils.parse_args_new()
    #  Data Parameters
    origin_data_file = args.data_file
    graph_file = args.graph_file
    generated_num = args.num_train_sample
    generated_num_test = args.num_test_sample
    seq_length = args.seq_length
    vocab_size = args.num_node
    batch_size = args.batch_size
    num_epochs = args.num_epochs

    #  Generator  Hyper-parameters
    g_emb_dim = args.g_dim_emb
    g_hidden_size = args.g_hidden_size
    train_percent = args.train_percent
    g_num_expend = args.g_num_expend

    input_length = int(seq_length * train_percent)
    train_batch = int(generated_num / batch_size)
    test_batch = int(generated_num_test / batch_size)

    # Model
    START_TOKEN = 0

    utils.prepare_data(origin_data_file)
    graph = nx.read_edgelist(graph_file,
                             nodetype=int,
                             create_using=nx.DiGraph())
    adjacency_matrix = np.asarray(
        nx.adjacency_matrix(graph).todense()).transpose()

    generator = Generator(vocab_size, batch_size, g_emb_dim, g_hidden_size,
                          seq_length, START_TOKEN, input_length)

    init = tf.global_variables_initializer()
    sess = tf.InteractiveSession()
    sess.run(init)

    print 'Start training...'
    for epoch in range(num_epochs):
        for it in range(train_batch):
            batch = utils.train_next_batch(generator.batch_size, hard=True)
            generator.update_step(sess, batch)
        if epoch % 5 == 0:
            accuracy, test_loss, p_n, n_n = gan_seq_tree.test_accuracy_epoch(
                sess, generator, generator.batch_size, test_batch,
                generator.input_length, adjacency_matrix, g_num_expend)
            print 'training epoch:%d loss:%.5f jaccard:%.5f p@n:%.5f, n@n:%.5f' % (
                epoch, test_loss, accuracy, p_n, n_n)
Example #26
def test(flow, testloader, epoch, filename, device, dataset):
    flow.eval()  # set to inference mode
    running_loss = 0
    with torch.no_grad():
        samples = flow.sample(100).cpu()
        samples = utils.prepare_data(samples, dataset, reverse=True)
        torchvision.utils.save_image(torchvision.utils.make_grid(samples),
                                     './samples/' + filename + '_epoch%d.png' % epoch)
        for n_batches, data in enumerate(testloader,1):
            inputs, _ = data
            inputs = utils.prepare_data(
                inputs, dataset).to(device)
            loss = -flow(inputs).mean()
            running_loss += float(loss)
    return running_loss / n_batches
Example #27
def gen_examples(x1, x2, l, y, batch_size):
    """
        Divide examples into batches of size `batch_size`.
    """
    minibatches = utils.get_minibatches(len(x1), batch_size)
    all_ex = []
    for minibatch in minibatches:
        mb_x1 = [x1[t] for t in minibatch]
        mb_x2 = [x2[t] for t in minibatch]
        mb_l = l[minibatch]
        mb_y = [y[t] for t in minibatch]
        mb_x1, mb_mask1 = utils.prepare_data(mb_x1)
        mb_x2, mb_mask2 = utils.prepare_data(mb_x2)
        all_ex.append((mb_x1, mb_mask1, mb_x2, mb_mask2, mb_l, mb_y))
    return all_ex
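gen_examples assumes a utils.get_minibatches(n, batch_size) helper that splits the indices 0..n-1 into arrays of at most batch_size elements, which are later used for NumPy fancy indexing (e.g. l[minibatch]). A self-contained sketch of that assumed behaviour (not the original implementation):

import numpy as np

def get_minibatches_sketch(n, batch_size, shuffle=False):
    # Return a list of index arrays covering 0..n-1 in chunks of batch_size.
    idx = np.arange(n)
    if shuffle:
        np.random.shuffle(idx)
    return [idx[i:i + batch_size] for i in range(0, n, batch_size)]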
Example #29
def main():
    path_to_train_x = 'dataset/train-images.idx3-ubyte'
    path_to_train_y = 'dataset/train-labels.idx1-ubyte'
    path_to_save = 'result'

    parser = argparse.ArgumentParser(description='mnist train')
    parser.add_argument('--x_train_dir=',
                        dest='x_train_dir',
                        default=path_to_train_x,
                        type=str)
    parser.add_argument('--y_train_dir=',
                        dest='y_train_dir',
                        default=path_to_train_y,
                        type=str)
    parser.add_argument('--model_output_dir=',
                        dest='model_output_dir',
                        default=path_to_save,
                        type=str)
    parser.add_argument('--mini_batch_num=',
                        dest='mini_batch_num',
                        type=int,
                        default=1000,
                        help='Defaults to 1000 epochs')
    args = parser.parse_args()

    X, y = prepare_data(args.x_train_dir, args.y_train_dir)
    weights = train(X, y, args.mini_batch_num, 0.5)
    np.save(args.model_output_dir, weights)

    p = softmax(weights, X)
    y_pred = np.argmax(p, axis=1)
    y_pred_matrix = y_to_matrix(y_pred, 10)
    print(classification_report(y, y_pred_matrix))
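The prepare_data, train, softmax and y_to_matrix functions used in this example come from the surrounding project and are not reproduced here. For illustration, the one-hot conversion that y_to_matrix is assumed to perform can be sketched as:

import numpy as np

def y_to_matrix_sketch(y, n_classes):
    # Turn an array of integer class labels into a one-hot (n, n_classes) matrix.
    y = np.asarray(y, dtype=np.int64)
    out = np.zeros((len(y), n_classes), dtype=np.float32)
    out[np.arange(len(y)), y] = 1.0
    return out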
Example #30
 def test_prepare_data_for_nan(self):
     data = [[1523889000000, np.nan], [1523889000001, np.nan],
             [1523889000002, np.nan]]
     try:
         data = prepare_data(data)
     except ValueError:
         self.fail('prepare_data raised ValueError unexpectedly')
Example #31
def final_training(model, folder_out):
    """
    Function to train the winner model 

    Args:
        model: the winner model of the experiment
        folder_out: folder to save the model
    Returns:
    """
    print('final training')

    # Load training data without validation part before final training
    trainloader_final, _, testloader_final = utils.prepare_data(valid_frac=0.0)
    
    # Change lr for the final training 
    model['pytorch_model'].optimizer.param_groups[0]['initial_lr'] = lr_final
    model['pytorch_model'].optimizer.param_groups[0]['lr'] = lr_final

    # Train the winner model
    model['pytorch_model'].fit(trainloader_final, train_type = 'winner', epochs=epoch_final)

    # Evaluate the performance
    performance = model['pytorch_model'].evaluate(testloader_final)
    final_num_params = sum(p.numel() for p in model['pytorch_model'].parameters() if p.requires_grad)

    # Save
    with open(folder_out + "performance.txt", "a+") as f_out:
        f_out.write('final perf ' + str(performance) + ' final number of params ' + str(final_num_params))

    torch.save(model['pytorch_model'].state_dict(), folder_out + 'best_model')
    descriptor_file = open(folder_out + 'best_model_descriptor.txt', 'w')
    for layer in model['model_descriptor']['layers']:
        layer_str = str(layer)
        descriptor_file.write(layer_str + "\n")
    descriptor_file.close()
Example #32
def main():
    """
        Main function
    """
    # Prepare the datasets
    train_data, test_data = utils.prepare_data()

    # Inspect the datasets
    utils.inspect_dataset(train_data, test_data)

    # Feature engineering:
    # build the training and test feature matrices
    X_train, X_test = utils.do_feature_engineering(train_data, test_data)

    print('There are {} features in total.'.format(X_train.shape[1]))

    # Process the labels
    y_train = train_data['label'].values
    y_test = test_data['label'].values

    # Modeling and validation
    print('\n===================== Modeling and validation =====================')
    nb_model = GaussianNB()
    nb_model.fit(X_train, y_train)
    y_pred = nb_model.predict(X_test)

    print('Accuracy:', accuracy_score(y_test, y_pred))
    print('AUC:', roc_auc_score(y_test, y_pred))
Example #33
    def test_scaleX(self):
        c = get_preproc_config(use_exog=True)
        d = prepare_data(c)

        # train
        self.assertEqual(0, int(d.trainX[0][0]))
        self.assertEqual(1, int(d.trainX[-1][1]))
Example #35
  def evaluate(self, data, ref_alignments, batch_size=4, training=False):
    """Evaluate the model on a data set."""

    ref_align = read_naacl_alignments(ref_alignments)

    ref_iterator = iter(ref_align)
    metric = AERSufficientStatistics()
    accuracy_correct = 0
    accuracy_total = 0
    loss_total = 0
    steps = 0.

    for batch_id, batch in enumerate(iterate_minibatches(data, batch_size=batch_size)):
      x, y = prepare_data(batch, self.x_vocabulary, self.y_vocabulary)
      y_len = np.sum(np.sign(y), axis=1, dtype="int64")

      align, prob, acc_correct, acc_total, loss = self.get_viterbi(x, y, training)
      accuracy_correct += acc_correct
      accuracy_total += acc_total
      loss_total += loss
      steps += 1

      for alignment, N, (sure, probable) in zip(align, y_len, ref_iterator):
        # the evaluation ignores NULL links, so we discard them
        # j is 1-based in the naacl format
        pred = set((aj, j) for j, aj in enumerate(alignment[:N], 1) if aj > 0)
        metric.update(sure=sure, probable=probable, predicted=pred)
        # print(batch[s])
        # print(alignment[:N])
        # print(pred)
        #       s +=1

    accuracy = accuracy_correct / float(accuracy_total)
    return metric.aer(), accuracy, loss_total/float(steps)
Example #36
def getCorrelation(model,words,f):
    with open(f, 'r') as fin:
        lines = fin.readlines()
    preds = []
    golds = []
    seq1 = []
    seq2 = []
    for i in lines:
        i = i.split("\t")
        p1 = i[0]; p2 = i[1]; score = float(i[2])
        X1, X2 = getSeqs(p1,p2,words)
        seq1.append(X1)
        seq2.append(X2)
        golds.append(score)
    x1,m1 = utils.prepare_data(seq1)
    x2,m2 = utils.prepare_data(seq2)
    scores = model.scoring_function(x1,x2,m1,m2)
    preds = np.squeeze(scores)
    return pearsonr(preds,golds)[0], spearmanr(preds,golds)[0]
Example #37
def gen_data(p, data, batch_size = 1):
    # generate data for the model
    # y in train data is a matrix (batch_size, seq_length)
    # y in test data is an array
    x = data['x'][p:p + batch_size]
    y = data['y'][p:p + batch_size]
    batch_data = {'x':x,'y':y}
    if data.has_key('t'):
        batch_data['t'] = data['t'][p:p + batch_size]

    ret = utils.prepare_data(batch_data, VOCAB_SIZE, one_hot=ONE_HOT, sigmoid_on=SIGMOID_ON)
    return ret
Example #38
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('data', type=str, help='a dataset')
    parser.add_argument('--seed', type=int, default=0, help='random seed')
    parser.add_argument('--verbose', action='store_true')
    parser.add_argument('--incomplete', dest='incomplete',
                        action='store_true', help='allow incomplete queries')
    args = parser.parse_args()

    dataset = args.data
    seed = args.seed
    incomplete = args.incomplete
    verbose = args.verbose

    if verbose:
        level = logging.INFO

        logger = logging.getLogger()
        logger.setLevel(level)
        ch = logging.StreamHandler(sys.stderr)
        ch.setLevel(level)
        formatter = logging.Formatter('%(message)s')
        ch.setFormatter(formatter)
        logger.addHandler(ch)

    np.random.seed(seed)

    _, _, X, _, _ = utils.prepare_data(dataset, onehot=False, labelEncode=False)

    cat_idx = [i for i in range(len(X.columns))
               if isinstance(X.iloc[0][i], basestring)]
    cont_idx = range(X.shape[1])
    for i in cat_idx:
        cont_idx.remove(i)
    X = X[cat_idx + cont_idx].values

    ext = AWSRegressionExtractor(dataset, X.copy(), cat_idx,
                                 incomplete=incomplete)

    try:
        X_test = X[0:500]

        if ext.binning:
            r = -decimal.Decimal(str(ext.eps)).as_tuple().exponent
            for i, t in enumerate(ext.feature_types):
                if t == "NUMERIC":
                    X_test[:, i] = np.round(X_test[:, i].astype(np.float), r)
    except ValueError:
        X_test = None

    ext.run(args.data, X_test, 500, random_seed=seed,
            alphas=[1], methods=['passive'], baseline=False)
Example #39
def pred_probs(f_log_probs, prepare_data, options, iterator, verbose=True, normalize=False):
    probs = []

    n_done = 0

    for x, y in iterator:
        n_done += len(x)

        lengths = np.array([len(s) for s in x])

        x, x_mask, y, y_mask = prepare_data(x, y)

        pprobs = f_log_probs(x, x_mask, y, y_mask)
        if normalize:
            pprobs = pprobs / lengths

        for pp in pprobs:
            probs.append(pp)

        sys.stdout.write('\rDid ' + str(n_done) + ' samples')

    print
    return np.array(probs)
Example #40
def train(dim_word=100,  # word vector dimensionality
          dim=1000,  # the number of LSTM units
          encoder='gru',
          decoder='gru_cond',
          n_words_src=30000,
          n_words=30000,
          patience=10,  # early stopping patience
          max_epochs=5000,
          finish_after=10000000,  # finish after this many updates
          dispFreq=100,
          decay_c=0.,  # L2 regularization penalty
          alpha_c=0.,  # alignment regularization
          clip_c=-1.,  # gradient clipping threshold
          lrate=1.,  # learning rate
          maxlen=100,  # maximum length of the description
          optimizer='rmsprop',
          batch_size=16,
          saveto='model.npz',
          saveFreq=1000,  # save the parameters after every saveFreq updates
          datasets=[
              '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok',
              '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok'],
          picked_train_idxes_file=r'',
          use_dropout=False,
          reload_=False,
          overwrite=False,
          preload='',
          sort_by_len=False,
          convert_embedding=True,
          dump_before_train=False,
    ):
    # Model options
    model_options = locals().copy()
    if reload_:
        lrate *= 0.5

    # load dictionaries and invert them

    # reload options
    if reload_ and os.path.exists(preload):
        print 'Reloading model options'
        with open(r'.\model\en2fr.iter160000.npz.pkl', 'rb') as f:
            model_options = pkl.load(f)

    print 'Configuration from fy'

    vocab_en_filename = './data/dic/en2fr_en_vocabs_top1M.pkl'
    vocab_fr_filename = './data/dic/en2fr_fr_vocabs_top1M.pkl'
    map_filename = './data/dic/mapFullVocab2Top1MVocab.pkl'
    lr_discount_freq = 80000

    print 'Done'

    print 'Loading data'

    text_iterator = TextIterator(
        datasets[0],
        datasets[1],
        vocab_en_filename,
        vocab_fr_filename,
        batch_size,
        maxlen,
        n_words_src,
        n_words,
    )

    # sys.stdout.flush()
    # train_data_x = pkl.load(open(datasets[0], 'rb'))
    # train_data_y = pkl.load(open(datasets[1], 'rb'))
    #
    # if len(picked_train_idxes_file) != 0:
    #     picked_idxes = pkl.load(open(picked_train_idxes_file, 'rb'))
    #     train_data_x = [train_data_x[id] for id in picked_idxes]
    #     train_data_y = [train_data_y[id] for id in picked_idxes]
    #
    # print 'Total train:', len(train_data_x)
    # print 'Max len:', max([len(x) for x in train_data_x])
    # sys.stdout.flush()
    #
    # if sort_by_len:
    #     slen = np.array([len(s) for s in train_data_x])
    #     sidx = slen.argsort()
    #
    #     _sbuf = [train_data_x[i] for i in sidx]
    #     _tbuf = [train_data_y[i] for i in sidx]
    #
    #     train_data_x = _sbuf
    #     train_data_y = _tbuf
    #     print len(train_data_x[0]), len(train_data_x[-1])
    #     sys.stdout.flush()
    #     train_batch_idx = get_minibatches_idx(len(train_data_x), batch_size, shuffle=False)
    # else:
    #     train_batch_idx = get_minibatches_idx(len(train_data_x), batch_size, shuffle=True)

    print 'Building model'
    params = init_params(model_options)
    # reload parameters
    if reload_ and os.path.exists(preload):
        print 'Reloading model parameters'
        params = load_params(preload, params)

        # for k, v in params.iteritems():
        #     print '>', k, v.shape, v.dtype

        # Only convert parameters when reloading
        if convert_embedding:
            # =================
            # Convert input and output embedding parameters using an existing word embedding
            # =================
            print 'Convert input and output embedding'

            temp_Wemb = params['Wemb']
            orig_emb_mean = np.mean(temp_Wemb, axis=0)

            params['Wemb'] = np.tile(orig_emb_mean, [params['Wemb'].shape[0], 1])

            # Load vocabulary map dicts and do mapping
            with open(map_filename, 'rb') as map_file:
                map_en = pkl.load(map_file)
                map_fr = pkl.load(map_file)

            for full, top in map_en.iteritems():
                emb_size = temp_Wemb.shape[0]
                if full < emb_size and top < emb_size:
                    params['Wemb'][top] = temp_Wemb[full]

            print 'Convert input embedding done'

            temp_ff_logit_W = params['ff_logit_W']
            temp_Wemb_dec = params['Wemb_dec']
            temp_b = params['ff_logit_b']

            orig_ff_logit_W_mean = np.mean(temp_ff_logit_W, axis=1)
            orig_Wemb_dec_mean = np.mean(temp_Wemb_dec, axis=0)
            orig_b_mean = np.mean(temp_b)

            params['ff_logit_W'] = np.tile(orig_ff_logit_W_mean, [params['ff_logit_W'].shape[1], 1]).T
            params['ff_logit_b'].fill(orig_b_mean)
            params['Wemb_dec'] = np.tile(orig_Wemb_dec_mean, [params['Wemb_dec'].shape[0], 1])

            for full, top in map_en.iteritems():
                emb_size = temp_Wemb.shape[0]
                if full < emb_size and top < emb_size:
                    params['ff_logit_W'][:, top] = temp_ff_logit_W[:, full]
                    params['ff_logit_b'][top] = temp_b[full]
                    params['Wemb_dec'][top] = temp_Wemb[full]

            print 'Convert output embedding done'

            # for k, v in params.iteritems():
            #     print '>', k, v.shape, v.dtype

            # ================
            # End Convert
            # ================

    tparams = init_tparams(params)

    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost, x_emb = \
        build_model(tparams, model_options)
    inps = [x, x_mask, y, y_mask]

    print 'Building sampler'
    f_init, f_next = build_sampler(tparams, model_options, trng, use_noise)

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=profile)
    f_x_emb = theano.function([x, x_mask], x_emb, profile=profile)
    print 'Done'
    sys.stdout.flush()
    cost = cost.mean()

    # apply L2 regularization on weights
    if decay_c > 0.:
        decay_c = theano.shared(np.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # regularize the alpha weights
    if alpha_c > 0. and not model_options['decoder'].endswith('simple'):
        alpha_c = theano.shared(np.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * (
            (tensor.cast(y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None] -
             opt_ret['dec_alphas'].sum(0)) ** 2).sum(1).mean()
        cost += alpha_reg

    # after all regularizers - compile the computational graph for cost
    print 'Building f_cost...',
    f_cost = theano.function(inps, cost, profile=profile)
    print 'Done'

    print 'Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    print 'Done'
    sys.stdout.flush()
    # apply gradient clipping here
    if clip_c > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g ** 2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(tensor.switch(g2 > (clip_c ** 2),
                                           g / tensor.sqrt(g2) * clip_c,
                                           g))
        grads = new_grads

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost)
    print 'Done'

    print 'Optimization'

    best_p = None
    bad_counter = 0
    uidx = 0
    if reload_:
        m = re.search(r'.+iter(\d+?)\.npz', preload)
        if m:
            uidx = int(m.group(1))
    print 'uidx', uidx, 'l_rate', lrate

    estop = False
    history_errs = []
    # reload history

    if dump_before_train:
        print 'Dumping before train...',
        saveto_uidx = '{}.iter{}.npz'.format(
            os.path.splitext(saveto)[0], uidx)
        np.savez(saveto_uidx, history_errs=history_errs,
                 uidx=uidx, **unzip(tparams))
        print 'Done'

    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size

    for eidx in xrange(max_epochs):
        n_samples = 0

        # for i, batch_idx in train_batch_idx:
        #
        #     x = [train_data_x[id] for id in batch_idx]
        #     y = [train_data_y[id] for id in batch_idx]

        for i, (x, y) in enumerate(text_iterator):
            n_samples += len(x)
            uidx += 1
            use_noise.set_value(1.)

            x, x_mask, y, y_mask = prepare_data(x, y)

            if x is None:
                print 'Minibatch with zero sample under length ', maxlen
                uidx -= 1
                continue

            ud_start = time.time()

            # compute cost, grads and copy grads to shared variables
            cost = f_grad_shared(x, x_mask, y, y_mask)

            # do the update on parameters
            f_update(lrate)

            ud = time.time() - ud_start

            # check for bad numbers, usually we remove non-finite elements
            # and continue training - but not done here
            if np.isnan(cost) or np.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            # discount reward
            if lr_discount_freq > 0 and np.mod(uidx, lr_discount_freq) == 0:
                lrate *= 0.5
                print 'Discount learning rate to {} at iteration {}'.format(lrate, uidx)

            # verbose
            if np.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud
                sys.stdout.flush()

            if np.mod(uidx, saveFreq) == 0:
                # save with uidx
                if not overwrite:
                    # print 'Saving the model at iteration {}...'.format(uidx),
                    saveto_uidx = '{}.iter{}.npz'.format(
                            os.path.splitext(saveto)[0], uidx)
                    np.savez(saveto_uidx, history_errs=history_errs,
                             uidx=uidx, **unzip(tparams))
                    # print 'Done'
                    # sys.stdout.flush()
            # generate some samples with the model and display them

            # finish after this many updates
            if uidx >= finish_after:
                print 'Finishing after %d iterations!' % uidx
                estop = True
                break

        print 'Seen %d samples' % n_samples

        if estop:
            break

    if best_p is not None:
        zipp(best_p, tparams)

    use_noise.set_value(0.)

    return 0.
Example #41
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('data', type=str, help='a dataset')
    parser.add_argument('hidden_nodes', type=int, help='number of hidden nodes')
    parser.add_argument('action', type=str, help='action to perform')
    parser.add_argument('budget', type=str, help='query budget')
    parser.add_argument('--num_passes', type=int, help='number of passes',
                        default=1000)
    parser.add_argument('--epsilon', type=float, help='learning rate',
                        default=0.1)
    parser.add_argument('--rounding', type=int, help='rounding digits')
    parser.add_argument('--steps', type=str, nargs='+', default=[],
                        help='adaptive active learning')
    parser.add_argument('--adaptive_oracle', dest='adaptive_oracle',
                        action='store_true',
                        help='adaptive active learning from oracle')
    parser.add_argument('--force_reg', dest='force_reg',
                        action='store_true',
                        help='train a regression layer only')
    parser.add_argument('--batch_size', type=int, help='batch size', default=1)
    parser.add_argument('--seed', type=int, default=0, help='random seed')
    args = parser.parse_args()

    dataset = args.data
    action = args.action
    hidden_nodes = args.hidden_nodes
    budget = args.budget
    num_passes = args.num_passes
    rounding = args.rounding
    steps = args.steps
    adaptive_oracle = args.adaptive_oracle
    epsilon = args.epsilon
    batch_size = args.batch_size
    force_reg = args.force_reg
    seed = args.seed

    np.random.seed(0)

    X_train, y_train, X_test, y_test, scaler = utils.prepare_data(dataset)

    if force_reg:
        dataset += "_reg"

    ext = LocalPerceptronExtractor(dataset, hidden_nodes, X_train, y_train,
                                   rounding=rounding, force_reg=force_reg)

    num_unknowns = hidden_nodes * (X_train.shape[1] + 1) + \
                   len(ext.get_classes()) * (hidden_nodes + 1)

    try:
        budget = int(budget)
    except ValueError:
        budget = int(float(budget) * num_unknowns)

    try:
        steps = map(int, steps)
    except ValueError:
        steps = map(lambda x: int(float(x) * num_unknowns), steps)

    print >> sys.stderr, 'Data: {}, Action: {}, Budget:{}, Seed: {}'.\
        format(dataset, action, budget, seed)
    print >> sys.stderr, 'Number of unknowns: {}'.format(num_unknowns)

    if action == "train":
        ext.train(X_test, y_test, num_passes=num_passes)
    elif action == "extract":
        ext.extract(X_train, y_train, budget, steps=steps,
                    adaptive_oracle=adaptive_oracle, num_passes=num_passes,
                    epsilon=epsilon, batch_size=batch_size, random_seed=seed)
    elif action == "baseline":
        ext.extract(X_train, y_train, budget, steps=steps,
                    adaptive_oracle=adaptive_oracle, baseline=True,
                    num_passes=num_passes, epsilon=epsilon,
                    batch_size=batch_size, random_seed=seed,
                    reg_lambda=1e-40)
    elif action == "compare":
        X_test_u = utils.gen_query_set(X_test.shape[1], 10000)
        ext.compare(X_test, X_test_u, force_reg=force_reg)
    else:
        raise ValueError('Unknown action')
Example #42
print "building..."
model = Seq2Seq(embedding_dim, hidden_dim,source_vocab_size,target_vocab_size, drop_rate)

print "get dataset....."
train_dataset=TextIterator(dataset[0],dataset[1],
                           dictionary[0],dictionary[1],
                           n_words_source=source_vocab_size,n_words_target=target_vocab_size,
                           batch_size=batch_size,maxlen=maxlen)
print "Training...."
begin_again=time.time()
for eidx in xrange(max_epochs):
    uidx=0
    for x,y in train_dataset:
        uidx+=1
        x,x_mask,y,y_mask=prepare_data(x,y,maxlen,source_vocab_size,target_vocab_size)
        ud_start=time.time()
        cost = model.train_model(x,x_mask,y,y_mask,lr)
        ud=time.time()-ud_start
        if np.isnan(cost) or np.isinf(cost):
            print "Nan Detected!"

        if uidx % dispFreq==0:
            print "epoch:",eidx,'uidx',uidx,"cost:",cost

        if uidx%saveFreq==0:
            print "dumping..."
            with open('parameters_%.2f.pkl' % (time.time()-begin_again),'w')as f:
                pickle.dump(model.params,f)

        if uidx%sampleFreq==0:
Example #43
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('data', type=str, help='a dataset')
    parser.add_argument('action', type=str, help='action to perform')
    parser.add_argument('num_repr', type=int, help='number of representers')
    parser.add_argument('budget', type=str, help='query budget')
    parser.add_argument('--num_passes', type=int, help='number of passes',
                        default=1000)
    parser.add_argument('--rounding', type=int, help='rounding digits')
    parser.add_argument('--steps', type=str, nargs='+', default=[],
                        help='adaptive active learning')
    parser.add_argument('--adaptive_oracle', dest='adaptive_oracle',
                        action='store_true',
                        help='adaptive active learning from oracle')
    parser.add_argument('--gamma', type=float,
                        help='RBF kernel hyper-parameter')
    parser.add_argument('--epsilon', type=float, help='learning rate',
                        default=0.1)
    parser.add_argument('--seed', type=int, default=0, help='random seed')
    parser.add_argument('--batch_size', type=int, help='batch size')
    args = parser.parse_args()

    dataset = args.data
    action = args.action
    num_repr = args.num_repr
    budget = args.budget
    num_passes = args.num_passes
    rounding = args.rounding
    steps = args.steps
    adaptive_oracle = args.adaptive_oracle
    gamma = args.gamma
    epsilon = args.epsilon
    seed = args.seed
    batch_size = args.batch_size

    np.random.seed(0)

    X_train, y_train, X_test, y_test, scaler = utils.prepare_data(dataset)
    X_test_u = utils.gen_query_set(X_test.shape[1], 1000)
    ext = LocalKernelExtractor(dataset, X_train, y_train, rounding=rounding)

    num_unknowns = num_repr * X_train.shape[1] + \
                   len(ext.get_classes()) * (num_repr + 1)
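    # e.g. with 20 representers, 784 features and 10 classes this would be
    # 20*784 + 10*(20 + 1) = 15890 unknowns (representer coordinates plus, per
    # class, a weight for each representer and a bias). The numbers and the
    # interpretation are illustrative inferences, not taken from the snippet.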
    try:
        budget = int(budget)
    except ValueError:
        budget = int(float(budget) * num_unknowns)

    try:
        steps = map(int, steps)
    except ValueError:
        steps = map(lambda x: int(float(x) * num_unknowns), steps)

    print >> sys.stderr, 'Data: {}, Action: {}, Budget: {}, Seed: {}'.\
        format(dataset, action, budget, seed)
    print >> sys.stderr, 'Number of unknowns: {}'.format(num_unknowns)

    if action == "train":
        ext.train(num_repr, X_test, y_test)
    elif action == "extract":
        if gamma is None:
            gamma = ext.get_gamma()

        print gamma
        ext.extract(X_train, y_train, num_repr, budget, gamma=gamma, steps=steps,
                    adaptive_oracle=adaptive_oracle, num_passes=num_passes,
                    epsilon=epsilon, random_seed=seed, batch_size=batch_size)
    elif action == "baseline":
        if gamma is None:
            gamma = ext.get_gamma()
        ext.extract(X_train, y_train, num_repr, budget, gamma=gamma, steps=steps,
                    adaptive_oracle=adaptive_oracle, baseline=True,
                    num_passes=num_passes, epsilon=epsilon, random_seed=seed,
                    batch_size=batch_size, reg_lambda=1e-40)
    elif action == "compare":
        ext.compare(X_test, X_test_u, scaler=None)
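# A minimal, self-contained sketch of the budget/steps conversion used above:
# integer strings are taken literally, anything else is read as a fraction of
# the number of unknowns. The helper name and the numbers are illustrative
# assumptions, not part of the original project.
def _to_query_count(value, num_unknowns):
    try:
        return int(value)
    except ValueError:
        return int(float(value) * num_unknowns)

assert _to_query_count('500', 1000) == 500    # absolute budget
assert _to_query_count('2.5', 1000) == 2500   # 2.5x the number of unknowns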
Example #44
0
epoch        = 0
start_at     = time.time()
cur_at       = start_at
state        = make_initial_state(n_units, batchsize=batchsize)
if args.gpu >= 0:
    accum_loss   = Variable(cuda.zeros(()))
    for key, value in state.items():
        value.data = cuda.to_gpu(value.data)
else:
    accum_loss   = Variable(np.zeros((), dtype=np.float32))
print train_data
print 'going to train {} iterations'.format(n_batches * n_epochs)
for i in xrange(n_epochs):
  for j in xrange(n_batches):
    batch_data = train_data[j*batchsize:(j+1)*batchsize]
    batch_data, mask_data = prepare_data(batch_data, vocab_index_of_GO)
    #batch_data = batch_data.T
    #mask_data = mask[j*batchsize:(j+1)*batchsize]
    #mask_data = mask_data.T
    #assert batch_data.shape[0] == args.max_len
    #assert mask_data.shape[0] == args.max_len
    for timestep in xrange(len(batch_data)):
        x_batch = batch_data[timestep]
        y_batch = x_batch * mask_data[timestep]
        m_batch = mask_data[timestep].astype(np.float32)  # has to be converted to float32 here
        mask_bar = 1 - m_batch
        m_batch = m_batch[:, None]

        # s_batch is the mask for the probability distribution
        s_batch = np.zeros((len(x_batch), len(vocab)), dtype=np.float32)
        s_batch[:, 0] = mask_bar
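        # Note: mask_bar is 1 exactly at padded positions, so s_batch pins the
        # predicted distribution of padded rows to vocabulary index 0 (assumed
        # here to be the padding/GO symbol; inferred from the code, not stated).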
Example #45
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('model_file', type=str, help='a pickled model file')
    parser.add_argument('action', type=str, help='action to perform')
    parser.add_argument('budget', type=str, help='query budget')
    parser.add_argument('--num_passes', type=int, help='number of passes',
                        default=1000)
    parser.add_argument('--epsilon', type=float, help='learning rate',
                        default=0.1)
    parser.add_argument('--rounding', type=int, help='rounding digits')
    parser.add_argument('--steps', type=str, nargs='+', default=[],
                        help='adaptive active learning')
    parser.add_argument('--adaptive_oracle', dest='adaptive_oracle',
                        action='store_true',
                        help='adaptive active learning from oracle')
    parser.add_argument('--force_reg', dest='force_reg',
                        action='store_true',
                        help='train a regression layer only')
    parser.add_argument('--batch_size', type=int, help='batch size', default=1)
    parser.add_argument('--seed', type=int, default=0, help='random seed')
    args = parser.parse_args()

    model_file = args.model_file
    action = args.action
    budget = args.budget
    num_passes = args.num_passes
    rounding = args.rounding
    steps = args.steps
    adaptive_oracle = args.adaptive_oracle
    epsilon = args.epsilon
    batch_size = args.batch_size
    force_reg = args.force_reg
    seed = args.seed

    np.random.seed(0)

    X_train, y_train, X_test, y_test, _ = utils.prepare_data('att_faces')

    ext = TheanoMLInversionExtractor(model_file)

    num_unknowns = len(ext.get_classes()) * ext.num_features()
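    # Presumably one full feature vector (one face image) to reconstruct per
    # class; this reading is inferred from the inversion extractor's name and
    # the att_faces dataset, not stated explicitly in the snippet.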

    try:
        budget = int(budget)
    except ValueError:
        budget = int(float(budget) * num_unknowns)

    try:
        steps = map(int, steps)
    except ValueError:
        steps = map(lambda x: int(float(x) * num_unknowns), steps)

    print >> sys.stderr, 'Data: {}, Action: {}, Budget: {}, Seed: {}'.\
        format(model_file, action, budget, seed)
    print >> sys.stderr, 'Number of unknowns: {}'.format(num_unknowns)

    if action == "extract":
        ext.extract(budget, steps=steps, print_epoch=1, 
                    adaptive_oracle=adaptive_oracle, num_passes=num_passes,
                    epsilon=epsilon, batch_size=batch_size, random_seed=seed)
    elif action == "compare":
        X_test_u = utils.gen_query_set(X_test.shape[1], 1000)
        ext.compare(X_test, X_test_u)
    else:
        raise ValueError('Unknown action')
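# Hedged sketch of a query-set generator with the signature used above. The
# real utils.gen_query_set may sample differently; the uniform range here is an
# assumption for illustration only.
import numpy as np

def gen_query_set_sketch(n_features, n_queries, low=-1.0, high=1.0):
    # n_queries random points with the same dimensionality as the test data
    return np.random.uniform(low, high, size=(n_queries, n_features))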
Example #46
0
File: tvec.py Project: imito/odin
args = args_parse(descriptions=[
    ('-feat', 'Input feature for training', None, 'mspec24'),
    ('-task', 'gender, age, dialect, speaker, digit', None, 'gender'),
    ('-batch', 'batch size', None, 32),
    ('-epoch', 'Number of training epoch', None, 12),
    ('--retrain', "delete trained model and re-train everything", None, False)
])
# ===========================================================================
# Const
# ===========================================================================
EXP_DIR, MODEL_PATH, LOG_PATH = get_exp_path('tvec', args, override=args.retrain)
stdio(LOG_PATH)
# ====== load data feeder ====== #
(train, valid,
 X_test_name, X_test_true, X_test_data,
 labels) = prepare_data(feat=args.feat, label=args.task)
n_classes = len(labels)
# ===========================================================================
# Create model
# ===========================================================================
inputs = [K.placeholder(shape=(None,) + shape[1:], dtype='float32', name='input%d' % i)
          for i, shape in enumerate(as_tuple_of_shape(train.shape))]
X = inputs[0]
y = inputs[1]
print("Inputs:", ctext(inputs, 'cyan'))
# ====== create the networks ====== #
with N.args_scope(
    [('Conv', 'Dense'), dict(b_init=None, activation=K.linear, pad='same')],
        ['BatchNorm', dict(activation=K.relu)]):
  f = N.Sequence([
      N.Dimshuffle(pattern=(0, 1, 2, 'x')),
    # tn: test set distribution of shopping_pt (#10-11 merged)
    ############################################################################
    submit = True; N = 50; NS = 9; kfold = 3; N_proc = None;
    include_from_pt = 1; verbose_selection = False
    tn = np.array([18943,13298,9251,6528,4203,2175,959,281,78])
    ############################################################################
    # Random Forest Setting ####################################################
    # Must be a list containing tuples of (ntree, maxfea, leafsize)
    params = [(50,5,23)]
    # ex. [(x,5,23) for x in [35,50,75]] # [(50,x,23) for x in range(4,12)]
    # whatever you'd like to try, this is the place for modifications
    ############################################################################
    
    print "Majority vote using %i models, selecting %i\n" % (N,NS)
    # initialize data
    data, test, con, cat, extra, conf, conf_f, encoders = prepare_data()
    data = data[data.shopping_pt >= include_from_pt]
    print "Including from shopping_pt #%i\n" % data.shopping_pt.min(),
    # features, target, weights (not used)
    X = data[con + cat + conf + extra]; y = data['G_f']; w = np.ones(y.shape)
    
    vmask = reduce(operator.and_, data[conf[:-1]].values.T == data[conf_f[:-1]].values.T)
    scores, imp, ptscores = {}, {}, {}
    for n, m, l in params:
        t = time()
        scores[(m, l)], imp[(m, l)], ptscores[(m, l)] = [], [], []
        col_trscores, col_cvscores = [], []

        # initialize the ensemble of forests to run in parallel;
        # the class is also structured to handle the single-process case
        rfs = RandomForestsParallel(N, n, m, l, N_proc)
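        # For orientation only: roughly the same forest configuration expressed
        # with scikit-learn's RandomForestClassifier. This is an illustrative
        # analogue of (ntree, maxfea, leafsize), not the project's
        # RandomForestsParallel class, and it is not used below.
        from sklearn.ensemble import RandomForestClassifier
        rf_sketch = RandomForestClassifier(n_estimators=n,      # ntree
                                           max_features=m,      # maxfea
                                           min_samples_leaf=l,  # leafsize (assumed mapping)
                                           n_jobs=N_proc)       # parallel workers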