Example #1
def main():
    # train_bundle = process_data('raw/train.csv', None, interpolation=True, add_noise=True)

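    # First 160 items of the processed bundle are used for training; the rest form the validation split.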
    train_bundle = process_data('raw/train.csv',
                                None,
                                interpolation=True,
                                add_noise=True)[:160]
    val_bundle = process_data('raw/train.csv',
                              None,
                              interpolation=False,
                              add_noise=False)[160:]

    # features = np.load('../osic_version_03/label/test_01.npy')
    # features = read_csv('../osic_version_03/label/offset.csv')
    # train_set = LsmDataset(train_bundle, features=features[:160], tag='train')
    # val_set = LsmDataset(val_bundle, features=features[160:], tag='val')

    train_set = LsmDataset(train_bundle, features=None, tag='train')
    val_set = LsmDataset(val_bundle, features=None, tag='val')
    # val_set = None

    model = LsmModel()
    model.fit(train_set, val_set)

    print(model.a.shape)
    model.save_model('model/lsm_15.npy')
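Example #2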
def evaluate_accuracy(model, loader, vali_data, batchsize, dim_input_course,
                      dim_input_grade, dim_input_major):

    model.eval()
    summ1 = 0  # ABCDF
    summ2 = 0  # credit/uncredit
    len1 = len2 = 0
    for step, (batch_x,
               batch_y) in enumerate(loader):  # batch_x: index of batch data
        processed_data = process_data(batch_x.numpy(), vali_data, batchsize,
                                      dim_input_course, dim_input_grade,
                                      dim_input_major)
        padded_input = Variable(torch.Tensor(processed_data[0]),
                                requires_grad=False).cuda()
        seq_len = processed_data[1]
        padded_label = Variable(torch.Tensor(processed_data[2]),
                                requires_grad=False).cuda()

        # clear hidden states
        model.hidden = model.init_hidden()

        # compute output
        y_pred = model(padded_input, seq_len).cuda()

        # only compute the accuracy for testing period
        accura = accuracy(y_pred, seq_len, padded_label)
        len1 += accura[3]
        len2 += accura[4]
        summ1 += (accura[0] * accura[3])
        summ2 += (accura[1] * accura[4])

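    # Weight each accuracy by its sample count so the result is an average over samples, not batches.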
    average_accuracy = (summ1 + summ2) / (len1 + len2)
    return average_accuracy
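Example #3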
def evaluate_loss(model, loader, vali_data, batchsize, dim_input_course,
                  dim_input_grade, dim_input_major, weight1, weight2):

    model.eval()
    summ = []
    for step, (batch_x,
               batch_y) in enumerate(loader):  # batch_x: index of batch data
        processed_data = process_data(batch_x.numpy(), vali_data, batchsize,
                                      dim_input_course, dim_input_grade,
                                      dim_input_major)
        padded_input = Variable(torch.Tensor(processed_data[0]),
                                requires_grad=False).cuda()
        seq_len = processed_data[1]
        padded_label = Variable(torch.Tensor(processed_data[2]),
                                requires_grad=False).cuda()

        # clear hidden states
        model.hidden = model.init_hidden()
        model.hidden[0] = model.hidden[0].cuda()
        model.hidden[1] = model.hidden[1].cuda()
        # compute output
        y_pred = model(padded_input, seq_len).cuda()
        # only compute the loss for testing period
        loss = model.vali_loss(y_pred, seq_len, padded_label, weight1,
                               weight2).cuda()
        summ.append(loss.item())
    average_loss = np.average(summ)
    return average_loss
Example #4
def train(model, optimizer, loader, train_data, epoch):

    model.train()
    summ = []
    for step, (batch_x,
               batch_y) in enumerate(loader):  # batch_x: index of batch data
        print('Epoch: ', epoch, ' | Iteration: ', step + 1)
        processed_data = process_data(batch_x.numpy(), train_data, batchsize,
                                      dim_input_course, dim_input_grade)
        padded_input = Variable(torch.Tensor(processed_data[0]),
                                requires_grad=False).cuda()
        seq_len = processed_data[1]
        padded_label = Variable(torch.Tensor(processed_data[2]),
                                requires_grad=False).cuda()

        # clear gradients and hidden state
        optimizer.zero_grad()
        model.hidden = model.init_hidden()
        model.hidden[0] = model.hidden[0].cuda()
        model.hidden[1] = model.hidden[1].cuda()
        y_pred = model(padded_input, seq_len).cuda()
        loss = model.loss(y_pred, padded_label).cuda()
        print('Epoch ' + str(epoch) + ': ' + 'The ' + str(step + 1) +
              '-th iteration: loss = ' + str(loss.item()) + '\n')
        loss.backward()
        if clip_gradient > 0:
            clip_grad_norm(model.parameters(), clip_gradient)
        optimizer.step()
        summ.append(loss.item())

    average_loss = np.mean(summ)
    return average_loss
Example #5
async def meh(event):

    sender = await event.get_sender()
    if '/start' in event.raw_text:

        await bot.send_message(sender, 'Привет')  # 'Hello'
        if not database.is_user_new(sender.id):
            await bot.send_message(sender,
                                   "Выбери категории:",  # "Choose categories:"
                                   buttons=buttons)
        else:
            await bot.send_message(sender,
                                   'Хочешь изменить выбор?',  # 'Want to change your choice?'
                                   buttons=buttons_to_change)
    else:

        await bot.send_message(sender, 'Напиши: /start')  # 'Write: /start'

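    # Compare the current hour with the one stored in last_date.txt; re-parse channels and resend posts when it has changed.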
    with open('last_date.txt') as f:
        last_time = f.read()
    if abs(datetime.datetime.now().hour - int(last_time)) > 0:
        with open('last_date.txt', 'w') as f:
            f.write(str(datetime.datetime.now().hour))

        if abs(datetime.datetime.now().hour - int(last_time)) > 24:
            database.truncate_all_data()

        database.truncate_tempdata()
        ch = database.get_channels(amount_bottom=10, amount_top=390)
        await parse(ch, 400, sleep_time=10, update_id=True)

        process_data(probability_sim=0.3, rnn=True)
        await send_posts(amount_to_send=0)

    if '/help' in event.raw_text:
        pass
Example #6
def main():
    bundle = process_data('raw/train.csv', add_noise=False)
    train_bundle, val_bundle = bundle[:160], bundle[160:]

    train_set = OsicDataset(train_bundle,
                            train_transform,
                            tag='train',
                            sample_num=100)
    val_set = OsicDataset(val_bundle, val_transform, tag='val')

    lsm_model = LsmModel()
    lsm_model.load_model('model/lsm_09.npy')

    model = OsicModel('cnn_01',
                      lsm_model=lsm_model,
                      net=NetFc(input_dim=13, input_channel=1, output_dim=1),
                      learning_rate=5e-5)
    model.fit(train_set, val_set, epochs=200, batch_size=32)
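Example #7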
def pretrain(rnn_params, input_file, training_split, feature_file, test_file,
             judgement_file):
    raw_data = get_corpus_data(input_file)
    inventory, phone2ix, ix2phone, training, dev = process_data(
        raw_data, dev=False, training_split=training_split)
    inventory_size = len(inventory)
    rnn_params['inv_size'] = inventory_size

    if feature_file is None:
        RNN = Emb_RNNLM(rnn_params)
        print('Fitting embedding model...')
    else:
        features, num_feats = process_features(feature_file, inventory)
        # build feature table, to replace embedding table, No grad b/c features
        # are fixed
        feature_table = torch.zeros(inventory_size,
                                    num_feats,
                                    requires_grad=False)
        for i in range(inventory_size):
            feature_table[i] = torch.tensor(features[ix2phone[i]])
        rnn_params['d_feats'] = num_feats
        RNN = Feature_RNNLM(rnn_params, feature_table)
        print('Fitting feature model...')

    RNN = RNN.cuda()
    dev = dev.cuda()
    train_lm(training, dev, rnn_params, RNN)
    RNN.eval()
    # TODO add automatic creation of models and results folder
    torch.save(
        RNN,
        f"models/rnn_{rnn_params['stress']}_{rnn_params['num_layers']}_{rnn_params['d_emb']}_{rnn_params['d_hid']}.pt"
    )

    prepend = "stress_" if rnn_params["stress"] else ""
    pickle.dump(phone2ix, open(f'models/{prepend}phone2ix.bin', mode='wb'))
    pickle.dump(ix2phone, open(f'models/{prepend}ix2phone.bin', mode='wb'))
    pickle.dump(inventory, open(f'models/{prepend}inventory.bin', mode='wb'))

    get_probs(test_file, RNN, phone2ix, judgement_file)
Example #8
                        default=DEFAULT_DEV,
                        help='Trains on all data and tests on a small subset.')
    parser.add_argument('--device', default=DEFAULT_DEVICE, help='cpu or cuda')
    parser.add_argument('--num_rnns',
                        default=DEFAULT_NUM_RNNS,
                        help='number of rnns')

    args = parser.parse_args()
    if args.device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    else:
        device = args.device
    print('Running on', device)

    raw_data = get_corpus_data(args.input_file)
    inventory, phone2ix, ix2phone, training, dev = process_data(
        raw_data, device, dev=args.dev, training_split=args.training_split)
    print('training size', training.size())
    inventory_size = len(inventory)

    rnn_params = {}
    rnn_params['d_emb'] = args.d_emb
    rnn_params['d_hid'] = args.d_hid
    rnn_params['num_layers'] = args.num_layers
    rnn_params['batch_size'] = args.batch_size
    rnn_params['learning_rate'] = args.learning_rate
    rnn_params['epochs'] = args.epochs
    rnn_params['tied'] = args.tied
    rnn_params['device'] = device
    rnn_params['inv_size'] = inventory_size

    rnns = {}
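Example #9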
def evaluate_metrics(model, loader, vali_data, batchsize, dim_input_course,
                     dim_input_grade, dim_input_major):

    model.eval()
    summ1 = 0  # >=B or <B
    summ2 = 0  # credit/uncredit

    len1 = len2 = 0
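    # Two-element counters: index 0 tracks the >=B / <B task, index 1 the credit/uncredit task.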
    tp = np.zeros(2)
    tn = np.zeros(2)
    true = np.zeros(2)
    false = np.zeros(2)
    predict_true = np.zeros(2)
    predict_false = np.zeros(2)
    for step, (batch_x,
               batch_y) in enumerate(loader):  # batch_x: index of batch data
        processed_data = process_data(batch_x.numpy(), vali_data, batchsize,
                                      dim_input_course, dim_input_grade,
                                      dim_input_major)
        padded_input = Variable(torch.Tensor(processed_data[0]),
                                requires_grad=False).cuda()
        seq_len = processed_data[1]
        padded_label = Variable(torch.Tensor(processed_data[2]),
                                requires_grad=False).cuda()

        # clear hidden states
        model.hidden = model.init_hidden()
        model.hidden[0] = model.hidden[0].cuda()
        model.hidden[1] = model.hidden[1].cuda()
        # compute output
        y_pred = model(padded_input, seq_len)

        # only compute the accuracy for testing period
        accura = accuracy(y_pred, seq_len, padded_label)
        len1 += accura[3]
        len2 += accura[4]
        summ1 += (accura[0] * accura[3])
        summ2 += (accura[1] * accura[4])

        print('>=B or not', accura[0], 'credit/uncredit', accura[1], 'total',
              accura[2])

        # compute tp, fp, fn, tn
        sen = sensitivity(y_pred, seq_len, padded_label)
        tp += sen[0]
        tn += sen[1]
        true += sen[2]
        false += sen[3]
        predict_true += sen[4]
        predict_false += sen[5]

    average_metric1 = summ1 / len1
    average_metric2 = summ2 / len2
    average_metric = (summ1 + summ2) / (len1 + len2)

    print("num of >=B or <B: ", len1, "num of credit/uncredit: ", len2)
    print("On average: ", average_metric1, average_metric2, average_metric)

    tpr = tp / true
    fpr = (predict_true - tp) / false
    fnr = (predict_false - tn) / true
    tnr = tn / false

    precision_B = (tn / predict_false)[0]
    f_value_B = 2 / (1 / tnr[0] + 1 / precision_B)
    precision_uncredit = (tn / predict_false)[-1]
    f_value_uncredit = 2 / (1 / tnr[-1] + 1 / precision_uncredit)
    f_value = np.append(f_value_B, f_value_uncredit)
    print("tpr: ", tpr)
    print("fpr: ", fpr)
    print("fnr: ", fnr)
    print("tnr: ", tnr)
    print('F: ', f_value, 'average F:', np.average(f_value))
Example #10
def main(_):
    word2id = {}
    ent2id = {}
    rel2id = {}
    words = set()
    relations = set()
    entities = set()

    FLAGS.checkpoint_dir = os.path.join(FLAGS.checkpoint_dir, FLAGS.data_file)
    FLAGS.checkpoint_dir = os.path.join(FLAGS.checkpoint_dir, FLAGS.KB_file)
    if not os.path.exists(FLAGS.checkpoint_dir):
        os.makedirs(FLAGS.checkpoint_dir)

    KB_file = '%s/%s.txt' % (FLAGS.data_dir, FLAGS.KB_file)
    data_file = '%s/%s.txt' % (FLAGS.data_dir, FLAGS.data_file)

    start = time.time()
    if FLAGS.data_file == "WC-C":
        Q, A, P, S, Triples, FLAGS.query_size = process_data_c(
            KB_file, data_file, word2id, rel2id, ent2id, words, relations,
            entities)
        FLAGS.path_size = len(P[0][0])  #5
    else:
        Q, A, P, S, Triples, FLAGS.query_size = process_data(
            KB_file, data_file, word2id, rel2id, ent2id, words, relations,
            entities)
        FLAGS.path_size = len(P[0])  #5 or 7 or

    FLAGS.nhop = FLAGS.path_size // 2  # 2 or 3

    print("read data cost %f seconds" % (time.time() - start))
    FLAGS.nwords = len(word2id)
    FLAGS.nrels = len(rel2id)
    FLAGS.nents = len(ent2id)

    #trainQ, testQ, trainA, testA, trainP, testP, trainS, testS = cross_validation.train_test_split(Q, A, P, S, test_size=.1, random_state=123)
    #trainQ, validQ, trainA, validA, trainP, validP, trainS, validS = cross_validation.train_test_split(trainQ, trainA, trainP, trainS, test_size=.11, random_state=0)
    trainQ, testQ, trainA, testA, trainP, testP, trainS, testS = train_test_split(
        Q, A, P, S, test_size=.1, random_state=123)
    trainQ, validQ, trainA, validA, trainP, validP, trainS, validS = train_test_split(
        trainQ, trainA, trainP, trainS, test_size=.11, random_state=0)

    # for UNSEEN relations (incomplete kb setting, change data_utils.py)
    if FLAGS.unseen:
        id_c = []
        for idx in range(trainQ.shape[0]):
            if trainP[idx][-4] in (1, 2, 3):
                id_c.append(idx)
        trainQ = np.delete(trainQ, id_c, axis=0)
        trainA = np.delete(trainA, id_c, axis=0)
        trainP = np.delete(trainP, id_c, axis=0)
        trainS = np.delete(trainS, id_c, axis=0)

    n_train = trainQ.shape[0]
    n_test = testQ.shape[0]
    n_val = validQ.shape[0]
    print("Training Size", n_train)
    print("Validation Size", n_val)
    print("Testing Size", n_test)

    #
    #other data and some flags
    #
    id2word = dict(zip(word2id.values(), word2id.keys()))
    id2rel = dict(
        zip(rel2id.values(), rel2id.keys())
    )  #{0: '<end>', 1: 'cause_of_death', 2: 'gender', 3: 'profession', 4: 'institution', 5: 'religion', 6: 'parents', 7: 'location', 8: 'place_of_birth', 9: 'nationality', 10: 'place_of_death', 11: 'spouse', 12: 'children', 13: 'ethnicity'}

    train_labels = np.argmax(trainA, axis=1)
    test_labels = np.argmax(testA, axis=1)
    valid_labels = np.argmax(validA, axis=1)

    print(flags.FLAGS.__flags)

    #batch_id
    #batches = [(start, end) for start, end in batches] abandom last few examples
    batches = list(
        zip(range(0, n_train - FLAGS.batch_size, FLAGS.batch_size),
            range(FLAGS.batch_size, n_train, FLAGS.batch_size)))

    r = np.arange(n_train)  # instance idx to be shuffled
    l = n_train // FLAGS.batch_size * FLAGS.batch_size  # total instances used in training

    with tf.Session() as sess:
        if not FLAGS.data_file == "WC-C":
            model = IRN(FLAGS, sess)

            print("KB Size", Triples.shape[0])  #144
            pre_batches = list(
                zip(
                    range(0, Triples.shape[0] - FLAGS.batch_size,
                          FLAGS.batch_size),
                    range(FLAGS.batch_size, Triples.shape[0],
                          FLAGS.batch_size)))

            pre_val_preds = model.predict(Triples, validQ, validP)
            pre_test_preds = model.predict(Triples, testQ, testP)
            best_val_epoch = -1
            best_val_acc = MultiAcc(validP, pre_val_preds, FLAGS.path_size)
            best_val_true_acc = InSet(validP, validS, pre_val_preds)

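            # Each epoch: FLAGS.inner_nepoch passes of pretraining on KB triples, then one pass of QA batch training.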
            for t in range(1, FLAGS.nepoch + 1):
                start = time.time()
                np.random.shuffle(batches)
                for i in range(FLAGS.inner_nepoch):
                    np.random.shuffle(pre_batches)
                    pre_total_cost = 0.0
                    for s, e in pre_batches:
                        pre_total_cost += model.batch_pretrain(
                            Triples[s:e], trainQ[0:FLAGS.batch_size],
                            trainA[0:FLAGS.batch_size],
                            np.argmax(trainA[0:FLAGS.batch_size],
                                      axis=1), trainP[0:FLAGS.batch_size])

                total_cost = 0.0
                for s, e in batches:
                    total_cost += model.batch_fit(
                        Triples[s:e], trainQ[s:e], trainA[s:e],
                        np.argmax(trainA[s:e], axis=1), trainP[s:e])

                if t % 1 == 0:
                    train_preds = model.predict(Triples, trainQ, trainP)
                    train_acc = MultiAcc(trainP, train_preds, FLAGS.path_size)
                    train_true_acc = InSet(trainP, trainS, train_preds)

                    val_preds = model.predict(
                        Triples, validQ,
                        validP)  # (n_val,1)  each is answer id
                    val_acc = MultiAcc(validP, val_preds, FLAGS.path_size)
                    val_true_acc = InSet(validP, validS, val_preds)

                    if val_true_acc > best_val_true_acc:
                        best_val_epoch = t
                        best_val_true_acc = val_true_acc
                        model.store()

                    print('-----------------------')
                    print('Epoch', t)
                    print('timing', (time.time() - start))
                    print('Total Cost:', total_cost)
                    print('Train Accuracy:', train_true_acc)
                    print('Validation Accuracy:', val_true_acc)
                    print('Best Validation epoch & Acc:', best_val_epoch,
                          best_val_true_acc)
                    print('-----------------------')
                    '''
                    if not t % 100 == 0:
                        continue
                    idx = model.match()
                    for i in range(1,14):
                        print "relation: ",id2word[i]
                        print "similar words are: "
                        for iid in idx[i]:
                            print id2word[iid]
                        print('-----------------------')
                        print('-----------------------')
                    '''

        elif FLAGS.data_file == "WC-C":
            model = IRN_C(FLAGS, sess)

            print("KB Size", Triples.shape[0])  #144
            pre_batches = list(
                zip(
                    range(0, Triples.shape[0] - FLAGS.batch_size,
                          FLAGS.batch_size),
                    range(FLAGS.batch_size, Triples.shape[0],
                          FLAGS.batch_size)))

            pre_val_preds = model.predict(Triples, validQ, validP)
            pre_test_preds = model.predict(Triples, testQ, testP)
            best_val_epoch = -1
            best_val_acc = MultiAcc_C(validP, pre_val_preds)
            best_val_true_acc = InSet(validP, validS, pre_val_preds)

            for t in range(1, FLAGS.nepoch + 1):
                start = time.time()
                np.random.shuffle(batches)
                for i in range(FLAGS.inner_nepoch):
                    np.random.shuffle(pre_batches)
                    pre_total_cost = 0.0
                    for s, e in pre_batches:
                        pre_total_cost += model.batch_pretrain(
                            Triples[s:e], trainQ[0:FLAGS.batch_size],
                            trainA[0:FLAGS.batch_size],
                            np.argmax(trainA[0:FLAGS.batch_size],
                                      axis=1), trainP[0:FLAGS.batch_size])

                total_cost = 0.0
                for s, e in batches:
                    total_cost += model.batch_fit(
                        Triples[s:e], trainQ[s:e], trainA[s:e],
                        np.argmax(trainA[s:e], axis=1), trainP[s:e])

                if t % 1 == 0:
                    train_preds = model.predict(Triples, trainQ, trainP)
                    train_acc = MultiAcc_C(trainP, train_preds)
                    train_true_acc = InSet(trainP, trainS, train_preds)
                    val_preds = model.predict(
                        Triples, validQ,
                        validP)  # (n_val,1)  each is answer id
                    val_acc = MultiAcc_C(validP, val_preds)
                    val_true_acc = InSet(validP, validS, val_preds)

                    if val_true_acc > best_val_true_acc:
                        best_val_epoch = t
                        best_val_true_acc = val_true_acc
                        model.store()

                    print('-----------------------')
                    print('Epoch', t)
                    print('timing', (time.time() - start))
                    print('Total Cost:', total_cost)
                    print('Train Accuracy:', train_true_acc)
                    print('Validation Accuracy:', val_true_acc)
                    print('Best Validation epoch & Acc:', best_val_epoch,
                          best_val_true_acc)
                    print('-----------------------')
Example #11
def train(vi=None,
          device='/gpu:1',
          label_Flag=False,
          fps='Morgan',
          pro='psa',
          fps_size=512,
          nclass=2):
    #Data set
    ds = 'HIV'
    prop = pro

    #Settings

    if fps == 'Maccs':
        fps_size = 167

    continue_training = True

    if continue_training:
        preTrain = 'preTrain'
    else:
        preTrain = 'scratch'

    save_model = False
    batch_size = 256
    _flag_noise = False
    nEpoch = 501
    fps_dim = fps_size
    latent_space = 6
    n_classes = nclass
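    # Layer widths for the encoder/decoder: fps_dim/2 -> fps_dim/8 -> latent dimension.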
    layers_dim = np.array([fps_dim // 2, fps_dim // 8, latent_space])

    data = dp.process_data(ds,
                           fps_type=fps,
                           n_classes=n_classes,
                           nBits=fps_dim,
                           test_size=600,
                           prop=prop)

    len_train = len(data['fps_train'])
    len_val = len(data['fps_test'])

    if fps == 'Morgan':
        val_lr_enc = 0.00001
        val_lr_dec = 0.0001
        val_lr_dis = 0.00001
    elif fps == 'Maccs':
        val_lr_enc = 0.00003
        val_lr_dec = 0.0001
        val_lr_dis = 0.000012

    decay_steps = 1000
    thrs_noise = 0.8

    #Paths
    model_path = os.path.join(
        '/mnt/HDD1/models/',
        ds + '_' + fps + '_' + prop + '_classes' + str(n_classes))
    preTrain_model_path = os.path.join(
        '/mnt/HDD1/models/',
        'Train' + '_' + fps + '_' + prop + '_classes' + str(n_classes))
    if not os.path.exists(model_path):
        os.makedirs(model_path)

    #File name

    with tf.device(device):
        with tf.variable_scope('input'):
            #real and fake image placholders
            real_fps = tf.placeholder(tf.float32,
                                      shape=[None, fps_dim],
                                      name='real_fps')
            gen_fps = tf.placeholder(tf.float32,
                                     shape=[None, fps_dim],
                                     name='gen_fps')
            if label_Flag:
                dist_encode = tf.placeholder(
                    tf.float32,
                    shape=[None, layers_dim[2] + n_classes],
                    name='real_z')
            else:
                dist_encode = tf.placeholder(tf.float32,
                                             shape=[None, layers_dim[2]],
                                             name='real_z')
            labels = tf.placeholder(tf.float32,
                                    shape=[None, n_classes],
                                    name='labels')

            is_train_enc = tf.placeholder(tf.bool, name='is_train_enc')
            is_train_dec = tf.placeholder(tf.bool, name='is_train_dec')
            is_train_dis = tf.placeholder(tf.bool, name='is_train_dis')

            global_step = tf.placeholder(tf.float32, name='global_step')

            lengt = tf.placeholder(tf.float32, name='lengt')

            l = tf.placeholder(tf.float32,
                               shape=[None, latent_space],
                               name='l')

            fp = tf.placeholder(tf.float32, shape=[None, fps_dim], name='fp')

        lr_dis = tf.train.polynomial_decay(val_lr_dis,
                                           global_step,
                                           decay_steps,
                                           end_learning_rate=0.000001,
                                           power=1.0)
        lr_enc = tf.train.polynomial_decay(val_lr_enc,
                                           global_step,
                                           decay_steps,
                                           end_learning_rate=0.000001,
                                           power=1.0)
        lr_dec = tf.train.polynomial_decay(val_lr_dec,
                                           global_step,
                                           decay_steps,
                                           end_learning_rate=0.000001,
                                           power=2.0)
        # wgan
        real_encode = module.dense_encoder(real_fps,
                                           fps_dim,
                                           layers_dim,
                                           is_train=is_train_enc,
                                           reuse=False)
        real_decode = module.dense_decoder(real_encode,
                                           fps_dim,
                                           layers_dim,
                                           is_train=is_train_dec,
                                           reuse=False)

        if label_Flag:
            real_encode = tf.concat([real_encode, labels], 1)

        #Discriminator
        real_result = module.dense_discriminator(dist_encode,
                                                 layers_dim,
                                                 is_train=is_train_dis,
                                                 n_classes=n_classes,
                                                 reuse=False,
                                                 label_Flag=label_Flag)
        fake_result = module.dense_discriminator(real_encode,
                                                 layers_dim,
                                                 is_train=is_train_dis,
                                                 n_classes=n_classes,
                                                 reuse=True,
                                                 label_Flag=label_Flag)

        decode = module.heavside(
            module.dense_decoder(l,
                                 fps_dim,
                                 layers_dim,
                                 is_train=False,
                                 reuse=True))
        encode = module.dense_encoder(fp,
                                      fps_dim,
                                      layers_dim,
                                      is_train=False,
                                      reuse=True)
        #Loss calculations
        #dis_loss_real = tf.losses.mean_squared_error(real_result, tf.ones_like(real_result))
        #dis_loss_fake = tf.losses.mean_squared_error(fake_result, -tf.ones_like(fake_result))
        #dis_loss_real = tf.nn.sigmoid_cross_entropy_with_logits(labels = tf.ones_like(real_result),logits =real_result)
        #dis_loss_fake = tf.nn.sigmoid_cross_entropy_with_logits(labels = tf.zeros_like(fake_result), logits = fake_result)
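        # Wasserstein-style critic loss: minimizing -mean(real) pushes scores on prior samples up,
        # while minimizing mean(fake) pushes scores on encoder outputs down.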
        dis_loss_fake = tf.reduce_mean(fake_result)
        dis_loss_real = -tf.reduce_mean(real_result)
        dis_loss = tf.reduce_mean([dis_loss_real, dis_loss_fake])

        enc_loss = -tf.reduce_mean(fake_result)
        #enc_loss = tf.losses.mean_squared_error(fake_result, tf.ones_like(fake_result))
        dec_loss = tf.losses.mean_squared_error(real_fps, real_decode)
        #dec_loss = tf.reduce_mean(tf.reduce_sum(tf.abs(real_fps - real_decode)))

        #Trainers
        t_vars = tf.trainable_variables()
        dis_vars = [var for var in t_vars if 'dense_discriminator' in var.name]
        enc_vars = [var for var in t_vars if 'dense_encoder' in var.name]
        dec_vars = [var for var in t_vars if 'dense_decoder' in var.name]

        trainer_dis_real = tf.train.AdamOptimizer(
            learning_rate=lr_dis,
            beta1=0.9,
            beta2=0.999,
            epsilon=1e-08,
            use_locking=False,
            name='Adam_discriminator').minimize(dis_loss_real,
                                                var_list=dis_vars)
        trainer_dis_fake = tf.train.AdamOptimizer(
            learning_rate=lr_dis,
            beta1=0.9,
            beta2=0.999,
            epsilon=1e-08,
            use_locking=False,
            name='Adam_discriminator').minimize(dis_loss_fake,
                                                var_list=dis_vars)

        trainer_enc = tf.train.AdamOptimizer(learning_rate=lr_enc,
                                             beta1=0.9,
                                             beta2=0.999,
                                             epsilon=1e-08,
                                             use_locking=False,
                                             name='Adam_encoder').minimize(
                                                 enc_loss, var_list=enc_vars)

        trainer_dec = tf.train.AdamOptimizer(learning_rate=lr_dec,
                                             beta1=0.9,
                                             beta2=0.999,
                                             epsilon=1e-08,
                                             use_locking=False,
                                             name='Adam_decoder').minimize(
                                                 dec_loss, var_list=dec_vars)

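        # WGAN-style weight clipping keeps the critic (discriminator) weights in [-0.01, 0.01].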
        d_clip = [v.assign(tf.clip_by_value(v, -0.01, 0.01)) for v in dis_vars]

        #Accuracy calculations
        less_then_05 = tf.cast(
            tf.math.less_equal(tf.zeros_like(real_result), real_result),
            tf.float32)
        count = tf.reduce_sum(less_then_05)
        acc_real = tf.divide(count, lengt)
        acc_fake = tf.divide(
            tf.reduce_sum(
                tf.cast(
                    tf.math.less_equal(fake_result,
                                       tf.zeros_like(fake_result)),
                    tf.float32)), lengt)
        acc_dis = tf.reduce_mean([acc_real, acc_fake])

        acc_enc = 1 - acc_fake
        gen_fps = module.heavside(real_decode)

        acc_dec = tf.metrics.accuracy(module.heavside(real_fps),
                                      module.heavside(real_decode))

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.43)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())
    # continue training
    if continue_training:
        ckpt = tf.train.latest_checkpoint(preTrain_model_path)
        saver.restore(sess, ckpt)

    #gpu_options = tf.GPUOptions(allow_growth=True)
    #session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False,
    #                                gpu_options=gpu_options)
    #threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    batch_num = math.floor(len(data['fps_train']) / batch_size)
    print('batch size: %d, batch num per epoch: %d, epoch num: %d' %
          (batch_size, batch_num, nEpoch))
    print('start training...')

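    # Per-batch update counts for the decoder, critic and encoder; adjusted adaptively below from the running accuracies.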
    dec_iters, dis_iters, enc_iters = 5, 1, 0
    trainLoss_dis, trainAcc_dis = 0, 0
    trainLoss_enc, trainAcc_enc = 0, 0
    trainLoss_dec, trainAcc_dec = 0, 0
    valLoss_dis, valLoss_enc, valLoss_dec = 0, 0, 0
    valAcc_dis, valAcc_enc, valAcc_dec = 0, 0, 0
    for i in range(nEpoch):

        if (trainAcc_dis < 0.505 and trainAcc_enc > 0.98):
            enc_iters = 1
            dis_iters = 7
            thrs_noise = 0.9
            _flag_noise = False
        elif (trainAcc_dis < 0.505 and trainAcc_enc < 0.01):
            dis_iters = 1
            enc_iters = 1
            thrs_noise = 0.7
            _flag_noise = False
        else:
            _flag_noise = False
            thrs_noise = 0.95
            dis_iters = 5
            enc_iters = 1

        trainLoss_dis, trainAcc_dis = 0, 0
        trainLoss_enc, trainAcc_enc = 0, 0
        trainLoss_dec, trainAcc_dec = 0, 0
        valLoss_dis, valLoss_enc, valLoss_dec = 0, 0, 0
        valAcc_dis, valAcc_enc, valAcc_dec = 0, 0, 0

        max_iter = max([dec_iters, dis_iters, enc_iters])
        batch = dp.batch_gen(data['fps_train'],
                             data['labels_train'],
                             batch_size=batch_size,
                             n_dim=layers_dim[2],
                             n_labels=n_classes,
                             label_Flag=label_Flag,
                             dic_iter=max_iter)
        print("Epoch %d" % i)
        train_real_z = distrib.normal_mixture(data['labels_train'],
                                              np.shape(
                                                  data['labels_train'])[0],
                                              n_dim=layers_dim[2],
                                              n_labels=n_classes)
        val_real_z = distrib.normal_mixture(data['labels_val'],
                                            np.shape(data['labels_val'])[0],
                                            n_dim=layers_dim[2],
                                            n_labels=n_classes)
        if label_Flag:
            train_real_z = np.concatenate((train_real_z, data['labels_train']),
                                          axis=1)
            val_real_z = np.concatenate((val_real_z, data['labels_val']),
                                        axis=1)

        for j in range(batch_num):

            if _flag_noise and np.random.uniform(0, 1) > thrs_noise:
                _real_fps = batch['fps'][j] + np.random.normal(
                    0, 0.4, size=np.shape(batch['fps'][j]))
            else:
                _real_fps = batch['fps'][j]
            enc_dict = {
                real_fps: _real_fps,
                labels: batch['label'][j],
                global_step: i,
                is_train_enc: True,
                is_train_dis: False
            }

            for k in range(dis_iters):
                if _flag_noise and np.random.uniform(0, 1) > thrs_noise:
                    _real_fps = batch['fps'][
                        j * dis_iters + k] + np.random.normal(
                            0,
                            0.4,
                            size=np.shape(batch['fps'][j * dis_iters + k]))
                else:
                    _real_fps = batch['fps'][j * dis_iters + k]
                dis_dict = {
                    real_fps: _real_fps,
                    labels: batch['label'][j * dis_iters + k],
                    dist_encode: batch['real_z'][j * dis_iters + k],
                    global_step: i,
                    is_train_enc: False,
                    is_train_dis: True
                }

                sess.run([trainer_dis_real], feed_dict=dis_dict)
                sess.run([trainer_dis_fake], feed_dict=dis_dict)

            # Update the encoder
            for k in range(enc_iters):
                sess.run([trainer_enc], feed_dict=enc_dict)

            # Update decoder
            for k in range(dec_iters):
                if _flag_noise and np.random.uniform(0, 1) > thrs_noise:
                    _real_fps = batch['fps'][
                        j * dis_iters + k] + np.random.normal(
                            0,
                            0.2,
                            size=np.shape(batch['fps'][j * dis_iters + k]))
                else:
                    _real_fps = batch['fps'][j * dis_iters + k]
                dec_dict = {
                    real_fps: _real_fps,
                    global_step: i,
                    is_train_dec: True,
                    is_train_enc: False
                }
                sess.run([trainer_dec], feed_dict=dec_dict)
        nom = 10000
        ds_size_nom = np.shape(data['fps_train'])[0] // nom + 1

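        # Every 10 epochs: encode the training set in chunks, draw latent samples via GM.generate_latent,
        # decode them, and log Tversky similarity against the test fingerprints.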
        if i % 10 == 0:
            l_space = np.zeros(
                [latent_space, np.shape(data['fps_train'])[0]],
                dtype=np.float32)
            for b in range(ds_size_nom):

                l_space[:, b * nom:b * nom + nom] = (np.array(
                    sess.run([encode],
                             feed_dict={
                                 fp:
                                 data['fps_train'][b * nom:b * nom + nom, :]
                             }))[0].T)

            sample = GM.generate_latent(l_space,
                                        np.array(data['labels_train']))
            for j in sample.keys():
                generated_fingerprints = np.array(
                    sess.run([decode], feed_dict={l: sample[j]})[0])
                for k in range(n_classes):
                    avg_tver, max_tver, min_tver, u_tver, su_tver, nu_tver = sim.tversky(
                        data['fps_test'][k], generated_fingerprints, 1, 1)
                    arg1 = {
                        'Average_tversky': [avg_tver],
                        'Max_tversky': [max_tver],
                        'Min_tversky': [min_tver],
                        'Useful_tversky': [u_tver],
                        'Semiuseful_tversky': [su_tver],
                        'Notuseful_tversky': [nu_tver]
                    }
                    log.log_sim_data(i,
                                     arg1,
                                     flag=label_Flag,
                                     fps=fps,
                                     dSet=ds,
                                     prop=prop,
                                     n_class=n_classes,
                                     preTrain=preTrain)
        d = np.zeros([np.shape(data['fps_train'])[0], latent_space],
                     dtype=np.float32)
        for b in range(ds_size_nom):

            train_loss_dict = {
                real_fps: data['fps_train'][b * nom:b * nom + nom],
                labels: data['labels_train'][b * nom:b * nom + nom],
                dist_encode: train_real_z,
                is_train_dec: False,
                is_train_enc: False,
                is_train_dis: False,
                lengt: len_train
            }
            val_loss_dict = {
                real_fps: data['fps_val'],
                labels: data['labels_val'],
                dist_encode: val_real_z,
                is_train_dec: False,
                is_train_enc: False,
                is_train_dis: False,
                lengt: len_val
            }

            d[b * nom:b * nom + nom, :] = sess.run(
                [encode],
                feed_dict={fp: data['fps_train'][b * nom:b * nom + nom, :]})[0]

            trainLoss_dis += sess.run([dis_loss], feed_dict=train_loss_dict)[0]
            trainLoss_enc += sess.run([enc_loss], feed_dict=train_loss_dict)[0]
            trainLoss_dec += sess.run([dec_loss], feed_dict=train_loss_dict)[0]

            valLoss_dis += sess.run([dis_loss], feed_dict=val_loss_dict)[0]
            valLoss_enc += sess.run([enc_loss], feed_dict=val_loss_dict)[0]
            valLoss_dec += sess.run([dec_loss], feed_dict=val_loss_dict)[0]

            trainAcc_dis += sess.run([acc_dis], feed_dict=train_loss_dict)[0]
            valAcc_dis += sess.run([acc_dis], feed_dict=val_loss_dict)[0]
            trainAcc_enc += sess.run([acc_enc], feed_dict=train_loss_dict)[0]
            valAcc_enc += sess.run([acc_enc], feed_dict=val_loss_dict)[0]
            trainAcc_dec += sess.run([acc_dec],
                                     feed_dict=train_loss_dict)[0][0]
            valAcc_dec += sess.run([acc_dec], feed_dict=val_loss_dict)[0][0]

        print(sess.run([lr_dis], feed_dict={global_step: i}))
        print(
            'Discriminator trainLoss = %f valLoss = %f trainAcc = %f valAcc = %f'
            % (trainLoss_dis / (ds_size_nom), valLoss_dis /
               (ds_size_nom), trainAcc_dis / (ds_size_nom + 1), valAcc_dis /
               (ds_size_nom)))
        print('Encoder trainLoss = %f valLoss = %f trainAcc = %f valAcc = %f' %
              (trainLoss_enc / (ds_size_nom), valLoss_enc /
               (ds_size_nom), trainAcc_enc / (ds_size_nom), valAcc_enc /
               (ds_size_nom)))
        print('Decoder trainLoss = %f valLoss = %f trainAcc = %f valAcc = %f' %
              (trainLoss_dec / (ds_size_nom), valLoss_dec /
               (ds_size_nom), trainAcc_dec / (ds_size_nom), valAcc_dec /
               (ds_size_nom)))
        arg = {
            'Train_loss': [],
            'Val_loss': [],
            'Train_acc': [],
            'Val_acc': []
        }
        arg['Train_loss'] = [
            trainLoss_dis / (ds_size_nom), trainLoss_enc / (ds_size_nom),
            trainLoss_dec / (ds_size_nom)
        ]
        arg['Val_loss'] = [
            valLoss_dis / (ds_size_nom), valLoss_enc / (ds_size_nom),
            valLoss_dec / (ds_size_nom)
        ]
        arg['Train_acc'] = [
            trainAcc_dis / (ds_size_nom), trainAcc_enc / (ds_size_nom),
            trainAcc_dec / (ds_size_nom)
        ]
        arg['Val_acc'] = [
            valAcc_dis / (ds_size_nom), valAcc_enc / (ds_size_nom),
            valAcc_dec / (ds_size_nom)
        ]

        log.log_train_data(i,
                           arg,
                           flag=label_Flag,
                           fps=fps,
                           dSet=ds,
                           prop=prop,
                           n_class=n_classes,
                           preTrain=preTrain)
        if vi is not None:
            d1 = np.empty(np.shape(d)[0] * latent_space // 2, dtype=np.float32)
            d2 = np.empty(np.shape(d)[0] * latent_space // 2, dtype=np.float32)
            c = np.empty(np.shape(d)[0] * latent_space // 2, dtype=np.int32)
            for h in range(latent_space // 2):
                d1[np.shape(d)[0] * h:np.shape(d)[0] * h +
                   np.shape(d)[0]] = d[:, 2 * h]
                d2[np.shape(d)[0] * h:np.shape(d)[0] * h +
                   np.shape(d)[0]] = d[:, 2 * h + 1]
                c[np.shape(d)[0] * h:np.shape(d)[0] * h +
                  np.shape(d)[0]] = np.nonzero(data['labels_train'])[1]
            #vi.update(d1_avg, d2_avg, np.nonzero(data['labels_train'])[1])
            vi.update(d1, d2, c)
        if i % 50 == 0 and i != 0:
            if save_model:
                saver.save(
                    sess, os.path.join(model_path, 'model' + str(i) + '.ckpt'))
    sess.close()
Example #12
def main(_):
    word2id = {}
    ent2id = {}
    rel2id = {}
    words = set()
    relations = set()
    entities = set()

    FLAGS.checkpoint_dir = os.path.join(FLAGS.checkpoint_dir, FLAGS.data_file)
    FLAGS.checkpoint_dir = os.path.join(FLAGS.checkpoint_dir, FLAGS.KB_file)
    if not os.path.exists(FLAGS.checkpoint_dir):
        os.makedirs(FLAGS.checkpoint_dir)

    KB_file = '%s/%s.txt' % (FLAGS.data_dir, FLAGS.KB_file)
    data_file = '%s/%s.txt' % (FLAGS.data_dir, FLAGS.data_file)

    start = time.time()
    if FLAGS.data_file == "WC-C":
        Q, A, P, S, Triples, FLAGS.query_size = process_data_c(
            KB_file, data_file, word2id, rel2id, ent2id, words, relations,
            entities)
        FLAGS.path_size = len(P[0][0])  #5
    else:
        Q, A, P, S, Triples, FLAGS.query_size = process_data(
            KB_file, data_file, word2id, rel2id, ent2id, words, relations,
            entities)
        FLAGS.path_size = len(P[0])  #5 or 7 or

    FLAGS.nhop = FLAGS.path_size // 2  # must be an integer

    print("read data cost %f seconds" % (time.time() - start))
    FLAGS.nwords = len(word2id)
    FLAGS.nrels = len(rel2id)
    FLAGS.nents = len(ent2id)

    trainQ, testQ, trainA, testA, trainP, testP, trainS, testS = train_test_split(
        Q, A, P, S, test_size=.1, random_state=123)

    # for UNSEEN relations (incomplete kb setting, change data_utils.py)
    if FLAGS.unseen:
        id_c = []
        for idx in range(trainQ.shape[0]):
            if trainP[idx][-4] in (1, 2, 3):
                id_c.append(idx)
        trainQ = np.delete(trainQ, id_c, axis=0)
        trainA = np.delete(trainA, id_c, axis=0)
        trainP = np.delete(trainP, id_c, axis=0)
        trainS = np.delete(trainS, id_c, axis=0)

    #
    #other data and some flags
    #
    id2word = dict(zip(word2id.values(), word2id.keys()))
    id2ent = dict(zip(ent2id.values(), ent2id.keys()))
    id2rel = dict(
        zip(rel2id.values(), rel2id.keys())
    )  #{0: '<end>', 1: 'cause_of_death', 2: 'gender', 3: 'profession', 4: 'institution', 5: 'religion', 6: 'parents', 7: 'location', 8: 'place_of_birth', 9: 'nationality', 10: 'place_of_death', 11: 'spouse', 12: 'children', 13: 'ethnicity'}

    test_labels = np.argmax(testA, axis=1)

    print(flags.FLAGS.__flags)

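    # Restore the trained model and report answer accuracy and full-path accuracy on the held-out test split.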
    with tf.Session() as sess:
        if not FLAGS.data_file == "WC-C":
            model = IRN(FLAGS, sess)
        elif FLAGS.data_file == "WC-C":
            model = IRN_C(FLAGS, sess)

        model.load()

        test_preds = model.predict(Triples, testQ, testP)

        if not FLAGS.data_file == "WC-C":
            test_acc = MultiAcc(testP, test_preds, FLAGS.path_size)
        elif FLAGS.data_file == "WC-C":
            test_acc = MultiAcc_C(testP, test_preds)

        test_true_acc = InSet(testP, testS, test_preds)

        show_k = (FLAGS.show_case_no
                  if FLAGS.show_case_no < testQ.shape[0] else 0)
        input_q = " ".join([id2word[w] for w in testQ[show_k]])
        #output = test_preds[0][0]
        path_words = []
        for j in range(FLAGS.path_size):
            if j % 2 == 0:
                path_words.append(id2ent[test_preds[show_k][j]])
            else:
                path_words.append(id2rel[test_preds[show_k][j]])
        output = "---".join(path_words)

        if FLAGS.show_case_only:
            print('-----------------------')
            print('test input:', input_q)
            print('test output:', output)
            print('-----------------------')
            return

        print('-----------------------')
        print('Test Data', data_file)
        print('Test Accuracy:', test_true_acc)
        print('Test Accuracy for whole Path:', test_acc)
        print('-----------------------')
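Example #13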
"""
Created on Thu Sep  6 15:47:18 2018

@author: huangjin
"""
from data_all_get import data_to_local
from data_process import process_data
from data_fea_gen import gen_feature_file
from data_split import split_data
from model_train import train_model
from model_pred import pred_model
from evaluation import evaluation_result
if __name__ == '__main__':
    # Fetch the data to local storage
    data_to_local()
    # Process the data
    process_data()
    # Generate features: input values and the prediction target name
    gen_feature_file('oper_rev')
    # Split the dataset
    split_data('2011-03-31',
               '2016-12-31',
               '2017-03-31',
               '2017-12-31',
               '2017-12-31',
               '2018-03-31', 'oper_rev')
    # Train the model and save it
    train_model('train.csv', 'valid.csv', '2018-03-31', 'oper_rev')
    # Predict
    pred_model('2018-03-31', 'oper_rev')
    # Evaluate
    evaluation_result('2018-03-31', 'oper_rev')
Example #14
    print("y size: ", len(y))
    plt.scatter(x, y, c=color)
    plt.xlabel(xname)
    plt.ylabel(yname)
    # add legend
    classes = ['0', '1']
    class_colours = ['r', 'g']
    recs = []
    for i in range(len(class_colours)):
        recs.append(mpatches.Rectangle((0, 0), 1, 1, fc=class_colours[i]))
    plt.legend(recs, classes, loc='upper left')
    plt.show()


train, test, features, features_non_numeric = data_process.read_data()

train, test, features, features_non_numeric = data_process.process_data(
    train, test, features, features_non_numeric)

tsize = 0.001
dtrain, dtest = cross_validation.train_test_split(train, test_size=tsize)

#importance_feat(features)

#Correlation_Matrix_plot(train)

features = ['Customers', 'Sales', 'Promo']
data = dtest[features]

Scatter_plot(data)
Example #15
# Data sources (from NYT repository as submodule)
pop_data_dir = 'population-data'
covid_data_dir = 'covid-19-data'
county_data_filename = 'us-counties.csv'
state_data_filename = 'us-states.csv'

# County population data from USDA
county_pop_data = dp.import_pop_data(os.path.join(pop_data_dir, county_data_filename))
state_pop_data = dp.import_pop_data(os.path.join(pop_data_dir, state_data_filename))


county_df = dp.import_data(os.path.join(covid_data_dir, county_data_filename))
county_df = county_df[county_df['fips'].notnull()]
# county_df = county_df[(county_df['fips'].notnull()) & (county_df['fips'] != 'nan')]
county_df = dp.process_data(county_df, county_pop_data)
state_df = dp.process_data(dp.import_data(os.path.join(covid_data_dir, state_data_filename)),
                           state_pop_data)


states = state_df.state.unique()
states.sort()

last_date = county_df['date'].max()

# Get the listing of counties for placing data on the map
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)


# ------------------------------------------------------------------------------
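Example #16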
from cnn import create_convolutional_netowork
from data_generator import traing_data_generator, testing_data_generator
from data_process import process_data
from training_phase import train_cnn
from testing_phase import test_cnn

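# Build the CNN, prepare the training and validation data, train, then run the test phase.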
create_convolutional_netowork()
training_data, validation_data = process_data('Training_Dataset',
                                              'Validation_Data', 150,
                                              'categorical')

train_cnn(training_data, validation_data)
#test_cnn(Skip_lable=False): Pass True if you do not want to display the label
test_cnn()