Example #1

This fragment evaluates clustering on pre-selected feature subsets. It assumes `test` is a `(dataset_name, parameter)` tuple whose second element is passed to the `KMeans` and `HAC` constructors (apparently the cluster count), `data_instances` is a 2-D NumPy array of instances by features, and `KMeans`/`HAC` are the project's own clustering classes. The index arrays name feature subsets presumably chosen by sequential forward selection (SFS) and a genetic algorithm (GA).
   kmeans_model = KMeans(test[1])
   hac_model = HAC(test[1])

   # Glass dataset
   if "glass" in test[0]:
      kmeans_sfs_glass = np.array([1,3])
      kmeans_model.cluster(data_instances[:,kmeans_sfs_glass])
      print "Kmeans SFS glass performance = %f" % kmeans_model.calculate_performance()

      kmeans_ga_glass = np.array([0,1,2,3,4,5,6])
      kmeans_model = KMeans(test[1])
      kmeans_model.cluster(data_instances[:,kmeans_ga_glass])
      print "Kmeans GA glass performance = %f" % kmeans_model.calculate_performance()

      hac_sfs_glass = np.array([0])
      hac_model.cluster(data_instances[:,hac_sfs_glass])
      print "HAC SFS glass performance = %f" % hac_model.calculate_performance()

   # Iris dataset
   elif "iris" in test[0]:
      kmeans_sfs_iris = np.array([1])
      kmeans_model = KMeans(test[1])
      kmeans_model.cluster(data_instances[:,kmeans_sfs_iris])
      print("Kmeans SFS iris performance = %f" % kmeans_model.calculate_performance())

      kmeans_ga_iris = np.array([0,1])
      kmeans_model = KMeans(test[1])
      kmeans_model.cluster(data_instances[:,kmeans_ga_iris])
      print("Kmeans GA iris performance = %f" % kmeans_model.calculate_performance())

      hac_sfs_iris = np.array([0])
      hac_model.cluster(data_instances[:,hac_sfs_iris])
      print("HAC SFS iris performance = %f" % hac_model.calculate_performance())
Example #2

This example trains a DeepSetLinkage model under hierarchical agglomerative clustering (HAC) on the Rexa author blocks, early-stops on validation loss, picks a linkage threshold on the train/validation blocks, and reports the mean test F1.

import numpy as np
import torch
from copy import deepcopy

# DeepSetLinkage, HAC, process_pair_features, and find_thresh are assumed to
# come from the surrounding project rather than the standard library.

def train(args, seed=0):
    blocks = np.array([
        'allen_d', 'moore_a', 'lee_l', 'robinson_h', 'mcguire_j', 'blum_a',
        'jones_s', 'young_s'
    ])

    use_gpu = args['use_gpu']

    np.random.seed(seed)
    torch.manual_seed(seed)
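    # Shuffle the eight blocks and split them 3/2/3 into train/val/test.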
    idxs = np.random.permutation(len(blocks))
    train_blocks = list(blocks[idxs[0:3]])
    val_blocks = list(blocks[idxs[3:5]])
    test_blocks = list(blocks[idxs[5:8]])

    # train_blocks = ['robinson_h']
    # val_blocks = ['robinson_h']
    # test_blocks = list(blocks)

    # print(train_blocks)

    num_epochs = args['n_epochs']
    in_dim = 14
    margin = args['margin']
    model = DeepSetLinkage(in_dim=in_dim,
                           lr=args['lr'],
                           linear=args['linear'],
                           wd=args['wd'],
                           feature_dim=args['feature_dim'])

    train_losses = []
    val_losses = []

    prev_train_loss = np.inf
    best_val_loss = np.inf
    best_model = deepcopy(model)
    irritation = 0  # epochs since the last val-loss improvement

    for epoch in range(num_epochs):

        train_loss = 0
        for idx, tb in enumerate(train_blocks):
            pair_features = np.loadtxt(
                'data/rexa/{}/pairFeatures.csv'.format(tb),
                delimiter=',',
                dtype=float)

            pairs = process_pair_features(pair_features)
            gt_clusters = np.loadtxt('data/rexa/{}/gtClusters.tsv'.format(tb),
                                     delimiter='\t',
                                     dtype=float)[:, 1]
            hac = HAC(pairs,
                      gt_clusters,
                      model,
                      margin=margin,
                      use_gpu=use_gpu,
                      feature_dim=args['feature_dim'],
                      teacher_force=args['teacher_force'])

            loss = hac.train_epoch()
            #print(tb, 'train loss:', loss)
            train_loss += loss
        train_loss = train_loss / len(train_blocks)
        print('epoch:', epoch, 'train loss:', train_loss)

        val_loss = 0
        for idx, vb in enumerate(val_blocks):
            pair_features = np.loadtxt(
                'data/rexa/{}/pairFeatures.csv'.format(vb),
                delimiter=',',
                dtype=float)
            pairs = process_pair_features(pair_features)
            gt_clusters = np.loadtxt('data/rexa/{}/gtClusters.tsv'.format(vb),
                                     delimiter='\t',
                                     dtype=float)[:, 1]
            hac = HAC(pairs,
                      gt_clusters,
                      model,
                      margin=margin,
                      use_gpu=use_gpu,
                      feature_dim=args['feature_dim'],
                      teacher_force=args['teacher_force'])

            loss = hac.validate()
            #print(vb, 'val loss:', loss)
            val_loss += loss
        val_loss = val_loss / len(val_blocks)
        print('epoch:', epoch, 'val loss:', val_loss)

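        # Early stopping: abort if the training loss rises; the patience
        # counter below handles stalled validation loss.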
        if train_loss > prev_train_loss:
            print('train loss went up, stopping now')
            model = best_model
            break

        if val_loss >= best_val_loss:
            irritation += 1
        else:
            best_val_loss = val_loss
            best_model = deepcopy(model)
            irritation = 0

        if irritation >= args['patience']:
            print("val loss hasn't improved in {} epochs, stopping now".format(
                args['patience']))
            model = best_model
            break

        train_losses.append(train_loss)
        val_losses.append(val_loss)
        prev_train_loss = train_loss

    print('saving results')
    np.save(args['path'] + '/train_losses_' + str(seed),
            np.array(train_losses))
    np.save(args['path'] + '/val_losses_' + str(seed), np.array(val_losses))
    print('done saving results')

    # find f1 score
    link_list = []
    f1_list = []
    # for idx, vb in enumerate(val_blocks):
    for idx, vb in enumerate(val_blocks + train_blocks):
        pair_features = np.loadtxt('data/rexa/{}/pairFeatures.csv'.format(vb),
                                   delimiter=',',
                                   dtype=float)
        pairs = process_pair_features(pair_features)
        gt_clusters = np.loadtxt('data/rexa/{}/gtClusters.tsv'.format(vb),
                                 delimiter='\t',
                                 dtype=float)[:, 1]
        hac = HAC(pairs,
                  gt_clusters,
                  model,
                  margin=margin,
                  use_gpu=use_gpu,
                  feature_dim=args['feature_dim'])

        links, f1s = hac.cluster()
        link_list.append(links)
        f1_list.append(f1s)

        best_idx = np.argmax(f1s)  # renamed to avoid shadowing the loop variable
        best_f1 = f1s[best_idx]
        best_link = links[best_idx]
        print('{} best f1: {} best link: {}'.format(vb, best_f1, best_link))

    if args['thresh'] == 'find':
        print('finding best thresh')
        best_thresh = find_thresh(link_list, f1_list)
    else:
        best_thresh = float(args['thresh'])
    print('best threshold:', best_thresh)

    test_f1s = []
    for idx, teb in enumerate(test_blocks):
        pair_features = np.loadtxt('data/rexa/{}/pairFeatures.csv'.format(teb),
                                   delimiter=',',
                                   dtype=float)
        pairs = process_pair_features(pair_features)
        gt_clusters = np.loadtxt('data/rexa/{}/gtClusters.tsv'.format(teb),
                                 delimiter='\t',
                                 dtype=float)[:, 1]
        hac = HAC(pairs,
                  gt_clusters,
                  model,
                  margin=margin,
                  use_gpu=use_gpu,
                  feature_dim=args['feature_dim'])

        f1, log = hac.get_test_f1(best_thresh)
        print('test f1 on {}: {}'.format(teb, f1))
        test_f1s.append(f1)
        np.savetxt(args['path'] + '/log_' + teb + '_' + str(seed) + '.csv',
                   log,
                   delimiter=',')

    print('test f1:', np.mean(test_f1s))
    np.save(args['path'] + '/test_f1_' + str(seed), np.mean(test_f1s))
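
A hypothetical call site, assuming only the hyperparameter keys that train reads above; every value is a placeholder chosen for illustration, not taken from the original:

# Hypothetical invocation; all values below are illustrative placeholders.
args = {
    'use_gpu': False,
    'n_epochs': 50,
    'margin': 2.0,
    'lr': 1e-3,
    'linear': False,
    'wd': 0.0,
    'feature_dim': 14,
    'teacher_force': False,
    'patience': 5,
    'thresh': 'find',   # or a numeric string such as '0.5'
    'path': 'results',  # output directory for losses, logs, and test F1
}

train(args, seed=0)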