Example #1
def supernatural_rs():
    file_config = FilesConfig(
        vocab_file='twitter_hashtag/twitterhashtags.vocab',
        dataset_file='DataSetsEraldo/dataSetSupernatural.txt',
        task='supernatural')
    c = CorpusTE(train_file='DataSetsEraldo/dataSetSupernatural.txt',
                 vocab_file='twitter_hashtag/twitterhashtags.vocab')
    x, y = c.prepare()
    ndy = np.array(y)
    print('positive examples, whole dataset')
    print(ndy.sum())
    print(c.size)
    f = KFold(c, 3, rand=1)
    f.prepare_fold(x, y)

    myTuner = Tuner(c, file_config)
    epochs = (100, 0)
    lrs = (1e-5, 1e-1)
    myTuner.random_search_cv(execs=6,
                             epoch_limits=epochs,
                             lr_limits=lrs,
                             cv=10,
                             folds=f,
                             freeze_epochs=True,
                             freeze_lr=False)
    print("RS finished!\n")
Example #2
def test_kfold():
    c = CorpusTE(train_file='DataSetsEraldo/dataSetSupernatural.txt',
                 vocab_file='twitter_hashtag/twitterhashtags.vocab')
    x, y = c.prepare()
    print(c.size)
    f = KFold(c, 3, rand=1)
    f.prepare_fold(x, y)

    cnn_config = TCNNConfig()
    cnn_config.num_epochs = 10

    file_config = FilesConfig(vocab_file='twitterhashtags.vocab',
                              dataset_file='DataSetsEraldo/dataSetBahia.txt')
    for i in range(5):
        for cf in f:
            model0 = TextCNN(cnn_config)
            print(c.train_distribution())
            c.prepare_sample(x, y, size=300)
            c.sub_sampling(size=300)
            print(c.x_train.shape)
            t = Trainer(corpus=cf,
                        model=model0,
                        config=cnn_config,
                        file_config=file_config,
                        verbose=True)
            train_acc, train_loss, val_acc, val_loss, best_epoch = t.train()
Example #3
def cv_(param):
    params_names = list(params_df.columns.values)

    # Copy over whichever hyperparameters appear as columns of params_df;
    # .index() raises ValueError for the ones that are absent.
    for name in ('kernel', 'ld', 'sigma', 'k', 'd', 'e', 'beta'):
        try:
            setattr(self, name, param[params_names.index(name)])
        except ValueError:
            pass

    X_reset = X.reset_index(drop=True)
    y_reset = y.reset_index(drop=True)  # needed to compute accuracy

    kf = KFold(n_splits=cv, shuffle=True, random_state=42)
    score = []
    for train_index, val_index in kf.split(X_reset):
        X_train, X_val = X_reset.iloc[train_index], X_reset.iloc[val_index]
        y_train, y_val = y_reset.iloc[train_index], y_reset.iloc[val_index]
        self.fit(X_train, y_train)
        y_pred = self.predict(X_val)
        score.append(np.mean(y_pred == y_val.reset_index(drop=True)))
    return np.mean(score), np.var(score)
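
The cv_ helper above closes over self, params_df, X, y and cv from an enclosing scope, so it is presumably defined inside a tuning method of an estimator class. A plausible driver (hypothetical; only cv_ itself comes from the example) scores every hyperparameter row of params_df and keeps the one with the best mean cross-validated accuracy:

# Hypothetical usage sketch: evaluate each row of params_df with cv_.
results = [cv_(params_df.iloc[i]) for i in range(len(params_df))]  # (mean, var) per row
best_idx = max(range(len(results)), key=lambda i: results[i][0])   # highest mean accuracy
best_params, best_score = params_df.iloc[best_idx], results[best_idx]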
Example #4
def TIcE(data, features, labeled_info, nfolds=5, M=500, bepp=5, evaluation=EvaluatePaper):
  ''' Tree Induction for c Estimation (TIcE)
    Args:
      data (list): list of observation points (dictionaries with [feature]:value).
      features (list): list of which features should be considered.
      labeled_info (str): name of the feature that indicates whether the observation point is labeled or not (1 or 0, respectively).
      nfolds (int): number of folds to average the final prediction.
      M (int): hard limit for branching.
      bepp (number): parameter from TIcE's paper.
      evaluation (callable): evaluation measure of branch quality.
    
    Returns:
      (pred_c, pred_alpha): predicted c and predicted p (p is the proportion of positive observation points within the UNLABELED portion of the data).
  '''
  pred_c = 0.5
  for _ in range(2):
    clist = []
    for est_data, tree_data in KFold(nfolds, data):
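      # delta: TIcE's confidence parameter for its lower bound, loosened when the
      # estimation fold is small; cbest starts from the naive estimate L/T, the
      # fraction of labeled points in the estimation fold.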
      delta = max(0.025, 1 / (1 + 0.004 * T(est_data)))
      cbest = L(est_data, labeled_info) / T(est_data)
      pq = [(-evaluation(tree_data, delta, pred_c, labeled_info), random(), tree_data, est_data, features[:])]
      limit = max(0, min(1000, math.floor(0.5 + 0.1 * min(T(est_data), T(tree_data)))))
      m = 0
      while m < M and len(pq) > 0:
        _, _, St, Se, feat = heapq.heappop(pq)
        m += 1
        if T(St) < limit or T(Se) < limit:
          continue

        nev = evaluation(Se, delta, pred_c, labeled_info)
        cbest = max(cbest, nev)
        
        nfeat = []
        possible_feats = []
        for f in feat:
          med = median([x[f] for x in St])
          left_St = [x for x in St if x[f] <= med]
          left_Se = [x for x in Se if x[f] <= med]
          right_St = [x for x in St if x[f] > med]
          right_Se = [x for x in Se if x[f] > med]
          if T(left_St) == 0 or T(right_St) == 0:
            continue
          nfeat.append(f)
          crit = max(L(left_St, labeled_info)/(bepp + T(left_St)), L(right_St, labeled_info)/(bepp + T(right_St)))
          node = (crit, random(), f, left_St, left_Se, right_St, right_Se)
          possible_feats.append(node)

        if len(possible_feats) > 0:
          _, _, f, left_St, left_Se, right_St, right_Se = max(possible_feats)
          if T(left_St) > limit and T(left_Se) > limit:
            heapq.heappush(pq, (-evaluation(left_St, delta, pred_c, labeled_info), random(), left_St, left_Se, nfeat))
          if T(right_St) > limit and T(right_Se) > limit:
            heapq.heappush(pq, (-evaluation(right_St, delta, pred_c, labeled_info), random(), right_St, right_Se, nfeat))

      clist.append(cbest)
    pred_c = mean(clist)

  l = L(data, labeled_info)
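  # With c = P(labeled | positive), the expected number of positives is l / pred_c,
  # of which l are labeled, so (l / pred_c - l) positives remain among the
  # (len(data) - l) unlabeled points; the ratio is clamped to [0, 1].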
  pred_alpha = max(0, min(1, (l / pred_c - l) / (len(data) - l)))
  return pred_c, pred_alpha
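
TIcE and the later PU examples (#5, #11, #12, #14-#16, #18) consume KFold(nfolds, data) as an iterable of (larger part, held-out fold) pairs of plain lists, and TIcE also relies on the counting helpers T and L; none of these appear in this listing (the KFold name is overloaded: Examples #7, #10 and #19 use a class with get_next(), Example #3 uses sklearn's splitter). A minimal sketch consistent with the call sites above, not the original helpers, would be:

def KFold(nfolds, data):
    # Yield nfolds pairs (rest, fold): `fold` is the held-out slice and `rest`
    # is everything else, both as plain lists of observation points.
    n = len(data)
    for i in range(nfolds):
        lo, hi = i * n // nfolds, (i + 1) * n // nfolds
        yield data[:lo] + data[hi:], data[lo:hi]

def T(points):
    # Total number of observation points.
    return len(points)

def L(points, labeled_info):
    # Number of labeled observation points.
    return sum(1 for x in points if x[labeled_info] == 1)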
Example #5
def ENKF(data, features, labeled_info, k=5, gamma="auto"):
  ''' Elkan's Algorithm (EN) with k-fold cross validation (not original Elkan). Returned C is incorrect.
  '''
  labeled = [x for x in data if x[labeled_info] == 1]
  unlabeled = [x for x in data if x[labeled_info] != 1]

  all_probs = []
  all_alphas = []
  for tr, te in KFold(k, labeled):
    data_tr = unlabeled + tr
    shuffle(data_tr)
    test_data = pd.DataFrame(te)[features]
    training_data = pd.DataFrame(data_tr)
    svc = SVC(probability=True, gamma=gamma)
    svc.fit(training_data[features], training_data[labeled_info])
    right_index = 0 if svc.classes_[0] == 1 else 1
    svc_probs = svc.predict_proba(test_data)
    c_probs = [x[right_index] for x in svc_probs]
    all_probs += c_probs
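    # Elkan-Noto style estimate: c is the mean predicted P(labeled = 1) over the
    # held-out labeled positives of this fold.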
    pred_c = np.mean(c_probs)
    l = len(tr)
    all_alphas += [max(0, min(1, (l / pred_c - l) / (len(data_tr) - l)))]

  pred_c = np.mean(all_probs)

  pred_alpha = np.mean(all_alphas)
  return pred_c, pred_alpha
Example #6
def train_cv():
    ultimos_r = []
    dt = []
    file_config = FilesConfig(
        vocab_file='twitter_hashtag/twitterhashtags.vocab',
        dataset_file='DataSetsEraldo/dataSetSupernatural.txt')
    c = CorpusTE(train_file='DataSetsEraldo/dataSetSupernatural.txt',
                 vocab_file='twitter_hashtag/twitterhashtags.vocab')
    x, y = c.prepare()
    print(c.size)
    f = KFold(c, 3, rand=1)
    f.prepare_fold(x, y)

    for cv in f:
        t = Trainer(corpus=cv, file_config=file_config, verbose=True)
        ultimos_r.append(t.train(dt))

    print(ultimos_r)
    print(':)')
Example #7
    def test_get_next(self):
        data = [1,2,3,4,5,6]
        classes = ['a', 'b', 'c', 'd', 'e', 'f']

        kfold = KFold(3, data, classes)
        d1, c1 = kfold.get_next()
        d2, c2 = kfold.get_next()

        self.assertEquals(1, d1[0])
        self.assertEquals(2, d1[1])
        self.assertEquals(3, d1[2])

        self.assertEquals('a', c1[0])
        self.assertEquals('b', c1[1])
        self.assertEquals('c', c1[2])

        self.assertEquals(4, d2[0])
        self.assertEquals(5, d2[1])
        self.assertEquals(6, d2[2])

        self.assertEquals('d', c2[0])
        self.assertEquals('e', c2[1])
        self.assertEquals('f', c2[2])
Example #8
def pre_rs_supernatural():
    file_config = FilesConfig(
        vocab_file='twitter_hashtag/twitterhashtags.vocab',
        dataset_file='twitter_hashtag/out.txt',
        task='supernatural')
    c = CorpusTE(train_file='DataSetsEraldo/dataSetSupernatural.txt',
                 vocab_file='twitter_hashtag/twitterhashtags.vocab')
    x, y = c.prepare()
    print(c.size)
    f = KFold(c, 3, rand=1)
    f.prepare_fold(x, y)

    myTuner = Tuner(c, file_config)
    epochs = (100, 6)
    lrs = (1e-5, 1e-2)
    myTuner.random_search_cv(execs=1,
                             epoch_limits=epochs,
                             lr_limits=lrs,
                             cv=1,
                             folds=f,
                             freeze_lr=True,
                             freeze_epochs=True)
    print("PRS finished!\n")
Example #9
def test_kfold_rs():

    cnn_config = TCNNConfig()
    cnn_config.num_epochs = 4
    file_config = FilesConfig(
        vocab_file='twitter_hashtag/twitterhashtags.vocab',
        dataset_file='twitter_hashtag/out.txt')
    c = CorpusTE(train_file='DataSetsEraldo/dataSetSupernatural.txt',
                 vocab_file='twitter_hashtag/twitterhashtags.vocab')
    x, y = c.prepare()
    print(c.size)
    f = KFold(c, 3, rand=1)
    f.prepare_fold(x, y)

    myTuner = Tuner(c, file_config)
    epochs = (10, 30)
    lrs = (0.0001, 0.01)
    myTuner.random_search_cv(execs=5,
                             epoch_limits=epochs,
                             lr_limits=lrs,
                             cv=4,
                             folds=f,
                             freeze_lr=True)
    print("RS finished!\n")
Example #10
    def test_get_next(self):
        data = [1,2,3,4,5,6,7,8,9,10]
        classes = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']

        counter = 0
        kfold = KFold(2, data, classes)
        while kfold.has_next():
            train_d1, train_c1, test_d1, test_c1 = kfold.get_next()

            self.assertEquals(8, len(train_d1))
            self.assertEquals(8, len(train_c1))
            self.assertEquals(2, len(test_d1))
            self.assertEquals(2, len(test_c1))
            counter += 1

        self.assertEquals(5, counter)

        kfold = KFold(2, data, classes)

        train_d1, train_c1, test_d1, test_c1 = kfold.get_next()
        self.assertEquals(True, lists_are_equal(train_d1, [3,4,5,6,7,8,9,10]))
        self.assertEquals(True, lists_are_equal(train_c1, ['c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']))
        self.assertEquals(True, lists_are_equal(test_d1, [1,2]))
        self.assertEquals(True, lists_are_equal(test_c1, ['a', 'b']))

        train_d1, train_c1, test_d1, test_c1 = kfold.get_next()
        self.assertEquals(True, lists_are_equal(train_d1, [1,2,5,6,7,8,9,10]))
        self.assertEquals(True, lists_are_equal(train_c1, ['a', 'b', 'e', 'f', 'g', 'h', 'i', 'j']))
        self.assertEquals(True, lists_are_equal(test_d1, [3,4]))
        self.assertEquals(True, lists_are_equal(test_c1, ['c', 'd']))

        train_d1, train_c1, test_d1, test_c1 = kfold.get_next()
        train_d1, train_c1, test_d1, test_c1 = kfold.get_next()
        train_d1, train_c1, test_d1, test_c1 = kfold.get_next()

        self.assertEquals(True, lists_are_equal(train_d1, [1,2,3,4,5,6,7,8]))
        self.assertEquals(True, lists_are_equal(train_c1, ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']))
        self.assertEquals(True, lists_are_equal(test_d1, [9,10]))
        self.assertEquals(True, lists_are_equal(test_c1, ['i', 'j']))
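
Examples #7 and #10 exercise a custom KFold class that is not included in this listing; Example #10 treats the first constructor argument as the held-out fold size and expects a 4-tuple from get_next(), while Example #7 expects a 2-tuple, so the two tests target different variants. A minimal sketch that satisfies Example #10's assertions (a guess at the interface, not the actual source) is:

class KFold:
    # Sequential splitter over parallel data/class lists: `fold_size` items are
    # held out per fold, e.g. KFold(2, data, classes) over 10 items gives 5 folds.
    def __init__(self, fold_size, data, classes):
        self.fold_size = fold_size
        self.data = list(data)
        self.classes = list(classes)
        self.position = 0  # start index of the next held-out fold

    def has_next(self):
        return self.position < len(self.data)

    def get_next(self):
        start, end = self.position, self.position + self.fold_size
        self.position = end
        test_d, test_c = self.data[start:end], self.classes[start:end]
        train_d = self.data[:start] + self.data[end:]
        train_c = self.classes[:start] + self.classes[end:]
        return train_d, train_c, test_d, test_c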
Example #11
def BFT(data, features, labeled_info, scorer, nfolds=10):
    ''' Best Fixed Threshold
    Args:
      data (list): list of observation points (dictionaries with [feature]:value).
      features (list): list of which features should be considered.
      labeled_info (str): name of the feature that indicates whether the observation point is labeled or not (1 or 0, respectively).
      scorer (callable): One-class scorer to be used. See OCScorers.py for more information.
      nfolds (int): number of folds for cross validation to generate training scores.
    
    Returns:
      pred_alphas: 101 predicted p's, one for each percentile of training positive scores (p is the proportion of positive observation points within the UNLABELED portion of the data).
  '''
    labeled = [x for x in data if x[labeled_info] == 1]

    p_scores = []
    for tr, te in KFold(nfolds, labeled):
        tr_df = pd.DataFrame(tr)[features]
        te_df = pd.DataFrame(te)[features]
        p_scores += scorer(tr_df, te_df)
    p_scores.sort()

    labeled_df = pd.DataFrame(labeled)[features]
    unlabeled = [x for x in data if x[labeled_info] == 0]
    unlabeled_df = pd.DataFrame(unlabeled)[features]
    t_scores = scorer(labeled_df, unlabeled_df)
    t_scores.sort()

    percentiles = np.arange(0, 101, 1)

    thresholds = np.percentile(p_scores, percentiles)

    n = len(t_scores)
    alphas = []
    for thr in thresholds:
        n_neg = float(np.searchsorted(t_scores, thr, side="right"))
        n_pos = n - n_neg
        alphas.append(n_pos / n)

    return np.array(alphas)
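
BFT and ODIn only assume that scorer takes a training DataFrame and a test DataFrame and returns one score per test row; the concrete scorers live in OCScorers.py, which is not part of this listing. A sketch of a compatible one-class scorer, built here on sklearn's IsolationForest purely as an assumption, could be:

from sklearn.ensemble import IsolationForest

def isolation_forest_scorer(train_df, test_df):
    # Fit a one-class model on the (positive) training rows and return one score
    # per test row; higher means "more similar to the training data".
    model = IsolationForest(random_state=0)
    model.fit(train_df)
    return list(model.score_samples(test_df))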
Example #12
def ODIn(data, features, labeled_info, scorer, nfolds=10):
    ''' One Distribution Inside (ODIn)
    Args:
      data (list): list of observation points (dictionaries with [feature]:value).
      features (list): list of which features should be considered.
      labeled_info (str): name of the feature that indicates whether the observation point is labeled or not (1 or 0, respectively).
      scorer (callable): One-class scorer to be used. See OCScorers.py for more information.
      nfolds (int): number of folds for cross validation to generate training scores.
    
    Returns:
      (pred_c, pred_alpha): predicted c and predicted p (p is the proportion of positive observation points within the UNLABELED portion of the data).
  '''
    labeled = [x for x in data if x[labeled_info] == 1]

    p_scores = []
    for tr, te in KFold(nfolds, labeled):
        tr_df = pd.DataFrame(tr)[features]
        te_df = pd.DataFrame(te)[features]
        p_scores += scorer(tr_df, te_df)

    labeled_df = pd.DataFrame(labeled)[features]
    unlabeled = [x for x in data if x[labeled_info] == 0]
    unlabeled_df = pd.DataFrame(unlabeled)[features]
    t_scores = scorer(labeled_df, unlabeled_df)

    percentiles = np.arange(0, 101, 10)
    thresholds = np.percentile(p_scores, percentiles)

    overflow_limit = EstimateOverflowLimit(p_scores, thresholds)

    p_histogram = CreateHistogram(p_scores, thresholds)
    t_histogram = CreateHistogram(t_scores, thresholds)

    p = FindP(p_histogram, t_histogram, overflow_limit)
    c = max(0, min(1, len(labeled) / (p * len(unlabeled) + len(labeled))))
    return c, p
Example #13
def supernatural_lltrain():
    file_config = FilesConfig(
        vocab_file='twitter_hashtag/twitterhashtags.vocab',
        dataset_file='twitter_hashtag/out.txt',
        task='10llsupernatural')
    c = CorpusTE(train_file='DataSetsEraldo/dataSetSupernatural.txt',
                 vocab_file='twitter_hashtag/twitterhashtags.vocab')
    x, y = c.prepare()
    print(c.size)
    f = KFold(c, 3, rand=1)
    f.prepare_fold(x, y)

    cnn_config_s = TCNNConfig()
    cnn_config_s.num_classes = 2

    args = [
        cnn_config_s,
        '../experiments/1kthashtag.2019-10-21/checkpoints/model21102019-211333epc200lr0.0001.emb',
        '../experiments/1kthashtag.2019-10-21/checkpoints/model21102019-211333epc200lr0.0001.convs'
    ]

    f = RandomSplit(corpus=c, n=10, sub=350)
    f.x = x
    f.y = y

    t = Tuner(c, file_config, callback=model_load, args=args, rand=False)
    epochs = (5, 6)
    lrs = (1e-5, 1e-2)
    t.random_search_rsplit(execs=4,
                           rsplits=f,
                           epoch_limits=epochs,
                           lr_limits=lrs,
                           freeze_epochs=True,
                           freeze_lr=False,
                           r=10)

    print("RS finished!\n")
Example #14
def RunExperiment(exp_name, scorer_name, niterations=5, max_sample_size=2000, max_labeled_size=500, nfolds=5):
  exp = Exp[exp_name]
  dataset_filename = exp["input"]
  output_filename = exp["output"]
  class_feature = exp["class_feature"]
  positive_label = exp["positive_label"]
  scorer = scorers_by_name[scorer_name]
  features = None
  negative_labels = None

  data_df = pd.read_csv(dataset_filename, index_col=False)

  if callable(exp["negative_labels"]):
    negative_labels = set(filter(exp["negative_labels"], set(data_df[class_feature])))
  elif isinstance(exp["negative_labels"], list):
    negative_labels = set(exp["negative_labels"])
  else:
    negative_labels = set([x for x in set(data_df[class_feature]) if x != positive_label])

  all_labels = set(list(negative_labels) + [positive_label])

  data_df = pd.DataFrame(data_df.loc[data_df[class_feature].map(lambda x: x in all_labels)])

  if callable(exp["features"]):
    features = list(filter(exp["features"], list(data_df)))
  elif isinstance(exp["features"], list):
    features = exp["features"]
  else:
    features = [x for x in list(data_df) if x != class_feature]


  labeled_info = 'dfjiweojgf'
  data = data_df.to_dict('records')

  for_table = []
  for alpha in tqdm(list(np.linspace(0, 1, 11)), desc="alpha"):
    abs_errors = np.zeros(101)
    abs_errors_2 = np.zeros(101)
    errors = np.zeros(101)
    errors_2 = np.zeros(101)
    ms_per_example = []
    for it in trange(niterations):
      shuffle(data)
      eprint('Iteration %d#' % (it + 1))
      for fold_i, (unlabeled, all_labeled) in zip(trange(nfolds, desc="kfold"), KFold(nfolds, data)):
        eprint('  Fold #%d' % (fold_i + 1))
        for x in unlabeled: x[labeled_info] = 0
        for x in all_labeled: x[labeled_info] = 1
        labeled = [x for x in all_labeled if x[class_feature] == positive_label]
        positives = [x for x in unlabeled if x[class_feature] == positive_label]
        negatives = [x for x in unlabeled if x[class_feature] != positive_label]


        shuffle(labeled)
        shuffle(positives)
        shuffle(negatives)

        sample_size = min(len(positives), len(negatives), max_sample_size)
        npos = math.floor(alpha * sample_size)
        nneg = sample_size - npos
        nlab = min(len(labeled), max_labeled_size)

        sample = positives[:npos] + negatives[:nneg] + labeled[:nlab]
        shuffle(sample)

        actual_c = len(labeled) / (len(labeled) + npos)
        actual_alpha = npos / sample_size
        eprint('       Actual c: %6.2f |    Actual alpha: %6.2f' % (actual_c, actual_alpha))
        tm_start = timer()
        pred_alpha = BFT(sample, features, labeled_info, scorer)
        tm_end = timer()

        ms_per_example.append((tm_end - tm_start) * 1000 / len(sample))
        abs_errors += np.abs(actual_alpha - pred_alpha) / (niterations * nfolds)
        abs_errors_2 += np.abs(actual_alpha - pred_alpha) ** 2.0 / (niterations * nfolds)

        errors += (actual_alpha - pred_alpha) / (niterations * nfolds)
        errors_2 += (actual_alpha - pred_alpha) ** 2.0 / (niterations * nfolds)
    

    std_abs_errors = 100 * np.sqrt(abs_errors_2 - abs_errors ** 2)
    std_errors = 100 * np.sqrt(errors_2 - errors ** 2)
    abs_errors *= 100
    errors *= 100

    row = [100 * alpha, np.mean(ms_per_example), np.std(ms_per_example)] + list(abs_errors) + list(std_abs_errors) + list(errors) + list(std_errors)
    for_table.append(tuple(row))

  h1 = ','.join(['abs_error_th%03d' % x for x in range(101)])
  h2 = ','.join(['abs_error_th%03d_std' % x for x in range(101)])
  h3 = ','.join(['error_th%03d' % x for x in range(101)])
  h4 = ','.join(['error_th%03d_std' % x for x in range(101)])
  header_csv = 'alpha,time,time_std,%s,%s,%s,%s' % (h1, h2, h3, h4)
  mask_csv = ','.join(['%.2f'] * len(for_table[0]))

  eprint()
  with open(output_filename % ('', 'bft_raw_%s' % scorer_name), mode="w") as out:
    print(header_csv, file=out)
    for row in for_table:
      print(mask_csv % row, file=out)
Example #15
def PE(data, features, labeled_info, nfolds=5):
    ''' PE
    Args:
      data (list): list of observation points (dictionaries with [feature]:value).
      features (list): list of which features should be considered.
      labeled_info (str): name of the feature that indicates whether the observation point is labeled or not (1 or 0, respectively).
      nfolds (int): number of folds for cross validation.
    
    Returns:
      (pred_c, pred_alpha): predicted c and predicted p (p is the proportion of positive observation points within the UNLABELED portion of the data).
  '''

    labeled = [x for x in data if x[labeled_info] == 1]
    unlabeled = [x for x in data if x[labeled_info] == 0]

    X = pd.DataFrame(data)[features].values
    xp = pd.DataFrame(labeled)[features].values
    xm = pd.DataFrame(unlabeled)[features].values

    n1, n2 = len(labeled), len(unlabeled)

    mm = np.matmul
    sqnorm = lambda x: x.dot(x)
    # med_dist = np.median([norm(x - y) for x in X for y in X])
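    # Median heuristic: candidate kernel widths span roughly 0.2x to 5x the
    # median pairwise distance between observation points.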
    med_dist = np.sqrt(np.median([sqnorm(x - y) for x in X for y in X]))
    sigma_list = np.linspace(1 / 5, 5, 10) * med_dist

    lambda_list = np.logspace(-3, 1, 9)

    for (xp_tr, xp_te), (xm_tr, xm_te) in zip(KFold(nfolds, xp),
                                              KFold(nfolds, xm)):
        xp_tr, xp_te = np.array(xp_tr), np.array(xp_te)
        xm_tr, xm_te = np.array(xm_tr), np.array(xm_te)

        n1_tr, n1_te = len(xp_tr), len(xp_te)
        n2_tr, n2_te = len(xm_tr), len(xm_te)

        p1_tr = n1_tr / (n1_tr + n2_tr)
        p1_te = n1_te / (n1_te + n2_te)

        cv_scores = []

        for sigma in sigma_list:
            phi = lambda x: np.array(
                [[math.exp(-sqnorm(x - xi) / (2 * sigma**2))
                  for xi in xp_tr]]).transpose()
            b = len(xp_tr)

            Phi1_tr = np.array([phi(x) for x in xp_tr])
            Phi1_te = np.array([phi(x) for x in xp_te])

            Phi2_tr = np.array([phi(x) for x in xm_tr])
            Phi2_te = np.array([phi(x) for x in xm_te])

            h_tr = np.mean(Phi1_tr, 0)
            h_te = np.mean(Phi1_te, 0)

            add = lambda x, y: x + y
            x = Phi1_tr[0]

            H_tr = p1_tr * reduce(add, (mm(x, x.transpose()) for x in Phi1_tr)) / n1_tr \
              + (1 - p1_tr) * reduce(add, (mm(x, x.transpose()) for x in Phi2_tr)) / n2_tr
            H_te = p1_te * reduce(add, (mm(x, x.transpose()) for x in Phi1_te)) / n1_te \
              + (1 - p1_te) * reduce(add, (mm(x, x.transpose()) for x in Phi2_te)) / n2_te

            for lamb in lambda_list:
                alpha = np.linalg.solve(H_tr + lamb * np.identity(b), h_tr)
                alpha_t = alpha.transpose()
                score = float(0.5 * mm(mm(alpha_t, H_te), alpha) -
                              mm(alpha_t, h_te))
                cv_scores.append((score, lamb, sigma))

    _, lamb, sigma = min(cv_scores)
    phi = lambda x: np.array(
        [[math.exp(-sqnorm(x - xi) / (2 * sigma**2))
          for xi in xp]]).transpose()
    b = len(xp)

    Phi1 = np.array([phi(x) for x in xp])
    Phi2 = np.array([phi(x) for x in xm])
    p1 = n1 / (n1 + n2)
    h = np.mean(Phi1, 0)
    H = p1 * reduce(add, (mm(x, x.transpose()) for x in Phi1)) / n1 \
      + (1 - p1) * reduce(add, (mm(x, x.transpose()) for x in Phi2)) / n2
    alpha = np.linalg.solve(H + lamb * np.identity(b), h)
    alpha_t = alpha.transpose()
    prior = 1 / float((2 * mm(alpha_t, h) - mm(mm(alpha_t, H), alpha)))
    prior = min(1, max(prior, n1 / (n1 + n2)))

    c = max(0, min(1, len(labeled) / (prior * len(unlabeled) + len(labeled))))
    return c, prior
Example #16
def RunExperiment(exp_name,
                  method_name,
                  niterations=5,
                  max_sample_size=2000,
                  max_labeled_size=500):
    '''Loads the experiment exp_name and applies method_name to it. Creates a CSV file (according to exp_name) with the results.
  
  Args:
    exp_name (str): the name of the experiment (see explist.py for more information).
    method_name (str): the name of the method (the method includes algorithm + parameters. See methodlist.py for more details).
    niterations (int): number of runs to be averaged.
    max_sample_size (int): maximum size for UNLABELED part of the sample (from PU PE perspective) or maximum test size (from OCQ perspective).
    max_labeled_size (int): maximum size for LABELED part of the sample (from PU PE perspective) or maximum training size (from OCQ perspective).
  '''
    exp = Exp[exp_name]
    dataset_filename = exp["input"]
    output_filename = exp["output"]
    class_feature = exp["class_feature"]
    positive_label = exp["positive_label"]
    method = methods[method_name]["func"]
    method_kargs = methods[method_name]["kargs"]
    features = None
    negative_labels = None

    data_df = pd.read_csv(dataset_filename, index_col=False)

    if callable(exp["negative_labels"]):
        negative_labels = set(
            filter(exp["negative_labels"], set(data_df[class_feature])))
    elif isinstance(exp["negative_labels"], list):
        negative_labels = set(exp["negative_labels"])
    else:
        negative_labels = set(
            [x for x in set(data_df[class_feature]) if x != positive_label])

    all_labels = set(list(negative_labels) + [positive_label])

    data_df = pd.DataFrame(
        data_df.loc[data_df[class_feature].map(lambda x: x in all_labels)])

    if callable(exp["features"]):
        features = list(filter(exp["features"], list(data_df)))
    elif isinstance(exp["features"], list):
        features = exp["features"]
    else:
        features = [x for x in list(data_df) if x != class_feature]

    labeled_info = 'dfjiweojgf'
    data = data_df.to_dict('records')

    for_table = []
    for alpha in tqdm(list(np.linspace(0, 1, 11)), desc="alpha"):
        abs_errors = []
        errors = []
        ms_per_example = []
        for it in trange(niterations):
            shuffle(data)
            eprint('Iteration %d#' % (it + 1))
            for fold_i, (unlabeled,
                         all_labeled) in zip(trange(5, desc="kfold"),
                                             KFold(5, data)):
                eprint('  Fold #%d' % (fold_i + 1))
                for x in unlabeled:
                    x[labeled_info] = 0
                for x in all_labeled:
                    x[labeled_info] = 1
                labeled = [
                    x for x in all_labeled
                    if x[class_feature] == positive_label
                ]
                positives = [
                    x for x in unlabeled if x[class_feature] == positive_label
                ]
                negatives = [
                    x for x in unlabeled if x[class_feature] != positive_label
                ]

                shuffle(labeled)
                shuffle(positives)
                shuffle(negatives)

                sample_size = min(len(positives), len(negatives),
                                  max_sample_size)
                npos = math.floor(alpha * sample_size)
                nneg = sample_size - npos
                nlab = min(len(labeled), max_labeled_size)

                sample = positives[:npos] + negatives[:nneg] + labeled[:nlab]
                shuffle(sample)

                actual_c = len(labeled) / (len(labeled) + npos)
                actual_alpha = npos / sample_size
                eprint('  #L %d #U %d' % (len(labeled), npos + nneg))
                eprint('       Actual c: %6.2f |    Actual alpha: %6.2f' %
                       (actual_c, actual_alpha))
                tm_start = timer()
                pred_c, pred_alpha = method(sample, features, labeled_info,
                                            **method_kargs)
                tm_end = timer()
                eprint('    Predicted c: %6.2f | Predicted alpha: %6.2f' %
                       (pred_c, pred_alpha))

                ms_per_example.append((tm_end - tm_start) * 1000 / len(sample))
                abs_errors.append(abs(actual_alpha - pred_alpha))
                errors.append(actual_alpha - pred_alpha)

        for_table.append(
            (100 * alpha, 100 * np.mean(abs_errors), 100 * np.std(abs_errors),
             100 * np.mean(errors), 100 * np.std(errors),
             np.mean(ms_per_example), np.std(ms_per_example)))

    header_csv = 'alpha,mean_abs_error,mean_abs_error_std,mean_error,mean_error_std,time,time_std'
    mask_csv = ','.join(['%.2f'] * 7)
    mask_show = '  '.join(['%7.2f'] * 7)

    eprint()
    with open(output_filename % ('', method_name), mode="w") as out:
        print(header_csv, file=out)
        for row in for_table:
            eprint(mask_show % row)
            print(mask_csv % row, file=out)
    print "count : %d" % count
    # return data
    return data


data = Encoding(data, general_matrix)
test_data = Encoding(test_data, general_matrix)

p = pca(n_components=2)
pca_cal(standardize_dataset(data), labels.T[0].tolist(), data, title = "PCA with z-score normalization on training set")
pca_cal(standardize_dataset(test_data), test_labels, test_data, title = "PCA with z-score normalization on test set")
data[:, 4:] = standardize_dataset(data[:, 4:])
test_data[:, 4:] = standardize_dataset(test_data[:, 4:])

# Separate dataset into train and test sets
kf = KFold(n=len(data), n_folds=10, shuffle=True)
train, test = kf.get_indices()
s = Score()

total_cv_error = []
total_test_error = []
confusion_matx = []
f1score = []
for k in range(1, 100, 5):
    cv_error = []
    test_error = []
    nn = Pipeline([
            ('feature_selection', SelectFromModel(LinearSVC(penalty="l2"))),
            ('classification', KNeighborsClassifier(n_neighbors=k, metric='manhattan'))
        ])
    for i in range(10):
Example #18
def RunExperiment(exp_name,
                  method_name,
                  niterations=1,
                  max_sample_size=2000,
                  max_labeled_size=500):
    exp = Exp[exp_name]
    dataset_filename = exp["input"]
    output_filename = exp["output"]
    class_feature = exp["class_feature"]
    positive_label = exp["positive_label"]
    method = methods[method_name]["func"]
    method_kargs = methods[method_name]["kargs"]
    features = None
    negative_labels = None

    data_df = pd.read_csv(dataset_filename, index_col=False)

    if callable(exp["negative_labels"]):
        negative_labels = set(
            filter(exp["negative_labels"], set(data_df[class_feature])))
    elif isinstance(exp["negative_labels"], list):
        negative_labels = set(exp["negative_labels"])
    else:
        negative_labels = set(
            [x for x in set(data_df[class_feature]) if x != positive_label])

    all_labels = set(list(negative_labels) + [positive_label])

    data_df = pd.DataFrame(
        data_df.loc[data_df[class_feature].map(lambda x: x in all_labels)])

    if callable(exp["features"]):
        features = list(filter(exp["features"], list(data_df)))
    elif isinstance(exp["features"], list):
        features = exp["features"]
    else:
        features = [x for x in list(data_df) if x != class_feature]

    labeled_info = 'dfjiweojgf'
    data = data_df.to_dict('records')

    for_table = []
    for alpha in tqdm(list(np.linspace(0, 1, 11)), desc="alpha"):
        abs_errors = []
        errors = []
        ms_per_example = []
        for it in trange(niterations):
            shuffle(data)
            eprint('Iteration %d#' % (it + 1))
            for fold_i, (unlabeled,
                         all_labeled) in zip(range(5), KFold(5, data)):
                eprint('  Fold #%d' % (fold_i + 1))
                for x in unlabeled:
                    x[labeled_info] = 0
                for x in all_labeled:
                    x[labeled_info] = 1
                labeled = [
                    x for x in all_labeled
                    if x[class_feature] == positive_label
                ]
                positives = [
                    x for x in unlabeled if x[class_feature] == positive_label
                ]
                negatives = [
                    x for x in unlabeled if x[class_feature] != positive_label
                ]

                shuffle(labeled)
                shuffle(positives)
                shuffle(negatives)

                sample_size = min(len(positives), len(negatives),
                                  max_sample_size)
                npos = math.floor(alpha * sample_size)
                nneg = sample_size - npos
                nlab = min(len(labeled), max_labeled_size)

                sample = positives[:npos] + negatives[:nneg] + labeled[:nlab]
                shuffle(sample)

                actual_c = len(labeled) / (len(labeled) + npos)
                actual_alpha = npos / sample_size
                eprint('  #L %d #U %d' % (len(labeled), npos + nneg))
                eprint('       Actual c: %6.2f |    Actual alpha: %6.2f' %
                       (actual_c, actual_alpha))
                tm_start = timer()
                pred_c, pred_alpha = method(sample, features, labeled_info,
                                            **method_kargs)
                tm_end = timer()
                eprint('    Predicted c: %6.2f | Predicted alpha: %6.2f' %
                       (pred_c, pred_alpha))

                ms_per_example.append((tm_end - tm_start))
                abs_errors.append(abs(actual_alpha - pred_alpha))
                errors.append(actual_alpha - pred_alpha)
                break

        for_table.append(
            (100 * alpha, 100 * np.mean(abs_errors), 100 * np.std(abs_errors),
             100 * np.mean(errors), 100 * np.std(errors),
             np.mean(ms_per_example), np.std(ms_per_example)))

    header_csv = 'alpha,abs_mean_error,abs_mean_error_std,mean_error,mean_error_std,time,time_std'
    mask_csv = ','.join(['%.2f'] * 7)
    mask_show = '  '.join(['%7.2f'] * 7)

    eprint()
    with open(output_filename % ('_time', method_name), mode="w") as out:
        print(header_csv, file=out)
        for row in for_table:
            eprint(mask_show % row)
            print(mask_csv % row, file=out)
Example #19
new_data = []
new_classes = []
for index in srt:
    new_data.append(data[index])
    new_classes.append(classes[index])

# setup the network
for node in nodes:
    BN.setup_node(node, nodes)

# initial confusion matrix
confusion = {'0':{'0':0, '1':0}, '1':{'0':0, '1':0}}

# do k-fold validation
total_count = 0
kfold = KFold(100, new_data, new_classes)
while kfold.has_next():
    
    dat, cls = kfold.get_next()

    correct_count = 0
    for i in range(0, len(dat)):
        row = dat[i]
        guess = BNClassifier.classify(row, nodes, dat, cls, ['0', '1'])

        if guess == cls[i]:
            correct_count += 1

        confusion[cls[i]][guess] += 1

    total_count += correct_count
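
# Not part of the original snippet: a natural follow-up would report the overall
# cross-validated accuracy and the accumulated confusion matrix.
print('accuracy: %.3f' % (total_count / float(len(new_data))))
print(confusion)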