Ejemplo n.º 1
0
def blur_no_infogain(blue_c_imb, data_dir, task, pos_class):
    """Return how many positives and negatives to keep to hit a target imbalance.

    The class imbalance used here is the fraction of negatives, i.e.
    ``len(d['Default']) / (len(d[task]) + len(d['Default']))`` where ``d`` is
    the label dict returned by ``setup.get_label_dict_knowing``.  Whichever
    class is the limiting one is kept in full and the other class is scaled
    so that the resulting fraction of negatives equals ``blue_c_imb``.

    Args:
        blue_c_imb: target fraction of negatives, in [0, 1]; anything
            accepted by ``float()``.
        data_dir: forwarded to ``setup.get_label_dict_knowing``.
        task: key of the positive entries in the label dict; also forwarded
            to ``setup.get_label_dict_knowing``.
        pos_class: forwarded to ``setup.get_label_dict_knowing``.

    Returns:
        ``(num_pos, num_neg)`` as ints (truncated towards zero).

    Raises:
        ValueError: if ``blue_c_imb`` is outside [0, 1] (such targets would
            otherwise yield negative counts).
    """
    # assumed b_imb == 0.5
    blue_c_imb = float(blue_c_imb)
    if not 0.0 <= blue_c_imb <= 1.0:
        raise ValueError(
            "blue_c_imb must be within [0, 1], got %r" % (blue_c_imb,))

    d_red = setup.get_label_dict_knowing(data_dir, task, pos_class)
    avail_pos = len(d_red[task])
    avail_neg = len(d_red['Default'])
    total = avail_pos + avail_neg
    if total == 0:
        # Nothing labelled at all: the imbalance is undefined, keep nothing.
        return 0, 0

    red_c_imb = float(avail_neg) / total
    if red_c_imb >= blue_c_imb:
        # can't add all negatives
        if blue_c_imb == 1.0:
            # Only reachable when there are no positives; the data is
            # already all-negative, so keep it instead of dividing by zero.
            return 0, avail_neg
        num_pos = avail_pos
        num_neg = num_pos * (blue_c_imb / (1 - blue_c_imb))
    else:
        # can't add all positives
        # blue_c_imb > red_c_imb >= 0 here, so the division is safe.
        k = (1 - blue_c_imb) / blue_c_imb
        # Diagnostic output: the scaling factor and the size of each entry.
        print(k)
        for key in d_red:
            try:
                print("{0} {1}".format(len(d_red[key]), key))
            except TypeError:
                # Entries without a length are simply not reported.
                pass
        num_neg = avail_neg
        num_pos = num_neg * k
    return int(num_pos), int(num_neg)
Ejemplo n.º 2
0
def blur_no_infogain(blue_c_imb, data_dir, task, pos_class):
  """Compute (num_pos, num_neg) so the kept data matches a target imbalance.

  Imbalance means the fraction of negatives: the size of the 'Default' entry
  divided by the combined size of the ``task`` and 'Default' entries of the
  label dict returned by ``setup.get_label_dict_knowing``.  The limiting
  class is kept whole; the other class is scaled to reach ``blue_c_imb``.

  Args:
    blue_c_imb: target fraction of negatives in [0, 1]; converted with
      ``float()``.
    data_dir: forwarded to ``setup.get_label_dict_knowing``.
    task: label-dict key holding the positives; also forwarded.
    pos_class: forwarded to ``setup.get_label_dict_knowing``.

  Returns:
    Tuple ``(num_pos, num_neg)`` of ints, truncated towards zero.

  Raises:
    ValueError: when ``blue_c_imb`` lies outside [0, 1], which would
      otherwise produce negative counts.
  """
  # assumed b_imb == 0.5
  blue_c_imb = float(blue_c_imb)
  if not 0.0 <= blue_c_imb <= 1.0:
    raise ValueError(
        "blue_c_imb must be within [0, 1], got %r" % (blue_c_imb,))

  d_red = setup.get_label_dict_knowing(data_dir, task, pos_class)
  n_pos = len(d_red[task])
  n_neg = len(d_red['Default'])
  if n_pos + n_neg == 0:
    # No labelled data: imbalance is undefined, so there is nothing to keep.
    return 0, 0

  red_c_imb = float(n_neg) / (n_pos + n_neg)
  if red_c_imb >= blue_c_imb:
    # can't add all negatives
    if blue_c_imb == 1.0:
      # Implies n_pos == 0; return the all-negative data rather than
      # dividing by (1 - 1).
      return 0, n_neg
    num_pos = n_pos
    num_neg = num_pos * (blue_c_imb / (1 - blue_c_imb))
  else:
    # can't add all positives
    # Safe division: blue_c_imb is strictly greater than red_c_imb >= 0.
    k = (1 - blue_c_imb) / blue_c_imb
    # Diagnostic output: scaling factor, then the size of every entry.
    print(k)
    for key in d_red:
      try:
        print("{0} {1}".format(len(d_red[key]), key))
      except TypeError:
        # Skip entries that have no length.
        pass
    num_neg = n_neg
    num_pos = num_neg * k
  return int(num_pos), int(num_neg)
Ejemplo n.º 3
0
def what_redbox_numbers(c_imb, b_imb, data_dir, task, pos_class, b_pos, b_neg):
    """Return the number of positives and negatives to take from ``data_dir``.

    The current imbalance is the fraction of negatives in the label dict
    returned by ``setup.get_label_dict_knowing``:
    ``len(d['Default']) / (len(d[task]) + len(d['Default']))``.

    * If that fraction is at least ``c_imb``, all positives are kept and the
      negatives are set to ``positives * b_imb / (1 - b_imb)``.
    * Otherwise all positives are kept and the negative count is the
      available negatives scaled by ``c_imb / current_imbalance``.
      NOTE(review): this can exceed the number of negatives actually
      available -- confirm that is intended.

    Args:
        c_imb: imbalance threshold compared against the current imbalance.
        b_imb: target fraction of negatives used in the first branch.
        data_dir: forwarded to ``setup.get_label_dict_knowing``.
        task: label-dict key holding the positives; also forwarded.
        pos_class: forwarded to ``setup.get_label_dict_knowing``.
        b_pos: unused; kept for interface compatibility.
        b_neg: unused; kept for interface compatibility.

    Returns:
        ``(num_pos, num_neg)``; ``num_neg`` is generally a float.
        ``(0, 0)`` when there is no labelled data at all.

    Raises:
        ZeroDivisionError: if the first branch is taken with ``b_imb == 1``.
    """
    # big prob: after redbox sampling, imbalance has changed.
    # so actually, redbox sampling and undersampling both need to be
    # determined before either takes place.
    # other prob: given b_imb, compute num_neg num_pos
    # maybe easier is given info gain, compute num_neg num_pos
    d = setup.get_label_dict_knowing(data_dir, task, pos_class)
    n_pos = len(d[task])
    n_neg = len(d['Default'])
    if n_pos + n_neg == 0:
        # Nothing labelled: the imbalance is undefined, so request nothing.
        return 0, 0

    red_c_imb = float(n_neg) / (n_pos + n_neg)
    if red_c_imb >= c_imb:
        # float() keeps this a true division under Python 2 for int input.
        b_imb = float(b_imb)
        return n_pos, n_pos * (b_imb / (1 - b_imb))

    # red_c_imb is lower than c_imb from here on.
    print("class imbalance going to decrease! :D")
    if n_neg == 0:
        # red_c_imb == 0: scaling zero negatives still gives zero, and this
        # avoids dividing by the zero imbalance.
        return n_pos, 0.0
    return n_pos, n_neg * (c_imb / red_c_imb)
Ejemplo n.º 4
0
def what_redbox_numbers(c_imb, b_imb, data_dir, task, pos_class,
                        b_pos, b_neg):
  """Decide how many positives and negatives to take from ``data_dir``.

  The current imbalance is the share of negatives in the label dict from
  ``setup.get_label_dict_knowing``: the size of the 'Default' entry over the
  combined size of the ``task`` and 'Default' entries.

  * Current imbalance >= ``c_imb``: keep every positive and use
    ``positives * b_imb / (1 - b_imb)`` negatives.
  * Otherwise: keep every positive and scale the available negatives by
    ``c_imb / current_imbalance``.
    NOTE(review): the scaled value can be larger than the number of
    negatives that exist -- confirm this is what callers expect.

  Args:
    c_imb: imbalance threshold compared against the current imbalance.
    b_imb: target fraction of negatives used in the first branch.
    data_dir: forwarded to ``setup.get_label_dict_knowing``.
    task: label-dict key holding the positives; also forwarded.
    pos_class: forwarded to ``setup.get_label_dict_knowing``.
    b_pos: unused; kept so existing callers keep working.
    b_neg: unused; kept so existing callers keep working.

  Returns:
    ``(num_pos, num_neg)``; ``num_neg`` is generally a float.  ``(0, 0)``
    when the label dict holds no positives and no negatives.

  Raises:
    ZeroDivisionError: if the first branch is taken with ``b_imb == 1``.
  """
  # big prob: after redbox sampling, imbalance has changed.
  # so actually, redbox sampling and undersampling both need to be
  # determined before either takes place.
  # other prob: given b_imb, compute num_neg num_pos
  # maybe easier is given info gain, compute num_neg num_pos
  d = setup.get_label_dict_knowing(data_dir, task, pos_class)
  r_pos = len(d[task])
  r_neg_avail = len(d['Default'])
  total = r_pos + r_neg_avail
  if total == 0:
    # No labelled data, so the imbalance cannot be computed.
    return 0, 0

  red_c_imb = float(r_neg_avail) / total
  if red_c_imb >= c_imb:
    # Force true division even for int b_imb under Python 2.
    b_imb = float(b_imb)
    return r_pos, r_pos * (b_imb / (1 - b_imb))

  # Remaining case: red_c_imb is below c_imb.
  print("class imbalance going to decrease! :D")
  if r_neg_avail == 0:
    # Zero negatives scaled by anything is zero; skip the division by the
    # zero imbalance.
    return r_pos, 0.0
  return r_pos, r_neg_avail * (c_imb / red_c_imb)
Ejemplo n.º 5
0
def same_amount_as_bluebox(data_dir, task, pos_class):
    """Return a balanced ``(count, count)`` pair sized by the ``task`` entry.

    Both elements are the length of the ``task`` entry of the label dict
    returned by ``setup.get_label_dict_knowing(data_dir, task, pos_class)``.
    """
    # ASSUMING MODEL LEARNS P(label|data) !
    labels = setup.get_label_dict_knowing(data_dir, task, pos_class)
    count = len(labels[task])
    return count, count
Ejemplo n.º 6
0
def same_amount_as_bluebox(data_dir, task, pos_class):
  """Give back the size of the ``task`` entry twice, as a 2-tuple.

  The label dict comes from
  ``setup.get_label_dict_knowing(data_dir, task, pos_class)``; the same
  length is used for both elements of the result.
  """
  # ASSUMING MODEL LEARNS P(label|data) !
  task_size = len(
      setup.get_label_dict_knowing(data_dir, task, pos_class)[task])
  return (task_size,) * 2