Exemple #1
0
 def get_transfer_inds(self, labels_or_ids):
     """Select the instances that match ``labels_or_ids``.

     ``None`` selects everything (``array_functions.true(self.n)``).
     Otherwise the lookup is delegated to ``array_functions.find_set``,
     matching against ``self.data_set_ids`` for regression data and
     against ``self.true_y`` for all other data.
     """
     if labels_or_ids is None:
         return array_functions.true(self.n)
     # Regression data is grouped by data set id, everything else by label.
     lookup_values = self.data_set_ids if self.is_regression else self.true_y
     return array_functions.find_set(lookup_values, labels_or_ids)
Exemple #2
0
 def reveal_labels(self, inds=None):
     """Reveal true labels for instances or for pairs of instances.

     If ``inds`` is two-dimensional with exactly two columns, each row
     ``(x1, x2)`` is recorded in ``self.pairwise_relationships`` as a tuple
     ordered so that the first element has the smaller (or equal)
     ``true_y``.  Pairs already stored in either orientation are skipped.

     Otherwise (reading ``inds.shape[1]`` raises ``IndexError`` or
     ``TypeError``) ``inds`` is used to index instances and
     ``self.y[inds]`` is overwritten with ``self.true_y[inds]``.

     Raises:
         AssertionError: if ``inds`` is None, or if ``inds`` is
             two-dimensional with a column count other than 2.
     """
     if inds is None:
         assert False, ("Is this a good way of doing this?  "
                        "Wouldn't \"None\" imply nothing should be revealed?")
         # Only reached when assertions are disabled (python -O).
         inds = array_functions.true(self.n)
     # If inds are pairwise relationships
     try:
         # Old instances may be missing 'pairwise_relationships' or it will be a list
         if not hasattr(self, 'pairwise_relationships') or len(self.pairwise_relationships) == 0:
             self.pairwise_relationships = set()
         assert inds.shape[1] == 2
         for x1, x2 in inds:
             # Builtin set literal instead of the Python-2-only ``Set`` class.
             if {(x1, x2), (x2, x1)} & self.pairwise_relationships:
                 continue
             item = (x2, x1)
             if self.true_y[x1] <= self.true_y[x2]:
                 item = (x1, x2)
             self.pairwise_relationships.add(item)
     # If inds are for instances
     except (TypeError, IndexError):
         self.y[inds] = self.true_y[inds]
     # If 'pairwise' data has a number of indices other than 2
     except AssertionError:
         assert False, 'Number of inds for pairwise data must be 2'
    def eval_variance(data, a):
        """Leave-one-out squared error for ``a`` plus an L2 penalty on ``a``.

        For every sample, the remaining samples are passed through
        ``StandardScaler.fit_transform``, weights are obtained from
        ``MixedFeatureGuidanceMethod.solve_w`` and the held-out sample is
        predicted as ``x_i . w + mean(y_rest)``.  The mean squared
        prediction error is returned with ``C2 * norm(a)**2`` added.

        NOTE(review): the held-out row is used as-is, i.e. it is not passed
        through the scaler fitted on the remaining rows - confirm intended.
        """
        features, targets = data.get_xy()
        C, C2, C3 = data.get_reg()
        num_samples = features.shape[0]
        p = features.shape[1]
        scaler = StandardScaler()

        squared_error_sum = 0
        for held_out in range(num_samples):
            # Mask selecting every sample except the held-out one.
            keep = array_functions.true(num_samples)
            keep[held_out] = False
            held_out_x = features[held_out, :]
            held_out_y = targets[held_out]
            train_x = scaler.fit_transform(features[keep, :])
            train_y = targets[keep]
            intercept = train_y.mean()
            weights = MixedFeatureGuidanceMethod.solve_w(a, train_x, train_y, C)
            squared_error_sum += (held_out_x.T.dot(weights) + intercept - held_out_y)**2
            # A sanity check against linear_model.Ridge(C, normalize=False)
            # used to live here (comparing coef_ with the weights above).

        mean_squared_error = squared_error_sum / num_samples
        # Alternatives tried previously: C2*norm(a - C*np.ones(p))**2 and
        # C2 * norm(a, 1).
        penalty = C2 * norm(a)**2
        return mean_squared_error + penalty
Exemple #4
0
    def optimize(self, opt_data):
        """Select a subset of instances and store the result on ``self``.

        A pairwise weight matrix is built with ``array_functions.make_rbf``
        from ``opt_data.X`` and, unless ``self.no_f_x`` is set, multiplied
        element-wise by the one built from ``opt_data.Y``.  The instances are
        either handled as a single group or, when ``self.num_class_splits``
        is 2, split at the mean of ``opt_data.Y`` with half of
        ``opt_data.subset_size`` (rounded down) requested from each side.
        ``self.optimize_for_data`` picks the instances within each group.

        Sets ``self.W``, ``self.selected``, ``self.learned_distribution``
        (same object as ``self.selected``) and ``self.optimization_value``.
        Fewer than ``opt_data.subset_size`` instances may end up selected.

        Raises:
            AssertionError: if ``opt_data.instances_to_keep`` is not None or
                ``self.num_class_splits`` is neither None nor 2.
        """
        assert opt_data.instances_to_keep is None, 'Not implemented yet!'
        W = array_functions.make_rbf(opt_data.X, self.sigma_x)
        if not self.no_f_x:
            W = W * array_functions.make_rbf(opt_data.Y, self.sigma_y)
        n = W.shape[0]
        selected = array_functions.false(n)
        if self.num_class_splits is None:
            splits = [array_functions.true(n)]
            num_per_split = [opt_data.subset_size]
        else:
            assert self.num_class_splits == 2
            below_mean = opt_data.Y <= opt_data.Y.mean()
            splits = [below_mean, ~below_mean]
            # Floor division keeps the count an integer under true division.
            half = opt_data.subset_size // 2
            num_per_split = [half, half]
        for split, num in zip(splits, num_per_split):
            W_split = W[np.ix_(split, split)]
            split_selections = self.optimize_for_data(W_split, num)
            # Map positions inside the split back to global indices.
            split_inds = split.nonzero()[0]
            selected[split_inds[split_selections]] = True

        self.W = W
        self.selected = selected
        self.learned_distribution = selected
        self.optimization_value = 0
Exemple #5
0
 def reveal_labels(self, inds=None):
     """Reveal true labels for single instances or for instance pairs.

     Pairwise mode: when ``inds`` is two-dimensional with two columns,
     every row ``(x1, x2)`` is added to ``self.pairwise_relationships`` as
     a tuple whose first element has the smaller (or equal) ``true_y``.
     A pair that is already stored, in either order, is skipped.

     Instance mode: when reading ``inds.shape[1]`` raises ``IndexError``
     or ``TypeError``, ``self.y[inds]`` is set to ``self.true_y[inds]``.

     Raises:
         AssertionError: if ``inds`` is None, or if ``inds`` is
             two-dimensional with a column count other than 2.
     """
     if inds is None:
         assert False, ("Is this a good way of doing this?  "
                        "Wouldn't \"None\" imply nothing should be revealed?")
         # Fallback that only runs when assertions are stripped (python -O).
         inds = array_functions.true(self.n)
     # If inds are pairwise relationships
     try:
         # Old instances may be missing 'pairwise_relationships' or it will be a list
         if not hasattr(self, 'pairwise_relationships') or len(
                 self.pairwise_relationships) == 0:
             self.pairwise_relationships = set()
         assert inds.shape[1] == 2
         for x1, x2 in inds:
             # Builtin set replaces ``Set``, which does not exist on Python 3.
             both_orders = {(x1, x2), (x2, x1)}
             if both_orders & self.pairwise_relationships:
                 continue
             if self.true_y[x1] <= self.true_y[x2]:
                 item = (x1, x2)
             else:
                 item = (x2, x1)
             self.pairwise_relationships.add(item)
     # If inds are for instances
     except (TypeError, IndexError):
         self.y[inds] = self.true_y[inds]
     # If 'pairwise' data has a number of indices other than 2
     except AssertionError:
         assert False, 'Number of inds for pairwise data must be 2'
    def eval_variance(data, a):
        """Score ``a`` by leave-one-out squared error plus ``C2 * ||a||^2``.

        Each sample is held out in turn: the other samples are transformed
        with ``StandardScaler.fit_transform``, the weights come from
        ``MixedFeatureGuidanceMethod.solve_w`` and the bias is the mean of
        the remaining targets.  The squared prediction errors are averaged
        over all samples and the penalty term is added.

        NOTE(review): the held-out row is not transformed by the scaler
        fitted on the remaining rows - confirm this is intended.
        """
        x, y = data.get_xy()
        C, C2, C3 = data.get_reg()
        n, p = x.shape[0], x.shape[1]
        t = StandardScaler()

        total = 0
        for i in range(n):
            # Everything except sample i is used for fitting.
            others = array_functions.true(n)
            others[i] = False
            x_out, y_out = x[i, :], y[i]
            x_fit = t.fit_transform(x[others, :])
            y_fit = y[others]
            bias = y_fit.mean()
            w = MixedFeatureGuidanceMethod.solve_w(a, x_fit, y_fit, C)
            prediction_error = x_out.T.dot(w) + bias - y_out
            total += prediction_error**2
            # (An earlier comparison against linear_model.Ridge was
            # disabled here.)

        # Earlier penalty variants: C2*norm(a - C*np.ones(p))**2, C2*norm(a, 1).
        return total / n + C2 * norm(a) ** 2
Exemple #7
0
 def transform(self, data):
     """Keep only a leading fraction of the instances of each class.

     ``self.num_to_select`` is paired element-wise with ``data.classes``;
     each entry is used as the fraction of that class to keep.  For every
     class the first ``ceil(count * fraction)`` instances (in index order)
     are kept and the rest are dropped via ``data.get_subset``.  When
     ``self.num_to_select`` is None, ``data`` is returned unchanged.

     Raises:
         AssertionError: if ``data.is_regression`` is truthy.
     """
     assert not data.is_regression
     to_keep = array_functions.true(data.n)
     if self.num_to_select is not None:
         for yi, perc in zip(data.classes, self.num_to_select):
             inds = (data.y == yi).nonzero()[0]
             # np.ceil returns a float; slice bounds must be integers.
             n_to_keep = int(np.ceil(inds.size * perc))
             to_keep[inds[n_to_keep:]] = False
         data = data.get_subset(to_keep)
     return data
Exemple #8
0
    def rand_sample(self, perc=.1, to_sample=None):
        """Return a subset holding a random fraction of selected instances.

        Args:
            perc: fraction of the instances in ``to_sample`` to draw; the
                count is rounded up.
            to_sample: boolean mask or index array of the instances to
                sample from.  None means all ``self.n`` instances.

        Returns:
            ``self.get_subset`` applied to the drawn indices followed by the
            indices of every instance outside ``to_sample`` (those are
            always kept).
        """
        if to_sample is None:
            to_sample = array_functions.true(self.n)
        if to_sample.dtype != 'bool':
            # Convert an index array into a boolean mask.
            I = array_functions.false(self.n)
            I[to_sample] = True
            to_sample = I

        to_keep = (~to_sample).nonzero()[0]
        to_sample = to_sample.nonzero()[0]
        p = np.random.permutation(to_sample.shape[0])
        # np.ceil returns a float, which is not a valid slice bound.
        m = int(np.ceil(perc * p.shape[0]))
        to_use = to_sample[p[:m]]
        to_use = np.hstack((to_use, to_keep))
        return self.get_subset(to_use)
Exemple #9
0
    def rand_sample(self, perc=.1, to_sample=None):
        """Draw a random fraction of the chosen instances and return the subset.

        ``to_sample`` may be a boolean mask or an index array (None selects
        all ``self.n`` instances).  ``ceil(perc * count)`` of those
        instances are drawn at random; every instance outside ``to_sample``
        is always kept.  The result is ``self.get_subset`` applied to the
        drawn indices followed by the always-kept indices.
        """
        if to_sample is None:
            to_sample = array_functions.true(self.n)
        if to_sample.dtype != 'bool':
            # Index array given: turn it into a boolean mask.
            mask = array_functions.false(self.n)
            mask[to_sample] = True
            to_sample = mask

        always_kept = (~to_sample).nonzero()[0]
        candidates = to_sample.nonzero()[0]
        order = np.random.permutation(candidates.shape[0])
        num_drawn = int(np.ceil(perc * order.shape[0]))
        drawn = candidates[order[:num_drawn]]
        return self.get_subset(np.hstack((drawn, always_kept)))
Exemple #10
0
 def add_noise(self, noise_rate, I=None, classes=None):
     """Randomly replace labels with a different class, in place.

     Each instance is chosen with probability ``noise_rate``; chosen
     instances that are also allowed by ``I`` get a new label drawn
     uniformly from the other entries of ``classes``.  Both ``self.y`` and
     ``self.true_y`` are overwritten with the new label.

     Args:
         noise_rate: per-instance probability of switching the label.
         I: mask of instances that may be changed; None allows all.
         classes: candidate labels; None uses ``self.classes``.

     Raises:
         AssertionError: if ``self.is_regression`` is truthy or None.
     """
     assert not self.is_regression
     assert self.is_regression is not None
     if I is None:
         I = array_functions.true(self.n)
     if classes is None:
         # Previously this overwrote ``I`` and left ``classes`` as None.
         classes = self.classes
     # An array is needed so that ``classes == old_y`` is element-wise.
     classes = np.asarray(classes)
     to_switch = np.random.rand(self.n) <= noise_rate
     to_switch = to_switch.nonzero()[0]
     for i in to_switch:
         if not I[i]:
             continue
         old_y = self.y[i]
         y_ind = classes == old_y
         # NOTE(review): probably meant to check that old_y is in classes.
         assert any(classes)
         # Uniform over all classes except the current one.
         p = np.ones(len(classes)) * 1.0 / (len(classes) - 1)
         p[y_ind] = 0
         new_y = np.random.choice(classes, p=p)
         self.y[i] = new_y
         self.true_y[i] = new_y
Exemple #11
0
 def add_noise(self, noise_rate, I=None, classes=None):
     """Inject label noise by switching labels to another class, in place.

     Every instance is picked independently with probability
     ``noise_rate``.  A picked instance that ``I`` allows receives a label
     drawn uniformly from the entries of ``classes`` that differ from its
     current label; the new label is written to both ``self.y`` and
     ``self.true_y``.

     Args:
         noise_rate: probability of switching each instance's label.
         I: mask of instances eligible for switching; None means all.
         classes: labels to draw from; None means ``self.classes``.

     Raises:
         AssertionError: if ``self.is_regression`` is truthy or None.
     """
     assert not self.is_regression
     assert self.is_regression is not None
     if I is None:
         I = array_functions.true(self.n)
     if classes is None:
         # The default belongs in ``classes``; assigning it to ``I``
         # clobbered the mask and left ``classes`` unset.
         classes = self.classes
     # Element-wise comparison below requires an array, not a list.
     classes = np.asarray(classes)
     num_other_classes = len(classes) - 1
     to_switch = (np.random.rand(self.n) <= noise_rate).nonzero()[0]
     for i in to_switch:
         if not I[i]:
             continue
         old_y = self.y[i]
         is_old_class = classes == old_y
         # NOTE(review): probably meant to check that old_y is in classes.
         assert any(classes)
         # Equal probability for every class but the current one.
         p = np.ones(len(classes)) * 1.0 / num_other_classes
         p[is_old_class] = 0
         new_y = np.random.choice(classes, p=p)
         self.y[i] = new_y
         self.true_y[i] = new_y
    file_name,
    True,
    dtype='str',
    delim=',',
    #num_rows=40000
    num_rows=100000000000
)
# Column indices of the target value(s).
y_names = ['Value']
y_inds = []
for name in y_names:
    y_inds.append(array_functions.find_first_element(feat_names, name))
date_strs = data[:, find_first_element(feat_names, 'YYYYMM')]
prev = ''
date_str_to_idx = dict()
date_ids = np.zeros(data.shape[0])
to_keep = array_functions.true(date_strs.shape[0])
for i, date_str in enumerate(date_strs):
    # Drop rows whose month field is '13' (presumably a yearly summary row
    # - confirm) and rows with no value available.
    if date_str[4:] == '13' or np.any(data[i, y_inds] == 'Not Available'):
        to_keep[i] = False
        continue
    date_obj = to_date(date_str)
    ordinal = date_obj.toordinal()
    date_str_to_idx[date_str] = ordinal
    date_ids[i] = ordinal
date_ids = date_ids[to_keep]
data = data[to_keep, :]
# The np.int / np.float aliases were removed from NumPy; the builtins are
# what they aliased.
date_ids = date_ids.astype(int)
y = data[:, y_inds].astype(float)

#y_sub = y[I, :]

#series_id = data[:, find_first_element(feat_names, 'Site Num')].astype(np.int)
Exemple #13
0
 def __init__(self, n=0):
     """Set up split bookkeeping for ``n`` instances.

     ``permutation`` starts as an uninitialised array of length ``n`` and
     ``is_train`` as ``array_functions.true(n)``.
     """
     self.permutation, self.is_train = np.empty(n), array_functions.true(n)
Exemple #14
0
 def set_train(self):
     """Reset ``is_train`` to ``array_functions.true(self.n)``."""
     train_flags = array_functions.true(self.n)
     self.is_train = train_flags
Exemple #15
0
def select_all(data):
    """Selector that returns ``array_functions.true(data.n)`` for ``data``."""
    num_instances = data.n
    return array_functions.true(num_instances)
Exemple #16
0
 def __init__(self, n=0):
     """Initialise the split state for ``n`` instances.

     Stores an uninitialised length-``n`` array as ``permutation`` and
     ``array_functions.true(n)`` as ``is_train``.
     """
     placeholder_permutation = np.empty(n)
     train_flags = array_functions.true(n)
     self.permutation = placeholder_permutation
     self.is_train = train_flags
Exemple #17
0
# Load every column as a string; rows are filtered and converted below.
feat_names, data = create_data_set.load_csv(
    file_name,
    True,
    dtype='str',
    delim=',',
    #num_rows=40000
    num_rows=100000000000)
y_names = ['Value']
y_inds = []
for name in y_names:
    y_inds.append(array_functions.find_first_element(feat_names, name))
date_strs = data[:, find_first_element(feat_names, 'YYYYMM')]
prev = ''
date_str_to_idx = dict()
date_ids = np.zeros(data.shape[0])
to_keep = array_functions.true(date_strs.shape[0])
for i, date_str in enumerate(date_strs):
    # Skip rows whose month field is '13' (presumably a yearly summary row
    # - confirm) and rows whose value is missing.
    if date_str[4:] == '13' or np.any(data[i, y_inds] == 'Not Available'):
        to_keep[i] = False
        continue
    date_obj = to_date(date_str)
    ordinal = date_obj.toordinal()
    date_str_to_idx[date_str] = ordinal
    date_ids[i] = ordinal
date_ids = date_ids[to_keep]
data = data[to_keep, :]
# Builtin int / float replace the np.int / np.float aliases, which were
# removed from NumPy.
date_ids = date_ids.astype(int)
y = data[:, y_inds].astype(float)

#y_sub = y[I, :]

#series_id = data[:, find_first_element(feat_names, 'Site Num')].astype(np.int)
Exemple #18
0
 def set_train(self):
     """Assign ``array_functions.true(self.n)`` to ``is_train``."""
     num_instances = self.n
     self.is_train = array_functions.true(num_instances)