Example #1
    def test_add_dummy_feature(self):
        iris = datasets.load_iris()
        df = pdml.ModelFrame(iris)

        result = df.preprocessing.add_dummy_feature()
        expected = pp.add_dummy_feature(iris.data)

        self.assertTrue(isinstance(result, pdml.ModelFrame))
        self.assert_numpy_array_almost_equal(result.data.values, expected)

        result = df.preprocessing.add_dummy_feature(value=2)
        expected = pp.add_dummy_feature(iris.data, value=2)

        self.assertTrue(isinstance(result, pdml.ModelFrame))
        self.assert_numpy_array_almost_equal(result.data.values, expected)
        self.assert_index_equal(result.columns[1:], df.data.columns)

        s = df['sepal length (cm)']
        self.assertTrue(isinstance(s, pdml.ModelSeries))
        result = s.preprocessing.add_dummy_feature()
        expected = pp.add_dummy_feature(iris.data[:, [0]])

        self.assertTrue(isinstance(result, pdml.ModelFrame))
        self.assert_numpy_array_almost_equal(result.values, expected)
        self.assertEqual(result.columns[1], 'sepal length (cm)')
Example #2
def percep(X_tr, y_tr, X_te):
    clf = Perceptron(max_iter=1000)  # max_iter replaces the deprecated n_iter parameter
    X_tr_aug = add_dummy_feature(X_tr)
    X_te_aug = add_dummy_feature(X_te)
    clf.fit(X_tr_aug, y_tr)
    y_pred = clf.predict(X_te_aug)
    return y_pred
Example #3
def pinv(X_tr, y_tr, X_te):
    # augment the feature space
    X_tr_aug = add_dummy_feature(X_tr)
    X_te_aug = add_dummy_feature(X_te)
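    # Flip the sign of the class-1 rows so that solving X_aug w = b (with b = all ones)
    # in the least-squares sense pushes class-0 scores positive and class-1 scores negative.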
    X_tr_aug[np.where(y_tr == 1)] = -X_tr_aug[np.where(y_tr == 1)]
    b = np.ones((len(X_tr_aug),))
    w = np.dot(np.linalg.pinv(X_tr_aug), b)
    indicator = np.dot(X_te_aug, w)
    for i in range(len(indicator)):
        if indicator[i] > 0:
            indicator[i] = 0
        else:
            indicator[i] = 1
    return indicator
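A minimal, hypothetical usage sketch for the pinv() helper above; it assumes NumPy and
scikit-learn are importable and that add_dummy_feature is already in scope (the toy
dataset below is illustrative only, not taken from the original project):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Build a small synthetic 0/1 classification problem and hold out a test split.
X, y = make_classification(n_samples=200, n_features=4, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.25, random_state=0)

y_hat = pinv(X_tr, y_tr, X_te)  # 0/1 predictions for X_te
print("accuracy:", np.mean(y_hat == y_te))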
Example #4
    def _augment(self, X):
        # for factorization machines, we add a dummy column for each order.

        if self.fit_lower == 'augment':
            k = 2 if self.fit_linear else 1
            for _ in range(self.degree - k):
                X = add_dummy_feature(X, value=1)
        return X
Example #5
    def __init__(self, history, strength_model=None, content_features=None, using_delay=True, 
            using_global_difficulty=True, using_item_bias=True, debug_mode_on=False):
        """
        Initialize memory model object

        :param pd.DataFrame history: Interaction log data. Must contain the 'tlast' column,
            in addition to the other columns that belong to the dataframe in a
            lentil.datatools.InteractionHistory object. If strength_model is not None, then
            the history should also contain a column named by the strength_model (e.g., 'nreps' or
            'deck')

        :param str|None strength_model: Corresponds to a column in the history dataframe 
            (e.g., 'nreps' or 'deck') or simply None if memory strength is always 1.

        :param dict[str,np.array]|None content_features: A dictionary mapping item names
            to feature vectors. All items should be accounted for.

        :param bool using_delay: True if the delay term is included in the recall probability, 
            False otherwise.

        :param bool using_global_difficulty: True if the global bias term should be included in
            the log-linear difficulty model, False otherwise.

        :param bool using_item_bias: True if the item-specific bias term should be included in
            the log-linear difficulty model, False otherwise.

        :param bool debug_mode_on: True if MAP estimation should log progress 
            and plot learned difficulty parameters, False otherwise.
        """

        self.history = history[history['module_type']==datatools.AssessmentInteraction.MODULETYPE]
        self.strength_model = strength_model
       
        self.using_delay = using_delay
        self.using_global_difficulty = using_global_difficulty
        self.using_item_bias = using_item_bias
        self.debug_mode_on = debug_mode_on

        self.idx_of_module_id = {x: i for i, x in enumerate(self.history['module_id'].unique())}
        self.difficulty = None
        
        if content_features is None:
            if self.using_global_difficulty:
                content_features = np.ones((len(self.idx_of_module_id), 1))
        else:    
            content_features = np.array([content_features[module_id] \
                    for module_id in self.history['module_id'].unique()])
            content_features = preprocessing.scale(content_features)
            if self.using_global_difficulty:
                content_features = preprocessing.add_dummy_feature(content_features)
        self.content_features = content_features

        if self.content_features is None and not self.using_item_bias:
            raise ValueError('The log-linear difficulty model has not been defined!')
Example #6
    def fit(self, X, y):
        """Fit model according to X and y.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.

        y : array-like, shape = [n_samples]
            Target values.

        Returns
        -------
        self : regressor
            Returns self.
        """
        if self.fit_intercept:
            X = add_dummy_feature(X)

        n_samples, n_features = X.shape
        rs = self._get_random_state()

        self.outputs_2d_ = len(y.shape) == 2
        if self.outputs_2d_:
            Y = y
        else:
            Y = y.reshape(-1, 1)
        Y = np.asfortranarray(Y)
        n_vectors = Y.shape[1]

        ds = get_dataset(X)

        if not self.warm_start or self.coef_ is None:
            self.coef_ = np.zeros((n_vectors, n_features), dtype=np.float64)
            self.dual_coef_ = np.zeros((n_vectors, n_samples),
                                       dtype=np.float64)

        for i in range(n_vectors):
            _dual_cd_svr(self, self.coef_[i], self.dual_coef_[i],
                         ds, Y[:, i], self.permute,
                         self.C, self.epsilon, self._get_loss(),
                         self.max_iter, rs, self.tol,
                         self.callback, self.n_calls,
                         verbose=self.verbose)

        if self.fit_intercept:
            self.intercept_ = self.coef_[:, 0]
            self.coef_ = self.coef_[:, 1:]

        return self
Example #7
    def predict(self, X, add_interactions=False, return_labels=False):
        # Add intercept
        X = add_dummy_feature(X)

        hypothesis = self.compute_hypothesis(X)
        if self.loss_fn == "logistic":
            if return_labels:
                labels = [1 if i > 0.5 else 0 for i in hypothesis]
                return labels
            else:
                return hypothesis

        elif self.loss_fn == "squared":
            return hypothesis
Example #8
    def fit(self, X, y):
        start_time = time.time()

        # Add intercept
        X = add_dummy_feature(X)

        # Store gradients
        self.grad_v = np.zeros((self.k, self.features))

        m = len(X[0])
        n = len(X)
        self.initialize_weights(m)
        for _ in range(self.epochs):

            # Kind of ugly and slow, but all I care about here is learning
            X, y = self.shuffle_data(X, y)

            # Create an iterator
            x_iter = self.batch(X, self.batch_size)
            y_iter = self.batch(y, self.batch_size)

            for X_batch, y_batch in zip(x_iter, y_iter):
                hypothesis = self.compute_hypothesis(X_batch)

                # Compute cost for entire dataset
                # cost = self.compute_cost(X, y)
                # print cost

                gradient = self.compute_gradient(X_batch, y_batch, hypothesis)
                self.update_weights(gradient)

        # Save model params for scikit compatibility
        self.intercept_ = self.w[0]
        self.coeff_ = self.w[1:,]

        print "elapsed time in training: %f" % (time.time() - start_time)
Example #9
    def __init__(self,
                 history,
                 strength_model=None,
                 content_features=None,
                 using_delay=True,
                 using_global_difficulty=True,
                 using_item_bias=True,
                 debug_mode_on=False):
        """
        Initialize memory model object

        :param pd.DataFrame history: Interaction log data. Must contain the 'tlast' column,
            in addition to the other columns that belong to the dataframe in a
            lentil.datatools.InteractionHistory object. If strength_model is not None, then
            the history should also contain a column named by the strength_model (e.g., 'nreps' or
            'deck')

        :param str|None strength_model: Corresponds to a column in the history dataframe 
            (e.g., 'nreps' or 'deck') or simply None if memory strength is always 1.

        :param dict[str,np.array]|None content_features: A dictionary mapping item names
            to feature vectors. All items should be accounted for.

        :param bool using_delay: True if the delay term is included in the recall probability, 
            False otherwise.

        :param bool using_global_difficulty: True if the global bias term should be included in
            the log-linear difficulty model, False otherwise.

        :param bool using_item_bias: True if the item-specific bias term should be included in
            the log-linear difficulty model, False otherwise.

        :param bool debug_mode_on: True if MAP estimation should log progress 
            and plot learned difficulty parameters, False otherwise.
        """

        self.history = history[history['module_type'] ==
                               datatools.AssessmentInteraction.MODULETYPE]
        self.strength_model = strength_model

        self.using_delay = using_delay
        self.using_global_difficulty = using_global_difficulty
        self.using_item_bias = using_item_bias
        self.debug_mode_on = debug_mode_on

        self.idx_of_module_id = {
            x: i
            for i, x in enumerate(self.history['module_id'].unique())
        }
        self.difficulty = None

        if content_features is None:
            if self.using_global_difficulty:
                content_features = np.ones((len(self.idx_of_module_id), 1))
        else:
            content_features = np.array([content_features[module_id] \
                    for module_id in self.history['module_id'].unique()])
            content_features = preprocessing.scale(content_features)
            if self.using_global_difficulty:
                content_features = preprocessing.add_dummy_feature(
                    content_features)
        self.content_features = content_features

        if self.content_features is None and not self.using_item_bias:
            raise ValueError(
                'The log-linear difficulty model has not been defined!')
Example #10
def test_add_dummy_feature_csr():
    X = sp.csr_matrix([[1, 0], [0, 1], [0, 1]])
    X = add_dummy_feature(X)
    assert_true(sp.isspmatrix_csr(X), X)
    assert_array_equal(X.toarray(), [[1, 1, 0], [1, 0, 1], [1, 0, 1]])
Example #11
def test_add_dummy_feature():
    X = [[1, 0], [0, 1], [0, 1]]
    X = add_dummy_feature(X)
    assert_array_equal(X, [[1, 1, 0], [1, 0, 1], [1, 0, 1]])
Example #12
    def _augment(self, X):
        # for polynomial nets, we add a single dummy column
        if self.fit_lower == 'augment':
            X = add_dummy_feature(X, value=1)
        return X
Example #13
def test_add_dummy_feature_coo():
    X = sp.coo_matrix([[1, 0], [0, 1], [0, 1]])
    X = add_dummy_feature(X)
    assert_true(sp.isspmatrix_coo(X), X)
    assert_array_equal(X.toarray(), [[1, 1, 0], [1, 0, 1], [1, 0, 1]])
Example #14
def test_add_dummy_feature():
    X = [[1, 0], [0, 1], [0, 1]]
    X = add_dummy_feature(X)
    assert_array_equal(X, [[1, 1, 0], [1, 0, 1], [1, 0, 1]])
Example #15
def decision_boundary(w, c):
    b, w1, w2 = w
    y1 = -(w1 * -4 + b) / w2
    y2 = -(w1 * 4 + b) / w2

    plt.plot([-4, 4], [y1, y2], c)

    # y = w1 * x1 + w2 * x2 + b
    # 0 = w1 * x1 + w2 * x2 + b
    # -(w1 * x1 + b) = w2 * x2
    # -(w1 * x1 + b) / w2 = x2


action = np.loadtxt('Data/action.txt')
action = preprocessing.add_dummy_feature(action)
# print(action[:3])

xx = action[:, :-1]
yy = action[:, -1:]
# print(xx.shape, yy.shape)       # (100, 3) (100, 1)

for _, x1, x2, y in action:
    # print(x1, x2)
    plt.plot(x1, x2, 'ro' if y else 'go')

decision_boundary(gradient_descent(xx, yy), 'r')
# decision_boundary(gradient_stochastic_1(xx, yy), 'g')
# decision_boundary(gradient_stochastic_2(xx, yy), 'b')
# decision_boundary(gradient_minibatch(xx, yy), 'y')
# decision_boundary(gradient_minibatch_random_1(xx, yy), 'k')
Example #16
def load_train_or_test_data(data_file,
                            args,
                            pp=True,
                            pca_obj=None,
                            srp_obj=None,
                            for_test=False):
    print('loading %s...' % data_file)
    if args.pca or args.sparse_random_projection:
        max_dim = 0
        include_offset = False
    else:
        max_dim = args.max_dimension
        include_offset = args.include_offset
    X, y, pp = load_data(data_file,
                         data_file.split('.')[-1],
                         max_dim=max_dim,
                         preprocess=pp,
                         include_offset=include_offset,
                         target_dim=args.target_dim)
    if args.pca and args.max_dimension > 0:
        print('performing PCA')
        if pca_obj is None:
            pca_comps = args.max_dimension
            if args.include_offset:
                pca_comps -= 1
            pca_obj = PCA(n_components=pca_comps).fit(X)
        X = pca_obj.transform(X)
        if args.include_offset:
            X = preprocessing.add_dummy_feature(X)
    if args.sparse_random_projection:
        print('performing sparse random projection')
        if srp_obj is None:
            if args.max_dimension > 0:
                n_components = args.max_dimension
                print(n_components * 10, X.shape[1])
                dense_output = n_components * 10 < X.shape[1]
            else:
                n_components = 'auto'
                dense_output = True  # not sure if this is a good idea...
            srp_obj = SparseRandomProjection(n_components=n_components,
                                             dense_output=dense_output,
                                             random_state=0).fit(X)
        X = srp_obj.transform(X)
        if args.include_offset:
            X = preprocessing.add_dummy_feature(X)
        if sp.issparse(X) and (X.nnz > np.prod(X.shape) / 3.0
                               or X.shape[1] <= 20):
            print("X is either low-dimensional or not very sparse, so "
                  "converting to a numpy array")
            X = X.toarray()
    # Z = sp.diags(y).dot(X)
    num_features = X.shape[1]
    print('%d total training data points of dimension %d' % X.shape)
    if sp.issparse(X):
        print('density =', float(X.nnz) / np.prod(X.shape))
    # split data further, if necessary
    split_size = 1e4 if for_test else 2e5
    num_splits = max(1, int(X.shape[0] / split_size + .5))
    if num_splits > 1:
        print('num splits =', num_splits)
    Xs = sparse_vsplit(X, num_splits)
    ys = sparse_vsplit(y, num_splits)
    return Xs, ys, pp, pca_obj, srp_obj
Example #17
if __name__ == '__main__':

    # Check user input
    if len(sys.argv) != 3:
        print('ERROR: USAGE: python <input.csv> <output.csv>')
        sys.exit(0)
    else:
        input_file = sys.argv[1]
        output_file = sys.argv[2]

    # Read in data
    raw_data = np.genfromtxt(input_file, delimiter=',')
    X_raw = raw_data[:, :-1]
    Y = raw_data[:, [-1]]

    # Scale Xs
    X_scaled = preprocessing.scale(X_raw)

    # Add in the intercept column
    X = preprocessing.add_dummy_feature(X_scaled)

    # Run gradient descent for each of the learning rates below (n_iter = 100) and write the results to the output file
    with open(output_file, 'w') as output:
        for alpha in [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 0.3]:
            n_iter = 100
            beta = GradientDescent(X, Y, alpha, n_iter)
            output.write(
                str(alpha) + ',' + str(n_iter) + ',%.3f' % beta[0] +
                ',%.3f' % beta[1] + ',%.3f' % beta[2] + '\n')
Example #18
from sklearn import preprocessing
import numpy as np

# Occasionally you need to create an arbitrary feature that does not exist in the data;
# let's use sklearn's preprocessing module to generate such a dummy feature.

x = [[0, 1], [3, 5]]
x2 = preprocessing.add_dummy_feature(x)
# add_dummy_feature ==> if value is omitted, a 1 is added at the front of each row.
print(x2)
'''
    [[1. 0. 1.]
     [1. 3. 5.]]
'''

x = [[0, 1, 2], [3, 4, 5]]
x2 = preprocessing.add_dummy_feature(x, 9)
print(x2)
'''
[[9. 0. 1. 2.]
 [9. 3. 4. 5.]]
'''

x3 = preprocessing.add_dummy_feature(x2, 7)
print(x3)
'''
[[7. 9. 0. 1. 2.]
 [7. 9. 3. 4. 5.]]
'''
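add_dummy_feature also accepts SciPy sparse input and preserves the sparse format (see the
CSR/COO tests in Examples #10 and #13); a minimal sketch, assuming scipy is installed:

import scipy.sparse as sp

xs = sp.csr_matrix([[0, 1], [3, 5]])
xs2 = preprocessing.add_dummy_feature(xs)   # the result is still a CSR matrix
print(xs2.toarray())
'''
[[1. 0. 1.]
 [1. 3. 5.]]
'''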
Example #19
print("Standardizing data")
data = [[0, 0], [0, 0], [1, 1], [1, 1]]
print("original data")
print(data)
scaler = StandardScaler()
scaler.fit(data)
print("Mean of the data")
print(scaler.mean_)
print("Standardized data")
print(scaler.transform(data))

print(" ")
le = preprocessing.LabelEncoder()
print("Labels:")
print(["paris", "paris", "tokyo", "amsterdam"])
le.fit(["paris", "paris", "tokyo", "amsterdam"])

print("Encodings for \n tokyo,amsterdam,paris::")
print(le.transform(["tokyo", "amsterdam", "paris"]) )

print("")
print("Adding dummy feature")


X = [[0, 1], [1, 1]]
print("Data:")
print(X)
print("adding dummy feature with value 5")
X = add_dummy_feature(X, value=5.0)
print(X)
Example #20
    else:
        return -1

# XTrain, XTest, YTrain, YTest = preprocess("../Databases/ionosphere.data",34, tranform_categorical)
# Preprocessing
print("READING DataBase....")
data = pd.read_csv("../Databases/ionosphere.data", header=None)  # read the data
target = pd.DataFrame(data[34])  # y
features = data.drop([34], axis=1)  # X

print("Preprocessing Data")
# transform categorical to discrete
y = target.applymap(tranform_categorical)
y.head()
# Add a column of 1's to X
features_p = add_dummy_feature(features)
x = pd.DataFrame(features_p)

sum_accu = 0
repeat = 100
for k in range(0, repeat):
    # Splitting between training and testing
    try:
        from sklearn.model_selection import train_test_split    # sklearn > ...
    except:
        from sklearn.cross_validation import train_test_split

    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.4)

    XTrain = X_train.values
    XTest = X_test.values
Example #21
    def predict(self, x, p_threshold=0.5):
        if self.fit_intercept and x.shape[1] < len(self.beta):
            x = add_dummy_feature(x)
        odds_threshold = p_threshold / (1.0 - p_threshold)
        return (np.exp(LogisticRegression.xbeta(x, self.beta)) >
                odds_threshold).astype(int)
Example #22
def load_data(path,
              file_type,
              max_data=0,
              max_dim=0,
              preprocess=True,
              include_offset=True):
    """Load data from a variety of file types.

    Parameters
    ----------
    path : string
        Data file path.

    file_type : string
        Supported file types are: 'svmlight', 'npy' (with the labels y in the
        rightmost col), 'npz', 'hdf5' (with datasets 'x' and 'y'), and 'csv'
        (with the labels y in the rightmost col)

    max_data : int
        If positive, maximum number of data points to use. If zero or negative,
        all data is used. Default is 0.

    max_dim : int
        If positive, maximum number of features to use. If zero or negative,
        all features are used. Default is 0.

    preprocess : boolean or Transformer, optional
        Flag indicating whether the data should be preprocessed. For sparse
        data, the features are scaled to [-1, 1]. For dense data, the features
        are scaled to have mean zero and variance one. Default is True.

    include_offset : boolean, optional
        Flag indicating that an offset feature should be added. Default is
        True.

    Returns
    -------
    X : array-like matrix, shape=(n_samples, n_features)

    y : int ndarray, shape=(n_samples,)
        Each entry indicates whether each example is negative (-1 value) or
        positive (+1 value)

    pp_obj : None or Transformer
        Transformer object used on data, or None if ``preprocess=False``
    """
    if not isinstance(path, str):
        raise ValueError("'path' must be a string")

    if file_type in ["svmlight", "svm"]:
        X, y = _load_svmlight_data(path)
    else:
        raise ValueError("unsupported file type, %s" % file_type)

    y_vals = set(y)
    if len(y_vals) != 2:
        raise ValueError('Only expected y to take on two values, but instead '
                         'takes on the values ' + ', '.join(map(str, y_vals)))
    if 1.0 not in y_vals:
        raise ValueError('y does not take on 1.0 as one of its values, but '
                         'instead takes on the values ' + ', '.join(map(str, y_vals)))
    if -1.0 not in y_vals:
        y_vals.remove(1.0)
        print('converting y values of %s to -1.0' % y_vals.pop())
        y[y != 1.0] = -1.0

    if preprocess is False:
        pp_obj = None
    else:
        if preprocess is True:
            if sp.issparse(X):
                pp_obj = preprocessing.MaxAbsScaler(copy=False)
            else:
                pp_obj = preprocessing.StandardScaler(copy=False)
            pp_obj.fit(X)
        else:
            pp_obj = preprocess
        X = pp_obj.transform(X)

    if include_offset:
        X = preprocessing.add_dummy_feature(X)
        X = np.flip(X, -1)  # reverse the column order so the intercept lands in the last column

    if sp.issparse(X) and (X.nnz > np.prod(X.shape) / 10 or X.shape[1] <= 20):
        print("X is either low-dimensional or not very sparse, so converting "
              "to a numpy array")
        X = X.toarray()
    if isinstance(max_data, int) and max_data > 0 and max_data < X.shape[0]:
        X = X[:max_data, :]
        y = y[:max_data]
    if isinstance(max_dim, int) and max_dim > 0 and max_dim < X.shape[1]:
        X = X[:, :max_dim]

    return X, y, pp_obj
Example #23
    def decision_function(self, X):
        if self.fit_intercept:
            X = add_dummy_feature(X)
        return safe_sparse_dot(X, self.coef_.T)
Example #24
    def _augment(self, X):
        # for polynomial nets, we add a single dummy column
        if self.fit_lower == 'augment':
            X = add_dummy_feature(X, value=1)
        return X
Example #25
def add_dummy_feature():
    x = [[0, 1], [2, 3]]

    print(preprocessing.add_dummy_feature(x))
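    # For the input above, this prints a leading column of ones prepended to each row:
    # [[1. 0. 1.]
    #  [1. 2. 3.]]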