def Encoding(data, general_matrix=None):
    encoder = LabelBinarizer()
    count = 0
    # Encode every string column in place. Iterate in reverse so that splicing
    # in the binarized columns does not shift the indices of the columns that
    # are still left to process.
    for i in reversed(range(data.shape[1])):
        if isinstance(data[0, i], str):
            count += 1
            col = data[:, i]
            unique = np.unique(col if general_matrix is None else general_matrix[:, i])

            encoder.fit(unique)
            new_col = encoder.transform(col)

            # split at i and i + 1, drop the original column, splice in the encoding
            before, removed, after = np.hsplit(data, [i, i + 1])
            data = np.concatenate((before, new_col, after), axis=1)

            if general_matrix is not None:
                before, removed, after = np.hsplit(general_matrix, [i, i + 1])
                general_matrix = np.concatenate(
                    (before, encoder.transform(general_matrix[:, i]), after), axis=1)

    print("count : %d" % count)
    return data
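# A minimal usage sketch (hypothetical data, assuming the function above and its
# numpy / sklearn imports live in the same module): the three-valued string
# column is replaced by three indicator columns, the numeric columns are kept.
import numpy as np
from sklearn.preprocessing import LabelBinarizer

X = np.array([[1, 'red', 10],
              [2, 'blue', 20],
              [3, 'green', 30]], dtype=object)
encoded = Encoding(X)
print(encoded.shape)  # (3, 5): 'blue'/'green'/'red' became three indicator columns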
Example No. 2
class BusinessCategoriesFeature(BaseEstimator):
	"""
	WARNING!!!
	Works only with a modified version of LabelBinarizer.

	A binarization of the reviews' business categories.
	"""

	def __init__(self, data=None):
		self.data = data

	def __create_labels_list(self, review_list):
		labels = []
		for review in review_list:
			business = self.data.get_business_for_review(review)
			labels.append(business['categories'])
		return labels

	def fit(self, X, y):
		self.binarizer = LabelBinarizer()
		labels = self.__create_labels_list(X)
		self.binarizer.fit(labels)
		return self

	def transform(self, X):
		labels = self.__create_labels_list(X)
		binarized_labels = self.binarizer.transform(labels)
		return binarized_labels.astype(float)
Example No. 3
def test_label_binarizer_multilabel():
    lb = LabelBinarizer()

    # test input as lists of tuples
    inp = [(2, 3), (1,), (1, 2)]
    indicator_mat = np.array([[0, 1, 1],
                              [1, 0, 0],
                              [1, 1, 0]])
    got = lb.fit_transform(inp)
    assert_array_equal(indicator_mat, got)
    assert_equal(lb.inverse_transform(got), inp)

    # test input as label indicator matrix
    lb.fit(indicator_mat)
    assert_array_equal(indicator_mat,
                       lb.inverse_transform(indicator_mat))

    # regression test for the two-class multilabel case
    lb = LabelBinarizer()

    inp = [[1, 0], [0], [1], [0, 1]]
    expected = np.array([[1, 1],
                         [1, 0],
                         [0, 1],
                         [1, 1]])
    got = lb.fit_transform(inp)
    assert_array_equal(expected, got)
    assert_equal([set(x) for x in lb.inverse_transform(got)],
                 [set(x) for x in inp])
Example No. 4
def display_image_predictions(features, labels, predictions):
    n_classes = 10
    label_names = _load_label_names()
    label_binarizer = LabelBinarizer()
    label_binarizer.fit(range(n_classes))
    label_ids = label_binarizer.inverse_transform(np.array(labels))

    fig, axes = plt.subplots(nrows=4, ncols=2)
    fig.tight_layout()
    fig.suptitle('Softmax Predictions', fontsize=20, y=1.1)

    n_predictions = 3
    margin = 0.05
    ind = np.arange(n_predictions)
    width = (1. - 2. * margin) / n_predictions

    for image_i, (feature, label_id, pred_indices, pred_values) in enumerate(zip(features, label_ids, predictions.indices, predictions.values)):
        pred_names = [label_names[pred_i] for pred_i in pred_indices]
        correct_name = label_names[label_id]

        axes[image_i][0].imshow(feature*255)
        axes[image_i][0].set_title(correct_name)
        axes[image_i][0].set_axis_off()

        axes[image_i][1].barh(ind + margin, pred_values[::-1], width)
        axes[image_i][1].set_yticks(ind + margin)
        axes[image_i][1].set_yticklabels(pred_names[::-1])
        axes[image_i][1].set_xticks([0, 0.5, 1.0])
Example No. 5
class BaseSGD(object):
    def _get_loss(self):
        losses = {
            "modified_huber": ModifiedHuber(),
            "hinge": Hinge(1.0),
            "perceptron": Hinge(0.0),
            "log": Log(),
            "sparse_log": SparseLog(),
            "squared": SquaredLoss(),
            "huber": Huber(self.epsilon),
            "epsilon_insensitive": EpsilonInsensitive(self.epsilon),
        }
        return losses[self.loss]

    def _get_learning_rate(self):
        learning_rates = {"constant": 1, "pegasos": 2, "invscaling": 3}
        return learning_rates[self.learning_rate]

    def _set_label_transformers(self, y):
        if self.multiclass == "natural":
            self.label_encoder_ = LabelEncoder()
            y = self.label_encoder_.fit_transform(y).astype(np.float64)

        self.label_binarizer_ = LabelBinarizer(neg_label=-1, pos_label=1)
        self.label_binarizer_.fit(y)
        self.classes_ = self.label_binarizer_.classes_.astype(np.int32)
        n_classes = len(self.label_binarizer_.classes_)
        n_vectors = 1 if n_classes <= 2 else n_classes
        return n_classes, n_vectors
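# A standalone sketch (not part of the class above) of why n_vectors is 1 for
# binary problems: with neg_label=-1 / pos_label=1, LabelBinarizer returns a
# single -1/+1 column for two classes and one column per class otherwise.
from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer(neg_label=-1, pos_label=1)
print(lb.fit_transform([0, 1, 1, 0]).shape)  # (4, 1) -> one weight vector
print(lb.fit_transform([0, 1, 2, 1]).shape)  # (4, 3) -> one vector per class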
Example No. 6
def get_abalone19():
    """Loads abalone dataset, maps gender feature to binary features, adds
    new label to create abalone19 imbalanced binary classification dataset."""
    raw_data = pd.read_csv(ABALONE_FILE, sep=',')
    genders = list(raw_data.loc[:, 'gender'])
    cts_data = raw_data.drop(labels='gender', axis=1)

    # initialize & fit preprocesser
    lbz = LabelBinarizer()
    lbz.fit(genders)

    # encode categorical var
    encoded_genders = pd.DataFrame(lbz.transform(genders))
    encoded_genders.columns = ['gender_' + k for k in lbz.classes_]

    # recombine encoded data & return
    new_data = pd.concat(objs=[encoded_genders, cts_data], axis=1)
    new_data['label'] = raw_data['rings'].map(
        lambda k: 1 if k > 10 else 0)               # binary clf task
    new_data = new_data.drop('rings', axis=1)

    # standardize cts features
    if STANDARDIZE:
        for col in new_data.iloc[:, 3:-1]:
            mean = new_data[col].mean()
            std = new_data[col].std()
            new_data[col] = new_data[col].map(lambda k: (k - mean) / float(std))

    pos_recs = new_data['label'].sum()
    print('total pos class pct = {} %\n'.format(
        round(100 * pos_recs / float(len(new_data)), 3)))

    return new_data
Example No. 7
    def fit(self, X, y):
        X = np.array(X)
        y = np.array(y)
        samples, self.n_features = X.shape

        # because our space of targets are discrete
        lb = LabelBinarizer()
        lb.fit(y)
        self.classes = lb.classes_
        self.n_class = self.classes.size

        self.class_prior = np.zeros(self.n_class, dtype=np.float64)
        self.feature_proba = []

        for i, y_i in enumerate(self.classes):
            # get Xs only for y_i class
            X_yi = X[y == y_i]
            class_count = X_yi[:, 0].size
            self.class_prior[i] = np.float64(class_count) / samples

            count_all_features = 0
            all_features = np.zeros(self.n_features)
            for sample_features in X_yi:
                # accumulate features according to our algorithm
                all_features, count_all_features = self._add_features_dens(
                    sample_features, all_features, count_all_features)

            # calculate probabilities according to our algorithm
            self.feature_proba.append(
                self._compute_proba(all_features, count_all_features))

        return self
Example No. 8
def one_hot_encoding(y_train, y_test):
    labelBinarizer = LabelBinarizer()
    labelBinarizer.fit(y_train)

    y_train_one_hot = labelBinarizer.transform(y_train)
    y_test_one_hot = labelBinarizer.transform(y_test)
    return y_train_one_hot, y_test_one_hot
Example No. 9
def train_logreg(X, y, test_X, test_y, load_vec=True):
	""" 	
	Trains logistic regression on the feature set.
	"""
	full_y = y + test_y
	
	lb = LabelBinarizer()
	lb.fit(full_y)
	# Convert into 1-D array
	print(len(X), len(test_X))
	model = LogisticRegression()
	big_X = X + test_X

	features = featurize(big_X)
	X, test_X = features[:4500], features[4500:]
	print(X.shape, X)

	model.fit(X, y)

	y_pred = model.predict(X)
	print(set(y_pred))
	print(metrics.classification_report(y, y_pred, digits=3))
	y_pred = model.predict(test_X)
	print(set(y_pred))
	print(metrics.classification_report(test_y, y_pred, digits=3))
Example No. 10
def logloss(act, pred):
    epsilon = 10 ** -15
    pred = np.maximum(np.minimum(pred, 1 - epsilon), epsilon)
    lb = LabelBinarizer()
    lb.fit(act)
    act_binary = lb.transform(act)
    logloss = - np.sum(np.multiply(act_binary, np.log(pred))) / pred.shape[0]
    return logloss
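# A minimal multiclass usage sketch (hypothetical values, assuming the function
# above is importable): `pred` holds one probability column per class, ordered
# like the sorted unique values of `act`.
import numpy as np

act = [0, 1, 2]
pred = np.array([[0.8, 0.1, 0.1],
                 [0.2, 0.7, 0.1],
                 [0.1, 0.2, 0.7]])
print(logloss(act, pred))  # mean cross-entropy of the three rows (~0.31)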
Example No. 11
    def fit(self, Xt, yt, Xh, yh, callback=None):
        lbin = LabelBinarizer()
        lbin.fit(yt)
        Yt_multi = lbin.transform(yt)
        Yh_multi = lbin.transform(yh)
        sample_weight_train = np.ones(Xt.shape[0])
        sample_weight_test = np.ones(Xh.shape[0])


        if Yt_multi.shape[1] == 1:
            Yt_multi = np.hstack([1 - Yt_multi, Yt_multi])
            Yh_multi = np.hstack([1 - Yh_multi, Yh_multi])
            print('warning: only two classes detected')

        n_classes = Yt_multi.shape[1]
        n_features = Xt.shape[1]

        if self.alpha0 is None:
            self.alpha0 = np.zeros(n_classes * n_features)
        # if not np.all(np.unique(yt) == np.array([-1, 1])):
        #     raise ValueError
        x0 = np.zeros(n_features * n_classes)

        # assert x0.size == self.alpha0.size

        def h_func_grad(x, alpha):
            # x = x.reshape((-1,Yt_multi.shape[1]))
            return _multinomial_loss_grad(
                x, Xt, Yt_multi, np.exp(alpha), sample_weight_train)[:2]

        def h_hessian(x, alpha):
            # x = x.reshape((-1,Yt_multi.shape[1]))
            return _multinomial_grad_hess(
                x, Xt, Yt_multi, np.exp(alpha), sample_weight_train)[1]

        def g_func_grad(x, alpha):
            # x = x.reshape((-1,Yt_multi.shape[1]))
            return _multinomial_loss_grad(
                x, Xh, Yh_multi, np.zeros(alpha.size),
                sample_weight_test)[:2]

        def h_crossed(x, alpha):
            # return x.reshape((n_classes, -1)) * alpha
            # x = x.reshape((-1,Yt_multi.shape[1]))
            tmp = np.exp(alpha) * x
            return sparse.dia_matrix(
                (tmp, 0),
                shape=(n_features * n_classes, n_features * n_classes))

        opt = hoag_lbfgs(
            h_func_grad, h_hessian, h_crossed, g_func_grad, x0,
            callback=callback,
            tolerance_decrease=self.tolerance_decrease,
            lambda0=self.alpha0, maxiter=self.max_iter,
            verbose=self.verbose)

        self.coef_ = opt[0]
        self.alpha_ = opt[1]
        return self
Example No. 12
    def __init__(
        self,
        train_file,
        test_file,
        batch_size=32,
        embedding_size=20,
        max_norm=40,
        lr=0.01,
        num_hops=3,
        adj_weight_tying=True,
        linear_start=True,
        **kwargs
    ):
        train_lines, test_lines = self.get_lines(train_file), self.get_lines(test_file)
        lines = np.concatenate([train_lines, test_lines], axis=0)
        vocab, word_to_idx, idx_to_word, max_seqlen, max_sentlen = self.get_vocab(lines)

        self.data = {"train": {}, "test": {}}
        S_train, self.data["train"]["C"], self.data["train"]["Q"], self.data["train"]["Y"] = self.process_dataset(
            train_lines, word_to_idx, max_sentlen, offset=0
        )
        S_test, self.data["test"]["C"], self.data["test"]["Q"], self.data["test"]["Y"] = self.process_dataset(
            test_lines, word_to_idx, max_sentlen, offset=len(S_train)
        )
        S = np.concatenate([np.zeros((1, max_sentlen), dtype=np.int32), S_train, S_test], axis=0)
        for i in range(10):
            for k in ["C", "Q", "Y"]:
                print k, self.data["test"][k][i]
        print "batch_size:", batch_size, "max_seqlen:", max_seqlen, "max_sentlen:", max_sentlen
        print "sentences:", S.shape
        print "vocab:", len(vocab), vocab
        for d in ["train", "test"]:
            print d,
            for k in ["C", "Q", "Y"]:
                print k, self.data[d][k].shape,
            print ""

        lb = LabelBinarizer()
        lb.fit(list(vocab))
        vocab = lb.classes_.tolist()

        self.batch_size = batch_size
        self.max_seqlen = max_seqlen
        self.max_sentlen = max_sentlen
        self.embedding_size = embedding_size
        self.num_classes = len(vocab) + 1
        self.vocab = vocab
        self.adj_weight_tying = adj_weight_tying
        self.num_hops = num_hops
        self.lb = lb
        self.init_lr = lr
        self.lr = self.init_lr
        self.max_norm = max_norm
        self.S = S
        self.idx_to_word = idx_to_word
        self.nonlinearity = None if linear_start else lasagne.nonlinearities.softmax

        self.build_network(self.nonlinearity)
Example No. 13
    def load_dataset2(self):
        X, y, X_test, y_test = dataset = snippet_reader.toNumpy()
        X, y = shuffle(X, y)

        lb = LabelBinarizer()
        lb.fit(y)

        for y_bin in lb.transform(y).T:
            return X, y_bin
Example No. 14
class BaseClassifier(BaseEstimator):

    def predict_proba(self, X):
        if len(self.classes_) != 2:
            raise NotImplementedError("predict_(log_)proba only supported"
                                      " for binary classification")

        if self.loss == "log":
            df = self.decision_function(X).ravel()
            prob = 1.0 / (1.0 + np.exp(-df))
        elif self.loss == "modified_huber":
            df = self.decision_function(X).ravel()
            prob = np.minimum(1, np.maximum(-1, df))
            prob += 1
            prob /= 2
        else:
            raise NotImplementedError("predict_(log_)proba only supported when"
                                      " loss='log' or loss='modified_huber' "
                                      "(%s given)" % self.loss)

        out = np.zeros((X.shape[0], 2), dtype=np.float64)
        out[:, 1] = prob
        out[:, 0] = 1 - prob

        return out

    def _set_label_transformers(self, y, reencode=False, neg_label=-1):
        if reencode:
            self.label_encoder_ = LabelEncoder()
            y = self.label_encoder_.fit_transform(y).astype(np.int32)
        else:
            y = y.astype(np.int32)

        self.label_binarizer_ = LabelBinarizer(neg_label=neg_label,
                                               pos_label=1)
        self.label_binarizer_.fit(y)
        self.classes_ = self.label_binarizer_.classes_.astype(np.int32)
        n_classes = len(self.label_binarizer_.classes_)
        n_vectors = 1 if n_classes <= 2 else n_classes

        return y, n_classes, n_vectors

    def decision_function(self, X):
        pred = safe_sparse_dot(X, self.coef_.T)
        if hasattr(self, "intercept_"):
            pred += self.intercept_
        return pred

    def predict(self, X):
        pred = self.decision_function(X)
        out = self.label_binarizer_.inverse_transform(pred)

        if hasattr(self, "label_encoder_"):
            out = self.label_encoder_.inverse_transform(out)

        return out
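# A standalone sketch (not part of the class above) of the trick predict() relies
# on: for a multiclass problem, LabelBinarizer.inverse_transform maps a matrix of
# decision scores back to class labels by taking the per-row argmax.
import numpy as np
from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer(neg_label=-1, pos_label=1)
lb.fit([0, 1, 2])
scores = np.array([[ 2.1, -0.3, -1.8],
                   [-1.0,  0.2,  1.5]])
print(lb.inverse_transform(scores))  # [0 2]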
Example No. 15
    def fit(self, X, y=None):
        if not isinstance(X, pd.DataFrame):
            raise RuntimeError("Only works with DataFrames.  Got {}".format(X.__class__))

        self.binarizers_ = []
        for col in X.columns:
            binarizer = LabelBinarizer(self.neg_label, self.pos_label)
            binarizer.fit(X[col].values)
            self.binarizers_.append((col, binarizer))
        return self
 def encode(self, data, label, value_set=None):
     le = LabelBinarizer()
     if value_set is None:
         encoded = le.fit_transform(data[label])
     else:
         le.fit(value_set)
         encoded = le.transform(data[label])
     for i in range(encoded.shape[1]):
         new_label = '{0}_is_{1}'.format(label, i)
         data[new_label] = encoded[:,i]
Example No. 17
class _CategoricalEncoder:
    """OneHotEncoder that can handle categorical variables."""

    def __init__(self):
        """Convert labeled categories into one-hot encoded features."""
        self._lb = LabelBinarizer()

    def fit(self, X):
        """Fit a list or array of categories.

        Parameters
        ----------
        * `X` [array-like, shape=(n_categories,)]:
            List of categories.
        """
        self.mapping_ = {v: i for i, v in enumerate(X)}
        self.inverse_mapping_ = {i: v for v, i in self.mapping_.items()}
        self._lb.fit([self.mapping_[v] for v in X])
        self.n_classes = len(self._lb.classes_)

        return self

    def transform(self, X):
        """Transform an array of categories to a one-hot encoded representation.

        Parameters
        ----------
        * `X` [array-like, shape=(n_samples,)]:
            List of categories.

        Returns
        -------
        * `Xt` [array-like, shape=(n_samples, n_categories)]:
            The one-hot encoded categories.
        """
        return self._lb.transform([self.mapping_[v] for v in X])

    def inverse_transform(self, Xt):
        """Inverse transform one-hot encoded categories back to their original
           representation.

        Parameters
        ----------
        * `Xt` [array-like, shape=(n_samples, n_categories)]:
            One-hot encoded categories.

        Returns
        -------
        * `X` [array-like, shape=(n_samples,)]:
            The original categories.
        """
        Xt = np.asarray(Xt)
        return [
            self.inverse_mapping_[i] for i in self._lb.inverse_transform(Xt)
        ]
Example No. 18
class PipelineLabelBinarizer(TransformerMixin):

    def __init__(self, *args, **kwargs):
        self.encoder = LabelBinarizer(*args, **kwargs)

    def fit(self, x, y=None):
        self.encoder.fit(x)
        return self

    def transform(self, x, y=None):
        return self.encoder.transform(x)
Example No. 19
def ndcg_score(ground_truth, predictions, k=5):
    lb = LabelBinarizer()
    lb.fit(range(len(predictions) + 1))
    T = lb.transform(ground_truth)
    scores = []
    for y_true, y_score in zip(T, predictions):
        actual = dcg_score(y_true, y_score, k)
        best = dcg_score(y_true, y_true, k)
        score = float(actual) / float(best)
        scores.append(score)
    return np.mean(scores)
Example No. 20
def one_hot_encode(x):
    """
    One hot encode a list of sample labels. Return a one-hot encoded vector for each label.
    : x: List of sample Labels
    : return: Numpy array of one-hot encoded labels
    """
    # TODO: Implement Function
    labels=list(range(10))
    lb = LabelBinarizer()
    lb.fit(labels)
    return np.array(lb.transform(x))
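# A minimal usage sketch (assuming the function above is importable): labels 0-9
# are mapped to 10-dimensional one-hot rows.
print(one_hot_encode([0, 3, 9]))
# [[1 0 0 0 0 0 0 0 0 0]
#  [0 0 0 1 0 0 0 0 0 0]
#  [0 0 0 0 0 0 0 0 0 1]]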
def partb():
    def load(file_name):
        file = np.load(file_name)
        X_train =file['X_train'].T
        y_train =file['y_train']
        X_test =file['X_test'].T
        y_test =file['y_test']
        X_cv =file['X_cv'].T
        y_cv =file['y_cv']

        return X_train,y_train,X_cv,y_cv,X_test,y_test

    train_ = [0,0]
    test_ = [0,0]
    overall = []
    for i in range(14):

        X_train,y_train,X_cv,y_cv,X_test,y_test = load('pofa{}.npz'.format(i))

        from sklearn.preprocessing import LabelBinarizer
        binarizer = LabelBinarizer()
        binarizer.fit(y_train)
        Y_train = binarizer.transform(y_train).T
        Y_cv = binarizer.transform(y_cv).T


#nn.forward(X)
#nn.backprop(X,Y,gradient_check=True)

        print(X_train.shape[0], Y_train.shape[0])
        nn = NeuralNetwork([X_train.shape[0],30,Y_train.shape[0]], functions=[sigmoid,softmax], derivatives=[derivative_sigmoid])

        nn.fit(X_train,Y_train,eta=0.01,momentum=0.5,minibatch=16,regularizer=0.15,max_iter=200,gradient_check=False,cv = (X_cv,Y_cv),graphs=False, lbfgs=False)

        output = nn.forward(X_train)

        y_train_output = binarizer.inverse_transform(output.T)
        y_test_output = binarizer.inverse_transform(nn.forward(X_test).T)
        print("Iteration: ",i)
        print((y_train_output==y_train).mean())
        print((y_test_output ==y_test).mean())

        overall.append((y_test == y_test_output).mean())

        train_[0] += (y_train_output==y_train).sum()
        train_[1] += y_train.shape[0]
        test_[0] += (y_test_output==y_test).sum()
        test_[1] += y_test.shape[0]

    print("Average train accuracy: ", train_[0]/train_[1],"Average test accuracy: ",test_[0]/test_[1])
    print(train_,test_)
    overall = np.array(overall)
    print(overall.mean())
Example No. 22
def test_labelbinarizer_vs_sklearn():
    # Compare msmbuilder.preprocessing.LabelBinarizer
    # with sklearn.preprocessing.LabelBinarizer

    labelbinarizerr = LabelBinarizerR()
    labelbinarizerr.fit(np.concatenate(labels))

    labelbinarizer = LabelBinarizer()
    labelbinarizer.fit(labels)

    y_ref1 = labelbinarizerr.transform(labels[0])
    y1 = labelbinarizer.transform(labels)[0]

    np.testing.assert_array_almost_equal(y_ref1, y1)
Example No. 23
class ELM(BaseEstimator):

    def __init__(self, h=60, activation='linear', random_state=None, C=100):
        self.name = 'elm'
        self.h = h
        self.activation = activation
        self.random_state = random_state
        self.C = C

        assert self.activation in ['rbf', 'sigmoid', 'linear']

    def fit(self, X, y):

        if self.random_state is None:
            self.random_state = np.random.RandomState(np.random.randint(0, np.iinfo(np.int32).max))
        elif type(self.random_state) == int:
            self.random_state = np.random.RandomState(self.random_state)

        self.lb = LabelBinarizer()
        self.W = self.random_state.normal(size=(X.shape[1], self.h))
        self.B = self.random_state.normal(size=self.h)

        if self.activation == 'rbf':
            H = _elm_vectorized_rbf(X, self.W, self.B)
        elif self.activation == 'sigmoid':
            H = _elm_sigmoid(X, self.W, self.B)
        else :
            H = X.dot(self.W)

        self.lb.fit(y)

        lam = np.eye(H.shape[1]) * (1./self.C)
        H_inv = np.linalg.inv(H.T.dot(H) + lam)
        self.beta = H_inv.dot(H.T.dot(self.lb.transform(y)))

        return self


    def decision_function(self, X):
        if self.activation == 'rbf':
            return _elm_vectorized_rbf(X, self.W, self.B).dot(self.beta)
        elif self.activation == 'sigmoid':
            return _elm_sigmoid(X, self.W, self.B).dot(self.beta)
        else :
            return X.dot(self.W).dot(self.beta)


    def predict(self, X):
        return self.lb.inverse_transform(self.decision_function(X))
Example No. 24
    def fit(self, train_data):
        X_data, y_data = train_data
        self.learned = []

        lb = LabelBinarizer()
        lb.fit(y_data)
        self.lb = lb

        # We binarize the label and build a classifier for each case.
        # Thus, the number of iterations will be same as the number of the
        # classes.
        for y_bin_data in lb.transform(y_data).T:
            bin_train_data = [X_data, y_bin_data]
            params = self.fit_binary(bin_train_data)
            self.learned.append(params)
Example No. 25
    def load_dataset(self):
        X, y, X_test, y_test = dataset = snippet_reader.toNumpy()

        lb = LabelBinarizer()
        lb.fit(y)

        for y_bin in lb.transform(y).T:
            y = y_bin
            break

        for y_bin in lb.transform(y_test).T:
            y_test = y_bin
            break

        return X, y, X_test, y_test
Example No. 26
def load_data():
    labels=pd.read_csv("train.csv")
    bismatch=pd.read_csv("train_photo_to_biz_ids.csv")
    labels=bismatch.merge(labels,how='left',on='business_id')
    labels=labels[pd.isnull(labels['labels'])==False]
    labels['labels']=labels['labels'].map(lambda x:[int(i) for i in x.split(" ")])
    training_=os.listdir("train_photos/train244")
    train_ids=pd.DataFrame({"photo_id":[int(i.split(".")[0]) for i in training_]})
    train_ids=train_ids.merge(labels,on='photo_id',how='inner')
#    val_ids=val_ids.merge(labels,on='photo_id',how='inner')
    mlb=LabelBinarizer()
    mlb.fit(train_ids['business_id'].tolist())
#    X_train=np.array([imread('train_photos/train244/'+str(f_)+".jpg") for f_ in train_ids['photo_id'].tolist()]).astype(np.float32)
#    X_test=np.array([imread('train_photos/val244/'+str(f_)+".jpg") for f_ in val_ids['photo_id'].tolist()]).astype(np.float32)
    return train_ids,mlb
Example No. 27
def ndcg_score(truth, pred):

    lb = LabelBinarizer()
    lb.fit(range(len(pred) + 1))
    T = lb.transform(truth)

    scores = []

    for y_true, y_score in zip(T, pred):
        actual = dcg_score(y_true, y_score)
        best = dcg_score(y_true, y_true)
        score = float(actual) / float(best)
        scores.append(score)

    return np.mean(scores)
def ndcg_score(ground_truth, predictions, k=5):
    """Normalized discounted cumulative gain (NDCG) at rank K.

    Normalized Discounted Cumulative Gain (NDCG) measures the performance of a
    recommendation system based on the graded relevance of the recommended
    entities. It varies from 0.0 to 1.0, with 1.0 representing the ideal
    ranking of the entities.

    Parameters
    ----------
    ground_truth : array, shape = [n_samples]
        Ground truth (true labels represented as integers).
    predictions : array, shape = [n_samples, n_classes]
        Predicted probabilities.
    k : int
        Rank.

    Returns
    -------
    score : float

    Example
    -------
    >>> ground_truth = [1, 0, 2]
    >>> predictions = [[0.15, 0.55, 0.2], [0.7, 0.2, 0.1], [0.06, 0.04, 0.9]]
    >>> score = ndcg_score(ground_truth, predictions, k=2)
    1.0
    >>> predictions = [[0.9, 0.5, 0.8], [0.7, 0.2, 0.1], [0.06, 0.04, 0.9]]
    >>> score = ndcg_score(ground_truth, predictions, k=2)
    0.6666666666
    """
    lb = LabelBinarizer()
    # lb.fit(range(len(predictions) + 1))
    # lb.fit(range(predictions.shape[1] + 1))
    lb.fit(range(13))
    T = lb.transform(ground_truth)

    scores = []

    # Iterate over each y_true and compute the DCG score
    for y_true, y_score in zip(T, predictions):
        actual = dcg_score(y_true, y_score, k)
        best = dcg_score(y_true, y_true, k)
        score = float(actual) / max(0.000001, float(best))
        scores.append(score)

    return np.mean(scores)
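# The two ndcg_score variants above call dcg_score(), which is not shown on this
# page. A common formulation (an assumption about the helper, not necessarily the
# original code) is:
import numpy as np

def dcg_score(y_true, y_score, k=5):
    """Discounted cumulative gain at rank k for one sample."""
    order = np.argsort(y_score)[::-1]       # rank items by predicted score
    gains = np.take(y_true, order[:k])      # relevance of the top-k items
    discounts = np.log2(np.arange(len(gains)) + 2)
    return np.sum(gains / discounts)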
Example No. 29
class AlchemyDiscretizer(BaseEstimator, TransformerMixin):
	def __init__(self, useful_features = None):
		useful_features = useful_features or ['recreation', 'business', 'sports', 'unknown', 'arts_entertainment', 'computer_internet', 'health', 'culture_politics', 'science_technology', 'religion', 'gaming', 'law_crime', 'weather']
		self.useful_features = useful_features
		self.alchemy_category_labeler_ = LabelBinarizer()
	def fit(self, X, y = None):
		"""X: pd.DataFrame
		"""
		self.alchemy_category_labeler_.fit(X.alchemy_category.tolist())
		return self		
	def transform(self, X):
		encoded_X = self.alchemy_category_labeler_.transform(X.alchemy_category.tolist())
		result_X = X.copy()
		for i,f in enumerate(self.alchemy_category_labeler_.classes_):
			if self.useful_features is None or f in self.useful_features:
				result_X[f] = encoded_X[:, i]
		return result_X[result_X.columns.difference(['alchemy_category'])]
Example No. 30
 def _get_child_predict(self, clf, X, index=None):
     if self.stack_by_proba and hasattr(clf, 'predict_proba'):
         if self.save_stage0 and index is not None:
             proba = util.saving_predict_proba(clf, X, index)
         else:
             proba = clf.predict_proba(X)
         return proba[:, 1:]
     elif hasattr(clf, 'predict'):
         predict_result = clf.predict(X)
         if isinstance(clf, ClassifierMixin):
             lb = LabelBinarizer()
             lb.fit(predict_result)
             return lb.fit_transform(predict_result)
         else:
             return predict_result.reshape((predict_result.size, 1))
     else:
         return clf.fit_transform(X)
Example No. 31
    from sklearn.svm import SVC, LinearSVC
    from sklearn import tree
    from sklearn import cluster
    from sklearn.preprocessing import LabelBinarizer, LabelEncoder
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import confusion_matrix as conf_mat
    from viz import *
    import config as C
    import testing as T

    classes = list(vh[19])
    classes.sort()
    X_val, y_val = collect_class_vs(vh[19])
    lb = LabelBinarizer()
    le = LabelEncoder()
    lb.fit(classes)
    le.fit(classes)
    y_val_enc = le.transform(y_val)
    Y_val = lb.transform(y_val)


    ##load test vectors / generate test vectors using CNN:
    cnn_name = 'epoch_19.model'
    oname = cnn_name.split('.')[0]+'_test_pred'
    ofile = os.path.join(C.obj_dir,oname)
    #T.save_model_predictions(os.path.join(C.model_dir,cnn_name), tdir=C.test_dir, 
    #                        ofile=ofile)
    test = T.load_obj(ofile)
    X_test, y_test = collect_class_vs(test)
    y_test_enc = le.transform(y_test)
Example No. 32
class OneHotVector(object):
    def __init__(self, chars: list, added: list = []):
        chars.extend(added)
        if not chars or type(chars) is not list or len(chars) == 0:
            raise Exception('values must be list and len(values)>0 %s' % chars)

        self.chars = chars
        self.encoder = LabelBinarizer(neg_label=0,
                                      pos_label=1,
                                      sparse_output=False)
        self.encoder.fit(chars)

    @property
    def classes(self):
        return self.encoder.classes_

    def __len__(self):
        return self.encoder.classes_.shape[0]

    @property
    def size(self):
        return self.encoder.classes_.shape[0]

    def __repr__(self):
        return '%s(len:%s)' % (self.__class__.__name__, self.__len__())

    def to_vector(self, char: str) -> np.ndarray:
        """
        
        :param char: character. len(c)==1
        :return:
        """
        return self.encoder.transform([char])[0]

    def to_vectors(self, chars: list) -> np.ndarray:
        """
        
        :param chars: list of characters. len(chars)>0
        :return:
        """
        if type(chars) is str or type(chars) is np.str_:
            chars = [c for c in chars]
        return self.encoder.transform(chars)

    def to_value(self, vector: np.ndarray) -> np.ndarray:
        """
        
        :param vector: one hot vector
        :return: 
        """
        if vector.ndim != 1:
            vector = vector.flatten()
        return self.encoder.inverse_transform(np.array([vector]))[0]
        # if not ch or ch == '':
        #     return ' '
        # else:
        #     return ch

    def to_values(self, vectors: np.ndarray) -> np.ndarray:
        """

        :param vectors: list of one hot vector 
        :return: 
        """
        if vectors.ndim != 2:
            vectors = vectors.reshape((len(vectors) // self.size, self.size))
        return ''.join(self.encoder.inverse_transform(vectors))

    def to_index(self, c: str) -> int:
        return np.argmax(self.to_vector(c))

    def index2value(self, index):
        if 0 <= index < len(self.chars):
            return self.classes[index]
        else:
            return ''
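# A minimal usage sketch (hypothetical character set, assuming the class above is
# importable):
ohv = OneHotVector(list('abc'))
v = ohv.to_vector('b')
print(v)                  # [0 1 0]
print(ohv.to_value(v))    # 'b'
print(ohv.to_index('c'))  # 2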
Example No. 33
def multiclass_roc_auc_score(y_test, y_pred, average="macro"):
    lb = LabelBinarizer()
    lb.fit(y_test)
    y_test = lb.transform(y_test)
    y_pred = lb.transform(y_pred)
    return roc_auc_score(y_test, y_pred, average=average)
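# A minimal usage sketch (hypothetical hard predictions for a three-class problem,
# assuming the function above and roc_auc_score are importable):
y_true = [0, 1, 2, 2, 1]
y_hat = [0, 2, 2, 2, 1]
print(multiclass_roc_auc_score(y_true, y_hat))  # macro-averaged one-vs-rest AUC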
def main():
    NUM_BRANDS = 4004
    NUM_CATEGORIES = 1001
    NAME_MIN_DF = 10
    MAX_FEATURES_ITEM_DESCRIPTION = 39000

    if FLAGS.file_path.endswith('.tsv'):
        dat = pd.read_table(FLAGS.file_path, engine='c')
    else:
        dat = pd.read_table(FLAGS.file_path, sep=',', engine='python')

    start_time = time.time()

    handle_missing_inplace(dat)
    print('[{}] Finished handling missing values'.format(time.time() - start_time))

    cutting(dat)
    print('[{}] Finished cutting'.format(time.time() - start_time))

    to_categorical(dat)
    print('[{}] Finished converting to categorical'.format(time.time() -
                                                           start_time))

    if not FLAGS.is_training:
        with open(FLAGS.save_path + '/cv_name_save.pkl', 'rb') as pickle_in:
            cv_name = pickle.load(pickle_in)
        with open(FLAGS.save_path + '/cv_category_save.pkl',
                  'rb') as pickle_in:
            cv_category = pickle.load(pickle_in)
        with open(FLAGS.save_path + '/tv_desc_save.pkl', 'rb') as pickle_in:
            tv_desc = pickle.load(pickle_in)
        with open(FLAGS.save_path + '/lb_brand_save.pkl', 'rb') as pickle_in:
            lb_brand = pickle.load(pickle_in)
    else:
        cv_name = CountVectorizer(min_df=NAME_MIN_DF)
        cv_name.fit(dat['name'])

        cv_category = CountVectorizer()
        cv_category.fit(dat['category_name'])

        tv_desc = TfidfVectorizer(max_features=MAX_FEATURES_ITEM_DESCRIPTION,
                                  ngram_range=(1, 3),
                                  stop_words='english')
        tv_desc.fit(dat['item_description'])

        lb_brand = LabelBinarizer(sparse_output=True)
        lb_brand.fit(dat['brand_name'])

    X_name = cv_name.transform(dat['name'])
    print('[{}] Finished count vectorize `name`'.format(time.time() -
                                                        start_time))

    X_category = cv_category.transform(dat['category_name'])
    print('[{}] Finished count vectorize `category_name`'.format(time.time() -
                                                                 start_time))

    X_description = tv_desc.transform(dat['item_description'])
    print(
        '[{}] Finished TFIDF vectorize `item_description`'.format(time.time() -
                                                                  start_time))

    X_brand = lb_brand.transform(dat['brand_name'])
    print('[{}] Finished label binarize `brand_name`'.format(time.time() -
                                                             start_time))

    X_dummies = csr_matrix(
        pd.get_dummies(dat[['item_condition_id', 'shipping']],
                       sparse=True).values)
    print('[{}] Finished getting dummies on `item_condition_id` and `shipping`'.
          format(time.time() - start_time))

    sparse_dat = hstack(
        (X_dummies, X_description, X_brand, X_category, X_name)).tocsr()
    print('[{}] Finished creating sparse dat'.format(time.time() -
                                                     start_time))

    ## may as well get the pickle for price here
    price_label = dat['price']

    if FLAGS.is_training:
        pickle.dump(sparse_dat, open(FLAGS.save_path + '/sparse_mat.pkl',
                                     'wb'))
        pickle.dump(price_label, open(FLAGS.save_path + '/label.pkl', 'wb'))

    ## save the trained vectorizers for future cleaning (e.g. the validation set)

    if FLAGS.is_training:
        pickle.dump(sparse_dat,
                    open(FLAGS.save_path + '/sparse_mat_val.pkl', 'wb'))
        pickle.dump(price_label, open(FLAGS.save_path + '/label_val.pkl',
                                      'wb'))
        pickle.dump(cv_name, open(FLAGS.save_path + '/cv_name_save.pkl', 'wb'))
        pickle.dump(cv_category,
                    open(FLAGS.save_path + '/cv_category_save.pkl', 'wb'))
        pickle.dump(tv_desc, open(FLAGS.save_path + '/tv_desc_save.pkl', 'wb'))
        pickle.dump(lb_brand, open(FLAGS.save_path + '/lb_brand_save.pkl',
                                   'wb'))

    print('Done!')
Example No. 35
# Prepare data:
df_train = pd.read_csv(data_path + "/emnist-letters-train.csv")
df_test = pd.read_csv(data_path + "/emnist-letters-test.csv")

y_train = df_train.iloc[:, 0].values
y_test = df_test.iloc[:, 0].values


X_train = df_train.iloc[:, 1:].values
X_test = df_test.iloc[:, 1:].values
X_train = X_train.reshape(-1, 28, 28, 1)
X_test = X_test.reshape(-1, 28, 28, 1)

# One-hot encode the y-values:
lb = LabelBinarizer()
lb.fit(y_train)
y_train_enc = lb.transform(y_train)
y_test_enc = lb.transform(y_test)


# Define hyperparameters:
num_epoch = 15
batch_size = 32




# Define model:
model = SimpleConvnet(inp_w = 28, inp_h = 28, inp_d = 1)
model.fit(X_train, y_train_enc, num_epoch = num_epoch, batch_size = batch_size, weight_save_path = weight_save_path)
Example No. 36
    y_train = training_data['output2'] if predict_rating else training_data[
        'output1']
    y_test = test_data['output2'] if predict_rating else test_data['output1']
else:  #doc2vec
    training_data = pd.read_csv('./split_doc2vec/1/training_data.csv',
                                header=0)
    test_data = pd.read_csv('./split_doc2vec/1/test_data.csv', header=0)
    X_train = listify(training_data['Vector'])
    X_test = listify(test_data['Vector'])
    y_train = training_data['overall'] if predict_rating else training_data[
        'Category']
    y_test = test_data['overall'] if predict_rating else test_data['Category']

#one hot encoding of the ratings
encode_label = LabelBinarizer()
encode_label.fit(y_train)

#calc class weights
class_weight = None
if weight_class:
    y_int = np.argmax(encode_label.transform(y_train), axis=1)
    class_weight = compute_class_weight('balanced', np.unique(y_int), y_int)
    class_weight = dict(enumerate(class_weight))

#grid search params to test
num_hidden_layers = [1, 2, 3]
num_nodes = [2, 4, 8, 16, 32]
epochs = [50]
batch_size = [32]

callbacks = None
pred_gnb = gnb_model.predict(np.asarray(q_test))
#evaluate the model; for abstracts use multilabel_evaluation_multilabelbinarizer(),
#for citations use multilabel_evaluation()
gnb_evaluation_scores, gnb_cm = evaluation.multilabel_evaluation_multilabelbinarizer(
    d_test, label_encoder.inverse_transform(pred_gnb), "Gaussian Naive Bayes")
#gnb_evaluation_scores, gnb_cm = evaluation.multilabel_evaluation(
#    d_test, label_encoder.inverse_transform(pred_gnb), "Gaussian Naive Bayes")
documentation_file_modelopt.write(gnb_evaluation_scores)

#split data in training and test data
d_train_single, d_test_single, q_train_single, q_test_single = train_test_split(
    datasets_single, q_fasttext, test_size=0.2)

#prepare queries and datasets for Neural Network application
label_binarizer = LabelBinarizer()
label_binarizer.fit(datasets_single)
d_train_binarized = label_binarizer.transform(d_train_single)
pickle.dump(label_binarizer, open("label_binarizer_fasttext.sav", 'wb'))
array_q_train = np.array(q_train_single)
X = np.expand_dims(array_q_train, axis=2)
array_q_test = np.array(q_test_single)
x_t = np.expand_dims(array_q_test, axis=2)
d_train_array = np.array(d_train_binarized)
d_test_array = np.array(d_test_single)
num_classes = len(label_binarizer.classes_)

#build CNN model and evaluate the model
print("CNN model evaluation")


def cnn_optimization(x_train, y_train, x_test, y_test, params):
Example No. 38
train_tags = data['Status'][:train_size]
train_files_names = data['filename'][:train_size]

test_posts = data['title'][train_size:]
test_tags = data['Status'][train_size:]
test_files_names = data['filename'][train_size:]

# define Tokenizer with Vocab Size
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train_posts)

x_train = tokenizer.texts_to_matrix(train_posts, mode='tfidf')
x_test = tokenizer.texts_to_matrix(test_posts, mode='tfidf')

encoder = LabelBinarizer()
encoder.fit(train_tags)
y_train = encoder.transform(train_tags)
y_test = encoder.transform(test_tags)

model = Sequential()
model.add(Dense(512, input_shape=(vocab_size,)))
model.add(Activation('relu'))
model.add(Dropout(0.3))
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.3))
model.add(Dense(num_labels))
model.add(Activation('softmax'))
model.summary()

model.compile(loss='categorical_crossentropy',
@author: mayur
"""

import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer


train = pd.read_csv('/Users/mayur/Documents/GitHub/'+
                    'newtrain.csv')

trial_data = train.iloc[:10,[0,4,6,8]]

lb = LabelBinarizer()

lb.fit(trial_data)

# checking number of unique visitor ids

unique_visitors = list(trial_data.fullVisitorId.unique())

enc_channelGrouping = OneHotEncoder()
enc_channelGrouping.fit(trial_data['channelGrouping'])


"""

dict_channelGrouping = dict(enumerate(trial_data['channelGrouping'].astype(
        'category').cat.categories))

dict_socialEngagementType = dict(enumerate(
Example No. 40
def binarize_tokenized(x, vocab_len):
    binarizer = LabelBinarizer()
    binarizer.fit(range(vocab_len))
    x = np.array([binarizer.transform(seq) for seq in x])

    return x
Example No. 41
labels_flat = labels_flat[ind]
'''
# summary of classes
class_1 = labels_flat[labels_flat == 0]
class_2 = labels_flat[labels_flat == 1]
class_3 = labels_flat[labels_flat == 2]
print(class_1.shape[0])
print(class_2.shape[0])
print(class_3.shape[0])

# under-sample dataset
params, labels_flat = RandomUnderSampler(random_state=0).fit_resample(
    params, labels_flat)

# label binarization
lb.fit(labels_flat)

# feature selection
# params = SelectKBest(chi2, k=10).fit_transform(params, labels_flat)
print(params.shape)
# create classifier
clf = RandomForestClassifier(n_estimators=500,
                             random_state=666,
                             min_samples_split=2,
                             min_samples_leaf=1,
                             n_jobs=-1,
                             criterion='entropy',
                             max_depth=None,
                             max_features='sqrt',
                             class_weight=[{
                                 0: 1,
Example No. 42
class DistOneVsRestClassifier(OneVsRestClassifier):
    """
    Same as sklearn `OneVsRestClassifier` but with distributed
    training using spark. Additionally implements flexible
    ``predict_proba`` method with custom `norm` input
    designating the normalization method used after individual
    predictions are made.

    Args:
        estimator (sklearn estimator): An estimator object implementing fit and one of
            decision_function or predict_proba.
        sc (sparkContext): Spark context for spark broadcasting and rdd operations.
        norm (string): default None, Normalization method for predict_proba.
        partitions (int or 'auto'): default 'auto'
            Number of partitions to use for parallelization of parameter
            search space. Integer values or None will be used directly for `numSlices`,
            while 'auto' will set `numSlices` to the number required fits.
        max_negatives (int or float): default None
            Maximum number of negative records allowed for each binary
            estimator. Use int for hard maximum, or float for percentage
            of total negatives.
        random_state (int): default None
            Random state for limiting negatives (if max_negatives is not None).
        method (str): 'ratio' or 'multiplier'
            Method used to calculate true maximum number of negatives.
        n_splits (int): default 1
            Dials the number of splits for broadcasting
            X during fitting. Use values higher than 1 for large X.
        mlb_override (bool): pass over mlb step; this assumes
            that input `y` to `fit` is already in sparse (one-hot-encoded)
            format
        verbose (bool): print status messages
        **kwargs: Keyword arguments to be passed to `OneVsRestClassifier`.
    """
    def __init__(self, estimator, sc=None, norm=None, partitions='auto',
            max_negatives=None, random_state=None, method="ratio",
            n_splits=1, mlb_override=False, verbose=False, **kwargs):
        OneVsRestClassifier.__init__(
            self, estimator, **kwargs)
        self.norm = norm
        self.sc = sc
        self.partitions = partitions
        self.max_negatives = max_negatives
        self.random_state = random_state
        self.method = method
        self.n_splits = n_splits
        self.mlb_override = mlb_override
        self.verbose = verbose

    def fit(self, X, y, **fit_params):
        """
        Fit underlying estimators. Parallelize fit operation using spark.

        Args:
            X (array-like, shape = [n_samples, n_features]): input data
            y (array-like, shape = [n_samples, ], [n_samples, n_classes]): multi-class targets
            **fit_params (dict of string -> object): parameters passed 
                to the ``fit`` method of the estimator
        """
        _check_estimator(self, verbose=self.verbose)

        if (not self.mlb_override and not hasattr(y[0], '__array__') 
                and isinstance(y[0], Sequence)
                and not isinstance(y[0], str)):
            self.mlb = MultiLabelBinarizer()
            y = self.mlb.fit_transform(y)

        if isinstance(X, pd.DataFrame):
            X.index = list(range(len(X)))

        self.label_binarizer_ = LabelBinarizer(sparse_output=True)
        self.label_binarizer_.fit(y)
        self.classes_ = self.label_binarizer_.classes_
        self._fit(X, y, **fit_params)
        del self.sc
        if hasattr(self.estimator, "sc"):
            del self.estimator.sc
        return self

    def _fit(self, X, y, **fit_params):
        Y = self.label_binarizer_.transform(y)
        Y = Y.tocsc()
        max_negatives = self.max_negatives
        random_state = self.random_state
        n_splits = self.n_splits
        method = self.method
        estimator = _clone(self.estimator)
        if self.sc is None:
            models_ = Parallel(n_jobs=self.n_jobs)(
                delayed(_fit_binary)(
                    estimator, X, x[1], fit_params, 
                    classes=["not %s" % x[0], x[0]],
                    max_negatives=max_negatives, 
                    random_state=random_state, method=method)
                for x in list(zip(self.classes_, list(col.toarray().ravel() for col in Y.T))))
        else:
            X = _split_X(X, n_splits, self.sc)
            partitions = _parse_partitions(self.partitions, len(self.classes_))
            estimator = self.sc.broadcast(self.estimator)
            columns = self.sc.parallelize(
                list(zip(self.classes_, list(col.toarray().ravel() for col in Y.T))),
                numSlices=partitions)
            models_ = columns.map(lambda x: _fit_binary(
                estimator, X, x[1], fit_params, classes=["not %s" % x[0], x[0]],
                max_negatives=max_negatives, random_state=random_state, method=method)).collect()
        estimators_ = [x[0] for x in models_]
        classes_ = [x[1] for x in models_]
        self.estimators_ = list([estimators_[classes_.index(x)] for x in self.classes_])
        return self

    def predict_proba(self, X):
        """
        Probability estimates.
        The returned estimates for all classes are ordered by label of classes.

        Args:
            X (array-like, shape = [n_samples, n_features]): input data

        Returns:
            T (array-like, shape = [n_samples, n_classes]): returns the probability 
                of the sample for each class in the model, where classes are 
                ordered as they are in self.classes_
        """
        probs = []
        for index in range(len(self.estimators_)):
            probs.append(self.estimators_[index].predict_proba(X)[:,1])
        out = np.array([
            [probs[y][index] for y in range(len(self.estimators_))]
            for index in range(len(probs[0]))])
        if self.norm:
            return normalize(out, norm=self.norm)
        else:
            return out
Example No. 43
                                                          y_train,
                                                          test_size=0.094,
                                                          random_state=832289)
    print("No. of training samples: %d, No. of test samples: %d, No. of validation samples: %d"\
    %(len(X_train), len(X_test), len(X_valid)) )

    # Data preprocessing: converting to numpy array, normalizing data, and creating
    # one-hot labels.
    X_train = np.array(X_train)
    X_valid = np.array(X_valid)
    X_test = np.array(X_test)
    X_train = X_train.astype('float32')
    X_valid = X_valid.astype('float32')
    X_test = X_test.astype('float32')
    encoder = LabelBinarizer()
    encoder.fit(y_train)
    X_train /= 255
    X_valid /= 255
    X_test /= 255
    y_train_onehot = encoder.transform(y_train)
    y_valid_onehot = encoder.transform(y_valid)
    y_test_onehot = encoder.transform(y_test)
    data_train = [X_train, y_train_onehot, X_valid, y_valid_onehot]
    data_test = [X_test, y_test_onehot]

    batch_size = 32  # batch size
    lr = -4  # learning rate
    epochs = 25  # number of training epochs
    hyper_params = [pow(10, lr), epochs, batch_size]

def retrain(retrained_model_name, imagenet_model_name):

    # read codes and labels from file
    import csv

    with open('retrained_models/' + retrained_model_name + '/' +
              imagenet_model_name + '/labels') as f:
        reader = csv.reader(f, delimiter='\n')
        labels = np.array([each for each in reader if len(each) > 0]).squeeze()
    with open('retrained_models/' + retrained_model_name + '/' +
              imagenet_model_name + '/codes') as f:
        codes = np.fromfile(f, dtype=np.float32)
        codes = codes.reshape((len(labels), -1))

    from sklearn.preprocessing import LabelBinarizer

    lb = LabelBinarizer()
    lb.fit(labels)

    labels_vecs = lb.transform(labels)

    # GET VALIDATION
    from sklearn.model_selection import StratifiedShuffleSplit

    ss = StratifiedShuffleSplit(n_splits=1, test_size=0.2)

    train_idx, val_idx = next(ss.split(codes, labels))

    half_val_len = int(len(val_idx) / 2)
    val_idx, test_idx = val_idx[:half_val_len], val_idx[half_val_len:]

    train_x, train_y = codes[train_idx], labels_vecs[train_idx]
    val_x, val_y = codes[val_idx], labels_vecs[val_idx]
    test_x, test_y = codes[test_idx], labels_vecs[test_idx]

    print("Train shapes (x, y):", train_x.shape, train_y.shape)
    print("Validation shapes (x, y):", val_x.shape, val_y.shape)
    print("Test shapes (x, y):", test_x.shape, test_y.shape)

    inputs_ = tf.placeholder(tf.float32,
                             shape=[None, codes.shape[1]],
                             name='inputs_clf')
    labels_ = tf.placeholder(tf.int64,
                             shape=[None, labels_vecs.shape[1]],
                             name='labels_clf')

    with tf.name_scope('fc1'):
        W_fc1 = tf.Variable(tf.truncated_normal([bottleneck_size, 256],
                                                stddev=0.1),
                            name='W')
        b_fc1 = tf.Variable(tf.constant(0.1, shape=[256]), name='b')
        fc1 = tf.add(tf.matmul(inputs_, W_fc1), b_fc1)
        fc1 = tf.nn.relu(fc1)

    with tf.name_scope('fc2'):
        W_fc2 = tf.Variable(tf.truncated_normal([256, labels_vecs.shape[1]],
                                                stddev=0.1),
                            name='W')
        b_fc2 = tf.Variable(tf.constant(0.1, shape=[labels_vecs.shape[1]]),
                            name='b')
        logits = tf.add(tf.matmul(fc1, W_fc2), b_fc2)
    probs = tf.nn.softmax(logits, name='probs')

    with tf.name_scope('train_clf'):
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=labels_,
                                                                logits=logits)
        cost = tf.reduce_mean(cross_entropy)

        optimizer = tf.train.AdamOptimizer().minimize(cost)

    with tf.name_scope('accuracy_clf'):
        predicted = tf.nn.softmax(logits)
        correct_pred = tf.equal(tf.argmax(predicted, 1), tf.argmax(labels_, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

    def get_batches(x, y, n_batches=10):
        """ Return a generator that yields batches from arrays x and y. """
        batch_size = len(x) // n_batches

        for ii in range(0, n_batches * batch_size, batch_size):
            # If we're not on the last batch, grab data with size batch_size
            if ii != (n_batches - 1) * batch_size:
                X, Y = x[ii:ii + batch_size], y[ii:ii + batch_size]
            # On the last batch, grab the rest of the data
            else:
                X, Y = x[ii:], y[ii:]
            # I love generators
            yield X, Y

    epochs = 10
    iteration = 0
    saver = tf.train.Saver()
    with tf.Session() as sess:

        sess.run(tf.global_variables_initializer())
        for e in range(epochs):
            for x, y in get_batches(train_x, train_y):
                loss, _ = sess.run([cost, optimizer],
                                   feed_dict={
                                       inputs_: x,
                                       labels_: y
                                   })
                print("Epoch: {}/{}".format(e + 1, epochs),
                      "Iteration: {}".format(iteration),
                      "Training loss: {:.5f}".format(loss))
                iteration += 1

                if iteration % 5 == 0:
                    val_acc = sess.run(accuracy,
                                       feed_dict={
                                           inputs_: val_x,
                                           labels_: val_y
                                       })
                    print("Validation Acc: {:.4f}".format(val_acc))

        saver.save(
            sess, './retrained_models/' + retrained_model_name + '/' +
            imagenet_model_name + '/model.ckpt')
        print('Model trained and saved in ./retrained_models/' +
              retrained_model_name + '/' + imagenet_model_name + '/model.ckpt')

        test_acc = sess.run(accuracy,
                            feed_dict={
                                inputs_: test_x,
                                labels_: test_y
                            })
        print("Test accuracy: {:.4f}".format(test_acc))
        return round(100 * test_acc, 2)
for i in range(len(imgs_ordered)):
    patches = image.extract_patches_2d(imgs_ordered[i], (224,224), max_patches = 100)
    label = data['primary_microconstituent'][i]
    for patch in patches:
        x = Image.fromarray(patch).convert('RGB')
        x = np.asarray(x)
        #x = np.expand_dims(patch, axis = 2)
        x = preprocess_input(x)
        processed_imgs.append(x)
        labels.append(label)
    progbar(i, (len(imgs_ordered)-1), 20)



lb = LabelBinarizer()
lb.fit(np.asarray(data['primary_microconstituent']))
y = lb.transform(labels)
print('\nLabels Binarized, converting array')


input = np.asarray(processed_imgs)

X_train, X_test, y_train, y_test = train_test_split(
    input, y, test_size=0.1, random_state=42)


model = DenseNet169(weights=None, classes = 7)

model.summary()
model.compile(loss = 'categorical_crossentropy', optimizer = 'sgd', metrics = ['accuracy'])
time_callback = TimeHistory()
Example No. 46
class MultiLabelSKFlow(BaseEstimator):
    """
    This is a wrapper class for TensorFlow, so it adheres to the fit/predict naming conventions of sk-learn.
    This class handles the output layer, mini-batch learning, early stopping, threshold optimization on the validation set, and the neural metalabeler.
    
    The concrete TensorFlow model up to the last hidden layer can be specified in terms of the 'get_model' function.
    This function in turn has to accept the dataset X (np.array or csr_matrix) and the gold standard y (csr_matrix).
    Moreover, get_model() is expected to return the following components:
    
    x_tensor: tf.placeholder 
        Used to pass input data to the model at training and test time.
    y_tensor: tf.placeholder
        Used to pass the ground truth to the model during training.
    last_layer: tf.Tensor 
        The TensorFlow computation graph from input layer to last hidden layer of the implemented neural network.
    params_fit: dictionary 
        Parameters to be added to the feed dictionary for training (e.g., keep_probability_placeholder -> 0.5)
    params_predict: dictionary
        Parameters to be added to the feed dictionary 
        at prediction time (e.g., keep_probability_placeholder -> 1.0)
    initializer_operations: list of (tf.Tensor, dictionary)
        A list of
        pairs consisting of operations for initializing variables (e.g.,
        embedding tables) before training starts, and the feed dictionary with data to execute
        the initialize operation.
        
    Moreover, training can be controlled by the following parameters:
    
    Parameters
    ----------
    batch_size: int, default = 5
        Batch size to use during training and at prediction time.
    num_epochs: int, default = 10
        Number of iterations over the dataset during training.
    get_model: function, default = mlp_base()
        The function that returns the underlying neural network up to the last hidden layer. See above description.
    threshold: float, default = 0.2
        Fixed threshold to use if "optimize_threshold" = False, or starting threshold when "optimize_threshold" = True.
    learning_rate: float, default = 0.1
        Initial learning rate to use for Adam.
    patience: int, default = 5
        Number of non-improving evaluations on the validation set before terminating training.
    validation_metric: function (true_values, predicted_values) -> float, default = f1_score
        The metric that is used for evaluating predictions on the validation set.
    optimize_threshold: boolean, default = True
        Determines whether the threshold is optimized on a validation set.
    threshold_window: array-like of float, default = np.linspace(-0.03, 0.03, num=7)
        An array of floats that are interpreted as offsets from the current threshold value. When optimizing the threshold,
        each of these offsets is added to the current threshold and the validation performance is assessed. Afterwards, the
        threshold is set to the value that has yielded the best score.
    tf_model_path: str, default = ".tmp_best_models"
        A path to the folder where the weights of the best model are saved, so it can be loaded at prediction time.
    num_steps_before_validation: int, default = None
        Determines the number of batches between two performance evaluations on the validation set. If set to None, this number is determined from the size of
        the training set, i.e., it is set to one epoch.
    hidden_activation_function: TensorFlow operation, default = tf.nn.relu
        The activation function to apply after each bottleneck layer.
    bottleneck_layers: list of int, default = None
        As many layers as there are elements in this list are injected before the output layer. Element i specifies the number of units
        in bottleneck layer i.
    hidden_keep_prob: float, default = 0.5
        Specifies the keep probability of dropout to apply after each bottleneck layer.
    gpu_memory_fraction: float, default = 1.
        Specifies how much of the RAM of each available GPU TensorFlow may reserve.
    meta_labeler_phi: str, default = None
        Determines which 'phi' function from the definition of Neural MetaLabeler is used: "content", "score", or None. If None, MetaLabeler is not
        used at all. If "content" is used, the prediction is based on the output of the last hidden layer of the underlying neural network (given by get_model).
        If "score" is used, the prediction is based on the probabilities given by the output layer.
    meta_labeler_alpha: float, default = 0.1
        The label-classification objective is weighted by (1 - alpha), and the objective of predicting the number of labels is weighted by alpha.
    meta_labeler_min_labels: int, default = 1
        Specifies the smallest possible number of labels that can be predicted by Neural MetaLabeler.
    meta_labeler_max_labels: int, default = None
        Specifies the largest possible number of labels that can be predicted by Neural MetaLabeler. If set to None, the maximum number of labels is determined from
        the training set.
    """
    def __init__(self,
                 batch_size=5,
                 num_epochs=10,
                 get_model=mlp_base(),
                 threshold=0.2,
                 learning_rate=0.1,
                 patience=5,
                 validation_metric=lambda y1, y2: f1_score(
                     y1, y2, average="samples"),
                 optimize_threshold=True,
                 threshold_window=np.linspace(-0.03, 0.03, num=7),
                 tf_model_path=".tmp_best_models",
                 num_steps_before_validation=None,
                 hidden_activation_function=tf.nn.relu,
                 bottleneck_layers=None,
                 hidden_keep_prob=0.5,
                 gpu_memory_fraction=1.,
                 meta_labeler_phi=None,
                 meta_labeler_alpha=0.1,
                 meta_labeler_min_labels=1,
                 meta_labeler_max_labels=None):
        """
    
        """

        self.get_model = get_model

        # enable early stopping on validation set
        self.validation_data_position = None
        self.num_steps_before_validation = num_steps_before_validation

        # configurations for bottleneck layers
        self.hidden_activation_function = hidden_activation_function
        self.bottleneck_layers = bottleneck_layers
        self.hidden_keep_prob = hidden_keep_prob

        # configuration for meta-labeler
        self.meta_labeler_phi = meta_labeler_phi
        self.meta_labeler_alpha = meta_labeler_alpha
        self.num_label_binarizer = None
        self.meta_labeler_max_labels = meta_labeler_max_labels
        self.meta_labeler_min_labels = meta_labeler_min_labels

        # used by this class
        self.validation_metric = validation_metric
        self.optimize_threshold = optimize_threshold
        self.threshold_window = threshold_window
        self.patience = patience
        self.batch_size = batch_size
        self.num_epochs = num_epochs
        self.threshold = threshold
        if learning_rate is None:
            self.learning_rate = self.batch_size / 512 * 0.01
        else:
            self.learning_rate = learning_rate

        # path to save the tensorflow model to
        self.TF_MODEL_PATH = tf_model_path
        self._save_model_path = self._get_save_model_path()

        # determine how much of gpu to use
        self.gpu_memory_fraction = gpu_memory_fraction

    def _get_save_model_path(self):
        TMP_FOLDER = self.TF_MODEL_PATH
        if not os.path.exists(TMP_FOLDER):
            os.makedirs(TMP_FOLDER)
        return TMP_FOLDER + "/best-model-" + self.get_model.__name__ + str(
            datetime.now())

    def _calc_num_steps(self, X):
        return int(np.ceil(X.shape[0] / self.batch_size))

    def _predict_batch(self, X_batch):
        feed_dict = {self.x_tensor: X_batch}
        feed_dict.update(self.params_predict)

        if self.meta_labeler_phi is None:
            predictions = self.session.run(self.predictions,
                                           feed_dict=feed_dict)
        else:
            predictions = self.session.run(
                [self.predictions, self.meta_labeler_prediction],
                feed_dict=feed_dict)

        return predictions

    def _make_binary_decision(self, predictions):
        if self.meta_labeler_phi is None:
            y_pred = predictions > self.threshold
        else:
            predictions, meta_labeler_predictions = predictions
            max_probability_cols = np.argmax(meta_labeler_predictions, axis=1)
            max_probability_indices = tuple(
                np.indices([meta_labeler_predictions.shape[0]
                            ])) + (max_probability_cols, )
            meta_labeler_predictions = np.zeros_like(meta_labeler_predictions)
            meta_labeler_predictions[max_probability_indices] = 1
            meta_labeler_predictions = self.num_label_binarizer.inverse_transform(
                meta_labeler_predictions, 0)
            y_pred = np.zeros_like(predictions)
            for i in range(predictions.shape[0]):
                num_labels_for_sample = meta_labeler_predictions[i]
                top_indices = (
                    -predictions[i, :]).argsort()[:num_labels_for_sample]
                y_pred[i, top_indices] = 1

        return csr_matrix(y_pred)

    def _compute_validation_score(self, session, X_val_batch, y_val_batch):

        feed_dict = {self.x_tensor: X_val_batch}
        feed_dict.update(self.params_predict)

        if self.validation_metric == "val_loss":
            return session.run(self.loss, feed_dict=feed_dict)

        elif callable(self.validation_metric):
            predictions = self._predict_batch(X_val_batch)
            y_pred = self._make_binary_decision(predictions)
            if self.optimize_threshold:
                return self.validation_metric(y_val_batch, y_pred), predictions
            else:
                return self.validation_metric(y_val_batch, y_pred)

    def _print_progress(self, epoch, batch_i, steps_per_epoch,
                        avg_validation_score, best_validation_score,
                        total_loss, meta_loss, label_loss):

        progress_string = 'Epoch {:>2}/{:>2}, Batch {:>2}/{:>2}, Loss: {:0.4f}, Validation-Score: {:0.4f}, Best Validation-Score: {:0.4f}'
        format_parameters = [
            epoch + 1, self.num_epochs, batch_i + 1, steps_per_epoch,
            total_loss, avg_validation_score, best_validation_score
        ]
        if self.meta_labeler_phi is None:
            progress_string += ', Threshold: {:0.2f}'
            format_parameters.append(self.threshold)

        else:
            progress_string += ', Label-Loss: {:0.4f}, Meta-Loss: {:0.4f}'
            format_parameters.extend([label_loss, meta_loss])

        progress_string = progress_string.format(*format_parameters)
        print(progress_string, end='\r')

    def _num_labels_discrete(self,
                             y,
                             min_number_labels=1,
                             max_number_labels=None):
        """
        Counts for each row in 'y' how many of the columns are set to 1. Outputs the result in turn as a binary indicator matrix where
        the columns 0, ..., m correspond to 'min_number_labels', 'min_number_labels' + 1, ..., 'max_number_labels'.  
        
        Parameters
        ----------
        y: (sparse) numpy array of shape [n_samples, n_classes]
            An indicator matrix denoting which classes are assigned to a sample (multiple columns per row may be 1)
        min_number_labels: int, default=1
            Minimum number of labels each sample has to have. If a sample has less than 'min_number_labels' assigned,
            the corresponding output is set to 'min_number_labels'.
        max_number_labels: int, default=None
            Maximum number of labels each sample is allowed to have. If a sample has more than 'max_number_labels' assigned,
            the corresponding output is set to 'max_number_labels'. If 'max_number_labels' is None, it is set to the max number found
            in y.
        Returns
        ---------
        num_samples_y: (sparse) numpy array of shape [n_samples, max_number_labels - min_number_labels + 1]
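
        Example
        ---------
        Illustrative, with the defaults: for y = [[1, 1, 0], [1, 0, 0], [1, 1, 1]] the per-row
        label counts are [2, 1, 3], so the returned indicator matrix is
        [[0, 1, 0], [1, 0, 0], [0, 0, 1]] (columns correspond to 1, 2 and 3 assigned labels).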
        """

        num_samples_y = np.array(np.sum(y, axis=1))
        num_samples_y = num_samples_y.reshape(-1)
        num_samples_y[num_samples_y < min_number_labels] = min_number_labels

        if max_number_labels is None:
            max_number_labels = np.max(num_samples_y)

        num_samples_y[num_samples_y > max_number_labels] = max_number_labels

        # 'fit' method calls this
        if self.num_label_binarizer is None:
            self.num_label_binarizer = LabelBinarizer()
            self.num_label_binarizer.fit(num_samples_y)

        indicator_matrix_num_labels = self.num_label_binarizer.transform(
            num_samples_y)
        return indicator_matrix_num_labels

    def fit(self, X, y):
        self.y = y

        val_pos = self.validation_data_position

        if val_pos is not None:
            X_train, y_train, X_val, y_val = X[:val_pos, :], y[:val_pos, :], X[
                val_pos:, :], y[val_pos:, :]

            validation_batch_generator = BatchGenerator(
                X_val, y_val, self.batch_size, False, False)
            validation_predictions = self._calc_num_steps(X_val)
            steps_per_epoch = self._calc_num_steps(X_train)

            # determine after how many batches to perform validation
            num_steps_before_validation = self.num_steps_before_validation
            if self.num_steps_before_validation is None:
                num_steps_before_validation = steps_per_epoch
            num_steps_before_validation = int(
                min(steps_per_epoch, num_steps_before_validation))
        else:
            steps_per_epoch = self._calc_num_steps(X)
            X_train = X
            y_train = y

        # Remove previous weights, bias, inputs, etc..
        tf.reset_default_graph()
        tf.set_random_seed(1337)

        # get_model has to return the input/label placeholders, the last hidden layer,
        # the fit/predict feed-dict parameters, and the initializer operations
        self.x_tensor, self.y_tensor, self.last_layer, self.params_fit, self.params_predict, initializer_operations = self.get_model(
            X, y)

        # add bottleneck layer
        if self.bottleneck_layers is not None:
            bottleneck_dropout_tensor = tf.placeholder(
                tf.float32, name="bottleneck_dropout")
            self.params_fit.update(
                {bottleneck_dropout_tensor: self.hidden_keep_prob})
            self.params_predict.update({bottleneck_dropout_tensor: 1})
            for units in self.bottleneck_layers:
                self.last_layer = tf.contrib.layers.fully_connected(
                    self.last_layer,
                    units,
                    activation_fn=self.hidden_activation_function)
                self.last_layer = tf.nn.dropout(self.last_layer,
                                                bottleneck_dropout_tensor)

        # Name logits Tensor, so that it can be loaded from disk after training
        #logits = tf.identity(logits, name='logits')
        logits = tf.contrib.layers.linear(self.last_layer,
                                          num_outputs=y.shape[1])

        # Loss and Optimizer
        losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits,
                                                         labels=self.y_tensor)
        loss = tf.reduce_sum(losses, axis=1)
        self.label_loss = tf.reduce_mean(loss, axis=0)

        # prediction
        self.predictions = tf.sigmoid(logits)

        if self.meta_labeler_phi is not None:

            # compute target of meta labeler
            y_num_labels = self._num_labels_discrete(
                y_train,
                min_number_labels=self.meta_labeler_min_labels,
                max_number_labels=self.meta_labeler_max_labels)
            y_num_labels_tensor = tf.placeholder(tf.float32,
                                                 shape=(None,
                                                        y_num_labels.shape[1]),
                                                 name="y_num_labels")

            # compute logits of meta labeler
            if self.meta_labeler_phi == "content":
                meta_logits = tf.contrib.layers.linear(
                    self.last_layer, num_outputs=y_num_labels.shape[1])
            elif self.meta_labeler_phi == "score":
                meta_logits = tf.contrib.layers.linear(
                    self.predictions, num_outputs=y_num_labels.shape[1])

            # compute loss of meta labeler
            meta_labeler_loss = tf.nn.softmax_cross_entropy_with_logits(
                labels=y_num_labels_tensor, logits=meta_logits)
            self.meta_labeler_loss = tf.reduce_mean(meta_labeler_loss, axis=0)

            # compute prediction of meta labeler
            self.meta_labeler_prediction = tf.nn.softmax(meta_logits)

            # add meta labeler loss to labeling loss
            self.loss = (
                1 - self.meta_labeler_alpha
            ) * self.label_loss + self.meta_labeler_alpha * self.meta_labeler_loss
        else:
            self.loss = self.label_loss

        # optimize
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(
                self.loss)

        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=self.gpu_memory_fraction)
        session = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
        self.session = session
        # Initializing the variables
        session.run(tf.global_variables_initializer())
        for (init_op, init_op_feed_dict) in initializer_operations:
            session.run(init_op, feed_dict=init_op_feed_dict)

        batch_generator = BatchGenerator(X_train, y_train, self.batch_size,
                                         True, False)
        # Training cycle
        objective = 1 if self.validation_metric == "val_loss" else -1
        avg_validation_score = math.inf * objective
        best_validation_score = math.inf * objective
        epochs_of_no_improvement = 0
        most_consecutive_epochs_with_no_improvement = 0
        batches_counter = 0
        epoch = 0
        stop_early = False
        while epoch < self.num_epochs and not stop_early:

            if val_pos is not None and epochs_of_no_improvement == self.patience:
                break

            # Loop over all batches
            for batch_i in range(steps_per_epoch):
                X_batch, y_batch = batch_generator._batch_generator()
                feed_dict = {self.x_tensor: X_batch, self.y_tensor: y_batch}
                feed_dict.update(self.params_fit)

                if self.meta_labeler_phi is not None:
                    feed_dict.update({
                        y_num_labels_tensor:
                        self._num_labels_discrete(y_batch)
                    })

                session.run(optimizer, feed_dict=feed_dict)

                # overwrite parameter values for prediction step
                feed_dict.update(self.params_predict)

                # compute losses to track progress
                if self.meta_labeler_phi is not None:
                    total_loss, label_loss, meta_loss = session.run(
                        [self.loss, self.label_loss, self.meta_labeler_loss],
                        feed_dict=feed_dict)
                else:
                    total_loss = session.run(self.loss, feed_dict=feed_dict)
                    label_loss, meta_loss = None, None

                batches_counter += 1
                is_last_epoch = epoch == self.num_epochs - 1
                is_last_batch_in_epoch = batch_i == steps_per_epoch - 1
                # evaluate on the validation set every num_steps_before_validation batches if early stopping is on
                if val_pos is not None and (
                        batches_counter == num_steps_before_validation or
                    (is_last_epoch and is_last_batch_in_epoch)):

                    batches_counter = 0

                    validation_scores = []
                    weights = []

                    # save predictions so we can optimize threshold later
                    val_predictions = np.zeros(
                        (X_val.shape[0], self.y.shape[1]))
                    for i in range(validation_predictions):
                        X_val_batch, y_val_batch = validation_batch_generator._batch_generator(
                        )
                        weights.append(X_val_batch.shape[0])

                        if self.optimize_threshold:
                            batch_val_score, val_predictions[
                                i * self.batch_size:(i + 1) * self.
                                batch_size, :] = self._compute_validation_score(
                                    session, X_val_batch, y_val_batch)
                        else:
                            batch_val_score = self._compute_validation_score(
                                session, X_val_batch, y_val_batch)
                        validation_scores.append(batch_val_score)
                    avg_validation_score = np.average(
                        np.array(validation_scores), weights=np.array(weights))

                    if self.optimize_threshold:
                        best_score = -1 * math.inf
                        best_threshold = self.threshold
                        for t_diff in self.threshold_window:
                            t = self.threshold + t_diff
                            score = self.validation_metric(
                                y_val, csr_matrix(val_predictions > t))
                            if score > best_score:
                                best_threshold = t
                                best_score = score

                    is_better_score = avg_validation_score < best_validation_score if objective == 1 else avg_validation_score > best_validation_score
                    if is_better_score:
                        # save model
                        # Save model for prediction step
                        best_validation_score = avg_validation_score
                        saver = tf.train.Saver()
                        saver.save(session, self._save_model_path)

                        if most_consecutive_epochs_with_no_improvement < epochs_of_no_improvement:
                            most_consecutive_epochs_with_no_improvement = epochs_of_no_improvement
                        epochs_of_no_improvement = 0

                        # save the threshold at best model, too.
                        if self.optimize_threshold:
                            self.threshold = best_threshold
                    else:
                        epochs_of_no_improvement += 1
                        if epochs_of_no_improvement > self.patience:
                            print("No improvement in validation loss for",
                                  self.patience, "epochs. Stopping early.")
                            stop_early = True
                            break

                # print progress
                self._print_progress(epoch, batch_i, steps_per_epoch,
                                     avg_validation_score,
                                     best_validation_score, total_loss,
                                     meta_loss, label_loss)

            epoch += 1

        print('')

        print("Training of TensorFlow model finished!")
        print("Longest sequence of epochs of no improvement:",
              most_consecutive_epochs_with_no_improvement)

    def predict(self, X):

        session = self.session
        #loaded_graph = tf.Graph()
        if self.validation_data_position:
            # Load model
            loader = tf.train.import_meta_graph(self._save_model_path +
                                                '.meta')
            loader.restore(self.session, self._save_model_path)

        prediction = np.zeros((X.shape[0], self.y.shape[1]))
        batch_generator = BatchGenerator(X, None, self.batch_size, False, True)
        prediction_steps = self._calc_num_steps(X)
        for i in range(prediction_steps):
            X_batch = batch_generator._batch_generator()
            preds = self._predict_batch(X_batch)
            binary_decided_preds = self._make_binary_decision(preds)
            prediction[i * self.batch_size:(i + 1) *
                       self.batch_size, :] = binary_decided_preds.todense()

        result = csr_matrix(prediction)

        # close the session, since no longer needed
        session.close()
        return result
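
# A minimal sketch (illustrative, not from the original code) of a 'get_model' function that
# satisfies the contract documented in MultiLabelSKFlow above. The name 'mlp_example' and the
# hidden layer size are assumptions.
def mlp_example(X, y):
    # placeholders used to feed input features and ground-truth labels
    x_tensor = tf.placeholder(tf.float32, shape=(None, X.shape[1]), name="x_input")
    y_tensor = tf.placeholder(tf.float32, shape=(None, y.shape[1]), name="y_input")

    # dropout keep-probability, set differently at training and prediction time
    keep_prob = tf.placeholder(tf.float32, name="keep_prob")

    # a single hidden layer serves as the "last hidden layer" handed back to the wrapper
    last_layer = tf.contrib.layers.fully_connected(x_tensor, 256,
                                                   activation_fn=tf.nn.relu)
    last_layer = tf.nn.dropout(last_layer, keep_prob)

    params_fit = {keep_prob: 0.5}        # extra feed-dict entries during training
    params_predict = {keep_prob: 1.0}    # extra feed-dict entries at prediction time
    initializer_operations = []          # nothing to initialize (e.g., no embedding tables)

    return x_tensor, y_tensor, last_layer, params_fit, params_predict, initializer_operations

# Example wiring (hypothetical): clf = MultiLabelSKFlow(get_model=mlp_example, batch_size=32)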
def roc_multiclass_curve_nn(y_test_class, y_pred_class):
    lb = LabelBinarizer()
    lb.fit(y_test_class)
    y_test_b = lb.transform(y_test_class)
    y_pred_b = lb.transform(y_pred_class)

    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(3):
        fpr[i], tpr[i], _ = roc_curve(y_test_b[:, i], y_pred_b[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test_b.ravel(),
                                              y_pred_b.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
    lw = 1
    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(3)]))

    # Then interpolate all ROC curves at these points
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(3):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

    # Finally average it and compute AUC
    mean_tpr /= 3

    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    # Plot all ROC curves
    plt.figure()
    plt.plot(fpr["micro"],
             tpr["micro"],
             label='micro-average ROC curve (area = {0:0.2f})'
             ''.format(roc_auc["micro"]),
             color='deeppink',
             linestyle=':',
             linewidth=4)

    plt.plot(fpr["macro"],
             tpr["macro"],
             label='macro-average ROC curve (area = {0:0.2f})'
             ''.format(roc_auc["macro"]),
             color='navy',
             linestyle=':',
             linewidth=4)

    colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
    for i, color in zip(range(3), colors):
        plt.plot(fpr[i],
                 tpr[i],
                 color=color,
                 lw=lw,
                 label='ROC curve of class {0} (area = {1:0.2f})'
                 ''.format(i, roc_auc[i]))

    plt.plot([0, 1], [0, 1], 'k--', lw=lw)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC For Neural Network')
    plt.legend(loc="lower right")
    plt.savefig('ROC For Neural Network')
    return plt.show()
Ejemplo n.º 48
0
def nnCostFunction(nn_params, *args):

    in_size, hid_size, num_labels, X, y, lam = args

    Theta1 = nn_params[0:(in_size + 1) * hid_size].reshape(
        (hid_size, in_size + 1))
    Theta2 = nn_params[(in_size + 1) * hid_size:].reshape(
        (num_labels, hid_size + 1))

    #print(Theta1.shape)
    #print(Theta2.shape)

    Theta1_grad = np.zeros(Theta1.shape)
    Theta2_grad = np.zeros(Theta2.shape)

    m = X.shape[0]

    X = np.hstack((np.ones((m, 1)), X))

    lb = LabelBinarizer()
    lb.fit(y)
    y = lb.transform(y)

    J = 0
    for i in range(m):
        xi = X[i, :]
        yi = y[i]
        # forward propagation
        a1 = xi
        z2 = np.dot(Theta1, a1)
        a2 = sigmoid(z2)
        a2 = np.hstack((1, a2))
        z3 = np.dot(Theta2, a2)
        a3 = sigmoid(z3)

        #print("-- a3 shape %s" % (a3.shape,))
        #print("-- yi shape %s" % (yi.shape,))

        J += sum(-yi * safe_log(a3) - (1 - yi) * safe_log(1 - a3))
        # backpropagation
        delta3 = a3 - yi
        delta2 = np.dot(Theta2.T, delta3) * sigmoidGradient(np.hstack((1, z2)))
        delta2 = delta2[1:]  # drop the delta for the bias unit

        delta2 = delta2.reshape((-1, 1))
        delta3 = delta3.reshape((-1, 1))
        a1 = a1.reshape((-1, 1))
        a2 = a2.reshape((-1, 1))

        Theta1_grad += np.dot(delta2, a1.T)
        Theta2_grad += np.dot(delta3, a2.T)
    J /= m

    # add the regularization term (bias columns are excluded)
    temp = 0.0
    for j in range(hid_size):
        for k in range(1, in_size + 1):  # skip the bias column
            temp += Theta1[j, k]**2
    for j in range(num_labels):
        for k in range(1, hid_size + 1):  # skip the bias column
            temp += Theta2[j, k]**2
    J += lam / (2.0 * m) * temp

    # average the accumulated gradients and add the regularization term (bias column excluded)
    Theta1_grad /= m
    Theta1_grad[:, 1:] += (lam / m) * Theta1[:, 1:]
    Theta2_grad /= m
    Theta2_grad[:, 1:] += (lam / m) * Theta2[:, 1:]

    # unroll the gradient matrices into a single vector
    grad = np.hstack((np.ravel(Theta1_grad), np.ravel(Theta2_grad)))

    print "J =", J
    return J, grad
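
# Hypothetical usage sketch (not from the original): optimize the unrolled network parameters
# with scipy. The helpers sigmoid, sigmoidGradient and safe_log are not shown in the snippet
# above, so the definitions below are assumptions matching the usual implementation of this
# Coursera-style cost function.
import numpy as np
from scipy.optimize import minimize


def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))


def sigmoidGradient(z):
    s = sigmoid(z)
    return s * (1.0 - s)


def safe_log(x):
    # clip to avoid log(0)
    return np.log(np.clip(x, 1e-12, None))


in_size, hid_size, num_labels, lam = 4, 5, 3, 1.0
rng = np.random.RandomState(0)
X_toy = rng.rand(30, in_size)
y_toy = rng.randint(0, num_labels, size=30)

# random initialization of the unrolled weight matrices
n_params = hid_size * (in_size + 1) + num_labels * (hid_size + 1)
initial_params = rng.uniform(-0.12, 0.12, size=n_params)

res = minimize(nnCostFunction, initial_params, jac=True, method='L-BFGS-B',
               args=(in_size, hid_size, num_labels, X_toy, y_toy, lam),
               options={'maxiter': 20})
print("final cost:", res.fun)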
Ejemplo n.º 49
0
from gensim.models import word2vec
from scipy.stats import skew
from scipy.stats import kurtosis
from sklearn.preprocessing import LabelBinarizer

path = "dataSet//"
'''train'''
train_log = pd.read_csv(path + 'train_log.csv', encoding='utf-8', sep='\t')
train_agg = pd.read_csv(path + 'train_agg.csv', encoding='utf-8', sep='\t')
train_flg = pd.read_csv(path + 'train_flg.csv', encoding='utf-8', sep='\t')
'''test'''
test_log = pd.read_csv(path + 'test_log.csv', encoding='utf-8', sep='\t')
test_agg = pd.read_csv(path + 'test_agg.csv', encoding='utf-8', sep='\t')
'''EVT_LBL one-hot feature'''
model_one_hot = LabelBinarizer()
model_one_hot.fit(train_log['EVT_LBL'])


def return_list(group):
    return list(group)


def return_set(group):
    return set(group)


def return_set_len(group):
    return len(set(group))


def calc_continue_day(group):
Ejemplo n.º 50
0
def plot_mc_roc(y_test, y_score, interpreter=None):
    '''
    Plotting function that generates ROC curves for the data given to it.

    :param y_test: the ground-truth labels of the test data
    :param y_score: the predicted labels/scores for the test data
    :param interpreter: the encoder that was used to preprocess the labels
    :return: a ROC plot figure
    '''
    lw = 2
    n_classes = len(np.unique(y_test))
    classes = pd.unique(y_test)
    label_binarizer = LabelBinarizer()
    label_binarizer.fit(np.concatenate((y_test, y_score)))
    if n_classes != 2:
        y_test = label_binarizer.transform(y_test)
        y_score = label_binarizer.transform(y_score)
    else:
        n_classes = 1
        y_test = y_test.reshape(-1, 1)
        y_score = y_score.reshape(-1, 1)

    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

    # Then interpolate all ROC curves at these points
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += interp(all_fpr, fpr[i], tpr[i])

    # Finally average it and compute AUC
    mean_tpr /= n_classes

    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = sklearn.metrics.auc(fpr["macro"], tpr["macro"])

    # Plot all ROC curves
    img = plt.figure()
    plt.plot(fpr["micro"], tpr["micro"],
             label='micro-average ROC curve (area = {0:0.2f})'
                   ''.format(roc_auc["micro"]),
             color='deeppink', linestyle=':', linewidth=4)

    plt.plot(fpr["macro"], tpr["macro"],
             label='macro-average ROC curve (area = {0:0.2f})'
                   ''.format(roc_auc["macro"]),
             color='navy', linestyle=':', linewidth=4)

    for i in range(n_classes):
        plt.plot(
            fpr[i],
            tpr[i],
            lw=lw,
            label='ROC curve of class {0} (area = {1:0.2f})'
                ''.format(
                interpreter.inverse_transform(
                [[label_binarizer.classes_[i]]])[0],
                roc_auc[i]))

    plt.plot([0, 1], [0, 1], 'k--', lw=lw)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves')
    plt.legend(loc="lower right")
    return img
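
# Hypothetical usage sketch (assumed toy data, not from the original). It relies on the same
# imports plot_mc_roc itself needs (numpy as np, pandas as pd, matplotlib.pyplot as plt,
# roc_curve, auc, interp, sklearn); a LabelEncoder is passed as the 'interpreter' so class
# indices can be mapped back to their names in the legend.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_true_names = np.array(['bird', 'cat', 'dog'] * 20)
y_true = le.fit_transform(y_true_names)
y_pred = np.random.RandomState(0).permutation(y_true)

roc_figure = plot_mc_roc(y_true, y_pred, interpreter=le)
roc_figure.savefig('mc_roc_example.png')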
#     a = 0.1
#     b = 0.9
#     grayscale_min = 0
#     grayscale_max = 255
#     return a + ( ( (image_data - grayscale_min)*(b - a) )/( grayscale_max - grayscale_min ) )

if not is_features_normal:
    train_features = (train_features / 255.0 * 0.99) + 0.01
    test_features = (test_features / 255.0 * 0.99) + 0.01
    is_features_normal = True
print('Tests Passed!')

if not is_labels_encod:  # one-hot encode the 10 letter classes (A, B, C, D, ...)
    # Turn labels into numbers and apply One-Hot Encoding
    encoder = LabelBinarizer()  # LabelBinarizer is the scikit-learn utility for binarizing labels
    encoder.fit(train_labels)
    train_labels = encoder.transform(train_labels)
    test_labels = encoder.transform(test_labels)

    # Change to float32, so it can be multiplied against the features in TensorFlow, which are float32
    train_labels = train_labels.astype(np.float32)
    test_labels = test_labels.astype(np.float32)
    is_labels_encod = True
#
print('Labels One-Hot Encoded')
# create the training and validation sets
train_features, valid_features, train_labels, valid_labels = train_test_split(
    train_features, train_labels, test_size=0.05, random_state=832289)

print('Training features and labels randomized and split.')
print(" X train shape: ", X_train.shape, "\n X test shape:", X_test.shape,
      " \n y train shape:", y_train.shape, "\n y test shape:", y_test.shape)

# Pipeline: it takes a list of tuples as parameter
pipeline_1 = Pipeline([('scaler', StandardScaler()),
                       ('clf', LogisticRegression())])
# use the pipeline object as you would
# a regular classifier
pipeline_1.fit(X_train, y_train)
y_preds = pipeline_1.predict(X_test)
print(y_preds)
ac_score = accuracy_score(y_test, y_preds)
print("Log reg: ", ac_score)

#Another  Pipeline: it takes a list of tuples as parameter
pipeline_2 = Pipeline([('scaler', StandardScaler()), ('clf', SVC())])
# use the pipeline object as you would
# a regular classifier
pipeline_2.fit(X_train, y_train)
y_preds = pipeline_2.predict(X_test)
print(y_preds)
ac_score = accuracy_score(y_test, y_preds)
print("SVM classifier: ", ac_score)

# Another Example
from sklearn.preprocessing import LabelBinarizer
bin = LabelBinarizer()  #first we initialize
vec = ['cat', 'dog', 'dog', 'dog']  #we have our label list we want binarized
bin.fit(vec)
print(bin.classes_)
print(bin.transform(vec))
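# Expected output: classes_ is ['cat' 'dog'] and, because there are only two classes,
# LabelBinarizer returns a single 0/1 column: [[0], [1], [1], [1]].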
Ejemplo n.º 53
0
def preprocess(sentences,
               targets,
               tokenizer=None,
               summary=False,
               labelizer=None,
               statssummary=False,
               pad=False,
               replace_dig=True):

    stats = {}

    vocab = set()
    for s in sentences:
        if replace_dig:
            s = re.sub(r'\d', 'DIG', s)
        vocab |= set(s.split())

    lengths = [len(s.split()) for s in sentences]

    stats['nb_sentences'] = len(sentences)
    stats['nb_words'] = len(vocab)
    stats['nb_classes'] = len(np.unique(targets))
    stats['class_cnt'] = dict(zip(*np.unique(targets, return_counts=True)))
    stats['max_len'] = max(lengths)
    stats['weights'] = compute_class_weight('balanced', np.unique(targets),
                                            targets)

    if summary:
        print(f'Total: {len(sentences)} sentences ')
        print(f'Max sentence length {max(lengths)}')
        print(f'Number of words: {len(vocab)}')
        print(f'Number of classes: {len(np.unique(targets))}')

    if tokenizer is None:
        tokenizer = Tokenizer(stats['nb_words'], oov_token='UNK')
        tokenizer.fit_on_texts(sentences)

        if not os.path.isdir('./out/'):
            os.mkdir('./out/')

        with open('./out/tokenizer.pickle', 'wb') as file:
            pickle.dump(tokenizer, file)

    X = tokenizer.texts_to_sequences(sentences)

    if pad:
        X = pad_sequences(X, maxlen=stats['max_len'])
    else:
        X = np.asarray([np.asarray(xx) for xx in X])

    if labelizer is None:
        labelizer = LabelBinarizer()

        labelizer.fit(targets)

        with open('./out/label_encoder.pickle', 'wb') as file:
            pickle.dump(labelizer, file)

    y = labelizer.transform(targets)

    return X, y, stats
Ejemplo n.º 54
0
    joined_pred = np.zeros(n_test, dtype=object)
    for row in range(n_test):
        joined_pred[row] = " ".join(pred[row, :])

    return joined_pred


if __name__ == "__main__":
    if len(sys.argv) < 3 or not sys.argv[1].endswith(".csv"):
        print("usage: %s <dest.csv> <file1.npz> <file2.npz> ..." % sys.argv[0])
        sys.exit()

    train_df = pd.read_csv("../data/train.csv", index_col="fname")
    label_binarizer = LabelBinarizer()
    label_binarizer.fit(train_df["label"])

    test_files = np.array(find_files("../data/audio_test/"))
    test_idx = [os.path.basename(f) for f in test_files]

    dest_filename = sys.argv[1]
    predict_count = len(sys.argv[2:])
    pred = np.zeros((len(test_idx), predict_count, NUM_CLASSES))

    for k, pred_name in enumerate(sys.argv[2:]):
        print("reading", pred_name)
        pred[:, k, :] = np.load(pred_name)["predict"]

    pred = merge_predictions(pred, "geom_mean", axis=1)
    print("predictions after final merge", pred.shape)
Ejemplo n.º 55
0
def eval_individual_device(train_data_file, dname, specified_models=None):
    global root_feature, root_model, root_output, dir_tsne_plots
    """
    Assumptions: the train_data_file contains only 1 device, all possible states (tags);
    the models can only be one of those implemented: knn, kmeans, dbscan, random forest classifier
    """
    warnings.simplefilter("ignore", category=DeprecationWarning)
    warnings.simplefilter("ignore", category=FutureWarning)
    """
    Skip trained models, return if there is no model to train. 
    """
    list_all_models = model_list
    if specified_models is not None:
        list_all_models = specified_models

    list_models_todo = []
    for model_alg in list_all_models:
        """
        Prepare the directories and add only models that have not been trained yet 
        """
        model_dir = '%s/%s' % (root_model, model_alg)
        model_file = '%s/%s%s.model' % (model_dir, dname, model_alg)
        if not os.path.exists(model_file) and os.path.exists(train_data_file):
            # check .model
            # check if training data set is available
            list_models_todo.append(model_alg)

    if len(list_models_todo) < 1:
        print('skip %s, all models trained for alg: %s' %
              (dname, str(list_all_models)))
        return

    print('Training %s using algorithm(s): %s' %
          (dname, str(list_models_todo)))
    train_data = pd.read_csv(train_data_file)

    num_data_points = len(train_data)
    if num_data_points < 1:
        print('  Not enough data points for %s' % dname)
        return

    print('\t#Total data points: %d ' % num_data_points)
    X_feature = train_data.drop(['device', 'state'], axis=1).fillna(-1)
    ss = StandardScaler()
    pca = PCA(n_components=20)
    X_std = ss.fit_transform(X_feature)
    # Create a PCA instance: pca
    X_std = pca.fit_transform(X_std)
    # Save components to a DataFrame
    X_std = pd.DataFrame(X_std)
    X_feature = X_std.iloc[:, :4]
    y_labels = np.array(train_data.state)
    # y_labels, example: on, off, change_color
    """
    Split data set into train & test, default fraction is 30% test
    """
    X_train, X_test, y_train, y_test = train_test_split(X_feature,
                                                        y_labels,
                                                        test_size=.3,
                                                        random_state=42)
    print('Train: %s' % len(X_train))
    print('Test: %s' % len(X_test))

    num_lables = len(set(y_labels))
    if num_lables < 2:
        print('\tNot enough labels for %s' % dname)
        return
    """
    One hot encoding y labels
    """
    lb = LabelBinarizer()
    lb.fit(y_labels)  # collect all possible labels
    y_train_bin = lb.transform(y_train)
    y_test_bin = lb.transform(y_test)
    y_test_bin_1d = np.argmax(y_test_bin, axis=1)
    """
    Train through the list of interested ML algorithms
    """
    ret_results = []
    for model_alg in list_models_todo:
        model_dir = '%s/%s' % (root_model, model_alg)
        if not os.path.exists(model_dir):
            os.system('mkdir -pv %s' % model_dir)
        model_file = f'{model_dir}/{dname}{model_alg}.model'
        label_file = '%s/%s.label.txt' % (model_dir, dname)
        single_outfile = '%s/%s.result.csv' % (model_dir, dname)
        output_file = '%s/result_%s.txt' % (root_output, model_alg)
        _acc_score = -1
        _noise = -1
        _silhouette = -1
        """
        Two steps
            1. Train (70%)
            2. Test 
            3. Evaluate 
        """
        if model_alg == 'knn':
            print('  knn: n_neighbors=%s' % num_lables)
            trained_model = KNeighborsClassifier(n_neighbors=num_lables)
            trained_model.fit(X_train, y_train_bin)

            y_predicted = trained_model.predict(X_test)
            y_predicted_1d = np.argmax(y_predicted, axis=1)
            if len(set(y_predicted_1d)) > 1:
                _silhouette = silhouette_score(X_test, y_predicted_1d)

        elif model_alg == 'kmeans':
            print('  kmeans: n_clusters=%s' % num_lables)
            trained_model = MiniBatchKMeans(n_clusters=num_lables,
                                            random_state=0,
                                            batch_size=6)
            trained_model.fit(X_train)

            y_predicted_1d = trained_model.predict(X_test).round()
            if len(set(y_predicted_1d)) > 1:
                _silhouette = silhouette_score(X_test, y_predicted_1d)

        elif model_alg == 'spectral':
            print('  Spectral Clustering: n_clusters=%s' % num_lables)
            trained_model = SpectralClustering(n_clusters=num_lables,
                                               affinity='nearest_neighbors',
                                               random_state=0)
            trained_model.fit(X_train)

            y_predicted_1d = trained_model.fit_predict(X_test).round()
            if len(set(y_predicted_1d)) > 1:
                _silhouette = silhouette_score(X_test, y_predicted_1d)

        elif model_alg == 'dbscan':
            eps = 200
            print('  dbscan: eps=%s' % eps)
            trained_model = DBSCAN(eps=eps, min_samples=5)
            trained_model.fit(X_train)
            y_predicted_1d = trained_model.fit_predict(X_test).round()
            if len(set(y_predicted_1d)) > 1:
                _silhouette = silhouette_score(X_test, y_predicted_1d)
            _noise = list(y_predicted_1d).count(-1) * 1. / num_data_points

        elif model_alg == 'rf':
            trained_model = RandomForestClassifier(n_estimators=1000,
                                                   random_state=42)
            trained_model.fit(X_train, y_train_bin)
            y_predicted = trained_model.predict(X_test).round()
            # print(y_predicted)
            if y_predicted.ndim == 1:
                y_predicted_1d = y_predicted
            else:
                y_predicted_1d = np.argmax(y_predicted, axis=1)

        _acc_score = accuracy_score(y_test_bin_1d, y_predicted_1d)
        """
        Eval clustering based metrics
        """

        _homogeneity = -1
        _complete = -1
        _vmeasure = -1
        _ari = -1
        _f1_score = -1
        if model_alg not in ['rf']:
            """
            Metrics for clustering algorithms 
            """
            # print('y_test_bin: %s' % y_test_bin_1d)
            # print('y_predicted_1d: %s' % y_predicted_1d)
            _homogeneity = homogeneity_score(y_test_bin_1d, y_predicted_1d)
            _complete = completeness_score(y_test_bin_1d, y_predicted_1d)
            _vmeasure = v_measure_score(y_test_bin_1d, y_predicted_1d)
            _ari = adjusted_rand_score(y_test_bin_1d, y_predicted_1d)
        """
        Plot tSNE graph
        """
        figfile = '%s/%s/%s-%s.png' % (root_model, model_alg, model_alg, dname)
        pp = 30  # perplexity
        if num_data_points > 200:
            pp = 50
        tsne_plot(X_feature, y_labels, figfile, pp)
        """
        Save the model 
        """
        model_dictionary = dict({
            'standard_scaler': ss,
            'pca': pca,
            'trained_model': trained_model
        })
        pickle.dump(model_dictionary, open(model_file, 'wb'))
        """
        Save the label for onehot encoding 
        """
        # unique_labels = label_encoder.classes_.tolist()
        unique_labels = lb.classes_.tolist()
        open(label_file, 'w').write('%s\n' % '\n'.join(unique_labels))
        """
        Save eval results
        """
        # TODO: due to the multi-thread, needs to change the settings
        with open(single_outfile, 'a+') as off:
            off.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %
                      (dname, _acc_score, _homogeneity, _complete, _vmeasure,
                       _ari, _noise, _silhouette))
            # y_test_bin_1d, y_predicted_1d
            off.write('%s\n' % ','.join(map(str, y_test_bin_1d)))
            off.write('%s\n' % ','.join(map(str, y_predicted_1d)))

        ret_results.append([
            output_file, dname, _acc_score, _homogeneity, _complete, _vmeasure,
            _ari, _noise, _silhouette
        ])
        """
        Print to Terminal 
        """
        print('    model -> %s' % model_file)
        print('    labels -> %s' % label_file)
        print('\t' + '\n\t'.join(unique_labels) + '\n')
        if model_alg not in ['rf']:
            print('    _homogeneity: %.3f' % _homogeneity)
            print('    _completeness: %.3f' % _complete)
            print('    _vmeasure: %.3f' % _vmeasure)
            print('    _ari: %.3f' % _ari)
            print('    _silhouette: %.3f' % _silhouette)
        print('    _acc_score: %.3f' % _acc_score)
        print('    measures saved to: %s' % single_outfile)
    return ret_results
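
# Hypothetical call (path, device name and model list are illustrative assumptions):
# eval_individual_device('features/light-bulb.train.csv', 'light-bulb', specified_models=['rf'])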
Ejemplo n.º 56
0
    return model


if __name__ == '__main__':
    # Parameters
    batch_size = 32
    epochs = 50
    k = 5
    n_models = 3

    x = np.load('../input/X_train_kaggle.npy')
    y_df = pd.read_csv('../input/y_train_final_kaggle.csv')
    y_labels = y_df['Surface'].values

    label_binarizer = LabelBinarizer()
    label_binarizer.fit(y_labels)

    scalers = {}
    for i in range(x.shape[1]):
        scalers[i] = StandardScaler()
        x[:, i, :] = scalers[i].fit_transform(x[:, i, :])

    x = np.expand_dims(x, axis=-1)
    input_shape = x.shape[1::]

    skf = StratifiedKFold(n_splits=k)

    cvscores = np.zeros((k, n_models))
    train_cvscores = np.zeros((k, n_models))

    k_index = 0
Ejemplo n.º 57
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', type=str, default='./', help='Directory for input data')
    parser.add_argument('--learning_rate', type=float, default=0.1, help='Learning rate')
    parser.add_argument('--batch_size', type=int, default=50, help='Batch size')
    parser.add_argument('--training_epochs', type=int, default=500, help='Number of epochs')
    parser.add_argument('--n_distribution', type=int, default=3, help='Number of distributions')
    FLAGS, unparsed = parser.parse_known_args()
    # print([sys.argv[0]] + unparsed)
    path_dir = FLAGS.data_dir

    # Parameters
    learning_rate = FLAGS.learning_rate
    batch_size = FLAGS.batch_size
    training_epochs = FLAGS.training_epochs

    # Network Parameters
    n_distribution = FLAGS.n_distribution

    data = read_data(os.path.join(path_dir, '_data.txt'))
    data, minx, maxx = scaler_range(data, feature_range=(-1, 1))

    labels = read_data(os.path.join(path_dir, '_labels.txt'))
    lb = LabelBinarizer()
    lb.fit(labels)

    class_name = lb.classes_
    n_class = class_name.shape[0]
    if n_class == 2:
        # add a dummy class so LabelBinarizer yields one column per class even in the binary case
        lb.fit(np.append(labels, np.max(class_name) + 1))

    n_features = data.shape[1]
    num_hidden_1 = int(0.5 * n_features)
    num_hidden_2 = num_hidden_1
    num_hidden_3 = num_hidden_1
    num_hidden_4 = num_hidden_1
    num_hidden_5 = n_class

    imp = SimpleImputer(missing_values=np.nan, strategy='mean')

    complete_data = imp.fit_transform(data)
    gmm = GaussianMixture(n_components=n_distribution, covariance_type='diag').fit(complete_data)
    del complete_data, imp

    gmm_weights = np.log(gmm.weights_.reshape((-1, 1)))
    gmm_means = gmm.means_
    gmm_covariances = gmm.covariances_
    del gmm

    acc = np.zeros((3, 5))

    time_train = np.zeros(5)
    time_test = np.zeros(5)

    skf = StratifiedKFold(n_splits=5)
    id_acc = 0
    for trn_index, test_index in skf.split(data, labels):
        X_train = data[trn_index]
        X_lab = labels[trn_index]
        train_index, valid_index = next(StratifiedKFold(n_splits=5).split(X_train, X_lab))

        train_x = X_train[train_index, :]
        valid_x = X_train[valid_index, :]
        test_x = data[test_index, :]

        train_y = lb.transform(X_lab[train_index])
        valid_y = lb.transform(X_lab[valid_index])
        test_y = lb.transform(labels[test_index])
        if n_class == 2:
            train_y = train_y[:, :-1]
            valid_y = valid_y[:, :-1]
            test_y = test_y[:, :-1]

        with tf.Graph().as_default() as graph:

            initializer = tf.contrib.layers.variance_scaling_initializer()

            weights = {
                'h1': tf.Variable(initializer([n_features, num_hidden_1])),
                'h2': tf.Variable(initializer([num_hidden_1, num_hidden_2])),
                'h3': tf.Variable(initializer([num_hidden_2, num_hidden_3])),
                'h4': tf.Variable(initializer([num_hidden_3, num_hidden_4])),
                'h5': tf.Variable(initializer([num_hidden_4, num_hidden_5])),
            }
            biases = {
                'b1': tf.Variable(tf.random_normal([num_hidden_1])),
                'b2': tf.Variable(tf.random_normal([num_hidden_2])),
                'b3': tf.Variable(tf.random_normal([num_hidden_3])),
                'b4': tf.Variable(tf.random_normal([num_hidden_4])),
                'b5': tf.Variable(tf.random_normal([num_hidden_5])),
            }

            # Symbols
            z = tf.placeholder(shape=[None, n_features], dtype=tf.float32)
            y = tf.placeholder(shape=[None, n_class], dtype=tf.float32)

            p = tf.Variable(initial_value=gmm_weights, dtype=tf.float32)
            means = tf.Variable(initial_value=gmm_means, dtype=tf.float32)
            covs = tf.Variable(initial_value=gmm_covariances, dtype=tf.float32)

            gamma = tf.Variable(initial_value=tf.random_normal(shape=(1,), mean=2, stddev=1.), dtype=tf.float32)

            # Construct model
            predict = multilayer_perceptron(z, means, covs, p, gamma, n_distribution, weights, biases)

            y_true = prep_labels(z, y)

            # Mean squared error
            cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=predict, labels=y_true))

            l_r = learning_rate
            # Gradient descent
            optimizer = tf.train.GradientDescentOptimizer(l_r).minimize(cost)

            # Initialize the variables (i.e. assign their default value)
            init = tf.global_variables_initializer()

            nr_epoch = 10

            val_weights = None
            val_biases = None
            val_p = None
            val_means = None
            val_covs = None
            val_gamma = None

            with tf.Session(graph=graph) as sess:
                sess.run(init)

                min_cost = np.inf
                n_cost_up = 0

                prev_train_cost = np.inf

                time_train[id_acc] = time()

                epoch = 0
                # Training cycle
                for epoch in range(training_epochs):
                    # print("\r[{}|{}] Step: {:d} from 5".format(epoch + 1, training_epochs, id_acc), end="")
                    # sys.stdout.flush()

                    curr_train_cost = []
                    for batch_idx in range(0, train_y.shape[0], batch_size):
                        x_batch = train_x[batch_idx:batch_idx + batch_size, :]
                        y_batch = train_y[batch_idx:batch_idx + batch_size, :]

                        temp_train_cost, _ = sess.run([cost, optimizer], feed_dict={z: x_batch, y: y_batch})
                        curr_train_cost.append(temp_train_cost)

                    curr_train_cost = np.asarray(curr_train_cost).mean()

                    if epoch > nr_epoch and (prev_train_cost - curr_train_cost) < 1e-4 < l_r:
                        l_r = l_r / 2.
                        optimizer = tf.train.GradientDescentOptimizer(l_r).minimize(cost)

                    prev_train_cost = curr_train_cost

                    curr_cost = []
                    for batch_idx in range(0, valid_y.shape[0], batch_size):
                        x_batch = valid_x[batch_idx:batch_idx + batch_size, :]
                        y_batch = valid_y[batch_idx:batch_idx + batch_size, :]
                        curr_cost.append(sess.run(cost, feed_dict={z: x_batch, y: y_batch}))

                    curr_cost = np.asarray(curr_cost).mean()

                    if min_cost > curr_cost:
                        min_cost = curr_cost
                        n_cost_up = 0

                        val_weights = {
                            'h1': weights['h1'].eval(),
                            'h2': weights['h2'].eval(),
                            'h3': weights['h3'].eval(),
                            'h4': weights['h4'].eval(),
                            'h5': weights['h5'].eval(),
                        }
                        val_biases = {
                            'b1': biases['b1'].eval(),
                            'b2': biases['b2'].eval(),
                            'b3': biases['b3'].eval(),
                            'b4': biases['b4'].eval(),
                            'b5': biases['b5'].eval(),
                        }

                        val_p = p.eval()
                        val_means = means.eval()
                        val_covs = covs.eval()
                        val_gamma = gamma.eval()
                    elif epoch > nr_epoch:
                        n_cost_up = n_cost_up + 1

                    if n_cost_up == 5 and 1e-4 < l_r:
                        l_r = l_r / 2.
                        optimizer = tf.train.GradientDescentOptimizer(l_r).minimize(cost)
                    elif n_cost_up == 10:
                        break

                time_train[id_acc] = (time() - time_train[id_acc]) / (epoch + 1)

                means.load(val_means)
                covs.load(val_covs)
                p.load(val_p)
                gamma.load(val_gamma)
                for key in weights.keys():
                    weights[key].load(val_weights[key])
                for key in biases.keys():
                    biases[key].load(val_biases[key])

                correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(predict, 1))
                accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

                train_accuracy = []
                for batch_idx in range(0, train_y.shape[0], batch_size):
                    x_batch = train_x[batch_idx:batch_idx + batch_size, :]
                    y_batch = train_y[batch_idx:batch_idx + batch_size, :]

                    train_accuracy.append(accuracy.eval({z: x_batch, y: y_batch}))
                train_accuracy = np.mean(train_accuracy)

                valid_accuracy = []
                for batch_idx in range(0, valid_y.shape[0], batch_size):
                    x_batch = valid_x[batch_idx:batch_idx + batch_size, :]
                    y_batch = valid_y[batch_idx:batch_idx + batch_size, :]

                    valid_accuracy.append(accuracy.eval({z: x_batch, y: y_batch}))
                valid_accuracy = np.mean(valid_accuracy)

                time_test[id_acc] = time()
                test_accuracy = []
                for batch_idx in range(0, test_y.shape[0], batch_size):
                    x_batch = test_x[batch_idx:batch_idx + batch_size, :]
                    y_batch = test_y[batch_idx:batch_idx + batch_size, :]
                    test_accuracy.append(accuracy.eval({z: x_batch, y: y_batch}))
                test_accuracy = np.mean(test_accuracy)
                time_test[id_acc] = time() - time_test[id_acc]

                acc[0, id_acc] = train_accuracy
                acc[1, id_acc] = valid_accuracy
                acc[2, id_acc] = test_accuracy
                id_acc = id_acc + 1

    mean_acc = np.average(acc, axis=1)
    std_acc = np.std(acc, axis=1)
    sys.stdout.flush()

    print(
        "{:.4f};{:.4f};{:.4f};{:.4f};{:.4f};{:.4f};{:.4f};{:.4f};{};{};{};{}".format(
            mean_acc[0], std_acc[0], mean_acc[1], std_acc[1], mean_acc[2], std_acc[2], np.average(time_train),
            np.average(time_test), FLAGS.learning_rate, FLAGS.batch_size, FLAGS.training_epochs, FLAGS.n_distribution))
class ImageSequenceGenerator:
    def __init__(self):
        self.lb = LabelBinarizer()
        labels = set(['BOO', 'BLO', 'BOR', 'BLR', 'OLR', 'OLO', 'OOR', 'OOO'])
        self.lb.fit(list(labels))

    # label_type: "binary" or "categorical"
    def png_image_generator(self,
                            path,
                            bs,
                            folder_list,
                            difficulty="All",
                            sequence_limit=16,
                            resize_dimension=128,
                            label_type="categorical",
                            aug=None):
        f = open("{0}/{1}.txt".format(path, difficulty))

        while True:
            X_data = []
            Y_data = []

            for _ in range(len(folder_list)):
                # pick a folder at random; the label is encoded in the folder name
                folder = random.choice(folder_list)

                label = folder.split("_")[-2]

                folder += '/difference'
                images = [
                    folder + "/" + f for f in os.listdir(folder)
                    if os.path.isfile(os.path.join(folder, f))
                ]

                img_list = []  # np.empty((16, 227, 227, 3)) # images from all the sequences
                flow_path_list = []
                warped_path_list = []
                diff_path_list = []

                # pick one random window of `sequence_limit` consecutive frames from this folder
                # (e.g. with 20 images and a limit of 16, the window may start at frame 0, 1, 2, 3 or 4)
                rnd = random.randint(0, max(len(images) - sequence_limit, 0))
                each = images[rnd:rnd + sequence_limit]
                img_seq_list = []  # frames of this single window; final shape (sequence_limit, resize_dimension, resize_dimension, 3)
                one_images_seq = np.array(each)

                # read each image to numpy sequence
                for img in one_images_seq:
                    img_load = load_img(img,
                                        target_size=(resize_dimension,
                                                     resize_dimension))
                    img_array = img_to_array(img_load)
                    img_seq_list.append(img_array)

                X_data.append(np.asarray(img_seq_list))
                Y_data.append(label)  # label parsed from the folder name
                if len(X_data) == bs:
                    if label_type == "categorical":
                        Y_data = self.lb.transform(np.array(Y_data))
                    elif label_type == "binary":
                        Y_data = labels_to_binary(Y_data)
                    else:
                        raise ValueError('label_type must be "binary" or "categorical"')
                    yield np.asarray(X_data), np.asarray(Y_data)
                    X_data = []
                    Y_data = []
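
# Minimal usage sketch for ImageSequenceGenerator above (everything below is
# hypothetical: the folder names, the batch size and the assumed directory layout
# <path>/<run>_<label>_<id>/difference/*.png plus a <path>/<difficulty>.txt index file):
#
#   gen = ImageSequenceGenerator()
#   folders = ["data/run_BOO_001", "data/run_OLR_002"]
#   batches = gen.png_image_generator("data", bs=2, folder_list=folders,
#                                     sequence_limit=16, resize_dimension=128,
#                                     label_type="categorical")
#   X_batch, Y_batch = next(batches)   # X: (2, 16, 128, 128, 3), Y: (2, 8) one-hot rows
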
assert len(set(train_labels)) == len(set(test_labels)), (
    'Something went wrong. Some classes are only in train or test data.'
)  # yapf: disable

# convert the data and labels to NumPy arrays while scaling the pixel
# intensities to the range [0, 1]
# train_data = np.array(train_data) / 255.0
# test_data = np.array(test_data) / 255.0
train_labels_text = np.array(train_labels)
test_labels_text = np.array(test_labels)

num_classes = len(set(train_labels))

# perform one-hot encoding on the labels
lb = LabelBinarizer()
lb.fit(train_labels_text)

train_labels = lb.transform(train_labels_text)
test_labels = lb.transform(test_labels_text)

if num_classes == 2:
    train_labels = to_categorical(train_labels, num_classes=num_classes)
    test_labels = to_categorical(test_labels, num_classes=num_classes)
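
# Why the two-class branch above is needed: for a binary problem LabelBinarizer
# returns a single 0/1 column of shape (n_samples, 1) rather than a one-hot matrix,
# so to_categorical expands it to two columns. A small sketch with throwaway toy
# labels (it only reuses LabelBinarizer and to_categorical, which are used above):
_lb_demo = LabelBinarizer().fit(['neg', 'pos'])
print(_lb_demo.transform(['pos', 'neg']).shape)                                  # (2, 1)
print(to_categorical(_lb_demo.transform(['pos', 'neg']), num_classes=2).shape)   # (2, 2)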

trainX = np.stack(train_data)
trainY = np.stack(train_labels)
testX = np.stack(test_data)
testY = np.stack(test_labels)
print('Class mappings are:', lb.classes_)

print(trainX.shape, trainY.shape, testX.shape, testY.shape)
Ejemplo n.º 60
0
class SimClassifier(BaseSim, ClassifierMixin):

    """
    Sim classification.

    Parameters
    ----------
    reg_lambda : float, optional. default=0
        Sparsity strength

    reg_gamma : float or list of float, optional. default=1e-5
        Roughness penalty strength of the spline algorithm

    degree : int, optional. default=3
        The order of the spline

    knot_num : int, optional. default=5
        Number of knots

    random_state : int, optional. default=0
        Random seed
    """

    def __init__(self, reg_lambda=0, reg_gamma=1e-5, knot_num=5, degree=3, random_state=0):

        super(SimClassifier, self).__init__(reg_lambda=reg_lambda,
                                reg_gamma=reg_gamma,
                                knot_num=knot_num,
                                degree=degree,
                                random_state=random_state)

    def _validate_input(self, x, y):

        """method to validate data

        Parameters
        ---------
        x : array-like of shape (n_samples, n_features)
            containing the input dataset
        y : array-like of shape (n_samples,)
            containing target values
        """

        x, y = check_X_y(x, y, accept_sparse=["csr", "csc", "coo"],
                         multi_output=True)
        if y.ndim == 2 and y.shape[1] == 1:
            y = column_or_1d(y, warn=False)

        self._label_binarizer = LabelBinarizer()
        self._label_binarizer.fit(y)
        self.classes_ = self._label_binarizer.classes_

        y = self._label_binarizer.transform(y) * 1.0
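        # e.g. for binary targets y = ['neg', 'pos', 'neg'] the binarizer yields
        # [[0], [1], [0]], so the ravel() below returns array([0., 1., 0.])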
        return x, y.ravel()

    def _estimate_shape(self, x, y, xmin, xmax):

        """estimate the ridge function

        Parameters
        ---------
        x : array-like of shape (n_samples, n_features)
            containing the input dataset
        y : array-like of shape (n_samples,)
            containing the output dataset
        xmin : float
            the minimum value of the projection beta^T x
        xmax : float
            the maximum value of the projection beta^T x
        """

        self.shape_fit_ = SMSplineClassifier(knot_num=self.knot_num, reg_gamma=self.reg_gamma,
                                xmin=xmin, xmax=xmax, degree=self.degree)
        self.shape_fit_.fit(x, y)

    def predict_proba(self, x):

        """output probability prediction for given samples

        Parameters
        ---------
        x : array-like of shape (n_samples, n_features)
            containing the input dataset
        Returns
        -------
        np.array of shape (n_samples, 2)
            containing probability prediction
        """

        pred = self.decision_function(x)
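        # softmax over [-d / 2, d / 2] equals [1 - sigmoid(d), sigmoid(d)], i.e. the usual
        # logistic probabilities for the negative and positive class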
        pred_proba = softmax(np.vstack([-pred, pred]).T / 2, copy=False)
        return pred_proba

    def predict(self, x):

        """output binary prediction for given samples

        Parameters
        ---------
        x : array-like of shape (n_samples, n_features)
            containing the input dataset
        Returns
        -------
        np.array of shape (n_samples,)
            containing binary prediction
        """

        pred_proba = self.predict_proba(x)[:, 1]
        return self._label_binarizer.inverse_transform(pred_proba)
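
# Minimal usage sketch for SimClassifier (hypothetical data; it assumes BaseSim
# provides the usual scikit-learn style fit(), which is not shown in this snippet):
#
#   import numpy as np
#   x = np.random.uniform(-1, 1, size=(200, 5))
#   y = (x[:, 0] + 0.5 * x[:, 1] > 0).astype(int)
#   clf = SimClassifier(reg_lambda=0.1, knot_num=10)
#   clf.fit(x, y)
#   proba = clf.predict_proba(x)   # shape (200, 2), columns P(class 0), P(class 1)
#   labels = clf.predict(x)        # original labels recovered via inverse_transform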