def Encoding(data, general_matrix=None):
    encoder = LabelBinarizer()
    count = 0
    # encoding
    for i in range(data.shape[1]):
        if type(data[0, i]) == str:
            count += 1
            col = data[:, i]
            unique = np.unique(col if general_matrix is None else general_matrix[:, i])

            try:
                encoder.fit(unique)
            except:
                pass

            new_col = encoder.transform(col)

            # split at i and i + 1
            before, removed, after = np.hsplit(data, [i, i + 1])
            # concatenate
            data = np.concatenate((before, new_col, after), axis=1)
            if general_matrix is not None:
                before, removed, after = np.hsplit(general_matrix, [i, i + 1])
                general_matrix = np.concatenate((before, encoder.transform(general_matrix[:, i]), after), axis=1)

    print "count : %d" % count
    # return data
    return data
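
# A minimal usage sketch for Encoding above (assumption: `data` is an object-dtype
# 2-D array whose string-valued columns should be replaced by one-hot columns; the
# same matrix is also passed as `general_matrix` so the binarizer sees every class).
import numpy as np

data = np.array([[1.0, 'red'],
                 [2.0, 'blue'],
                 [3.0, 'green']], dtype=object)
encoded = Encoding(data.copy(), general_matrix=data.copy())
print(encoded.shape)  # (3, 4): the string column became three 0/1 columns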
Example #2
def one_hot_encoding(y_train, y_test):
    labelBinarizer = LabelBinarizer()
    labelBinarizer.fit(y_train)

    y_train_one_hot = labelBinarizer.transform(y_train)
    y_test_one_hot = labelBinarizer.transform(y_test)
    return y_train_one_hot, y_test_one_hot
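
# A short usage sketch for one_hot_encoding above (assumption: every label in
# y_test also appears in y_train, so the fitted binarizer covers all classes).
import numpy as np

y_train = np.array(['cat', 'dog', 'bird', 'dog'])
y_test = np.array(['bird', 'cat'])
y_train_oh, y_test_oh = one_hot_encoding(y_train, y_test)
print(y_train_oh.shape, y_test_oh.shape)  # (4, 3) (2, 3): one column per class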
def train():
    tr, va, te = read_dataset('../mnist.pkl.gz')
    binarizer = LabelBinarizer().fit(range(10))

    x = tf.placeholder(tf.float32, [None, 784])
    y = tf.placeholder(tf.float32, [None, 10])
    keep_prob = tf.placeholder(tf.float32)
    preds = model.inference(x, keep_prob)
    loss, total_loss = model.loss(preds, y)
    acc = model.evaluation(preds, y)
    # learning rate: 0.1
    train_op = model.training(total_loss, 0.1)

    init = tf.initialize_all_variables()
    sess = tf.Session()
    sess.run(init)
    for i in xrange(10000):
        batch_xs, batch_ys = tr.next_batch(50)
        if i % 100 == 0:
            train_acc = acc.eval(feed_dict={
                x:batch_xs, y:binarizer.transform(batch_ys),
                keep_prob: 1.0}, session=sess)
            print "step: {0}, training accuracy {1}".format(i, train_acc)
            validation_accuracy = getAccuracy(x, y, keep_prob, binarizer, acc, va, sess)
            print("Validation accuracy : {0}".format(validation_accuracy))
        train_op.run(feed_dict={
            x:batch_xs, y:binarizer.transform(batch_ys), keep_prob: 0.5},
                     session=sess)

    test_accuracy = getAccuracy(x, y, keep_prob, binarizer, acc, te, sess)
    print("Test accuracy : ", test_accuracy)
Example #4
class NN_Classifier(NNBase):

  def __init__(self,layers = [], lr=0.01, epochs=None, noisy=None, verbose=False):
    
    super(NN_Classifier, self).__init__(layers=layers, lr=lr, epochs=epochs, noisy=noisy, verbose=verbose)
    self.type = 'C'
    self.error_func = CrossEntropyError
    self.accuracy_score = AccuracyScore
    self.label_binarizer = LabelBinarizer()

  def predict(self, X):
    predictions = []
    for el in X:
      current_prediction = NNBase._predict(self, row(el))
      predictions.append(current_prediction)
    predictions = np.vstack(predictions)
    current_results = coalesce(predictions)
    return self.label_binarizer.inverse_transform(current_results)

  def predict_proba(self, X):
    predictions = []
    for el in X:
      current_prediction = NNBase._predict(self, row(el))
      predictions.append(current_prediction)
    predictions = np.vstack(predictions)
    return predictions

  def fit(self, X, T):
    T_impl = self.label_binarizer.fit_transform(T)
    if not self.epochs:
      self.epochs = 1

    for num in xrange(self.epochs):
      if self.verbose:
        print "Epoch: %d" % num
      for i in xrange(len(X)):
        NNBase._update(self, row(X[i]), row(T_impl[i]))

  def error(self, X, T):
    T_impl = self.label_binarizer.transform(T)
    Y = self.predict_proba(X)
    return self.error_func.func(Y, T_impl)

  def score(self, X, T):
    Y = self.predict(X)
    return self.accuracy_score.func(Y,T)

  def analytical_gradient(self, X, T):
    T_impl = self.label_binarizer.transform(T)
    return NNBase._analytical_gradient(self, X, T_impl)

  def numerical_gradient(self, X, T):
    T_impl = self.label_binarizer.transform(T)
    return NNBase._numerical_gradient(self, X, T_impl)
def partb():
    def load(file_name):
        file = np.load(file_name)
        X_train =file['X_train'].T
        y_train =file['y_train']
        X_test =file['X_test'].T
        y_test =file['y_test']
        X_cv =file['X_cv'].T
        y_cv =file['y_cv']

        return X_train,y_train,X_cv,y_cv,X_test,y_test

    train_ = [0,0]
    test_ = [0,0]
    overall = []
    for i in range(14):

        X_train,y_train,X_cv,y_cv,X_test,y_test = load('pofa{}.npz'.format(i))

        from sklearn.preprocessing import LabelBinarizer
        binarizer = LabelBinarizer()
        binarizer.fit(y_train)
        Y_train = binarizer.transform(y_train).T
        Y_cv = binarizer.transform(y_cv).T


#nn.forward(X)
#nn.backprop(X,Y,gradient_check=True)

        print(X_train.shape[0], Y_train.shape[0])
        nn = NeuralNetwork([X_train.shape[0],30,Y_train.shape[0]], functions=[sigmoid,softmax], derivatives=[derivative_sigmoid])

        nn.fit(X_train,Y_train,eta=0.01,momentum=0.5,minibatch=16,regularizer=0.15,max_iter=200,gradient_check=False,cv = (X_cv,Y_cv),graphs=False, lbfgs=False)

        output = nn.forward(X_train)

        y_train_output = binarizer.inverse_transform(output.T)
        y_test_output = binarizer.inverse_transform(nn.forward(X_test).T)
        print("Iteration: ",i)
        print((y_train_output==y_train).mean())
        print((y_test_output ==y_test).mean())

        overall.append((y_test == y_test_output).mean())

        train_[0] += (y_train_output==y_train).sum()
        train_[1] += y_train.shape[0]
        test_[0] += (y_test_output==y_test).sum()
        test_[1] += y_test.shape[0]

    print("Average train accuracy: ", train_[0]/train_[1],"Average test accuracy: ",test_[0]/test_[1])
    print(train_,test_)
    overall = np.array(overall)
    print(overall.mean())
Example #6
    def load_dataset(self):
        X, y, X_test, y_test = dataset = snippet_reader.toNumpy()

        lb = LabelBinarizer()
        lb.fit(y)

        for y_bin in lb.transform(y).T:
            y = y_bin
            break

        for y_bin in lb.transform(y_test).T:
            y_test = y_bin
            break

        return X, y, X_test, y_test
Example #7
def our_classification_report(y_true, y_pred):
    """
    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.
    
    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!
    """
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))

    # print "Y_true combined", y_true_combined
    # print "Y_pred combined", y_pred_combined
        
    tagset = set(lb.classes_)
    # print "tagset: ", tagset
    tagset = sorted(tagset)
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
    
    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels = [class_indices[cls] for cls in tagset],
        target_names = tagset
    )
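
# A hypothetical call with toy BIO-tagged sequences (chain, LabelBinarizer and
# classification_report are assumed to be imported at module level, as the
# function above expects).
y_true = [['B-PER', 'I-PER', 'O'], ['B-LOC', 'O']]
y_pred = [['B-PER', 'O', 'O'], ['B-LOC', 'O']]
print(our_classification_report(y_true, y_pred))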
Example #8
def get_abalone19():
    """Loads abalone dataset, maps gender feature to binary features, adds
    new label to create abalone19 imbalanced binary classification dataset."""
    raw_data = pd.read_csv(ABALONE_FILE, sep=',')
    genders = list(raw_data.ix[:, 'gender'])
    cts_data = raw_data.drop(labels='gender', axis=1)

    # initialize & fit preprocesser
    lbz = LabelBinarizer()
    lbz.fit(genders)

    # encode categorical var
    encoded_genders = pd.DataFrame(lbz.transform(genders))
    encoded_genders.columns = ['gender_' + k for k in lbz.classes_]

    # recombine encoded data & return
    new_data = pd.concat(objs=[encoded_genders, cts_data], axis=1)
    new_data['label'] = raw_data['rings'].map(
        lambda k: 1 if k > 10 else 0)               # binary clf task
    new_data = new_data.drop('rings', axis=1)

    # standardize cts features
    if STANDARDIZE:
        for col in new_data.ix[:, 3:-1]:
            mean = new_data[col].mean()
            std = new_data[col].std()
            new_data[col] = new_data[col].map(lambda k: (k - mean) / float(std))

    pos_recs = new_data['label'].sum()
    print 'total pos class pct = {} %\n'.format(
        round(100 * pos_recs / float(len(new_data)), 3))

    return new_data
Example #9
def test_normalize_option_multilabel_classification():
    # Test in the multilabel case
    n_classes = 4
    n_samples = 100
    _, y_true = make_multilabel_classification(n_features=1, n_classes=n_classes, random_state=0, n_samples=n_samples)
    _, y_pred = make_multilabel_classification(n_features=1, n_classes=n_classes, random_state=1, n_samples=n_samples)

    # Be sure to have at least one empty label
    y_true += ([],)
    y_pred += ([],)
    n_samples += 1

    lb = LabelBinarizer().fit([range(n_classes)])
    y_true_binary_indicator = lb.transform(y_true)
    y_pred_binary_indicator = lb.transform(y_pred)

    for name, metrics in METRICS_WITH_NORMALIZE_OPTION.items():
        # List of list of labels
        measure = metrics(y_true, y_pred, normalize=True)
        assert_greater(measure, 0, msg="We failed to test correctly the normalize option")
        assert_almost_equal(
            metrics(y_true, y_pred, normalize=False) / n_samples, measure, err_msg="Failed with %s" % name
        )

        # Indicator matrix format
        measure = metrics(y_true_binary_indicator, y_pred_binary_indicator, normalize=True)
        assert_greater(measure, 0, msg="We failed to test correctly the normalize option")
        assert_almost_equal(
            metrics(y_true_binary_indicator, y_pred_binary_indicator, normalize=False) / n_samples,
            measure,
            err_msg="Failed with %s" % name,
        )
Example #10
def bio_classification_report(y_true, y_pred):
    """
    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.

    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!

    Note: This function was copied from
    http://nbviewer.ipython.org/github/tpeng/python-crfsuite/blob/master/examples/CoNLL%202002.ipynb

    Args:
        y_true: True labels, list of strings
        y_pred: Predicted labels, list of strings
    Returns:
        classification report as string
    """
    lbin = LabelBinarizer()
    y_true_combined = lbin.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lbin.transform(list(chain.from_iterable(y_pred)))

    #tagset = set(lbin.classes_) - {NO_NE_LABEL}
    tagset = set(lbin.classes_)
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lbin.classes_)}

    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels=[class_indices[cls] for cls in tagset],
        target_names=tagset,
    )
def bio_classification_report(y_true, y_pred):
    """Evaluates entity extraction accuracy.

    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.
    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!
    Taken from https://github.com/scrapinghub/python-crfsuite/blob/master/examples/CoNLL%202002.ipynb
    """
    from sklearn.preprocessing import LabelBinarizer
    from itertools import chain
    from sklearn.metrics import classification_report

    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))

    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}

    return classification_report(
            y_true_combined,
            y_pred_combined,
            labels=[class_indices[cls] for cls in tagset],
            target_names=tagset,
    )
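
# Example call for the variant above; unlike our_classification_report it drops
# the "O" tag from the report, so y_true must contain at least one non-"O" label.
print(bio_classification_report([['B-ORG', 'I-ORG', 'O']],
                                [['B-ORG', 'O', 'O']]))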
class CategoricalToNumerical(object):

    def __init__(self, dimensionality_reducer=None, verify=True):
        """Takes in a dimensionality reducer in order to convert categorical features into numerical.
        """
        if dimensionality_reducer is None:
            dimensionality_reducer = RandomizedPCA(1)
        self.dimensionality_reducer = dimensionality_reducer
        self.verify = verify
        self.binarizer = LabelBinarizer()

    def fit(self, X, y=None):
        self._verify(X, self.verify)
        binarized = self.binarizer.fit_transform(X)
        self.dimensionality_reducer.fit(binarized)

    def transform(self, X):
        self._verify(X, False)
        binarized = self.binarizer.transform(X)
        result = self.dimensionality_reducer.transform(binarized).flatten()
        assert X.shape == result.shape
        return result

    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)

    def _verify(self, X, verify):
        if verify:
            assert is_categorical(X)
        else:
            assert isinstance(X, np.ndarray)
            assert len(X.shape) == 1
Example #13
def report(test_y, pred_y):
    lb = LabelBinarizer()
    test_y_combined = lb.fit_transform(list(chain.from_iterable(test_y)))
    pred_y_combined = lb.transform(list(chain.from_iterable(pred_y)))
    tagset = sorted(set(lb.classes_))
    class_indices = {cls: idx for idx, cls in enumerate(tagset)}
    print(classification_report(test_y_combined, pred_y_combined, labels=[class_indices[cls] for cls in tagset], target_names=tagset))
Example #14
class BusinessCategoriesFeature(BaseEstimator):
	"""
	WARNING!!!
	Works only with a modified version of LabelBinarizer.

	A binarization of the reviews' business categories.
	"""

	def __init__(self, data=None):
		self.data = data

	def __create_labels_list(self, review_list):
		labels = []
		for review in review_list:
			business = self.data.get_business_for_review(review)
			labels.append(business['categories'])
		return labels

	def fit(self, X, y):
		self.binarizer = LabelBinarizer()
		labels = self.__create_labels_list(X)
		self.binarizer.fit(labels)
		return self

	def transform(self, X):
		labels = self.__create_labels_list(X)
		binarized_labels = self.binarizer.transform(labels)
		return binarized_labels.astype(float)
Example #15
    def bio_classification_report(y_true, y_pred):
        """
        Classification report for a list of BIO-encoded sequences.
        It computes token-level metrics and discards "O" labels.

        Note that it requires scikit-learn 0.15+ (or a version from
        github master) to calculate averages properly!
        """
        lb = LabelBinarizer()
        y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
        y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))

        tagset = set(lb.classes_) - {'O'}
        tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
        class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}

        labs = [class_indices[cls] for cls in tagset]

        return((precision_recall_fscore_support(y_true_combined,
                                                y_pred_combined,
                                                labels=labs,
                                                average=None,
                                                sample_weight=None)),
               (classification_report(
                   y_true_combined,
                   y_pred_combined,
                   labels=[class_indices[cls] for cls in tagset],
                   target_names=tagset,
               )), labs)
Example #16
def logloss(act, pred):
    epsilon = 10 ** -15
    pred = np.maximum(np.minimum(pred, 1 - epsilon), epsilon)
    lb = LabelBinarizer()
    lb.fit(act)
    act_binary = lb.transform(act)
    logloss = - np.sum(np.multiply(act_binary, np.log(pred))) / pred.shape[0]
    return logloss
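
# A small sanity check for logloss above (assumption: the columns of `pred`
# follow the sorted order of the class labels in `act`).
import numpy as np

act = np.array([0, 1, 2])
pred = np.array([[0.8, 0.1, 0.1],
                 [0.2, 0.7, 0.1],
                 [0.1, 0.2, 0.7]])
print(logloss(act, pred))  # approximately 0.312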
Example #17
    def fit(self, Xt, yt, Xh, yh, callback=None):
        lbin = LabelBinarizer()
        lbin.fit(yt)
        Yt_multi = lbin.transform(yt)
        Yh_multi = lbin.transform(yh)
        sample_weight_train = np.ones(Xt.shape[0])
        sample_weight_test = np.ones(Xh.shape[0])


        if Yt_multi.shape[1] == 1:
            Yt_multi = np.hstack([1 - Yt_multi, Yt_multi])
            Yh_multi = np.hstack([1 - Yh_multi, Yh_multi])
            print('warning: only two classes detected')

        n_classes = Yt_multi.shape[1]
        n_features = Xt.shape[1]

        if self.alpha0 is None:
            self.alpha0 = np.zeros(n_classes * n_features)
        # if not np.all(np.unique(yt) == np.array([-1, 1])):
        #     raise ValueError
        x0 = np.zeros(n_features * n_classes)

        # assert x0.size == self.alpha0.size

        def h_func_grad(x, alpha):
            # x = x.reshape((-1,Yt_multi.shape[1]))
            return _multinomial_loss_grad(
                x, Xt, Yt_multi, np.exp(alpha), sample_weight_train)[:2]

        def h_hessian(x, alpha):
            # x = x.reshape((-1,Yt_multi.shape[1]))
            return _multinomial_grad_hess(
                x, Xt, Yt_multi, np.exp(alpha), sample_weight_train)[1]

        def g_func_grad(x, alpha):
            # x = x.reshape((-1,Yt_multi.shape[1]))
            return _multinomial_loss_grad(
                x, Xh, Yh_multi, np.zeros(alpha.size),
                sample_weight_test)[:2]

        def h_crossed(x, alpha):
            # return x.reshape((n_classes, -1)) * alpha
            # x = x.reshape((-1,Yt_multi.shape[1]))
            tmp = np.exp(alpha) * x
            return sparse.dia_matrix(
                (tmp, 0),
                shape=(n_features * n_classes, n_features * n_classes))

        opt = hoag_lbfgs(
            h_func_grad, h_hessian, h_crossed, g_func_grad, x0,
            callback=callback,
            tolerance_decrease=self.tolerance_decrease,
            lambda0=self.alpha0, maxiter=self.max_iter,
            verbose=self.verbose)

        self.coef_ = opt[0]
        self.alpha_ = opt[1]
        return self
Example #18
    def load_dataset2(self):
        X, y, X_test, y_test = dataset = snippet_reader.toNumpy()
        X, y = shuffle(X, y)

        lb = LabelBinarizer()
        lb.fit(y)

        for y_bin in lb.transform(y).T:
            return X, y_bin
Example #19
def X_train_generatetor_infinite(dim=128,maxlen=500,batch_size=128,name="X_train.csv",events=None):
    X_train = pd.read_csv(path+name)
    group_le = LabelEncoder()
    group_lb = LabelBinarizer()
    labels = group_le.fit_transform(X_train['group'].values)
    labels = group_lb.fit_transform(labels)
    del labels
    
    ##################
    #   Phone Brand
    ##################
    # print("# Read Phone Brand")
    phone_brand_device_model = pd.read_csv(path+'phone_brand_device_model.csv',
                    dtype={'device_id': np.str})
    phone_brand_device_model.drop_duplicates('device_id', keep='first', inplace=True)
    phone_brand_le = LabelEncoder()
    phone_brand_device_model['phone_brand'] = phone_brand_le.fit_transform(phone_brand_device_model['phone_brand'])

    device_model_le = LabelEncoder()
    phone_brand_device_model['device_model'] = device_model_le.fit_transform(phone_brand_device_model['device_model'])


    while 1:
        data = pd.read_csv(path+name,iterator=True,chunksize=batch_size,
                    dtype={'device_id': np.str})
        for X_train in data:
            X_train = pd.merge(X_train,phone_brand_device_model,how='left',on='device_id', left_index=True)
            phone_brand = X_train['phone_brand'].values
            device_model = X_train['device_model'].values


            X_train["app_lab"] = X_train["device_id"].map(events)
            y_train = X_train['group'].values
            
            X_train['gender'][X_train['gender']=='M']=1
            X_train['gender'][X_train['gender']=='F']=0

            y_train_gender = X_train['gender'].values
            y_train_age = X_train['age'].values
            # take log transformation
            y_train_age = np.log(y_train_age)

            X_train.fillna('0 ',inplace=True)
            y_train = group_le.transform(y_train)
            y_train = group_lb.transform(y_train)
            x_train = X_train["app_lab"].values
            x_train = [ x.split(' ') for x in  x_train]
            for i in range(len(x_train)):
                x_train[i] = [ np.int8(idx) for idx in x_train[i] if (idx!='nan' and idx!='')]

            x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
            
            x_train = [x_train,phone_brand,device_model]
            y_train = [y_train,y_train_gender,y_train_age]

            yield (x_train,y_train)
    def encode(self, data, label, value_set=None):
        le = LabelBinarizer()
        if value_set is None:
            encoded = le.fit_transform(data[label])
        else:
            le.fit(value_set)
            encoded = le.transform(data[label])
        for i in range(encoded.shape[1]):
            new_label = '{0}_is_{1}'.format(label, i)
            data[new_label] = encoded[:, i]
Example #21
class _CategoricalEncoder:
    """OneHotEncoder that can handle categorical variables."""

    def __init__(self):
        """Convert labeled categories into one-hot encoded features."""
        self._lb = LabelBinarizer()

    def fit(self, X):
        """Fit a list or array of categories.

        Parameters
        ----------
        * `X` [array-like, shape=(n_categories,)]:
            List of categories.
        """
        self.mapping_ = {v: i for i, v in enumerate(X)}
        self.inverse_mapping_ = {i: v for v, i in self.mapping_.items()}
        self._lb.fit([self.mapping_[v] for v in X])
        self.n_classes = len(self._lb.classes_)

        return self

    def transform(self, X):
        """Transform an array of categories to a one-hot encoded representation.

        Parameters
        ----------
        * `X` [array-like, shape=(n_samples,)]:
            List of categories.

        Returns
        -------
        * `Xt` [array-like, shape=(n_samples, n_categories)]:
            The one-hot encoded categories.
        """
        return self._lb.transform([self.mapping_[v] for v in X])

    def inverse_transform(self, Xt):
        """Inverse transform one-hot encoded categories back to their original
           representation.

        Parameters
        ----------
        * `Xt` [array-like, shape=(n_samples, n_categories)]:
            One-hot encoded categories.

        Returns
        -------
        * `X` [array-like, shape=(n_samples,)]:
            The original categories.
        """
        Xt = np.asarray(Xt)
        return [
            self.inverse_mapping_[i] for i in self._lb.inverse_transform(Xt)
        ]
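
# A minimal usage sketch for the encoder above.
enc = _CategoricalEncoder().fit(['red', 'green', 'blue'])
Xt = enc.transform(['green', 'blue', 'green'])
print(Xt.shape)                   # (3, 3): one one-hot row per sample
print(enc.inverse_transform(Xt))  # ['green', 'blue', 'green']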
def one_hot_encode(x):
    """
    One hot encode a list of sample labels. Return a one-hot encoded vector for each label.
    : x: List of sample Labels
    : return: Numpy array of one-hot encoded labels
    """
    # TODO: Implement Function
    labels=list(range(10))
    lb = LabelBinarizer()
    lb.fit(labels)
    return np.array(lb.transform(x))
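
# Quick check of one_hot_encode above (labels are assumed to be integers 0-9):
print(one_hot_encode([0, 7, 3]))
# [[1 0 0 0 0 0 0 0 0 0]
#  [0 0 0 0 0 0 0 1 0 0]
#  [0 0 0 1 0 0 0 0 0 0]]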
Example #23
def ndcg_score(ground_truth, predictions, k=5):
    lb = LabelBinarizer()
    lb.fit(range(len(predictions) + 1))
    T = lb.transform(ground_truth)
    scores = []
    for y_true, y_score in zip(T, predictions):
        actual = dcg_score(y_true, y_score, k)
        best = dcg_score(y_true, y_true, k)
        score = float(actual) / float(best)
        scores.append(score)
    return np.mean(scores)
Example #24
class PipelineLabelBinarizer(TransformerMixin):

    def __init__(self, *args, **kwargs):
        self.encoder = LabelBinarizer(*args, **kwargs)

    def fit(self, x, y=None):
        self.encoder.fit(x)
        return self

    def transform(self, x, y=None):
        return self.encoder.transform(x)
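
# A hedged usage sketch: the wrapper above gives LabelBinarizer the (X, y)
# fit/transform signature that sklearn Pipeline steps expect.
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ('binarize', PipelineLabelBinarizer()),
    ('clf', LogisticRegression()),
])
pipe.fit(['red', 'blue', 'red', 'green'], [1, 0, 1, 0])
print(pipe.predict(['blue', 'red']))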
Example #25
def test_averaging_multiclass(n_samples=50, n_classes=3):
    random_state = check_random_state(0)
    y_true = random_state.randint(0, n_classes, size=(n_samples,))
    y_pred = random_state.randint(0, n_classes, size=(n_samples,))
    y_score = random_state.uniform(size=(n_samples, n_classes))

    lb = LabelBinarizer().fit(y_true)
    y_true_binarize = lb.transform(y_true)
    y_pred_binarize = lb.transform(y_pred)

    for name in METRICS_WITH_AVERAGING:
        yield (check_averaging, name, y_true, y_true_binarize, y_pred, y_pred_binarize, y_score)
def small_word_conv(dataset_path):
    docs, y, test_docs, test_y = nli2013_train_test_split(dataset_path)

    logging.info('preprocessing, padding and binarizing data ...')
    docs = [flatten([sent.split() for sent in doc.split('\n') if sent.strip() != '']) for doc in docs]
    test_docs = [flatten([sent.split() for sent in doc.split('\n') if sent.strip() != '']) for doc in test_docs]

    vocab = Dictionary(docs)
    vocab.filter_extremes(keep_n=5000)
    bin = LabelBinarizer()

    x = np.array(pad_sentences([[vocab.token2id[tok] + 1 for tok in s if tok in vocab.token2id]
                                for s in docs],
                               max_length=100, padding_word=0))
    y = bin.fit_transform(y)

    test_x = np.array(pad_sentences([[vocab.token2id[tok] + 1 for tok in s if tok in vocab.token2id]
                                     for s in test_docs],
                                    max_length=100, padding_word=0))
    test_y = bin.transform(test_y)

    logging.info('building model ...')
    model = Sequential()
    model.add(Embedding(5001, 300, input_length=100))
    model.add(Convolution1D(nb_filter=300, filter_length=7, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=3, stride=1))
    model.add(Convolution1D(nb_filter=300, filter_length=7, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=3, stride=1))
    model.add(Convolution1D(nb_filter=300, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(Convolution1D(nb_filter=300, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(Convolution1D(nb_filter=300, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(Convolution1D(nb_filter=300, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=3, stride=1))
    model.add(Flatten())
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(.5))
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(.5))
    model.add(Dense(11, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['categorical_accuracy'])

    model.fit(x, y, batch_size=32, nb_epoch=10, validation_data=[test_x, test_y])

    print(accuracy_score(np.argwhere(test_y)[:, 1], model.predict_classes(test_x)))
    def evaluate(self, y_true, y_pred):
        lb = LabelBinarizer()
        y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
        y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))

        tagset = set(lb.classes_) - {"O"}
        tagset = sorted(tagset, key=lambda tag: tag.split("-", 1)[::-1])
        class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}

        return classification_report(
            y_true_combined, y_pred_combined, labels=[class_indices[cls] for cls in tagset], target_names=tagset
        )
Example #28
def _create_covertype(directory):
    urlbase = "https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/"
    destdir = os.path.join(_DATA_DIRECTORY, "raw")
    fn = _download_file(urlbase, "covtype.data.gz", destdir)
    with gzip.open(fn, "rb") as gzfile:
        X = pd.read_csv(gzfile, header=None).values

    X, y = X[:, :-1].astype(np.float64), X[:, -1]
    y -= 1  # make classes 0-based

    # split into test- and validationset
    idx = range(X.shape[0])
    from sklearn.cross_validation import train_test_split

    X, Xtest, y, ytest = train_test_split(X, y, test_size=0.1)
    X, Xval, y, yval = train_test_split(X, y, test_size=0.25)

    from sklearn.preprocessing import LabelBinarizer

    lb = LabelBinarizer()
    y = lb.fit_transform(y)
    yval = lb.transform(yval)
    ytest = lb.transform(ytest)

    # Most values are binary, except for these, so let's standardize them
    quant_idx = [0, 1, 2, 3, 4, 5, 9]  # real numbers
    int_idx = [6, 7, 8]  # integers from [0, 255)
    from sklearn.preprocessing import StandardScaler as Scaler

    scaler = Scaler()
    X[:, quant_idx + int_idx] = scaler.fit_transform(X[:, quant_idx + int_idx])
    Xval[:, quant_idx + int_idx] = scaler.transform(Xval[:, quant_idx + int_idx])
    Xtest[:, quant_idx + int_idx] = scaler.transform(Xtest[:, quant_idx + int_idx])
    data = [["train", X, y], ["valid", Xval, yval], ["test", Xtest, ytest]]
    m = np.zeros(X.shape[1])
    m[quant_idx + int_idx] = scaler.mean_
    s = np.ones(X.shape[1])
    s[quant_idx + int_idx] = scaler.std_
    other = {"center": m, "scale": s}
    _store(data, os.path.join(_DATA_DIRECTORY, "covertype.hdf5"), other)
Example #29
def test_averaging_multiclass(name):
    n_samples, n_classes = 50, 3
    random_state = check_random_state(0)
    y_true = random_state.randint(0, n_classes, size=(n_samples, ))
    y_pred = random_state.randint(0, n_classes, size=(n_samples, ))
    y_score = random_state.uniform(size=(n_samples, n_classes))

    lb = LabelBinarizer().fit(y_true)
    y_true_binarize = lb.transform(y_true)
    y_pred_binarize = lb.transform(y_pred)

    check_averaging(name, y_true, y_true_binarize,
                    y_pred, y_pred_binarize, y_score)
def get_classification_report(validation_y, validation_pred):
    """ Returns the classification report for the given classify.
    It uses predicts the labels of the validation set and uses
    that as a bases for testing the perfomance of the classifier
    """

    lb = LabelBinarizer()
    val_y = lb.fit_transform(list(validation_y))
    val_pred = lb.transform(list(validation_pred))

    tagset = get_classnames()

    return classification_report(val_y,val_pred,target_names=tagset)
Example #31
                K.clear_session()

                model = Sequential()
                rbflayer = RBFLayer(
                    g_param,
                    initializer=InitCentersRandom(
                        oDataSet.attributes[oData.Training_indexes[train]]),
                    betas=g2_param,
                    input_shape=(base.shape[1], ))
                model.add(rbflayer)
                model.add(
                    Dense(len(oDataSet.labelsNames), activation='sigmoid'))
                model.compile(loss='categorical_crossentropy',
                              optimizer=_OPTIMIZER)
                model.fit(oDataSet.attributes[oData.Training_indexes[train]],
                          lb.transform(
                              oDataSet.labels[oData.Training_indexes[train]]),
                          batch_size=50,
                          epochs=epochs,
                          verbose=0)

                y_pred = model.predict(
                    oDataSet.attributes[oData.Training_indexes[test]]).argmax(
                        axis=1)
                y_true = oDataSet.labels[oData.Training_indexes[test]]
                grid_result[g1, g2, k_slice] = accuracy_score(y_true, y_pred)
                print(grid_result)
                k_slice += 1
    best_p = GRID_NEURON[np.unravel_index(
        np.argmax(np.mean(grid_result, axis=2)), grid_result.shape[:2])[0]]
    best_b = GRID_B[np.unravel_index(np.argmax(np.mean(grid_result, axis=2)),
                                     grid_result.shape[:2])[1]]
Example #32
# converting data and labels to np array
data = np.array(data, dtype="float")
labels = np.array(labels)

# scaling the values of data between 0 and 1
data = data / 255.0

# Split the training data into separate train and test sets
(train_x, val_x, train_y, val_y) = train_test_split(data,
                                                    labels,
                                                    test_size=0.3,
                                                    random_state=13)

# one hot encoding
lb = LabelBinarizer().fit(train_y)
train_y = lb.transform(train_y)
val_y = lb.transform(val_y)

# building model
model = Sequential()
model.add(
    Conv2D(40, (5, 5),
           padding="same",
           input_shape=(40, 40, 1),
           activation="relu"))
model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
model.add(Conv2D(100, (5, 5), padding="same", activation="relu"))
model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
model.add(Flatten())
model.add(Dense(128, activation="relu"))
model.add(Dropout(0.3))
Example #33
def nnCostFunction(nn_params, *args):
    """NNのコスト関数とその偏微分を求める"""
    in_size, hid_size, num_labels, X, y, lam = args

    # Reshape the flat parameter vector back into the weight matrices
    Theta1 = nn_params[0:(in_size + 1) * hid_size].reshape(
        (hid_size, in_size + 1))
    Theta2 = nn_params[(in_size + 1) * hid_size:].reshape(
        (num_labels, hid_size + 1))

    # Partial derivatives (gradients) of the parameters
    Theta1_grad = np.zeros(Theta1.shape)
    Theta2_grad = np.zeros(Theta2.shape)

    # Number of training examples
    m = X.shape[0]

    # Prepend a column of ones (bias term) to the training data
    X = np.hstack((np.ones((m, 1)), X))

    # Convert the target labels to 1-of-K (one-hot) encoding
    lb = LabelBinarizer()
    lb.fit(y)
    y = lb.transform(y)

    J = 0
    for i in range(m):
        xi = X[i, :]
        yi = y[i]
        # forward propagation
        a1 = xi
        z2 = np.dot(Theta1, a1)
        a2 = sigmoid(z2)
        a2 = np.hstack((1, a2))
        z3 = np.dot(Theta2, a2)
        a3 = sigmoid(z3)
        J += sum(-yi * safe_log(a3) - (1 - yi) * safe_log(1 - a3))
        # backpropagation
        delta3 = a3 - yi
        delta2 = np.dot(Theta2.T, delta3) * sigmoidGradient(np.hstack((1, z2)))
        delta2 = delta2[1:]  # drop the element corresponding to the bias term
        # We need a column-vector x row-vector = matrix product below,
        # so reshape into column vectors; passing -1 as the row count
        # lets numpy infer it automatically
        delta2 = delta2.reshape((-1, 1))
        delta3 = delta3.reshape((-1, 1))
        a1 = a1.reshape((-1, 1))
        a2 = a2.reshape((-1, 1))
        # Accumulate the deltas (used below for the regularized gradient)
        Theta1_grad += np.dot(delta2, a1.T)
        Theta2_grad += np.dot(delta3, a2.T)
    J /= m

    # Regularization term
    temp = 0.0
    for j in range(hid_size):
        for k in range(1, in_size + 1):  # exclude the bias weights
            temp += Theta1[j, k]**2
    for j in range(num_labels):
        for k in range(1, hid_size + 1):  # exclude the bias weights
            temp += Theta2[j, k]**2
    J += lam / (2.0 * m) * temp

    # Regularization of the gradients
    Theta1_grad /= m
    Theta1_grad[:, 1:] += (lam / m) * Theta1_grad[:, 1:]
    Theta2_grad /= m
    Theta2_grad[:, 1:] += (lam / m) * Theta2_grad[:, 1:]

    # Flatten the gradients back into a single vector
    grad = np.hstack((np.ravel(Theta1_grad), np.ravel(Theta2_grad)))

    print "J =", J
    return J, grad
    4.0"""

    return np.round(number * 2) / 2
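
# Quick sanity check of the rounding helper above:
# round_of_rating(3.2) -> 3.0, round_of_rating(3.8) -> 4.0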


min_val = -40
max_val = 40

y_train = round_of_rating(saturate(y_train, min_val, max_val))

r_int = 0.5
slist = np.arange(min_val, max_val + r_int,
                  r_int) * 2  #multiply by 2 to allow labelbinarizer to work
lb = LabelBinarizer()
lb.fit(slist)
ylabels = lb.transform(y_train * 2)

# In[17]:

print(x_train.shape)
print(xfcss_train.shape)
print(ylabels.shape)

# In[18]:

nsamps = x_train.shape[0]
n80p = int(np.floor(nsamps * 0.8))
rannums = np.array(random.sample(range(1, nsamps, 1), n80p))
s_nfiles = np.arange(nsamps)
test_set = np.setdiff1d(s_nfiles, rannums)
Example #35
def task_1_tuttocompleto(df):
    print(
        "======================== task_1_tuttocompleto ============================="
    )
    del df['entity_charOffset']
    del df['entity_id']

    print(df.shape)

    df2 = df.copy(deep=True)

    data = construct_dataset_tutto(df, df2)

    print('bitno', df.shape)  # df does not have the entity type column deleted

    headers2 = [
        'token_name', 'token_tag', 'sentence_id', 'sentence_text',
        'entity_name'
    ]

    df2 = pd.DataFrame(data, columns=headers2)

    df_train, df_test = train_test_split(df2,
                                         test_size=0.2,
                                         random_state=22,
                                         shuffle=False)

    text_train = df_train['sentence_text'].as_matrix()
    text_test = df_test['sentence_text'].as_matrix()

    print('text_train.shape', text_train.shape)

    sw = stopwords.words("english")
    vectorizer = TfidfVectorizer(lowercase=True,
                                 binary=True,
                                 stop_words=sw,
                                 sublinear_tf=True,
                                 norm=None)

    x_train = vectorizer.fit_transform(text_train).toarray()
    x_test = vectorizer.transform(text_test).toarray()
    token_name_train = vectorizer.transform(
        df_train['token_name'].as_matrix()).toarray()
    token_name_test = vectorizer.transform(
        df_test['token_name'].as_matrix()).toarray()

    #this is an attempt to concatenate token tags to the dataset, memory leak problems
    #token_name_train = np.column_stack((token_name_train, df_train['token_tag']))
    #token_name_test = np.column_stack((token_name_test, df_test['token_tag']))
    #[:,None]
    x_train = np.concatenate((x_train, token_name_train), axis=1)
    x_test = np.concatenate((x_test, token_name_test), axis=1)

    del sw
    del vectorizer
    del token_name_train
    del token_name_test
    del df2
    del data
    #return x_train, x_test, y_train, y_test, df_train['token_tag'], df_test['token_tag']
    y_train = df_train['entity_name'].astype("category").cat.codes.as_matrix()
    y_test = df_test['entity_name'].astype("category").cat.codes.as_matrix()

    lb = LabelBinarizer()

    y_train = lb.fit_transform(y_train)
    y_test = lb.transform(y_test)

    pred = simple_nn(x_train, x_test, y_train, 5)

    #pred = lb.inverse_transform(pred)
    y_train = lb.inverse_transform(y_train)
    y_test = lb.inverse_transform(y_test)

    pred_list = [pred]

    print(accuracy_score(pred, y_test))
    print(f1_score(pred, y_test, average='macro'))

    lgr = LogisticRegression(C=0.05, class_weight='balanced')
    lgr.fit(x_train, y_train)
    pred1 = lgr.predict(x_test)

    pred_list.append(pred1)

    print(accuracy_score(pred1, y_test))
    print(f1_score(pred1, y_test, average='macro'))

    svc = LinearSVC(C=0.0004, class_weight='balanced')
    svc.fit(x_train, y_train)
    pred2 = svc.predict(x_test)

    pred_list.append(pred2)

    print(accuracy_score(pred2, y_test))
    print(f1_score(pred2, y_test, average='macro'))

    #gb = ensemble.GradientBoostingClassifier()
    #gb.fit(x_train, y_train)
    #pred = gb.predict(x_test)

    #pred_list.append(pred)

    final_pred = []
    for i in range(len(pred_list[0])):
        temp = [0, 0, 0, 0, 0]
        for j in range(len(pred_list)):
            temp[pred_list[j][i]] += 1

        final_pred.append(np.argmax(temp))

    pred = final_pred

    print(pred)
    print(accuracy_score(pred, y_test))
    print(f1_score(pred, y_test, average='macro'))

    np.save('my_pred', pred)
    np.save('y_test', y_test)

    joblib.dump(lb, 'lb')
Example #36
def class_report(y_true, y_pred, y_score, alpha, average='micro'):
    if y_true.shape != y_pred.shape:
        print("Error! y_true %s is not the same shape as y_pred %s" %
              (y_true.shape, y_pred.shape))
        return

    lb = LabelBinarizer()

    if len(y_true.shape) == 1:
        lb.fit(y_true)

    #Value counts of predictions
    labels, cnt = np.unique(y_pred, return_counts=True)
    n_classes = len(labels)
    pred_cnt = pd.Series(cnt, index=labels)

    acc = accuracy_score(y_true=y_true, y_pred=y_pred)

    metrics_summary = precision_recall_fscore_support(y_true=y_true,
                                                      y_pred=y_pred,
                                                      labels=labels)

    avg = list(
        precision_recall_fscore_support(y_true=y_true,
                                        y_pred=y_pred,
                                        average='weighted'))

    metrics_sum_index = ['precision', 'recall', 'f1-score', 'support']
    class_report_df = pd.DataFrame(list(metrics_summary),
                                   index=metrics_sum_index,
                                   columns=labels)

    support = class_report_df.loc['support']
    total = support.sum()
    class_report_df['avg / total'] = avg[:-1] + [total]

    class_report_df = class_report_df.T
    class_report_df['pred'] = pred_cnt
    class_report_df['pred'].iloc[-1] = total
    """
    matrix = confusion_matrix(y_true, y_pred)
    accs = matrix.diagonal() / matrix.sum(axis=1)
    print("accuracies")
    print(accs)
    """

    if not (y_score is None):
        fpr = dict()
        tpr = dict()
        roc_auc = dict()
        auc_delong = dict()
        auc_ci = dict()
        auc_cov = dict()
        accs = dict()
        for label_it, label in enumerate(labels):
            fpr[label], tpr[label], _ = roc_curve(
                (y_true == label).astype(int), y_score[:, label_it])

            y_true_imed = (y_true == label).astype(int)
            y_pred_imed = (y_pred == label).astype(int)
            y_score_imed = y_score[:, label_it]

            auc_dl, auc_co, ci = calculate_auc_ci(y_pred=y_pred_imed,
                                                  y_true=y_true_imed,
                                                  y_score=y_score_imed,
                                                  alpha=alpha,
                                                  print_results=False)
            auc_delong[label] = auc_dl
            auc_cov[label] = auc_co
            auc_ci[label] = ci

            accs[label] = accuracy_score(y_true=y_true_imed,
                                         y_pred=y_pred_imed)

            roc_auc[label] = auc(fpr[label], tpr[label])

        if average == 'micro':
            if n_classes <= 2:
                fpr["avg / total"], tpr["avg / total"], _ = roc_curve(
                    lb.transform(y_true).ravel(), y_score[:, 1].ravel())
            else:
                fpr["avg / total"], tpr["avg / total"], _ = roc_curve(
                    lb.transform(y_true).ravel(), y_score.ravel())

            roc_auc["avg / total"] = auc(fpr["avg / total"],
                                         tpr["avg / total"])

        elif average == 'macro':
            # First aggregate all false positive rates
            all_fpr = np.unique(np.concatenate([fpr[i] for i in labels]))

            # Then interpolate all ROC curves at this points
            mean_tpr = np.zeros_like(all_fpr)
            for i in labels:
                mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

            # Finally average it and compute AUC
            mean_tpr /= n_classes

            fpr["macro"] = all_fpr
            tpr["macro"] = mean_tpr

            roc_auc["avg / total"] = auc(fpr["macro"], tpr["macro"])

        accs["avg / total"] = np.mean(list(accs.values()))
        auc_delong["avg / total"] = ""  #np.mean(list(auc_delong.values()))
        auc_cov["avg / total"] = ""  #np.mean(list(auc_cov.values()))
        auc_ci["avg / total"] = ""

        class_report_df['accuracy'] = pd.Series(accs)
        class_report_df['AUC'] = pd.Series(roc_auc)

        class_report_df['AUC DeLong'] = pd.Series(auc_delong)
        class_report_df['AUC COV'] = pd.Series(auc_cov)
        class_report_df['AUC CI (' + str(alpha * 100) +
                        ' %)'] = pd.Series(auc_ci)

    return class_report_df
def create_input_features(inputs):
    pad = ['TOKEN'] * SERIES_LENGTH
    for index in range(len(inputs) - SERIES_LENGTH - 1):
        yield [
            inputs[index], pad + inputs[max(0, index - SERIES_LENGTH):index]
        ]
        pad = pad[1:]


training = [fizzbuzz(num) for num in range(1, 1000)]
training_inputs = list(create_input_features(training))

lb = LabelBinarizer()
lb.fit(training + ['TOKEN'])

X = np.array(
    [lb.transform(features).flatten() for label, features in training_inputs])
y = np.array([label for label, features in training_inputs])

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=6251)

#clf = LogisticRegression(tol=1e-6)
regr = RandomForestClassifier(max_depth=20, random_state=6251)
regr.fit(X_train, y_train)

from sklearn.metrics import accuracy_score
predicted = regr.predict(X_test)
accuracy_score(y_test, predicted)
Example #38
def main():
    """Train ensemble model.
    """
    # construct the argument parse and parse the arguments
    args = argparse.ArgumentParser()
    args.add_argument("-o",
                      "--output",
                      required=True,
                      help="path to output directory")
    args.add_argument("-m",
                      "--models",
                      required=True,
                      help="path to output models directory")
    args.add_argument("-n",
                      "--num-models",
                      type=int,
                      default=5,
                      help="# of models to train")
    args = vars(args.parse_args())

    # load the training and testing data, then scale it into the range [0, 1]
    ((train_x, train_y), (test_x, test_y)) = cifar10.load_data()
    train_x = train_x.astype("float") / 255.0
    test_x = test_x.astype("float") / 255.0

    # convert the labels from integers to vectors
    label_binarizer = LabelBinarizer()
    train_y = label_binarizer.fit_transform(train_y)
    test_y = label_binarizer.transform(test_y)

    # initialize the label names for the CIFAR-10 dataset
    label_names = [
        "airplane", "automobile", "bird", "cat", "deer", "dog", "frog",
        "horse", "ship", "truck"
    ]

    # construct the image generator for data augmentation
    augmentation = ImageDataGenerator(rotation_range=10,
                                      width_shift_range=0.1,
                                      height_shift_range=0.1,
                                      horizontal_flip=True,
                                      fill_mode="nearest")

    # loop over the number of models to train
    for i in np.arange(0, args["num_models"]):
        # initialize the optimizer and model
        print("[INFO] training model {}/{}".format(i + 1, args["num_models"]))
        opt = SGD(lr=0.01, decay=0.01 / 40, momentum=0.9, nesterov=True)
        model = MiniVGGNet.build(width=32, height=32, depth=3, classes=10)
        model.compile(loss="categorical_crossentropy",
                      optimizer=opt,
                      metrics=["accuracy"])
        # train the network
        model_fit = model.fit_generator(augmentation.flow(train_x,
                                                          train_y,
                                                          batch_size=64),
                                        validation_data=(test_x, test_y),
                                        epochs=40,
                                        steps_per_epoch=len(train_x) // 64,
                                        verbose=1)
        # save the model to disk
        path = [args["models"], "model_{}.model".format(i)]
        model.save(os.path.sep.join(path))

        # evaluate the network
        predictions = model.predict(test_x, batch_size=64)
        report = classification_report(test_y.argmax(axis=1),
                                       predictions.argmax(axis=1),
                                       target_names=label_names)

        # save the classification report to file
        path = [args["output"], "model_{}.txt".format(i)]
        f = open(os.path.sep.join(path), "w")
        f.write(report)
        f.close()

        # plot the training loss and accuracy
        path = [args["output"], "model_{}.png".format(i)]
        plt.style.use("ggplot")
        plt.figure()
        plt.plot(np.arange(0, 40),
                 model_fit.history["loss"],
                 label="train_loss")
        plt.plot(np.arange(0, 40),
                 model_fit.history["val_loss"],
                 label="val_loss")
        plt.plot(np.arange(0, 40), model_fit.history["acc"], label="train_acc")
        plt.plot(np.arange(0, 40),
                 model_fit.history["val_acc"],
                 label="val_acc")
        plt.title("Training Loss and Accuracy for model {}".format(i))
        plt.xlabel("Epoch #")
        plt.ylabel("Loss/Accuracy")
        plt.legend()
        plt.savefig(os.path.sep.join(path))
        plt.close()
Example #39
    ])

if not is_features_normal:
    train_features = normalize_grayscale(train_features)
    test_features = normalize_grayscale(test_features)
    is_features_normal = True

print('Tests Passed!')

# In[10]:

if not is_labels_encod:
    # Turn labels into numbers and apply One-Hot Encoding
    encoder = LabelBinarizer()
    encoder.fit(train_labels)
    train_labels = encoder.transform(train_labels)
    test_labels = encoder.transform(test_labels)

    # Change to float32, so it can be multiplied against the features in TensorFlow, which are float32
    train_labels = train_labels.astype(np.float32)
    test_labels = test_labels.astype(np.float32)
    is_labels_encod = True

print('Labels One-Hot Encoded')

# In[11]:

assert is_features_normal, 'You skipped the step to normalize the features'
assert is_labels_encod, 'You skipped the step to One-Hot Encode the labels'

# Get randomized datasets for training and validation
Example #40
class ShapeletModel(BaseEstimator, ClassifierMixin):
    """Learning Time-Series Shapelets model.


    Learning Time-Series Shapelets was originally presented in [1]_.

    Parameters
    ----------
    n_shapelets_per_size: dict
        Dictionary giving, for each shapelet size (key),
        the number of such shapelets to be trained (value)
    max_iter: int (default: 1000)
        Number of training epochs.
    batch_size: int (default:256)
        Batch size to be used.
    verbose_level: {0, 1, 2} (default: 2)
        `keras` verbose level.
    optimizer: str or keras.optimizers.Optimizer (default: "sgd")
        `keras` optimizer to use for training.
    weight_regularizer: float or None (default: None)
        `keras` regularizer to use for training the classification (softmax) layer.
        If None, no regularization is performed.

    Attributes
    ----------
    shapelets_: numpy.ndarray of objects, each object being a time series
        Set of time-series shapelets.
    shapelets_as_time_series_: numpy.ndarray of shape (n_shapelets, sz_shp, d) where \
    sz_shp is the maximum of all shapelet sizes
        Set of time-series shapelets formatted as a ``tslearn`` time series dataset.

    Note
    ----
        This implementation requires a dataset of equal-sized time series.

    Examples
    --------
    >>> from tslearn.generators import random_walk_blobs
    >>> X, y = random_walk_blobs(n_ts_per_blob=20, sz=64, d=2, n_blobs=2)
    >>> clf = ShapeletModel(n_shapelets_per_size={10: 5}, max_iter=1, verbose_level=0)
    >>> clf.fit(X, y).shapelets_.shape
    (5,)
    >>> clf.shapelets_[0].shape
    (10, 2)
    >>> clf.predict(X).shape
    (40,)
    >>> clf.transform(X).shape
    (40, 5)
    >>> params = clf.get_params(deep=True)
    >>> sorted(params.keys())
    ['batch_size', 'max_iter', 'n_shapelets_per_size', 'optimizer', 'verbose_level', 'weight_regularizer']
    >>> clf.set_params(batch_size=128)  # doctest: +NORMALIZE_WHITESPACE
    ShapeletModel(batch_size=128, max_iter=1, n_shapelets_per_size={10: 5},
           optimizer='sgd', verbose_level=0, weight_regularizer=0.0)
    >>> clf2 = ShapeletModel(n_shapelets_per_size={10: 5, 20: 10}, max_iter=1, verbose_level=0)
    >>> clf2.fit(X, y).shapelets_.shape
    (15,)
    >>> clf2.shapelets_[0].shape
    (10, 2)
    >>> clf2.shapelets_[5].shape
    (20, 2)
    >>> clf2.shapelets_as_time_series_.shape
    (15, 20, 2)
    >>> clf2.predict(X).shape
    (40,)
    >>> clf2.transform(X).shape
    (40, 15)
    >>> clf2.locate(X).shape
    (40, 15)
    >>> import sklearn
    >>> cv_results = sklearn.model_selection.cross_validate(clf, X, y, return_train_score=False)
    >>> cv_results['test_score'].shape
    (3,)

    References
    ----------
    .. [1] J. Grabocka et al. Learning Time-Series Shapelets. SIGKDD 2014.
    """
    def __init__(self,
                 n_shapelets_per_size,
                 max_iter=1000,
                 batch_size=256,
                 verbose_level=2,
                 optimizer="sgd",
                 weight_regularizer=0.):
        self.n_shapelets_per_size = n_shapelets_per_size
        self.n_classes = None
        self.optimizer = optimizer
        self.max_iter = max_iter
        self.weight_regularizer = weight_regularizer
        self.model = None
        self.transformer_model = None
        self.locator_model = None
        self.batch_size = batch_size
        self.verbose_level = verbose_level
        self.categorical_y = False
        self.label_binarizer = None
        self.binary_problem = False

        self.d = None

    @property
    def _n_shapelet_sizes(self):
        return len(self.n_shapelets_per_size)

    @property
    def shapelets_(self):
        total_n_shp = sum(self.n_shapelets_per_size.values())
        shapelets = numpy.empty((total_n_shp, ), dtype=object)
        idx = 0
        for i, shp_sz in enumerate(sorted(self.n_shapelets_per_size.keys())):
            n_shp = self.n_shapelets_per_size[shp_sz]
            for idx_shp in range(idx, idx + n_shp):
                shapelets[idx_shp] = numpy.zeros((shp_sz, self.d))
            for di in range(self.d):
                for inc, shp in enumerate(
                        self.model.get_layer("shapelets_%d_%d" %
                                             (i, di)).get_weights()[0]):
                    shapelets[idx + inc][:, di] = shp
            idx += n_shp
        assert idx == total_n_shp
        return shapelets

    @property
    def shapelets_as_time_series_(self):
        total_n_shp = sum(self.n_shapelets_per_size.values())
        shp_sz = max(self.n_shapelets_per_size.keys())
        non_formatted_shapelets = self.shapelets_
        d = non_formatted_shapelets[0].shape[1]
        shapelets = numpy.zeros((total_n_shp, shp_sz, d)) + numpy.nan
        for i in range(self._n_shapelet_sizes):
            sz = non_formatted_shapelets[i].shape[0]
            shapelets[i, :sz, :] = non_formatted_shapelets[i]
        return shapelets

    def fit(self, X, y):
        """Learn time-series shapelets.

        Parameters
        ----------
        X : array-like of shape=(n_ts, sz, d)
            Time series dataset.
        y : array-like of shape=(n_ts, )
            Time series labels.
        """
        n_ts, sz, d = X.shape
        self.d = d
        if y.ndim == 1:
            self.label_binarizer = LabelBinarizer().fit(y)
            y_ = self.label_binarizer.transform(y)
            # if y_.shape[1] == 1:
            #     y_ = numpy.hstack((y_, 1 - y_))
        else:
            y_ = y
            self.categorical_y = True
            assert y_.shape[
                1] != 2, "Binary classification case, monodimensional y should be passed."
        if y_.ndim == 1:
            n_classes = 2
        else:
            n_classes = y_.shape[1]
        self._set_model_layers(X=X, ts_sz=sz, d=d, n_classes=n_classes)
        self.model.compile(
            loss="categorical_crossentropy"
            if n_classes > 2 else "binary_crossentropy",
            optimizer=self.optimizer,
            metrics=[categorical_accuracy, categorical_crossentropy]
            if n_classes > 2 else [binary_accuracy, binary_crossentropy])
        self.transformer_model.compile(loss="mean_squared_error",
                                       optimizer=self.optimizer)
        self.locator_model.compile(loss="mean_squared_error",
                                   optimizer=self.optimizer)
        self._set_weights_false_conv(d=d)
        self.model.fit([X[:, :, di].reshape((n_ts, sz, 1)) for di in range(d)],
                       y_,
                       batch_size=self.batch_size,
                       epochs=self.max_iter,
                       verbose=self.verbose_level)
        return self

    def predict(self, X):
        """Predict class probability for a given set of time series.

        Parameters
        ----------
        X : array-like of shape=(n_ts, sz, d)
            Time series dataset.

        Returns
        -------
        array of shape=(n_ts, ) or (n_ts, n_classes), depending on the shape of the
        label vector provided at training time
            Predicted class for each sample, or class probability matrix if
            categorical (one-hot) labels were provided at training time.
        """
        X_ = to_time_series_dataset(X)
        n_ts, sz, d = X_.shape
        categorical_preds = self.model.predict(
            [X_[:, :, di].reshape((n_ts, sz, 1)) for di in range(self.d)],
            batch_size=self.batch_size,
            verbose=self.verbose_level)
        if self.categorical_y:
            return categorical_preds
        else:
            if categorical_preds.shape[1] == 2:
                categorical_preds = categorical_preds[:, 0]
            return self.label_binarizer.inverse_transform(categorical_preds)

    def transform(self, X):
        """Generate shapelet transform for a set of time series.

        Parameters
        ----------
        X : array-like of shape=(n_ts, sz, d)
            Time series dataset.

        Returns
        -------
        array of shape=(n_ts, n_shapelets)
            Shapelet-Transform of the provided time series.
        """
        X_ = to_time_series_dataset(X)
        n_ts, sz, d = X_.shape
        pred = self.transformer_model.predict(
            [X_[:, :, di].reshape((n_ts, sz, 1)) for di in range(self.d)],
            batch_size=self.batch_size,
            verbose=self.verbose_level)
        return pred

    def locate(self, X):
        """Compute shapelet match location for a set of time series.

        Parameters
        ----------
        X : array-like of shape=(n_ts, sz, d)
            Time series dataset.

        Returns
        -------
        array of shape=(n_ts, n_shapelets)
            Location of the shapelet matches for the provided time series.
        """
        X_ = to_time_series_dataset(X)
        n_ts, sz, d = X_.shape
        locations = self.locator_model.predict(
            [X_[:, :, di].reshape((n_ts, sz, 1)) for di in range(self.d)],
            batch_size=self.batch_size,
            verbose=self.verbose_level)
        return locations.astype(int)  # numpy.int is deprecated; plain int works

    def _set_weights_false_conv(self, d):
        shapelet_sizes = sorted(self.n_shapelets_per_size.keys())
        for i, sz in enumerate(shapelet_sizes):
            for di in range(d):
                self.model.get_layer("false_conv_%d_%d" % (i, di)).set_weights(
                    [numpy.eye(sz).reshape((sz, 1, sz))])

    def _set_model_layers(self, X, ts_sz, d, n_classes):
        inputs = [
            Input(shape=(ts_sz, 1), name="input_%d" % di) for di in range(d)
        ]
        shapelet_sizes = sorted(self.n_shapelets_per_size.keys())
        pool_layers = []
        pool_layers_locations = []
        for i, sz in enumerate(sorted(shapelet_sizes)):
            transformer_layers = [
                Conv1D(filters=sz,
                       kernel_size=sz,
                       trainable=False,
                       use_bias=False,
                       name="false_conv_%d_%d" % (i, di))(inputs[di])
                for di in range(d)
            ]
            shapelet_layers = [
                LocalSquaredDistanceLayer(self.n_shapelets_per_size[sz],
                                          X=X,
                                          name="shapelets_%d_%d" % (i, di))(
                                              transformer_layers[di])
                for di in range(d)
            ]
            if d == 1:
                summed_shapelet_layer = shapelet_layers[0]
            else:
                summed_shapelet_layer = add(shapelet_layers)
            pool_layers.append(
                GlobalMinPooling1D(name="min_pooling_%d" %
                                   i)(summed_shapelet_layer))
            pool_layers_locations.append(
                GlobalArgminPooling1D(name="min_pooling_%d" %
                                      i)(summed_shapelet_layer))
        if len(shapelet_sizes) > 1:
            concatenated_features = concatenate(pool_layers)
            concatenated_locations = concatenate(pool_layers_locations)
        else:
            concatenated_features = pool_layers[0]
            concatenated_locations = pool_layers_locations[0]
        outputs = Dense(units=n_classes if n_classes > 2 else 1,
                        activation="softmax" if n_classes > 2 else "sigmoid",
                        kernel_regularizer=l2(self.weight_regularizer)
                        if self.weight_regularizer > 0 else None,
                        name="classification")(concatenated_features)
        self.model = Model(inputs=inputs, outputs=outputs)
        self.transformer_model = Model(inputs=inputs,
                                       outputs=concatenated_features)
        self.locator_model = Model(inputs=inputs,
                                   outputs=concatenated_locations)

    def get_weights(self, layer_name=None):
        """Return model weights (or weights for a given layer if `layer_name` is provided).

        Parameters
        ----------
        layer_name: str or None (default: None)
            Name of the layer for which weights should be returned.
            If None, all model weights are returned.
            Available layer names with weights are:
            - "shapelets_i_j" with i an integer for the shapelet id and j an integer for the dimension
            - "classification" for the final classification layer

        Returns
        -------
        list
            list of model (or layer) weights

        Examples
        --------
        >>> from tslearn.generators import random_walk_blobs
        >>> X, y = random_walk_blobs(n_ts_per_blob=100, sz=256, d=1, n_blobs=3)
        >>> clf = ShapeletModel(n_shapelets_per_size={10: 5}, max_iter=0, verbose_level=0)
        >>> clf.fit(X, y).get_weights("classification")[0].shape
        (5, 3)
        """
        if layer_name is None:
            return self.model.get_weights()
        else:
            return self.model.get_layer(layer_name).get_weights()
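A minimal usage sketch for the shapelet model above (added for illustration; not part of the original listing). It assumes the module-level imports used by the class (numpy, the Keras layers, to_time_series_dataset) are available; the data and parameter values below are placeholders:

X = numpy.random.randn(30, 64, 1)   # 30 univariate series of length 64
y = numpy.array([0, 1, 2] * 10)     # three balanced classes

clf = ShapeletModel(n_shapelets_per_size={8: 4, 16: 2},
                    max_iter=5,
                    verbose_level=0)
clf.fit(X, y)

distances = clf.transform(X)   # shape (30, 6): one distance per learned shapelet
positions = clf.locate(X)      # index of the best-matching window per shapelet
predictions = clf.predict(X)   # class labels, since y was one-dimensional
print(distances.shape, positions.shape, predictions.shape)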
Example #41
0
ap.add_argument("-m", "--model", required=True, help="path to output model")
ap.add_argument("-o", "--output", required=True, help="path to output directory (logs, plots, etc.)")
args = vars(ap.parse_args())

print("[INFO] loading CIFAR-10 data ...")
((train_x, train_y), (test_x, test_y)) = cifar10.load_data()
train_x = train_x.astype("float")
test_x = test_x.astype("float")

mean = np.mean(train_x, axis=0)
train_x -= mean
test_x -= mean

lb = LabelBinarizer()
train_y = lb.fit_transform(train_y)
test_y = lb.transform(test_y)

aug = ImageDataGenerator(width_shift_range=0.1, height_shift_range=0.1, horizontal_flip=True, fill_mode="nearest")

fig_path = os.path.sep.join([args["output"], "{}.png".format(os.getpid())])
json_path = os.path.sep.join([args["output"], "{}.json".format(os.getpid())])
# callbacks = [TrainingMonitor(fig_path, json_path=json_path), LearningRateScheduler(poly_decay)]
callbacks = [LearningRateScheduler(poly_decay)]

print("[INFO] compiling model ...")
opt = SGD(lr=INIT_LR, momentum=0.9)
model = MiniGoogleNet.build(width=32, height=32, depth=3, classes=10)
model.compile(loss="categorical_crossentropy", optimizer=opt, metrics=["accuracy"])

print("[INFO] training model ...")
model.fit_generator(aug.flow(train_x, train_y, batch_size=64), validation_data=(test_x, test_y),
Example #42
0
def do_vgg_train(path_input,
                 width,
                 height,
                 basename,
                 vgg_size,
                 fc_size,
                 logLevel="WARN"):
    """Train a VGG-like convolutional network
    """
    logvgg = logging.getLogger(f"{__name__}.console.trainvgg")
    logvgg.setLevel(logLevel)

    model_file = f"{basename}.model"
    label_bin_file = f"{basename}.pickle"
    plot_file = f"{basename}.png"
    logvgg.debug(f"mf {model_file} lbf {label_bin_file} pf {plot_file}")

    data, labels = load_dataset(path_input, width, height, "INFO")

    # partition the data into training and testing splits using 75% of
    # the data for training and the remaining 25% for testing
    (trainX, testX, trainY, testY) = train_test_split(data,
                                                      labels,
                                                      test_size=0.25)

    # convert the labels from integers to vectors (for 2-class, binary
    # classification you should use Keras' to_categorical function
    # instead as the scikit-learn's LabelBinarizer will not return a
    # vector)
    lb = LabelBinarizer()
    trainY = lb.fit_transform(trainY)
    testY = lb.transform(testY)
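    # Added illustration (not in the original): with exactly two classes,
    # LabelBinarizer().fit_transform(["a", "b", "a"]) returns a single column of
    # shape (3, 1), whereas keras.utils.to_categorical([0, 1, 0]) returns the
    # two-column one-hot matrix of shape (3, 2) that a 2-unit softmax expects.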

    # construct the image generator for data augmentation
    # rotation is ok, shear/shift/flip reduced
    aug = ImageDataGenerator(
        rotation_range=30,
        width_shift_range=0.01,
        height_shift_range=0.01,
        shear_range=0.002,
        zoom_range=0.02,
        horizontal_flip=False,
        fill_mode="nearest",
    )

    if vgg_size == "small":
        # TODO fc_size set from here
        model = SmallVGGNet.build(width=width,
                                  height=height,
                                  depth=3,
                                  classes=len(lb.classes_))
    elif vgg_size == "middle":
        # default value of fc_size
        if fc_size == -1:
            fc_size = 512
        model = MiddleVGGNet.build(
            width=width,
            height=height,
            depth=3,
            classes=len(lb.classes_),
            fully_connected_size=fc_size,
        )
    else:
        logvgg.critical(f"Unrecognized dimension {vgg_size}, stopping.")
        return -1

    # initialize our initial learning rate, # of epochs to train for, and batch size
    INIT_LR = 0.01
    EPOCHS = 75
    #  EPOCHS = 3
    BS = 32
    # TODO fiddle with this

    # initialize the model and optimizer (you'll want to use
    # binary_crossentropy for 2-class classification)
    logvgg.info("Training network...")
    opt = SGD(lr=INIT_LR, decay=INIT_LR / EPOCHS)
    model.compile(loss="categorical_crossentropy",
                  optimizer=opt,
                  metrics=["accuracy"])
    # TODO fiddle with this

    # save model summary
    summary_file = f"{basename}_summary.txt"
    with open(summary_file, "w") as sf:
        model.summary(line_length=100, print_fn=lambda x: sf.write(f"{x}\n"))
        # using an actual logger: print_fn=logger.info

    # save the model structure in JSON format
    config = model.get_config()
    config_json_file = f"{basename}_structure.json"
    with open(config_json_file, "w") as jf:
        json.dump(config, jf)

    # train the network
    H = model.fit_generator(
        aug.flow(trainX, trainY, batch_size=BS),
        validation_data=(testX, testY),
        steps_per_epoch=len(trainX) // BS,
        epochs=EPOCHS,
    )

    # save the model and label binarizer to disk
    logvgg.info("Serializing network and label binarizer...")
    model.save(model_file)
    with open(label_bin_file, "wb") as f:
        f.write(pickle.dumps(lb))

    # evaluate the network
    logvgg.info("Evaluating network...")
    predictions = model.predict(testX, batch_size=32)
    report = classification_report(testY.argmax(axis=1),
                                   predictions.argmax(axis=1),
                                   target_names=lb.classes_)
    logvgg.info(f"\n{report}")
    report_file = f"{basename}_report.txt"
    with open(report_file, "w") as rf:
        rf.write(report)

    # plot the training loss and accuracy
    N = np.arange(0, EPOCHS)
    plt.style.use("ggplot")
    plt.figure()
    plt.plot(N, H.history["loss"], label="train_loss")
    plt.plot(N, H.history["val_loss"], label="val_loss")
    plt.plot(N, H.history["acc"], label="train_acc")
    plt.plot(N, H.history["val_acc"], label="val_acc")
    plt.title("Training Loss and Accuracy (SmallVGGNet)")
    plt.xlabel("Epoch #")
    plt.ylabel("Loss/Accuracy")
    plt.legend()
    plt.savefig(plot_file)
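A hedged call sketch for the training helper above (added for illustration; the input path and output basename are placeholders, and load_dataset plus the VGG builders are assumed to be importable from the surrounding module):

do_vgg_train(path_input="dataset/images",
             width=64,
             height=64,
             basename="out/middlevgg_run1",
             vgg_size="middle",
             fc_size=-1,
             logLevel="INFO")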
Example #43
0
'''
data = data.reshape(data.shape[1:])
data = data.transpose()
'''
(trainX, testX, trainY, testY) = train_test_split(data,
                                                  labels,
                                                  test_size=0.25,
                                                  random_state=42)

# convert the labels from integers to vectors (for 2-class, binary
# classification you should use Keras' to_categorical function
# instead as the scikit-learn's LabelBinarizer will not return a
# vector)
lb = LabelBinarizer()
trainY = lb.fit_transform(trainY)
testY = lb.transform(testY)

# define the 3072-1024-512-3 architecture using Keras
model = Sequential()
model.add(Dense(1024, input_shape=(3072, ), activation="sigmoid"))
model.add(Dense(512, activation="sigmoid"))
model.add(Dense(len(lb.classes_), activation="softmax"))

# initialize our initial learning rate and # of epochs to train for
INIT_LR = 0.01
EPOCHS = 75

# compile the model using SGD as our optimizer and categorical
# cross-entropy loss (you'll want to use binary_crossentropy
# for 2-class classification)
print("[INFO] training network...")
Example #44
0
argument_parser = argparse.ArgumentParser()
argument_parser.add_argument('-w',
                             '--weights',
                             required=True,
                             help='Path to best model weights file.')
arguments = vars(argument_parser.parse_args())

print('[INFO] Loading CIFAR-10 data...')
(X_train, y_train), (X_test, y_test) = cifar10.load_data()
X_train = X_train.astype('float') / 255.0
X_test = X_test.astype('float') / 255.0

label_binarizer = LabelBinarizer()
y_train = label_binarizer.fit_transform(y_train)
y_test = label_binarizer.transform(y_test)

print('[INFO] Compiling model...')
optimizer = SGD(lr=0.01, decay=0.01 / 40, momentum=0.9, nesterov=True)
model = MiniVGGNet.build(width=32, height=32, depth=3, classes=10)
model.compile(loss='categorical_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])

checkpoint = ModelCheckpoint(arguments['weights'],
                             monitor='val_loss',
                             save_best_only=True)
callbacks = [checkpoint]

print('[INFO] Training network...')
H = model.fit(X_train,
Example #45
0
def roc_multiclass_cruve_rf(y_test_class, y_pred_class):
    lb = LabelBinarizer()
    lb.fit(y_test_class)
    y_test_b = lb.transform(y_test_class)
    y_pred_b = lb.transform(y_pred_class)

    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    fpr[0], tpr[0], _ = roc_curve(y_test_b[:, 0], y_pred_b[:, 0])
    roc_auc[0] = auc(fpr[0], tpr[0])
    # Compute micro
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test_b.ravel(),
                                              y_pred_b.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
    lw = 1
    # First aggregate all false positive rates
    all_fpr = fpr[0]

    # Then interpolate all ROC curves at these points
    mean_tpr = np.zeros_like(all_fpr)
    mean_tpr += np.interp(all_fpr, fpr[0], tpr[0])

    # Finally average it and compute AUC (note: the divisor 3 is hard-coded here)
    mean_tpr /= 3

    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    # Plot all ROC curves
    plt.figure()
    plt.plot(fpr["micro"],
             tpr["micro"],
             label='micro-average ROC curve (area = {0:0.2f})'
             ''.format(roc_auc["micro"]),
             color='deeppink',
             linestyle=':',
             linewidth=4)

    plt.plot(fpr["macro"],
             tpr["macro"],
             label='macro-average ROC curve (area = {0:0.2f})'
             ''.format(roc_auc["macro"]),
             color='navy',
             linestyle=':',
             linewidth=4)
    class_indices = [0]  # avoid shadowing the built-in `list`
    colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
    for i, color in zip(class_indices, colors):
        plt.plot(fpr[i],
                 tpr[i],
                 color=color,
                 lw=lw,
                 label='ROC curve of class {0} (area = {1:0.2f})'
                 ''.format(i, roc_auc[i]))

    plt.plot([0, 1], [0, 1], 'k--', lw=lw)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC For Random Forest')
    plt.legend(loc="lower right")
    plt.savefig('ROC For RF')
    return plt.show()
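The macro-averaging in the function above is hard-coded to class 0 and a divisor of 3. Below is a hedged sketch of the general recipe (added for illustration; the helper name macro_average_roc is hypothetical, and np, roc_curve and auc are assumed to be imported as in the original snippet):

def macro_average_roc(y_test_bin, y_pred_bin):
    # y_test_bin / y_pred_bin: label-binarized arrays of shape (n_samples, n_classes)
    n_classes = y_test_bin.shape[1]
    fpr, tpr, roc_auc = {}, {}, {}
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_pred_bin[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
    # aggregate all false positive rates, then interpolate every curve at those points
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
    mean_tpr /= n_classes
    return all_fpr, mean_tpr, auc(all_fpr, mean_tpr)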
Example #46
0
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelBinarizer

CSV_FILE_PATH = 'F://验证码识别/data.csv'  # path to the CSV file
df = pd.read_csv(CSV_FILE_PATH)  # read the CSV file

# feature columns of the dataset
features = ['v' + str(i + 1) for i in range(16 * 20)]
print(len(features))  # `features` is a plain list, so use len() rather than .shape

labels = df['label'].unique()
# binarize the ground-truth labels of the samples
lb = LabelBinarizer()
lb.fit(labels)
y_true = pd.DataFrame(lb.transform(df['label']),
                      columns=['y' + str(i) for i in range(31)])
y_bin_columns = list(y_true.columns)

for col in y_bin_columns:
    df[col] = y_true[col]

# split the dataset into a training set (70%) and a test set (30%)
x_train, x_test, y_train, y_test = train_test_split(df[features], df[y_bin_columns], \
                                                    train_size = 0.7, test_size=0.3, random_state=123)

# build the RNN network
# path where the trained model will be saved
MODEL_SAVE_PATH = 'logs/RNN_train.ckpt'
# RNN initialization
element_size = 16
Example #47
0
df.columns = [x.lower().replace('.', '_') for x in df.columns]

labels, uniques = pd.factorize(df.species)
df['label'] = labels

train, test = train_test_split(df, test_size=0.2)

names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
all_features = np.hstack([column(df, i) for i in names])
train_features = np.hstack([column(train, i) for i in names])
test_features = np.hstack([column(test, i) for i in names])

lb = LabelBinarizer()
lb.fit(df.label)

all_labels = lb.transform(df.label).astype('float')
train_labels = lb.transform(train.label).astype('float')
test_labels = lb.transform(test.label).astype('float')

# Softmax Regression

k = lb.classes_.size

x = tf.placeholder(tf.float32, [None, 4])
y = tf.placeholder(tf.float32, [None, 3])

w = tf.Variable(tf.truncated_normal([4, k], stddev=0.1))
b = tf.Variable(tf.truncated_normal([k], stddev=0.1))
y_ = tf.nn.softmax(tf.matmul(x, w) + b)

# Loss Function
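# Added sketch (not from the original snippet): one standard way the loss and
# training step might be defined for the softmax regression above, using the
# TF1 API already in use (placeholders x, y and prediction y_).
cross_entropy = tf.reduce_mean(-tf.reduce_sum(y * tf.log(y_), axis=1))
train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
correct_prediction = tf.equal(tf.argmax(y_, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))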
Example #48
0
    labels.append(label)

# scale the raw pixel intensities to the range [0, 1]
data = np.array(data, dtype="float") / 255.0
labels = np.array(labels)

# partition the data into training and testing splits using 75% of
# the data for training and the remaining 25% for testing
(trainX, testX, trainY, testY) = train_test_split(data,
                                                  labels,
                                                  test_size=0.25,
                                                  random_state=42)

# convert the labels from integers to vectors
lb = LabelBinarizer().fit(trainY)
trainY = lb.transform(trainY)
testY = lb.transform(testY)

# initialize the model
print("[INFO] compiling model...")
model = LeNet.build(width=28, height=28, depth=1, classes=9)
opt = SGD(lr=0.01)
model.compile(loss="categorical_crossentropy",
              optimizer=opt,
              metrics=["accuracy"])

# train the network
print("[INFO] training network...")
H = model.fit(trainX,
              trainY,
              validation_data=(testX, testY),
Example #49
0
def task1_2(df):
    print("======================== task 1_2 =============================")
    df_train, df_test = train_test_split(df, test_size=0.2, shuffle=False)

    print(df_train.shape)

    del df['sentence_id']
    del df['entity_id']
    del df['entity_charOffset']

    print(df_train.shape)

    text_train = df_train['sentence_text'].as_matrix()
    text_test = df_test['sentence_text'].as_matrix()

    sw = stopwords.words("english")
    vectorizer = TfidfVectorizer(lowercase=True,
                                 binary=True,
                                 stop_words=sw,
                                 sublinear_tf=True,
                                 norm=None)

    x_train = vectorizer.fit_transform(text_train).toarray()
    x_test = vectorizer.transform(text_test).toarray()

    entity_name_train = vectorizer.transform(
        df_train['entity_name'].as_matrix()).toarray()
    entity_name_test = vectorizer.transform(
        df_test['entity_name'].as_matrix()).toarray()

    x_train = np.concatenate((x_train, entity_name_train), axis=1)
    x_test = np.concatenate((x_test, entity_name_test), axis=1)

    print(df.head())
    new_data = []
    all_tags = []
    for i in range(len(df_train)):
        text = nltk.word_tokenize(df_train['sentence_text'][i])
        tagged_sent = nltk.pos_tag(text)
        found = False
        for t in tagged_sent:
            if (t[0].lower() in df_train['entity_name'][i].lower()
                    and found == False):
                new_data.append(t[1])
                all_tags.append(t[1])
                found = True
        if (found == False):
            new_data.append('0')
            all_tags.append('0')
    dftrain_tag = pd.DataFrame(new_data, columns=['entity_tag'])
    #print(x_train.shape)
    #print(len(new_data))

    new_datat = []
    for i in range(10343, 10343 + len(df_test)):
        text = nltk.word_tokenize(df_test['sentence_text'][i])
        tagged_sent = nltk.pos_tag(text)
        found = False
        for t in tagged_sent:
            if (t[0].lower() in df_test['entity_name'][i].lower()
                    and found == False):
                new_datat.append(t[1])
                all_tags.append(t[1])
                found = True
        if (found == False):
            new_datat.append('0')
            all_tags.append('0')
    dftest_tag = pd.DataFrame(new_datat, columns=['entity_tag'])

    df_alltags = pd.DataFrame(all_tags, columns=['entity_tag'])
    #print(x_test.shape)
    #print(len(new_datat))
    #print(dftrain_tag.head())
    tags = df_alltags['entity_tag'].unique()
    tags_dict = dict(zip(tags, range(len(tags))))
    dftrain_tag = dftrain_tag.replace(tags_dict)
    dftest_tag = dftest_tag.replace(tags_dict)
    #print(dftrain_tag.head())

    x_train = np.concatenate((x_train, dftrain_tag), axis=1)
    x_test = np.concatenate((x_test, dftest_tag), axis=1)
    #print(x_train[0:6])

    y_train = df_train['entity_type'].astype("category").cat.codes.as_matrix()
    y_test = df_test['entity_type'].astype("category").cat.codes.as_matrix()

    lb = LabelBinarizer()

    y_train = lb.fit_transform(y_train)
    y_test = lb.transform(y_test)

    pred = simple_nn(x_train, x_test, y_train)

    #pred = lb.inverse_transform(pred)
    y_train = lb.inverse_transform(y_train)
    y_test = lb.inverse_transform(y_test)

    pred_list = [pred]

    print(accuracy_score(pred, y_test))
    print(f1_score(pred, y_test, average='macro'))

    lgr = LogisticRegression(C=0.05, class_weight='balanced')
    lgr.fit(x_train, y_train)
    pred1 = lgr.predict(x_test)

    pred_list.append(pred1)

    print(accuracy_score(pred1, y_test))
    print(f1_score(pred1, y_test, average='macro'))

    svc = LinearSVC(C=0.0004, class_weight='balanced')
    svc.fit(x_train, y_train)
    pred2 = svc.predict(x_test)

    pred_list.append(pred2)

    print(accuracy_score(pred2, y_test))
    print(f1_score(pred2, y_test, average='macro'))

    rfc = ensemble.RandomForestClassifier(class_weight='balanced')
    rfc.fit(x_train, y_train)
    pred3 = rfc.predict(x_test)

    pred_list.append(pred3)

    print(accuracy_score(pred3, y_test))
    print(f1_score(pred3, y_test, average='macro'))

    #gb = ensemble.GradientBoostingClassifier()
    #gb.fit(x_train, y_train)
    #pred = gb.predict(x_test)

    #pred_list.append(pred)

    final_pred = []
    for i in range(len(pred_list[0])):
        temp = [0, 0, 0, 0]
        for j in range(len(pred_list)):
            temp[pred_list[j][i]] += 1

        final_pred.append(np.argmax(temp))

    pred = final_pred

    print(pred)
    print(accuracy_score(pred, y_test))
    print(f1_score(pred, y_test, average='macro'))

    preds = list(set(pred))
    br = []

    for j in range(len(preds)):
        br.append(0)

    for i in pred:
        for j in range(len(preds)):
            if i == preds[j]:
                br[j] += 1
    print(br)

    preds = list(set(y_test))
    br = []

    for j in range(len(preds)):
        br.append(0)

    for i in y_test:
        for j in range(len(preds)):
            if i == preds[j]:
                br[j] += 1
    print(br)
Example #50
0
START_EPOCH = 50

# load the training and testing data, converting the images from
# integers to floats
print("[INFO] loading CIFAR-10 data...")
((x_train, y_train), (x_test, y_test)) = cifar10.load_data()
x_train = x_train.astype("float")
x_test = x_test.astype("float")
# apply mean subtraction to the data
mean = np.mean(x_train, axis=0)
x_train -= mean
x_test -= mean
# convert the labels from integers to vectors
lb = LabelBinarizer()
y_train = lb.fit_transform(y_train)
y_test = lb.transform(y_test)
# construct the image generator for data augmentation
aug = ImageDataGenerator(width_shift_range=0.1,
                         height_shift_range=0.1, horizontal_flip=True,
                         fill_mode="nearest")
# if there is no specific model checkpoint supplied, then initialize
# the network (ResNet-56) and compile the model
if MODEL_PATH:
    print("[INFO] loading {}...".format(MODEL_PATH))
    model = load_model(MODEL_PATH)
    # update the learning rate
    print("[INFO] old learning rate: {}".format(
        K.get_value(model.optimizer.lr)))
    K.set_value(model.optimizer.lr, 1e-2)
    print("[INFO] new learning rate: {}".format(
        K.get_value(model.optimizer.lr)))
Example #51
0
def multiclass_roc_auc_score(y_test, y_pred, average="weighted"):
    lb = LabelBinarizer()
    lb.fit(y_test)
    y_test = lb.transform(y_test)
    y_pred = lb.transform(y_pred)
    return metrics.roc_auc_score(y_test, y_pred, average=average)
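A brief usage sketch for multiclass_roc_auc_score above (added for illustration; assumes `from sklearn import metrics` as the function body implies). The toy label lists are placeholders:

y_true = ["cat", "dog", "bird", "dog", "cat", "bird"]
y_hat = ["cat", "dog", "dog", "dog", "bird", "bird"]
print(multiclass_roc_auc_score(y_true, y_hat))  # weighted one-vs-rest ROC AUC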
Example #52
0
train = 'train.csv'
test = 'test.csv'
#out = sys.argv[4]
batch_size = 1000
lr = 0.01
activation = "sigmoid" # sigmoid or relu
hidden_layers = [32,16] #no of units in each layer


train_x, train_y = read_data(train)
train_x = train_x / 255
test_y = train_y[train_y.shape[0]-10000:]
# train_x = scale(train_x)
lb = LabelBinarizer()
lb.fit([i for i in range(10)]) #since 10 outputs are possible
train_y = lb.transform(train_y)

#original test dataset
#test_x, test_y = read_data(test)

#taking test split from train for validation
test_x = train_x[train_y.shape[0]-10000:]
#test_y = train_y[train_y.shape[0]-10000:]
train_x = train_x[:-10000]
train_y = train_y[:-10000]


# In[76]:


#(self, num_inputs, num_hidden_units_list, activation)
Example #53
0
x_train, x_test, y_train_1000, y_test_1000 = train_test_split(
    data_1000['sequence'],
    data_1000['classification'],
    test_size=0.2,
    random_state=123)

# In[34]:

print(x_train.shape)
print(x_test.shape)

# In[35]:

lb = LabelBinarizer()
y_train = lb.fit_transform(y_train_1000)
y_test = lb.transform(y_test_1000)
print('number of classes %d' % y_train.shape[1])

# In[36]:


def create_ngram_set(input_list, ngram_value=2):
    """
    Extract a set of n-grams from a list of integers.

    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2)
    {(4, 9), (4, 1), (1, 4), (9, 4)}

    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3)
    [(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)]
    """
Example #54
0
class SNLI:
    def __init__(self, w2vec):
        self.data = {}
        self.data['X'] = {}
        self.data['y'] = {}
        self.data['X']['train'], self.data['y']['train'] = self.loadData(
            'train')
        self.data['X']['test'], self.data['y']['test'] = self.loadData('test')
        self.data['X']['dev'], self.data['y']['dev'] = self.loadData('dev')
        self.le = LabelBinarizer()
        self.w2vec = w2vec
        self.le.fit(['entailment', 'neutral', 'contradiction'])

    def loadData(self, dataset, onlyGoldLabels=True, tokenize=True):
        """
        onlyGoldLabels:
            Some sentence pairs have no final gold label, only the five annotator
            labels, which disagree; when True, such pairs are skipped.

        tokenize:
            Split sentences into tokens.
        """
        y = []
        X = []
        with open('../data/snli/snli_1.0_' + dataset + '.txt') as datafile:
            prev = None
            for line in datafile:
                if prev is None:
                    prev = line
                    continue
                parts = line.split("\t")

                if onlyGoldLabels:
                    if parts[0] == '-':
                        continue
                else:
                    raise NotImplementedError
                y.append(parts[0])
                X.append(
                    [self.preprocess(parts[5]),
                     self.preprocess(parts[6])])
        return X, y

    def preprocess(self, sentence, removePunct=True, lowerCase=False):
        # honor the flags instead of ignoring them (the original always lowercased)
        if removePunct:
            sentence = sentence.translate(None, string.punctuation)
        if lowerCase:
            sentence = sentence.lower()
        return word_tokenize(sentence)

    def getMaxLengths(self):
        maxLen = [None, None]
        for ds in self.data['X']:
            for sent in self.data['X'][ds]:
                if maxLen[0] is None or len(sent[0]) > maxLen[0]:
                    maxLen[0] = len(sent[0])
                if maxLen[1] is None or len(sent[1]) > maxLen[1]:
                    maxLen[1] = len(sent[1])
        return maxLen

    def getX(self, dataset, start_index, end_index):
        premise = []
        hypothesis = []

        sentences = self.data['X'][dataset][start_index:end_index]

        for pair in sentences:
            prem = []
            for w in pair[0]:
                try:
                    toappend = self.w2vec.convertWord(w)
                except KeyError:
                    toappend = self.w2vec.unkWordRep()
                prem.append(toappend)
            premise.append(np.asarray(prem))

            hyp = []
            for w in pair[1]:
                try:
                    toappend = self.w2vec.convertWord(w)
                except KeyError:
                    toappend = self.w2vec.unkWordRep()
                hyp.append(toappend)
            hypothesis.append(np.asarray(hyp))

        rval = np.asarray(premise), np.asarray(hypothesis)
        return rval

    def getY(self, dataset, start_index=None, end_index=None):
        #converts label to 0,1,2
        if start_index is not None and end_index is not None:
            return self.le.transform(
                self.data['y'][dataset][start_index:end_index])
        else:
            return self.le.transform(self.data['y'][dataset])

    def getData(self, dataset):
        return self.getX(dataset), self.getY(dataset)
Example #55
0
traindata = np.asarray(traindata)
trainlabel = np.asarray(trainlabel)
testdata = np.asarray(testdata)
testlabel = np.asarray(testlabel)
testdata = testdata.astype("float32")
traindata = traindata.astype("float32")

print('Training data shape : ', traindata.shape, trainlabel.shape)
print('Testing data shape : ', testdata.shape, testlabel.shape)
from sklearn.preprocessing import LabelBinarizer
lblbin = LabelBinarizer()
train_labels_onehot = lblbin.fit_transform(trainlabel)
test_labels_onehot = lblbin.transform(testlabel)
detail(train_labels_onehot)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
model = Sequential()
# a small stack of fully connected layers (no convolutions are actually used here)
model.add(Input(shape=(3,)))
model.add(Dense(3))
model.add(Activation("softmax"))
model.add(Dense(3))
model.add(Activation("relu"))
model.add(Dense(2))
model.add(Dense(3))
model.add(Activation("softmax"))
model.add(Dense(2))
Example #56
0
class ActualTreatmentPredictor(_BasePredictor):
    """Returns the most likely treatments for a patient.

    Args:
        prediction_model: The model used to make predictions
        preprocessor: The preprocessor used to transform the patient features into a format that can be
        used by the prediction_model
        recommendation_probability_threshold: The probability threshold that a potential recommendation needs to have
        a higher probability than to be considered a possible treatment.
    """
    def __init__(self,
                 prediction_model,
                 preprocessor,
                 recommendation_probability_threshold=0.05):
        super().__init__(prediction_model, preprocessor)

        self._treatment_label_binarizer = LabelBinarizer()
        self._recommendation_probability_threshold = recommendation_probability_threshold

    def _pre_fit_hook(self, data):
        self._treatment_label_binarizer.fit(data.treatment.unique())

    def _get_outcome_data_for_training(self, data):
        return self._treatment_label_binarizer.transform(data.treatment.values)

    def _get_predicted_value(self, prediction):
        return self._treatment_label_binarizer.inverse_transform(prediction)

    def get_possible_treatments(self, data):
        """Returns the most likely treatments for a patient.

        Args:
            data: A dataframe containing patient features as well as a sample_id column. The sample_id column
            is needed because there can be many most likely treatments for a sample_id and the column is used
            to reconcile the treatment with the record.

        Returns:
            A dataframe with the following columns:
            sample_id: The sample_id the treatment is for.
            treatment: The treatment category

        """
        self._checked_is_trained()

        # predict_proba is expected to return one (n_samples, 2) array per
        # treatment class; column 1 of each array is the probability of that treatment
        probabilities_sectioned_by_treatment = self._pipeline.predict_proba(
            data)
        ordered_treatments = self._treatment_label_binarizer.classes_
        treatment_dfs = []

        for (treatment, probabilities_for_treatment) in zip(
                ordered_treatments, probabilities_sectioned_by_treatment):
            probability_of_treatment = [
                prob[1] if len(prob) > 1 else 0
                for prob in probabilities_for_treatment
            ]
            df = pd.DataFrame({
                "treatment": treatment,
                "probability_of_treatment": probability_of_treatment,
                "sample_id": range(len(probability_of_treatment))
            })
            treatment_dfs.append(df)

        combined_df = pd.concat(treatment_dfs)

        # Get all treatments that have a probability greater than the threshold
        sample_with_high_probability = \
            combined_df[combined_df.probability_of_treatment > self._recommendation_probability_threshold]

        # Get the top probability for a sample_id. This treatment will be used if there is no treatment for the
        # sample_id greater than the threshold
        top_treatment_per_sample_id = combined_df.groupby(
            "sample_id")["probability_of_treatment"].nlargest(
                1).reset_index().drop('level_1', axis=1)

        # Find the top treatments for samples that have no treatment above the
        # threshold. This is a rare case, but it can happen.
        samples_ids_with_high_prob = set(
            sample_with_high_probability.sample_id.unique())
        all_sample_ids = set(combined_df.sample_id.unique())
        ids_not_in_high_prob = all_sample_ids - samples_ids_with_high_prob

        top_treatments_for_samples_missing_high_prob =\
            top_treatment_per_sample_id[top_treatment_per_sample_id.sample_id.isin(ids_not_in_high_prob)]

        return pd.concat([
            sample_with_high_probability,
            top_treatments_for_samples_missing_high_prob
        ])
Example #57
0
def zad3(y_test, y_pred, average):
    lb = LabelBinarizer()
    lb.fit(y_test)
    y_test = lb.transform(y_test)
    y_pred = lb.transform(y_pred)
    return roc_auc_score(y_test, y_pred, average=average)
Example #58
0
def multiclass_roc_auc_score(y_true, y_pred, average='macro'):
    lb = LabelBinarizer()
    lb.fit(y_true)
    y_true = lb.transform(y_true)
    y_pred = lb.transform(y_pred)
    return roc_auc_score(y_true, y_pred, average="weighted")
Example #59
0
    # save label_maker
    print('Saving object to convert output to labels ' + OUT_FOLDER +
          '/label_maker.' + MODEL_NAME + '.pickle')

    with open(OUT_FOLDER + '/label_maker.' + MODEL_NAME + '.pickle',
              'wb') as handle:
        pickle.dump(label_maker, handle, protocol=pickle.HIGHEST_PROTOCOL)

    labels = list(np.repeat('Coronaviridae',len(Coronaviridae_reads))) + \
             list(np.repeat('Influenza',len(Influenza_reads))) + \
             list(np.repeat('Metapneumovirus',len(Metapneumovirus_reads))) + \
             list(np.repeat('Rhinovirus',len(Rhinovirus_reads))) + \
             list(np.repeat('Sars_cov_2',len(Sars_cov_2_reads))) + \
             list(np.repeat('Human',len(Human)))

    labels_proces = label_maker.transform(labels)

    # Tokenize the vocabulary
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(total_sequences)
    print('Converting reads into k-mers of length ' + str(K_MERS))
    sequences_preproces = tokenizer.texts_to_sequences(total_sequences)

    max_length = max([len(s.split()) for s in total_sequences])
    # pad sequences
    sequences_preproces = pad_sequences(sequences_preproces,
                                        maxlen=max_length,
                                        padding='post')

    print('Saving tokenizer object ' + OUT_FOLDER + '/tokenizer.' +
          MODEL_NAME + '.pickle')
Example #60
0
    data.append(image)
    labels.append(label)

# scale the raw pixel intensities to the range [0, 1] (this improves training)
data = np.array(data, dtype="float") / 255.0
labels = np.array(labels)

# Split the training data into separate train and test sets
(X_train, X_test, Y_train, Y_test) = train_test_split(data,
                                                      labels,
                                                      test_size=0.25,
                                                      random_state=0)

# Convert the labels (letters) into one-hot encodings that Keras can work with
lb = LabelBinarizer().fit(Y_train)
Y_train = lb.transform(Y_train)
Y_test = lb.transform(Y_test)

# Save the mapping from labels to one-hot encodings.
# We'll need this later when we use the model to decode what its predictions mean
with open(MODEL_LABELS_FILENAME, "wb") as f:
    pickle.dump(lb, f)
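# Added illustration (not in the original script): at prediction time the saved
# binarizer can be reloaded to turn one-hot network outputs back into letters:
#
#   with open(MODEL_LABELS_FILENAME, "rb") as f:
#       lb = pickle.load(f)
#   predicted_letters = lb.inverse_transform(model.predict(letter_images))
#
# (`letter_images` is a placeholder name for preprocessed 20x20x1 inputs.)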

# Build the neural network!
model = Sequential()

# First convolutional layer with max pooling
model.add(
    Conv2D(20, (5, 5),
           padding="same",
           input_shape=(20, 20, 1),