Example 1
def normalize_data(data, target):
    data.replace({'None': np.nan}, inplace=True)
    types = pd.read_csv('data/datatypes.csv')
    for i, row in types.iterrows():
        data[row['feature']] = data[row['feature']].astype(row['type'])
    data['memFreq'].fillna(0, inplace=True)
    data['memtRFC'].fillna(0, inplace=True)

    os_le = LabelEncoder()
    cpu_full_le = LabelEncoder()
    cpu_arch_le = LabelEncoder()
    mem_type_le = LabelEncoder()
    data['cpuFull'] = cpu_full_le.fit_transform(data['cpuFull'])
    data['os'] = os_le.fit_transform(data['os'])
    data['cpuArch'] = cpu_arch_le.fit_transform(data['cpuArch'])
    data['memType'] = mem_type_le.fit_transform(data['memType'])
    # drop single value columns
    data = data.drop(['cacheL3IsShared', 'BMI', 'CLF_._Cache_Line_Flush', 'CMOV_._Conditionnal_Move_Inst.',
                      'CX8_._CMPXCHG8B', 'FXSR.FXSAVE.FXRSTOR', 'IA.64_Technology',
                      'MMX_Technology', 'SSE', 'SSE2', 'SSE4a', 'SSE5', 'TBM', 'X3DNow_Pro_Technology'], axis=1)

    data['C0'] = np.log(data['n'] * data['m'] * data['k'])
    data = data.drop(['m', 'n', 'k'], axis=1)
    return data, target, {
        'os': os_le,
        'cpuFull': cpu_full_le,
        'cpuArch': cpu_arch_le,
        'memType': mem_type_le,
    }
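The dict of fitted encoders returned above can later map the integer codes back to the original category names; a minimal sketch (variable names as in the example):

data, target, encoders = normalize_data(data, target)
# decode the first few encoded 'os' values back to their original labels
original_os = encoders['os'].inverse_transform(data['os'].values[:5])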
Example 2
def test_label_encoder_fit_transform():
    # Test fit_transform
    le = LabelEncoder()
    ret = le.fit_transform([1, 1, 4, 5, -1, 0])
    assert_array_equal(ret, [2, 2, 3, 4, 0, 1])

    le = LabelEncoder()
    ret = le.fit_transform(["paris", "paris", "tokyo", "amsterdam"])
    assert_array_equal(ret, [1, 1, 2, 0])
Example 3
def test_label_encoder_fit_transform():
    # Test fit_transform
    le = LabelEncoder()
    ret = le.fit_transform([1, 1, 4, 5, -1, 0])
    assert_array_equal(ret, [2, 2, 3, 4, 0, 1])

    le = LabelEncoder()
    ret = le.fit_transform(["paris", "paris", "tokyo", "amsterdam"])
    assert_array_equal(ret, [1, 1, 2, 0])
Example 4
def design_matrix(sample_labels):
    
    factors_dict = []
    n_factors = 0
    
    for i in range(sample_labels.shape[1]):
        unique_labels = np.unique(sample_labels[:,i])
        if len(unique_labels) == 1:
            label_factors = 0
        else:
            label_factors = len(unique_labels)
        
        n_factors+=label_factors
        factors_dict.append(label_factors)
    
    X = np.zeros((sample_labels.shape[0], n_factors))
    
    lb = LabelEncoder()
    factor_labels = []
    offset = 0
    for i, factor in enumerate(factors_dict):
        if factor == 0:
            continue
        index = lb.fit_transform(sample_labels.T[i])
        for j in range(sample_labels.shape[0]):
            X[j,index[j]+offset] = 1
        
        factor_labels.append(lb.classes_)
        
        offset+=factor
    
    return X, np.hstack(factor_labels), factors_dict
Example 5
def davies_bouldin_index(X, labels, metric='euclidean'):
    """Compute the Davies Bouldin index.
  The index is defined as the ratio of within-cluster
  and between-cluster distances.
  Parameters
  ----------
  X : array-like, shape (``n_samples``, ``n_features``)
      List of ``n_features``-dimensional data points. Each row corresponds
      to a single data point.
  labels : array-like, shape (``n_samples``,)
      Predicted labels for each sample.
  Returns
  -------
  score : float
      The resulting Davies-Bouldin index.
  References
  ----------
  .. [1] `Davies, David L.; Bouldin, Donald W. (1979).
     "A Cluster Separation Measure". IEEE Transactions on
     Pattern Analysis and Machine Intelligence. PAMI-1 (2): 224-227`_
  """
    X, labels = check_X_y(X, labels)
    le = LabelEncoder()
    labels = le.fit_transform(labels)
    n_samples, _ = X.shape
    n_labels = len(le.classes_)

    check_number_of_labels(n_labels, n_samples)
    intra_dists = np.zeros(n_labels)
    centroids = np.zeros((n_labels, len(X[0])), np.float32)
    # print("Start")
    # print(labels)
    # print(X)
    for k in range(n_labels):
        cluster_k = X[labels == k]
        mean_k = np.mean(cluster_k, axis=0)
        centroids[k] = mean_k
        # print("Process")
        # print(mean_k)
        # print(cluster_k)
        intra_dists[k] = np.average(
            pairwise_distances(cluster_k, [mean_k], metric=metric))
    centroid_distances = pairwise_distances(centroids, metric=metric)
    with np.errstate(divide='ignore', invalid='ignore'):
        if np.all((intra_dists[:, None] + intra_dists) == 0.0) or \
          np.all(centroid_distances == 0.0):
            return 0.0
        scores = (intra_dists[:, None] + intra_dists) / centroid_distances
        # remove inf values (each cluster paired with itself has zero centroid distance)
        scores[scores == np.inf] = np.nan
        # the index is the mean over clusters of the worst-case within/between ratio
        return np.mean(np.nanmax(scores, axis=1))
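Recent scikit-learn releases (0.20 and later) ship an equivalent built-in metric, sklearn.metrics.davies_bouldin_score; a minimal usage sketch, assuming X and labels as above:

from sklearn.metrics import davies_bouldin_score

score = davies_bouldin_score(X, labels)  # lower values indicate better-separated clusters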
Example 6
def test_label_encoder(values, classes, unknown):
    # Test LabelEncoder's transform, fit_transform and
    # inverse_transform methods
    le = LabelEncoder()
    le.fit(values)
    assert_array_equal(le.classes_, classes)
    assert_array_equal(le.transform(values), [1, 0, 2, 0, 2])
    assert_array_equal(le.inverse_transform([1, 0, 2, 0, 2]), values)
    le = LabelEncoder()
    ret = le.fit_transform(values)
    assert_array_equal(ret, [1, 0, 2, 0, 2])

    with pytest.raises(ValueError, match="unseen labels"):
        le.transform(unknown)
Example 7
def test_label_encoder(values, classes, unknown):
    # Test LabelEncoder's transform, fit_transform and
    # inverse_transform methods
    le = LabelEncoder()
    le.fit(values)
    assert_array_equal(le.classes_, classes)
    assert_array_equal(le.transform(values), [1, 0, 2, 0, 2])
    assert_array_equal(le.inverse_transform([1, 0, 2, 0, 2]), values)
    le = LabelEncoder()
    ret = le.fit_transform(values)
    assert_array_equal(ret, [1, 0, 2, 0, 2])

    with pytest.raises(ValueError, match="unseen labels"):
        le.transform(unknown)
Example 8
    def construct_features(self):
        '''
        Construct features.
        '''
        # Parse date features.
        print "Parsing date features"
        parsed_train_X = self.parse_date_feature(self.train_x[:, 0])
        parsed_test_X = self.parse_date_feature(self.test_x[:, 0])

        # Parse other features.
        print "Parsing all features"
        total_train = len(self.train_x)
        total_test = len(self.test_x)

        for index_feature in range(1, len(self.train_x[0])):
            print "Processing feature ", index_feature

            # Check if we have a categorical feature.
            labels = np.unique(self.train_x[:, index_feature])

            # If we have string or binary labels, we have a categorical feature.
            if type(self.train_x[0, index_feature]) == np.str or len(labels) == 2:
                # We have a categorical feature.

                # Encode it in the one hot format.
                original_data = np.hstack((self.train_x[:, index_feature],
                                           self.test_x[:, index_feature]))

                label_encoder = LabelEncoder()
                data_label_encoded = label_encoder.fit_transform(original_data)
                encoder = OneHotEncoder()
                data_encoded = encoder.fit_transform(data_label_encoded.reshape((len(data_label_encoded), 1)))
                data_encoded = np.asarray(data_encoded.todense()).astype(np.bool)

                # Add encoded feature to data.
                parsed_train_X = np.hstack((parsed_train_X, data_encoded[0:total_train, :]))
                parsed_test_X = np.hstack((parsed_test_X, data_encoded[total_train:, :]))
                del data_encoded
            else:
                # We have a numeric feature.

                # Just add it to the data.
                parsed_train_X = np.hstack((parsed_train_X,
                                            self.train_x[:, index_feature].reshape((total_train, 1))))
                parsed_test_X = np.hstack((parsed_test_X,
                                           self.test_x[:, index_feature].reshape((total_test, 1))))

        self.train_x = parsed_train_X
        self.test_x = parsed_test_X
Example 9
class BaiduQA:
  def __init__(self, pt):
    self.labels, self.docs = self.load(pt)
    self.doc_num = len(self.docs)
    print("doc_num=%d" % self.doc_num)

    self.label_encoder = LabelEncoder()
    self.ys = self.label_encoder.fit_transform(self.labels)
    self.label_num = len(self.label_encoder.classes_)
    print("label_num=%d" % self.label_num)

    self.tokenizer = Tokenizer(split=" ")
    self.tokenizer.fit_on_texts(self.docs)
    self.xs = self.tokenizer.texts_to_sequences(self.docs)
    self.voca_size = max(self.tokenizer.word_index.values()) + 1
    print("voca_size=%d" % self.voca_size)

  @staticmethod
  def load(pt):
    labels = []
    docs = []
    print("read:" + pt)
    lines = open(pt).readlines()
    shuffle(lines)
    for l in lines:
      label, doc = l.strip().split("\t")[1].split("|")
      labels.append(label)
      docs.append(doc)

    print("n(doc)=%d" % len(labels))
    return labels, docs

  def split(self):
    train_ys, test_ys, train_xs, test_xs = train_test_split(self.ys, self.xs, train_size=0.75)
    return train_ys, test_ys, train_xs, test_xs

  def next_batch(self, batch_size):
    i = 0
    while True:
      if i + batch_size > self.doc_num:
        i = 0  # wrap around to the start of the data
      yield (self.ys[i: i + batch_size], self.xs[i: i + batch_size])
      i += batch_size  # advance to the next batch
Example 10
            names += [('var%d(t+%d)' % (j + 1, i)) for j in range(n_vars)]
    # put it all together
    agg = concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg


# load dataset
dataset = read_csv('data/pollution.csv', header=0, index_col=0)
values = dataset.values
# integer encode direction
encoder = LabelEncoder()
values[:, 4] = encoder.fit_transform(values[:, 4])
# ensure all data is float
values = values.astype('float32')
# normalize features
scaler = MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit_transform(values)

# specify the number of lag hours
n_hours = 1
n_features = 8  ### NOT USED YET
# frame as supervised learning
reframed = series_to_supervised(scaled, n_hours, 1)
# drop columns we don't want to predict
reframed.drop(reframed.columns[[9, 10, 11, 12, 13, 14, 15]],
              axis=1,
              inplace=True)
Example 11
class CustomNNCategorical(CustomNNBase):
    """
    Base for a custom scikit-learn-style classifier implementing a NN using Keras.

    Implements an MLP for classification with a custom Cohen's kappa metric; no custom loss for now (@todo).
    """
    def __init__(self,
                 hidden=[200, 100, 50, 20],
                 dropout=[0.1, 0.1],
                 reg=[0.05, 0.05],
                 h_act=[relu],
                 epoch=500,
                 batch_size=32,
                 cbEarly="metric",
                 loss="categorical_crossentropy",
                 optimizer='adam',
                 metrics=['cohen_kappa'],
                 kappa_weights="quadratic",
                 validation=0.2,
                 smooth_cb=True):
        '''        
        :param hidden:
        :param dropout:  dropout[0] is applied to the input layer, the rest to the hidden layers
        :param reg: regularization
        :param h_act: hidden activation
        :param epoch:
        :param batch_size:
        :param cbEarly: "metric" or an EarlyStopping instance
        :param loss:
        :param optimizer:
        :param metrics: 'accuracy' or 'cohen_kappa'
        :param kappa_weights: passed to sklearn (e.g. "quadratic", None); ignored if metrics != 'cohen_kappa'
        :param smooth_cb: if True, EarlyStopping uses a smoothed val_cohen_kappa (left moving average, window 3);
         only applies with val_cohen_kappa
        
        :note restore_best_weights requires keras 2.2.3
        
        '''
        assert loss in ["categorical_crossentropy", lossOCC, lossOCCQuadratic]
        CustomNNBase.__init__(self, epoch, loss, optimizer, metrics,
                              batch_size)
        # 'categorical_crossentropy', OCC.lossOCCQuadratic, lossOCC
        assert (len(hidden) > 0) & (len(hidden)+1 >= len(dropout)) & \
            (len(hidden) >= len(reg)) & (len(hidden) >= len(h_act))

        self.hidden = hidden
        self.dropout = dropout
        self.reg = reg
        self.h_act = h_act
        self.validation = validation

        self.final_activation = softmax

        self.cbEarly = cbEarly
        self.smooth_cb = smooth_cb

        self.cbReduceLR = ReduceLROnPlateau(monitor='loss',
                                            factor=0.8,
                                            patience=3,
                                            verbose=0,
                                            mode='auto',
                                            min_delta=0.0001,
                                            cooldown=0,
                                            min_lr=0)

        #         self.cbReduceLR = ReduceLROnPlateau(monitor='loss', factor=0.5,
        #                               patience=4, min_lr=0.000001, verbose=0)

        #         ReduceLROnPlateau(monitor='val_loss', factor=0.2,
        #                               patience=2, min_lr=0.000001, verbose=0)
        #

        self.kappa_weights = kappa_weights
        if len(self.metrics) > 1: raise NotImplementedError("TODO")

    def __compile(self, input_shape, output_shape):
        ter = lambda x, i: None if len(x) <= i else x[i]
        reg = [regularizers.l2(i) for i in self.reg
               ]  #@TODO ALSO USE L1 FOR BETTER FEATURE SELECTION
        h_act = self.h_act * round(len(self.hidden) / len(self.h_act))

        self.model = Sequential()

        self.model.add(InputLayer(input_shape=(input_shape, )))
        if ter(self.dropout, 0) is not None:
            self.model.add(Dropout(ter(self.dropout, 0)))

        for i in range(0, len(self.hidden)):
            self.model.add(
                Dense(self.hidden[i],
                      activation=h_act[i],
                      kernel_regularizer=ter(reg, i),
                      bias_regularizer=ter(reg, i)))
            if ter(self.dropout, i + 1) is not None:
                self.model.add(Dropout(ter(self.dropout,
                                           i + 1)))  # first for input

        self.model.add(Dense(output_shape, activation=self.final_activation))

        self.model.compile(optimizer=self.optimizer,
                           loss=self.loss,
                           metrics=self.metrics)

    def __category_to_output(self, y):
        self.label_encoder = LabelEncoder()
        y = self.label_encoder.fit_transform(y)
        target = to_categorical(y, num_classes=np.unique(y).size)
        return target

    def __output_to_category(self, output):
        pred = [np.argmax(i) for i in output]
        pred = self.label_encoder.inverse_transform(pred)
        return pred

    def cohen_kappa_metric_keras(self, y_true, y_pred):
        '''
        Does not work as a metric because kappa is not linear and Keras computes a weighted average of per-batch scores
        :deprecated @see Cohen_kappa_logger
        '''
        raise NotImplementedError("deprecated @see Cohen_kappa_logger")
        return tf.py_func(self.cohen_kappa_score, [y_true, y_pred], tf.float32)

    def cohen_kappa_score(self, y_true, y_pred):
        raise "deprecated @see Cohen_kappa_logger"
        y_pred = self.__output_to_category(y_pred)
        y_true = self.__output_to_category(y_true)

        score = metrics.cohen_kappa_score(y_true,
                                          y_pred,
                                          weights=self.kappa_weights)
        return score.astype(np.float32)

    def break_on_epoch_n(self, threshold, sec=60):
        self.n_epoch = len(self.history.history["loss"])
        if self.n_epoch > threshold:
            sleep(sec)  # cool down

    def _fit_val(self, X, output):
        # @todo clean below
        if type(self.validation) is float:
            self.history = self.model.fit(X,
                                          output,
                                          validation_split=self.validation,
                                          epochs=self.epoch,
                                          batch_size=self.batch_size,
                                          callbacks=self.callback_list,
                                          verbose=0)

        elif type(self.validation) is tuple:
            assert self.validation[0].shape[1] == X.shape[
                1], "X_validation must be transformed with prep first"
            self.validation = (self.validation[0],
                               self.__category_to_output(self.validation[1]))
            self.history = self.model.fit(X,
                                          output,
                                          validation_data=self.validation,
                                          epochs=self.epoch,
                                          batch_size=self.batch_size,
                                          callbacks=self.callback_list,
                                          verbose=0)

        elif self.validation is None:
            self.history = self.model.fit(X,
                                          output,
                                          epochs=self.epoch,
                                          batch_size=self.batch_size,
                                          callbacks=self.callback_list,
                                          verbose=0)
        else:
            raise "unknown validation type"

    def _kappa_disambiguation(self, X, output):
        '''
        :param X:
        :param output:
        '''
        self.metric_plot = None
        self.patience = 20  # for cbEarly; enough from observation  @todo move to init

        if self.metrics[0] == "accuracy":
            self.metric_plot = "acc"
            raise "min_delta must be redefined according to val_acc"
            if self.use_smooth_cb:
                raise 'not available for acc self.use_smooth_cb'
            if self.cbEarly == "metric":
                self.cbEarly = EarlyStopping(
                    monitor='val_acc' if self.validation else "acc",
                    min_delta=0.0001,
                    patience=self.patience,
                    verbose=0,
                    mode='auto')
            self.kappa_logger = None

        elif self.metrics[0] == 'cohen_kappa':
            self.metrics = None  # 'cohen_kappa_metric' cannot be supported @see explication in Cohen_kappa_logger
            self.metric_plot = 'cohen_kappa'
            if self.cbEarly == "metric":
                if self.validation:
                    monitor = "val_cohen_kappa_smoothed" if self.smooth_cb else "val_cohen_kappa"
                else:
                    if not self.smooth_cb:
                        monitor = "cohen_kappa"
                    else:
                        raise "No cohen_kappa_smoothed"
                print("monitor", monitor)
                self.cbEarly = EarlyStopping(
                    monitor=monitor if self.validation else "cohen_kappa",
                    min_delta=0.00000001,
                    patience=self.patience,  # a large patience is necessary!
                    verbose=0,
                    mode='max',
                    restore_best_weights=True)

            if type(self.validation) is float:
                X, X_val, output, y_val = train_test_split(
                    X, output, test_size=self.validation)
            elif type(self.validation) is tuple:
                assert self.validation[0].shape[1] == X.shape[
                    1], "X_validation must be transformed with prep first"
                X_val = self.validation[0]
                y_val = self.__category_to_output(self.validation[1])
            elif self.validation is not None:
                raise ValueError("unknown validation type")

            #             self.validation = None # can slightly reduce computation but need val_loss for callback LRReduceOnPlateau

            self.kappa_logger = Cohen_kappa_logger(
                output_to_category=self.__output_to_category,
                X_train=X,
                y_train=output,
                X_val=X_val,
                y_val=y_val,
                kappa_weights=self.kappa_weights)

        else:
            print(self.metrics[0])
            raise "not implemented"
        return X, output

    def fit(self, X, y=None):
        '''
        :param X:
        :param y:
        :param cbEarly: Parameter for early stopping
        '''
        output = self.__category_to_output(y)

        X, output = self._kappa_disambiguation(X, output)

        output_shape = output.shape[1]
        input_shape = X.shape[1]
        self.__compile(input_shape, output_shape)

        self.callback_list = []
        for cb in [self.kappa_logger, self.cbReduceLR, self.cbEarly]:
            if cb: self.callback_list.append(cb)

        self._fit_val(X, output)

        self.break_on_epoch_n(50)
        return self

    def predict(self, X, y=None):
        try:
            getattr(self, "history")
        except AttributeError:
            raise RuntimeError("Call fit first.")

        preds = self.model.predict(X)
        preds = self.__output_to_category(preds)
        return preds

    def plot_history(self, plotname="NN", saving_file=None):
        '''
        :param plotname:
        :param saving_file: filename where to save plots
        :return: plt, kept as a reference so the figure windows are not garbage-collected and closed
        '''
        history = self.history
        plot = (saving_file is None)
        #         print("History acc", history.history['acc'])
        #         print("History loss", history.history['loss'])
        #         print("History lr", history.history['lr'])
        #         print("Acc train (last)", history.history['acc'][-5:-1])

        import matplotlib.pyplot as plt

        if plot: plt.ion()
        if plot: plt.show()

        fig = plt.figure()
        plt.grid(True)
        plt.title(plotname)
        #         print("possible plot", history.history.keys())
        if self.metric_plot in history.history.keys():
            plt.subplot(221)
            plt.plot(history.history[self.metric_plot])
            plt.ylabel(self.metric_plot + "  ")
            if plot: plt.draw()

        if "val_" + self.metric_plot in history.history.keys():
            plt.subplot(222)
            #         print("possible plot", history.history.keys())
            plt.plot(history.history["val_" + self.metric_plot])
            plt.ylabel("val_" + self.metric_plot + "  ")
            if plot: plt.draw()

            if False:
                print("self.patience last epochs")
                print(history.history["val_" +
                                      self.metric_plot][-(self.patience + 1):])

        plt.subplot(223)
        plt.plot(history.history['loss'])
        plt.ylabel('"loss" ' + "  " + plotname)
        if plot: plt.draw()

        plt.subplot(224)
        if "val_cohen_kappa_smoothed" in history.history.keys():
            plt.plot(history.history['val_cohen_kappa_smoothed'])
            plt.ylabel("val_cohen_kappa_smoothed")
        else:
            plt.plot(history.history['lr'])
            plt.ylabel('"lr"' + "  " + plotname)
        if plot: plt.draw()
        if plot: plt.pause(1)

        if saving_file:
            fig.savefig(saving_file)
            plt = None  # drop the reference so it can be garbage-collected

        return plt
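A hypothetical usage sketch of the classifier above (the data variables and parameter values are illustrative, not taken from the source):

clf = CustomNNCategorical(hidden=[128, 64], dropout=[0.2, 0.2], epoch=100, validation=0.2)
clf.fit(X_train, y_train)    # y_train may hold raw category labels; they are label-encoded internally
y_pred = clf.predict(X_val)  # predictions come back as the original category labels
clf.plot_history(plotname="run-1")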
Example 12
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from utils import *
import tensorflow as tf
import glob
config = tf.ConfigProto()
#config.gpu_options.per_process_gpu_memory_fraction = 0.6  # pin the fraction of GPU memory to use
config.gpu_options.allow_growth = True  # grow GPU memory allocation on demand

BATCH_SIZE = 64
EPOCHS = 10

data_train_csv = pd.read_csv('data/labels.csv')
filenames = data_train_csv.id.values
le = LabelEncoder()
labels = le.fit_transform(data_train_csv.breed)
N_CLASS = len(le.classes_)

filenames_train , filenames_val ,labels_train, labels_val =\
    train_test_split(filenames,labels,test_size=0.1,stratify=labels)
filenames_test = [i.split('/')[-1].split('.')[0] for i in glob.glob('data/test/*')]
EPOCH_TRAIN_SIZE = len(filenames_train)//BATCH_SIZE + 1
EPOCH_VAL_SIZE =   len(filenames_val)//BATCH_SIZE + 1
EPOCH_TEST_SIZE = len(filenames_test)//BATCH_SIZE + 1
sess=tf.Session(config=config)

x_train, y_train = get_train_dataset(filenames_train,labels_train,BATCH_SIZE,rootdir='data/train')
x_val,y_val = get_train_dataset(filenames_val,labels_val,BATCH_SIZE,rootdir='data/train')
#x_test,id_test = get_test_dataset(filenames_test,BATCH_SIZE,rootdir='data/test')

endpoints_train= get_inceptionV3(x_train,y_train,n_class=N_CLASS,reuse=False,is_training=False,mode='dev')
Example 13
print df_all
'''
from sklearn.preprocessing import Imputer
imp = Imputer(missing_values='NaN',strategy='mean',axis=0)
df = imp.fit_transform(df)
df = pd.DataFrame(df,columns=df_all.columns)
print df
'''
'''First convert everything to numeric codes'''
from sklearn.preprocessing import LabelEncoder
class_label_encoder = LabelEncoder()
df = df_all.values
df1 = df11.values
df2 = df22.values
for i in range(14):
    df[:, i] = class_label_encoder.fit_transform(df[:, i])
    print df[:, i]
    df1[:, i] = class_label_encoder.fit_transform(df1[:, i])
    df2[:, i] = class_label_encoder.fit_transform(df2[:, i])
df = pd.DataFrame(df, columns=df_all.columns)
'''Split the dataset'''
'''
NUMBER = 1309
X_train = df[:NUMBER].drop(['survived'],1)
X_test = df[NUMBER:].drop(['survived'],1)
survivor = df['survived']
Y_train = survivor[:NUMBER]
Y_test = survivor[NUMBER:]
print Y_test
'''
NUMBER = 1309
Example 14
def dataset(dataset_name):

    if (dataset_name == 'cifar10'):
        print("| Preparing CIFAR-10 dataset...")
        sys.stdout.write("| ")
        trainset = torchvision.datasets.CIFAR10(root='./data',
                                                train=True,
                                                download=True,
                                                transform=transform_training())
        testset = torchvision.datasets.CIFAR10(root='./data',
                                               train=False,
                                               download=False,
                                               transform=transform_testing())
        outputs = 10
        inputs = 3

    elif (dataset_name == 'cifar100'):
        print("| Preparing CIFAR-100 dataset...")
        sys.stdout.write("| ")
        trainset = torchvision.datasets.CIFAR100(
            root='./data',
            train=True,
            download=True,
            transform=transform_training())
        testset = torchvision.datasets.CIFAR100(root='./data',
                                                train=False,
                                                download=False,
                                                transform=transform_testing())
        outputs = 100
        inputs = 3

    elif (dataset_name == 'mnist'):
        print("| Preparing MNIST dataset...")
        trainset = torchvision.datasets.MNIST(root='./data',
                                              train=True,
                                              download=True,
                                              transform=transform_training())
        testset = torchvision.datasets.MNIST(root='./data',
                                             train=False,
                                             download=False,
                                             transform=transform_testing())
        outputs = 10
        inputs = 1

    elif (dataset_name == 'fashionmnist'):
        print("| Preparing FASHIONMNIST dataset...")
        sys.stdout.write("| ")
        trainset = torchvision.datasets.FashionMNIST(
            root='./data',
            train=True,
            download=True,
            transform=transform_training())
        testset = torchvision.datasets.FashionMNIST(
            root='./data',
            train=False,
            download=False,
            transform=transform_testing())
        outputs = 10
        inputs = 1
    elif (dataset_name == 'stl10'):
        print("| Preparing STL10 dataset...")
        sys.stdout.write("| ")
        trainset = torchvision.datasets.STL10(root='./data',
                                              split='train',
                                              download=True,
                                              transform=transform_training())
        testset = torchvision.datasets.STL10(root='./data',
                                             split='test',
                                             download=False,
                                             transform=transform_testing())
        outputs = 10
        inputs = 3

    elif (dataset_name == 'dog-breed'):
        print("| Preparing DOG-BREED dataset...")

        data_train_csv = pd.read_csv('./data/dog-breed/labels.csv')
        filenames = data_train_csv.id.values
        le = LabelEncoder()
        labels = le.fit_transform(data_train_csv.breed)

        filenames_train, filenames_val, labels_train, labels_val = train_test_split(
            filenames, labels, test_size=0.3, stratify=labels, shuffle=True)
        trainset = get_train_dataset(filenames_train,
                                     labels_train,
                                     cf.batch_size,
                                     rootdir='./data/dog-breed/train')
        testset = get_train_dataset(filenames_val,
                                    labels_val,
                                    cf.batch_size,
                                    rootdir='./data/dog-breed/train')
        outputs = 120
        inputs = 3

    trainloader = torch.utils.data.DataLoader(trainset,
                                              batch_size=cf.batch_size,
                                              shuffle=True,
                                              num_workers=4)
    testloader = torch.utils.data.DataLoader(testset,
                                             batch_size=cf.batch_size,
                                             shuffle=True,
                                             num_workers=4)

    return trainloader, testloader, outputs, inputs
Example 15
def design_matrix(sample_labels, interaction_indices=None):
    """
    Parameters
    ----------
    sample_labels:
        a numpy matrix with, for each sample, a vector of the conditions
        we would like to model.
        Columns represent the types of condition we want to model;
        each row is a combination of conditions represented by the row variable.
        if we have a 2x3 design we build this matrix:
        [[0,0],
         [0,1],
         [0,2],
         [1,0],
         [1,1],
         [1,2]]
        
        
    
    Returns
    -------
    X: the design matrix.
    factor_labels: the labels of the design-matrix columns
    factor_num : number of factors for each condition
    
    """
        
    factor_num = []
    n_factors = 0
    
    for i in range(sample_labels.shape[1]):
        unique_labels = np.unique(sample_labels[:,i])
        if len(unique_labels) == 1:
            label_factors = 0
        else:
            label_factors = len(unique_labels)
        
        n_factors+=label_factors
        factor_num.append(label_factors)
    
    n_interactions = 0
    if interaction_indices is not None:
        interaction_factors = np.array(factor_num)[[interaction_indices]]
        n_interactions = np.prod(interaction_factors)
        Xint = np.zeros((sample_labels.shape[0], n_interactions))
    
    
    X = np.zeros((sample_labels.shape[0], n_factors))
    
    lb = LabelEncoder()
    factor_labels = []
    offset = 0
    for i, factor in enumerate(factor_num):
        if factor == 0:
            continue
        index = lb.fit_transform(sample_labels.T[i])
        for j in range(sample_labels.shape[0]):
            X[j,index[j]+offset] = 1
        
        factor_labels.append(lb.classes_)
        
        offset += factor
    
    if interaction_indices is not None:
        interaction_product = [np.arange(v).tolist() for v in interaction_factors]
        interaction_gen = cartesian(interaction_product)
        
        # This is buggy!!
        Xint = np.zeros((sample_labels.shape[0], n_interactions))
        offset = interaction_indices[0] * np.sum(factor_num[:interaction_indices[0]])
        offset = int(offset)
        for i, int_indices in enumerate(interaction_gen):
            
            index1 = offset + int_indices[0]
            index2 = offset + int_indices[1] + factor_num[interaction_indices[0]]
            
            Xint[:,i] = X[:,index1] * X[:,index2]
            
            factor1 = interaction_indices[0]
            factor2 = interaction_indices[1]

            new_label = factor_labels[factor1][int_indices[0]] + "_" + \
                        factor_labels[factor2][int_indices[1]]
                        
            factor_labels.append(new_label)
        
        X = np.hstack((X, Xint))
        
    return X, np.hstack(factor_labels), factor_num
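An illustrative call for the 2x3 design described in the docstring (input matrix as shown above):

sample_labels = np.array([[0, 0], [0, 1], [0, 2], [1, 0], [1, 1], [1, 2]])
X, factor_labels, factor_num = design_matrix(sample_labels)
# factor_num == [2, 3], so X has 2 + 3 = 5 indicator columns, one per level of each condition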
Example 16
def design_matrix(sample_labels, interaction_indices=None):
    """
    Parameters
    ----------
    sample_labels:
        a numpy matrix with, for each sample, a vector of the conditions
        we would like to model.
        Columns represent the types of condition we want to model;
        each row is a combination of conditions represented by the row variable.
        if we have a 2x3 design we build this matrix:
        [[0,0],
         [0,1],
         [0,2],
         [1,0],
         [1,1],
         [1,2]]
        
        
    
    Returns
    -------
    X: the design matrix.
    factor_labels: the labels of the design-matrix columns
    factor_num : number of factors for each condition
    
    """
        
    factor_num = []
    n_factors = 0
    
    for i in range(sample_labels.shape[1]):
        unique_labels = np.unique(sample_labels[:,i])
        if len(unique_labels) == 1:
            label_factors = 0
        else:
            label_factors = len(unique_labels)
        
        n_factors+=label_factors
        factor_num.append(label_factors)
    
    n_interactions = 0
    if interaction_indices is not None:
        interaction_factors = np.array(factor_num)[[interaction_indices]]
        n_interactions = np.prod(interaction_factors)
        Xint = np.zeros((sample_labels.shape[0], n_interactions))
    
    
    X = np.zeros((sample_labels.shape[0], n_factors))
    
    lb = LabelEncoder()
    factor_labels = []
    offset = 0
    for i, factor in enumerate(factor_num):
        if factor == 0:
            continue
        index = lb.fit_transform(sample_labels.T[i])
        for j in range(sample_labels.shape[0]):
            X[j,index[j]+offset] = 1
        
        factor_labels.append(lb.classes_)
        
        offset += factor
    
    if interaction_indices is not None:
        interaction_product = [np.arange(v).tolist() for v in interaction_factors]
        interaction_gen = cartesian(interaction_product)
        
        # This is buggy!!
        Xint = np.zeros((sample_labels.shape[0], n_interactions))
        offset = interaction_indices[0] * np.sum(factor_num[:interaction_indices[0]])
        offset = int(offset)
        for i, int_indices in enumerate(interaction_gen):
            
            index1 = offset + int_indices[0]
            index2 = offset + int_indices[1] + factor_num[interaction_indices[0]]
            
            Xint[:,i] = X[:,index1] * X[:,index2]
            
            factor1 = interaction_indices[0]
            factor2 = interaction_indices[1]

            new_label = factor_labels[factor1][int_indices[0]] + "_" + \
                        factor_labels[factor2][int_indices[1]]
                        
            factor_labels.append(new_label)
        
        X = np.hstack((X, Xint))
        
    return X, np.hstack(factor_labels), factor_num
Example 17
    def predict(self):

        try:
            # an exception is raised here if the path has not been set
            path = self.path.get()
            mylist = os.listdir(path)

            feeling_list = []
            for item in mylist:
                if item[6:-16] == '02' and int(item[18:-4]) % 2 == 0:
                    feeling_list.append('female_calm')

                elif item[6:-16] == '02' and int(item[18:-4]) % 2 == 1:
                    feeling_list.append('male_calm')

                elif item[6:-16] == '03' and int(item[18:-4]) % 2 == 0:
                    feeling_list.append('female_happy')

                elif item[6:-16] == '03' and int(item[18:-4]) % 2 == 1:
                    feeling_list.append('male_happy')

                elif item[6:-16] == '04' and int(item[18:-4]) % 2 == 0:
                    feeling_list.append('female_sad')

                elif item[6:-16] == '04' and int(item[18:-4]) % 2 == 1:
                    feeling_list.append('male_sad')

                elif item[6:-16] == '05' and int(item[18:-4]) % 2 == 0:
                    feeling_list.append('female_angry')

                elif item[6:-16] == '05' and int(item[18:-4]) % 2 == 1:
                    feeling_list.append('male_angry')

                elif item[6:-16] == '06' and int(item[18:-4]) % 2 == 0:
                    feeling_list.append('female_fearful')

                elif item[6:-16] == '06' and int(item[18:-4]) % 2 == 1:
                    feeling_list.append('male_fearful')

            labels = pd.DataFrame(feeling_list)
            #showinfo('Notice', 'Extracting test set')
            df = pd.DataFrame(columns=['feature'])
            bookmark = 0
            for index, y in enumerate(mylist):
                name = mylist[index]
                if (name[6:-16] != '01' and name[6:-16] != '07'
                        and name[6:-16] != '08' and name[:2] != 'su'
                        and name[:1] != 'n' and name[:1] != 'd'
                        and name[:1] != 'A'):
                    X, sample_rate = librosa.load(path + '\\' + y,
                                                  res_type='kaiser_fast',
                                                  duration=2.5,
                                                  sr=22050 * 2,
                                                  offset=0.5)
                    sample_rate = np.array(sample_rate)
                    mfccs = np.mean(librosa.feature.mfcc(y=X,
                                                         sr=sample_rate,
                                                         n_mfcc=13),
                                    axis=0)
                    feature = mfccs
                    #[float(i) for i in feature]
                    #feature1=feature[:135]
                    df.loc[bookmark] = [feature]
                    bookmark = bookmark + 1

            df3 = pd.DataFrame(df['feature'].values.tolist())

            # put the features and the corresponding emotion into one table; the emotion column is named '0'

            newdf = pd.concat([df3, labels], axis=1)
            rnewdf = newdf.rename(index=str, columns={"0": "label"})

            rnewdf = shuffle(rnewdf)
            rnewdf = rnewdf.fillna(0)

            #print(rnewdf)

            # split the table into training and test sets

            newdf1 = np.random.rand(len(rnewdf)) < 0.2
            train = rnewdf[newdf1]
            test = rnewdf[~newdf1]

            # features are columns 0 to the second-to-last; the label is the last column
            testfeatures = test.iloc[:, :-1]

            testlabel = test.iloc[:, -1:]

            X_test = np.array(testfeatures)
            y_test = np.array(testlabel)

            lb = LabelEncoder()

            y_test = np_utils.to_categorical(lb.fit_transform(y_test))

            #print(y_train)

            # build the CNN model

            print('Extracting test set...')
            x_testcnn = np.expand_dims(X_test, axis=2)

            print(x_testcnn)

            print('Testing...')
            #showinfo('Notice', 'Testing...')
            preds = self.model.predict(x_testcnn, batch_size=32, verbose=1)

            preds1 = preds.argmax(axis=1)

            abc = preds1.astype(int).flatten()

            predictions = (lb.inverse_transform((abc)))

            preddf = pd.DataFrame({'predicted_values': predictions})
            actual = y_test.argmax(axis=1)
            abc123 = actual.astype(int).flatten()

            #print(abc)

            actualvalues = (lb.inverse_transform((abc123)))

            actualdf = pd.DataFrame({'actual_values': actualvalues})

            finaldf = actualdf.join(preddf)

            finaldf.to_csv('H:\\预测实际对照表.csv', index=False)
            showinfo("提示", "表格打印完成,已保存到H盘目录下")
            print('\n\n输出预测值与实际值的对比表格:\n\n')

            print(
                finaldf.groupby('actual_values').count().join(
                    finaldf.groupby('predicted_values').count()))

            #showinfo("Predicted vs. actual values", finaldf.groupby('actual_values').count())
        except FileNotFoundError:

            showwarning('warning', 'The path does not exist; please enter it again')
Example 18
            label_path = os.path.join(abs_path, article_file)
            if article_file.endswith('.json') and os.path.isfile(label_path):
                with open(label_path, 'r', encoding='utf-8') as f:
                    json_file = json.load(f)
                    for k, v in json_file.items():
                        words = re.findall('[a-zA-Z0-9]+', v)
                article_list.append(words)
                X.append(words)
                Y.append(label)

        if article_list:
            dic[label] = article_list  # dict: key = label, value = list of articles

dic
encoder = LabelEncoder()
encoder.fit_transform(Y)

values = []
values.extend(dic.values())
all_sentences = []
for each_label in values:
    for each_article in each_label:
        all_sentences.append(' '.join(each_article))

# load the corpus
sentences = word2vec.Text8Corpus(all_sentences)

# write to corpusSegDone_1.txt
with open(inp, 'w') as fin:
    fin.write('\n'.join(all_sentences))
Example 19
def train_classifier(input_data,
                     output_dir,
                     pretrained_model='bert-base-multilingual-cased',
                     cache_dir=None,
                     maxlen=64,
                     batch_size=32,
                     num_epochs=100,
                     logging_steps=1,
                     train_logs=None,
                     **kwargs):

    # read input data stream
    texts, choices = [], []
    for item in input_data:
        texts.append(item['input'][0])
        choices.append(item['output'][0])

    le = LabelEncoder()
    choices_ids = le.fit_transform(choices)

    tokenizer = BertTokenizer.from_pretrained(pretrained_model,
                                              cache_dir=cache_dir)

    train_dataloader = prepare_texts(texts, tokenizer, maxlen, RandomSampler,
                                     batch_size, choices_ids)

    model = BertForSequenceClassification.from_pretrained(
        pretrained_model,
        num_labels=len(le.classes_),
        output_attentions=False,
        output_hidden_states=False,
        cache_dir=cache_dir)
    model.to(device)

    total_steps = len(train_dataloader) * num_epochs
    optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=total_steps)
    global_step = 0
    total_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(num_epochs, desc='Epoch')
    if train_logs:
        tb_writer = SummaryWriter(
            logdir=os.path.join(train_logs, os.path.basename(output_dir)))
    else:
        tb_writer = None
    loss_queue = deque(maxlen=10)
    for epoch in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc='Iteration')
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'labels': batch[2]
            }
            outputs = model(**inputs)
            loss = outputs[0]
            loss.backward()
            total_loss += loss.item()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            model.zero_grad()
            global_step += 1
            if global_step % logging_steps == 0:
                last_loss = (total_loss - logging_loss) / logging_steps
                loss_queue.append(last_loss)
                if tb_writer:
                    tb_writer.add_scalar('lr',
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', last_loss, global_step)
                logging_loss = total_loss

        # slope-based early stopping
        if len(loss_queue) == loss_queue.maxlen:
            slope = calc_slope(loss_queue)
            if tb_writer:
                tb_writer.add_scalar('slope', slope, global_step)
            if abs(slope) < 1e-2:
                break

    if tb_writer:
        tb_writer.close()

    model_to_save = model.module if hasattr(
        model,
        'module') else model  # Take care of distributed/parallel training
    model_to_save.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    return {
        'model_path': output_dir,
        'batch_size': batch_size,
        'maxlen': maxlen,
        'pretrained_model': pretrained_model,
        'choices_map': list(map(str, le.classes_))
    }
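An illustrative call, assuming input_data is an iterable of dicts shaped like the items consumed in the loop above (text in item['input'][0], label in item['output'][0]):

input_data = [
    {'input': ['the service was great'], 'output': ['positive']},
    {'input': ['terrible experience'], 'output': ['negative']},
]
info = train_classifier(input_data, output_dir='model_out', num_epochs=3)
print(info['choices_map'])  # the label names seen by the encoder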
Example 20
                              index=X.columns)
        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)


big_train = train_df[feature_columns_to_use]
big_train_imputed = DataFrameImputer().fit_transform(big_train)

big_test = train_df[feature_columns_to_use]
big_test_imputed = DataFrameImputer().fit_transform(big_test)

le = LabelEncoder()
for feature in nonnumeric_columns:
    big_train_imputed[feature] = le.fit_transform(big_train_imputed[feature])
    big_test_imputed[feature] = le.fit_transform(big_test_imputed[feature])
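# Note: refitting the same LabelEncoder on each frame can assign different integer
# codes to the same category in train and test. A commented sketch of the usual
# alternative (assumes every test category also appears in the training frame):
#   for feature in nonnumeric_columns:
#       le.fit(big_train_imputed[feature])
#       big_train_imputed[feature] = le.transform(big_train_imputed[feature])
#       big_test_imputed[feature] = le.transform(big_test_imputed[feature])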

train_X = big_train_imputed[0:train_df.shape[0]]
test_X = big_test_imputed[0:test_df.shape[0]]
train_y = train_df['click']
test_Y = train_df['click']
target = 'click'
idcol = 'id'


# test_results = pd.read_csv('test_results.csv')


def modelfit(alg, dtrain, dtest, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    if useTrainCV:
Example 21
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
import numpy as np
from sklearn import preprocessing
import pandas as pd

df = pd.read_excel('F:\\WeiWeiHe\\titanic.xls')
'''df.convert_objects(convert_numeric=True)'''
'''df.fillna(0, inplace=True)'''

from sklearn.preprocessing import LabelEncoder
class_label_encoder = LabelEncoder()
print df.head()
df = df.values
df[:, 2] = class_label_encoder.fit_transform(df[:, 2])
df[:, 3] = class_label_encoder.fit_transform(df[:, 3])
df[:, 7] = class_label_encoder.fit_transform(df[:, 7])
df[:, 9] = class_label_encoder.fit_transform(df[:, 9])
df[:, 10] = class_label_encoder.fit_transform(df[:, 10])
df[:, 11] = class_label_encoder.fit_transform(df[:, 11])
df[:, 13] = class_label_encoder.fit_transform(df[:, 13])

from sklearn.preprocessing import Imputer
imp = Imputer(missing_values='NaN', strategy='mean', axis=1)
df = imp.fit_transform(df)
df = pd.DataFrame(df,
                  columns=[
                      'pclass', 'survived', 'name', 'sex', 'age', 'sibsp',
                      'parch', 'ticket', 'fare', 'cabin', 'embarked', 'boat',
                      'body', 'home.dest'