import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder


def normalize_data(data, target):
    # Replace string 'None' placeholders with real NaN values.
    data.replace({'None': np.nan}, inplace=True)

    # Cast each feature to the dtype declared in the datatypes table.
    types = pd.read_csv('data/datatypes.csv')
    for i, row in types.iterrows():
        data[row['feature']] = data[row['feature']].astype(row['type'])

    data['memFreq'].fillna(0, inplace=True)
    data['memtRFC'].fillna(0, inplace=True)

    # Integer-encode the categorical columns, keeping the fitted encoders
    # so the codes can be mapped back to the original labels later.
    os_le = LabelEncoder()
    cpu_full_le = LabelEncoder()
    cpu_arch_le = LabelEncoder()
    mem_type_le = LabelEncoder()
    data['cpuFull'] = cpu_full_le.fit_transform(data['cpuFull'])
    data['os'] = os_le.fit_transform(data['os'])
    data['cpuArch'] = cpu_arch_le.fit_transform(data['cpuArch'])
    data['memType'] = mem_type_le.fit_transform(data['memType'])

    # Drop single-value columns.
    data = data.drop(['cacheL3IsShared', 'BMI', 'CLF_._Cache_Line_Flush',
                      'CMOV_._Conditionnal_Move_Inst.', 'CX8_._CMPXCHG8B',
                      'FXSR.FXSAVE.FXRSTOR', 'IA.64_Technology',
                      'MMX_Technology', 'SSE', 'SSE2', 'SSE4a', 'SSE5',
                      'TBM', 'X3DNow_Pro_Technology'], axis=1)

    # Collapse the three matrix dimensions into one log-scale size feature.
    data['C0'] = np.log(data['n'] * data['m'] * data['k'])
    data = data.drop(['m', 'n', 'k'], axis=1)

    return data, target, {
        'os': os_le,
        'cpuFull': cpu_full_le,
        'cpuArch': cpu_arch_le,
        'memType': mem_type_le,
    }
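A minimal sketch (toy column, hypothetical values) of why normalize_data returns its encoders: each fitted LabelEncoder keeps its class list, so the integer codes can be mapped back to the original strings later.

import pandas as pd
from sklearn.preprocessing import LabelEncoder

toy = pd.DataFrame({'os': ['linux', 'windows', 'linux']})
le = LabelEncoder()
toy['os'] = le.fit_transform(toy['os'])
print(toy['os'].tolist())               # [0, 1, 0]
print(le.inverse_transform(toy['os']))  # ['linux' 'windows' 'linux']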
def test_label_encoder_fit_transform():
    # Test fit_transform
    le = LabelEncoder()
    ret = le.fit_transform([1, 1, 4, 5, -1, 0])
    assert_array_equal(ret, [2, 2, 3, 4, 0, 1])

    le = LabelEncoder()
    ret = le.fit_transform(["paris", "paris", "tokyo", "amsterdam"])
    assert_array_equal(ret, [1, 1, 2, 0])
import numpy as np
from sklearn.preprocessing import LabelEncoder


def design_matrix(sample_labels):
    # Count how many one-hot columns each label column contributes;
    # constant columns contribute none.
    factors_dict = []
    n_factors = 0
    for i in range(sample_labels.shape[1]):
        unique_labels = np.unique(sample_labels[:, i])
        if len(unique_labels) == 1:
            label_factors = 0
        else:
            label_factors = len(unique_labels)
        n_factors += label_factors
        factors_dict.append(label_factors)

    # Build the one-hot design matrix, one block of columns per factor.
    X = np.zeros((sample_labels.shape[0], n_factors))
    lb = LabelEncoder()
    factor_labels = []
    offset = 0
    for i, factor in enumerate(factors_dict):
        if factor == 0:
            continue
        index = lb.fit_transform(sample_labels.T[i])
        for j in range(sample_labels.shape[0]):
            X[j, index[j] + offset] = 1
        factor_labels.append(lb.classes_)
        offset += factor

    return X, np.hstack(factor_labels), factors_dict
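A usage sketch for the design_matrix above, assuming a toy 2x3 design (values hypothetical): two non-constant label columns yield 2 + 3 one-hot columns.

sample_labels = np.array([['m', 'a'], ['m', 'b'], ['m', 'c'],
                          ['f', 'a'], ['f', 'b'], ['f', 'c']])
X, factor_labels, factors = design_matrix(sample_labels)
print(X.shape)        # (6, 5)
print(factor_labels)  # ['f' 'm' 'a' 'b' 'c']
print(factors)        # [2, 3]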
import numpy as np
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import check_X_y

# check_number_of_labels is sklearn's internal validation helper; its import
# path varies across sklearn versions (sklearn.metrics.cluster.unsupervised
# in older releases).


def davies_bouldin_index(X, labels, metric='euclidean'):
    """Compute the Davies-Bouldin index.

    The index is defined as the ratio of within-cluster
    and between-cluster distances.

    Parameters
    ----------
    X : array-like, shape (``n_samples``, ``n_features``)
        List of ``n_features``-dimensional data points. Each row corresponds
        to a single data point.

    labels : array-like, shape (``n_samples``,)
        Predicted labels for each sample.

    Returns
    -------
    score : float
        The resulting Davies-Bouldin index.

    References
    ----------
    .. [1] `Davies, David L.; Bouldin, Donald W. (1979). "A Cluster
       Separation Measure". IEEE Transactions on Pattern Analysis and
       Machine Intelligence. PAMI-1 (2): 224-227`_
    """
    X, labels = check_X_y(X, labels)
    le = LabelEncoder()
    labels = le.fit_transform(labels)
    n_samples, _ = X.shape
    n_labels = len(le.classes_)
    check_number_of_labels(n_labels, n_samples)

    # Mean distance of each cluster's points to its centroid.
    intra_dists = np.zeros(n_labels)
    centroids = np.zeros((n_labels, len(X[0])), np.float32)
    for k in range(n_labels):
        cluster_k = X[labels == k]
        mean_k = np.mean(cluster_k, axis=0)
        centroids[k] = mean_k
        intra_dists[k] = np.average(
            pairwise_distances(cluster_k, [mean_k], metric=metric))

    centroid_distances = pairwise_distances(centroids, metric=metric)
    with np.errstate(divide='ignore', invalid='ignore'):
        if np.all((intra_dists[:, None] + intra_dists) == 0.0) or \
                np.all(centroid_distances == 0.0):
            return 0.0
        scores = (intra_dists[:, None] + intra_dists) / centroid_distances
        # Remove inf values (the diagonal has zero centroid distance).
        scores[scores == np.inf] = np.nan
        # Average the per-cluster worst-case ratios; this returns the
        # float documented above (the original returned the whole array).
        return np.mean(np.nanmax(scores, axis=1))
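A quick sanity check (a sketch with synthetic blobs): two tight, well-separated clusters should yield a score near zero, while overlapping ones score higher.

import numpy as np

rng = np.random.RandomState(0)
X_sep = np.vstack([rng.randn(50, 2), rng.randn(50, 2) + 10])
X_mix = np.vstack([rng.randn(50, 2), rng.randn(50, 2) + 1])
labels = np.array([0] * 50 + [1] * 50)
print(davies_bouldin_index(X_sep, labels))  # small: clusters well separated
print(davies_bouldin_index(X_mix, labels))  # larger: clusters overlap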
def test_label_encoder(values, classes, unknown):
    # Test LabelEncoder's transform, fit_transform and
    # inverse_transform methods
    le = LabelEncoder()
    le.fit(values)
    assert_array_equal(le.classes_, classes)
    assert_array_equal(le.transform(values), [1, 0, 2, 0, 2])
    assert_array_equal(le.inverse_transform([1, 0, 2, 0, 2]), values)

    le = LabelEncoder()
    ret = le.fit_transform(values)
    assert_array_equal(ret, [1, 0, 2, 0, 2])

    with pytest.raises(ValueError, match="unseen labels"):
        le.transform(unknown)
def construct_features(self):
    '''
    Construct features.
    '''
    # Parse date features.
    print("Parsing date features")
    parsed_train_X = self.parse_date_feature(self.train_x[:, 0])
    parsed_test_X = self.parse_date_feature(self.test_x[:, 0])

    # Parse other features.
    print("Parsing all features")
    total_train = len(self.train_x)
    total_test = len(self.test_x)
    for index_feature in range(1, len(self.train_x[0])):
        print("Processing feature", index_feature)
        # Check if we have a categorical feature.
        labels = np.unique(self.train_x[:, index_feature])
        # If we have string or binary labels, we have a categorical feature.
        # (isinstance(..., str) also matches numpy string scalars, replacing
        # the deprecated np.str comparison.)
        if isinstance(self.train_x[0, index_feature], str) or len(labels) == 2:
            # We have a categorical feature.
            # Encode it in the one-hot format: label-encode train and test
            # together so both share one mapping, then split again.
            original_data = np.hstack((self.train_x[:, index_feature],
                                       self.test_x[:, index_feature]))
            label_encoder = LabelEncoder()
            data_label_encoded = label_encoder.fit_transform(original_data)
            encoder = OneHotEncoder()
            data_encoded = encoder.fit_transform(
                data_label_encoded.reshape((len(data_label_encoded), 1)))
            data_encoded = np.asarray(data_encoded.todense()).astype(bool)
            # Add encoded feature to data.
            parsed_train_X = np.hstack((parsed_train_X,
                                        data_encoded[0:total_train, :]))
            parsed_test_X = np.hstack((parsed_test_X,
                                       data_encoded[total_train:, :]))
            del data_encoded
        else:
            # We have a numeric feature. Just add it to the data.
            parsed_train_X = np.hstack(
                (parsed_train_X,
                 self.train_x[:, index_feature].reshape((total_train, 1))))
            parsed_test_X = np.hstack(
                (parsed_test_X,
                 self.test_x[:, index_feature].reshape((total_test, 1))))

    self.train_x = parsed_train_X
    self.test_x = parsed_test_X
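The combine-encode-split pattern used above, shown in isolation (a sketch with toy labels): encoding train and test together guarantees both share one category mapping.

import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

train = np.array(['red', 'green'])
test = np.array(['green', 'red'])
both = np.hstack((train, test))
codes = LabelEncoder().fit_transform(both)  # classes: green=0, red=1
onehot = OneHotEncoder().fit_transform(codes.reshape(-1, 1)).toarray()
train_oh, test_oh = onehot[:len(train)], onehot[len(train):]
print(train_oh)  # [[0. 1.], [1. 0.]]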
from random import shuffle

from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


class BaiduQA:
    def __init__(self, pt):
        self.labels, self.docs = self.load(pt)
        self.doc_num = len(self.docs)
        print("doc_num=%d" % self.doc_num)

        self.label_encoder = LabelEncoder()
        self.ys = self.label_encoder.fit_transform(self.labels)
        self.label_num = len(self.label_encoder.classes_)
        print("label_num=%d" % self.label_num)

        self.tokenizer = Tokenizer(split=" ")
        self.tokenizer.fit_on_texts(self.docs)
        self.xs = self.tokenizer.texts_to_sequences(self.docs)
        self.voca_size = max(self.tokenizer.word_index.values()) + 1
        print("voca_size=%d" % self.voca_size)

    @staticmethod
    def load(pt):
        labels = []
        docs = []
        print("read:" + pt)
        lines = open(pt).readlines()
        shuffle(lines)
        for l in lines:
            label, doc = l.strip().split("\t")[1].split("|")
            labels.append(label)
            docs.append(doc)
        print("n(doc)=%d" % len(labels))
        return labels, docs

    def split(self):
        train_ys, test_ys, train_xs, test_xs = train_test_split(
            self.ys, self.xs, train_size=0.75)
        return train_ys, test_ys, train_xs, test_xs

    def next_batch(self, batch_size):
        i = 0
        while True:
            # Wrap around when the next batch would run past the end.
            # The original wrote `i == 0` (a comparison, not an assignment)
            # and never advanced `i`, so it yielded the same batch forever.
            if i + batch_size > self.doc_num:
                i = 0
            yield (self.ys[i: i + batch_size], self.xs[i: i + batch_size])
            i += batch_size
    names += [('var%d(t+%d)' % (j + 1, i)) for j in range(n_vars)]
    # put it all together
    agg = concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg


# load dataset
dataset = read_csv('data/pollution.csv', header=0, index_col=0)
values = dataset.values
# integer encode direction
encoder = LabelEncoder()
values[:, 4] = encoder.fit_transform(values[:, 4])
# ensure all data is float
values = values.astype('float32')
# normalize features
scaler = MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit_transform(values)
# specify the number of lag hours
n_hours = 1
n_features = 8  # NOT USED YET
# frame as supervised learning
reframed = series_to_supervised(scaled, n_hours, 1)
# drop columns we don't want to predict
reframed.drop(reframed.columns[[9, 10, 11, 12, 13, 14, 15]],
              axis=1, inplace=True)
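A self-contained sketch (toy two-variable series, one lag) of the shift-based reframing that series_to_supervised performs before the column drop above:

import pandas as pd

toy = pd.DataFrame({'var1': [1, 2, 3], 'var2': [10, 20, 30]})
lagged = pd.concat([toy.shift(1), toy], axis=1)
lagged.columns = ['var1(t-1)', 'var2(t-1)', 'var1(t)', 'var2(t)']
print(lagged.dropna())
#    var1(t-1)  var2(t-1)  var1(t)  var2(t)
# 1        1.0       10.0        2       20
# 2        2.0       20.0        3       30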
class CustomNNCategorical(CustomNNBase):
    """
    Base for a custom sk classifier implementing an NN using keras:
    an MLP for classification with the custom metric cohen kappa,
    no custom loss for now @todo
    """

    def __init__(self, hidden=[200, 100, 50, 20], dropout=[0.1, 0.1],
                 reg=[0.05, 0.05], h_act=[relu], epoch=500, batch_size=32,
                 cbEarly="metric", loss="categorical_crossentropy",
                 optimizer='adam', metrics=['cohen_kappa'],
                 kappa_weights="quadratic", validation=0.2, smooth_cb=True):
        '''
        :param hidden:
        :param dropout: dropout[0] is assigned to input then hidden
        :param reg: regularization
        :param h_act: hidden activations
        :param epoch:
        :param batch_size:
        :param cbEarly: "metric" or an EarlyStopping instance
        :param loss:
        :param optimizer:
        :param metrics: "accuracy" or 'cohen_kappa'
        :param kappa_weights: compatible with sk (ex: "quadratic", None),
            ignored if metrics != 'cohen_kappa'
        :param smooth_cb: if True, EarlyStopping uses val_cohen_kappa
            smoothed (left avg window 3); only with val_cohen_kappa
        :note restore_best_weights requires keras 2.2.3
        '''
        assert loss in ["categorical_crossentropy", lossOCC, lossOCCQuadratic]
        CustomNNBase.__init__(self, epoch, loss, optimizer, metrics,
                              batch_size)
        assert (len(hidden) > 0) & (len(hidden) + 1 >= len(dropout)) & \
            (len(hidden) >= len(reg)) & (len(hidden) >= len(h_act))

        self.hidden = hidden
        self.dropout = dropout
        self.reg = reg
        self.h_act = h_act
        self.validation = validation
        self.final_activation = softmax
        self.cbEarly = cbEarly
        self.smooth_cb = smooth_cb
        self.cbReduceLR = ReduceLROnPlateau(monitor='loss', factor=0.8,
                                            patience=3, verbose=0,
                                            mode='auto', min_delta=0.0001,
                                            cooldown=0, min_lr=0)
        self.kappa_weights = kappa_weights
        if len(self.metrics) > 1:
            # raising a string is invalid in Python 3; raise an exception
            raise NotImplementedError("only a single metric is supported")

    def __compile(self, input_shape, output_shape):
        # ter(x, i): x[i] if it exists, else None.
        ter = lambda x, i: None if len(x) <= i else x[i]
        reg = [regularizers.l2(i) for i in self.reg]  # @TODO also use L1 for better feature selection
        h_act = self.h_act * round(len(self.hidden) / len(self.h_act))

        self.model = Sequential()
        self.model.add(InputLayer(input_shape=(input_shape, )))
        if ter(self.dropout, 0) is not None:  # first for input
            self.model.add(Dropout(ter(self.dropout, 0)))
        for i in range(0, len(self.hidden)):
            self.model.add(
                Dense(self.hidden[i],
                      activation=h_act[i],
                      kernel_regularizer=ter(reg, i),
                      bias_regularizer=ter(reg, i)))
            if ter(self.dropout, i + 1) is not None:
                self.model.add(Dropout(ter(self.dropout, i + 1)))
        self.model.add(Dense(output_shape, activation=self.final_activation))
        self.model.compile(optimizer=self.optimizer, loss=self.loss,
                           metrics=self.metrics)

    def __category_to_output(self, y):
        self.label_encoder = LabelEncoder()
        y = self.label_encoder.fit_transform(y)
        target = to_categorical(y, num_classes=np.unique(y).size)
        return target

    def __output_to_category(self, output):
        pred = [np.argmax(i) for i in output]
        pred = self.label_encoder.inverse_transform(pred)
        return pred

    def cohen_kappa_metric_keras(self, y_true, y_pred):
        '''
        Does not work as a metric because kappa is not linear and keras
        makes a weighted avg of batch scores
        :deprecated @see Cohen_kappa_logger
        '''
        raise RuntimeError("deprecated @see Cohen_kappa_logger")
        return tf.py_func(self.cohen_kappa_score, [y_true, y_pred],
                          tf.float32)

    def cohen_kappa_score(self, y_true, y_pred):
        raise RuntimeError("deprecated @see Cohen_kappa_logger")
        y_pred = self.__output_to_category(y_pred)
        y_true = self.__output_to_category(y_true)
        score = metrics.cohen_kappa_score(y_true, y_pred,
                                          weights=self.kappa_weights)
        return score.astype(np.float32)

    def break_on_epoch_n(self, threshold, sec=60):
        self.n_epoch = len(self.history.history["loss"])
        if self.n_epoch > threshold:
            sleep(sec)  # cool down

    def _fit_val(self, X, output):
        # @todo clean below
        if type(self.validation) is float:
            self.history = self.model.fit(X, output,
                                          validation_split=self.validation,
                                          epochs=self.epoch,
                                          batch_size=self.batch_size,
                                          callbacks=self.callback_list,
                                          verbose=0)
        elif type(self.validation) is tuple:
            assert self.validation[0].shape[1] == X.shape[1], \
                "X_validation must be transformed with prep first"
            self.validation = (self.validation[0],
                               self.__category_to_output(self.validation[1]))
            self.history = self.model.fit(X, output,
                                          validation_data=self.validation,
                                          epochs=self.epoch,
                                          batch_size=self.batch_size,
                                          callbacks=self.callback_list,
                                          verbose=0)
        elif self.validation is None:
            self.history = self.model.fit(X, output,
                                          epochs=self.epoch,
                                          batch_size=self.batch_size,
                                          callbacks=self.callback_list,
                                          verbose=0)
        else:
            raise ValueError("unknown validation type")

    def _kappa_disambiguation(self, X, output):
        '''
        :param X:
        :param output:
        '''
        self.metric_plot = None
        self.patience = 20  # for cbEarly; enough from observation @todo move to init
        if self.metrics[0] == "accuracy":
            self.metric_plot = "acc"
            raise NotImplementedError(
                "min_delta must be redefined according to val_acc")
            if self.smooth_cb:
                raise NotImplementedError("smooth_cb not available for acc")
            if self.cbEarly == "metric":
                self.cbEarly = EarlyStopping(
                    monitor='val_acc' if self.validation else "acc",
                    min_delta=0.0001,
                    patience=self.patience,
                    verbose=0,
                    mode='auto')
            self.kappa_logger = None
        elif self.metrics[0] == 'cohen_kappa':
            # 'cohen_kappa_metric' cannot be supported,
            # @see explanation in Cohen_kappa_logger
            self.metrics = None
            self.metric_plot = 'cohen_kappa'
            if self.cbEarly == "metric":
                if self.validation:
                    monitor = ("val_cohen_kappa_smoothed" if self.smooth_cb
                               else "val_cohen_kappa")
                else:
                    if not self.smooth_cb:
                        monitor = "cohen_kappa"
                    else:
                        raise ValueError("no cohen_kappa_smoothed")
                print("monitor", monitor)
                self.cbEarly = EarlyStopping(
                    monitor=monitor if self.validation else "cohen_kappa",
                    min_delta=0.00000001,
                    patience=self.patience,  # a large patience is necessary!
                    verbose=0,
                    mode='max',
                    restore_best_weights=True)
            if type(self.validation) is float:
                X, X_val, output, y_val = train_test_split(
                    X, output, test_size=self.validation)
            elif type(self.validation) is tuple:
                assert self.validation[0].shape[1] == X.shape[1], \
                    "X_validation must be transformed with prep first"
                X_val = self.validation[0]
                y_val = self.__category_to_output(self.validation[1])
            elif self.validation is not None:
                raise ValueError("unknown validation type")
            # self.validation = None  # can slightly reduce computation but
            # val_loss is needed for the ReduceLROnPlateau callback
            self.kappa_logger = Cohen_kappa_logger(
                output_to_category=self.__output_to_category,
                X_train=X, y_train=output,
                X_val=X_val, y_val=y_val,
                kappa_weights=self.kappa_weights)
        else:
            print(self.metrics[0])
            raise NotImplementedError(self.metrics[0])
        return X, output

    def fit(self, X, y=None):
        '''
        :param X:
        :param y:
        :param cbEarly: Parameter for early stopping
        '''
        output = self.__category_to_output(y)
        X, output = self._kappa_disambiguation(X, output)
        output_shape = output.shape[1]
        input_shape = X.shape[1]
        self.__compile(input_shape, output_shape)

        self.callback_list = []
        for cb in [self.kappa_logger, self.cbReduceLR, self.cbEarly]:
            if cb:
                self.callback_list.append(cb)

        self._fit_val(X, output)
        self.break_on_epoch_n(50)
        return self

    def predict(self, X, y=None):
        try:
            getattr(self, "history")
        except AttributeError:
            raise RuntimeError("Call fit first.")
        preds = self.model.predict(X)
        preds = self.__output_to_category(preds)
        return preds

    def plot_history(self, plotname="NN", saving_file=None):
        '''
        :param plotname:
        :param saving_file: filename where to save plots
        :return plt, to avoid garbage collection and closing of the windows
        '''
        history = self.history
        plot = (saving_file is None)
        import matplotlib.pyplot as plt
        if plot:
            plt.ion()
            plt.show()
        fig = plt.figure()
        plt.grid(True)
        plt.title(plotname)
        if self.metric_plot in history.history.keys():
            plt.subplot(221)
            plt.plot(history.history[self.metric_plot])
            plt.ylabel(self.metric_plot + " ")
            if plot:
                plt.draw()
        if "val_" + self.metric_plot in history.history.keys():
            plt.subplot(222)
            plt.plot(history.history["val_" + self.metric_plot])
            plt.ylabel("val_" + self.metric_plot + " ")
            if plot:
                plt.draw()
        plt.subplot(223)
        plt.plot(history.history['loss'])
        plt.ylabel('"loss" ' + " " + plotname)
        if plot:
            plt.draw()
        plt.subplot(224)
        if "val_cohen_kappa_smoothed" in history.history.keys():
            plt.plot(history.history['val_cohen_kappa_smoothed'])
            plt.ylabel("val_cohen_kappa_smoothed")
        else:
            plt.plot(history.history['lr'])
            plt.ylabel('"lr"' + " " + plotname)
        if plot:
            plt.draw()
            plt.pause(1)
        if saving_file:
            fig.savefig(saving_file)
        plt = None  # send to garbage
        return plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from utils import *
import tensorflow as tf
import glob

config = tf.ConfigProto()
# config.gpu_options.per_process_gpu_memory_fraction = 0.6  # cap GPU memory at a fixed fraction
config.gpu_options.allow_growth = True  # allocate GPU memory on demand

BATCH_SIZE = 64
EPOCHS = 10

data_train_csv = pd.read_csv('data/labels.csv')
filenames = data_train_csv.id.values
le = LabelEncoder()
labels = le.fit_transform(data_train_csv.breed)
N_CLASS = len(le.classes_)
filenames_train, filenames_val, labels_train, labels_val = \
    train_test_split(filenames, labels, test_size=0.1, stratify=labels)
filenames_test = [i.split('/')[-1].split('.')[0]
                  for i in glob.glob('data/test/*')]

EPOCH_TRAIN_SIZE = len(filenames_train) // BATCH_SIZE + 1
EPOCH_VAL_SIZE = len(filenames_val) // BATCH_SIZE + 1
EPOCH_TEST_SIZE = len(filenames_test) // BATCH_SIZE + 1

sess = tf.Session(config=config)
x_train, y_train = get_train_dataset(filenames_train, labels_train,
                                     BATCH_SIZE, rootdir='data/train')
x_val, y_val = get_train_dataset(filenames_val, labels_val,
                                 BATCH_SIZE, rootdir='data/train')
# x_test, id_test = get_test_dataset(filenames_test, BATCH_SIZE, rootdir='data/test')
endpoints_train = get_inceptionV3(x_train, y_train, n_class=N_CLASS,
                                  reuse=False, is_training=False, mode='dev')
print(df_all)
'''
from sklearn.preprocessing import Imputer
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
df = imp.fit_transform(df)
df = pd.DataFrame(df, columns=df_all.columns)
print(df)
'''

# First convert everything to plain numeric codes.
from sklearn.preprocessing import LabelEncoder
class_label_encoder = LabelEncoder()
df = df_all.values
df1 = df11.values
df2 = df22.values
for i in range(14):
    # Note: the encoder is re-fit per column and per frame, so the integer
    # codes are not guaranteed to be consistent across df, df1 and df2.
    df[:, i] = class_label_encoder.fit_transform(df[:, i])
    print(df[:, i])
    df1[:, i] = class_label_encoder.fit_transform(df1[:, i])
    df2[:, i] = class_label_encoder.fit_transform(df2[:, i])
df = pd.DataFrame(df, columns=df_all.columns)

# Split the dataset.
'''
NUMBER = 1309
X_train = df[:NUMBER].drop(['survived'], 1)
X_test = df[NUMBER:].drop(['survived'], 1)
survivor = df['survived']
Y_train = survivor[:NUMBER]
Y_test = survivor[NUMBER:]
print(Y_test)
'''
NUMBER = 1309
def dataset(dataset_name):
    if dataset_name == 'cifar10':
        print("| Preparing CIFAR-10 dataset...")
        sys.stdout.write("| ")
        trainset = torchvision.datasets.CIFAR10(
            root='./data', train=True, download=True,
            transform=transform_training())
        testset = torchvision.datasets.CIFAR10(
            root='./data', train=False, download=False,
            transform=transform_testing())
        outputs = 10
        inputs = 3
    elif dataset_name == 'cifar100':
        print("| Preparing CIFAR-100 dataset...")
        sys.stdout.write("| ")
        trainset = torchvision.datasets.CIFAR100(
            root='./data', train=True, download=True,
            transform=transform_training())
        testset = torchvision.datasets.CIFAR100(
            root='./data', train=False, download=False,
            transform=transform_testing())
        outputs = 100
        inputs = 3
    elif dataset_name == 'mnist':
        print("| Preparing MNIST dataset...")
        trainset = torchvision.datasets.MNIST(
            root='./data', train=True, download=True,
            transform=transform_training())
        testset = torchvision.datasets.MNIST(
            root='./data', train=False, download=False,
            transform=transform_testing())
        outputs = 10
        inputs = 1
    elif dataset_name == 'fashionmnist':
        print("| Preparing FASHIONMNIST dataset...")
        sys.stdout.write("| ")
        trainset = torchvision.datasets.FashionMNIST(
            root='./data', train=True, download=True,
            transform=transform_training())
        testset = torchvision.datasets.FashionMNIST(
            root='./data', train=False, download=False,
            transform=transform_testing())
        outputs = 10
        inputs = 1
    elif dataset_name == 'stl10':
        print("| Preparing STL10 dataset...")
        sys.stdout.write("| ")
        trainset = torchvision.datasets.STL10(
            root='./data', split='train', download=True,
            transform=transform_training())
        testset = torchvision.datasets.STL10(
            root='./data', split='test', download=False,
            transform=transform_testing())
        outputs = 10
        inputs = 3
    elif dataset_name == 'dog-breed':
        print("| Preparing DOG-BREED dataset...")
        data_train_csv = pd.read_csv('./data/dog-breed/labels.csv')
        filenames = data_train_csv.id.values
        le = LabelEncoder()
        labels = le.fit_transform(data_train_csv.breed)
        filenames_train, filenames_val, labels_train, labels_val = \
            train_test_split(filenames, labels, test_size=0.3,
                             stratify=labels, shuffle=True)
        trainset = get_train_dataset(filenames_train, labels_train,
                                     cf.batch_size,
                                     rootdir='./data/dog-breed/train')
        testset = get_train_dataset(filenames_val, labels_val,
                                    cf.batch_size,
                                    rootdir='./data/dog-breed/train')
        outputs = 120
        inputs = 3

    trainloader = torch.utils.data.DataLoader(
        trainset, batch_size=cf.batch_size, shuffle=True, num_workers=4)
    testloader = torch.utils.data.DataLoader(
        testset, batch_size=cf.batch_size, shuffle=True, num_workers=4)
    return trainloader, testloader, outputs, inputs
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.extmath import cartesian


def design_matrix(sample_labels, interaction_indices=None):
    """
    Parameters
    ----------
    sample_labels: a numpy matrix, for each sample a vector with the
        conditions which we would like to model. Cols represent the type
        of conditions we want to model, rows represent a combination of
        conditions. If we have a 2x3 design we build this matrix:
        [[0,0],
         [0,1],
         [0,2],
         [1,0],
         [1,1],
         [1,2]]

    Returns
    -------
    X: the design matrix.
    factor_labels: the labels of the design-matrix columns
    factor_num: number of factors for each condition
    """
    # Count the one-hot columns contributed by each condition;
    # constant conditions contribute none.
    factor_num = []
    n_factors = 0
    for i in range(sample_labels.shape[1]):
        unique_labels = np.unique(sample_labels[:, i])
        if len(unique_labels) == 1:
            label_factors = 0
        else:
            label_factors = len(unique_labels)
        n_factors += label_factors
        factor_num.append(label_factors)

    n_interactions = 0
    if interaction_indices is not None:
        # Plain integer indexing instead of the deprecated list-of-tuple form.
        interaction_factors = np.asarray(factor_num)[list(interaction_indices)]
        n_interactions = np.prod(interaction_factors)
        Xint = np.zeros((sample_labels.shape[0], n_interactions))

    # Main-effect block: one one-hot group of columns per condition.
    X = np.zeros((sample_labels.shape[0], n_factors))
    lb = LabelEncoder()
    factor_labels = []
    offset = 0
    for i, factor in enumerate(factor_num):
        if factor == 0:
            continue
        index = lb.fit_transform(sample_labels.T[i])
        for j in range(sample_labels.shape[0]):
            X[j, index[j] + offset] = 1
        factor_labels.append(lb.classes_)
        offset += factor

    # Interaction block: products of the two interacting one-hot groups.
    if interaction_indices is not None:
        interaction_product = [np.arange(v).tolist()
                               for v in interaction_factors]
        interaction_gen = cartesian(interaction_product)  # This is buggy!!
        Xint = np.zeros((sample_labels.shape[0], n_interactions))
        offset = int(interaction_indices[0] *
                     np.sum(factor_num[:interaction_indices[0]]))
        for i, int_indices in enumerate(interaction_gen):
            index1 = offset + int_indices[0]
            index2 = offset + int_indices[1] + factor_num[interaction_indices[0]]
            Xint[:, i] = X[:, index1] * X[:, index2]
            factor1 = interaction_indices[0]
            factor2 = interaction_indices[1]
            new_label = factor_labels[factor1][int_indices[0]] + "_" + \
                factor_labels[factor2][int_indices[1]]
            factor_labels.append(new_label)
        X = np.hstack((X, Xint))

    return X, np.hstack(factor_labels), factor_num
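A usage sketch for the interaction path (toy 2x3 design, values hypothetical): interacting the two conditions appends 2 * 3 product columns to the 5 main-effect columns.

labels = np.array([[g, c] for g in ('m', 'f') for c in ('a', 'b', 'c')])
X, col_labels, factor_num = design_matrix(labels, interaction_indices=(0, 1))
print(X.shape)     # (6, 11): 2 + 3 main effects + 6 interaction columns
print(col_labels)  # ['f' 'm' 'a' 'b' 'c' 'f_a' 'f_b' 'f_c' 'm_a' 'm_b' 'm_c']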
def predict(self):
    try:  # raises an exception if the path has not been set
        path = self.path.get()
        mylist = os.listdir(path)
        # Decode gender/emotion from the RAVDESS-style file names.
        feeling_list = []
        for item in mylist:
            if item[6:-16] == '02' and int(item[18:-4]) % 2 == 0:
                feeling_list.append('female_calm')
            elif item[6:-16] == '02' and int(item[18:-4]) % 2 == 1:
                feeling_list.append('male_calm')
            elif item[6:-16] == '03' and int(item[18:-4]) % 2 == 0:
                feeling_list.append('female_happy')
            elif item[6:-16] == '03' and int(item[18:-4]) % 2 == 1:
                feeling_list.append('male_happy')
            elif item[6:-16] == '04' and int(item[18:-4]) % 2 == 0:
                feeling_list.append('female_sad')
            elif item[6:-16] == '04' and int(item[18:-4]) % 2 == 1:
                feeling_list.append('male_sad')
            elif item[6:-16] == '05' and int(item[18:-4]) % 2 == 0:
                feeling_list.append('female_angry')
            elif item[6:-16] == '05' and int(item[18:-4]) % 2 == 1:
                feeling_list.append('male_angry')
            elif item[6:-16] == '06' and int(item[18:-4]) % 2 == 0:
                feeling_list.append('female_fearful')
            elif item[6:-16] == '06' and int(item[18:-4]) % 2 == 1:
                feeling_list.append('male_fearful')
        labels = pd.DataFrame(feeling_list)

        # Extract MFCC features for each usable file.
        df = pd.DataFrame(columns=['feature'])
        bookmark = 0
        for index, y in enumerate(mylist):
            if mylist[index][6:-16] != '01' and mylist[index][6:-16] != '07' \
                    and mylist[index][6:-16] != '08' \
                    and mylist[index][:2] != 'su' \
                    and mylist[index][:1] != 'n' \
                    and mylist[index][:1] != 'd' \
                    and mylist[index][:1] != 'A':
                X, sample_rate = librosa.load(path + '\\' + y,
                                              res_type='kaiser_fast',
                                              duration=2.5,
                                              sr=22050 * 2,
                                              offset=0.5)
                sample_rate = np.array(sample_rate)
                mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate,
                                                     n_mfcc=13), axis=0)
                feature = mfccs
                df.loc[bookmark] = [feature]
                bookmark = bookmark + 1
        df3 = pd.DataFrame(df['feature'].values.tolist())

        # Store features and matching emotions in one table; the emotion
        # column is named '0' before the rename.
        newdf = pd.concat([df3, labels], axis=1)
        rnewdf = newdf.rename(index=str, columns={"0": "label"})
        # Shuffle the renamed frame (the original shuffled `newdf`,
        # silently discarding the rename).
        rnewdf = shuffle(rnewdf)
        rnewdf = rnewdf.fillna(0)

        # Split the table into train and test sets.
        newdf1 = np.random.rand(len(rnewdf)) < 0.2
        train = rnewdf[newdf1]
        test = rnewdf[~newdf1]
        # Features are columns 0 to second-to-last; the label is the last.
        testfeatures = test.iloc[:, :-1]
        testlabel = test.iloc[:, -1:]
        X_test = np.array(testfeatures)
        y_test = np.array(testlabel)
        lb = LabelEncoder()
        y_test = np_utils.to_categorical(lb.fit_transform(y_test))

        print('Extracting the test set...')
        # Reshape the test features for the CNN.
        x_testcnn = np.expand_dims(X_test, axis=2)
        print(x_testcnn)
        print('Testing...')
        preds = self.model.predict(x_testcnn, batch_size=32, verbose=1)
        preds1 = preds.argmax(axis=1)
        abc = preds1.astype(int).flatten()
        predictions = lb.inverse_transform(abc)
        preddf = pd.DataFrame({'predicted_values': predictions})
        actual = y_test.argmax(axis=1)
        abc123 = actual.astype(int).flatten()
        actualvalues = lb.inverse_transform(abc123)
        actualdf = pd.DataFrame({'actual_values': actualvalues})
        finaldf = actualdf.join(preddf)
        finaldf.to_csv('H:\\预测实际对照表.csv', index=False)
        showinfo("Info", "Table written; saved to the H: drive")
        print('\n\nPredicted vs. actual comparison table:\n\n')
        print(finaldf.groupby('actual_values').count().join(
            finaldf.groupby('predicted_values').count()))
    except FileNotFoundError:
        showwarning('Warning', 'The path does not exist, please re-enter')
label_path = os.path.join(abs_path, article_file)
if article_file.endswith('.json') and os.path.isfile(label_path):
    with open(label_path, 'r', encoding='utf-8') as f:
        json_file = json.load(f)
        for k, v in json_file.items():
            words = re.findall('[a-zA-Z0-9]+', v)
            article_list.append(words)
            X.append(words)
            Y.append(label)
    if article_list:
        dic[label] = article_list  # dict: key = label, value = article texts

encoder = LabelEncoder()
encoder.fit_transform(Y)

values = []
values.extend(dic.values())
all_sentences = []
for each_label in values:
    for each_article in each_label:
        all_sentences.append(' '.join(each_article))

# Write the segmented corpus to corpusSegDone_1.txt first...
with open(inp, 'w') as fin:
    fin.write('\n'.join(all_sentences))

# ...then load it: Text8Corpus expects a file path, not a list of
# sentences (the original passed `all_sentences` directly).
sentences = word2vec.Text8Corpus(inp)
def train_classifier(input_data, output_dir,
                     pretrained_model='bert-base-multilingual-cased',
                     cache_dir=None, maxlen=64, batch_size=32,
                     num_epochs=100, logging_steps=1, train_logs=None,
                     **kwargs):
    # read input data stream
    texts, choices = [], []
    for item in input_data:
        texts.append(item['input'][0])
        choices.append(item['output'][0])

    le = LabelEncoder()
    choices_ids = le.fit_transform(choices)

    tokenizer = BertTokenizer.from_pretrained(pretrained_model,
                                              cache_dir=cache_dir)
    train_dataloader = prepare_texts(texts, tokenizer, maxlen,
                                     RandomSampler, batch_size, choices_ids)
    model = BertForSequenceClassification.from_pretrained(
        pretrained_model,
        num_labels=len(le.classes_),
        output_attentions=False,
        output_hidden_states=False,
        cache_dir=cache_dir)
    model.to(device)

    total_steps = len(train_dataloader) * num_epochs
    optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    global_step = 0
    total_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(num_epochs, desc='Epoch')
    if train_logs:
        tb_writer = SummaryWriter(
            logdir=os.path.join(train_logs, os.path.basename(output_dir)))
    else:
        tb_writer = None
    loss_queue = deque(maxlen=10)
    for epoch in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc='Iteration')
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'labels': batch[2]
            }
            outputs = model(**inputs)
            loss = outputs[0]
            loss.backward()
            total_loss += loss.item()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            model.zero_grad()
            global_step += 1
            if global_step % logging_steps == 0:
                last_loss = (total_loss - logging_loss) / logging_steps
                loss_queue.append(last_loss)
                if tb_writer:
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0],
                                         global_step)
                    tb_writer.add_scalar('loss', last_loss, global_step)
                logging_loss = total_loss

            # slope-based early stopping
            if len(loss_queue) == loss_queue.maxlen:
                slope = calc_slope(loss_queue)
                if tb_writer:
                    tb_writer.add_scalar('slope', slope, global_step)
                if abs(slope) < 1e-2:
                    break

    if tb_writer:
        tb_writer.close()

    model_to_save = model.module if hasattr(model, 'module') else model
    # Take care of distributed/parallel training
    model_to_save.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    return {
        'model_path': output_dir,
        'batch_size': batch_size,
        'maxlen': maxlen,
        'pretrained_model': pretrained_model,
        'choices_map': list(map(str, le.classes_))
    }
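calc_slope is referenced above but not defined in this snippet; a minimal sketch consistent with its use (least-squares slope over the recent loss window) might be:

import numpy as np

def calc_slope(values):
    # Slope of the OLS line fitted through equally spaced loss values;
    # a flat slope means training has plateaued.
    y = np.asarray(values, dtype=float)
    return np.polyfit(np.arange(len(y)), y, 1)[0]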
                               index=X.columns)
        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)


big_train = train_df[feature_columns_to_use]
big_train_imputed = DataFrameImputer().fit_transform(big_train)
big_test = test_df[feature_columns_to_use]  # was train_df, which reused the training rows
big_test_imputed = DataFrameImputer().fit_transform(big_test)

le = LabelEncoder()
for feature in nonnumeric_columns:
    # Fit on train only, then reuse the same mapping for test; refitting
    # on test (as the original did) can assign different integer codes.
    big_train_imputed[feature] = le.fit_transform(big_train_imputed[feature])
    big_test_imputed[feature] = le.transform(big_test_imputed[feature])

train_X = big_train_imputed[0:train_df.shape[0]]
test_X = big_test_imputed[0:test_df.shape[0]]
train_y = train_df['click']
test_Y = train_df['click']

target = 'click'
idcol = 'id'
# test_results = pd.read_csv('test_results.csv')


def modelfit(alg, dtrain, dtest, predictors, useTrainCV=True, cv_folds=5,
             early_stopping_rounds=50):
    if useTrainCV:
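Why the loop above fits on train and only transforms test (a sketch of the pitfall): refitting the encoder on test data silently remaps the integer codes.

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
print(le.fit_transform(['a', 'b', 'c']))  # [0 1 2]
print(le.fit_transform(['b', 'c']))       # [0 1] -- 'b' became 0, not 1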
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
import numpy as np
from sklearn import preprocessing
import pandas as pd

df = pd.read_excel('F:\\WeiWeiHe\\titanic.xls')
'''df.convert_objects(convert_numeric=True)'''
'''df.fillna(0, inplace=True)'''

from sklearn.preprocessing import LabelEncoder
class_label_encoder = LabelEncoder()
print(df.head())
df = df.values
# Integer-encode the non-numeric columns (name, sex, ticket, cabin,
# embarked, boat, home.dest).
df[:, 2] = class_label_encoder.fit_transform(df[:, 2])
df[:, 3] = class_label_encoder.fit_transform(df[:, 3])
df[:, 7] = class_label_encoder.fit_transform(df[:, 7])
df[:, 9] = class_label_encoder.fit_transform(df[:, 9])
df[:, 10] = class_label_encoder.fit_transform(df[:, 10])
df[:, 11] = class_label_encoder.fit_transform(df[:, 11])
df[:, 13] = class_label_encoder.fit_transform(df[:, 13])

from sklearn.preprocessing import Imputer
imp = Imputer(missing_values='NaN', strategy='mean', axis=1)
df = imp.fit_transform(df)
df = pd.DataFrame(df, columns=[
    'pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch',
    'ticket', 'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest'