def design_matrix(sample_labels):
    factors_dict = []
    n_factors = 0
    for i in range(sample_labels.shape[1]):
        unique_labels = np.unique(sample_labels[:, i])
        if len(unique_labels) == 1:
            label_factors = 0
        else:
            label_factors = len(unique_labels)
        n_factors += label_factors
        factors_dict.append(label_factors)

    X = np.zeros((sample_labels.shape[0], n_factors))
    lb = LabelEncoder()
    factor_labels = []
    offset = 0
    for i, factor in enumerate(factors_dict):
        if factor == 0:
            continue
        index = lb.fit_transform(sample_labels.T[i])
        for j in range(sample_labels.shape[0]):
            X[j, index[j] + offset] = 1
        factor_labels.append(lb.classes_)
        offset += factor

    return X, np.hstack(factor_labels), factors_dict
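# Minimal usage sketch for design_matrix above (assumes numpy and sklearn's
# LabelEncoder are imported as in the function; the toy labels are made up).
# A 2x3 design has one two-level and one three-level factor, so the
# dummy-coded matrix gets 2 + 3 = 5 columns.
toy_labels = np.array([["a", "x"], ["a", "y"], ["a", "z"],
                       ["b", "x"], ["b", "y"], ["b", "z"]])
X, cols, factors = design_matrix(toy_labels)
print(X.shape)   # (6, 5)
print(cols)      # ['a' 'b' 'x' 'y' 'z']
print(factors)   # [2, 3]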
def train_test_from_dataset(dataset, batch_size):
    sample_labels = list(dataset.sample_labels)
    label_encoder = LabelEncoder().fit(sample_labels)
    sample_labels = label_encoder.transform(sample_labels)

    def label_remapper(label):
        return int(label_encoder.transform([label])[0])

    label_map = label_remapper
    collate_fn = PersistenceDiagramProviderCollate(dataset, label_map=label_map)

    train_ids = np.array([label_map(image_id)
                          for image_id in dataset.sample_labels
                          if training_data_labels[image_id]])
    test_ids = np.array([label_map(image_id)
                         for image_id in dataset.sample_labels
                         if not training_data_labels[image_id]])

    data_train = DataLoader(dataset,
                            batch_size=batch_size,
                            collate_fn=collate_fn,
                            shuffle=False,
                            sampler=SubsetRandomSampler(train_ids.tolist()))
    data_test = DataLoader(dataset,
                           batch_size=batch_size,
                           collate_fn=collate_fn,
                           shuffle=False,
                           sampler=SubsetRandomSampler(test_ids.tolist()))
    return data_train, data_test
def train_test_from_dataset(dataset, test_size=0.2, batch_size=64, wanted_views=None):
    sample_labels = list(dataset.sample_labels)
    label_encoder = LabelEncoder().fit(sample_labels)
    sample_labels = label_encoder.transform(sample_labels)

    label_map = lambda l: int(label_encoder.transform([l])[0])
    collate_fn = PersistenceDiagramProviderCollate(dataset, label_map=label_map,
                                                   wanted_views=wanted_views)

    sp = StratifiedShuffleSplit(n_splits=1, test_size=test_size)
    train_i, test_i = list(sp.split([0] * len(sample_labels), sample_labels))[0]

    data_train = DataLoader(dataset,
                            batch_size=batch_size,
                            collate_fn=collate_fn,
                            shuffle=False,
                            sampler=SubsetRandomSampler(train_i.tolist()))
    data_test = DataLoader(dataset,
                           batch_size=batch_size,
                           collate_fn=collate_fn,
                           shuffle=False,
                           sampler=SubsetRandomSampler(test_i.tolist()))
    return data_train, data_test
def fit(self, X, y, input_checks=True):
    # the data passed in could be an array of dataframes?
    """Fit all estimators, fit the data

    Parameters
    ----------
    X : array-like or DataFrame of shape [n_samples, n_dimensions, n_length]
        Input data, of which specified subsets are used to fit the
        transformers.
    y : array-like, shape (n_samples, ...), optional
        Targets for supervised learning.
    """
    if self.estimators is None or len(self.estimators) == 0:
        raise AttributeError('Invalid `estimators` attribute, `estimators`'
                             ' should be a list of (string, estimator)'
                             ' tuples')

    # X = _check_X(X)
    self._validate_estimators()
    self._validate_column_callables(X)
    self._validate_remainder(X)

    self.le_ = LabelEncoder().fit(y)
    self.classes_ = self.le_.classes_
    transformed_y = self.le_.transform(y)

    for name, estim, column in self._iter(replace_strings=True):
        estim.fit(_get_column(X, column), transformed_y)

    return self
def load_dataset(self):
    with open(self.file_name) as f:
        dataset = arff.load(f)

    if self.label_attribute is None:
        self.label_attribute = dataset["attributes"][-1][0]

    data = list(numpy.asarray(dataset["data"]).transpose())
    labels = None

    row = 0
    for attribute_name, attribute_type in dataset["attributes"]:
        if attribute_name == self.label_attribute:
            # Labels found!
            labels = data.pop(row)
            continue

        # Nominal attribute
        if isinstance(attribute_type, list):
            # Convert None in '?' for next check and to make label_binarize work
            for j in range(len(data[row])):
                if data[row][j] is None:
                    data[row][j] = "?"

            if numpy.all(data[row] == "?"):
                # If no data is present, just remove the row
                data.pop(row)
                continue

            if self.binarize:
                data[row] = numpy.asarray(label_binarize(data[row], attribute_type),
                                          dtype=numpy.float64)
            else:
                encoder = LabelEncoder()
                encoder.classes_ = attribute_type
                if "?" not in encoder.classes_:
                    encoder.classes_.insert(0, "?")
                data[row] = encoder.transform(data[row]).reshape(
                    (len(data[row]), 1)).astype(numpy.float64)
        else:
            # Numeric attributes: check for nan values
            data[row] = data[row].astype(numpy.float64)
            nans = numpy.isnan(data[row])
            if numpy.all(nans):
                # If everything is nan, remove the feature
                data.pop(row)
                continue
            if numpy.any(nans):
                mean = data[row][numpy.invert(nans)].sum() / numpy.invert(nans).sum()
                data[row][nans] = mean
            # Reshape to do hstack later
            data[row] = data[row].reshape((len(data[row]), 1))

        # Go to next row only if we have NOT removed the current one
        row += 1

    instances = numpy.hstack(tuple(data))
    useless_indices = numpy.where(instances.var(axis=0) == 0)
    instances = numpy.delete(instances, useless_indices, axis=1)

    return instances, labels
def __call__(self, labels):
    labels = LabelEncoder().fit_transform(labels)
    labels = labels.reshape((len(labels), 1))
    labels = OneHotEncoder(sparse=False).fit_transform(labels)

    if labels.shape[1] == 2:
        return labels[:, 0].reshape((len(labels), 1))
    else:
        return labels
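# Minimal sketch of what the callable above computes (assumes numpy,
# LabelEncoder and OneHotEncoder are imported as in the snippet; .toarray()
# is used here instead of sparse=False so it also runs on newer scikit-learn).
# With exactly two classes the one-hot matrix collapses to a single 0/1 column.
two_class = LabelEncoder().fit_transform(["ham", "spam", "ham"]).reshape(-1, 1)
one_hot = OneHotEncoder().fit_transform(two_class).toarray()
print(one_hot[:, 0].reshape(-1, 1))  # [[1.], [0.], [1.]]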
def test_label_encoder(): """Test LabelEncoder's transform and inverse_transform methods""" le = LabelEncoder() le.fit([1, 1, 4, 5, -1, 0]) assert_array_equal(le.classes_, [-1, 0, 1, 4, 5]) assert_array_equal(le.transform([0, 1, 4, 4, 5, -1, -1]), [1, 2, 3, 3, 4, 0, 0]) assert_array_equal(le.inverse_transform([1, 2, 3, 3, 4, 0, 0]), [0, 1, 4, 4, 5, -1, -1]) assert_raises(ValueError, le.transform, [0, 6])
def _preprocess_base(behaviors, name, min_events, min_intra_gap=0, min_inter_gap=0):
    """
    @param min_events: minimum number of consumption behaviors a user has
    @param min_intra_gap: minimum gap between adjacent consumptions of the same item
    @param min_inter_gap: minimum gap between adjacent consumptions of any item
    """
    item_set = set()
    for u in list(behaviors.keys()):
        v = behaviors[u]
        if len(v) <= min_events:
            del behaviors[u]
        else:
            # sort history according to timestamp, early record in smaller index
            v.sort(key=lambda r: r[1])
            # remove noisy records
            i = len(v) - 1
            while i > 0:
                if v[i][1] < v[i - 1][1] or (min_intra_gap > 0 and
                                             v[i][0] == v[i - 1][0] and
                                             v[i][1] - v[i - 1][1] < min_intra_gap):
                    del v[i]
                    if i >= len(v):
                        i = len(v) - 1
                elif min_inter_gap > 0 and v[i][1] - v[i - 1][1] < min_inter_gap:
                    del v[i - 1]
                    if i >= len(v):
                        i = len(v) - 1
                else:
                    i -= 1
            if len(v) <= min_events:
                del behaviors[u]
            else:
                _verify_sequence(v, min_intra_gap, min_inter_gap)
                v_new = [r[0] for r in v]
                item_set |= set(v_new)
                del behaviors[u]
                behaviors[u] = v_new

    print('%d users left after removal.' % len(behaviors))

    user_id_old = list(behaviors.keys())
    item_id_old = list(item_set)
    user_id_new = LabelEncoder().fit_transform(user_id_old)
    item_id_new = LabelEncoder().fit_transform(item_id_old)
    user_id_map = {v: user_id_new[i] for i, v in enumerate(user_id_old)}
    item_id_map = {v: item_id_new[i] for i, v in enumerate(item_id_old)}

    behaviors_new = [0] * len(user_id_new)
    for u, l in behaviors.items():
        assert len(l) > min_events
        for i in range(len(l)):
            l[i] = item_id_map[l[i]]
        behaviors_new[user_id_map[u]] = l

    behaviors_new = np.array(behaviors_new)
    behaviors_new.dump('..\\data\\behaviors_%s.array' % name)
    with open('..\\data\\user_id_%s.map' % name, 'w') as outfile:
        json.dump(user_id_map, outfile)
    with open('..\\data\\item_id_%s.map' % name, 'w') as outfile:
        json.dump(item_id_map, outfile)
    print('Dumping behavior data finished.')
def test_label_encoder_fit_transform():
    # Test fit_transform
    le = LabelEncoder()
    ret = le.fit_transform([1, 1, 4, 5, -1, 0])
    assert_array_equal(ret, [2, 2, 3, 4, 0, 1])

    le = LabelEncoder()
    ret = le.fit_transform(["paris", "paris", "tokyo", "amsterdam"])
    assert_array_equal(ret, [1, 1, 2, 0])
def test_label_encoder_negative_ints():
    le = LabelEncoder()
    le.fit([1, 1, 4, 5, -1, 0])
    assert_array_equal(le.classes_, [-1, 0, 1, 4, 5])
    assert_array_equal(le.transform([0, 1, 4, 4, 5, -1, -1]),
                       [1, 2, 3, 3, 4, 0, 0])
    assert_array_equal(le.inverse_transform([1, 2, 3, 3, 4, 0, 0]),
                       [0, 1, 4, 4, 5, -1, -1])
    assert_raises(ValueError, le.transform, [0, 6])
def test_label_encoder_empty_array(values):
    le = LabelEncoder()
    le.fit(values)
    # test empty transform
    transformed = le.transform([])
    assert_array_equal(np.array([]), transformed)
    # test empty inverse transform
    inverse_transformed = le.inverse_transform([])
    assert_array_equal(np.array([]), inverse_transformed)
def test_label_encoder_errors():
    # Check that invalid arguments yield ValueError
    le = LabelEncoder()
    assert_raises(ValueError, le.transform, [])
    assert_raises(ValueError, le.inverse_transform, [])

    # Fail on unseen labels
    le = LabelEncoder()
    le.fit([1, 2, 3, 1, -1])
    assert_raises(ValueError, le.inverse_transform, [-1])
def test_label_encoder_string_labels():
    """Test LabelEncoder's transform and inverse_transform methods with
    non-numeric labels"""
    le = LabelEncoder()
    le.fit(["paris", "paris", "tokyo", "amsterdam"])
    assert_array_equal(le.classes_, ["amsterdam", "paris", "tokyo"])
    assert_array_equal(le.transform(["tokyo", "tokyo", "paris"]),
                       [2, 2, 1])
    assert_array_equal(le.inverse_transform([2, 2, 1]),
                       ["tokyo", "tokyo", "paris"])
    assert_raises(ValueError, le.transform, ["london"])
def preprocess(data):
    for column in data:
        if data.dtypes[column] == object:
            data[column].fillna("Não mensurado", inplace=True)
            encoder = LabelEncoder()
            encoder.fit(data[column].tolist())
            data[column] = encoder.transform(data[column])
        elif data.dtypes[column] == float:
            data[column].fillna(0, inplace=True)
        elif data.dtypes[column] == int:
            data[column].fillna(0, inplace=True)
    return data
def test_label_encoder_errors():
    # Check that invalid arguments yield ValueError
    le = LabelEncoder()
    assert_raises(ValueError, le.transform, [])
    assert_raises(ValueError, le.inverse_transform, [])

    # Fail on unseen labels
    le = LabelEncoder()
    le.fit([1, 2, 3, -1, 1])
    msg = "contains previously unseen labels"
    assert_raise_message(ValueError, msg, le.inverse_transform, [-2])
    assert_raise_message(ValueError, msg, le.inverse_transform, [-2, -3, -4])
def davies_bouldin_index(X, labels, metric='euclidean'):
    """Compute the Davies-Bouldin index.

    The index is defined as the ratio of within-cluster
    and between-cluster distances.

    Parameters
    ----------
    X : array-like, shape (``n_samples``, ``n_features``)
        List of ``n_features``-dimensional data points. Each row corresponds
        to a single data point.

    labels : array-like, shape (``n_samples``,)
        Predicted labels for each sample.

    metric : string, optional (default='euclidean')
        The distance metric passed to ``pairwise_distances``.

    Returns
    -------
    score : float
        The resulting Davies-Bouldin index.

    References
    ----------
    .. [1] `Davies, David L.; Bouldin, Donald W. (1979). "A Cluster Separation
       Measure". IEEE Transactions on Pattern Analysis and Machine
       Intelligence. PAMI-1 (2): 224-227`_
    """
    X, labels = check_X_y(X, labels)
    le = LabelEncoder()
    labels = le.fit_transform(labels)
    n_samples, _ = X.shape
    n_labels = len(le.classes_)
    check_number_of_labels(n_labels, n_samples)

    intra_dists = np.zeros(n_labels)
    centroids = np.zeros((n_labels, len(X[0])), np.float32)
    for k in range(n_labels):
        cluster_k = X[labels == k]
        mean_k = np.mean(cluster_k, axis=0)
        centroids[k] = mean_k
        intra_dists[k] = np.average(
            pairwise_distances(cluster_k, [mean_k], metric=metric))

    centroid_distances = pairwise_distances(centroids, metric=metric)
    with np.errstate(divide='ignore', invalid='ignore'):
        if np.all((intra_dists[:, None] + intra_dists) == 0.0) or \
                np.all(centroid_distances == 0.0):
            return 0.0
        scores = (intra_dists[:, None] + intra_dists) / centroid_distances
        # remove inf values
        scores[scores == np.inf] = np.nan
        # average each cluster's worst-case (largest) ratio
        return np.mean(np.nanmax(scores, axis=1))
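# Minimal usage sketch for davies_bouldin_index above on two toy clusters
# (assumes numpy and the helpers imported by the snippet). Lower values mean
# better-separated clusters; scikit-learn ships an equivalent metric as
# sklearn.metrics.davies_bouldin_score.
X_toy = np.array([[0.0, 0.0], [0.0, 1.0], [10.0, 10.0], [10.0, 11.0]])
labels_toy = np.array([0, 0, 1, 1])
print(davies_bouldin_index(X_toy, labels_toy))  # small value: well separated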
class LabelEncoderImpl():
    def __init__(self):
        self._hyperparams = {}
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
def normalize_data(data, target):
    data.replace({'None': np.nan}, inplace=True)

    types = pd.read_csv('data/datatypes.csv')
    for i, row in types.iterrows():
        data[row['feature']] = data[row['feature']].astype(row['type'])

    data['memFreq'].fillna(0, inplace=True)
    data['memtRFC'].fillna(0, inplace=True)

    os_le = LabelEncoder()
    cpu_full_le = LabelEncoder()
    cpu_arch_le = LabelEncoder()
    mem_type_le = LabelEncoder()
    data['cpuFull'] = cpu_full_le.fit_transform(data['cpuFull'])
    data['os'] = os_le.fit_transform(data['os'])
    data['cpuArch'] = cpu_arch_le.fit_transform(data['cpuArch'])
    data['memType'] = mem_type_le.fit_transform(data['memType'])

    # drop single value columns
    data = data.drop(['cacheL3IsShared', 'BMI', 'CLF_._Cache_Line_Flush',
                      'CMOV_._Conditionnal_Move_Inst.', 'CX8_._CMPXCHG8B',
                      'FXSR.FXSAVE.FXRSTOR', 'IA.64_Technology',
                      'MMX_Technology', 'SSE', 'SSE2', 'SSE4a', 'SSE5',
                      'TBM', 'X3DNow_Pro_Technology'], axis=1)

    data['C0'] = np.log(data['n'] * data['m'] * data['k'])
    data = data.drop(['m', 'n', 'k'], axis=1)

    return data, target, {
        'os': os_le,
        'cpuFull': cpu_full_le,
        'cpuArch': cpu_arch_le,
        'memType': mem_type_le,
    }
def construct_features(self):
    '''
    Construct features.
    '''
    # Parse date features.
    print("Parsing date features")
    parsed_train_X = self.parse_date_feature(self.train_x[:, 0])
    parsed_test_X = self.parse_date_feature(self.test_x[:, 0])

    # Parse other features.
    print("Parsing all features")
    total_train = len(self.train_x)
    total_test = len(self.test_x)
    for index_feature in range(1, len(self.train_x[0])):
        print("Processing feature", index_feature)
        # Check if we have a categorical feature.
        labels = np.unique(self.train_x[:, index_feature])
        # If we have string or binary labels, we have a categorical feature.
        if isinstance(self.train_x[0, index_feature], str) or len(labels) == 2:
            # We have a categorical feature.
            # Encode it in the one hot format.
            original_data = np.hstack((self.train_x[:, index_feature],
                                       self.test_x[:, index_feature]))
            label_encoder = LabelEncoder()
            data_label_encoded = label_encoder.fit_transform(original_data)
            encoder = OneHotEncoder()
            data_encoded = encoder.fit_transform(
                data_label_encoded.reshape((len(data_label_encoded), 1)))
            data_encoded = np.asarray(data_encoded.todense()).astype(np.bool_)

            # Add encoded feature to data.
            parsed_train_X = np.hstack((parsed_train_X, data_encoded[0:total_train, :]))
            parsed_test_X = np.hstack((parsed_test_X, data_encoded[total_train:, :]))
            del data_encoded
        else:
            # We have a numeric feature.
            # Just add it to the data.
            parsed_train_X = np.hstack((parsed_train_X,
                                        self.train_x[:, index_feature].reshape((total_train, 1))))
            parsed_test_X = np.hstack((parsed_test_X,
                                       self.test_x[:, index_feature].reshape((total_test, 1))))

    self.train_x = parsed_train_X
    self.test_x = parsed_test_X
def __init__(self, pt):
    self.labels, self.docs = self.load(pt)
    self.doc_num = len(self.docs)
    print("doc_num=%d" % self.doc_num)

    self.label_encoder = LabelEncoder()
    self.ys = self.label_encoder.fit_transform(self.labels)
    self.label_num = len(self.label_encoder.classes_)
    print("label_num=%d" % self.label_num)

    self.tokenizer = CountVectorizer()
    self.tokenizer.fit(self.docs)
    self.xs = self.tokenizer.transform(self.docs)
    self.voca_size = max(self.tokenizer.vocabulary_.values()) + 1
    print("voca_size=%d" % self.voca_size)
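# Minimal standalone sketch of the pattern above (label encoding plus
# bag-of-words vectorization) on a made-up two-document corpus; assumes
# sklearn's LabelEncoder and CountVectorizer are imported.
docs = ["the cat sat", "the dog barked"]
labels = ["animal_cat", "animal_dog"]
ys = LabelEncoder().fit_transform(labels)        # array([0, 1])
vectorizer = CountVectorizer().fit(docs)
xs = vectorizer.transform(docs)                  # sparse document-term matrix
print(ys, xs.shape, len(vectorizer.vocabulary_)) # [0 1] (2, 5) 5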
def transform(self, X):
    '''
    Transforms columns of X specified in self.columns using LabelEncoder().
    If no columns are specified, transforms all columns in X.
    '''
    output = X.copy()
    if self.columns is not None:
        for col in self.columns:
            output[col] = CategoricalImputer().fit_transform(output[col])
            output[col] = LabelEncoder().fit_transform(output[col])
    else:
        for colname, col in output.items():
            output[colname] = LabelEncoder().fit_transform(col)
    return output
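# Minimal sketch of the column-wise encoding path above on a toy frame
# (assumes pandas as pd and sklearn's LabelEncoder are imported; the class
# owning transform() is not shown here, so the loop is inlined).
df = pd.DataFrame({"city": ["paris", "tokyo", "paris"],
                   "size": ["s", "m", "m"]})
encoded = df.copy()
for colname, col in encoded.items():
    encoded[colname] = LabelEncoder().fit_transform(col)
print(encoded)
# city -> [0, 1, 0], size -> [1, 0, 0]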
def __init__(self, pt):
    self.labels, self.docs = self.load(pt)
    self.doc_num = len(self.docs)
    print("doc_num=%d" % self.doc_num)

    self.label_encoder = LabelEncoder()
    self.ys = self.label_encoder.fit_transform(self.labels)
    self.label_num = len(self.label_encoder.classes_)
    print("label_num=%d" % self.label_num)

    self.tokenizer = Tokenizer(split=" ")
    self.tokenizer.fit_on_texts(self.docs)
    self.xs = self.tokenizer.texts_to_sequences(self.docs)
    self.voca_size = max(self.tokenizer.word_index.values()) + 1
    print("voca_size=%d" % self.voca_size)
def fit(self, X, y=None):
    self._sklearn_model = SKLModel(**self._hyperparams)
    if y is not None:
        self._sklearn_model.fit(X, y)
    else:
        self._sklearn_model.fit(X)
    return self
def fit(self, X, y=None): """Fit the CategoricalEncoder to X. Parameters ---------- X : array-like, shape [n_samples, n_features] The data to determine the categories of each feature. Returns ------- self """ if self.encoding not in ['onehot', 'onehot-dense', 'ordinal']: template = ("encoding should be either 'onehot', 'onehot-dense' " "or 'ordinal', got %s") raise ValueError(template % self.handle_unknown) if self.handle_unknown not in ['error', 'ignore']: template = ("handle_unknown should be either 'error' or " "'ignore', got %s") raise ValueError(template % self.handle_unknown) if self.encoding == 'ordinal' and self.handle_unknown == 'ignore': raise ValueError("handle_unknown='ignore' is not supported for" " encoding='ordinal'") if self.categories != 'auto': for cats in self.categories: if not np.all(np.sort(cats) == np.array(cats)): raise ValueError("Unsorted categories are not yet " "supported") X_temp = check_array(X, dtype=None) if not hasattr(X, 'dtype') and np.issubdtype(X_temp.dtype, str): X = check_array(X, dtype=np.object) else: X = X_temp n_samples, n_features = X.shape self._label_encoders_ = [LabelEncoder() for _ in range(n_features)] for i in range(n_features): le = self._label_encoders_[i] Xi = X[:, i] if self.categories == 'auto': le.fit(Xi) else: if self.handle_unknown == 'error': valid_mask = np.in1d(Xi, self.categories[i]) if not np.all(valid_mask): diff = np.unique(Xi[~valid_mask]) msg = ("Found unknown categories {0} in column {1}" " during fit".format(diff, i)) raise ValueError(msg) le.classes_ = np.array(self.categories[i]) self.categories_ = [le.classes_ for le in self._label_encoders_] return self
def test_label_encoder_negative_ints():
    le = LabelEncoder()
    le.fit([1, 1, 4, 5, -1, 0])
    assert_array_equal(le.classes_, [-1, 0, 1, 4, 5])
    assert_array_equal(le.transform([0, 1, 4, 4, 5, -1, -1]),
                       [1, 2, 3, 3, 4, 0, 0])
    assert_array_equal(le.inverse_transform([1, 2, 3, 3, 4, 0, 0]),
                       [0, 1, 4, 4, 5, -1, -1])
    with pytest.raises(ValueError):
        le.transform([0, 6])
def test_label_encoder(values, classes, unknown):
    # Test LabelEncoder's transform, fit_transform and
    # inverse_transform methods
    le = LabelEncoder()
    le.fit(values)
    assert_array_equal(le.classes_, classes)
    assert_array_equal(le.transform(values), [1, 0, 2, 0, 2])
    assert_array_equal(le.inverse_transform([1, 0, 2, 0, 2]), values)

    le = LabelEncoder()
    ret = le.fit_transform(values)
    assert_array_equal(ret, [1, 0, 2, 0, 2])

    with pytest.raises(ValueError, match="unseen labels"):
        le.transform(unknown)
def _conform_targets(targets):
    """
    Conform targets to [0, n_targets-1].

    Parameters
    ----------
    targets : array (n_targets, )

    Returns
    -------
    targets_conformed : array (n_targets, )
        targets are between 0 and n_targets-1
    label_encoder : LabelEncoder
        fit on targets, used to invert back using
        label_encoder.inverse_transform
    """
    le = LabelEncoder()
    le.fit(targets)
    return le.transform(targets), le
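# Minimal round-trip sketch for _conform_targets above (assumes numpy and
# LabelEncoder are imported). The returned encoder maps the conformed
# integer targets back to the original labels.
conformed, le = _conform_targets(np.array([10, 20, 10, 30]))
print(conformed)                         # [0 1 0 2]
print(le.inverse_transform(conformed))   # [10 20 10 30]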
def test_label_encoder():
    # Test LabelEncoder's transform and inverse_transform methods
    le = LabelEncoder()
    le.fit([1, 1, 4, 5, -1, 0])
    assert_array_equal(le.classes_, [-1, 0, 1, 4, 5])
    assert_array_equal(le.transform([0, 1, 4, 4, 5, -1, -1]),
                       [1, 2, 3, 3, 4, 0, 0])
    assert_array_equal(le.inverse_transform([1, 2, 3, 3, 4, 0, 0]),
                       [0, 1, 4, 4, 5, -1, -1])
    assert_raises(ValueError, le.transform, [0, 6])

    le.fit(["apple", "orange"])
    msg = "bad input shape"
    assert_raise_message(ValueError, msg, le.transform, "apple")
def test_label_encoder_empty_array():
    le = LabelEncoder()
    le.fit(np.array(["1", "2", "1", "2", "2"]))
    # test empty transform
    transformed = le.transform([])
    assert_array_equal(np.array([]), transformed)
    # test empty inverse transform
    inverse_transformed = le.inverse_transform([])
    assert_array_equal(np.array([]), inverse_transformed)
def sklearn_titanic():
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.preprocessing import LabelEncoder

    total_df = pd.read_csv("titanic_clean.csv")
    total_df.drop(['cabin', 'boat', 'body', 'index'], axis=1, inplace=True)
    total_df.dropna(inplace=True)
    for col in total_df.columns.tolist():
        if str(total_df[col].dtype) == 'object':
            total_df[col] = LabelEncoder().fit_transform(total_df[col])

    total_num = total_df.shape[0]
    train_df = total_df.iloc[:int(total_num * 0.8)]
    test_df = total_df.iloc[int(total_num * 0.8):]

    clf = DecisionTreeClassifier()
    clf.fit(train_df.drop(['survived'], axis=1), train_df['survived'])
    print(clf.score(test_df.drop(['survived'], axis=1), test_df['survived']))
def load_mat_ds(path, subj, folder, **kwargs):
    data = load_mat_data(path, subj, folder, **kwargs)

    # load attributes
    attr = load_attributes(path, subj, folder, **kwargs)
    attr, labels = edit_attr(attr, data.shape)

    ds = Dataset.from_wizard(data, attr.targets)
    ds = add_subjectname(ds, subj)
    ds = add_attributes(ds, attr)

    #ds.fa['roi_labels'] = labels
    ds.fa['matrix_values'] = np.ones_like(data[0])
    ds.sa['chunks'] = LabelEncoder().fit_transform(ds.sa['name'])

    return ds
class BaiduQA:

    def __init__(self, pt):
        self.labels, self.docs = self.load(pt)
        self.doc_num = len(self.docs)
        print("doc_num=%d" % self.doc_num)

        self.label_encoder = LabelEncoder()
        self.ys = self.label_encoder.fit_transform(self.labels)
        self.label_num = len(self.label_encoder.classes_)
        print("label_num=%d" % self.label_num)

        self.tokenizer = Tokenizer(split=" ")
        self.tokenizer.fit_on_texts(self.docs)
        self.xs = self.tokenizer.texts_to_sequences(self.docs)
        self.voca_size = max(self.tokenizer.word_index.values()) + 1
        print("voca_size=%d" % self.voca_size)

    @staticmethod
    def load(pt):
        labels = []
        docs = []
        print("read:" + pt)
        lines = open(pt).readlines()
        shuffle(lines)
        for l in lines:
            label, doc = l.strip().split("\t")[1].split("|")
            labels.append(label)
            docs.append(doc)
        print("n(doc)=%d" % len(labels))
        return labels, docs

    def split(self):
        train_ys, test_ys, train_xs, test_xs = train_test_split(
            self.ys, self.xs, train_size=0.75)
        return train_ys, test_ys, train_xs, test_xs

    def next_batch(self, batch_size):
        i = 0
        while True:
            if i + batch_size > self.doc_num:
                # wrap around instead of running past the end
                i = 0
            yield (self.ys[i: i + batch_size], self.xs[i: i + batch_size])
            i += batch_size
def sklearn_titanic_regression():
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.preprocessing import LabelEncoder
    import numpy as np

    total_df = pd.read_csv("titanic_clean.csv")
    total_df.drop(['cabin', 'boat', 'body', 'index'], axis=1, inplace=True)
    total_df.dropna(inplace=True)
    for col in total_df.columns.tolist():
        if str(total_df[col].dtype) == 'object':
            total_df[col] = LabelEncoder().fit_transform(total_df[col])

    total_num = total_df.shape[0]
    train_df = total_df.iloc[:int(total_num * 0.8)]
    test_df = total_df.iloc[int(total_num * 0.8):]

    clf = DecisionTreeRegressor()
    clf.fit(train_df.drop(['fare'], axis=1), train_df['fare'])
    pred = clf.predict(test_df.drop(['fare'], axis=1))
    truth = test_df['fare']
    mse = np.sum(np.square(pred - truth)) / test_df.shape[0]
    print(mse)
def _fit(self, X, handle_unknown='error'):
    X_temp = check_array(X, dtype=None)
    if not hasattr(X, 'dtype') and np.issubdtype(X_temp.dtype, np.str_):
        X = check_array(X, dtype=np.object)
    else:
        X = X_temp

    n_samples, n_features = X.shape

    if self.categories != 'auto':
        for cats in self.categories:
            if not np.all(np.sort(cats) == np.array(cats)):
                raise ValueError("Unsorted categories are not yet "
                                 "supported")
        if len(self.categories) != n_features:
            raise ValueError("Shape mismatch: if n_values is an array,"
                             " it has to be of shape (n_features,).")

    self._label_encoders_ = [LabelEncoder() for _ in range(n_features)]

    for i in range(n_features):
        le = self._label_encoders_[i]
        Xi = X[:, i]
        if self.categories == 'auto':
            le.fit(Xi)
        else:
            if handle_unknown == 'error':
                valid_mask = np.in1d(Xi, self.categories[i])
                if not np.all(valid_mask):
                    diff = np.unique(Xi[~valid_mask])
                    msg = ("Found unknown categories {0} in column {1}"
                           " during fit".format(diff, i))
                    raise ValueError(msg)
            le.classes_ = np.array(self.categories[i])

    self.categories_ = [le.classes_ for le in self._label_encoders_]
def test_label_encoder_str_bad_shape(dtype):
    le = LabelEncoder()
    le.fit(np.array(["apple", "orange"], dtype=dtype))
    msg = "bad input shape"
    assert_raise_message(ValueError, msg, le.transform, "apple")
def design_matrix(sample_labels, interaction_indices=None):
    """
    Parameters
    ----------
    sample_labels : numpy matrix
        For each sample, a vector with the conditions we would like to model.
        Columns represent the type of condition being modeled, rows represent
        the combination of conditions assigned to that sample.
        For a 2x3 design we build this matrix:
        [[0,0],
         [0,1],
         [0,2],
         [1,0],
         [1,1],
         [1,2]]

    Returns
    -------
    X : the design matrix.
    factor_labels : the labels of the design-matrix columns
    factor_num : number of factors for each condition
    """
    factor_num = []
    n_factors = 0
    for i in range(sample_labels.shape[1]):
        unique_labels = np.unique(sample_labels[:, i])
        if len(unique_labels) == 1:
            label_factors = 0
        else:
            label_factors = len(unique_labels)
        n_factors += label_factors
        factor_num.append(label_factors)

    n_interactions = 0
    if interaction_indices is not None:
        interaction_factors = np.array(factor_num)[[interaction_indices]]
        n_interactions = np.prod(interaction_factors)
        Xint = np.zeros((sample_labels.shape[0], n_interactions))

    X = np.zeros((sample_labels.shape[0], n_factors))
    lb = LabelEncoder()
    factor_labels = []
    offset = 0
    for i, factor in enumerate(factor_num):
        if factor == 0:
            continue
        index = lb.fit_transform(sample_labels.T[i])
        for j in range(sample_labels.shape[0]):
            X[j, index[j] + offset] = 1
        factor_labels.append(lb.classes_)
        offset += factor

    if interaction_indices is not None:
        interaction_product = [np.arange(v).tolist() for v in interaction_factors]
        interaction_gen = cartesian(interaction_product)

        # This is buggy!!
        Xint = np.zeros((sample_labels.shape[0], n_interactions))
        offset = interaction_indices[0] * np.sum(factor_num[:interaction_indices[0]])
        offset = int(offset)
        for i, int_indices in enumerate(interaction_gen):
            index1 = offset + int_indices[0]
            index2 = offset + int_indices[1] + factor_num[interaction_indices[0]]
            Xint[:, i] = X[:, index1] * X[:, index2]

            factor1 = interaction_indices[0]
            factor2 = interaction_indices[1]
            new_label = factor_labels[factor1][int_indices[0]] + "_" + \
                        factor_labels[factor2][int_indices[1]]
            factor_labels.append(new_label)

        X = np.hstack((X, Xint))

    return X, np.hstack(factor_labels), factor_num