def binarizelabels(self, labels, nclasses=None):
    # Without an explicit class count, let the binarizer infer the classes.
    if nclasses is None:
        mlb = preprocessing.MultiLabelBinarizer()
        return mlb.fit_transform(labels)
    # For fit_and_predict: return a binarized object of the predicted classes,
    # with a fixed column layout covering all nclasses classes.
    mlb = preprocessing.MultiLabelBinarizer(classes=range(nclasses))
    return mlb.fit_transform(labels)
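# A minimal sketch (not from the original source) of why passing `classes`
# matters here: with an explicit class list the output has one column per
# possible class, even for classes that never occur in `labels`.
from sklearn import preprocessing

labels = [[0], [2], [0, 2]]
print(preprocessing.MultiLabelBinarizer().fit_transform(labels))
# [[1 0]
#  [0 1]
#  [1 1]]        <- only the observed classes 0 and 2
print(preprocessing.MultiLabelBinarizer(classes=range(4)).fit_transform(labels))
# [[1 0 0 0]
#  [0 0 1 0]
#  [1 0 1 0]]    <- fixed four-column layout for classes 0..3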
def example():
    from sklearn import preprocessing

    # Single-label binarization.
    lb = preprocessing.LabelBinarizer()
    lb.fit([1, 2, 6, 4, 2])
    print(lb.classes_)
    print(lb.transform([1, 6]))

    # Multi-label binarization.
    lb = preprocessing.MultiLabelBinarizer()
    lb.fit_transform([(1, 2), (3,)])
    print(lb.classes_)

    # Encoding numeric labels as integers.
    le = preprocessing.LabelEncoder()
    le.fit([1, 2, 2, 6])
    print(le.classes_)
    print(le.transform([1, 1, 2, 6]))
    print(le.inverse_transform([0, 0, 1, 2]))

    # Encoding string labels as integers.
    le = preprocessing.LabelEncoder()
    le.fit(["paris", "paris", "tokyo", "amsterdam"])
    print(list(le.classes_))
    print(le.transform(["tokyo", "tokyo", "paris"]))
    print(list(le.inverse_transform([2, 2, 1])))
def analyse_tweet_ml(tweet):
    df = pd.read_csv('tweets.csv')
    df.isnull().any()
    message = tweet
    X_train = df.tweet
    y_train = df.label.astype(str)
    # NOTE: MultiLabelBinarizer iterates over each label string, so a
    # multi-character label is split into its individual characters.
    lb = preprocessing.MultiLabelBinarizer()
    y_train = lb.fit_transform(y_train)
    X_test = np.array([message])

    # ML pipeline: bag-of-words -> tf-idf -> one-vs-rest linear SVM.
    classifier = Pipeline([
        ('vectorizer', CountVectorizer(max_df=0.5, ngram_range=(1, 2))),
        ('tfidf', TfidfTransformer()),
        ('clf', OneVsRestClassifier(LinearSVC())),
    ])
    classifier.fit(X_train, y_train)
    predicted = classifier.predict(X_test)
    print(predicted[0])

    # Return the index of the last class predicted as positive.
    count = 0
    label = ""
    for i in predicted[0]:
        if i == 1:
            label = count
        count = count + 1
    return label
def _encode_labels(self):
    """Encode y labels using sklearn to allow for string or numeric inputs."""
    mlb = preprocessing.MultiLabelBinarizer()
    mlb.fit(self.y)
    mapping_dict = dict(zip(range(len(mlb.classes_)), mlb.classes_))
    return mlb, mapping_dict
def get_top_dataset():
    lb = preprocessing.MultiLabelBinarizer(classes=top_categories)
    train_data, train_targets = get_top_split_set("train")
    test_data, test_targets = get_top_split_set("test")
    # Fit on the training targets, then reuse the same mapping for test.
    train_targets = lb.fit_transform(train_targets)
    test_targets = lb.transform(test_targets)
    return (np.array(train_data), np.array(train_targets),
            np.array(test_data), np.array(test_targets))
def oneVsRest_LogReg_TfIdf(X_train, X_test, Y_train, Y_test, word_dict,
                           tags_dict, data_files, test_doc_ids):
    print('Processing : oneVsRest_LogReg_TfIdf')
    print('-' * 50)
    Y_original = Y_test

    # Vectorize with a fixed vocabulary; transform (not refit) the test split
    # so the tf-idf weights learned on the training data are reused.
    vectorizer = CountVectorizer(min_df=1, vocabulary=word_dict)
    X_v_train = vectorizer.fit_transform(X_train)
    X_v_test = vectorizer.transform(X_test)
    transformer = TfidfTransformer(smooth_idf=False)
    X_train_tf = transformer.fit_transform(X_v_train)
    X_test_tf = transformer.transform(X_v_test)

    # Binarize the tag lists against the full tag set.
    uniq_tags_names = list(tags_dict.keys())
    mlb = preprocessing.MultiLabelBinarizer(classes=uniq_tags_names)
    Y_train = mlb.fit_transform(Y_train)
    Y_test = mlb.transform(Y_test)

    classifier = OneVsRestClassifier(LogisticRegression(penalty='l2', C=0.01))
    classifier.fit(X_train_tf, Y_train)
    score = classifier.score(X_test_tf, Y_test)
    print('-' * 50)
    print('Score oneVsRest_LogReg_TfIdf : {}'.format(score))
    print('-' * 50)

    # Predict on the tf-idf features, i.e. the same representation the
    # classifier was trained on.
    Y_pred = classifier.predict(X_test_tf)
    Y_back = mlb.inverse_transform(Y_pred)
    write_to_file(Y_original, Y_back, 'oneVsRest_LogReg', score, data_files,
                  test_doc_ids)
def fit(self, X, y, log_run=True):
    """Fit the model to the provided data."""
    y = preprocessing.MultiLabelBinarizer().fit_transform(
        y.reshape(len(y), 1))
    y = y.astype(np.float32)

    kwargs = {}
    if log_run:
        cb = tf.compat.v1.keras.callbacks.TensorBoard(
            log_dir=self.tensor_logdir,
            # histogram_freq=1,
            # write_graph=True,
            # write_grads=True,
            write_images=True,
        )
        kwargs['callbacks'] = [cb]

    history = self.model.fit(X, y, self.batch_size, self.n_epoch,
                             verbose=2,
                             validation_split=0.1,  # XXX
                             **kwargs)

    # The history.history dict contains lists of numpy.float64 values which
    # don't work well with json. We need to turn them into floats.
    ret = {}
    for k, v in history.history.items():
        ret[k] = [float(x) for x in v]
    return ret
def train(X, y, outpath=None, verbose=True):
    def build(X, y=None):
        """Inner build function that builds a single model."""
        model = Pipeline([
            ('preprocessor', NLTKPreprocessor()),
            ('vectorizer', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', OneVsRestClassifier(LinearSVC(C=0.9))),
        ])
        model.fit(X, y)
        return model

    # Label-encode the targets.
    labels = preprocessing.MultiLabelBinarizer()
    y = labels.fit_transform(y)

    model = build(X, y)
    model.labels_ = labels

    if outpath:
        with open(outpath, 'wb') as f:
            pickle.dump(model, f)
        if verbose:
            print("Model written out to {}".format(outpath))

    return model
def find_hot_encoders(X, missing_values=None):
    """
    Find hot encoders for every feature.

    :param X: A numpy matrix of the data. First axis corresponding to
        instances, second axis corresponding to features.
    :param missing_values: The value for missing values.
    :return: Hot encoders to be used for future hot encoding.
    """
    X = np.asarray(X)
    hot_encoders = []
    for i in range(X.shape[1]):
        # Copy the column and drop the missing values.
        new_col = np.copy(X[:, i:i + 1])
        new_col = np.delete(new_col, np.where(new_col == missing_values)[0],
                            axis=0)
        # Find out whether the data is categorical: if the column cannot be
        # cast to float, treat it as categorical and fit an encoder for it.
        try:
            new_col = new_col.astype(float)
            hot_encoder = None
        except ValueError:
            hot_encoder = PP.MultiLabelBinarizer()
            hot_encoder.fit_transform(new_col)
        # Keep a record of the encoder for this column (None if numeric).
        hot_encoders.append(hot_encoder)
    return hot_encoders
def model_inferring(val_path='./pelvis_only_224_test_hot_nonhardware.npz',
                    model_path='./fx_models/nonhardware_fx_sep/final_dense_model'):
    image_array, id_array, label_array, orig_idx_array = read_npz_hotlabel(val_path)
    model = keras.models.load_model(model_path)
    predictions = model.predict(image_array)
    custom_eval_models(predictions, label_array)

    # Collapse the multi-class output to binary: class 0 vs. everything else.
    class_0_preds = predictions[:, 0].reshape(predictions.shape[0], 1)
    non_fx_preds = np.sum(predictions[:, 1:], axis=1)
    non_fx_preds = non_fx_preds.reshape(non_fx_preds.shape[0], 1)
    binary_preds = np.concatenate([class_0_preds, non_fx_preds], axis=1)

    ind_labels = np.argmax(label_array, axis=1)
    binary_labels = np.where(ind_labels != 0, 1, ind_labels)
    binary_labels = preprocessing.MultiLabelBinarizer(classes=np.arange(2)).fit_transform(
        binary_labels.reshape(binary_labels.shape[0], 1))
    custom_eval_models(binary_preds, binary_labels)

    np.savez('something',
             image_array=image_array,
             id_array=id_array,
             label_array=label_array,
             orig_idx_array=orig_idx_array,
             predictions=predictions,
             binary_preds=binary_preds,
             binary_labels=binary_labels)
def train(method=0):
    print('command train')
    print('use %s' % CLFS_NAMES[method])
    description_list = []
    tags_list = []
    for line in open('train.data'):
        tmp = line.rstrip('\r\n').split('#$#')
        description = tmp[1]
        tags = tmp[2].rstrip(',').split(',')  # TODO Semantic Web (RDF, OWL, etc.)
        description_list.append(description)
        tags_list.append(tags)

    all_tags = open('allTags.txt').read().splitlines()  # TODO how to fit this?
    lb = preprocessing.MultiLabelBinarizer()
    binary_tags_list = lb.fit_transform(tags_list)

    X_train = np.array(description_list)
    y_train = binary_tags_list
    clf = Pipeline([
        ('vectorizer', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', OneVsRestClassifier(CLFS[method])),
    ])
    print('train begin')
    clf.fit(X_train, y_train)
    print('train end')
    joblib.dump(clf, 'models/model.pkl')
    joblib.dump(lb, 'models/lb.pkl')
def OVR_Classify(X_train, X_test, Y_train, word_dict, tags_dict, test_tags=None):
    print('Processing : OVR_Classify')
    print('-' * 50)
    from sklearn.feature_extraction.text import TfidfTransformer

    # Vectorize with a fixed vocabulary; transform (not refit) the test split.
    vectorizer = CountVectorizer(min_df=1, vocabulary=word_dict)
    X_v_train = vectorizer.fit_transform(X_train)
    X_v_test = vectorizer.transform(X_test)
    transformer = TfidfTransformer(smooth_idf=False)
    X_train_tf = transformer.fit_transform(X_v_train)
    X_test_tf = transformer.transform(X_v_test)

    # uniq_tags_names = list(tags_dict.keys())
    mlb = preprocessing.MultiLabelBinarizer(classes=list(tags_dict))
    train_model = mlb.fit_transform(Y_train)

    classifier = OneVsRestClassifier(Perceptron(
        # loss='hinge',
        alpha=1e-3,
        penalty='elasticnet',
        random_state=999,
        # class_weight="balanced",
        max_iter=50,  # n_iter in older scikit-learn versions
        # learning_rate='optimal'
    ))
    classifier.fit(X_train_tf, train_model)
    print('-' * 50)
    # print('Score oneVsRest_SGDC_TfIdf : {}'.format(score))
    print('-' * 50)
    Y_pred = classifier.predict(X_test_tf)
    print(Y_pred)
    Y_back = mlb.inverse_transform(Y_pred)
    print(Y_back)
def explicitness_per_factor(mus_train, y_train, mus_test, y_test):
    """Compute explicitness score for a factor as ROC-AUC of a classifier.

    Args:
        mus_train: Representation for training, (num_codes, num_points)-np array.
        y_train: Ground truth factors for training, (num_factors, num_points)-np array.
        mus_test: Representation for testing, (num_codes, num_points)-np array.
        y_test: Ground truth factors for testing, (num_factors, num_points)-np array.

    Returns:
        roc_train: ROC-AUC score of the classifier on training data.
        roc_test: ROC-AUC score of the classifier on testing data.
    """
    x_train = np.transpose(mus_train)
    x_test = np.transpose(mus_test)
    # CHANGED: Explicitly use the default params from scikit-learn 0.20
    # (solver, multi_class) to avoid warning messages.
    clf = linear_model.LogisticRegression(solver='liblinear',
                                          multi_class='ovr').fit(x_train, y_train)
    y_pred_train = clf.predict_proba(x_train)
    y_pred_test = clf.predict_proba(x_test)
    # Binarize the factor values so they line up with predict_proba's columns.
    mlb = preprocessing.MultiLabelBinarizer()
    roc_train = metrics.roc_auc_score(
        mlb.fit_transform(np.expand_dims(y_train, 1)), y_pred_train)
    roc_test = metrics.roc_auc_score(
        mlb.fit_transform(np.expand_dims(y_test, 1)), y_pred_test)
    return roc_train, roc_test
def explicitness_per_factor(mus_train, y_train, mus_test, y_test):
    """Compute explicitness score for a factor as ROC-AUC of a classifier.

    Args:
        mus_train: Representation for training, (num_codes, num_points)-np array.
        y_train: Ground truth factors for training, (num_factors, num_points)-np array.
        mus_test: Representation for testing, (num_codes, num_points)-np array.
        y_test: Ground truth factors for testing, (num_factors, num_points)-np array.

    Returns:
        roc_train: ROC-AUC score of the classifier on training data.
        roc_test: ROC-AUC score of the classifier on testing data.
    """
    x_train = np.transpose(mus_train)
    x_test = np.transpose(mus_test)
    clf = linear_model.LogisticRegression().fit(x_train, y_train)
    y_pred_train = clf.predict_proba(x_train)
    y_pred_test = clf.predict_proba(x_test)
    mlb = preprocessing.MultiLabelBinarizer()
    roc_train = metrics.roc_auc_score(
        mlb.fit_transform(np.expand_dims(y_train, 1)), y_pred_train)
    roc_test = metrics.roc_auc_score(
        mlb.fit_transform(np.expand_dims(y_test, 1)), y_pred_test)
    return roc_train, roc_test
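# A minimal sketch (not from either source above) of the MultiLabelBinarizer +
# roc_auc_score pairing used in explicitness_per_factor: wrapping each scalar
# factor value in its own row one-hot encodes it, so the binarized matrix has
# the same (num_points, num_classes) shape as predict_proba's output.
import numpy as np
from sklearn import preprocessing, metrics

y = np.array([0, 2, 1, 2])
y_onehot = preprocessing.MultiLabelBinarizer().fit_transform(np.expand_dims(y, 1))
# [[1 0 0]
#  [0 0 1]
#  [0 1 0]
#  [0 0 1]]
probs = np.array([[0.7, 0.2, 0.1],
                  [0.1, 0.3, 0.6],
                  [0.2, 0.6, 0.2],
                  [0.2, 0.2, 0.6]])
print(metrics.roc_auc_score(y_onehot, probs))  # 1.0: every class is ranked perfectly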
def __init__(self, domain: Domain) -> None:
    # LabelEncoder expects a flat 1-D sequence of labels.
    self.intent_enc = preprocessing.LabelEncoder()
    intents = set(domain.acts_params + domain.dstc2_acts_sys)
    self.intent_enc.fit(list(intents))
    # MultiLabelBinarizer expects one iterable of labels per sample.
    self.slot_enc = preprocessing.MultiLabelBinarizer()
    slots = set(domain.requestable_slots + domain.system_requestable_slots)
    self.slot_enc.fit([[x] for x in slots])
def cosine_model(m, classes=None, **cos_kws):
    # Binarize the tag collections, build the tag co-occurrence matrix, and
    # compute cosine similarity between tags; zero out the diagonal
    # (self-similarity).
    encode = pre.MultiLabelBinarizer(classes=classes)
    tags = encode.fit_transform(m)
    cos = cosine_similarity(tags.T.dot(tags), **cos_kws)
    cos -= np.diag(cos.diagonal())
    return cos
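# A minimal usage sketch (the tag data here is illustrative, not from the
# source): each element of `m` is one sample's tag collection, and the result
# is a square tag-by-tag similarity matrix with a zeroed diagonal.
import numpy as np
from sklearn import preprocessing as pre
from sklearn.metrics.pairwise import cosine_similarity

m = [{'rock', 'live'}, {'rock'}, {'jazz', 'live'}]
sim = cosine_model(m, classes=['rock', 'jazz', 'live'])
print(sim.shape)  # (3, 3), rows/columns ordered as the classes list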
def _encode_labels(self):
    """Encode string or numeric y labels to integers using MultiLabelBinarizer."""
    mlb = preprocessing.MultiLabelBinarizer()
    mlb.fit(self.y)
    mapping_dict = dict(zip(range(len(mlb.classes_)), mlb.classes_))
    return mlb, mapping_dict
def main():
    y = [[2, 3, 4], [2], [0, 1, 3], [0, 1, 2, 3, 4], [0, 1, 2]]
    y_transformed = preprocessing.MultiLabelBinarizer().fit_transform(y)
    print('y_transformed =\n', y_transformed)

    multiclass_example()
    multioutput_classification_example()
    multioutput_regression_example()
def generate_tags(df, hashtag_threshold=200):
    # One column per hashtag, prefixed so the names don't collide with
    # other features.
    mlb = preprocessing.MultiLabelBinarizer(sparse_output=True)
    add_tag = np.vectorize(lambda x: f"hashtag_{x}")
    hashtags = pd.DataFrame(mlb.fit_transform(df.hashtags).toarray(),
                            columns=add_tag(mlb.classes_))
    # Keep only the hashtag_threshold most frequent hashtags.
    hashtag_freq = hashtags.sum(axis=0).sort_values(ascending=False)
    hashtags = hashtags.filter(hashtag_freq.iloc[:hashtag_threshold].index)
    return hashtags
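# A minimal usage sketch (the toy frame is illustrative, not from the source):
# the `hashtags` column must hold one iterable of tags per row.
import pandas as pd

df = pd.DataFrame({'hashtags': [['ai', 'ml'], ['ml'], ['news']]})
tags = generate_tags(df, hashtag_threshold=2)
print(tags.columns.tolist())  # the two most frequent, e.g. ['hashtag_ml', 'hashtag_ai']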
def trainfunctionclassifier(trees, sents, numproc):
    """Train a classifier to predict function tags in trees."""
    from sklearn import linear_model, multiclass, pipeline
    from sklearn import preprocessing, feature_extraction
    from sklearn.model_selection import GridSearchCV
    from sklearn.metrics import make_scorer, jaccard_similarity_score
    vectorizer = pipeline.Pipeline([
        ('vectorizer', feature_extraction.DictVectorizer(sparse=True)),
        ('scaler', preprocessing.StandardScaler(
            copy=False, with_mean=False))])
    # PTB has no function tags on preterminals; Negra/Tiger/Lassy do.
    posfunc = any(functions(node) for tree in trees
                  for node in tree.subtrees()
                  if node and isinstance(node[0], int))
    target = [functions(node) for tree in trees
              for node in tree.subtrees()
              if tree is not node and node
              and (posfunc or isinstance(node[0], Tree))]
    # PTB may have multiple tags (or 0) per node;
    # Negra/Tiger/Lassy have exactly 1 tag for every node.
    multi = any(len(a) > 1 for a in target)
    if multi:
        encoder = preprocessing.MultiLabelBinarizer()
    else:
        encoder = preprocessing.LabelEncoder()
        target = [a[0] if a else '--' for a in target]
    # Binarize the features (output is a sparse array).
    trainfeats = vectorizer.fit_transform(
        functionfeatures(node, sent)
        for tree, sent in zip(trees, sents)
        for node in tree.subtrees()
        if tree is not node and node
        and (posfunc or isinstance(node[0], Tree)))
    trainfuncs = encoder.fit_transform(target)
    classifier = linear_model.SGDClassifier(
        loss='hinge', penalty='elasticnet',
        max_iter=int(10 ** 6 / len(trees)))  # n_iter in older scikit-learn
    alphas = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6]
    if multi:
        classifier = multiclass.OneVsRestClassifier(
            classifier, n_jobs=numproc or -1)
        param_grid = dict(estimator__alpha=alphas)
    else:
        param_grid = dict(alpha=alphas)
    classifier = GridSearchCV(estimator=classifier, param_grid=param_grid,
                              scoring=make_scorer(jaccard_similarity_score))
    # Train the classifier.
    classifier.fit(trainfeats, trainfuncs)
    msg = ('trained classifier; grid search results:\n%s\n'
           'multi=%r, posfunc=%r; best score on training set: %g %%\n'
           'parameters: %r\nfunction tags: %s' % (
               '\n'.join('%s: %r' % (k, v)
                         for k, v in classifier.cv_results_.items()),
               multi, posfunc, 100.0 * classifier.best_score_,
               classifier.best_estimator_,
               ' '.join(str(a) for a in encoder.classes_)))
    return (classifier, vectorizer, encoder, posfunc, multi), msg
def get_encoded_labels(self, training_labels, validation_labels, test_labels,
                       multilabel=False):
    if multilabel:
        # Split each joint label string into its component labels.
        training_labels = [
            label.strip().split(self.processing_params.multilabelsplitter)
            for label in training_labels
        ]
        validation_labels = [
            label.strip().split(self.processing_params.multilabelsplitter)
            for label in validation_labels
        ]
        test_labels = [
            label.strip().split(self.processing_params.multilabelsplitter)
            for label in test_labels
        ]
        total_labels = []
        total_labels.extend(
            [label for jointlabel in training_labels for label in jointlabel])
        total_labels.extend(
            [label for jointlabel in validation_labels for label in jointlabel])
        total_labels.extend(
            [label for jointlabel in test_labels for label in jointlabel])
        # MultiLabelBinarizer.fit expects one iterable of labels per sample.
        unique_labels = [np.unique(total_labels)]
        self.label_transform = preprocessing.MultiLabelBinarizer()
    else:
        training_labels = [label.strip() for label in training_labels]
        validation_labels = [label.strip() for label in validation_labels]
        test_labels = [label.strip() for label in test_labels]
        total_labels = training_labels + validation_labels + test_labels
        unique_labels = np.unique(total_labels)
        self.label_transform = preprocessing.LabelEncoder()

    self.label_transform.fit(unique_labels)
    training_labels = self.label_transform.transform(training_labels)
    validation_labels = self.label_transform.transform(validation_labels)
    test_labels = self.label_transform.transform(test_labels)

    if multilabel:
        return training_labels.astype(np.float32), \
            validation_labels.astype(np.float32), \
            test_labels.astype(np.float32), \
            self.label_transform.classes_
    else:
        return training_labels.astype(np.int32), \
            validation_labels.astype(np.int32), \
            test_labels.astype(np.int32), \
            self.label_transform.classes_
def fit(self, X, y):
    # TODO Using 2 classes. Extra argument required if we want
    # this to work with more than 2 classes.
    n_classes = 2
    n_examples, n_features = X.shape
    iterations = int(n_examples / self.batch_size)
    total_iterations = self.n_epoch * iterations

    # One column per value, so it will be easier later to make this work
    # with multiple classes.
    y = preprocessing.MultiLabelBinarizer().fit_transform(
        y.reshape(len(y), 1))

    # Placeholders for input values.
    self.x = tf.placeholder(tf.float64, [None, n_features], name='x')
    self.y_ = tf.placeholder(tf.float64, [None, n_classes], name='dataset-y')

    # Variables for computed stuff, we need to initialise them now.
    W = tf.Variable(tf.zeros([n_features, n_classes], dtype=np.float64),
                    name='weights')
    b = tf.Variable(tf.zeros([n_classes], dtype=np.float64), name='bias')

    # Predicted y.
    self.z = tf.matmul(self.x, W) + b
    self.y = tf.nn.softmax(self.z)

    # Clip with a small positive lower bound so log(0) cannot occur.
    cross_entropy = -tf.reduce_sum(
        self.y_ * tf.log(tf.clip_by_value(self.y, 1e-10, 1.0)))
    loss = tf.reduce_mean(cross_entropy)

    # Calculate decay_rate.
    learning_rate = self.calculate_decay_rate(total_iterations)
    train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)

    init = tf.initialize_all_variables()
    self.sess.run(init)

    for e in range(self.n_epoch):
        for i in range(iterations):
            offset = i * self.batch_size
            it_end = offset + self.batch_size
            if it_end > n_examples:
                it_end = n_examples - 1
            batch_xs = X[offset:it_end]
            batch_ys = y[offset:it_end]
            feed = {self.x: batch_xs, self.y_: batch_ys}
            self.sess.run(train_step, feed_dict=feed)
def init_item_item_simmat():
    t1 = time.time() * 1000
    itemids = user_item_mat.columns
    ITEM_ITEM_SIMMAT = pd.DataFrame(0.0, index=itemids, columns=itemids,
                                    dtype='f8')

    # One-hot encode the director field.
    movie_data.index = movie_data['movie_id']
    movie_feature_map = pd.get_dummies(movie_data, columns=['director'])
    print('movie_feature_map:', movie_feature_map.info(),
          movie_feature_map.shape)

    # Multi-label binarize the actors field.
    import sklearn.preprocessing as sp
    L = pd.DataFrame(movie_data['actors'].str.split(
        ' / ', expand=True)).fillna('').values
    mlb = sp.MultiLabelBinarizer()
    res = pd.DataFrame(mlb.fit_transform(L), columns=mlb.classes_,
                       index=movie_data['movie_id'])
    movie_feature_map2 = pd.concat([movie_feature_map, res], axis=1)
    print('res:', res.info(), res.shape)
    print('movie_feature_map2:', movie_feature_map2.info(),
          movie_feature_map2.shape)

    # Multi-label binarize the genres field.
    G = pd.DataFrame(movie_data['genres'].str.split(
        ' / ', expand=True)).fillna('').values
    mlb = sp.MultiLabelBinarizer()
    res = pd.DataFrame(mlb.fit_transform(G), columns=mlb.classes_,
                       index=movie_data['movie_id'])
    movie_feature_map3 = pd.concat([movie_feature_map2, res], axis=1)

    # Compute the correlation-coefficient matrix over the feature columns.
    movie_feature_map3.drop(movie_feature_map3.columns[np.arange(
        len(MOVIE_DATA_COLUMNS))], axis=1, inplace=True)
    print(movie_feature_map3.shape)
    ITEM_ITEM_SIMMAT = movie_feature_map3.T.corr()
    return ITEM_ITEM_SIMMAT
def featureEngineeringOfAmenities(df):
    # Clean the amenities field and convert it into a list.
    df['amenities'] = df.apply(lambda x: parse_amenities(x.amenities), axis=1)

    # One-hot encode the amenities. We cannot use get_dummies here because
    # each row has a list of amenities, so we use MultiLabelBinarizer.
    mlb = preprocessing.MultiLabelBinarizer()
    amenities = pandas.DataFrame(mlb.fit_transform(df['amenities']),
                                 columns=mlb.classes_, index=df.index)
    amenities = amenities.drop([
        'translation missing: en.hosting_amenity_49',
        'translation missing: en.hosting_amenity_50'
    ], axis=1)

    # Check the correlation between amenities.
    cor_amn = pandas.DataFrame(amenities.corr())
    for col in cor_amn.columns:
        cor_amn.loc[col, col] = numpy.nan
    high_cor = cor_amn.where(cor_amn.abs().gt(.8))
    high_cor = high_cor.dropna(axis=1, how='all')
    high_cor = high_cor.dropna(axis=0, how='all')

    # Highly correlated with bathroom essentials, so remove them.
    amenities = amenities.drop(
        ['Bath towel', 'Bedroom comforts', 'Body soap', 'Toilet paper'],
        axis=1)
    # Highly correlated with cooking basics, so remove them.
    amenities = amenities.drop([
        'Dishes and silverware', 'Oven', 'Refrigerator', 'Stove', 'Microwave'
    ], axis=1)
    # Highly correlated with self check-in, so remove it.
    amenities = amenities.drop(['Lockbox'], axis=1)
    # Highly correlated with toilet, so remove it.
    amenities = amenities.drop(['Wide clearance to shower'], axis=1)

    # Delete the original amenities column and merge the encoded columns in.
    df = df.drop(['amenities'], axis=1)
    df = pandas.DataFrame(pandas.concat([df, amenities], axis=1))

    # Remove amenities that are either too common or too rare.
    amenities_dist = dict()
    unbalanced_amenities = list()
    for i in amenities.columns:
        freq = df[i].sum().item()
        amenities_dist.update({i: freq})
        if freq < 1500 or freq > 70000:
            unbalanced_amenities.append(i)
    # Sort by most common.
    amenities_dist = dict(
        sorted(amenities_dist.items(), key=operator.itemgetter(1),
               reverse=True))
    # Get rid of amenities that have less than 3% of 0's or 1's in their column.
    df = df.drop(unbalanced_amenities, axis=1)
    return df
def pandasToTensor(data, globalVocab):
    data = shuffle(data)

    # Preprocessing: retain only texts with fewer than 70 tokens to avoid
    # too much padding.
    data["token_size"] = data["text"].apply(lambda x: len(x.split(' ')))
    data = data.loc[data['token_size'] < 70].copy()

    # Sampling (disabled):
    # data = data.sample(n=50000)

    # The vocabulary is shared, so indexing goes through globalVocab rather
    # than constructing a new one:
    # inputs = construct.ConstructVocab(data["text"].values.tolist())
    input_tensor = [[globalVocab.word2idx[s] for s in es.split(' ')]
                    for es in data["text"].values.tolist()]

    # Pad every sequence in place to the longest input length.
    max_length_inp = util.max_length(input_tensor)
    input_tensor = [
        util.pad_sequences(x, max_length_inp) for x in input_tensor
    ]

    # Binarize the emotion labels against the full emotion set.
    emotions = list(emotion_dict.values())
    num_emotions = len(emotion_dict)
    mlb = preprocessing.MultiLabelBinarizer(classes=emotions)
    data_labels = [emos for emos in data[['emotions']].values]
    bin_emotions = mlb.fit_transform(data_labels)
    target_tensor = np.array(bin_emotions.tolist())
    # Sanity check: emotion_dict[np.argmax(target_tensor[0])] recovers the
    # original emotion for the first sample.

    return input_tensor, target_tensor
def load_scale_data(file_path, multilabeltf=False):
    X, y = load_svmlight_file(file_path, multilabel=multilabeltf)
    X = X.toarray()
    # Optional scaling (disabled):
    # X = preprocessing.scale(X)
    # min_max_scaler = preprocessing.MinMaxScaler()
    # X = min_max_scaler.fit_transform(X)
    if multilabeltf:
        y = preprocessing.MultiLabelBinarizer().fit_transform(y)
    return X, y
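# A minimal usage sketch (the file name is hypothetical): in the multilabel
# svmlight format each line may carry several comma-separated labels, e.g.
# "1,3 1:0.5 4:2.0", which load_svmlight_file returns as tuples ready for
# MultiLabelBinarizer.
X, y = load_scale_data("data.svmlight", multilabeltf=True)
print(X.shape, y.shape)  # y is now a binary indicator matrix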
def prepare_one_hot_encoder(img_list, label_csv_path):
    # Read the image IDs, one per line.
    with open(img_list, "r") as f:
        tmp_list = [i.strip() for i in f.readlines()]

    # Collect the '|'-separated finding labels for every image ID.
    y = []
    meta_data = pd.read_csv(label_csv_path)
    for pid in tmp_list:
        labels = meta_data.loc[meta_data["Image Index"] == pid,
                               "Finding Labels"]
        tmp = labels.tolist()[0].split("|")
        y.append(tmp)

    encoder = preprocessing.MultiLabelBinarizer()
    encoder.fit(y)
    return encoder
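# A minimal usage sketch; the file names are hypothetical stand-ins for an
# image-ID list and a label CSV with "Image Index" and "Finding Labels"
# columns, as the function above expects.
encoder = prepare_one_hot_encoder("train_list.txt", "labels.csv")
print(encoder.classes_)  # every finding seen in the list
print(encoder.transform([["Atelectasis", "Effusion"]]))  # one binarized row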
def prepareTraining(X_train, Y_train, wl, savgol=False, msc=False, rdp=False,
                    justbymoda=True, plot=False):
    """
    Prepare the training data for PLS regression, optionally applying a
    pre-treatment, and binarize the classes according to the chosen mode.

    :param X_train: list of reflectances associated with each sample
    :param Y_train: list of classes associated with each sample
    :param wl: wavelengths
    :param savgol: boolean, apply the Savitzky-Golay pre-treatment
    :param msc: boolean, apply the MSC pre-treatment
    :param rdp: boolean, apply the RDP pre-treatment
    :param justbymoda: boolean, separate the classes by moda only, or by
        moda/leaf/das
    :param plot: boolean, whether to plot or not
    :return: a dictionary mapping each binarized vector to its original
        class, a new list X of reflectances, a new (binarized) list Y of
        classes, the reference ref of the MSC-processed data (None if
        msc=False), and the wavelengths wlrdp after RDP (=wl if rdp=False)
    """
    X_train = X_train.astype('float32')
    if savgol is True:
        X_train = pre_traitement.lissageSavitzky(X_train, wl)
    ref = None
    if msc is True:
        X_train, ref = pre_traitement.compute_msc(X_train, wl, plot=plot)
    wlrdp = wl
    if rdp is True:
        wlrdp, X_train = pre_traitement.computeRdpForData(
            wl, X_train, Y_train, justbymoda, plot=plot)
    X = getDatatoDataframe(X_train)

    # Binarize the classes.
    lb = preprocessing.MultiLabelBinarizer()
    if justbymoda is True:
        lb = preprocessing.LabelBinarizer()
    Y = lb.fit_transform(Y_train)

    # Map each binarized vector back to its original class label.
    oneShotDictionary = {}
    for i in range(0, Y.shape[0]):
        if (str(Y[i]) in oneShotDictionary) is False:
            oneShotDictionary[str(Y[i])] = Y_train[i]
    return oneShotDictionary, X, Y, ref, wlrdp
def _labels_encoder(self):
    """Prepare the labels encoder, mapping strings to digits."""
    pd_meta = self._load_fold_list(fold=1, data_split=self.DataSplit.train)
    labels_list = pd_meta[self.MetaCol.LAB].astype(str)
    if self.is_multilabel:
        le = sk_proc.MultiLabelBinarizer()
        labels_list = labels_list.str.split(self.LABEL_SEPARATOR)
    else:
        le = sk_proc.LabelEncoder()
    le.fit(labels_list)
    return le
def ro_gt(start_time, end_time, feature_length):
    start_time = dt.strptime(start_time, "%Y-%m-%d")
    end_time = dt.strptime(end_time, "%Y-%m-%d")
    fl = str(feature_length) + "s"
    timestamps = pd.date_range(start_time, end_time, freq=fl)
    dred_df = pd.read_csv('../dataset/Occupancy_data_split.csv',
                          index_col=0, parse_dates=True)
    # Need to use a different groupby that can adjust based on feature
    # length: group the room labels per feature-length window.
    dred_fl = dred_df.groupby(pd.Grouper(freq=fl))[u'room'].apply(set).apply(list)
    dred_fl = dred_fl.apply(lambda x: float('NaN') if len(x) == 0 else x).dropna()
    mlb = preprocessing.MultiLabelBinarizer()
    dred_bin = mlb.fit_transform(dred_fl)
    dred_bin_df = pd.DataFrame(data=dred_bin, columns=list(mlb.classes_),
                               index=dred_fl.index)
    return dred_bin_df.loc[timestamps].dropna()