def main():
    """Predict genre labels for the text returned by ``read_in`` and print them.

    The TF-IDF vocabulary and the label binarizer are rebuilt from the arrays
    stored under ``./ML/model-data`` on every call; the classifier itself is
    loaded from ``model1.pkl``.  If the classifier assigns no label at all,
    the single most probable label is forced on so the printed list is never
    empty.
    """
    raw_text = read_in()
    model = joblib.load('./ML/model-data/model1.pkl')

    # Rebuild the vectorizer state from the stored training texts.
    train_texts = pd.Series(
        np.load('./ML/model-data/xtrain.npy', allow_pickle=True))
    tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=10000)
    tfidf_vectorizer.fit_transform(train_texts)

    # Rebuild the label <-> column mapping from the stored genre lists.
    genres = pd.Series(
        np.load('./ML/model-data/genre_data.npy', allow_pickle=True))
    multilabel_binarizer = MultiLabelBinarizer()
    multilabel_binarizer.fit(genres)

    cleaned = remove_stopwords(clean_text(raw_text))
    query_tfidf = tfidf_vectorizer.transform(pd.Series(np.array([cleaned])))

    y_pred = model.predict(query_tfidf)
    labels = multilabel_binarizer.inverse_transform(y_pred)
    probabilities = model.predict_proba(query_tfidf)

    if len(labels[0]) == 0:
        # No label predicted: switch on the most probable one instead.
        y_pred[0][np.argmax(probabilities)] = 1
        labels = multilabel_binarizer.inverse_transform(y_pred)

    print(list(labels[0]))
def run():
    """Train, validate and test a one-vs-rest linear SVM; return the classifier.

    Reads ``X_train.npy`` / ``y_train.npy``, ``X_val.npy`` / ``y_val.npy`` and
    ``X_test.npy`` from the working directory.  Validation predictions are
    printed next to the ground truth, and test predictions are written to the
    module-level ``test_file_name`` as ``image_id,label_id`` rows.  Label
    columns are decoded with a binarizer built from the module-level
    ``mf_labels``.

    Returns:
        The fitted ``OneVsRestClassifier``.
    """
    # ---- local setup -------------------------------------------------------
    mlb = MultiLabelBinarizer(classes=mf_labels)
    mlb.fit(mf_labels)

    is_training = True
    is_validating = True
    is_testing = True

    lm = LinearSVC(dual=False, class_weight='balanced', verbose=2)
    classifier = OneVsRestClassifier(lm, n_jobs=-1)

    # ---- training ----------------------------------------------------------
    if is_training:
        X = np.load('X_train.npy', mmap_mode='r')
        Y = np.load('y_train.npy', mmap_mode='r')
        print("### Classifier fitting X size: {0} y size: {1}".format(
            len(X), len(Y)))
        classifier.fit(X, Y)

    # ---- validation --------------------------------------------------------
    if is_validating:
        print("## Prediction")
        X_val = np.load('X_val.npy', mmap_mode='r')
        y_val = np.load('y_val.npy', mmap_mode='r')
        pred_labels = mlb.inverse_transform(classifier.predict(X_val))
        ground_truth = mlb.inverse_transform(y_val)
        # IDs are 1-based and derived from the data itself, so the report is
        # not silently truncated when the validation set size changes.
        for sample_id, (pred, truth) in enumerate(
                zip(pred_labels, ground_truth), start=1):
            print('ID: {0} =>\r\nPredicted: {1} \r\nGround truth: {2}'.format(
                str(sample_id),
                ', '.join(str(k) for k in pred),
                ', '.join(str(k) for k in truth)))

    # ---- test --------------------------------------------------------------
    if is_testing:
        print('## Test')
        X_test = np.load('X_test.npy', mmap_mode='r')
        pred_labels = mlb.inverse_transform(classifier.predict(X_test))
        rows = ['image_id,label_id\r\n']
        rows.extend(
            '{0},{1}\r\n'.format(str(image_id), ' '.join(str(k) for k in labels))
            for image_id, labels in enumerate(pred_labels, start=1))
        # Context manager guarantees the file is closed even if the write fails.
        with open(test_file_name, 'w') as f:
            f.write(''.join(rows))

    return classifier
def main():
    """Train and evaluate a multi-label weather classifier on greyscale images.

    Weather records are joined to image filenames on ``datetime``; rows with a
    non-null ``weather`` label are kept, their images are loaded in mode
    ``'L'`` and flattened to one row per image, and a
    ``StandardScaler -> PCA(120) -> OneVsRest(KNN, k=7)`` pipeline is fitted
    on a random train/test split.  A classification report and the subset
    accuracy are printed, and real vs. predicted label tuples are written to
    ``image_data.csv``.
    """
    image_data_fourier, _images = load_image_data()
    weather_data = transform_weather(load_weather_data())

    # Earlier experiments noted in the original source: StandardScaler ~64%,
    # MinMaxScaler ~63%, Normalizer ~65%; SVC(kernel='linear') does not
    # support multilabel targets and did not finish.
    model = make_pipeline(
        StandardScaler(),
        PCA(120),
        OneVsRestClassifier(KNeighborsClassifier(n_neighbors=7)),
    )

    join = image_data_fourier[['datetime', 'filename']]
    weather_data = pd.merge(weather_data, join, on='datetime')
    train_weather = weather_data[~weather_data['weather'].isnull()]

    files = train_weather['filename']
    train_image = np.array(
        [ndimage.imread(directory_i + file, mode='L') for file in files])
    # Flatten every (height, width) image into a single feature row.
    train_image = np.reshape(
        train_image,
        [train_image.shape[0], train_image.shape[1] * train_image.shape[2]])

    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(train_weather['weather'])
    res = mlb.classes_

    X_train, X_test, y_train, y_test = train_test_split(train_image, y)
    model.fit(X_train, y_train)

    # Predict once and reuse: the KNN pipeline is expensive and the original
    # ran the identical prediction three times.
    y_pred = model.predict(X_test)

    print(classification_report(y_test, y_pred, target_names=res))
    tmp = pd.DataFrame()
    tmp['real'] = mlb.inverse_transform(y_test)
    tmp['fake'] = mlb.inverse_transform(y_pred)
    print(accuracy_score(y_test, y_pred))
    tmp.to_csv('image_data.csv')
    print("--- %s seconds ---" % (time.time() - start_time))
def supervised(train, test, train_tfidf_matrix, test_tfidf_matrix, n_classes,
               init_C=10, metric=False, grid=True):
    """Fit a one-vs-rest linear SVM on TF-IDF features and predict the test set.

    Args:
        train: Mapping of document id -> dict with a ``"classes"`` entry.
        test: Same structure as ``train``; only read when ``metric`` is true.
        train_tfidf_matrix: Feature matrix for the training documents.
        test_tfidf_matrix: Feature matrix for the test documents.
        n_classes: Forwarded to ``mm.accuracy_multi``.
        init_C: SVM ``C`` used when ``grid`` is false.
        metric: When true, also compute evaluation metrics.
        grid: When true, choose ``C`` via ``grid_search`` first.

    Returns:
        ``(result, predicted_labels, probabilities)`` where ``result`` is an
        ``OrderedDict`` with key ``"SVM_metric"`` when ``metric`` is true and
        ``None`` otherwise.
    """
    print("Method: supervised(train,test,train_tfidf_matrix,"
          "test_tfidf_matrix,init_C=10,probability=True,"
          "metric=False,grid=True)")
    from sklearn.preprocessing import MultiLabelBinarizer
    from sklearn.svm import SVC
    from sklearn.multiclass import OneVsRestClassifier

    mlb = MultiLabelBinarizer()
    train_labels = [vals["classes"] for vals in train.values()]
    train_labels_bin = mlb.fit_transform(train_labels)

    print("\nAlgorithm: \t \t \t SVM")
    SVM = OneVsRestClassifier(SVC(kernel='linear', C=init_C, probability=True))
    if grid:
        print("Performing grid search...")
        SVM_params = [
            {'estimator__C': [10000, 1000, 100, 10, 1]},
        ]
        SVM_grid = grid_search(SVM, SVM_params, train_tfidf_matrix,
                               train_labels_bin)
        SVM = OneVsRestClassifier(
            SVC(kernel='linear', C=SVM_grid['params']['estimator__C'],
                probability=True))

    SVM_fit = SVM.fit(train_tfidf_matrix, train_labels_bin)
    SVM_pred = SVM_fit.predict(test_tfidf_matrix)
    SVM_proba = SVM_fit.predict_proba(test_tfidf_matrix)
    predicted_labels = mlb.inverse_transform(SVM_pred)

    if not metric:
        return None, predicted_labels, SVM_proba

    result = OrderedDict()
    test_labels = [vals["classes"] for vals in test.values()]
    mm.accuracy_multi(test_labels, predicted_labels, n_classes)
    # transform(), not fit_transform(): re-fitting on the test labels would
    # change the column order/count, so the indicator matrix would no longer
    # line up with SVM_pred (and later inverse_transform calls would be wrong).
    result["SVM_metric"] = mm.sklearn_metrics(
        mlb.transform(test_labels), SVM_pred)
    return result, predicted_labels, SVM_proba
def classify(x_train, y_train, x_test, y_test, test_size, max_labels, threshold):
    """Fit a one-vs-rest LinearSVC and print micro-averaged training scores.

    Args:
        x_train: Training feature matrix.
        y_train: Iterable of label collections, one per training sample.
        x_test, y_test, test_size: Accepted for interface compatibility; not
            used by this function.
        max_labels: Forwarded to ``get_max_n_pred``.
        threshold: Forwarded to ``get_max_n_pred`` and echoed in the output.

    Note:
        Scores are computed on the *training* data, exactly as in the original
        implementation.
    """
    from sklearn.preprocessing import MultiLabelBinarizer
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.svm import LinearSVC
    from sklearn import metrics
    import numpy

    # Binarize the labels into an indicator matrix.
    mlb = MultiLabelBinarizer()
    y_train_binarized = mlb.fit_transform(y_train)

    cls = OneVsRestClassifier(LinearSVC())
    cls.fit(x_train, y_train_binarized)
    # Squash the decision margins through a logistic so they can be
    # thresholded like probabilities.
    pred_proba = 1 / (1 + numpy.exp(-1 * cls.decision_function(x_train)))

    # Score directly on indicator matrices: micro-averaged precision/recall/F1
    # are the same as on the decoded label sets, and current scikit-learn no
    # longer accepts sequences of label sequences as metric input.
    y_pred_binarized = get_max_n_pred(pred_proba, max_labels, threshold)
    result = 'threshold: {0}, precision: {1}, recall: {2}, f1: {3}'.format(
        threshold,
        metrics.precision_score(y_train_binarized, y_pred_binarized, average='micro'),
        metrics.recall_score(y_train_binarized, y_pred_binarized, average='micro'),
        metrics.f1_score(y_train_binarized, y_pred_binarized, average='micro'))
    # Function-call form works on both Python 2 and Python 3.
    print(result)
class DFMultiLabelBinarizer(BaseEstimator, TransformerMixin):
    """``MultiLabelBinarizer`` wrapper that takes and returns DataFrames.

    ``fit`` remembers the columns of the frame it was given; ``transform``
    returns a frame with one ``MLB_<class>`` column per learned class, and
    ``inverse_transform`` maps an indicator frame back to the remembered
    columns.
    """

    def __init__(self, **kwargs):
        # All keyword arguments are forwarded to MultiLabelBinarizer.
        self.model = MultiLabelBinarizer(**kwargs)
        self.transform_cols = None

    def _require_fitted(self):
        """Raise ``NotFittedError`` unless ``fit`` has recorded the columns."""
        if self.transform_cols is None:
            raise NotFittedError(f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.")

    def fit(self, y):
        """Learn the label classes from every column of ``y``; return self."""
        self.transform_cols = list(y.columns)
        self.model.fit(y[self.transform_cols].values)
        return self

    def transform(self, y):
        """Return the indicator matrix of ``y`` as a DataFrame."""
        self._require_fitted()
        indicator = self.model.transform(y[self.transform_cols].values)
        column_names = [f'MLB_{x}' for x in self.model.classes_]
        return pd.DataFrame(indicator, columns=column_names)

    def fit_transform(self, y):
        """Fit on ``y`` and return its indicator DataFrame."""
        self.fit(y)
        return self.transform(y)

    def inverse_transform(self, y):
        """Decode an indicator DataFrame back into the fitted columns."""
        self._require_fitted()
        decoded = self.model.inverse_transform(y.values)
        return pd.DataFrame(decoded, columns=self.transform_cols)
class MyMultiLabelBinarizer(TransformerMixin):
    """Pipeline-friendly ``MultiLabelBinarizer`` with a fixed class list.

    The ``fit``/``transform`` signatures accept and ignore extra positional
    arguments so a pipeline can pass additional values.  Background:
    https://stackoverflow.com/questions/46162855/fit-transform-takes-2-positional-arguments-but-3-were-given-with-labelbinarize
    """

    def __init__(self, *args, **kwargs):
        # The class list is fixed here, so the column order of the indicator
        # matrix does not depend on the data seen by fit().
        self.classes = [
            'agreement/disagreement',
            'certainty',
            'contrariety',
            'hypotheticality',
            'necessity',
            'prediction',
            'source of knowledge',
            'tact/rudeness',
            'uncertainty',
            'volition',
        ]
        self.encoder = MultiLabelBinarizer(*args, classes=self.classes, **kwargs)

    def fit(self, y, *_):
        """Fit the wrapped encoder on ``y``; return self."""
        self.encoder.fit(y)
        return self

    def transform(self, y, *_):
        """Return the indicator matrix for the label collections in ``y``."""
        return self.encoder.transform(y)

    def inverse_transform(self, yt):
        """Decode an indicator matrix back into label tuples."""
        return self.encoder.inverse_transform(yt)
def main():
    """Train a one-vs-rest LinearSVC tag classifier and report on the test set.

    Documents are selected with ``select_sets_by_tag``, vectorised with bag of
    words followed by TF-IDF, and the tags are binarized.  After printing a
    classification report, a per-class cutoff is taken at the top-30% rank of
    the decision-function scores and handed to ``print_predictions``.
    """
    sets = select_sets_by_tag(20, 4, tag_names)

    train_tags = fetch_tags(sets["train"])
    train_texts = id_to_filename(sets["train"])

    # Bag-of-words counts, then TF-IDF weighting.
    count_vect = CountVectorizer(stop_words='english',
                                 encoding="utf-16",
                                 input="filename")
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(
        count_vect.fit_transform(train_texts))

    # Tag sets -> indicator matrix.
    mlb = MultiLabelBinarizer()
    processed_train_tags = mlb.fit_transform(train_tags)

    clf = OneVsRestClassifier(LinearSVC())
    clf.fit(X_train_tfidf, processed_train_tags)
    print("classes:{}".format(clf.classes_))

    # Run the test documents through the already-fitted transformers.
    test_texts = id_to_filename(sets["test"])
    X_test_tfidf = tfidf_transformer.transform(count_vect.transform(test_texts))
    predicted_tags = clf.predict(X_test_tfidf)
    predicted_tags_readable = mlb.inverse_transform(predicted_tags)
    test_tags_actual = fetch_tags(sets["test"])
    # These are decision-function margins, despite the variable name.
    predicted_probs = clf.decision_function(X_test_tfidf)

    class_list = mlb.classes_
    print(metrics.classification_report(mlb.transform(test_tags_actual),
                                        predicted_tags,
                                        target_names=class_list))

    # Per class, the score found at the top-30% rank is used as the cutoff.
    top_percentage = 30
    n_test = len(sets["test"])
    threshold_index = int(n_test * (top_percentage / 100.0))
    threshold_vals_dic = {}
    threshold_vals = []
    for col, class_name in enumerate(class_list):
        ranked = sorted((predicted_probs[row, col] for row in range(n_test)),
                        reverse=True)
        cutoff = ranked[threshold_index]
        threshold_vals_dic[class_name] = cutoff
        threshold_vals.append(cutoff)
    print(threshold_vals_dic)

    print_predictions(sets["test"],
                      predicted_tags_readable,
                      class_list,
                      class_probablities=predicted_probs,
                      threshold_vals=threshold_vals)
class ACMClassificator(BaseACMClassificator):
    """Multi-label tag classifier over problem statements.

    Statements are turned into token counts, tags into an indicator matrix,
    and one ``ExtraTreeClassifier`` per tag is trained via one-vs-rest.
    """

    def __init__(self):
        self.vectorizer = CountVectorizer(min_df=0.05, max_df=0.45,
                                          tokenizer=tokenize)
        self.mlb = MultiLabelBinarizer()
        base_tree = ExtraTreeClassifier(criterion="gini",
                                        max_depth=None,
                                        min_samples_split=2,
                                        min_samples_leaf=1,
                                        min_weight_fraction_leaf=0.,
                                        max_features="auto",
                                        max_leaf_nodes=None,
                                        class_weight=None)
        self.classificator = OneVsRestClassifier(base_tree, n_jobs=-1)

    def _prepare_problems(self, problems):
        """Vectorise the ``statement`` of every problem."""
        statements = [problem.statement for problem in problems]
        return self.vectorizer.transform(statements)

    def fit(self, problems, tags):
        """Fit the vectorizer, the tag binarizer and the classifier."""
        nltk.download('punkt', quiet=True)
        self.vectorizer.fit([problem.statement for problem in problems])
        features = self._prepare_problems(problems)
        self.mlb = self.mlb.fit(tags)
        targets = self.mlb.transform(tags)
        # The classifier is fed a dense array, as in the original.
        self.classificator.fit(features.toarray(), targets)

    def predict(self, problems):
        """Return one tuple of predicted tags per problem."""
        features = self._prepare_problems(problems)
        indicator = self.classificator.predict(features.toarray())
        return self.mlb.inverse_transform(indicator)
def distribution_to_tuples(predictions, threshold, at_least_one_hot=True):
    """Turn per-class scores into tuples of selected class indices.

    :param predictions: predictions, 2-dim numpy matrix (samples, distribution)
    :param threshold: per-class thresholds for k-hot mapping (binarization)
    :param at_least_one_hot: when true, ``ensure_at_least_one_prediction`` is
        called with the tuples and the threshold-normalised scores so that a
        sample with no score above its threshold still gets a prediction
    :return: list of tuples, each tuple contains class indices
    """
    # A class is "hot" when its score reaches its own threshold, i.e. the
    # normalised score is >= 1.
    scaled = predictions / threshold
    k_hot = np.where(scaled >= 1., 1., 0.)

    n_classes = predictions.shape[1]
    class_ids = list(range(n_classes))
    binarizer = MultiLabelBinarizer(classes=class_ids)
    binarizer.fit([[class_id] for class_id in class_ids])
    prediction_tuples = binarizer.inverse_transform(k_hot)

    if at_least_one_hot:
        # Return value is ignored, so the helper presumably updates the list
        # in place -- verify against its definition.
        ensure_at_least_one_prediction(prediction_tuples, scaled)

    return prediction_tuples
def multilabel(weather, weatherTest):
    """Train a multi-label text pipeline on ``weather`` and print predictions.

    Args:
        weather: Object with ``frase`` (texts) and ``clase`` (label
            collections) attributes used for training.
        weatherTest: Object with a ``frase`` attribute to predict on.

    The pipeline is counts -> TF-IDF (l2) -> chi2 top-50% feature selection ->
    one-vs-rest ``SGDClassifier(loss='modified_huber')``.  The binarized
    labels, the pickled classifier size, the raw predictions, the decoded
    labels and one ``text => labels`` line per test item are printed.
    """
    # The unused `categorias`, `multi_labels` and `clf` locals of the original
    # were dead code and have been removed.
    pipeline = Pipeline([
        ('vectorize', CountVectorizer()),
        ('tf_idf', TfidfTransformer(norm='l2')),
        # play with the parameters and check the model size
        ('select', SelectPercentile(chi2, percentile=50)),
        ('clf', OneVsRestClassifier(SGDClassifier(loss='modified_huber'))),
    ])

    mlb = MultiLabelBinarizer().fit(weather.clase)
    mlb_labels = mlb.transform(weather.clase)
    print(mlb_labels)

    pipeline.fit(weather.frase, mlb_labels)
    print("classifier has %s bytes" % len(pickle.dumps(pipeline.named_steps['clf'])))

    predicted = pipeline.predict(weatherTest.frase)
    print(predicted)

    all_labels = mlb.inverse_transform(predicted)
    print(all_labels)
    for item, labels in zip(weatherTest.frase, all_labels):
        print('%s => %s' % (item, ', '.join(labels)))
def main():
    """Average the predictions of several checkpoints and write a CSV.

    The run arguments provide space-separated config paths and checkpoint
    numbers (paired in order).  Each model is restored in its own TensorFlow
    session, its class probabilities are collected, the probabilities are
    averaged over the models, thresholded by
    ``get_pred_from_probas_threshold`` and written (sorted by ``Id``) to
    ``$EXP_PATH/<outfile_multiple>.csv``, or ``prediction.csv`` when no name
    is given.

    Raises:
        Exception: Re-raised after printing a hint when the arguments or
            ``EXP_PATH`` are missing or invalid.
        SystemExit: When building the model raises ``AttributeError``.
    """
    n_classes = 28  # width of the probability matrix, as in the original

    # Capture the config paths from the run arguments, then process each
    # JSON configuration file.
    try:
        args = get_args()
        config_array = [process_config(x) for x in args.config.split(" ")]
        check_array = args.checkpoint_nb.split(" ")
        cwd = os.getenv("EXP_PATH")
        if args.outfile_multiple:
            outfile = os.path.join(cwd, args.outfile_multiple + '.csv')
        else:
            outfile = os.path.join(cwd, 'prediction.csv')
    except Exception:
        print("missing or invalid arguments")
        raise

    # This loader is only used to learn the number of test samples.
    testIterator = DataTestLoader(config_array[0])
    probas = np.zeros((len(config_array), testIterator.n, n_classes))

    for i, (config, check) in enumerate(zip(config_array, check_array)):
        sess = tf.Session()
        try:
            testIterator = DataTestLoader(config)
            try:
                ModelInit = all_models[config.model]
                model = ModelInit(config)
            except AttributeError:
                print("The model to use is not specified in the config file")
                exit(1)
            model.load(sess, check)
            predictor = Predictor(sess, model, config)
            probas[i, :, :] = predictor.predict_probas(testIterator)
            print('processed {} model'.format(model))
        finally:
            # Release the session's resources before the next model is built;
            # the original leaked one session per model.
            sess.close()
        tf.reset_default_graph()

    probas = np.mean(probas, axis=0)
    print(np.shape(probas))
    one_hot_pred = get_pred_from_probas_threshold(probas)

    # `binarizer` instead of `bin` to avoid shadowing the builtin.
    binarizer = MultiLabelBinarizer(classes=np.arange(n_classes))
    binarizer.fit([[1]])  # needed for instantiation of the object
    pred = binarizer.inverse_transform(one_hot_pred)
    predicted_labels = [
        ' '.join([str(p) for p in sample_pred]) for sample_pred in pred
    ]
    print(np.shape(predicted_labels))

    testIterator.result['Predicted'] = predicted_labels
    testIterator.result = testIterator.result.sort_values(by='Id')
    testIterator.result.to_csv(outfile, index=False)
def on_train_end(self, logs=None):
    """Predict on the test generator with the saved model and write a CSV.

    Does nothing when ``self.test_generator`` is falsy.  Otherwise the model
    stored at ``self.save_path`` is loaded, predictions are thresholded at
    0.5, decoded into class names taken from the header of
    ``paths['dummy']['csv']`` and saved as ``<save_fname>-submission.csv``
    under ``paths['results']`` with columns ``image_id`` (1-based) and
    ``label_id`` (space-separated labels).

    Args:
        logs: Unused.  Defaults to ``None`` rather than a shared mutable dict.
    """
    if not self.test_generator:
        return

    print('Training done. Running predictions...')
    best_model = load_model(self.save_path)
    classes = pd.read_csv(paths['dummy']['csv']).columns
    preds = best_model.predict_generator(self.test_generator,
                                         use_multiprocessing=True,
                                         workers=8,
                                         verbose=1)
    preds = preds > .5

    print('Converting labels...')
    mlb = MultiLabelBinarizer(classes=classes)
    mlb.fit(None)  # necessary, won't actually do anything
    sparse_preds = mlb.inverse_transform(preds)

    submission_list = [[i, ' '.join(p)]
                       for i, p in enumerate(sparse_preds, start=1)]

    submission_path = join(paths['results'],
                           '{}-submission.csv'.format(self.save_fname))
    print('Saving predictions to {}'.format(submission_path))
    columns = ['image_id', 'label_id']
    pd.DataFrame(submission_list, columns=columns) \
        .to_csv(submission_path, index=False)
def tag_recommendation():
    """Predict tags for the submitted question and render the result page.

    Reads the ``question`` field of the posted form, preprocesses it, and
    vectorises it with a TF-IDF model that is re-fitted on the question plus
    the reference questions in ``input_df_tags_500``.  The first row (the
    question) is passed to the module-level ``pipeline``; the predicted
    indicator row is decoded to tag names and rendered into
    ``recommendation.html`` as ``tags``.
    """
    # Read the inputs posted from the HTML dashboard page.
    question = request.form['question']
    tags_text = ''
    if question is not None:
        question = str(question)
        question_tag = preprocessing(question)
        question_tag_df = pd.DataFrame([question_tag], columns=['question'])
        # The user's question goes first so it ends up at row 0.
        test_input_df = pd.concat([question_tag_df, input_df_tags_500],
                                  ignore_index=True)
        question_input = test_input_df['question']

        vectorizer = TfidfVectorizer(tokenizer=tokenize,
                                     stop_words=stop_words,
                                     max_features=355)
        X_tfidf = vectorizer.fit_transform(question_input).toarray()
        feature_names = vectorizer.get_feature_names()

        # Keep only the row that belongs to the submitted question.
        X_test_question = pd.DataFrame(X_tfidf)
        X_test_question = X_test_question.iloc[0:1, :]
        X_test_question.columns = feature_names
        tags_num = pipeline.predict(X_test_question)

        mlb = MultiLabelBinarizer(classes=sorted(input_tags_500))
        mlb.fit(input_tags_500)
        tags_text = pd.concat(
            [pd.Series(mlb.inverse_transform(tags_num), name='tags_num')],
            axis=1)
        # Flatten "[[('a', 'b')]]" down to "'a', 'b'" for display.
        tags_text = str(tags_text.values.tolist()).strip('[()]')
    return render_template('recommendation.html', tags=tags_text)
class MultiHotEncoder(BaseEncoder):
    """Encode columns whose cells are collections of labels as multi-hot rows."""

    def __init__(self, is_target=False):
        super().__init__(is_target)
        self._binarizer = MultiLabelBinarizer()
        self._seen = set()

    @staticmethod
    def _clean_col_data(column_data):
        """Replace ``None`` cells with empty lists and stringify every label."""
        return [
            [str(x) for x in (arr if arr is not None else [])]
            for arr in column_data
        ]

    def prepare(self, column_data, max_dimensions=100):
        """Learn the label vocabulary from ``column_data``.

        ``max_dimensions`` is accepted for interface compatibility and is not
        used here.
        """
        column_data = self._clean_col_data(column_data)
        # Register a single placeholder label 'None'.  The original passed
        # [('None')], which is just the string 'None'; the binarizer iterated
        # it character by character and learned 'N', 'o', 'n', 'e' as classes.
        self._binarizer.fit(column_data + [['None']])
        for arr in column_data:
            self._seen.update(arr)
        self._prepared = True

    def encode(self, column_data):
        """Return the multi-hot encoding of ``column_data`` as a tensor."""
        column_data = self._clean_col_data(column_data)
        data_array = self._binarizer.transform(column_data)
        return torch.Tensor(data_array)

    def decode(self, vectors):
        """Map network outputs back to one list of labels per row."""
        # If these are logits output by the neural network, they must be
        # thresholded into binary vectors first.
        vectors = np.where(vectors > 0, 1, 0)
        words_tuples = self._binarizer.inverse_transform(vectors)
        return [list(w) for w in words_tuples]
def supervised2(params, pkl_file=False):
    """Fit (or load and re-fit) a one-vs-rest linear SVM and predict the test set.

    Args:
        params: Dict with keys ``train``, ``test``, ``train_tfidf_matrix``,
            ``test_tfidf_matrix``, ``n_classes``, ``init_C`` and ``metric``.
        pkl_file: Path of a pickled classifier to reuse.  When it is falsy or
            does not point to an existing file, a fresh classifier is created
            and handed to ``save_pickle`` (under the name ``"SVM"`` when no
            path was given).

    Returns:
        ``(result, predicted_labels, probabilities, classifier, pkl_file)``;
        ``result`` is ``None`` unless ``params["metric"]`` is true.
    """
    print("Method: supervised(train,test,train_tfidf_matrix,"
          "test_tfidf_matrix,init_C=10,probability=True,"
          "metric=False,grid=True)")
    from sklearn.preprocessing import MultiLabelBinarizer
    from sklearn.svm import SVC
    from sklearn.multiclass import OneVsRestClassifier

    train = params["train"]
    test = params["test"]
    train_tfidf_matrix = params["train_tfidf_matrix"]
    test_tfidf_matrix = params["test_tfidf_matrix"]
    n_classes = params["n_classes"]
    init_C = params["init_C"]
    metric = params["metric"]

    mlb = MultiLabelBinarizer()
    train_labels = [vals["classes"] for vals in train.values()]
    train_labels_bin = mlb.fit_transform(train_labels)

    print("\nAlgorithm: \t \t \t SVM")
    SVM = None
    if pkl_file and os.path.isfile(pkl_file):
        SVM = load_pickle(pkl_file)
    if SVM is None:
        # Nothing could be loaded: build a new classifier so the fit below
        # never runs on None.
        SVM = OneVsRestClassifier(
            SVC(kernel='linear', C=init_C, probability=True))
        if not pkl_file:
            pkl_file = "SVM"
        # The original referenced an undefined `pkl_name` here (NameError).
        save_pickle(SVM, pkl_file, tag=False)

    SVM_fit = SVM.fit(train_tfidf_matrix, train_labels_bin)
    SVM_pred = SVM_fit.predict(test_tfidf_matrix)
    SVM_proba = SVM_fit.predict_proba(test_tfidf_matrix)
    predicted_labels = mlb.inverse_transform(SVM_pred)

    if not metric:
        return None, predicted_labels, SVM_proba, SVM, pkl_file

    result = OrderedDict()
    test_labels = [vals["classes"] for vals in test.values()]
    mm.accuracy_multi(test_labels, predicted_labels, n_classes)
    # transform(), not fit_transform(): re-fitting on the test labels would
    # misalign the indicator columns with SVM_pred.
    result["SVM_metric"] = mm.sklearn_metrics(
        mlb.transform(test_labels), SVM_pred)
    return result, predicted_labels, SVM_proba, SVM, pkl_file
def translate_to_labels(root, y_pred):
    """Decode an indicator matrix into label tuples.

    The label vocabulary is rebuilt from ``<root>/data/Y.csv``: a header-less,
    single-column file whose cells are whitespace-separated labels.

    Args:
        root: Directory containing the ``data`` folder.
        y_pred: Indicator matrix to decode.

    Returns:
        The result of ``MultiLabelBinarizer.inverse_transform(y_pred)``.
    """
    labels_csv = os.path.join(root, 'data', 'Y.csv')
    frame = pd.read_csv(labels_csv, header=None, names=['labels'])
    frame['labels'] = frame['labels'].apply(lambda cell: cell.split())

    binarizer = MultiLabelBinarizer()
    # Only the fitted vocabulary is needed; the encoded training matrix the
    # original computed was thrown away.
    binarizer.fit(frame['labels'])
    return binarizer.inverse_transform(y_pred)
class MultiLabelClassifier:
    """Train, persist and evaluate a multi-label movie-genre classifier.

    Bundles a vectorizer, a classifier and a ``MultiLabelBinarizer`` so that
    several genres can be predicted for one movie.
    """

    def __init__(self, vectorizer=None, classifier=None):
        self.vectorizer = vectorizer
        self.classifier = classifier
        self.encoder = MultiLabelBinarizer()
        self.trained = False

    def _require_trained(self):
        """Raise unless the classifier has been trained or loaded."""
        if not self.trained:
            raise Exception("Classifier needs to be trained first")

    def save_clf(self, file_name):
        """Pickle vectorizer, classifier and encoder (in that order)."""
        self._require_trained()
        with open(file_name, 'wb') as sink:
            for part in (self.vectorizer, self.classifier, self.encoder):
                pickle.dump(part, sink, pickle.HIGHEST_PROTOCOL)

    def load_clf(self, file_name):
        """Restore the three objects written by ``save_clf``.

        NOTE: unpickling executes arbitrary code; only load trusted files.
        """
        with open(file_name, 'rb') as source:
            self.vectorizer = pickle.load(source)
            self.classifier = pickle.load(source)
            self.encoder = pickle.load(source)
        self.trained = True

    def prepare_data(self, label_col, df: pd.DataFrame):
        """Binarize ``label_col`` and return an 80/20 train/test split."""
        targets = self.encoder.fit_transform(df[label_col])
        features = df.drop(columns=[label_col])
        return train_test_split(features, targets, test_size=0.2,
                                random_state=9)

    def train(self, X, Y):
        """Fit the vectorizer on ``X`` and the classifier on the result."""
        self.classifier.fit(self.vectorizer.fit_transform(X), Y)
        self.trained = True

    def predict_one(self, x) -> List[str]:
        """
        Given one movie as a single DataFrame, predict possible genres
        """
        self._require_trained()
        indicator = self.classifier.predict(self.vectorizer.transform(x))
        first_row_labels = self.encoder.inverse_transform(indicator)[0]
        return list(first_row_labels)

    def evaluate(self, X, Y):
        """Count samples with at least one correctly predicted label.

        Returns:
            ``(correct, incorrect)`` where a sample is correct when the
            prediction and ``Y`` share at least one positive column.
        """
        self._require_trained()
        Ypredict = self.classifier.predict(self.vectorizer.transform(X))
        # A cell equals 2 only where prediction and truth are both 1.
        overlap = np.count_nonzero(Ypredict + Y == 2, axis=1)
        correct = np.count_nonzero(overlap > 0)
        incorrect = overlap.shape[0] - correct
        return correct, incorrect
def get_label_strings_from_tensor(pred_labels_tensor):
    """Convert an indicator tensor into space-separated label strings.

    Args:
        pred_labels_tensor: Tensor with one row per sample; it is moved to the
            CPU and converted to a numpy array before decoding against
            ``LABEL_LIST``.

    Returns:
        A list with one string per row, the labels joined by single spaces.
    """
    binarizer = MultiLabelBinarizer(classes=LABEL_LIST)
    # fit() must be called before inverse_transform(); with an explicit
    # class list the argument is presumably ignored -- confirm against the
    # scikit-learn version in use.
    binarizer = binarizer.fit(None)

    indicator = pred_labels_tensor.cpu().numpy()
    label_tuples = binarizer.inverse_transform(indicator)
    return [" ".join(label_tuples[row]) for row in range(indicator.shape[0])]
class DataProcess(object):
    """Feature-processing facade over a handful of scikit-learn transformers.

    ``process_type`` selects the wrapped transformer:

    * ``"Binary"``          -> ``Binarizer`` (values above the threshold map
      to 1, the rest to 0)
    * ``"MinMax"``          -> ``MinMaxScaler`` (range 0..1)
    * ``"Stand"``           -> ``StandardScaler``
    * ``"Normal"``          -> ``Normalizer`` (norm may be l1, l2 or max)
    * ``"MultiLabelBinar"`` -> ``MultiLabelBinarizer`` (dense output; set
      ``sparse_output=True`` for a sparse matrix)

    Raises:
        ValueError: If ``process_type`` is none of the above.
    """

    def __init__(self, process_type):
        self.process_type = process_type
        if self.process_type == "Binary":  # binarization
            self.processmodule = Binarizer(copy=True, threshold=0.0)
        elif self.process_type == "MinMax":  # min-max scaling
            self.processmodule = MinMaxScaler(feature_range=(0, 1), copy=True)
        elif self.process_type == "Stand":  # standardization
            self.processmodule = StandardScaler(copy=True, with_mean=True,
                                                with_std=True)
        elif self.process_type == "Normal":  # per-sample normalization
            self.processmodule = Normalizer(copy=True, norm="l2")
        elif self.process_type == "MultiLabelBinar":  # multi-label binarization
            self.processmodule = MultiLabelBinarizer(sparse_output=False)
        else:
            raise ValueError("please select a correct process_type")

    def fit_transform(self, data):
        """Fit the wrapped transformer on ``data`` and return the result."""
        return self.processmodule.fit_transform(data)

    def fit(self, data):
        """Fit the wrapped transformer on ``data``."""
        self.processmodule.fit(data)

    def transform(self, data):
        """Return ``data`` transformed by the wrapped transformer.

        The original implementation dropped the result and returned ``None``.
        """
        return self.processmodule.transform(data)

    def set_params(self, params):
        """Apply the ``params`` dict to the wrapped transformer."""
        self.processmodule.set_params(**params)

    def get_params(self):
        """Return the wrapped transformer's parameters (deep)."""
        return self.processmodule.get_params(deep=True)

    def get_classes(self):
        """Return the distinct label values learned by the binarizer."""
        assert self.process_type in {"MultiLabelBinar"}
        return self.processmodule.classes_

    def invser_transform(self, data):
        """Map transformed ``data`` back to the original representation.

        The misspelled method name is kept for backward compatibility.
        """
        assert self.process_type in {"MultiLabelBinar", "MinMax", "Stand"}
        return self.processmodule.inverse_transform(data)

    def get_max(self):
        """Return the per-feature maximum seen during fitting.

        NOTE(review): ``"Stand"`` passes the assertion, but StandardScaler
        does not appear to expose ``data_max_`` -- confirm before relying on
        it for that type.
        """
        assert self.process_type in {"MinMax", "Stand"}
        return self.processmodule.data_max_

    def get_min(self):
        """Return the per-feature minimum seen during fitting.

        NOTE(review): same caveat as ``get_max`` for ``"Stand"``.
        """
        assert self.process_type in {"MinMax", "Stand"}
        return self.processmodule.data_min_

    def partial_fit(self, data=None):
        """Incrementally fit the scaler on one more batch of ``data``.

        Raises:
            TypeError: If no data is supplied.  The original passed nothing to
                the wrapped ``partial_fit`` and therefore could never succeed.
        """
        assert self.process_type in {"MinMax", "Stand"}
        if data is None:
            raise TypeError("partial_fit() requires the data batch to fit on")
        return self.processmodule.partial_fit(data)
def main():
    """Train a one-vs-rest LinearSVC tag classifier and report on the test set.

    Documents are selected with ``select_sets_by_tag``, vectorised with bag of
    words followed by TF-IDF, and the tags are binarized.  After printing a
    classification report, a per-class cutoff is taken at the top-30% rank of
    the decision-function scores and handed to ``print_predictions``.
    """
    sets = select_sets_by_tag(20, 4, tag_names)

    train_tags = fetch_tags(sets["train"])
    train_texts = id_to_filename(sets["train"])

    # Bag-of-words counts, then TF-IDF weighting.
    count_vect = CountVectorizer(stop_words='english',
                                 encoding="utf-16",
                                 input="filename")
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(
        count_vect.fit_transform(train_texts))

    # Tag sets -> indicator matrix.
    mlb = MultiLabelBinarizer()
    processed_train_tags = mlb.fit_transform(train_tags)

    clf = OneVsRestClassifier(LinearSVC())
    clf.fit(X_train_tfidf, processed_train_tags)
    print("classes:{}".format(clf.classes_))

    # Run the test documents through the already-fitted transformers.
    test_texts = id_to_filename(sets["test"])
    X_test_tfidf = tfidf_transformer.transform(count_vect.transform(test_texts))
    predicted_tags = clf.predict(X_test_tfidf)
    predicted_tags_readable = mlb.inverse_transform(predicted_tags)
    test_tags_actual = fetch_tags(sets["test"])
    # These are decision-function margins, despite the variable name.
    predicted_probs = clf.decision_function(X_test_tfidf)

    class_list = mlb.classes_
    print(metrics.classification_report(mlb.transform(test_tags_actual),
                                        predicted_tags,
                                        target_names=class_list))

    # Per class, the score found at the top-30% rank is used as the cutoff.
    top_percentage = 30
    n_test = len(sets["test"])
    threshold_index = int(n_test * (top_percentage / 100.0))
    threshold_vals_dic = {}
    threshold_vals = []
    for col, class_name in enumerate(class_list):
        ranked = sorted((predicted_probs[row, col] for row in range(n_test)),
                        reverse=True)
        cutoff = ranked[threshold_index]
        threshold_vals_dic[class_name] = cutoff
        threshold_vals.append(cutoff)
    print(threshold_vals_dic)

    print_predictions(sets["test"],
                      predicted_tags_readable,
                      class_list,
                      class_probablities=predicted_probs,
                      threshold_vals=threshold_vals)
class label_preprocess:
    """Encode a single label collection to an indicator list and back."""

    def __init__(self, list_):
        # `list_` is the collection of label collections the vocabulary is
        # learned from.
        self.mlb = MultiLabelBinarizer()
        self.mlb.fit(list_)

    def encode(self, list_):
        """Return the indicator row for one label collection as a list."""
        indicator_rows = self.mlb.transform([list_])
        return list(indicator_rows[0])

    def decode(self, list_):
        """Return the label tuple for one indicator row."""
        n_classes = len(self.mlb.classes_)
        row = np.array(list_).reshape(1, n_classes)
        return self.mlb.inverse_transform(row)[0]
class KaggleAmazonDataset(Dataset):
    """Image dataset backed by a CSV of ``image_name`` / ``tags`` columns.

    ``tags`` cells are whitespace-separated label strings; they are encoded
    into a float32 indicator matrix with a ``MultiLabelBinarizer``.

    Args:
        csv_path: Path of the CSV file.
        img_path: Prefix prepended to every image name.
        img_ext: Suffix appended to every image name.
        transform: Optional callable applied to each loaded RGB image.
    """

    def __init__(self, csv_path, img_path, img_ext, transform=None):
        tmp_df = pd.read_csv(csv_path)
        self.mlb = MultiLabelBinarizer()
        self.img_path = img_path
        self.img_ext = img_ext
        self.transform = transform
        # Image names and their encoded tag sets.
        self.X_train = tmp_df['image_name']
        self.y_train = self.mlb.fit_transform(
            tmp_df['tags'].str.split()).astype(np.float32)

    def __getitem__(self, index):
        """Return ``(image, label_tensor)`` for the sample at ``index``."""
        img = Image.open(self.img_path + self.X_train[index] + self.img_ext)
        img = img.convert('RGB')
        if self.transform is not None:
            img = self.transform(img)
        label = torch.from_numpy(self.y_train[index])
        return img, label

    def name(self):
        """Return the series of image names."""
        return self.X_train

    def __len__(self):
        return len(self.X_train.index)

    def splits(self, valx, valy):
        """Restrict the dataset in place to samples ``valx:valy``."""
        # Rebuilding the Series resets the index so positions start at 0.
        self.X_train = pd.Series(self.X_train.tolist()[valx:valy])
        self.y_train = self.y_train[valx:valy]

    def getLabelEncoder(self):
        """Return the fitted ``MultiLabelBinarizer``."""
        return self.mlb

    def numClasses(self):
        """Return the number of label columns."""
        return self.y_train.shape[1]

    def classesName(self):
        """Return all class names by decoding an all-ones indicator row.

        The row width follows the fitted encoder instead of the previously
        hard-coded 17, so the method also works for other label sets.
        """
        all_on = np.ones((1, len(self.mlb.classes_)), dtype=int)
        return self.mlb.inverse_transform(all_on)

    def set_transformation(self):
        """Install a random subset of ``TRANSFORMATIONS`` as the transform.

        The random subset is followed by tensor conversion and a fixed
        per-channel normalisation.
        """
        num_of_transf = randint(1, len(TRANSFORMATIONS))
        rand_transf = random.sample(TRANSFORMATIONS, k=num_of_transf)
        rand_transf.extend([
            transforms.ToTensor(),
            transforms.Normalize([0.311, 0.340, 0.299],
                                 [0.167, 0.144, 0.138])
        ])
        self.transform = transforms.Compose(rand_transf)
def test_training():
    """Train a one-vs-rest CatBoost tagger on pickled trip data.

    Loads ``save.p``; when it holds more than five rows, binarizes
    ``tag_array``, fits on a 67/33 split, prints the intermediate results and
    keeps the model only if its accuracy exceeds 0.85.

    Returns:
        A list with zero or one ``{"model", "tag_id", "accuracy"}`` dicts.
    """
    # NOTE: unpickling executes arbitrary code; "save.p" must be trusted.
    with open("save.p", "rb") as fh:
        trip_data = pickle.load(fh)

    models = list()
    if len(trip_data) > 5:
        mlb = MultiLabelBinarizer()
        y_raw = trip_data["tag_array"]
        mlb.fit(y_raw)
        y = mlb.transform(y_raw)
        X = trip_data[[
            'distance', 'start_long', 'start_lat', 'end_long', 'end_lat',
            'start_hour', 'end_hour', 'vehicleid', 'sample_weight',
            'vehicle_engine_capacity', 'vehicle_year'
        ]]
        # The original also computed an unused `len(y[y])`, which fancy-indexes
        # the indicator matrix with itself; it has been removed.

        # Split data into train and test sets.
        seed = 7
        test_size = 0.33
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=seed)
        print("Y_train")
        print(y_train)

        estimator = CatBoostClassifier(iterations=10,
                                       random_state=1,
                                       allow_const_label=True)
        model = OneVsRestClassifier(estimator=estimator)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        y_pred_transformed = mlb.inverse_transform(y_pred)
        y_test_transformed = mlb.inverse_transform(y_test)
        print("y_pred")
        print(y_pred_transformed)
        print("y_test")
        print(y_test_transformed)

        predictions = list(y_pred)
        print("predictions")
        print(predictions)

        accuracy = accuracy_score(y_test, predictions)
        print(f"accuracy {accuracy}")

        ACCURACY_THRESHOLD = 0.85
        if accuracy > ACCURACY_THRESHOLD:
            # NOTE(review): `tag_id` is not defined in this function; unless a
            # module-level `tag_id` exists this raises NameError -- confirm.
            models.append({
                "model": model,
                "tag_id": tag_id,
                "accuracy": accuracy
            })
    return models
def get_scores(clf, train_tags, train_labels, test_tags, test_labels,
               binarize=False, store_true=False):
    """Fit ``clf`` on the training data and predict labels for the test data.

    Args:
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        train_tags: Training inputs.
        train_labels: Training labels (label collections when ``binarize``).
        test_tags: Test inputs to predict on.
        test_labels: True labels of the test set; only copied into the result.
        binarize: When true, labels are encoded with a ``MultiLabelBinarizer``
            for fitting and predictions are decoded back to label tuples.
        store_true: Unused; kept for interface compatibility.

    Returns:
        ``(deep copy of test_labels, predictions)``.
    """
    if binarize:
        binarizer = MultiLabelBinarizer()
        clf.fit(train_tags, binarizer.fit_transform(train_labels))
        # Predict on the test *inputs*; the original passed `test_labels` to
        # predict(), feeding labels where features are expected.
        preds = binarizer.inverse_transform(clf.predict(test_tags))
    else:
        logging.info("Fitting model:")
        clf.fit(train_tags, train_labels)
        logging.info("Generating predictions:")
        preds = clf.predict(test_tags)
    return copy.deepcopy(test_labels), preds
def get_scores(clf, X_train, y_train, X_test, y_test,
               binarize=False, human_check=False, store_true=False):
    """Fit ``clf``, predict on ``X_test`` and return true and predicted labels.

    Args:
        clf: Estimator with ``fit`` and ``predict`` (and ``transform_labels``
            when ``store_true`` is set).
        X_train, y_train: Training inputs and labels.
        X_test, y_test: Test inputs and labels.
        binarize: Encode labels with a ``MultiLabelBinarizer`` for fitting and
            decode the predictions back into label tuples.
        human_check: Ask on stdin whether each (prediction, label) pair
            matches; answers are pickled to ``LABEL_DICT``.
        store_true: Write ``clf.transform_labels(y_test)`` line by line to
            ``/home/centos/sets/true_labels.txt``.

    Returns:
        ``(deep copy of y_test, predictions)``.  The hit/miss counters below
        are computed but not part of the return value.
    """
    if binarize:
        binarizer = MultiLabelBinarizer()
        clf.fit(X_train, binarizer.fit_transform(y_train))
        preds = binarizer.inverse_transform(clf.predict(X_test))
    else:
        clf.fit(X_train, y_train)
        preds = clf.predict(X_test)
    if store_true:
        labels = clf.transform_labels(y_test)
        with open('/home/centos/sets/true_labels.txt', 'w') as f:
            for label in labels:
                f.write(str(label) + '\n')
        # logging.info("Wrote true labels to ~/sets/true_labels.txt")
    hits = misses = predictions = 0
    # Loading previously saved answers from LABEL_DICT was disabled in the
    # original, so every run starts with an empty answer cache.
    pred_label_dict = {}
    for pred, label in zip(preds, y_test):
        if human_check:
            # Keep asking until this pair has a yes/no answer.
            # NOTE(review): the nesting of the prompt/persist steps is
            # reconstructed from a whitespace-collapsed source -- confirm
            # against the upstream file.
            while (pred, label) not in pred_label_dict:
                print("Does '{}' match the label '{}'? [Y/n]".format(pred, label))
                answer = input().lower()
                if answer == 'y':
                    pred_label_dict[(pred, label)] = True
                elif answer == 'n':
                    pred_label_dict[(pred, label)] = False
                else:
                    print("Please try again")
                # Persist the answers collected so far.
                with LABEL_DICT.open('wb') as f:
                    pickle.dump(pred_label_dict, f)
            if pred_label_dict[(pred, label)]:
                hits += 1
            else:
                misses += 1
        else:
            if pred == label:
                hits += 1
            else:
                misses += 1
        predictions += 1
    # logging.info("Preds:" + str(predictions))
    # logging.info("Hits:" + str(hits))
    # logging.info("Misses:" + str(misses))
    return copy.deepcopy(y_test), preds
def classify(self, features):
    """Predict genre names for ``features`` with the pickled random-forest model.

    Args:
        features: Feature rows passed straight to ``model.predict``.

    Returns:
        A list of genre names, one per predicted row, or
        ``["Unclassifiable"]`` when there are no predictions or any row
        decodes to an empty label tuple.
    """
    # NOTE(review): pickle runs arbitrary code while loading; this file must
    # come from a trusted source.
    model_path = "classification\\numerical\\random_forest\\model\\model.pickle"
    with open(model_path, 'rb') as model_file:  # close the handle after loading
        model = pickle.load(model_file)
    result = model.predict(features)

    # todo put this functionality into the common classifier template
    # The label set comes from fit() below; the former class-level
    # ``MultiLabelBinarizer.set_params(range(0, 16))`` call configured nothing
    # on this instance and was dropped.
    mlb = MultiLabelBinarizer()
    mlb.fit([range(0, 16)])

    genre_predictions_categorized = mlb.inverse_transform(result)
    if len(genre_predictions_categorized) == 0 or not all(genre_predictions_categorized):
        return ["Unclassifiable"]

    # Only the first label of each decoded tuple is used.
    # this needs checking for which value of the tuple is the actual value
    label_ids = [labels[0] for labels in genre_predictions_categorized]

    lm = LabelManipulator()
    # convert the ids to names
    return [lm.uncategorise_genre(label_id) for label_id in label_ids]
def get_data(directory, metadata, index_name):
    """Yield one index action per patch that received at least one label.

    Restores the model named in ``config.json``, runs it batch by batch over
    ``{directory}/record.tfrecord`` and decodes the predictions with a
    ``MultiLabelBinarizer`` built from ``config["labels"]``.

    Args:
        directory: Folder containing ``record.tfrecord`` and ``patches/``.
        metadata: Mapping copied into every yielded document.
        index_name: Currently unused; the target index is hard-coded.
            NOTE(review): presumably meant to replace "fyp-patches" - confirm.

    Yields:
        ``{"_index": "fyp-patches", "_source": data}`` where ``data`` holds
        the metadata plus ``labels`` and ``location``.
    """
    # NOTE(review): reads the module-level ``directory_path``, not the
    # ``directory`` argument - confirm this is intended.
    with open(f"{directory_path}/config.json", "r") as f:
        config = json.load(f)

    with tf.Session() as sess:
        iterator = BigEarthNet(f"{directory}/record.tfrecord", config["batch_size"],
                               1, 0, config["label_type"]).batch_iterator
        iterator_ins = iterator.get_next()

        model = importlib.import_module("models." + config["model_name"]).DNN_model(
            config["label_type"])
        model.create_network()

        variables_to_restore = tf.global_variables()
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        model_saver = tf.train.Saver(max_to_keep=0, var_list=variables_to_restore)
        model_file = environment["model_weights"]
        model_saver.restore(sess, model_file)

        graph = tf.get_default_graph()
        prediction = graph.get_tensor_by_name("Cast:0")

        # ``classes`` is passed by keyword: newer scikit-learn releases reject
        # it as a positional argument.
        mlb = MultiLabelBinarizer(classes=config["labels"])
        mlb.fit(config["labels"])

        num_patches = len(glob.glob(f"{directory}/patches/*"))
        for _ in range(math.ceil(num_patches / config["batch_size"])):
            try:
                batch_dict = sess.run(iterator_ins)
                sess_res = sess.run([prediction], feed_dict=model.feed_dict(batch_dict))
                results = mlb.inverse_transform(sess_res[0])
            except tf.errors.OutOfRangeError:
                # The iterator is exhausted. Stop instead of falling through
                # and re-emitting the previous batch (or hitting undefined
                # names when the very first read fails).
                break

            for index, patch in enumerate(batch_dict["patch_name"].values):
                if results[index]:
                    data = {}
                    data.update(metadata)
                    data["labels"] = results[index]
                    data["location"] = patch_location(directory, patch.decode("utf-8"))
                    yield {"_index": "fyp-patches", "_source": data}
def test_multilabelencoder(implementation):
    """Check that a fitted MultiLabelBinarizer survives save/load unchanged.

    The binarizer is stored and reloaded through ``implementation``; both the
    forward transform and the inverse transform of the reloaded object must
    equal those of the original.
    """
    name = 'testmulilabelencoder_me'
    sample = [('a', )]

    reference = MultiLabelBinarizer()
    reference.fit([('a', 'b'), ('c', )])
    implementation.save(reference, name)
    restored = implementation.load(name)

    got = restored.transform(sample)
    expected = reference.transform(sample)
    assert_array_equal(got, expected)

    # test inverse transform
    print(expected)
    inverse_expected = reference.inverse_transform(expected)
    print(got)
    inverse_got = restored.inverse_transform(got)
    assert_array_equal(inverse_got, inverse_expected)
class TagPredictor:
    """Multi-label tag predictor built on TF-IDF features.

    ``classifier`` is a zero-argument factory; the object it returns must
    offer ``train(X, Y)`` and ``predict(X)`` returning
    ``(indicator_matrix, confidences)``. ``corpus`` must be indexable by
    ``'Tags'`` and ``'Bag_of_Words'``.
    """

    # Class-level defaults; __init__ / train() replace them per instance.
    classifier = None
    model = None
    corpus = None

    def __init__(self, classifier, corpus):
        self.classifier, self.corpus = classifier, corpus
        # Seeds NumPy's global random generator.
        np.random.seed(500)
        print("Initialized TagPredictor")

    def train(self):
        """Fit the label binarizer, the TF-IDF vectorizer and the model."""
        print("Started training")
        corpus = self.corpus

        # Transform tags to multilabel format
        self.mlb = MultiLabelBinarizer()
        tag_matrix = self.mlb.fit_transform(corpus['Tags'])
        print(self.mlb.classes_)

        train_rows, test_rows, train_targets, _test_targets = train_test_split(
            corpus, tag_matrix, test_size=0.3, shuffle=True)

        # The vocabulary is learned from the whole corpus, not just the
        # training split.
        self.Tfidf_vect = TfidfVectorizer(max_features=5000)
        self.Tfidf_vect.fit(corpus['Bag_of_Words'])
        train_features = self.Tfidf_vect.transform(train_rows['Bag_of_Words'])
        # Held-out features are computed but not used any further here.
        self.Tfidf_vect.transform(test_rows['Bag_of_Words'])

        self.model = self.classifier()
        self.model.train(train_features, train_targets)
        print("Finished training")

    def predict(self, df):
        """Return ``(label_tuples, confidences)`` for the documents in ``df``."""
        features = self.Tfidf_vect.transform(df)
        matrix, confidence_list = self.model.predict(features)
        return self.mlb.inverse_transform(matrix), confidence_list
def checkTweet(text):
    """Predict the gender label for a tweet.

    On first use (no ``gendertext.pickle`` on disk) a bag-of-words
    one-vs-rest ``LinearSVC`` pipeline is trained on
    ``gender-classifier-DFE-791531.csv`` and pickled together with its
    ``MultiLabelBinarizer`` (``mlb.pickle``). Every call then loads both
    pickles and classifies ``text``.

    Args:
        text: Raw tweet text.

    Returns:
        The first predicted label, or the string ``"None"`` when the
        classifier assigns no label.
    """
    labels = []
    train = []
    des = []

    file = Path("gendertext.pickle")
    # ``!`` is not a Python operator; ``not`` is the negation.
    if not file.exists():
        # Retrieve text and labels for training
        with open("gender-classifier-DFE-791531.csv", encoding="latin-1") as f:
            for row in DictReader(f):
                labels.append([row["gender"]])
                train.append(row["text"])

        clean_text = []
        for i, raw in enumerate(train):
            clean_text.append(" ".join(SentenceTokeniser.review_to_wordlist(raw, True)))
            print(i)  # progress indicator

        mlb = MultiLabelBinarizer()
        Y = mlb.fit_transform(labels)
        with open('mlb.pickle', 'wb') as f:
            pickle.dump(mlb, f)

        classifier = Pipeline([
            ('vectorizer', CountVectorizer()),
            ('clf', OneVsRestClassifier(LinearSVC()))])
        # Fit the data using the classifier
        classifier.fit(clean_text, Y)

        # Save classifier as pickle file
        with open('gendertext.pickle', 'wb') as f:
            pickle.dump(classifier, f)

    des.append(" ".join(SentenceTokeniser.review_to_wordlist(text, True)))

    # Load classifier saved as pickle file
    with open('gendertext.pickle', 'rb') as f:
        classifier = pickle.load(f)

    # Predict class
    predicted = classifier.predict(des)
    with open('mlb.pickle', 'rb') as f:
        mlb = pickle.load(f)
    all_labels = mlb.inverse_transform(predicted)

    try:
        return all_labels[0][0]
    except IndexError:
        # No label predicted for the tweet.
        return "None"
def testClassifiers(X_train, y_train, X_test, y_test, multilabel):
    """Benchmark the enabled classifiers and return their results column-wise.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Test features and labels.
        multilabel: When true, labels are binarized with
            ``MultiLabelBinarizer``, each classifier is wrapped in
            ``OneVsRestClassifier`` and the predictions (third result column)
            are decoded back to label tuples.

    Returns:
        The transposed ``benchmark`` results: element ``i`` is the list of
        the ``i``-th value returned by ``benchmark`` for every classifier.
    """
    mlb = None
    if multilabel:
        mlb = MultiLabelBinarizer()
        mlb = mlb.fit(y_train)
        y_train = mlb.transform(y_train)
        y_test = mlb.transform(y_test)

    results = []
    # NOTE(review): ``n_iter`` is the pre-0.19 scikit-learn spelling of the
    # SGD iteration count - confirm against the installed version.
    for clf, clf_name in (
            #(RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
            #(Perceptron(n_iter=50), "Perceptron"),
            #(PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive"),
            #(KNeighborsClassifier(n_neighbors=10), "kNN"),
            #(KNeighborsClassifier(n_neighbors=8, algorithm='brute', metric='cosine'), "kNN cosine"),
            #(RandomForestClassifier(n_estimators=100), "Random forest"),
            #(LinearSVC(penalty="l2", dual=False, tol=1e-3), "Linear SVC [l2]"),
            #(LinearSVC(penalty="l1", dual=False, tol=1e-3), "Linear SVC [l1]"),
            #(SGDClassifier(alpha=.0001, n_iter=50, penalty="l2"), "SGD Classifier [l2]"),
            #(SGDClassifier(alpha=.0001, n_iter=50, penalty="l1"), "SGD Classifier [l1]"),
            (SGDClassifier(alpha=.001, n_iter=50, penalty="elasticnet"), "SGD Classifier [elasticnet]"),
            #(NearestCentroid(), "Nearest Centroid"), #not suitable for multilabel
            #(Pipeline([
            #    ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False, tol=1e-3))),
            #    ('classification', LinearSVC(penalty="l2"))]), "Linear SVC [l1 based features]")
    ):
        if multilabel:
            clf = OneVsRestClassifier(clf)
        results.append(
            benchmark(clf, clf_name, X_train, y_train, X_test, y_test, multilabel))

    # Transpose: one list per benchmark field instead of one per classifier.
    # ``range`` works on both Python 2 and 3 (``xrange`` is Python-2-only).
    results = [[x[i] for x in results] for i in range(len(results[0]))]

    if multilabel:
        preds = results[2]
        for i, pred in enumerate(preds):
            preds[i] = mlb.inverse_transform(pred)
    return results
class ACMClassificator(BaseACMClassificator):
    """Tag classifier for problem statements.

    Statements are turned into token counts, tags into an indicator matrix,
    and a one-vs-rest SVC is trained on the dense count matrix.
    """

    def __init__(self):
        self.vectorizer = CountVectorizer(min_df=0.05, max_df=0.45, tokenizer=tokenize)
        self.mlb = MultiLabelBinarizer()
        self.classificator = OneVsRestClassifier(SVC(), n_jobs=-1)

    def _prepare_problems(self, problems):
        """Return the count matrix for the statements of ``problems``."""
        statements = [problem.statement for problem in problems]
        return self.vectorizer.transform(statements)

    def fit(self, problems, tags):
        """Fit vectorizer, label binarizer and classifier on the given data."""
        nltk.download('punkt', quiet=True)
        statements = [problem.statement for problem in problems]
        self.vectorizer.fit(statements)
        counts = self._prepare_problems(problems)
        self.mlb = self.mlb.fit(tags)
        dense_counts = counts.toarray()
        targets = self.mlb.transform(tags)
        self.classificator.fit(dense_counts, targets)

    def predict(self, problems):
        """Return the predicted tag tuples for ``problems``."""
        dense_counts = self._prepare_problems(problems).toarray()
        indicator = self.classificator.predict(dense_counts)
        return self.mlb.inverse_transform(indicator)
def get_classify():
    """Train the text classifier and label every file in ``data_test/``.

    The training data comes from ``load_data()``. Each file under
    ``base_path + "data_test/"`` is read and classified; the file name and
    its predicted label tuple are written to ``result2.txt``. Finally the
    predictions, the number of distinct label tuples and the share of
    documents labelled exactly ``('1',)``, ``('2',)`` or ``('3',)`` are
    printed.
    """
    X_train, Y_train = load_data()

    # Define the classifier
    classifier = Pipeline([
        ('counter', CountVectorizer(tokenizer=jieba_tokenizer)),  # tokenize and count: feature extraction / vectorization
        ('tfidf', TfidfTransformer()),  # TF-IDF weighting
        ('clf', OneVsRestClassifier(LinearSVC())),  # one-vs-rest multi-class (multi-label)
    ])
    mlb = MultiLabelBinarizer()
    Y_train = mlb.fit_transform(Y_train)  # turn the class labels into numbers
    classifier.fit(X_train, Y_train)

    # Collect every test document into one list
    test_dir = base_path + "data_test/"
    test_list = []
    test_name = []
    for files in os.listdir(test_dir):
        test_name.append(files)
        # ``with`` closes each input file; they used to be left open.
        with open(test_dir + files, 'r') as f:
            test_list.append(f.read())

    prediction = classifier.predict(test_list)
    result = mlb.inverse_transform(prediction)

    # ``with`` guarantees the results are flushed and the file is closed.
    with open('result2.txt', 'w') as f:
        for name, labels in zip(test_name, result):
            f.write(str(name) + ' ' + str(labels) + '\n')

    print(result, len(result))
    num_dict = Counter(result)
    print(len(num_dict))
    # Integer / integer truncates on Python 2, so one operand is a float.
    print((num_dict[('1',)] + num_dict[('2',)] + num_dict[('3',)]) / float(len(result)))
import pandas as pd

# Directory holding the per-algorithm "merged_*.csv" submissions.
data_root = "/Users/erdicalli/dev/workspace/yelp/submission/submissions/"

mlb = MultiLabelBinarizer()

# One binarized label matrix per submission file, in `output_file_names` order.
# The binarizer is re-fit on every file, so the last file determines the
# classes used by inverse_transform below.
total_labels = []
for file_stem in output_file_names:
    submission = pd.read_csv(data_root + "merged_" + file_stem + ".csv")
    # Spaces are removed and every remaining character is treated as a label.
    per_row = [list(entry.replace(" ", "")) for entry in submission["labels"]]
    total_labels.append(mlb.fit_transform(np.array(per_row)))

# Column k of the combined matrix is taken from the algorithm combination[k].
result_labels = np.ndarray(shape=(10000, 9))
for column, source in enumerate(combination):
    result_labels[:, column] = total_labels[source][:, column]
labels = mlb.inverse_transform(result_labels)

# Business ids are read from the fifth submission file.
test_data_frame = pd.read_csv(data_root + "merged_" + output_file_names[4] + ".csv")
df = pd.DataFrame(columns=['business_id', 'labels'])
for i in range(len(test_data_frame)):
    biz = test_data_frame.loc[i]['business_id']
    # str(tuple)[1:-1] drops the surrounding parentheses of the label tuple.
    df.loc[i] = [str(biz), str(labels[i])[1:-1].replace(",", " ")]

with open(data_root + "combined_results.csv", 'w') as f:
    df.to_csv(f, index=False)
# Cross-validated category classification (fragment).
# Binarizes the category labels, then for each of 5 shuffled folds trains a
# one-vs-rest logistic regression on the dense TF-IDF rows and inspects the
# set of categories that were predicted for the held-out rows.
# NOTE(review): this snippet is truncated and its original indentation was
# lost; the nesting below is the most plausible reading - confirm.
y_map_cate = ml_cate.fit_transform(y_cate)
y_map_cate = np.array(y_map_cate)
f_scores = []
for loop_stat in range(0,1):
    scores = []
    # Accumulated true / predicted labels for the final report.
    report_y_actual = []
    report_y_predict = []
    kf = cross_validation.KFold(tfidf_train.shape[0], n_folds=5, shuffle=True)
    loop = 0
    for train_index, test_index in kf:
        # Dense feature rows of the current fold.
        x_train, x_test = tfidf_train[train_index].toarray(), tfidf_train[test_index].toarray()
        # Indicator rows for categories (ml_cate) and codes (ml) ...
        y_train_cate_map, y_test_cate_map = y_map_cate[train_index], y_map_cate[test_index]
        y_train_code_map,y_test_code_map = y_map[train_index], y_map[test_index]
        # ... and the same rows decoded back to label tuples.
        y_train_code, y_test_code = np.array(ml.inverse_transform(y_train_code_map)),np.array(ml.inverse_transform(y_test_code_map))
        y_train_cate,y_test_cate = np.array(ml_cate.inverse_transform(y_train_cate_map)),np.array(ml_cate.inverse_transform(y_test_cate_map))
        # classify the category
        model_cate = OneVsRestClassifier(LogisticRegression())
        model_cate.fit(x_train, y_train_cate_map)
        y_predict_cate_map = model_cate.predict(x_test)
        y_predict_cate = np.array(ml_cate.inverse_transform(y_predict_cate_map))
        # Union of every category predicted for any held-out row.
        y_predict_cate_unique = reduce(lambda a,b:set(a)|set(b) ,y_predict_cate)
        for cate_cur in y_predict_cate_unique:
            if cate_cur not in defaultcode:
                y_text_new,y_predict_new = transfer_multilabel(y_predict_cate_map,y_test_cate_map,ml_cate,None,"0")
                report_y_predict.extend(y_predict_new)
                report_y_actual.extend(y_text_new)
            else:
                continue
            # Positions of the held-out rows predicted to contain cate_cur.
            idx_test_cur = [ind for ind in range(0,len(y_predict_cate)) if cate_cur in y_predict_cate[ind]]
def run(options):
    """Run one multi-label classification experiment described by ``options``.

    Steps, in order:
      1. Load the persisted ``(X, Y, tr)`` triple, or load the raw dataset,
         binarize the labels and extract features (terms, concepts, an
         optional hierarchical activation or a graph vectorizer).
      2. Optionally print corpus statistics.
      3. In interactive mode: fit on everything, label lines read from
         stdin and exit.
      4. Otherwise evaluate with k-fold cross-validation or one shuffled
         split, collecting sample/micro/macro precision, recall and F1.

    Args:
        options: Parsed options object; only its attributes are read.
            ``options.key_file`` must be an open file containing JSON.

    Returns:
        Dict mapping each score name to a ``(mean, std)`` tuple over folds.

    NOTE(review): when the persisted data is used, ``mlb``, ``extractor``,
    ``X_raw`` and ``Y_raw`` are never assigned in this function, yet the
    interactive mode and the ``options.worst`` report read them - confirm.
    """
    DATA_PATHS = json.load(options.key_file)
    VERBOSE = options.verbose
    persister = Persister(DATA_PATHS, options)
    if options.persist and persister.is_saved():
        X, Y, tr = persister.read()
        if VERBOSE: print("Y = " + str(Y.shape))
    else:
        # --- LOAD DATA ---
        X_raw, Y_raw, tr = load_dataset(DATA_PATHS, options.data_key, options.fulltext)
        if options.toy_size < 1:
            # Keep only a random fraction of the documents.
            if VERBOSE: print("Just toying with %d%% of the data." % (options.toy_size * 100))
            zipped = list(zip(X_raw, Y_raw))
            random.shuffle(zipped)
            X_raw, Y_raw = zip(*zipped)
            toy_slice = int(options.toy_size * len(X_raw))
            X_raw, Y_raw = X_raw[:toy_slice], Y_raw[:toy_slice]

        if options.verbose: print("Binarizing labels...")
        # For the hierarchical F1 the classes are fixed to the node names
        # ordered by their index; otherwise they are learned from Y_raw.
        mlb = MultiLabelBinarizer(sparse_output=True, classes=[i[1] for i in sorted(
            tr.index_nodename.items())] if options.hierarch_f1 else None)
        Y = mlb.fit_transform(Y_raw)
        if VERBOSE: print("Y = " + str(Y.shape))

        # --- EXTRACT FEATURES ---
        input_format = 'filename' if options.fulltext else 'content'
        concept_analyzer = SynsetAnalyzer().analyze if options.synsets \
            else ConceptAnalyzer(tr.thesaurus, input=input_format, persist=options.persist and options.concepts,
                                 persist_dir=options.persist_to, repersist=options.repersist,
                                 file_path=DATA_PATHS[options.data_key]['X']).analyze
        terms = CountVectorizer(input=input_format, stop_words='english', binary=options.binary,
                                token_pattern=word_regexp)
        concepts = CountVectorizer(input=input_format, analyzer=concept_analyzer, binary=options.binary,
                                   vocabulary=tr.nodename_index if not options.synsets else None)

        if options.hierarchical:
            hierarchy = tr.nx_graph
            if options.prune_tree:
                # Replace the graph by its BFS tree rooted at tr.nx_root.
                if VERBOSE: print("[Pruning] Asserting tree hierarchy...")
                old_edge_count = hierarchy.number_of_edges()
                hierarchy = nx.bfs_tree(hierarchy, tr.nx_root)
                pruned = old_edge_count - hierarchy.number_of_edges()
                if VERBOSE: print("[Pruning] Pruned %d of %d edges (%.2f) to assert a tree hierarchy" % (pruned, old_edge_count, pruned/old_edge_count))

            # Pick the activation that is chained after the concept counts.
            if options.hierarchical == "bell":
                activation = SpreadingActivation(hierarchy, decay=1, weighting="bell", root=tr.nx_root)
            elif options.hierarchical == "belllog":
                activation = SpreadingActivation(hierarchy, decay=1, weighting="belllog", root=tr.nx_root)
            elif options.hierarchical == "children":
                # weights are already initialized with 1/out_degree, so use basic SA with decay 1
                activation = SpreadingActivation(hierarchy, decay=1, weighting="children")
            elif options.hierarchical == "binary":
                activation = BinarySA(hierarchy)
            elif options.hierarchical == "onehop":
                activation = OneHopActivation(hierarchy, verbose=VERBOSE)
            else:
                # basic
                # Uses the unpruned tr.nx_graph, not ``hierarchy``.
                activation = SpreadingActivation(tr.nx_graph, firing_threshold=1.0, decay=0.25, weighting=None)
            concepts = make_pipeline(concepts, activation)

        # Choose the feature extractor; graph scoring takes precedence.
        if options.graph_scoring_method:
            extractor = GraphVectorizer(method=options.graph_scoring_method, analyzer=concept_analyzer
                                        if options.concepts else NltkNormalizer().split_and_normalize)
        elif options.terms and (options.concepts or options.synsets):
            extractor = FeatureUnion([("terms", terms), ("concepts", concepts)])
        elif options.terms:
            extractor = terms
        else:
            extractor = concepts

        if VERBOSE: print("Extracting features...")
        if VERBOSE > 1: start_ef = default_timer()
        X = extractor.fit_transform(X_raw)
        if VERBOSE > 1: print(default_timer() - start_ef)
        if options.persist:
            persister.persist(X, Y, tr)

    if VERBOSE:
        print("X = " + repr(X))
        print("Vocabulary size: {}".format(X.shape[1]))
        print("Number of documents: {}".format(X.shape[0]))
        print("Mean distinct words per document: {}".format(X.count_nonzero() / X.shape[0]))
        words = X.sum(axis=1)
        print("Mean word count per document: {} ({})".format(words.mean(), words.std()))

    if VERBOSE > 1:
        X_tmp = X.todense()
        # drop samples without any features...
        X_tmp = X_tmp[np.unique(np.nonzero(X_tmp)[0])]
        print("[entropy] Dropped {} samples with all zeroes?!".format(X.shape[0] - X_tmp.shape[0]))
        X_tmp = X_tmp.T  # transpose to compute entropy per sample
        h = entropy(X_tmp)
        print("[entropy] shape:", h.shape)
        print("[entropy] mean entropy per sample {} ({})".format(h.mean(), h.std()))
        # print("Mean entropy (base {}): {}".format(X_dense.shape[0], entropy(X_dense, base=X_dense.shape[0]).mean()))
        # print("Mean entropy (base e): {}".format(entropy(X_dense).mean()))
    # _, _, values = sp.find(X)
    # print("Mean value: %.2f (+/- %.2f) " % (values.mean(), 2 * values.std()))

    # n_iter = np.ceil(10**6 / (X.shape[0] * 0.9))
    # print("Dynamic n_iter = %d" % n_iter)

    if options.interactive:
        print("Please wait...")
        clf = create_classifier(options, Y.shape[1])  # --- INTERACTIVE MODE ---
        clf.fit(X, Y)
        thesaurus = tr.thesaurus
        print("Ready.")
        try:
            # One document per stdin line; print its predicted preferred labels.
            for line in sys.stdin:
                x = extractor.transform([line])
                y = clf.predict(x)
                desc_ids = mlb.inverse_transform(y)[0]
                labels = [thesaurus[desc_id]['prefLabel'] for desc_id in desc_ids]
                print(*labels)
        except KeyboardInterrupt:
            exit(1)
        # Interactive mode never reaches the evaluation below.
        exit(0)

    if VERBOSE: print("Performing %d-fold cross-validation..." % (options.folds if options.cross_validation else 1))

    if options.plot:
        all_f1s = []

    # --- CROSS-VALIDATION ---
    scores = defaultdict(list)
    # NOTE(review): both splitters are built with the sample count and then
    # iterated directly - confirm this matches the installed scikit-learn API.
    if options.cross_validation:
        kf = model_selection.KFold(X.shape[0], n_folds=options.folds, shuffle=True)
    else:
        kf = ShuffleSplit(X.shape[0], test_size=options.test_size, n_iter=1)
    for train, test in kf:
        if VERBOSE: print("=" * 80)
        X_train, X_test, Y_train, Y_test = X[train], X[test], Y[train], Y[test]

        # mlp doesn't seem to like being stuck into a new process...
        if options.debug or options.clf_key in {'mlp', 'mlpthr'}:
            Y_pred, Y_train_pred = fit_predict(X_test, X_train, Y_train, options, tr)
        else:
            Y_pred, Y_train_pred = fit_predict_new_process(X_test, X_train, Y_train, options, tr)

        if options.training_error:
            scores['train_f1_samples'].append(f1_score(Y_train, Y_train_pred, average='samples'))

        # Average number of non-zero entries per row (labels per document).
        scores['avg_n_labels_pred'].append(np.mean(Y_pred.getnnz(1)))
        scores['avg_n_labels_gold'].append(np.mean(Y_test.getnnz(1)))
        scores['f1_samples'].append(f1_score(Y_test, Y_pred, average='samples'))
        scores['p_samples'].append(precision_score(Y_test, Y_pred, average='samples'))
        scores['r_samples'].append(recall_score(Y_test, Y_pred, average='samples'))
        scores['f1_micro'].append(f1_score(Y_test, Y_pred, average='micro'))
        scores['p_micro'].append(precision_score(Y_test, Y_pred, average='micro'))
        scores['r_micro'].append(recall_score(Y_test, Y_pred, average='micro'))
        scores['f1_macro'].append(f1_score(Y_test, Y_pred, average='macro'))
        scores['p_macro'].append(precision_score(Y_test, Y_pred, average='macro'))
        scores['r_macro'].append(recall_score(Y_test, Y_pred, average='macro'))
        if options.plot:
            all_f1s.append(f1_per_sample(Y_test, Y_pred))

        if options.worst:
            # Print the ``options.worst`` documents with the lowest F1,
            # together with their gold and predicted preferred labels.
            f1s = f1_per_sample(Y_test, Y_pred)
            predicted_labels = [[tr.thesaurus[l]['prefLabel'] for l in y] for y in mlb.inverse_transform(Y_pred)]
            f1s_ids = sorted(zip(f1s, [X_raw[i] for i in test],
                                 [[tr.thesaurus[l]['prefLabel'] for l in Y_raw[i]] for i in test], predicted_labels))
            pprint(f1s_ids[:options.worst])

        if options.hierarch_f1:
            scores['hierarchical_f_score'].append(
                hierarchical_f_measure(tr, Y_test, Y_pred))

        if options.cross_validation and VERBOSE:
            # Scores of the fold that just finished.
            print(' <> '.join(["%s : %0.3f" % (key, values[-1]) for key, values in sorted(scores.items())]))
            # if options.lsa:
            #     if VERBOSE: print("Variance explained by SVD:", svd.explained_variance_ratio_.sum())

    if VERBOSE: print("=" * 80)

    # Mean and standard deviation of every score across the folds.
    results = {key: (np.array(values).mean(), np.array(values).std()) for key, values in scores.items()}

    print(' <> '.join(["%s: %0.3f (+/- %0.3f)" % (key, mean, std) for key, (mean, std) in sorted(results.items())]))

    if options.output_file:
        write_to_csv(results, options)

    if options.plot:
        # Sorted per-document F1 values of all folds, saved as text.
        Y_f1 = np.hstack(all_f1s)
        Y_f1.sort()
        if VERBOSE:
            print("Y_f1.shape:", Y_f1.shape, file=sys.stderr)
            print("Saving f1 per document as txt numpy to", options.plot)
        np.savetxt(options.plot, Y_f1)

    return results
printF1scores() t = time.time() binarizer = MultiLabelBinarizer() #labels list is converted to binary matrix y_train= binarizer.fit_transform(y_train) random_state = np.random.RandomState(0) svmclassifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True)) svmclassifier.fit(X_train, y_train) y_predict = svmclassifier.predict(X_test) #Binary matrix is converted to labels y_predict_label = binarizer.inverse_transform(y_predict) print "Elaspsed Time: ", "{0:.1f}".format(time.time()-t), "sec" tdf = pd.read_csv(path_to_data+"test_biz_fc8features.csv") df = pd.DataFrame(columns=['business_id','labels']) for i in range(len(tdf)): biz = tdf.loc[i]['business'] label = y_predict_label[i] label = str(label)[1:-1].replace(",", " ") df.loc[i] = [str(biz), label] with open(path_to_data+"submission_fc8.csv",'w') as file67: df.to_csv(file67, index=False)
# Bag-of-words -> TF-IDF -> one-vs-rest naive Bayes on `stories` / `tags`,
# followed by a prediction for three sample documents.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB

# Features: token counts re-weighted by TF-IDF.
count_vect = CountVectorizer()
train_counts = count_vect.fit_transform(stories)
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(train_counts)
X_train_tfidf = tfidf_transformer.transform(train_counts)

# Format tags: one indicator column per tag.
mlb = MultiLabelBinarizer()
tag_list = preprocess_tags(tags)
processed_tags = mlb.fit_transform(tag_list)
print(processed_tags)

# Train the classifier.
clf = OneVsRestClassifier(MultinomialNB())
clf.fit(X_train_tfidf, processed_tags)

# Classify the sample documents with the already fitted transformers.
test_docs = [
    "funny funny joke",
    "died sad joke tragedy funny",
    "lasers and robots",
]
X_test_counts = count_vect.transform(test_docs)
print("X_test_counts.shape")
print(X_test_counts.shape)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
predicted = clf.predict(X_test_tfidf)
print(predicted)
print(mlb.inverse_transform(predicted))
##################

# In[ ]:

# Fit on the training features and predict the labels of the test businesses.
classifier.fit(train_business_feature, y_ptrain_mlb)
test_business_feature = pd.read_csv(data_root+'test_business_feature'+cluster +'.csv')
# ``Series.reshape`` no longer exists in pandas; reshape the underlying array.
business_id = test_business_feature['business_id'].values.reshape(-1,1)
# The id column is not a feature.
test_business_feature.drop('business_id', axis=1, inplace=True)
y_predict_test = classifier.predict(test_business_feature)

# In[ ]:

y_predict_label = mlb.inverse_transform(y_predict_test)
# Build all rows first: appending with ``df.loc[i] = ...`` row by row is quadratic.
# str(tuple)[1:-1] drops the parentheses of the label tuple; commas become spaces.
rows = [[str(business_id[i][0]), str(label)[1:-1].replace(",", " ")]
        for i, label in enumerate(y_predict_label)]
df = pd.DataFrame(rows, columns=['business_id','labels'])

with open(data_root+"sub_pca300.csv",'w') as f:
    df.to_csv(f, index=False)
X_train_scaled_Concat = np.hstack((X_train_scaled,X_train_scaled_Res)) X_test_scaled = preprocessing.normalize(X_test, norm='l2') X_test_scaled_Res = preprocessing.normalize(X_test_Res, norm='l2') X_test_scaled_Concat = np.hstack((X_test_scaled,X_test_scaled_Res)) mlb = MultiLabelBinarizer() y_train= mlb.fit_transform(y_train) #Convert list of labels to binary matrix random_state = np.random.RandomState(0) classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True)) classifier.fit(X_train_scaled_Concat, y_train) y_predict = classifier.predict(X_test_scaled_Concat) #print list(mlb.classes_) y_predict_label = mlb.inverse_transform(y_predict) #Convert binary matrix back to labels print "Time passed: ", "{0:.1f}".format(time.time()-t), "sec" test_data_frame = pd.read_csv(data_root+"test_biz_fc7features.csv") #fc7features and fc1000features have same business names df = pd.DataFrame(columns=['business_id','labels']) for i in range(len(test_data_frame)): biz = test_data_frame.loc[i]['business'] label = y_predict_label[i] label = str(label)[1:-1].replace(",", " ") df.loc[i] = [str(biz), label] with open(data_root+"submission_fc7_fc1000_norm.csv",'w') as f: df.to_csv(f, index=False)
# Hold out 20% of the training data, fit a one-vs-rest gradient boosting
# classifier on the rest and report its predictions and F1 scores.
X_ptrain, X_ptest, y_ptrain, y_ptest = train_test_split(X_train, y_ptrain, test_size=.2,
                                                        random_state=random_state)

print("About to start training classifier with set parameters on subset of train data")
classifier = OneVsRestClassifier(GradientBoostingClassifier(learning_rate=0.01, n_estimators=5000,
                                                            subsample=0.5, min_samples_split=175,
                                                            min_samples_leaf=10, max_depth=5,
                                                            max_features='sqrt', verbose=1,
                                                            random_state=SEED))
classifier.fit(X_ptrain, y_ptrain)

print("About to make predictions on sample of training data")
y_ppredict = classifier.predict(X_ptest)

print("Time passed: {0:.1f} sec".format(time.time() - t))
print("Samples of predicted labels (in binary matrix):\n{}".format(y_ppredict[0:3]))
print("\nSamples of predicted labels:\n", mlb.inverse_transform(y_ppredict[0:3]))

# Per-attribute count and percentage of held-out businesses predicted positive.
statistics = pd.DataFrame(columns=["attribute " + str(i) for i in range(9)] + ['num_biz'],
                          index=["biz count", "biz ratio"])
# The table used to be printed without ever being filled in; populate both
# rows before printing.
statistics.loc["biz count"] = list(y_ppredict.sum(axis=0)) + [len(y_ppredict)]
pd.options.display.float_format = '{:.0f}%'.format
statistics.loc["biz ratio"] = statistics.loc["biz count"] * 100 / len(y_ppredict)
print(statistics)

print("F1 score: {}".format(f1_score(y_ptest, y_ppredict, average='micro')))
print("Individual Class F1 score: {}".format(f1_score(y_ptest, y_ppredict, average=None)))

# Re-Train classifier using all training data, and make predictions on test set
t = time.time()
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(y_train)  # Convert list of labels to binary matrix
print("About to train classifier on all training data (to have it ready to predict on submission test data)")
import time t=time.time() mlb = MultiLabelBinarizer() y_ptrain= mlb.fit_transform(y_train) #Convert list of labels to binary matrix random_state = np.random.RandomState(0) X_ptrain, X_ptest, y_ptrain, y_ptest = train_test_split(X_train, y_ptrain, test_size=.2,random_state=random_state) classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True)) #F1 score: 0.803711220644 #classifier = OneVsOneClassifier(svm.SVC(kernel='linear', probability=True)) #classifier = OutputCodeClassifier(svm.SVC(kernel='linear', probability=True)) classifier.fit(X_ptrain, y_ptrain) y_ppredict = classifier.predict(X_ptest) print "Time passed: ", "{0:.1f}".format(time.time()-t), "sec" print "Samples of predicted labels (in binary matrix):\n", y_ppredict[0:3] print "\nSamples of predicted labels:\n", mlb.inverse_transform(y_ppredict[0:3]) statistics = pd.DataFrame(columns=[ "attribuite "+str(i) for i in range(9)]+['num_biz'], index = ["biz count", "biz ratio"]) statistics.loc["biz count"] = np.append(np.sum(y_ppredict, axis=0), len(y_ppredict)) pd.options.display.float_format = '{:.0f}%'.format statistics.loc["biz ratio"] = statistics.loc["biz count"]*100/len(y_ppredict) statistics from sklearn.metrics import f1_score print "F1 score: ", f1_score(y_ptest, y_ppredict, average='micro') print "Individual Class F1 score: ", f1_score(y_ptest, y_ppredict, average=None)
# Predict labels for the test set chunk by chunk (Python 2 script fragment).
# NOTE(review): the snippet appears to end inside the loop - whatever follows
# the ``if header:`` branch is not visible here.
print "Calculating Predictions..."
# Names of the chunk files read from data_root.
files = ["xaa", "xab", "xac", "xad", "xae", "xaf"]
header = True
for chunk in files:
    t = time.time()
    print "chunk: " + chunk
    test_df = pd.read_csv(data_root + chunk)
    # test_features = test_df['feature vector'].values
    test_features = np.array([convert_feature_to_vector(x) for x in test_df['feature vector']])
    # Normalize the first 8192 columns and the remaining columns separately,
    # join them again and normalize the combined rows.
    test_features = normalize(np.append(normalize(test_features[:, :8192]), normalize(test_features[:, 8192:]), axis=1))
    # presumably a fitted dimensionality reduction - verify where ``model`` is built
    reduced_test_features = model.transform(test_features)
    binarized_predicted_labels = classifier.predict(reduced_test_features)
    predicted_labels = mlb.inverse_transform(binarized_predicted_labels)
    print "Calculated Predictions... Time passed: ", "{0:.1f}".format(time.time() - t), "sec"
    print "Writing predictions to output file"
    # The same chunk file is read a second time, for the business names.
    test_data_frame = pd.read_csv(data_root + chunk)
    df = pd.DataFrame(columns=['business_id', 'labels'])
    for i in range(len(test_data_frame)):
        biz = test_data_frame.loc[i]['business']
        label = predicted_labels[i]
        # str(tuple)[1:-1] drops the parentheses; commas become spaces.
        label = str(label)[1:-1].replace(",", " ")
        df.loc[i] = [str(biz), label]
    if header:
        # Opens the output in 'w' mode and writes it with the header row.
        with open(submission_root + "reduced_" + output_file_name + ".csv", 'w') as f:
            df.to_csv(f, index=False, header=header)
# ('tfidf', TfidfTransformer()), # ('to_dense', DenseTransformer()), # ('clf', OneVsRestClassifier(tree.DecisionTreeClassifier()))]) print '7th print' gc.collect() classifier.fit(X_train, Y) print '8th print' predicted = classifier.predict(X_test) predicted_probability = classifier.predict_proba(X_test) all_labels = mlb.inverse_transform(predicted) results = classifier.predict_proba(X_test)[0] # gets a dictionary of {'class_name': probability} prob_per_class_dictionary = dict(zip(all_labels, results)) # gets a list of ['most_probable_class', 'second_most_probable_class', ..., 'least_class'] results_ordered_by_probability = map(lambda x: x[0], sorted(zip(all_labels, results), key=lambda x: x[1], reverse=True)) print results_ordered_by_probability # for item, labels, probability in zip(X_test, all_labels,predicted_probability): # #print '%s => %s, %s' % (item, ', '.join(labels),str(probability)) # output_file_object.write('%s => %s, %s' % (item, ', '.join(labels),str(probability))+'\n')