def load_custom_dataset(dataset_name, path=None):
    if dataset_name == "20ng":
        arff_path = path if path else "./datasets/20NG-F.arff"
        n_labels = 20
        label_location = "start"
        arff_file_is_sparse = False
        x_mulan, y_mulan, feature_names, label_names = load_from_arff(
            arff_path,
            n_labels,
            label_location=label_location,
            load_sparse=arff_file_is_sparse,
            return_attribute_definitions=True)
        return x_mulan, y_mulan, feature_names, label_names
    if dataset_name == "test":
        arff_path = path if path else "./datasets/test.arff"
        n_labels = 5
        label_location = "end"
        arff_file_is_sparse = False
        x, y, feature_names, label_names = load_from_arff(
            arff_path,
            n_labels,
            label_location=label_location,
            load_sparse=arff_file_is_sparse,
            return_attribute_definitions=True)
        return x, y, feature_names, label_names
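# A minimal usage sketch for load_custom_dataset above, assuming scikit-multilearn
# is installed with a load_from_arff that accepts a label count and label_location,
# and that ./datasets/20NG-F.arff exists as in the snippet; the printed inspection
# lines are illustrative only.
from skmultilearn.dataset import load_from_arff

X, y, feature_names, label_names = load_custom_dataset("20ng")
print(X.shape, y.shape)    # scipy sparse matrices: (n_samples, n_features), (n_samples, 20)
print(label_names[:3])     # attribute definitions for the first few labels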
def load_dataset(self, which_set):
    if self.verbose:
        sys.stdout.write("Reading data...")
    if which_set not in {'train', 'test', 'full'}:
        raise ValueError('Unrecognized `which_set` value "%s". ' % (which_set, ) +
                         'Valid values are ["train", "test", "full"].')
    # datapath = os.path.join(self.datadir, which_set + '.pkl.gz')
    # dataset = pkl.load(gzip.open(datapath))
    datapath = os.path.join(self.datadir, which_set + '.arff')
    dataset = load_from_arff(
        datapath,
        # number of labels
        labelcount=self.n_labels,
        # MULAN format, labels at the end of rows in arff data
        endian='little',
        # bag of words
        input_feature_type='int',
        encode_nominal=False,
        # sometimes the sparse ARFF loader is borked, like in delicious,
        # scikit-multilearn converts the loaded data to sparse representations,
        # so disabling the liac-arff sparse loader
        load_sparse=False,
        # this decides whether to return attribute names or not, usually
        # you don't need this
        return_attribute_definitions=False)
    if self.verbose:
        sys.stdout.write("Done.\n")
    return dataset
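# A hypothetical harness for the load_dataset(self, which_set) method above. The
# ArffDatasetLoader class, the ./data/delicious directory layout (train.arff /
# test.arff inside datadir) and the 983-label count are illustrative assumptions;
# it also assumes a scikit-multilearn version whose load_from_arff still accepts
# the labelcount/endian keywords used above.
import os
import sys
from skmultilearn.dataset import load_from_arff


class ArffDatasetLoader:
    def __init__(self, datadir, n_labels, verbose=True):
        self.datadir = datadir
        self.n_labels = n_labels
        self.verbose = verbose


loader = ArffDatasetLoader("./data/delicious", n_labels=983)
# load_dataset is a plain function here, so the loader object is passed explicitly as `self`
X_train, y_train = load_dataset(loader, "train")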
def prepare_medical_dataset(filename):
    data = load_from_arff(filename,
                          label_count=45,
                          load_sparse=False,
                          return_attribute_definitions=True)
    cols_X = [i[0] for i in data[2]]
    cols_Y = [i[0] for i in data[3]]
    X_med_df = pd.DataFrame(data[0].todense(), columns=cols_X)
    y_med_df = pd.DataFrame(data[1].todense(), columns=cols_Y)
    df = pd.concat([X_med_df, y_med_df], axis=1)
    return df, cols_Y
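# A short usage sketch for prepare_medical_dataset above. The path to the Medical
# dataset ARFF file is a placeholder; the 45-label count comes from the snippet,
# and pandas plus a scikit-multilearn version with the label_count keyword are assumed.
import pandas as pd
from skmultilearn.dataset import load_from_arff

df, label_cols = prepare_medical_dataset("./datasets/medical.arff")
print(df.shape)              # features and labels concatenated column-wise
print(df[label_cols].sum())  # positive count per label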
def load_dataset(dataset_name):
    logger = logging.getLogger()
    path_train = '../data/mulan/{}/{}-train.arff'.format(dataset_name, dataset_name)
    path_test = '../data/mulan/{}/{}-test.arff'.format(dataset_name, dataset_name)
    if (not path.exists(path_train)) or (not path.exists(path_test)):
        logger.debug('data set "{}" not found.'.format(dataset_name))
        return None, None, None, None
    X_train, y_train = dataset.load_from_arff(path_train,
                                              labelcount=14,
                                              endian="little")
    logger.debug(X_train)
    X_test, y_test = dataset.load_from_arff(path_test,
                                            labelcount=14,
                                            endian="little")
    return X_train.toarray(), y_train.toarray(), X_test.toarray(), y_test.toarray()
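# A usage sketch for the MULAN-style loader above. It assumes the
# ../data/mulan/yeast/yeast-{train,test}.arff layout implied by the format
# strings, that labelcount=14 matches the chosen dataset (yeast has 14 labels),
# and an older scikit-multilearn API exposing the labelcount/endian keywords.
import logging
from os import path
from skmultilearn import dataset

logging.basicConfig(level=logging.DEBUG)
X_train, y_train, X_test, y_test = load_dataset("yeast")
if X_train is not None:
    print(X_train.shape, y_train.shape)  # dense numpy arrays after .toarray()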
def inner_load(path):
    return load_from_arff(
        path,
        # number of labels
        label_count=n_labels,
        # MULAN format, labels at the end of rows in arff data
        label_location=label_location,
        # bag of words
        input_feature_type=input_feature_type,
        encode_nominal=False,
        # sometimes the sparse ARFF loader is borked, like in delicious,
        # scikit-multilearn converts the loaded data to sparse representations,
        # so disabling the liac-arff sparse loader
        load_sparse=sparse,
        # this decides whether to return attribute names or not, usually
        # you don't need this
        return_attribute_definitions=False)
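# inner_load above relies on n_labels, label_location, input_feature_type and
# sparse being defined in an enclosing scope. A minimal sketch of that setup,
# with assumed values (the emotions dataset, for instance, has 6 labels; the
# path below is a placeholder):
from skmultilearn.dataset import load_from_arff

n_labels = 6
label_location = "end"        # MULAN-style ARFF, labels last
input_feature_type = "float"
sparse = False

X, y = inner_load("./datasets/emotions.arff")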
def load_moa_stream(filepath, labels):
    print("Reading original arff from path")
    with open(filepath) as arff_file:
        arff_file_content = [line.rstrip(",\n") + "\n" for line in arff_file]
    filename = "stream_{}".format(str(uuid.uuid1()))
    tmp_file = "/tmp/{}".format(filename)
    with open(tmp_file, "w") as opened:
        opened.write("".join(arff_file_content))
    del arff_file_content
    print("Reading original arff from tmp")
    arff_path = tmp_file
    label_location = "start"
    arff_file_is_sparse = False
    return load_from_arff(arff_path,
                          labels,
                          label_location=label_location,
                          load_sparse=arff_file_is_sparse,
                          return_attribute_definitions=True)
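# A usage sketch for load_moa_stream above. The stream path and label count are
# placeholders; label_location="start" in the function implies a MEKA-style ARFF
# with the label attributes listed first, so `labels` must match that count.
# uuid is required by the function itself, and write access to /tmp is assumed.
import uuid
from skmultilearn.dataset import load_from_arff

X, y, feature_defs, label_defs = load_moa_stream("./streams/20ng_stream.arff", labels=20)
print(X.shape, y.shape)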
def load_txt_data(dataset_name, walks, folds, rng):
    svd = TruncatedSVD(n_components=300, n_iter=10, random_state=0)
    std_scaler = Normalizer()
    tfidf = TfidfTransformer()
    if dataset_name == "tmc2007":
        labelcount = open(
            "../multi_label_dataset/tmc2007/tmc2007.xml").read().count("class")
        features, Y = dataset_load.load_from_arff(
            "../multi_label_dataset/tmc2007/tmc2007.arff",
            labelcount=labelcount,
            endian="little",
            load_sparse=True)
        features, Y = preprocess_data(features, Y)
        features = tfidf.fit_transform(features)
        features = svd.fit_transform(features)
        features = std_scaler.fit_transform(features)
    elif "yahoo" in dataset_name:
        labelcount = open("../multi_label_dataset/yahoo/{}.xml".format(
            dataset_name.split("-")[1])).read().count("Label")
        features, Y = dataset_load.load_from_arff(
            "../multi_label_dataset/yahoo/{}.arff".format(
                dataset_name.split("-")[1].title()),
            labelcount=labelcount,
            endian="little",
            load_sparse=True)
        features, Y = preprocess_data(features, Y)
        features = tfidf.fit_transform(features)
        features = svd.fit_transform(features)
        features = std_scaler.fit_transform(features)
    else:
        print("unknown dataset")
        exit()
    X = []
    y = []
    Y = Y.toarray()
    for I in range(len(walks)):
        X.append(walks[str(I)])
        y.append(Y[I])
    X = np.array(X)  # np.ones((len(docs), max_sentences, maxlen), dtype=np.int64) * -1
    y = np.array(y)
    # X = scaler.fit_transform(pca.fit_transform(X.toarray()))
    # X = select_norm_count(X, mindf=0.03, maxdf=0.8, normalise=True)
    # elif dataset_name == "delve":
    #     X, Y = load_delve(rng, comb_type=comb_type, add_validation_set=add_validation_set, add_Text=add_Text,
    #                       is_cv=is_cv, d_size=d_size)
    index = np.arange(np.shape(X)[0])
    unlabeled = np.where(Y.sum(-1) < 0)[0]
    labeled_idx = np.setdiff1d(index, unlabeled)
    print("Features shape = {}".format(X.shape))
    print("Label shape = {}".format(Y.shape))
    dataset = dataset_name.split("-")[1]
    folds, _, y, blacklist_samples = iterative_sampling(y, labeled_idx, folds, rng, dataset)
    sel_samples = np.setdiff1d(index, blacklist_samples)
    print(sel_samples.shape)
    return (X, y, folds, features)
def main(**kwargs):
    """
    main function for problem transformation algorithms
    :param kwargs: includes path_to_data, feature_type, num_labels, algorithm
    :return:
    """
    path = kwargs.pop('path_to_data')
    feature_type = kwargs.pop('feature_type')
    num_labels = kwargs.pop('num_labels')
    algorithm = kwargs.pop('algorithm')
    X, y = load_from_arff(path,
                          label_count=num_labels,
                          label_location="end",
                          load_sparse=not feature_type)
    X = X.toarray()
    y = y.toarray()
    features_list, target_list = construct_cv_folds(5, X, y)
    accuracy = []
    ham_score = []
    precision = []
    recall = []
    f1_score = []
    if feature_type == 1:
        for i in range(len(X[0])):
            X[:, i] = pre_process_continuous(X[:, i], 5)
    for i in range(5):
        training_set_features = []
        training_set_target = []
        for j in range(5):
            if i != j:
                training_set_features = training_set_features + features_list[j]
                training_set_target = training_set_target + target_list[j]
        X_train = np.array(training_set_features)
        y_train = np.array(training_set_target)
        X_test = np.array(features_list[i])
        y_test = np.array(target_list[i])
        print(i)
        params = {'feature_type': feature_type, 'num_bins': 5}
        # Training and Testing
        if feature_type == 0:
            if algorithm == "BR":
                br = BinaryRelevanceNB(nbayes, nbayes_prediction, params)
                br.fit(X_train, y_train)
                predictions = br.predict(X_test)
            else:
                cc = ClassifierChainsNB(nbayes, nbayes_prediction, params)
                cc.fit(X_train, y_train)
                predictions = cc.predict(X_test)
        else:
            if algorithm == "BR":
                br = BinaryRelevanceNB(nbayes, nbayes_prediction, params)
                br.fit(X_train, y_train)
                predictions = br.predict(X_test)
            else:
                cc = ClassifierChainsNB(nbayes, nbayes_prediction, params)
                cc.fit(X_train, y_train)
                predictions = cc.predict(X_test)
        # print(predictions)
        # print(y_test)
        acc = accuracy_score(y_test, predictions)
        ham = hamming_score(y_test, predictions)
        p, r, f1, _ = precision_recall_fscore_support(y_test, predictions, average='micro')
        accuracy.append(acc)
        ham_score.append(ham)
        precision.append(p)
        recall.append(r)
        f1_score.append(f1)
    print("Accuracy: " + str(sum(accuracy) / 5))
    print("Hamming_Score: " + str(sum(ham_score) / 5))
    print("Precision: " + str(sum(precision) / 5))
    print("Recall: " + str(sum(recall) / 5))
    print("F1_Score: " + str(sum(f1_score) / 5))
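# A hypothetical invocation of main() above. The path and argument values are
# placeholders, and the helpers it depends on (construct_cv_folds, nbayes,
# nbayes_prediction, BinaryRelevanceNB, ClassifierChainsNB, hamming_score, ...)
# come from the surrounding project and are not shown here.
main(path_to_data="./datasets/yeast.arff",
     feature_type=1,    # 1 = continuous features, discretized into 5 bins before training
     num_labels=14,
     algorithm="BR")    # "BR" = binary relevance; any other value runs classifier chains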
datasets = [
    'yahoo-' + data.split('.')[0]
    for data in os.listdir('../multi_label_dataset/yahoo') if '.xml' in data
]
datasets = [
    data.split('.')[0]
    for data in os.listdir('../multi_label_dataset/tmc2007') if '.xml' in data
]
for dataset_name in datasets:
    if dataset_name == "tmc2007":
        labelcount = open(
            "../multi_label_dataset/tmc2007/tmc2007.xml").read().count("class")
        X, Y = dataset_load.load_from_arff(
            "../multi_label_dataset/tmc2007/tmc2007.arff",
            labelcount=labelcount,
            endian="little",
            load_sparse=True)
        X = tfidf.fit_transform(X)
        X, Y = preprocess_data(X, Y)
        labeled_idx = np.arange(Y.shape[0])
        selection, Y, blacklist_samples = iterative_sampling(
            Y.toarray(), labeled_idx, 5, dataset_name)
        writelabels(Y, selection, dataset_name)
        X = X.toarray()
        X[X <= 0.1] = 0
        S = np.zeros((X.shape[0], X.shape[0]))
        for k in range(X.shape[1]):
            connect_doc = np.where(X[:, k] > 0)[0]