def main():
    data_dir = sys.argv[1]
    trn_x = du.read_sparse_file(os.path.join(data_dir, sys.argv[2]))
    trn_y = du.read_sparse_file(os.path.join(data_dir, sys.argv[3]))
    yft_x = du.read_sparse_file(os.path.join(data_dir, sys.argv[4]))
    tmp_mdata = sys.argv[5]
    assert trn_x.shape[0] == trn_y.shape[0], \
        "Number of instances must be same in features and labels"
    num_labels = trn_y.shape[1]
    # keep only instances that have at least one feature and one label
    valid_trn_x = np.where(trn_x.getnnz(axis=1) > 0)[0]
    valid_trn_y = np.where(trn_y.getnnz(axis=1) > 0)[0]
    valid_idx = np.intersect1d(valid_trn_x, valid_trn_y)
    trn_x = trn_x[valid_idx]
    trn_y = trn_y[valid_idx]
    features = np.where(trn_x.getnnz(axis=0) > 0)[0]
    labels = np.where(trn_y.getnnz(axis=0) > 0)[0]
    # words that occur in the (valid) label texts
    v_lbs_wrds = np.where(yft_x[labels].getnnz(axis=0) > 0)[0]
    union_fts = np.union1d(v_lbs_wrds, features)
    path = os.path.join(tmp_mdata, 'features_split.txt')
    np.savetxt(path, union_fts, fmt='%d')
    path = os.path.join(tmp_mdata, 'labels_split.txt')
    np.savetxt(path, labels, fmt='%d')
    path = os.path.join(tmp_mdata, 'v_lbs_fts_split.txt')
    np.savetxt(path, union_fts, fmt='%d')
    params = "{},{},{},{}".format(
        union_fts.size, num_labels, labels.size, union_fts.size)
    print(params)
    stats_obj = {
        'header': 'num_features,num_labels,valid_num_labels,valid_num_features'}
    stats_obj['all'] = params
    json.dump(stats_obj,
              open(os.path.join(tmp_mdata, "split_stats.json"), 'w'),
              indent=4)
def main(targets_file, train_file, predictions_file, A, B):
    """
    Args:
        targets_file: test labels
        train_file: train labels (to compute propensity)
        predictions_file: predicted labels
        A: int: to compute propensity
        B: int: to compute propensity
    """
    true_labels = data_utils.read_sparse_file(targets_file)
    predicted_labels = data_utils.read_sparse_file(predictions_file)
    inv_psp = compute_inv_propensity(train_file, A, B)
    acc = xc_metrics.Metrics(true_labels=true_labels, inv_psp=inv_psp)
    args = acc.eval(predicted_labels, 5)
    print(xc_metrics.format(*args))
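# A minimal usage sketch for the evaluation entry point above. The file names
# are made up for illustration, and A=0.55/B=1.5 is the commonly used default
# for most datasets (see compute_inv_propensity's docstring further below).
if __name__ == '__main__':
    main(targets_file='tst_X_Y.txt',
         train_file='trn_X_Y.txt',
         predictions_file='score.txt',
         A=0.55, B=1.5)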
def read_predictions(self, fname):
    if self.ftype == 'mat':
        return loadmat(fname)['predicted_labels']
    elif self.ftype == 'txt':
        return data_utils.read_sparse_file(fname)
    elif self.ftype == 'npz':
        return load_npz(fname)
    else:
        raise NotImplementedError("Unknown file type")
def main(targets_label_file, train_label_file, predictions_file,
         A, B, docs, lbls):
    true_labels = _remove_overlap(
        data_utils.read_sparse_file(
            targets_label_file, force_header=True).tolil(),
        docs, lbls)
    trn_labels = data_utils.read_sparse_file(
        train_label_file, force_header=True)
    inv_propen = xc_metrics.compute_inv_propesity(trn_labels, A=A, B=B)
    acc = xc_metrics.Metrics(
        true_labels, inv_psp=inv_propen, remove_invalid=False)
    predicted_labels = _remove_overlap(
        load_npz(predictions_file + '.npz').tolil(), docs, lbls)
    rec = xc_metrics.recall(predicted_labels, true_labels, k=20)[-1] * 100
    print("R@20=%0.2f" % rec)
    args = acc.eval(predicted_labels, 5)
    print(xc_metrics.format(*args))
def prepare_data(f_train_x, f_train_y, f_test_x, f_test_y, f_val_x, f_val_y):
    train_x = data_utils.read_sparse_file(f_train_x).todense()
    train_y = pd.read_csv(f_train_y, header=None)
    train_y = np.array(train_y).reshape(len(train_y), )
    test_x = data_utils.read_sparse_file(f_test_x).todense()
    test_y = pd.read_csv(f_test_y, header=None)
    test_y = np.array(test_y).reshape(len(test_y), )
    val_x = data_utils.read_sparse_file(f_val_x).todense()
    val_y = pd.read_csv(f_val_y, header=None)
    val_y = np.array(val_y).reshape(len(val_y), )
    print('Shape of Training data :', train_x.shape, train_y.shape)
    print('Shape of Testing data :', test_x.shape, test_y.shape)
    print('Shape of Validation data :', val_x.shape, val_y.shape)
    return train_x, train_y, test_x, test_y, val_x, val_y
def main(tst_label_fname, trn_label_fname, filter_fname, pred_fname,
         A, B, betas, top_k, save):
    true_labels = data_utils.read_sparse_file(tst_label_fname)
    trn_labels = data_utils.read_sparse_file(trn_label_fname)
    inv_propen = xc_metrics.compute_inv_propesity(trn_labels, A, B)
    mapping = get_filter_map(filter_fname)
    acc = xc_metrics.Metrics(true_labels, inv_psp=inv_propen)
    root = os.path.dirname(pred_fname)
    ans = ""
    if isinstance(betas, list) and betas[0] != -1:
        knn = filter_predictions(load_npz(pred_fname + '_knn.npz'), mapping)
        clf = filter_predictions(load_npz(pred_fname + '_clf.npz'), mapping)
        args = acc.eval(clf, 5)
        ans = f"classifier\n{xc_metrics.format(*args)}"
        args = acc.eval(knn, 5)
        ans = ans + f"\nshortlist\n{xc_metrics.format(*args)}"
        clf = retain_topk(clf, k=top_k)
        knn = retain_topk(knn, k=top_k)
        clf = normalize(sigmoid(clf), norm='max')
        knn = normalize(sigmoid(knn), norm='max')
        for beta in betas:
            # fuse classifier and shortlist (ANNS) scores
            predicted_labels = beta * clf + (1 - beta) * knn
            args = acc.eval(predicted_labels, 5)
            ans = ans + f"\nbeta: {beta:.2f}\n{xc_metrics.format(*args)}"
            if save:
                fname = os.path.join(root, f"score_{beta:.2f}.npz")
                save_npz(fname, retain_topk(predicted_labels, k=top_k),
                         compressed=False)
    else:
        predicted_labels = filter_predictions(
            sigmoid(load_npz(pred_fname + '.npz')), mapping)
        args = acc.eval(predicted_labels, 5)
        ans = xc_metrics.format(*args)
        if save:
            print("Saving predictions..")
            fname = os.path.join(root, "score.npz")
            save_npz(fname, retain_topk(predicted_labels, k=top_k),
                     compressed=False)
    line = "-" * 30
    print(f"\n{line}\n{ans}\n{line}")
    return ans
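# A toy sketch of the beta fusion used above (sigmoid omitted): scores from
# the classifier and the ANNS shortlist are max-normalised per row, then
# blended. The 2x3 matrices are made-up illustration data.
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize

clf = normalize(csr_matrix([[0.9, 0.0, 0.2], [0.0, 0.7, 0.1]]), norm='max')
knn = normalize(csr_matrix([[0.4, 0.3, 0.0], [0.2, 0.8, 0.0]]), norm='max')
beta = 0.75
fused = beta * clf + (1 - beta) * knn  # higher beta trusts the classifier more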
def compute_inv_propensity(train_file, A, B):
    """
    Compute inverse propensity values

    Values for A/B:
        Wikipedia-500K: 0.5/0.4
        Amazon-670K, Amazon-3M: 0.6/2.6
        Others: 0.55/1.5
    """
    train_labels = data_utils.read_sparse_file(train_file)
    inv_propen = xc_metrics.compute_inv_propesity(train_labels, A, B)
    return inv_propen
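# For reference, a minimal numpy sketch of the propensity model of Jain et al.
# (2016) that xc_metrics.compute_inv_propesity is based on:
#     inv_psp[l] = 1 + C * (N_l + B)^(-A),  C = (log N - 1) * (B + 1)^A,
# where N is the number of training points and N_l the frequency of label l.
import numpy as np

def inv_propensity_sketch(labels, A=0.55, B=1.5):
    # labels: sparse matrix of shape (num_instances, num_labels)
    num_instances = labels.shape[0]
    freqs = np.ravel(labels.sum(axis=0))
    C = (np.log(num_instances) - 1) * np.power(B + 1, A)
    return 1.0 + C * np.power(freqs + B, -A)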
def main(tst_label_fname, trn_label_fname, pred_fname, A, B, save,
         *args, **kwargs):
    true_labels = data_utils.read_sparse_file(tst_label_fname)
    trn_labels = data_utils.read_sparse_file(trn_label_fname)
    inv_propen = xc_metrics.compute_inv_propesity(trn_labels, A, B)
    acc = xc_metrics.Metrics(true_labels, inv_psp=inv_propen)
    root = os.path.dirname(pred_fname[-1])
    predicted_labels = read_files(pred_fname)
    ens_predicted_labels = merge(predicted_labels)
    ans = ""
    for idx, pred in enumerate(predicted_labels):
        args = acc.eval(pred, 5)
        ans = ans + f"learner: {idx}\n{xc_metrics.format(*args)}\n"
    args = acc.eval(ens_predicted_labels, 5)
    ans = ans + f"Ensemble\n{xc_metrics.format(*args)}"
    if save:
        print("Saving predictions..")
        fname = os.path.join(root, "score.npz")
        save_npz(fname, ens_predicted_labels, compressed=False)
    line = "-" * 30
    print(f"\n{line}\n{ans}\n{line}")
    return ans
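# `read_files` and `merge` are defined elsewhere; a plausible minimal sketch,
# assuming the ensemble simply averages the learners' sparse score matrices.
# These helpers are hypothetical stand-ins, not the script's actual code.
from scipy.sparse import load_npz

def read_files_sketch(fnames):
    return [load_npz(f) for f in fnames]

def merge_sketch(predictions):
    return sum(predictions) / len(predictions)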
def load(self, data_dir, fname, X):
    if X is not None:
        return X
    else:
        assert fname is not None, "Filename can not be None."
        fname = os.path.join(data_dir, fname)
        if fname.lower().endswith('.pkl'):
            return pickle.load(open(fname, 'rb'))['X']
        elif fname.lower().endswith('.txt'):
            return data_utils.read_sparse_file(fname, dtype=np.float32)
        else:
            raise NotImplementedError("Unknown file extension")
def load(self, data_dir, fname, Y):
    if Y is not None:
        return Y
    elif fname is None:
        return None
    else:
        fname = os.path.join(data_dir, fname)
        if fname.lower().endswith('.pkl'):
            return pickle.load(open(fname, 'rb'))['Y']
        elif fname.lower().endswith('.txt'):
            return data_utils.read_sparse_file(
                fname, dtype=np.float32, safe_read=False)
        else:
            raise NotImplementedError("Unknown file extension")
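# The two loaders above expect a pickle holding a dict with an 'X' (features)
# or 'Y' (labels) key. A minimal sketch of producing such a file; the matrix
# and file name are made-up illustration data.
import pickle
import numpy as np
from scipy.sparse import random as sparse_random

X = sparse_random(100, 50, density=0.05, format='csr', dtype=np.float32)
with open('features.pkl', 'wb') as fp:
    pickle.dump({'X': X}, fp)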
def run(feat_fname, lbl_fname, feature_type, method, threshold, seed, tmp_dir):
    np.random.seed(seed)
    if feature_type == 'dense':
        features = data_utils.read_gen_dense(feat_fname)
    elif feature_type == 'sparse':
        features = data_utils.read_gen_sparse(feat_fname)
    else:
        raise NotImplementedError()
    labels = data_utils.read_sparse_file(lbl_fname)
    assert features.shape[0] == labels.shape[0], \
        "Number of instances must be same in features and labels"
    num_features = features.shape[1]
    stats_obj = {}
    stats_obj['threshold'] = threshold
    stats_obj['method'] = method
    sd = SurrogateMapping(
        method=method, threshold=threshold, feature_type=feature_type)
    sd.fit(features, labels)
    stats_obj['surrogate'] = "{},{},{}".format(
        num_features, sd.num_surrogate_labels, sd.num_surrogate_labels)
    stats_obj['extreme'] = "{},{},{}".format(
        num_features, sd.num_labels, len(sd.valid_labels))
    json.dump(stats_obj,
              open(os.path.join(tmp_dir, "data_stats.json"), 'w'),
              indent=4)
    np.savetxt(os.path.join(tmp_dir, "valid_labels.txt"),
               sd.valid_labels, fmt='%d')
    np.savetxt(os.path.join(tmp_dir, "surrogate_mapping.txt"),
               sd.mapping, fmt='%d')
import sys
import time
import math
from xclib.data import data_utils
import nmslib  # the script builds an nmslib HNSW index, not hnswlib

lbl_ft_file = sys.argv[1]
model_file = sys.argv[2]
M = int(sys.argv[3])
efC = int(sys.argv[4])
num_threads = int(sys.argv[5])
num_ft = int(sys.argv[6])
metric_space = sys.argv[7]

start = time.time()
data = data_utils.read_sparse_file(lbl_ft_file)
end = time.time()

start = time.time()
index = nmslib.init(method='hnsw', space='cosinesimil_sparse',
                    data_type=nmslib.DataType.SPARSE_VECTOR)
index.addDataPointBatch(data)
index.createIndex({'M': M,
                   'indexThreadQty': num_threads,
                   'efConstruction': efC})
end = time.time()
print('Training time of ANNS datastructure = %f' % (end - start))
nmslib.saveIndex(index, model_file)
from xclib.data import data_utils
from scipy import stats
import scipy
import numpy as np
import math
import time

# Read sparse file (note: despite the variable name, this is the feature matrix)
labels = data_utils.read_sparse_file('train_x.txt')
x1 = scipy.sparse.csr_matrix.todense(labels)
x = np.asarray(x1)

# Read the label file (one integer label per line)
f = open("train_y.txt", "r")
contents = f.read()
y1 = list(map(int, contents.split()))
y = np.array(y1)
y5 = np.array(y1)


class Node:
    def __init__(self):
        self.index = -1
        self.label = -1
        self.leaf = False
] print("len(trn_point_titles), len(tst_point_titles), len(label_titles) = ", len(trn_point_titles), len(tst_point_titles), len(label_titles)) trn_point_features = np.load( "{}/{}CondensedData/trn_point_embs.npy".format(DATASET, EMB_TYPE)) label_features = np.load("{}/{}CondensedData/label_embs.npy".format( DATASET, EMB_TYPE)) tst_point_features = np.load( "{}/{}CondensedData/tst_point_embs.npy".format(DATASET, EMB_TYPE)) print( "trn_point_features.shape, tst_point_features.shape, label_features.shape", trn_point_features.shape, tst_point_features.shape, label_features.shape) trn_X_Y = data_utils.read_sparse_file("{}/trn_X_Y.txt".format(DATASET), force_header=True) tst_X_Y = data_utils.read_sparse_file("{}/tst_X_Y.txt".format(DATASET), force_header=True) tst_valid_inds, trn_X_Y, tst_X_Y_trn, tst_X_Y_val, node_features, valid_tst_point_features, label_remapping, adjecency_lists, NUM_TRN_POINTS = prepare_data( trn_X_Y, tst_X_Y, trn_point_features, tst_point_features, label_features, trn_point_titles, tst_point_titles, label_titles, args) hard_negs = [[] for i in range(node_features.shape[0])] print("trn_X_Y.shape, tst_X_Y_trn.shape, tst_X_Y_val.shape", trn_X_Y.shape, tst_X_Y_trn.shape, tst_X_Y_val.shape) temp = [ line.strip().split() for line in open( "{}/filter_labels_test.txt".format(DATASET), "r").readlines()
def get_accuracy(clf, a, b):
    # fraction of predictions on `a` that match the reference labels `b`
    i = 0
    cnt = 0
    for x in clf.predict(a):
        if x == b[i]:
            cnt += 1
        i += 1
    return cnt / i


if __name__ == "__main__":
    import sys
    from sklearn.datasets import load_iris
    # ______________________________________________________________________
    train_x = data_utils.read_sparse_file(train_x_path)
    train_y = load_y(train_y_path, train_x.shape[0])
    test_x = data_utils.read_sparse_file(test_x_path)
    test_y = load_y(test_y_path, test_x.shape[0])
    val_x = data_utils.read_sparse_file(val_x_path)
    val_y = load_y(val_y_path, val_x.shape[0])

    i = 0
    train_data = np.zeros(shape=train_size)
    for x in train_x.toarray():
        train_data[i] = [int(v) for v in x]  # avoid shadowing the row counter
        i += 1
    train_data = train_data[:100, :]
    train_y = train_y[:100]
def read_labels(f_name):
    f = pd.read_csv(f_name, header=None, encoding='ISO-8859-1')
    f = f.to_numpy()
    return f


# In[3]:

Y_test = read_labels(
    '/home/shreya/Sem6/COL774/A3/virus/ass3_parta_data/test_y.txt')
Y_train = read_labels(
    '/home/shreya/Sem6/COL774/A3/virus/ass3_parta_data/train_y.txt')
x_test = data_utils.read_sparse_file(
    '/home/shreya/Sem6/COL774/A3/virus/ass3_parta_data/test_x.txt',
    force_header=True)
x_train = data_utils.read_sparse_file(
    '/home/shreya/Sem6/COL774/A3/virus/ass3_parta_data/train_x.txt',
    force_header=True)


# In[4]:

# X_train = np.vstack((x_train[0].toarray(), x_train[1].toarray()))
# for i in range(2, x_train.shape[0]):
#     l = x_train[i].toarray()
#     X_train = np.vstack((X_train, x_train[i].toarray()))
X_train = x_train.toarray()


# In[5]:
def get_matrix_from_txt(path, isSparse):
    if isSparse:
        # sparse format: the file holds a single matrix
        return data_utils.read_sparse_file(path, force_header=True).toarray()
    # otherwise the file combines features and labels in the XC data format
    features, labels, num_samples, num_features, num_labels = \
        data_utils.read_data(path)
    return features.toarray(), labels.toarray().astype(int)
from matplotlib import pyplot as plt
from collections import deque
import copy
import sys
from treeclass import Decision_tree_classifier

train_x_path = sys.argv[1]
train_y_path = sys.argv[2]
test_x_path = sys.argv[3]
test_y_path = sys.argv[4]
val_x_path = sys.argv[5]
val_y_path = sys.argv[6]

train_x = data_utils.read_sparse_file(train_x_path, force_header=True).toarray()
test_x = data_utils.read_sparse_file(test_x_path, force_header=True).toarray()
val_x = data_utils.read_sparse_file(val_x_path, force_header=True).toarray()
train_y = np.array(pd.read_csv(train_y_path, header=None))
test_y = np.array(pd.read_csv(test_y_path, header=None))
val_y = np.array(pd.read_csv(val_y_path, header=None))


def plot_node_acc(test_score, train_acc, val_score, x, var_param_name):
    ax = plt.figure(figsize=(12, 7))
    plt.plot(x, train_acc, color='green')
    plt.plot(x, test_score)
    plt.plot(x, val_score, color='r')
    plt.legend(['Train Accuracy', 'Test Accuracy', 'Validation Accuracy'])
    plt.xlabel(var_param_name)
    plt.ylabel('Accuracy')
    plt.title('Accuracy vs ' + var_param_name)
def readFileSparse(path):
    return data_utils.read_sparse_file(path, header=True)
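# For reference, a hand-made example of the sparse text format these readers
# expect: a header line "num_rows num_cols", then one row per line as
# space-separated "col_index:value" pairs (assumed 0-based here). The file
# name and contents are made up for illustration.
example = "3 5\n0:1.0 3:2.5\n1:0.5\n2:1.0 4:4.0\n"
with open('tiny_sparse.txt', 'w') as fp:
    fp.write(example)

m = readFileSparse('tiny_sparse.txt')  # sparse matrix of shape (3, 5)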
model_file = sys.argv[2]
num_ft = int(sys.argv[3])
num_lbls = int(sys.argv[4])
efS = int(sys.argv[5])
num_nbrs = int(sys.argv[6])
write_dist = int(sys.argv[7])
out_dir = sys.argv[8]
num_thread = int(sys.argv[9])
num_out_threads = int(sys.argv[10])
metric_space = sys.argv[11]
lbl_ft_file = sys.argv[12]

# rebuild the index object and load the serialized HNSW structure into it
index = nmslib.init(method='hnsw', space='cosinesimil_sparse',
                    data_type=nmslib.DataType.SPARSE_VECTOR)
data = data_utils.read_sparse_file(lbl_ft_file)
index.addDataPointBatch(data)
nmslib.loadIndex(index, model_file)
index.setQueryTimeParams({'efSearch': efS, 'algoType': 'old'})

start = time.time()
query = data_utils.read_sparse_file(tst_ft_file)
end = time.time()

start = time.time()
nbrs = index.knnQueryBatch(query, k=num_nbrs, num_threads=num_thread)
end = time.time()
print('Time taken to find approx nearest neighbors = %f' % (end - start))

batch_size = int(math.ceil(float(len(nbrs)) / float(num_out_threads)))
for i in range(num_out_threads):
# In[1]:

from xclib.data import data_utils
import numpy as np
import pandas as pd
from math import log2
import time
import matplotlib.pyplot as plt
import sys


# # Reading the data-set

# In[2]:

# Read sparse files
train_x = data_utils.read_sparse_file(sys.argv[1], force_header=True)
train_x = np.array(train_x.toarray(), dtype=int)
train_y = pd.read_csv(sys.argv[2], sep="\n", header=None).to_numpy()
test_x = data_utils.read_sparse_file(sys.argv[3], force_header=True)
test_x = np.array(test_x.toarray(), dtype=int)
test_y = pd.read_csv(sys.argv[4], sep="\n", header=None).to_numpy()
val_x = data_utils.read_sparse_file(sys.argv[5], force_header=True)
val_x = np.array(val_x.toarray(), dtype=int)
val_y = pd.read_csv(sys.argv[6], sep="\n", header=None).to_numpy()


# # Calculating Entropy

# In[3]:
    metavar='INIT_RATIO', type=float,
    help='set initial ratio of labels for pretraining')
args = parser.parse_args()


def csr2list(M):
    # convert a csr matrix into a list of column-index lists, one per row
    row, col, _ = find(M)
    res = [[] for _ in range(M.shape[0])]
    for r, c in zip(row, col):
        res[r].append(c)
    return res


Ytr = data_utils.read_sparse_file(args.trnYfile, force_header=True)
Yte = data_utils.read_sparse_file(args.tstYfile, force_header=True)
# prob = data_utils.read_sparse_file(args.model_dir + "/overall_score_mat_init_ratio_50_batch_size_" + str(args.batch_size), force_header=True)
prob = data_utils.read_sparse_file(args.score, force_header=True)

# dense label matrix
ground_truth = Yte.toarray().astype(np.int32)
mlb = MultiLabelBinarizer(range(Yte.shape[1]), sparse_output=True)
targets = mlb.fit_transform(csr2list(Yte))
train_labels = csr2list(Ytr)

if args.dataset.startswith('WikiPedia'):
    a, b = 0.55, 0.1
elif args.dataset.startswith('Amazon-'):
    a, b = 0.6, 2.6
else:
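# Quick illustration of csr2list on a made-up 3x4 label matrix: the result is
# one Python list of positive label indices per row.
from scipy.sparse import csr_matrix
toy = csr_matrix([[0, 1, 0, 1],
                  [0, 0, 0, 0],
                  [1, 0, 0, 0]])
print(csr2list(toy))  # [[1, 3], [], [0]]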
import pickle
from tqdm import tqdm

# Load the data first
if len(sys.argv) != 7:
    print("Please pass the required 6 arguments.")
    sys.exit(1)

trnxPath = sys.argv[1]
trnyPath = sys.argv[2]
tstxPath = sys.argv[3]
tstyPath = sys.argv[4]
valxPath = sys.argv[5]
valyPath = sys.argv[6]

trainX = data_utils.read_sparse_file(trnxPath).toarray()
trainY = pd.read_csv(trnyPath, header=None).to_numpy()
testX = data_utils.read_sparse_file(tstxPath).toarray()
testY = pd.read_csv(tstyPath, header=None).to_numpy()
validX = data_utils.read_sparse_file(valxPath).toarray()
validY = pd.read_csv(valyPath, header=None).to_numpy()


# In[2]:

print(trainX.shape)
print(testX.shape)
print(validX.shape)