def show_all_ipd(csvfile, mark="", alpha=1):
    """Scatter-plot the non-negative matrix entries of ``csvfile`` per label.

    The matrix, labels, row names and column values are read with
    ``mcsv.load_matrix``. Only rows whose label contains ``mark`` are plotted;
    each distinct label becomes one scatter series, coloured through the
    module-level ``scalarMap``. The x coordinate of a point is the column
    value and the y coordinate is the matrix entry.

    Args:
        csvfile: Passed unchanged to ``mcsv.load_matrix``.
        mark: Substring a label must contain to be plotted. When it is exactly
            ``"left"`` or ``"right"`` and exactly two labels match, only the
            columns for which ``ttest`` returns a truthy value are plotted.
        alpha: Forwarded to ``ttest`` as its last argument.
    """
    matrix, label, row, col = mcsv.load_matrix(csvfile)
    labeltype = {
        name: pos
        for pos, name in enumerate(sorted({l for l in label if mark in l}))
    }
    x = [[] for _ in labeltype]
    y = [[] for _ in labeltype]

    # Both conditions must hold before the two-group test is attempted.
    # `and` binds tighter than `or`, so the unparenthesised original let
    # mark == "left" through without two labels and then failed on lvalue[1].
    if mark in ("left", "right") and len(labeltype) == 2:
        lvalue = list(labeltype)

        def group_values(column, wanted):
            # Entries of `column` on rows labelled `wanted`; values equal to
            # -1 are dropped before testing.
            picked = column[label == wanted]
            return picked[picked != -1]

        index = [
            i for i, cv in enumerate(matrix.transpose())
            if ttest(group_values(cv, lvalue[0]),
                     group_values(cv, lvalue[1]), 10, alpha)
        ]
    else:
        index = list(range(len(col)))

    kept_cols = col[index]
    for values, l, _ in zip(matrix[:, index], label, row):
        if l not in labeltype:
            continue
        series = labeltype[l]
        for value, c in zip(values, kept_cols):
            # Negative entries are not plotted.
            if value >= 0:
                x[series].append(c)
                y[series].append(value)

    for i, (xi, yi) in enumerate(zip(x, y)):
        plt.scatter(xi, yi, color=scalarMap.to_rgba(i), s=3)
        print(i, scalarMap.to_rgba(i))
    plt.show()
def test_feature(csvfile, mark, ratio, alpha, count=1):
    """Compare column selection on the full data with selection on train splits.

    For ``count`` rounds the data is split with ``random_split_data``, the
    matrix is passed through ``norm1``, and ``benjamini_hochberg_filter`` is
    used to pick columns once on all rows and once on the training rows only.
    The sizes of the selections and their overlap are printed at the end.

    NOTE(review): a later definition in this file reuses the name
    ``test_feature`` with the signature ``(csvfile, mark, alpha)`` — confirm
    which of the two is meant to be reachable.

    Args:
        csvfile: Passed unchanged to ``mcsv.load_matrix``.
        mark: Must be ``"left"`` or ``"right"``; exactly two distinct labels
            must contain it.
        ratio: Converted to ``float`` and forwarded to ``random_split_data``.
        alpha: Converted to ``float`` and forwarded to
            ``benjamini_hochberg_filter`` as its last argument.
        count: Number of random splits to average over (converted to ``int``).

    Raises:
        ValueError: If ``count`` is below 1, ``mark`` is neither ``"left"``
            nor ``"right"``, or the number of matching labels is not two.
    """
    alpha = float(alpha)
    ratio = float(ratio)
    count = int(count)
    if count < 1:
        raise ValueError("count must be at least 1")
    # Explicit checks instead of `assert`: asserts vanish under -O, and the
    # original expression parsed as `left or (right and two labels)`, which
    # skipped the label-count check for mark == "left".
    if mark not in ("left", "right"):
        raise ValueError('mark must be "left" or "right"')

    matrix, label, row, col = mcsv.load_matrix(csvfile)
    lvalue = sorted({l for l in label if mark in l})
    if len(lvalue) != 2:
        raise ValueError(
            "expected exactly 2 labels containing %r, found %d"
            % (mark, len(lvalue)))

    def selected_columns(data, labels):
        # Column indices kept by the filter for the two label groups.
        keep = benjamini_hochberg_filter(
            data, labels == lvalue[0], labels == lvalue[1], alpha)
        return set(np.arange(data.shape[1])[keep])

    jaccard, train_in_all = 0, 0
    train_size, all_size = [], []
    for _ in range(count):
        itrain, _itest = random_split_data(ratio, matrix, label)
        # NOTE(review): norm1 is re-applied to its own output on every round,
        # as in the original; confirm it is idempotent or that this is wanted.
        matrix = norm1(matrix)
        itest_all = selected_columns(matrix, label)
        itest_train = selected_columns(matrix[itrain], label[itrain])

        train_size.append(len(itest_train))
        all_size.append(len(itest_all))

        shared = len(itest_train & itest_all)
        combined = len(itest_train | itest_all)
        # An empty selection contributes 0 rather than raising
        # ZeroDivisionError.
        train_in_all += shared / len(itest_all) if itest_all else 0.0
        jaccard += shared / combined if combined else 0.0

    print(train_size)
    print("Size:", all_size[0], np.mean(train_size), np.var(train_size))
    print("Jaccard index:", jaccard / count, train_in_all / count)
def test_feature(csvfile, mark, alpha):
    """Print the columns ``ttest_matrix`` selects on all rows and on a train split.

    The matrix is loaded with ``mcsv.load_matrix`` and split once with
    ``random_split_data`` using a ratio of 0.8. The full selection is computed
    on the output of ``normalize``; the training selection is computed on the
    training rows of the matrix as loaded.

    NOTE(review): this rebinds the name of an earlier ``test_feature``
    definition in this file that takes ``(csvfile, mark, ratio, alpha,
    count=1)`` — confirm which of the two is meant to be reachable.

    Args:
        csvfile: Passed unchanged to ``mcsv.load_matrix``.
        mark: Must be ``"left"`` or ``"right"``; exactly two distinct labels
            must contain it.
        alpha: Forwarded to ``ttest_matrix`` as its last argument.

    Raises:
        ValueError: If ``mark`` is neither ``"left"`` nor ``"right"``, or the
            number of matching labels is not two.
    """
    # Explicit checks instead of `assert`: asserts vanish under -O, and the
    # original expression parsed as `left or (right and two labels)`, which
    # skipped the label-count check for mark == "left".
    if mark not in ("left", "right"):
        raise ValueError('mark must be "left" or "right"')

    matrix, label, row, col = mcsv.load_matrix(csvfile)
    # Copy taken before normalisation so the values as loaded stay available.
    omatrix = matrix.copy()
    itrain, _itest = random_split_data(0.8, matrix, label)
    matrix = normalize(matrix, label)

    lvalue = sorted({l for l in label if mark in l})
    if len(lvalue) != 2:
        raise ValueError(
            "expected exactly 2 labels containing %r, found %d"
            % (mark, len(lvalue)))

    itest_all = ttest_matrix(matrix, label, lvalue, alpha)
    # NOTE(review): the train selection uses the un-normalised copy while the
    # full selection uses the normalised matrix — confirm this is intended.
    itest_train = ttest_matrix(omatrix[itrain, ], label[itrain], lvalue, alpha)
    print(itest_all)
    print(itest_train)
def summary_matrix(csvfile):
    """Print a short summary of the matrix stored in ``csvfile``.

    Output, in order: the matrix shape, how often each label occurs, and for
    every distinct occurrence count of a row name, that count followed by the
    number of row names having it. Row names occurring four or more times are
    also listed.

    Args:
        csvfile: Passed unchanged to ``mcsv.load_matrix``.
    """
    from collections import Counter

    matrix, label, row, col = mcsv.load_matrix(csvfile)
    print("matrix.shape:", matrix.shape)

    label_tally = Counter(label)
    print("Label Count:", label_tally.items())

    # Group row names by how many times each one appears.
    names_by_frequency = {}
    for name, frequency in Counter(row).items():
        names_by_frequency.setdefault(frequency, []).append(name)

    for frequency, names in names_by_frequency.items():
        print(frequency, len(names))
        if frequency >= 4:
            print(names)