def unionx(spdr):
    data_path = spdr.DATA_PATH
    mt_file = 'X.npz', 'Y.npz'
    ft_order = ['lem', 'nn', 'ner', 'parse', 'vc', 'mesh', 'chem', 'extmesh']
    import bionlp.util.io as io
    X, Y = io.read_df(os.path.join(data_path, mt_file[0]), with_idx=True, sparse_fmt='csr'), io.read_df(os.path.join(data_path, mt_file[1]), with_idx=True, sparse_fmt='csr')
    # Group the column indices of X by feature-set prefix
    ft_idx = {}
    for i, col in enumerate(X.columns):
        for ft in ft_order:
            if (col.startswith(ft+'_')):
                ft_idx.setdefault(ft, []).append(i)
                break
    # Collect the union of the column names that survive the per-label filtering
    union_ft_idx = {}
    for j in range(Y.shape[1]):
        X_j = io.read_df(os.path.join(data_path, 'X_%i.npz'%j), with_idx=True, sparse_fmt='csr')
        for i, col in enumerate(X_j.columns):
            for ft in ft_order:
                if (col.startswith(ft+'_')):
                    union_ft_idx.setdefault(ft, set([])).add(col)
                    break
    # Keep the union columns in feature-set order and save the merged matrix
    new_ft = []
    for ft in ft_order:
        new_ft.extend(list(union_ft_idx[ft]))
    union_X = X.loc[:,new_ft]
    io.write_df(union_X, os.path.join(data_path, 'union_X.npz'), with_idx=True, sparse_fmt='csr', compress=True)
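# Example usage of unionx (a sketch; 'spdr' is assumed to be a corpus spider module that
# exposes DATA_PATH and whose data folder already holds X.npz, Y.npz and the per-label
# X_0.npz, X_1.npz, ... produced by filtx; the import path below is hypothetical):
#
#   from bionlp.spider import hoc  # hypothetical module name
#   unionx(hoc)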
def ftstat(spdr):
    ft_order = ['lem', 'nn', 'ner', 'parse', 'vc', 'mesh', 'chem']
    ft_name = {'lem':'LBoW', 'nn':'N-Bigrams', 'ner':'NE', 'parse':'GR', 'vc':'VC', 'mesh':'MeSH', 'chem':'Chem'}
    # ft_order = ['lem', 'nn', 'ner', 'parse', 'vc', 'mesh', 'chem', 'extmesh', 'expmesh']
    # ft_name = {'lem':'LBoW', 'nn':'N-Bigrams', 'ner':'NE', 'parse':'GR', 'vc':'VC', 'mesh':'MeSH', 'chem':'Chem', 'extmesh':'ExtMeSH', 'expmesh':'ExpMeSH'}
    data_path = spdr.DATA_PATH
    mt_file = 'X.npz', 'Y.npz'
    import bionlp.util.io as io
    X, Y = io.read_df(os.path.join(data_path, mt_file[0]), sparse_fmt='csr'), io.read_df(os.path.join(data_path, mt_file[1]), sparse_fmt='csr')
    # Group the column indices of X by feature-set prefix
    ft_idx = {}
    for i, col in enumerate(X.columns):
        for ft in ft_order:
            if (col.startswith(ft+'_')):
                ft_idx.setdefault(ft, []).append(i)
                break
    # For every label, mark which features occur in at least one positive abstract
    ftor_list = []
    for i in xrange(Y.shape[1]):
        # mt_lb = sp.sparse.csr_matrix(X.iloc[np.arange(Y.shape[0])[Y.iloc[:,i].values == 1],:].as_matrix())
        agg_X = X.iloc[np.arange(Y.shape[0])[Y.iloc[:,i].values == 1],:].values.sum(axis=0)
        ft_sum = np.zeros((1, X.shape[1]))
        # ft_sum[0,mt_lb.indices] = 1
        ft_sum[0, agg_X > 0] = 1
        ftor_list.append(ft_sum)
    ftor_mt = np.concatenate(ftor_list, axis=0)
    # Count the occurring features per feature set for each label
    ft_set_list = []
    for ft in ft_order:
        ft_set_list.append(ftor_mt[:,ft_idx[ft]].sum(axis=1))
    ft_stat_mt = np.column_stack(ft_set_list)
    ft_stat_pd = pd.DataFrame(ft_stat_mt, index=HMLB, columns=[ft_name[fset] for fset in ft_order])
    ft_stat_pd.to_excel('ft_stat.xlsx')
def get_mltl_npz(lbs=[], mltlx=True, spfmt='csr'):
    if (len(lbs) == 0):
        return None, None
    Xs = []
    Ys = []
    for lb in lbs:
        if (mltlx):
            Xs.append(io.read_df(os.path.join(DATA_PATH, 'X_%i.npz' % lb), with_idx=True, sparse_fmt=spfmt))
        Ys.append(io.read_df(os.path.join(DATA_PATH, 'y_%i.npz' % lb), with_col=False, with_idx=True, sparse_fmt=spfmt))
    if (not mltlx):
        Xs.append(io.read_df(os.path.join(DATA_PATH, 'X.npz'), with_idx=True, sparse_fmt=spfmt))
    return Xs, Ys
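# Example usage of get_mltl_npz (a sketch; it assumes DATA_PATH already contains the
# per-label matrices X_0.npz, ..., X_9.npz written by filtx and the matching label
# vectors y_0.npz, ..., y_9.npz; adjust the label ids to your data):
#
#   Xs, Ys = get_mltl_npz(lbs=range(10), mltlx=True, spfmt='csr')
#   for lb, (X, Y) in enumerate(zip(Xs, Ys)):
#       print 'label %i: X %s, y %s' % (lb, X.shape, Y.shape)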
def axstat_cmp(spdr):
    data_path = spdr.DATA_PATH
    mt1_file, mt2_file = 'X1.npz', 'X2.npz'
    import bionlp.util.io as io
    mt1 = sp.sparse.coo_matrix(io.read_df(os.path.join(data_path, mt1_file), sparse_fmt='csr').as_matrix())
    mt2 = sp.sparse.coo_matrix(io.read_df(os.path.join(data_path, mt2_file), sparse_fmt='csr').as_matrix())
    # Binary occurrence masks of the two datasets
    mask1_mt = np.zeros(mt1.shape)
    mask2_mt = np.zeros(mt2.shape)
    mask1_mt[mt1.row, mt1.col] = 1
    mask2_mt[mt2.row, mt2.col] = 1
    # Number of abstracts each feature occurs in
    axstat1 = mask1_mt.sum(axis=0)
    axstat2 = mask2_mt.sum(axis=0)
    plot.plot_2hist(axstat1, axstat2, '# abstracts', '# features through \nnormalized cumulative log', normed=True, cumulative=True, log=True, title='Number of features (y-axis) that have m abstracts (x-axis)', fname='hist_abs_ft')
def pred2uniq(dir_path, file_ptn, mdls, pids=range(10), crsval=10):
    import scipy.stats as stats
    from hm_clf import pred_ovl
    import bionlp.util.math as imath
    # uniqp_dict = dict.fromkeys(mdls, dict.fromkeys(HMLB, [])) # wrong: the inner dicts would be shared across models
    uniqp_dict = dict((mdl, dict((k, []) for k in HMLB)) for mdl in mdls)
    for pid in pids:
        for crs_t in xrange(crsval):
            # Stack the flattened predictions of every model, with the true labels as the last column
            preds, true_lb = [], None
            for mdl in mdls:
                mdl = mdl.replace(' ', '_').lower()
                fname = file_ptn.replace('#CRST#', str(crs_t)).replace('#MDL#', mdl).replace('#PID#', str(pid))
                npz_file = io.read_npz(os.path.join(dir_path, fname))
                preds.append(npz_file['pred_lb'])
                true_lb = npz_file['true_lb']
            preds.append(true_lb)
            tpreds_mt = np.column_stack([x.ravel() for x in preds])
            test_idx = io.read_df(os.path.join(dir_path, 'test_idx_crsval_%s_%s.npz' % (crs_t, pid)), with_idx=True).index.tolist()
            # Find the true predictions made uniquely by each model, i.e. missed by all the others
            uniq_true_list = []
            for i in xrange(tpreds_mt.shape[1] - 1):
                rmd_idx = range(tpreds_mt.shape[1] - 1)
                del rmd_idx[i]
                condition = np.logical_and(np.logical_not(np.any(tpreds_mt[:,rmd_idx], axis=1)), np.all(tpreds_mt[:,[i,-1]], axis=1))
                uniq_true = np.arange(tpreds_mt.shape[0])[condition]
                for j in uniq_true:
                    uniqp_dict[mdls[i]][HMLB[j % true_lb.shape[1]]].append(test_idx[j / true_lb.shape[1]])
                uniq_true_list.append(', '.join(['%s | %s' % (test_idx[x/true_lb.shape[1]], x%true_lb.shape[1]) for x in uniq_true]))
            uniq_true_str = '\n'.join(['%s: %s' % (mdls[i], uniq_true_list[i]) for i in range(len(uniq_true_list))])
            fs.write_file(uniq_true_str, os.path.join(dir_path, 'uniqtrue_crsval_%s_%s.txt' % (crs_t, pid)))
    # Export the uniquely predicted PMIDs per model and hallmark
    for mdl, idmap in uniqp_dict.iteritems():
        uniqp_df = pd.DataFrame([(hm, ', '.join(map(str, idmap[hm]))) for hm in HMLB], columns=['Hallmark', 'PMIDS'])
        uniqp_df.to_excel(os.path.join(dir_path, 'uniqtrue_%s.xlsx' % mdl), index=False)
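# Example usage of pred2uniq (a sketch; the file pattern and model names below are
# hypothetical and must match the npz files produced by your classification runs;
# '#CRST#', '#MDL#' and '#PID#' are substituted with the fold, model and process ids):
#
#   pred2uniq('results', 'pred_crsval_#CRST#_#MDL#_#PID#.npz',
#             mdls=['Random Forest', 'Logistic Regression'], pids=range(10), crsval=10)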
def filtx(spdr):
    data_path = spdr.DATA_PATH
    mt_file = 'X.npz', 'Y.npz'
    import bionlp.util.io as io
    X, Y = io.read_df(os.path.join(data_path, mt_file[0]), with_idx=True, sparse_fmt='csr'), io.read_df(os.path.join(data_path, mt_file[1]), with_idx=True, sparse_fmt='csr')
    # Filter the features per label and save one matrix per label
    Xs = spdr.ft_filter(X, Y)
    for i, x_df in enumerate(Xs):
        io.write_df(x_df, os.path.join(data_path, 'X_%i.npz' % i), with_idx=True, sparse_fmt='csr', compress=True)
def matshow(spdr):
    from matplotlib import pyplot as plt
    data_path = spdr.DATA_PATH
    mt_file = 'X.npz'
    X = io.read_df(os.path.join(data_path, mt_file), with_idx=True, sparse_fmt='csr')
    plt.matshow(X.values, cmap=plt.cm.Blues)
    plt.title('Standard Dataset')
    plt.savefig('X_matshow')
def avgfeatw(dir_path='.'):
    df_list = []
    for fname in fs.listf(dir_path):
        if fname.endswith('.npz'):
            df_list.append(io.read_df(os.path.join(dir_path, fname), with_idx=True))
    # Average the feature importance across all the result files, aligned on the first file's index
    feat_w_mt = pd.concat([df.loc[:,'Importance Mean'] for df in df_list], axis=1, join_axes=[df_list[0].index]).astype('float').values
    feat_w_avg = feat_w_mt.mean(axis=1)
    feat_w_std = feat_w_mt.std(axis=1)
    # Sort the features by descending average importance
    sorted_idx = np.argsort(feat_w_avg, axis=-1)[::-1]
    sorted_feat_w = np.column_stack((df_list[0].loc[:,'Feature Name'].values[sorted_idx], feat_w_avg[sorted_idx], feat_w_std[sorted_idx]))
    feat_w_df = pd.DataFrame(sorted_feat_w, index=df_list[0].index.values[sorted_idx], columns=['Feature Name', 'Importance Mean', 'Importance Std'])
    feat_w_df.to_excel(os.path.join(dir_path, 'featw.xlsx'))
    io.write_df(feat_w_df, os.path.join(dir_path, 'featw'), with_idx=True)
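# Example usage of avgfeatw (a sketch; it assumes the directory holds per-fold
# feature-weight npz files that share a feature index and carry 'Feature Name' and
# 'Importance Mean' columns, e.g. as saved by the cross-validated classifiers):
#
#   avgfeatw(dir_path='featw_crsval')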
def leave1out(spdr, mltl=True):
    data_path = spdr.DATA_PATH
    mt_file = 'X.npz', 'Y.npz'
    ft_order = ['lem', 'nn', 'ner', 'parse', 'vc', 'mesh', 'chem']
    import bionlp.util.io as io
    X, Y = io.read_df(os.path.join(data_path, mt_file[0]), with_idx=True, sparse_fmt='csr'), io.read_df(os.path.join(data_path, mt_file[1]), with_idx=True, sparse_fmt='csr')
    # Group the column names of X by feature-set prefix
    ft_dict = {}
    for col in X.columns:
        for ft in ft_order:
            if (col.startswith(ft+'_')):
                ft_dict.setdefault(ft, []).append(col)
                break
    if (mltl):
        # Drop one feature set at a time from the multi-label matrix
        for ft in ft_order:
            new_X = X.drop(ft_dict[ft], axis=1)
            io.write_df(new_X, os.path.join(data_path, 'l1o_%s_X.npz'%ft), sparse_fmt='csr', compress=True)
    else:
        # Drop one feature set at a time from each per-label matrix
        for i in range(Y.shape[1]):
            X_i = io.read_df(os.path.join(data_path, 'X_%i.npz'%i), sparse_fmt='csr')
            for ft in ft_order:
                # The per-label matrices keep only a subset of the columns, so ignore the missing ones
                new_X = X_i.drop(ft_dict[ft], axis=1, errors='ignore')
                io.write_df(new_X, os.path.join(data_path, 'l1o_%s_X_%i.npz'%(ft,i)), sparse_fmt='csr', compress=True)
def axstat(spdr):
    data_path = spdr.DATA_PATH
    mt_file = 'X.npz'
    import bionlp.util.io as io
    mt = sp.sparse.coo_matrix(io.read_df(os.path.join(data_path, mt_file), sparse_fmt='csr').as_matrix())
    # Binary occurrence mask of the dataset
    mask_mt = np.zeros(mt.shape)
    mask_mt[mt.row, mt.col] = 1
    # Number of abstracts each feature occurs in
    axstat0 = mask_mt.sum(axis=0)
    plot.plot_hist(axstat0, '# abstracts', '# features', log=True, title='Number of features (y-axis) that have m abstracts (x-axis)', fname='hist_abs_ft')
    # hist0 = np.histogram(axstat0, bins=max(10, min(20, int(axstat0.max()))))
    # plot.plot_scat(np.column_stack((hist0[1][:-1], hist0[0])), '# abstracts', '# features', title='Number of features (y-axis) that have m abstracts (x-axis)', fname='scat_abs_ft')
    # Number of features each abstract contains
    axstat1 = mask_mt.sum(axis=1)
    plot.plot_hist(axstat1, '# features', '# abstracts', log=False, title='Number of abstracts (y-axis) that have n features (x-axis)', fname='hist_ft_abs')
def get_data(articles, from_file=None, ft_type='binary', max_df=1.0, min_df=1, fmt='npz', spfmt='csr'):
    ## Read from local files
    if (from_file):
        if (type(from_file) == bool):
            file_name = 'X.npz' if (fmt == 'npz') else 'X.csv'
        else:
            file_name = from_file
        print 'Reading file: %s and Y.%s' % (file_name, fmt)
        if (fmt == 'npz'):
            return io.read_df(os.path.join(DATA_PATH, file_name), with_idx=True, sparse_fmt=spfmt), io.read_df(os.path.join(DATA_PATH, 'Y.npz'), with_idx=True, sparse_fmt=spfmt)
        else:
            return pd.read_csv(os.path.join(DATA_PATH, file_name), index_col=0, encoding='utf8'), pd.read_csv(os.path.join(DATA_PATH, 'Y.csv'), index_col=0, encoding='utf8')
    ## Feature columns
    ft_pmid, ft_abs, ft_lem, ft_nnv, ft_ner, ft_parse, ft_vc, ft_mesh, ft_chem, label = [[] for i in range(10)]
    ft_order = ['lem', 'nn', 'ner', 'parse', 'vc', 'mesh', 'chem']
    ft_name = {'lem':'LBoW', 'nn':'N-Bigrams', 'ner':'NE', 'parse':'GR', 'vc':'VC', 'mesh':'MeSH', 'chem':'Chem'}
    ft_dic = {'lem':ft_lem, 'ner':ft_ner, 'parse':ft_parse, 'vc':ft_vc, 'mesh':ft_mesh, 'chem':ft_chem}
    hm_lb = ['PS', 'GS', 'CD', 'RI', 'A', 'IM', 'GI', 'TPI', 'CE', 'ID']
    bft_dic, hm_stat = [{} for i in range(2)]
    label_set = set()
    # sent_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
    ## Collect the raw fields and annotation statistics from the articles
    for artcl in articles:
        ft_pmid.append(artcl['id'])
        ft_abs.append(artcl['abs'])
        ft_mesh.append(artcl['mesh'])
        ft_chem.append(artcl['chem'])
        label.append(artcl['annots'])
        label_set.update(artcl['annots'])
        c = Counter(artcl['annots'])
        for hm, num in c.iteritems():
            hs = hm_stat.setdefault(hm, [0,0])
            hs[0], hs[1] = hs[0] + 1, hs[1] + num
            # hs[0], hs[1] = hs[0] + 1, hs[1] + len(sent_splitter.tokenize(artcl['abs'].strip()))
    # uniq_lb = list(label_set)
    uniq_lb = [IHM_MAP[lb] for lb in hm_lb]
    ## Get the feature sets of the specific hallmark
    # feat_sets = get_fsnames()
    feat_sets = ft_order
    feature_sets, feat_stat = get_featsets(feat_sets, len(uniq_lb))
    ft_stat_mt = np.array([feat_stat[ft] for ft in ft_order]).T
    ft_stat_pd = pd.DataFrame(ft_stat_mt, index=hm_lb, columns=[ft_name[fset] for fset in feat_sets])
    hm_stat_pd = pd.DataFrame([hm_stat[lb] for lb in uniq_lb], index=hm_lb, columns=['No. abstracts', 'No. sentences'])
    if (fmt == 'npz'):
        io.write_df(ft_stat_pd, os.path.join(DATA_PATH, 'ft_stat.npz'))
        io.write_df(hm_stat_pd, os.path.join(DATA_PATH, 'hm_stat.npz'), with_idx=True)
    else:
        ft_stat_pd.to_csv(os.path.join(DATA_PATH, 'ft_stat.csv'), encoding='utf8')
        hm_stat_pd.to_csv(os.path.join(DATA_PATH, 'hm_stat.csv'), encoding='utf8')
    ## Extract the features from the preprocessed data
    ## (MeSH/Chem features come directly from the articles and N-bigrams are vectorized later)
    for i in range(len(feat_sets)):
        fset = feat_sets[i]
        feature_set = feature_sets[i]
        if (fset == 'chem' or fset == 'mesh'):
            continue
        if (fset == 'nn'):
            continue
        for pmid in ft_pmid:
            feature, extra_feat = [[], []]
            prev_term = ''
            for line in fs.read_file(os.path.join(DATA_PATH, fset, '.'.join([pmid, fset, 'txt'])), 'utf8'):
                if (line == '~~~'):
                    continue
                if (fset == 'lem'):
                    if (line == '. . .' or line == '~~~ ~~~' or line == ', , ,'):
                        continue
                    items = line.split()
                    if (len(items) < 3):
                        # Skip the unrecognized words
                        continue
                    feature.append(items[2].lower())
                    # Extract NN feature
                    if (items[1] == 'NN'):
                        if (prev_term != ''):
                            extra_feat.append(prev_term + ' ' + items[0].lower())
                        prev_term = items[0].lower()
                    else:
                        prev_term = ''
                if (fset == 'ner'):
                    feature.append(line)
                if (fset == 'parse'):
                    record = line.strip('()').replace(' _ ', ' ').split()
                    feature.append(','.join([w.split('_')[0] for w in record]).lower())
                if (fset == 'vc'):
                    feature.extend(line.split())
            ft_dic[fset].append(feature)
            if (fset == 'lem'):
                ft_nnv.extend(extra_feat)
    ## Convert the raw features into binary features
    ft_type = ft_type.lower()
    for i in range(len(feat_sets)):
        fset = feat_sets[i]
        feature_set = feature_sets[i]
        if (fset == 'nn'):
            # Vectorize the noun bigrams directly from the abstracts, restricted to the collected vocabulary
            bigram_vectorizer = CountVectorizer(ngram_range=(2, 2), token_pattern=r'\b\w+\b', max_df=max_df, min_df=min_df, vocabulary=set(ft_nnv), binary=True if ft_type=='binary' else False)
            ft_nn = bigram_vectorizer.fit_transform(ft_abs).tocsr()
            nn_classes = [cls[0] for cls in sorted(bigram_vectorizer.vocabulary_.items(), key=operator.itemgetter(1))]
            bft_dic[fset] = (ft_nn, nn_classes)
            continue
        # overall_ft = list(set([ft for samp in ft_dic[fset] for ft in samp if ft]))
        # mlb = MultiLabelBinarizer(classes=overall_ft)
        # bft_dic[fset] = (mlb.fit_transform(ft_dic[fset]), mlb.classes_)
        count_vectorizer = CountVectorizer(tokenizer=lambda text: [t for t in text.split('*#@') if t and t not in string.punctuation], lowercase=False, stop_words='english', token_pattern=r'\b\w+\b', max_df=max_df, min_df=min_df, binary=True if ft_type=='binary' else False)
        ft_all = count_vectorizer.fit_transform(['*#@'.join(samp) for samp in ft_dic[fset]])
        all_classes = [cls[0] for cls in sorted(count_vectorizer.vocabulary_.items(), key=operator.itemgetter(1))]
        bft_dic[fset] = (ft_all, all_classes)
    ## Convert the annotations of each document to binary labels
    mlb = MultiLabelBinarizer(classes=uniq_lb)
    bin_label = (mlb.fit_transform(label), mlb.classes_)
    ## Generate the features as well as the labels to form a complete dataset
    feat_mt = sp.sparse.hstack([bft_dic[fset][0] for fset in ft_order])
    if (ft_type == 'tfidf'):
        transformer = TfidfTransformer(norm='l2', sublinear_tf=False)
        feat_mt = transformer.fit_transform(feat_mt)
    feat_cols = ['%s_%s' % (fset, w) for fset in ft_order for w in bft_dic[fset][1]]
    feat_df = pd.DataFrame(feat_mt.todense(), index=ft_pmid, columns=feat_cols)
    label_df = pd.DataFrame(bin_label[0], index=ft_pmid, columns=bin_label[1])
    # Randomly sample a small subset of documents and features for quick inspection
    obj_samp_idx = np.random.random_integers(0, feat_df.shape[0] - 1, size=200).tolist()
    ft_samp_idx = np.random.random_integers(0, feat_df.shape[1] - 1, size=1000).tolist()
    samp_feat_df = feat_df.iloc[obj_samp_idx, ft_samp_idx]
    samp_lb_df = label_df.iloc[obj_samp_idx,:]
    if (fmt == 'npz'):
        io.write_df(feat_df, os.path.join(DATA_PATH, 'X.npz'), with_idx=True, sparse_fmt=spfmt, compress=True)
        io.write_df(label_df, os.path.join(DATA_PATH, 'Y.npz'), with_idx=True, sparse_fmt=spfmt, compress=True)
        io.write_df(samp_feat_df, os.path.join(DATA_PATH, 'sample_X.npz'), with_idx=True, sparse_fmt=spfmt, compress=True)
        io.write_df(samp_lb_df, os.path.join(DATA_PATH, 'sample_Y.npz'), with_idx=True, sparse_fmt=spfmt, compress=True)
    else:
        feat_df.to_csv(os.path.join(DATA_PATH, 'X.csv'), encoding='utf8')
        label_df.to_csv(os.path.join(DATA_PATH, 'Y.csv'), encoding='utf8')
        samp_feat_df.to_csv(os.path.join(DATA_PATH, 'sample_X.csv'), encoding='utf8')
        samp_lb_df.to_csv(os.path.join(DATA_PATH, 'sample_Y.csv'), encoding='utf8')
    return feat_df, label_df
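# Example usage of get_data (a sketch; 'articles' is assumed to be the list of dicts
# returned by the corpus reader, each with 'id', 'abs', 'mesh', 'chem' and 'annots' keys
# as consumed above; set from_file=True to reload a previously generated X.npz/Y.npz):
#
#   X, Y = get_data(articles, from_file=None, ft_type='tfidf', max_df=0.95, min_df=2)
#   X, Y = get_data(None, from_file=True)  # reload the cached dataset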
def npz2xls(spdr):
    data_path = spdr.DATA_PATH
    npz_file = 'hm_stat.npz'
    fpath = os.path.join(data_path, npz_file)
    df = io.read_df(fpath)
    df.to_excel(os.path.splitext(fpath)[0] + '.xlsx')