Example #1
def unionx(spdr):
	data_path = spdr.DATA_PATH
	mt_file = 'X.npz', 'Y.npz'
	ft_order = ['lem', 'nn', 'ner', 'parse', 'vc', 'mesh', 'chem', 'extmesh']
	import bionlp.util.io as io
	X, Y = io.read_df(os.path.join(data_path, mt_file[0]), with_idx=True, sparse_fmt='csr'), io.read_df(os.path.join(data_path, mt_file[1]), with_idx=True, sparse_fmt='csr')
	ft_idx = {}
	for i, col in enumerate(X.columns):
		for ft in ft_order:
			if (col.startswith(ft+'_')):
				ft_idx.setdefault(ft, []).append(i)
				break
	union_ft_idx = {}
	for j in range(Y.shape[1]):
		X_j = io.read_df(os.path.join(data_path, 'X_%i.npz'%j), with_idx=True, sparse_fmt='csr')
		for i, col in enumerate(X_j.columns):
			for ft in ft_order:
				if (col.startswith(ft+'_')):
					union_ft_idx.setdefault(ft, set([])).add(col)
					break
	new_ft = []
	for ft in ft_order:
		new_ft.extend(list(union_ft_idx.get(ft, set())))	# skip feature groups absent from every X_j
	union_X = X.loc[:,new_ft]
	io.write_df(union_X, os.path.join(data_path, 'union_X.npz'), with_idx=True, sparse_fmt='csr', compress=True)
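A minimal, self-contained sketch of the column-prefix grouping idea used above, with a hypothetical toy DataFrame and prefix list (names are illustrative only):

import pandas as pd

X = pd.DataFrame([[1, 0, 2], [0, 3, 1]], columns=['lem_cell', 'nn_cell line', 'mesh_Humans'])
ft_order = ['lem', 'nn', 'mesh']
groups = {}
for col in X.columns:
    for ft in ft_order:
        if col.startswith(ft + '_'):
            # group every column under the first matching feature prefix
            groups.setdefault(ft, []).append(col)
            break
# keep only the columns belonging to selected feature groups, preserving group order
selected = [c for ft in ['lem', 'mesh'] for c in groups.get(ft, [])]
print(X.loc[:, selected])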
Example #2
def decomp_data(method='LDA', n_components=100):
	X, Y = spdr.get_data(None, ft_type=opts.type, max_df=ast.literal_eval(opts.maxdf), min_df=ast.literal_eval(opts.mindf), from_file=True, fmt=opts.fmt, spfmt=opts.spfmt)
	method = method.upper()
	n_components = min(n_components, X.shape[1])
	if (method == 'LDA'):
		model = make_pipeline(LatentDirichletAllocation(n_topics=n_components, learning_method='online', learning_offset=50., max_iter=5, n_jobs=opts.np, random_state=0), Normalizer(copy=False))
	elif (method == 'NMF'):
		model = make_pipeline(NMF(n_components=n_components, random_state=0, alpha=.1, l1_ratio=.5), Normalizer(copy=False))
	elif (method == 'LSI'):
		model = make_pipeline(TruncatedSVD(n_components), Normalizer(copy=False))
	elif (method == 'TSNE'):
		model = make_pipeline(ftdecomp.DecompTransformer(n_components, ftdecomp.t_sne, initial_dims=15*n_components, perplexity=30.0))
	if (opts.prefix == 'all'):
		td_cols = X.columns
	else:
		# Only apply dimensionality reduction to the selected columns
		td_cols = np.array(map(lambda x: True if any(x.startswith(prefix) for prefix in opts.prefix.split(SC)) else False, X.columns))
	td_X = X.loc[:,td_cols]
	new_td_X = model.fit_transform(td_X.as_matrix())
	if (opts.prefix == 'all'):
		columns = range(new_td_X.shape[1]) if not hasattr(model.steps[0][1], 'components_') else td_X.columns[model.steps[0][1].components_.argmax(axis=1)]
		new_X = pd.DataFrame(new_td_X, index=X.index, columns=['tp_%s' % x for x in columns])
	else:
		columns = range(new_td_X.shape[1]) if not hasattr(model.steps[0][1], 'components_') else td_X.columns[model.steps[0][1].components_.argmax(axis=1)]
		# Concatenate the reduced components with the columns that were not dimension-reduced
		new_X = pd.concat([pd.DataFrame(new_td_X, index=X.index, columns=['tp_%s' % x for x in columns]), X.loc[:,np.logical_not(td_cols)]], axis=1)
	if (opts.fmt == 'npz'):
		io.write_df(new_X, os.path.join(spdr.DATA_PATH, '%s%i_X.npz' % (method.lower(), n_components)), with_idx=True, sparse_fmt=opts.spfmt, compress=True)
	else:
		new_X.to_csv(os.path.join(spdr.DATA_PATH, '%s%i_X.csv' % (method.lower(), n_components)), encoding='utf8')
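The component-naming trick above picks, for each latent dimension, the original column with the largest loading. A minimal sketch of that step with hypothetical random data; note that newer scikit-learn uses n_components instead of n_topics for LatentDirichletAllocation, and newer pandas uses DataFrame.to_numpy() instead of as_matrix():

import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer

X = pd.DataFrame(np.random.rand(20, 6), columns=['f%d' % i for i in range(6)])
model = make_pipeline(TruncatedSVD(n_components=3, random_state=0), Normalizer(copy=False))
reduced = model.fit_transform(X.to_numpy())
svd = model.steps[0][1]
# for each component, name it after the original column with the largest loading
names = X.columns[svd.components_.argmax(axis=1)]
new_X = pd.DataFrame(reduced, index=X.index, columns=['tp_%s' % c for c in names])
print(new_X.head())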
Example #3
def filtx(spdr):
	data_path = spdr.DATA_PATH
	mt_file = 'X.npz', 'Y.npz'
	import bionlp.util.io as io
	X, Y = io.read_df(os.path.join(data_path, mt_file[0]), with_idx=True, sparse_fmt='csr'), io.read_df(os.path.join(data_path, mt_file[1]), with_idx=True, sparse_fmt='csr')
	Xs = spdr.ft_filter(X, Y)
	for i, x_df in enumerate(Xs):
		io.write_df(x_df, os.path.join(data_path, 'X_%i.npz' % i), with_idx=True, sparse_fmt='csr', compress=True)
Example #4
def extend_mesh(ft_type='binary'):
	X, Y = spdr.get_data(None, ft_type=opts.type, max_df=ast.literal_eval(opts.maxdf), min_df=ast.literal_eval(opts.mindf), from_file=True, fmt=opts.fmt, spfmt=opts.spfmt)
	mesh_df = mm.mesh_countvec(X.index)
	mesh_df.columns = ['extmesh_' + x for x in mesh_df.columns]
	new_X = pd.concat([X, mesh_df], axis=1, join_axes=[X.index])
	print 'The size of data has been changed from %s to %s.' % (X.shape, new_X.shape)
	if (opts.fmt == 'npz'):
		io.write_df(new_X, os.path.join(spdr.DATA_PATH, 'extmesh_X.npz'), with_idx=True, sparse_fmt=opts.spfmt, compress=True)
	else:
		new_X.to_csv(os.path.join(spdr.DATA_PATH, 'extmesh_X.csv'), encoding='utf8')
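pd.concat(..., join_axes=[X.index]) was removed in pandas 1.0; a minimal sketch of the equivalent alignment with reindex, using hypothetical frames:

import pandas as pd

X = pd.DataFrame({'lem_cell': [1, 0]}, index=['pmid1', 'pmid2'])
mesh_df = pd.DataFrame({'extmesh_Humans': [1, 1, 0]}, index=['pmid1', 'pmid2', 'pmid3'])
# align the new columns to X's index before concatenating (rows not in X are dropped)
new_X = pd.concat([X, mesh_df.reindex(X.index)], axis=1)
print(new_X)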
Example #5
def gen_data():
	if (opts.local):
		X, Y = spdr.get_data(None, from_file=True)
	else:
		pmid_list = spdr.get_pmids()
		articles = spdr.fetch_artcls(pmid_list)
		X, Y = spdr.get_data(articles, ft_type=opts.type, max_df=ast.literal_eval(opts.maxdf), min_df=ast.literal_eval(opts.mindf), fmt=opts.fmt, spfmt=opts.spfmt)
	hallmarks = Y.columns

	# Feature Selection
	# mt = sp.sparse.coo_matrix(X)
	# mask_mt = np.zeros(mt.shape)
	# mask_mt[mt.row, mt.col] = 1
	# stat = mask_mt.sum(axis=0)
	# cln_X = X.iloc[:,np.arange(stat.shape[0])[stat>ast.literal_eval(opts.thrshd) * (stat.max() - stat.min()) + stat.min()]]
	
	# Document frequency
	# stat, _ = ftslct.freqs(X.values, Y.values)
	# Mutual information
	# stat, _ = ftslct.mutual_info(X.values, Y.values)
	# Information gain
	# stat, _ = ftslct.info_gain(X.values, Y.values)
	# GSS coefficient
	# stat, _ = ftslct.gss_coef(X.values, Y.values)
	# NGL coefficient
	# stat, _ = ftslct.ngl_coef(X.values, Y.values)
	# Odds ratio
	# stat, _ = ftslct.odds_ratio(X.values, Y.values)
	# Fisher criterion
	# stat, _ = ftslct.fisher_crtrn(X.values, Y.values)
	# GU metric
	# stat, _ = ftslct.gu_metric(X.values, Y.values)
	# Decision tree
	# stat, _ = ftslct.decision_tree(X.values, Y.values)
	# Combined feature
	stat, _ = ftslct.utopk(X.values, Y.values, ftslct.decision_tree, fn=100)
	io.write_npz(stat, os.path.join(spdr.DATA_PATH, 'ftw.npz'))
	
	# cln_X = X.iloc[:,np.arange(stat.shape[0])[stat>stat.min()]]
	cln_X = X.iloc[:,stat.argsort()[-500:][::-1]]
	print 'The size of data has been changed from %s to %s.' % (X.shape, cln_X.shape)
	
	if (opts.fmt == 'npz'):
		io.write_df(cln_X, os.path.join(spdr.DATA_PATH, 'cln_X.npz'), with_idx=True, sparse_fmt=opts.spfmt, compress=True)
	else:
		cln_X.to_csv(os.path.join(spdr.DATA_PATH, 'cln_X.csv'), encoding='utf8')
	del X, cln_X
	for i in range(Y.shape[1]):
		y = Y.iloc[:,i]
		if (opts.fmt == 'npz'):
			io.write_df(y, os.path.join(spdr.DATA_PATH, 'y_%s.npz' % i), with_col=False, with_idx=True)
		else:
			y.to_csv(os.path.join(spdr.DATA_PATH, 'y_%s.csv' % i), encoding='utf8')
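A minimal sketch of the top-k selection step (stat.argsort()[-500:][::-1] above), shown with a hypothetical score vector and toy matrix:

import numpy as np
import pandas as pd

X = pd.DataFrame(np.random.rand(5, 8), columns=['f%d' % i for i in range(8)])
stat = np.random.rand(8)                # one score per feature, e.g. from a selection criterion
k = 3
top_idx = stat.argsort()[-k:][::-1]     # indices of the k highest scores, best first
cln_X = X.iloc[:, top_idx]
print(cln_X.columns.tolist())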
Example #6
def avgfeatw(dir_path='.'):
	df_list = []
	for file in fs.listf(dir_path):
		if file.endswith(".npz"):
			df_list.append(io.read_df(os.path.join(dir_path, file), with_idx=True))
	feat_w_mt = pd.concat([df.loc[:,'Importance Mean'] for df in df_list], axis=1, join_axes=[df_list[0].index]).astype('float').values
	feat_w_avg = feat_w_mt.mean(axis=1)
	feat_w_std = feat_w_mt.std(axis=1)
	sorted_idx = np.argsort(feat_w_avg, axis=-1)[::-1]
	sorted_feat_w = np.column_stack((df_list[0].loc[:,'Feature Name'].values[sorted_idx], feat_w_avg[sorted_idx], feat_w_std[sorted_idx]))
	feat_w_df = pd.DataFrame(sorted_feat_w, index=df_list[0].index.values[sorted_idx], columns=['Feature Name', 'Importance Mean', 'Importance Std'])
	feat_w_df.to_excel(os.path.join(dir_path, 'featw.xlsx'))
	io.write_df(feat_w_df, os.path.join(dir_path, 'featw'), with_idx=True)
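The join_axes argument used above is also gone in newer pandas; when all tables share an index, plain pd.concat aligns them already. A toy sketch of the averaging-and-sorting step with two hypothetical importance tables:

import numpy as np
import pandas as pd

df1 = pd.DataFrame({'Importance Mean': [0.2, 0.5]}, index=['feat_a', 'feat_b'])
df2 = pd.DataFrame({'Importance Mean': [0.4, 0.3]}, index=['feat_a', 'feat_b'])
mt = pd.concat([df.loc[:, 'Importance Mean'] for df in (df1, df2)], axis=1).astype('float').values
avg, std = mt.mean(axis=1), mt.std(axis=1)
order = np.argsort(avg)[::-1]           # sort features by mean importance, descending
print(pd.DataFrame({'Importance Mean': avg[order], 'Importance Std': std[order]},
                   index=df1.index.values[order]))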
Example #7
def add_d2v(n_components=100, win_size=8, min_t=5, mdl_fname='d2v.mdl'):
	from gensim.parsing.preprocessing import preprocess_string
	from gensim.models.doc2vec import TaggedDocument, Doc2Vec
	def read_files(fpaths, code='ascii'):
		for fpath in fpaths:
			try:
				yield TaggedDocument(words=preprocess_string('\n'.join(fs.read_file(fpath, code))), tags=[os.path.splitext(os.path.basename(fpath))[0]])
			except Exception as e:
				continue
	def read_prcsed_files(fpaths, code='ascii'):
		for fpath in fpaths:
			try:
				words = []
				for line in fs.read_file(fpath, code):
					if (line == '~~~'):
						continue
					if (line == '.	.	.' or line == '~~~	~~~' or line == ',	,	,'):
						continue
					items = line.split()
					if (len(items) < 3): # Skip the unrecognized words
						continue
					words.append(items[2].lower())
				yield TaggedDocument(words=words, tags=[os.path.splitext(os.path.basename(fpath))[0]])
			except Exception as e:
				continue
	mdl_fpath = os.path.join(spdr.DATA_PATH, mdl_fname)
	if (os.path.exists(mdl_fpath)):
		model = Doc2Vec.load(mdl_fpath)
	else:
		# model = Doc2Vec(read_files(fs.listf(spdr.ABS_PATH, full_path=True)), size=n_components, window=win_size, min_count=min_t, workers=opts.np)
		model = Doc2Vec(read_prcsed_files(fs.listf(os.path.join(spdr.DATA_PATH, 'lem'), full_path=True)), size=n_components, window=win_size, min_count=min_t, workers=opts.np)
		model.save(os.path.join(spdr.DATA_PATH, mdl_fname))
		
	X, Y = spdr.get_data(None, ft_type=opts.type, max_df=ast.literal_eval(opts.maxdf), min_df=ast.literal_eval(opts.mindf), from_file=True, fmt=opts.fmt, spfmt=opts.spfmt)
	# Map the index of the original matrix to that of the paragraph vectors,
	# removing the '.lem' suffix explicitly (str.rstrip would strip individual characters)
	doctags = [model.docvecs.index_to_doctag(i) for i in range(model.docvecs.count)]
	d2v_idx = [tag[:-len('.lem')] if tag.endswith('.lem') else tag for tag in doctags]
	mms = MinMaxScaler()
	d2v_X = pd.DataFrame(mms.fit_transform(model.docvecs[range(model.docvecs.count)]), index=d2v_idx, columns=['d2v_%i' % i for i in range(model.docvecs[0].shape[0])])
	# d2v_X = pd.DataFrame(model.docvecs[range(model.docvecs.count)], index=d2v_idx, columns=['d2v_%i' % i for i in range(model.docvecs[0].shape[0])])
	new_X = pd.concat([X, d2v_X], axis=1, join_axes=[X.index])
	print 'The size of data has been changed from %s to %s.' % (X.shape, new_X.shape)
	if (opts.fmt == 'npz'):
		io.write_df(d2v_X, os.path.join(spdr.DATA_PATH, 'd2v_X.npz'), with_idx=True, sparse_fmt=opts.spfmt, compress=True)
		io.write_df(new_X, os.path.join(spdr.DATA_PATH, 'cmb_d2v_X.npz'), with_idx=True, sparse_fmt=opts.spfmt, compress=True)
	else:
		d2v_X.to_csv(os.path.join(spdr.DATA_PATH, 'd2v_X.csv'), encoding='utf8')
		new_X.to_csv(os.path.join(spdr.DATA_PATH, 'cmb_d2v_X.csv'), encoding='utf8')
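The gensim calls above (size=, model.docvecs, index_to_doctag) follow the pre-4.0 API. A minimal sketch of the same steps under gensim >= 4.0, with two hypothetical documents:

from gensim.models.doc2vec import TaggedDocument, Doc2Vec

docs = [TaggedDocument(words=['cell', 'growth'], tags=['pmid1']),
        TaggedDocument(words=['tumor', 'suppressor'], tags=['pmid2'])]
# vector_size/epochs replace the old size/iter parameters
model = Doc2Vec(docs, vector_size=10, window=8, min_count=1, workers=1, epochs=5)
tags = list(model.dv.index_to_key)          # model.dv replaces model.docvecs
vectors = [model.dv[tag] for tag in tags]
print(tags, len(vectors[0]))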
Example #8
def pred2cor(dir_path, file_ptn, mdls, pids=range(10), crsval=10):
	import scipy.stats as stats
	from chm_annot import pred_ovl
	import bionlp.util.math as imath
	for pid in pids:
		crsval_povl, crsval_spearman = [[] for i in range(2)]
		for crs_t in xrange(crsval):
			preds, true_lb = [], None
			for mdl in mdls:
				mdl = mdl.replace(' ', '_').lower()
				file = file_ptn.replace('#CRST#', str(crs_t)).replace('#MDL#', mdl).replace('#PID#', str(pid))
				npz_file = io.read_npz(os.path.join(dir_path, file))
				preds.append(npz_file['pred_lb'])
				true_lb = npz_file['true_lb']
			preds_mt = np.column_stack([x.ravel() for x in preds])
			preds.append(true_lb)
			tpreds_mt = np.column_stack([x.ravel() for x in preds])
			crsval_povl.append(pred_ovl(preds_mt, true_lb.ravel()))
			crsval_spearman.append(stats.spearmanr(tpreds_mt))
		povl_avg = np.array(crsval_povl).mean(axis=0).round()
		spmnr_avg = np.array([crsp[0] for crsp in crsval_spearman]).mean(axis=0)
		spmnr_pval = np.array([crsp[1] for crsp in crsval_spearman]).mean(axis=0)
		povl_idx = list(imath.subset(mdls, min_crdnl=1))
		povl_avg_df = pd.DataFrame(povl_avg, index=povl_idx, columns=['pred_ovl', 'tpred_ovl'])
		spmnr_avg_df = pd.DataFrame(spmnr_avg, index=mdls+['Annotations'], columns=mdls+['Annotations'])
		spmnr_pval_df = pd.DataFrame(spmnr_pval, index=mdls+['Annotations'], columns=mdls+['Annotations'])
		povl_avg_df.to_excel(os.path.join(dir_path, 'cpovl_avg_%s.xlsx' % pid))
		spmnr_avg_df.to_excel(os.path.join(dir_path, 'spmnr_avg_%s.xlsx' % pid))
		spmnr_pval_df.to_excel(os.path.join(dir_path, 'spmnr_pval_%s.xlsx' % pid))
		io.write_df(povl_avg_df, os.path.join(dir_path, 'povl_avg_%s.npz' % pid), with_idx=True)
		io.write_df(spmnr_avg_df, os.path.join(dir_path, 'spmnr_avg_%s.npz' % pid), with_idx=True)
		io.write_df(spmnr_pval_df, os.path.join(dir_path, 'spmnr_val_%s.npz' % pid), with_idx=True)
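With more than two columns, scipy.stats.spearmanr returns full correlation and p-value matrices, which is what the spmnr_avg/spmnr_pval frames above rely on. A tiny sketch with hypothetical prediction vectors:

import numpy as np
import scipy.stats as stats

# three hypothetical prediction vectors (columns) over the same instances
tpreds_mt = np.column_stack([np.random.rand(50) for _ in range(3)])
rho, pval = stats.spearmanr(tpreds_mt)      # with >2 columns, both are 3x3 matrices
print(rho.shape, pval.shape)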
Example #9
def leave1out(spdr, mltl=True):
	data_path = spdr.DATA_PATH
	mt_file = 'X.npz', 'Y.npz'
	ft_order = ['lem', 'nn', 'ner', 'parse', 'vc', 'mesh', 'chem']
	import bionlp.util.io as io
	X, Y = io.read_df(os.path.join(data_path, mt_file[0]), with_idx=True, sparse_fmt='csr'), io.read_df(os.path.join(data_path, mt_file[1]), with_idx=True, sparse_fmt='csr')
	ft_dict = {}
	for col in X.columns:
		for ft in ft_order:
			if (col.startswith(ft+'_')):
				ft_dict.setdefault(ft, []).append(col)
				break
	if (mltl):
		for ft in ft_order:
			new_X = X.drop(ft_dict[ft], axis=1)
			io.write_df(new_X, os.path.join(data_path, 'l1o_%s_X.npz'%ft), sparse_fmt='csr', compress=True)
	else:
		for i in range(Y.shape[1]):
			X_i = io.read_df(os.path.join(data_path, 'X_%i.npz'%i), sparse_fmt='csr')
			for ft in ft_order:
				new_X = X_i.drop(ft_dict[ft], axis=1)
				io.write_df(new_X, os.path.join(data_path, 'l1o_%s_X_%i.npz'%(ft,i)), sparse_fmt='csr', compress=True)
Example #10
def expand_data(ft_type='binary', db_name='mesh2016', db_type='LevelDB', store_path='store'):
	from rdflib import Graph
	from bionlp.util import ontology
	
	X, Y = spdr.get_data(None, ft_type=opts.type, max_df=ast.literal_eval(opts.maxdf), min_df=ast.literal_eval(opts.mindf), from_file=True, fmt=opts.fmt, spfmt=opts.spfmt)
	mesh_cols = filter(lambda x: x.startswith('mesh_') or x.startswith('extmesh_'), X.columns)
	mesh_X = X.loc[:,mesh_cols]
	exp_meshx = set([])
	ext_meshx_dict = {}
	g = Graph(store=db_type, identifier=db_name)
	g.open(store_path)
	for col in mesh_X.columns:
		# Remove the column prefix (str.strip would drop individual characters, not the prefix)
		mesh_lb = (col[len('extmesh_'):] if col.startswith('extmesh_') else col[len('mesh_'):]).replace('"', '\\"')
		# Get similar MeSH terms
		em_set = set(ontology.slct_sim_terms(g, mesh_lb, prdns=[('meshv',ontology.MESHV)], eqprds=ontology.MESH_EQPRDC_MAP))
		# Overall extended MeSH terms
		exp_meshx |= em_set
		# Extended MeSH terms per column
		ext_meshx_dict[col] = em_set
	g.close()
	exp_mesh_X = pd.DataFrame(np.zeros((mesh_X.shape[0], len(exp_meshx)), dtype='int8'), index=X.index, columns=['expmesh_%s' % w for w in exp_meshx])
	# Append the similar MeSH terms of each column to the final matrix
	for col, sim_mesh in ext_meshx_dict.iteritems():
		if (len(sim_mesh) == 0): continue
		sim_cols = ['expmesh_%s' % w for w in sim_mesh]
		if (ft_type == 'binary'):
			exp_mesh_X.loc[:,sim_cols] = np.logical_or(exp_mesh_X.loc[:,sim_cols], mesh_X.loc[:,col].reshape((-1,1))).astype('int')
		elif (ft_type == 'numeric'):
			exp_mesh_X.loc[:,sim_cols] += mesh_X.loc[:,col].reshape((-1,1))
		elif (ft_type == 'tfidf'):
			pass
	new_X = pd.concat([X, exp_mesh_X], axis=1, join_axes=[X.index])
	print 'The size of data has been changed from %s to %s.' % (X.shape, new_X.shape)
	if (opts.fmt == 'npz'):
		io.write_df(new_X, os.path.join(spdr.DATA_PATH, 'exp_X.npz'), with_idx=True, sparse_fmt=opts.spfmt, compress=True)
	else:
		new_X.to_csv(os.path.join(spdr.DATA_PATH, 'exp_X.csv'), encoding='utf8')
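The broadcast used to propagate one binary MeSH column into several expanded columns is sketched below with hypothetical data; note that Series.reshape is no longer available in newer pandas, so the column is broadcast via .values.reshape(-1, 1):

import numpy as np
import pandas as pd

mesh_col = pd.Series([1, 0, 1], index=['pmid1', 'pmid2', 'pmid3'])
exp = pd.DataFrame(np.zeros((3, 2), dtype='int8'), index=mesh_col.index,
                   columns=['expmesh_a', 'expmesh_b'])
sim_cols = ['expmesh_a', 'expmesh_b']
# OR the source column into every similar-term column at once
exp.loc[:, sim_cols] = np.logical_or(exp[sim_cols].values, mesh_col.values.reshape(-1, 1)).astype('int8')
print(exp)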
Example #11
def _pred2event(spdr_mod,
                combined,
                pred_fpath,
                data_path,
                test_X_paths=['cbow/dev_X%i' % i for i in range(4)],
                train_Y_path='cbow/train_Y',
                method='cbow',
                source='2011',
                task='bgi'):
    pred = io.read_npz(pred_fpath)['pred_lb']
    if (combined):
        event_mt = np.column_stack(
            [pred[:, i] for i in range(0, pred.shape[1], 2)])
        dir_mt = np.column_stack(
            [pred[:, i] for i in range(1, pred.shape[1], 2)])
    else:
        evnt_num = pred.shape[1] // 2  # integer division: first half = events, second half = directions
        event_mt = pred[:, :evnt_num]
        dir_mt = pred[:, evnt_num:]
    np.place(dir_mt, dir_mt == 0, [-1])
    event_mt *= dir_mt
    test_Xs = [pd.read_hdf(data_path, dspath) for dspath in test_X_paths]
    train_Y = pd.read_hdf(data_path, train_Y_path)
    test_Y = pd.DataFrame(event_mt,
                          index=test_Xs[0].index,
                          columns=train_Y.columns)
    io.write_df(test_Y,
                'test_Y',
                with_idx=True,
                sparse_fmt='csr',
                compress=True)
    events = spdr_mod.pred2data(test_Y,
                                method=method,
                                source=source,
                                task=task)
    spdr_mod.to_a2(events, './pred', source=source, task=task)
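A minimal numpy sketch of the interleaved-column split and sign flip performed above (hypothetical 0/1 predictions; even columns = event presence, odd columns = direction):

import numpy as np

pred = np.array([[1, 0, 0, 1],
                 [1, 1, 1, 0]])
event_mt = pred[:, 0::2].copy()
dir_mt = pred[:, 1::2].copy()
np.place(dir_mt, dir_mt == 0, [-1])   # recode direction 0 as -1
event_mt *= dir_mt                    # signed events: -1, 0, or +1
print(event_mt)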
Example #12
def get_data(articles, from_file=None, ft_type='binary', max_df=1.0, min_df=1, fmt='npz', spfmt='csr'):
	# Read from local files
	if (from_file):
		if (type(from_file) == bool):
			file_name = 'X.npz' if (fmt == 'npz') else 'X.csv'
		else:
			file_name = from_file
		print 'Reading file: %s and Y.%s' % (file_name, fmt)
		if (fmt == 'npz'):
			return io.read_df(os.path.join(DATA_PATH, file_name), with_idx=True, sparse_fmt=spfmt), io.read_df(os.path.join(DATA_PATH, 'Y.npz'), with_idx=True, sparse_fmt=spfmt)
		else:
			return pd.read_csv(os.path.join(DATA_PATH, file_name), index_col=0, encoding='utf8'), pd.read_csv(os.path.join(DATA_PATH, 'Y.csv'), index_col=0, encoding='utf8')
	## Feature columns
	ft_pmid, ft_abs, ft_lem, ft_nnv, ft_ner, ft_parse, ft_vc, ft_mesh, ft_chem, label = [[] for i in range(10)]
	ft_order = ['lem', 'nn', 'ner', 'parse', 'vc', 'mesh', 'chem']
	ft_name = {'lem':'LBoW', 'nn':'N-Bigrams', 'ner':'NE', 'parse':'GR', 'vc':'VC', 'mesh':'MeSH', 'chem':'Chem'}
	ft_dic = {'lem':ft_lem, 'ner':ft_ner, 'parse':ft_parse, 'vc':ft_vc, 'mesh':ft_mesh, 'chem':ft_chem}
	hm_lb = ['PS', 'GS', 'CD', 'RI', 'A', 'IM', 'GI', 'TPI', 'CE', 'ID']
	bft_dic, hm_stat = [{} for i in range(2)]
	label_set = set()
#	sent_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
	for artcl in articles:
		ft_pmid.append(artcl['id'])
		ft_abs.append(artcl['abs'])
		ft_mesh.append(artcl['mesh'])
		ft_chem.append(artcl['chem'])
		label.append(artcl['annots'])
		label_set.update(artcl['annots'])
		c = Counter(artcl['annots'])
		for hm, num in c.iteritems():
			hs = hm_stat.setdefault(hm, [0,0])
			hs[0], hs[1] = hs[0] + 1, hs[1] + num
#			hs[0], hs[1] = hs[0] + 1, hs[1] + len(sent_splitter.tokenize(artcl['abs'].strip()))
#	uniq_lb = list(label_set)
	uniq_lb = [IHM_MAP[lb] for lb in hm_lb]
	
	## Get the feature sets for each hallmark
#	feat_sets = get_fsnames()
	feat_sets = ft_order
	feature_sets, feat_stat = get_featsets(feat_sets, len(uniq_lb))
	
	ft_stat_mt = np.array([feat_stat[ft] for ft in ft_order]).T
	ft_stat_pd = pd.DataFrame(ft_stat_mt, index=hm_lb, columns=[ft_name[fset] for fset in feat_sets])
	hm_stat_pd = pd.DataFrame([hm_stat[lb] for lb in uniq_lb], index=hm_lb, columns=['No. abstracts', 'No. sentences'])
	if (fmt == 'npz'):
		io.write_df(ft_stat_pd, os.path.join(DATA_PATH, 'ft_stat.npz'))
		io.write_df(hm_stat_pd, os.path.join(DATA_PATH, 'hm_stat.npz'), with_idx=True)
	else:
		ft_stat_pd.to_csv(os.path.join(DATA_PATH, 'ft_stat.csv'), encoding='utf8')
		hm_stat_pd.to_csv(os.path.join(DATA_PATH, 'hm_stat.csv'), encoding='utf8')
		
	## Extract the features from the preprocessed data
	for i in range(len(feat_sets)):
		fset = feat_sets[i]
		feature_set = feature_sets[i]
		if (fset == 'chem' or fset == 'mesh'):
			continue
		if (fset == 'nn'):
			continue
		for pmid in ft_pmid:
			feature, extra_feat = [[], []]
			prev_term = ''
			for line in fs.read_file(os.path.join(DATA_PATH, fset, '.'.join([pmid, fset, 'txt'])), 'utf8'):
				if (line == '~~~'):
					continue
				if (fset == 'lem'):
					if (line == '.	.	.' or line == '~~~	~~~' or line == ',	,	,'):
						continue
					items = line.split()
					if (len(items) < 3): # Skip the unrecognized words
						continue
					feature.append(items[2].lower())
					# Extract NN feature
					if (items[1] == 'NN'):
						if (prev_term != ''):
							extra_feat.append(prev_term + ' ' + items[0].lower())
						prev_term = items[0].lower()
					else:
						prev_term = ''
				if (fset == 'ner'):
					feature.append(line)
				if (fset == 'parse'):
					record = line.strip('()').replace(' _ ', ' ').split()
					feature.append(','.join([w.split('_')[0] for w in record]).lower())
				if (fset == 'vc'):
					feature.extend(line.split())
			ft_dic[fset].append(feature)
			if (fset == 'lem'):
				ft_nnv.extend(extra_feat)

	## Convert the raw features into binary features
	ft_type = ft_type.lower()
	for i in range(len(feat_sets)):
		fset = feat_sets[i]
		feature_set = feature_sets[i]
		if (fset == 'nn'):
			bigram_vectorizer = CountVectorizer(ngram_range=(2, 2), token_pattern=r'\b\w+\b', max_df=max_df, min_df=min_df, vocabulary=set(ft_nnv), binary=True if ft_type=='binary' else False)
			ft_nn = bigram_vectorizer.fit_transform(ft_abs).tocsr()
			nn_classes = [cls[0] for cls in sorted(bigram_vectorizer.vocabulary_.items(), key=operator.itemgetter(1))]
			bft_dic[fset] = (ft_nn, nn_classes)
			continue
#		overall_ft = list(set([ft for samp in ft_dic[fset] for ft in samp if ft]))
#		mlb = MultiLabelBinarizer(classes=overall_ft)
#		bft_dic[fset] = (mlb.fit_transform(ft_dic[fset]), mlb.classes_)
		count_vectorizer = CountVectorizer(tokenizer=lambda text: [t for t in text.split('*#@') if t and t not in string.punctuation], lowercase=False, stop_words='english', token_pattern=r'\b\w+\b', max_df=max_df, min_df=min_df, binary=True if ft_type=='binary' else False)
		ft_all = count_vectorizer.fit_transform(['*#@'.join(samp) for samp in ft_dic[fset]])
		all_classes = [cls[0] for cls in sorted(count_vectorizer.vocabulary_.items(), key=operator.itemgetter(1))]
		bft_dic[fset] = (ft_all, all_classes)
	
	## Convert the annotations of each document to binary labels
	mlb = MultiLabelBinarizer(classes=uniq_lb)
	bin_label = (mlb.fit_transform(label), mlb.classes_)
	
	## Combine the features and the labels to form the complete dataset
	feat_mt = sp.sparse.hstack([bft_dic[fset][0] for fset in ft_order])
	if (ft_type == 'tfidf'):
		transformer = TfidfTransformer(norm='l2', sublinear_tf=False)
		feat_mt = transformer.fit_transform(feat_mt)
	feat_cols = ['%s_%s' % (fset, w) for fset in ft_order for w in bft_dic[fset][1]]
	feat_df = pd.DataFrame(feat_mt.todense(), index=ft_pmid, columns=feat_cols)
	label_df = pd.DataFrame(bin_label[0], index=ft_pmid, columns=bin_label[1])
	
	obj_samp_idx = np.random.random_integers(0, feat_df.shape[0] - 1, size=200).tolist()
	ft_samp_idx = np.random.random_integers(0, feat_df.shape[1] - 1, size=1000).tolist()
	samp_feat_df = feat_df.iloc[obj_samp_idx, ft_samp_idx]
	samp_lb_df = label_df.iloc[obj_samp_idx,:]

	if (fmt == 'npz'):
		io.write_df(feat_df, os.path.join(DATA_PATH, 'X.npz'), with_idx=True, sparse_fmt=spfmt, compress=True)
		io.write_df(label_df, os.path.join(DATA_PATH, 'Y.npz'), with_idx=True, sparse_fmt=spfmt, compress=True)
		io.write_df(samp_feat_df, os.path.join(DATA_PATH, 'sample_X.npz'), with_idx=True, sparse_fmt=spfmt, compress=True)
		io.write_df(samp_lb_df, os.path.join(DATA_PATH, 'sample_Y.npz'), with_idx=True, sparse_fmt=spfmt, compress=True)
	else:
		feat_df.to_csv(os.path.join(DATA_PATH, 'X.csv'), encoding='utf8')
		label_df.to_csv(os.path.join(DATA_PATH, 'Y.csv'), encoding='utf8')
		samp_feat_df.to_csv(os.path.join(DATA_PATH, 'sample_X.csv'), encoding='utf8')
		samp_lb_df.to_csv(os.path.join(DATA_PATH, 'sample_Y.csv'), encoding='utf8')
	return feat_df, label_df
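The CountVectorizer call above relies on a sentinel string ('*#@') to keep pre-extracted multi-word terms intact: samples are joined with the sentinel and the custom tokenizer splits on it again. A minimal sketch with hypothetical terms (when a tokenizer is supplied, token_pattern is ignored):

from sklearn.feature_extraction.text import CountVectorizer

samples = [['cell line', 'growth factor'], ['growth factor', 'p53']]
SC = '*#@'
vec = CountVectorizer(tokenizer=lambda text: [t for t in text.split(SC) if t],
                      lowercase=False, binary=True)
mt = vec.fit_transform([SC.join(s) for s in samples])
# multi-word terms survive as single vocabulary entries
print(sorted(vec.vocabulary_.keys()), mt.toarray())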