def select(self, params=None):
        """Fit a full PCA on the training gene data and print a summary.

        The training frame comes from ``self.data.for_train().gene_data()``.
        When ``params`` contains ``'feature_file'``, the labels loaded from
        that file are used to filter the frame along axis 0 before fitting.
        The frame is transposed before being handed to PCA.

        Printed output: the data shape, the singular values, the first
        principal component with its maximum entry, and the number of leading
        components needed to exceed 90% and 99% of the variance.

        Args:
            params: Optional mapping; only the ``'feature_file'`` key is read.

        Returns:
            None. This method only prints diagnostics.
        """
        from sklearn.decomposition import PCA

        X, _ = self.data.for_train().gene_data()

        if params and 'feature_file' in params:
            # NOTE(review): load_string_data is assumed to return a list-like
            # of row labels accepted by DataFrame.filter -- confirm.
            features = utils.load_string_data(params['feature_file'])
            X = X.filter(items=features, axis=0)

        data = X.values.T
        print('Gene data size {}'.format(data.shape))

        pca = PCA(svd_solver='full')
        pca.fit(data)

        print(pca.singular_values_)
        print(pca.components_[0], np.max(pca.components_[0]))

        # Variance is proportional to the *squared* singular values, so the
        # cumulative share must come from explained_variance_ratio_ rather
        # than from a running sum of the singular values themselves.
        cumulative = np.cumsum(pca.explained_variance_ratio_)
        for threshold, percent in ((0.9, 90), (0.99, 99)):
            # Index of the first cumulative share strictly above threshold.
            first_over = int(np.searchsorted(cumulative, threshold, side='right'))
            # Clamp so rounding can never report more components than exist.
            count = min(first_over + 1, len(cumulative))
            print('{} percent variance captured by {} vectors'.format(percent, count))
    def select(self, params=None):
        """Populate ``self.features`` and ``self.current_features``.

        When ``params`` is non-empty and contains ``'feature_file'``, the
        features are loaded from that file with ``utils.load_string_data``;
        otherwise they are the list of ``self.data.genes()``.

        Args:
            params: Optional mapping; only the ``'feature_file'`` key is read.
                Defaults to ``None`` (previously a shared mutable ``{}``),
                which is treated exactly like an empty mapping.
        """
        if params and 'feature_file' in params:
            self.features = utils.load_string_data(params['feature_file'])
        else:
            self.features = list(self.data.genes())

        # Separate list object so later edits to the working selection do not
        # alter the full feature list.
        self.current_features = list(self.features)
    def select(self, params=None):
        """Build ``self.features`` from a feature file or from all genes.

        If ``params`` is non-empty and has a ``'feature_file'`` entry, the
        names are read from that file via ``utils.load_string_data``;
        otherwise every entry of ``self.data.genes()`` is used. The names are
        then passed through ``self._create_featurelist``.

        Args:
            params: Optional mapping; only the ``'feature_file'`` key is read.
        """
        has_file = bool(params) and 'feature_file' in params
        if not has_file:
            names = list(self.data.genes())
        else:
            names = utils.load_string_data(params['feature_file'])

        self.features = self._create_featurelist(names)
Ejemplo n.º 4
0
def _components_for_variance(variance_ratio, threshold):
	"""Return how many leading components are needed to exceed `threshold`.

	Args:
		variance_ratio: Per-component share of the total variance, in
			component order.
		threshold: Cumulative share that must be strictly exceeded.

	Returns:
		The smallest component count whose cumulative share is above
		`threshold`, capped at the number of components available.
	"""
	cumulative = np.cumsum(variance_ratio)
	first_over = int(np.searchsorted(cumulative, threshold, side='right'))
	return min(first_over + 1, len(cumulative))


def compare_genes(dir, prefix):
	"""Run a full PCA on the training data restricted to one gene file.

	Loads the names stored in the file ``dir/prefix``, filters the training
	gene frame (loaded from ``'../data/'``) to those labels along axis 0,
	fits a PCA on the transposed values and prints: the number of distinct
	names, the first 20 singular values, and the number of components needed
	to exceed 90% and 99% of the variance.

	Args:
		dir: Directory that contains the file.
		prefix: Name of the file inside `dir` (used as a full file name here).

	Returns:
		None. This function only prints diagnostics.
	"""
	from sklearn.decomposition import PCA

	# NOTE(review): load_string_data is assumed to return a NumPy array
	# (the code reads `.shape`); a 0-d array means a single stored name.
	value = utils.load_string_data(os.path.join(dir, prefix))

	sets = set()
	if len(value.shape) > 0:
		sets.update(value)
	else:
		# A 0-d array cannot be iterated, so add its single name directly.
		sets.add(str(value))

	print(len(sets))

	ds = Dataset('../data/')
	ds.load_gene_data()
	X, _ = ds.for_train().gene_data()
	# DataFrame.filter documents `items` as list-like; pass a list, not a set.
	X = X.filter(items=list(sets), axis=0)
	data = X.values.T

	pca = PCA(svd_solver='full')
	pca.fit(data)

	print(pca.singular_values_[0:20])

	# Variance is proportional to the squared singular values, so the shares
	# come from explained_variance_ratio_, not from the singular values.
	for threshold, percent in ((0.9, 90), (0.99, 99)):
		count = _components_for_variance(pca.explained_variance_ratio_, threshold)
		print('{} percent variance captured by {} vectors'.format(percent, count))
Ejemplo n.º 5
0
def check_cov(dir, prefix):
	"""Print covariance diagnostics for the genes listed in matching files.

	Every file in `dir` whose name starts with `prefix` is loaded and its
	names are merged into one set (the running set size is printed after
	each file, then the total number of names read). The training gene frame
	(loaded from ``'../data/'``) is filtered to those labels along axis 0 and
	the covariance between its rows is computed. Printed in order: the
	filtered shape, the covariance shape, the sorted variances (diagonal),
	the row order by increasing summed absolute off-diagonal covariance,
	those sorted sums, and the row labels in that order.

	Args:
		dir: Directory to scan.
		prefix: File-name prefix selecting which files to load.

	Returns:
		None. This function only prints diagnostics. As a side effect it
		changes NumPy's global print options so arrays print untruncated.
	"""
	import sys

	sets = set()
	alt = 0
	for f in os.listdir(dir):
		if not f.startswith(prefix):
			continue
		# NOTE(review): load_string_data is assumed to return a NumPy array
		# (the code reads `.shape`); a 0-d array means a single stored name.
		value = utils.load_string_data(os.path.join(dir, f))

		if len(value.shape) > 0:
			sets.update(value)
			alt += value.shape[0]
		else:
			sets.add(str(value))
			alt += 1

		print(len(sets))

	print('Total', alt)

	# NaN is rejected as a threshold by current NumPy; sys.maxsize is the
	# documented way to request untruncated output.
	np.set_printoptions(threshold=sys.maxsize, linewidth=sys.maxsize)

	ds = Dataset('../data/')
	ds.load_gene_data()
	X, _ = ds.for_train().gene_data()
	fil = X.filter(items=list(sets), axis=0)
	print(fil.shape)

	# atleast_2d keeps the single-row case (0-d covariance) indexable.
	cov = np.atleast_2d(np.cov(fil.values))
	print(cov.shape)

	print(np.sort(np.diag(cov)))

	# Zero the variances so only cross-covariances contribute to the sums.
	np.fill_diagonal(cov, 0)

	off_diagonal = np.sum(np.abs(cov), axis=1)
	sort_ind = np.argsort(off_diagonal)
	print(sort_ind)
	print(np.sort(off_diagonal))

	# Take labels from the filtered frame itself: its rows are what the
	# covariance matrix was built from, whereas the name set may contain
	# labels absent from the data and has no defined order.
	trim_set = np.asarray(fil.index)[sort_ind]
	print(trim_set)
Ejemplo n.º 6
0
def combine_data(dir, prefix, saveto):
	"""Merge the contents of all matching files into one de-duplicated array.

	Each file in `dir` whose name starts with `prefix` is loaded with
	``utils.load_string_data``. Lists/arrays are flattened one level deep
	(nested lists/arrays contribute their elements, other entries are added
	as-is); any other loaded value is added as a single element. The union
	is written with ``utils.save_string_data``.

	Args:
		dir: Directory to scan.
		prefix: File-name prefix selecting which files to load.
		saveto: Destination passed to ``utils.save_string_data``.
	"""
	names = os.listdir(dir)
	containers = (list, np.ndarray)
	merged = set()

	for fname in names:
		if not fname.startswith(prefix):
			continue

		loaded = utils.load_string_data(os.path.join(dir, fname))
		if not isinstance(loaded, containers):
			merged.add(loaded)
			continue

		for entry in loaded:
			if isinstance(entry, containers):
				merged.update(entry)
			else:
				merged.add(entry)

	utils.save_string_data(saveto, np.array(list(merged)))
Ejemplo n.º 7
0
def clean_genes(ds):
	"""Replace non-'cg' entries in stored feature sets with random genes.

	Loads the feature sets from ``'random_selected_features_0'``. For each
	set, entries starting with ``'cg'`` are kept and every other entry is
	replaced by a gene drawn at random from ``ds.genes()`` that is not
	already kept, so each set keeps its original length. The result is
	saved to ``'random_selected_features_0-clean'``.

	Args:
		ds: Object exposing ``genes()``; the number of distinct genes is
			printed.

	Raises:
		ValueError: If a set needs more replacements than there are unused
			genes (raised by ``np.random.choice`` with ``replace=False``).
	"""
	genes = set(ds.genes())
	print(len(genes))
	data = utils.load_string_data('random_selected_features_0')
	new_data = []
	for d in data:
		good = [v for v in d if v.startswith('cg')]
		wanted = d.shape[0] - len(good)

		if wanted > 0:
			small = list(genes.difference(good))
			# Draw without replacement: the candidate pool already excludes
			# the kept genes, and sampling with replacement could still
			# insert the same gene twice into one feature set.
			picks = np.random.choice(len(small), wanted, replace=False)
			good.extend(small[w] for w in picks)

		new_data.append(good)

	utils.save_string_data('random_selected_features_0-clean', np.array(new_data))