def prepare_data(path, config, abundance_file="abundance.tsv",
                 labels_file="labels.txt"):
    """Load an abundance table plus labels and build per-sample tree inputs.

    Processing steps, in order:

    1. Read ``path/abundance_file`` (tab-separated, no header row, first
       column used as the index) and ``path/labels_file`` (comma-separated
       strings).
    2. Transpose the table and divide every row by its own row sum.
    3. Integer-encode the labels with ``pd.factorize``.
    4. Filter the table with ``filter_data`` using the
       ``[Evaluation] FilterThresh`` config value.
    5. Load a pickled tree from ``path`` if one can be read, otherwise build
       and prune a new ``Graph`` and pickle it for the next run.
    6. Call ``generate_maps`` once per row of the filtered table, in parallel.
    7. Average the tree-vector entries that share a tree feature name.

    NOTE(review): another ``prepare_data`` with a different signature is
    defined later in this source; if both live in one module, the later
    definition replaces this one at import time.

    Args:
        path: Directory (as a string) holding the input files and the tree
            cache file.
        config: Object exposing ``get(section, option)``; presumably a
            ``configparser.ConfigParser`` -- confirm against the caller.
        abundance_file: File name of the abundance table inside ``path``.
        labels_file: File name of the label file inside ``path``.

    Returns:
        A 9-tuple ``(my_maps, my_benchmark, my_benchmark_tree, features,
        tree_features, labels, label_set, g, features_df)`` where

        * ``my_maps`` stacks item 1 of every ``generate_maps`` result,
        * ``my_benchmark`` stacks item 0 of every result,
        * ``my_benchmark_tree`` stacks item 2 of every result, with entries
          of identically named tree features averaged (one row per sample),
        * ``features`` is the list of column labels left after filtering,
        * ``tree_features`` is the index of unique tree feature names,
        * ``labels`` / ``label_set`` are the codes and uniques returned by
          ``pd.factorize``,
        * ``g`` is the loaded or freshly built tree,
        * ``features_df`` is the value returned by ``get_feature_df``.
    """
    core_filt_thresh = float(config.get('Evaluation', 'FilterThresh'))
    opp_filt_thresh = 0.0

    data = pd.read_csv(path + '/' + abundance_file, index_col=0, sep='\t',
                       header=None)
    labels = np.genfromtxt(path + '/' + labels_file, dtype=np.str_,
                           delimiter=',')

    # After the transpose each row is divided by its own sum, so every row
    # with a non-zero total sums to 1.
    data = data.transpose()
    data = data.divide(data.sum(axis=1), axis=0)

    labels, label_set = pd.factorize(labels)
    data = filter_data(data, labels, core_filt_thresh, opp_filt_thresh)

    features = list(data.columns.values)
    print("There are %d raw features..." % (len(features)))
    features_df = get_feature_df(features)

    print("Building tree structure...")
    tree_path = path + "/PopPhy-tree-" + str(core_filt_thresh) + "-core.pkl"
    try:
        # SECURITY: unpickling can execute arbitrary code; only point `path`
        # at directories whose cache files you trust.
        with open(tree_path, 'rb') as cache_file:
            g = pickle.load(cache_file)
    except Exception:
        # Any failure to read the cache (missing, unreadable or stale file)
        # falls back to rebuilding, as before. Unlike a bare ``except`` this
        # no longer swallows KeyboardInterrupt / SystemExit.
        print("Tree file not found...")
        print("Contsructing tree..")
        g = Graph()
        g.build_graph()
        g.prune_graph(features_df)
        with open(tree_path, 'wb') as cache_file:
            pickle.dump(g, cache_file)
    else:
        print("Found tree file...")

    print("Populating trees...")
    # `num_cores` is expected to be defined at module level.
    results = Parallel(n_jobs=num_cores)(
        delayed(generate_maps)(x, g, features_df) for x in data.values)

    # Each result is indexed as a 3-item sequence. Items are picked one by one
    # instead of via np.take(results, i, 1): the latter first converts the
    # whole result list to an array, which recent NumPy versions reject when
    # the three items differ in shape.
    my_benchmark = np.array([res[0] for res in results])
    my_maps = np.array([res[1] for res in results])
    my_benchmark_tree = np.array([res[2] for res in results])

    # Several tree nodes may share a name: average their values so that each
    # tree feature appears exactly once.
    tree_features = g.graph_vector_features()
    my_benchmark_df = pd.DataFrame(index=tree_features,
                                   data=np.transpose(my_benchmark_tree))
    my_benchmark_df = my_benchmark_df.groupby(my_benchmark_df.index).mean()
    tree_features = my_benchmark_df.index
    my_benchmark_tree = np.transpose(my_benchmark_df.values)

    num_tree_features = len(tree_features)
    print("There are %d tree features..." % (num_tree_features))

    return (my_maps, my_benchmark, my_benchmark_tree, features, tree_features,
            labels, label_set, g, features_df)
def prepare_data(path, config, k, m,
                 abundance_file="pois_t2d_trainabun_1+1.tsv",
                 labels_file="pois_t2d_trainlabel_1+1.txt"):
    """Load an abundance table plus labels and build per-sample tree inputs.

    Processing steps, in order:

    1. Read ``path/abundance_file`` (tab-separated, no header row, first
       column used as the index) and ``path/labels_file`` (comma-separated
       strings).
    2. Transpose the table and divide every row by its own row sum.
    3. Integer-encode the labels with ``pd.factorize``.
    4. Filter the table with ``filter_data`` using the
       ``[Evaluation] FilterThresh`` config value.
    5. Load a pickled tree from ``path`` if one can be read, otherwise build,
       prune and post-process a new ``Graph``. The rebuilt tree is NOT
       written back to disk (the cache write was disabled by the author).
    6. Call ``generate_maps`` once per row of the filtered table, in
       parallel, forwarding ``k`` and ``m``.
    7. Average the tree-vector entries that share a tree feature name.

    Args:
        path: Directory (as a string) holding the input files and, if
            present, the tree cache file.
        config: Object exposing ``get(section, option)``; presumably a
            ``configparser.ConfigParser`` -- confirm against the caller.
        k: Forwarded unchanged to ``generate_maps``; its meaning is defined
            there.
        m: Forwarded unchanged to ``generate_maps``; its meaning is defined
            there.
        abundance_file: File name of the abundance table inside ``path``.
        labels_file: File name of the label file inside ``path``.

    Returns:
        A 9-tuple ``(my_maps, my_benchmark, my_benchmark_tree, features,
        tree_features, labels, label_set, g, features_df)`` where

        * ``my_maps`` stacks item 1 of every ``generate_maps`` result,
        * ``my_benchmark`` stacks item 0 of every result,
        * ``my_benchmark_tree`` stacks item 2 of every result, with entries
          of identically named tree features averaged (one row per sample),
        * ``features`` is the list of column labels left after filtering,
        * ``tree_features`` is the index of unique tree feature names,
        * ``labels`` / ``label_set`` are the codes and uniques returned by
          ``pd.factorize``,
        * ``g`` is the loaded or freshly built tree,
        * ``features_df`` is the value returned by ``get_feature_df``.
    """
    core_filt_thresh = float(config.get('Evaluation', 'FilterThresh'))
    opp_filt_thresh = 0.0

    # Author's note on the default data set (not verified here): the table
    # has 542 rows (microbes) and 232 columns (samples); the first column
    # holds the names, the remaining ones the values.
    data = pd.read_csv(path + '/' + abundance_file, index_col=0, sep='\t',
                       header=None)
    # Author's note: a single row of 232 entries, "n" or "Cirrhosis".
    labels = np.genfromtxt(path + '/' + labels_file, dtype=np.str_,
                           delimiter=',')

    # After the transpose rows are samples and columns are microbe features.
    # Each row is then divided by its own sum (author's note: the raw rows
    # sum to 100).
    data = data.transpose()
    data = data.divide(data.sum(axis=1), axis=0)

    # Author's note: label_set is ['n', 'Cirrhosis']; the first 114 labels
    # encode to 0 and the remaining 118 to 1.
    labels, label_set = pd.factorize(labels)

    # Author's note: filter_data possibly filters and/or reorders the data;
    # on the default data set 542 features shrink to 269 while all 232
    # samples are kept.
    data = filter_data(data, labels, core_filt_thresh, opp_filt_thresh)

    features = list(data.columns.values)
    print("There are %d raw features..." % (len(features)))
    # Author's note: get_feature_df splits every microbe name into its
    # taxonomic ranks (kingdom, phylum, class, order, family, genus, species),
    # giving one table row per feature.
    features_df = get_feature_df(features)

    print("Building tree structure...")
    tree_path = path + "/PopPhy-tree-" + str(core_filt_thresh) + "-core.pkl"
    try:
        # SECURITY: unpickling can execute arbitrary code; only point `path`
        # at directories whose cache files you trust.
        with open(tree_path, 'rb') as cache_file:
            g = pickle.load(cache_file)
    except Exception:
        # Any failure to read the cache (missing, unreadable or stale file)
        # falls back to rebuilding, as before. Unlike a bare ``except`` this
        # no longer swallows KeyboardInterrupt / SystemExit.
        print("Tree file not found...")
        print("Contsructing tree..")
        g = Graph()
        # Author's note: build_graph builds the generic tree from the
        # parenthesised tree file, and prune_graph trims it down to the
        # microbes that actually occur in this data set.
        g.build_graph()
        g.prune_graph(features_df)
        # NOTE(review): the original indentation was lost; these two calls
        # are assumed to belong to the rebuild branch. If so, a tree loaded
        # from an existing cache file does not get them re-applied -- confirm
        # that this is intended.
        g.removeRepeatName()
        g.routeToRoot()
        # Writing the rebuilt tree back to `tree_path` is intentionally
        # disabled in this variant, so the tree is rebuilt on every run
        # unless a cache file already exists.
    else:
        print("Found tree file...")

    print("Populating trees...")
    # data.values yields one row per sample (author's note: 232 rows of 269
    # unnamed feature values). `num_cores` is expected to be defined at
    # module level.
    results = Parallel(n_jobs=num_cores)(
        delayed(generate_maps)(x, g, features_df, k, m) for x in data.values)

    # Each result is indexed as a 3-item sequence. Items are picked one by one
    # instead of via np.take(results, i, 1): the latter first converts the
    # whole result list to an array, which recent NumPy versions reject when
    # the three items differ in shape.
    my_benchmark = np.array([res[0] for res in results])
    my_maps = np.array([res[1] for res in results])
    my_benchmark_tree = np.array([res[2] for res in results])

    # Several tree nodes may share a name: average their values so that each
    # tree feature appears exactly once.
    tree_features = g.graph_vector_features()
    my_benchmark_df = pd.DataFrame(index=tree_features,
                                   data=np.transpose(my_benchmark_tree))
    my_benchmark_df = my_benchmark_df.groupby(my_benchmark_df.index).mean()
    tree_features = my_benchmark_df.index
    my_benchmark_tree = np.transpose(my_benchmark_df.values)

    num_tree_features = len(tree_features)
    print("There are %d tree features..." % (num_tree_features))

    return (my_maps, my_benchmark, my_benchmark_tree, features, tree_features,
            labels, label_set, g, features_df)