def load_genusData(gfilename):
    """Load a genus table and convert it to per-column percentages.

    The file is read with ``csvReader.csv_reader``.  Row 0 is used as the
    header; every following row describes one genus, with the genus name in
    column 0, its type in column 1 and one value per header label from
    column 2 onwards.

    Returns:
        A 4-tuple ``(nrows, x_data, genus_dic, bottom_data)`` where

        * ``nrows`` is the row count reported by the reader,
        * ``x_data`` is the header row from column 2 onwards,
        * ``genus_dic`` maps each row number to a dict with the keys
          ``'type'``, ``'rate'`` and ``'name'``; ``'rate'`` holds the row's
          values divided by the column totals and multiplied by 100,
        * ``bottom_data`` maps each row number to the running column totals
          up to and including that row, scaled the same way; key ``0`` is a
          list of zeros.
    """
    csv_list, nrows, ncols = csvReader.csv_reader(gfilename)
    x_data = csv_list[0][2:]

    running_total = [0] * len(x_data)
    genus_dic = {}
    bottom_data = {0: [0] * len(x_data)}
    data_rows = range(1, nrows)

    # First pass: keep the raw values and accumulate the column totals.
    for row_num in data_rows:
        record = csv_list[row_num]
        values = record[2:]
        genus_dic[row_num] = {
            'type': record[1],
            'rate': values,
            'name': record[0],
        }
        # A new list is built on every row, so each bottom_data entry keeps
        # its own snapshot of the running total.
        running_total = [acc + value
                         for acc, value in zip(running_total, values)]
        bottom_data[row_num] = running_total

    # After the loop the running total is the total of every column.
    column_totals = running_total

    def as_percent(values):
        # NOTE(review): there is no guard for a column total of zero, and
        # under Python 2 "/" truncates when both operands are ints --
        # confirm that the reader yields floats.
        return [value / total * 100
                for value, total in zip(values, column_totals)]

    # Second pass: convert to percent.
    for row_num in data_rows:
        genus_dic[row_num]['rate'] = as_percent(genus_dic[row_num]['rate'])
        bottom_data[row_num] = as_percent(bottom_data[row_num])

    return nrows, x_data, genus_dic, bottom_data
def load_bacterium(genus_filename, group_filename):
    """Build a sample-major data set from a genus table and a group file.

    The genus table is read with ``csvReader.csv_reader`` and the group file
    with ``load_groupData``.  Columns 2 .. ``ncols - 1`` of the genus table
    are treated as samples; the header row (row 0) supplies their names and
    rows 1 .. ``nrows - 1`` supply their values.

    Returns:
        A dict of NumPy arrays:

        * ``'data'`` -- one row per sample, one column per table row,
        * ``'sample'`` -- the sample names taken from the header,
        * ``'group_id'`` -- the group index of every sample,
        * ``'group_names'`` -- the group names returned by ``load_groupData``.

    Raises:
        KeyError: if a sample named in the header is missing from the group
            mapping.
    """
    csv_list, nrows, ncols = csvReader.csv_reader(genus_filename)
    group, group_names = load_groupData(group_filename)

    header = csv_list[0]
    sample_names = []
    group_id = []
    sample_data = []
    for col_num in range(2, ncols):
        sample_name = header[col_num]
        group_id.append(group[sample_name])
        sample_names.append(sample_name)
        # Walk down the column to collect this sample's value for every row.
        sample_data.append(
            [csv_list[row_num][col_num] for row_num in range(1, nrows)])

    return {
        'data': np.asarray(sample_data),
        'sample': np.asarray(sample_names),
        'group_id': np.asarray(group_id),
        'group_names': np.asarray(group_names),
    }
def get_genusSize(gfilename):
    """Map every genus name to its row of per-sample values.

    The file is read with ``csvReader.csv_reader``.  Row 0 is skipped; for
    each later row, column 0 is the key and the values from column 2 onwards
    are stored.  If a name occurs more than once, the last row wins.
    """
    csv_list, nrows, ncols = csvReader.csv_reader(gfilename)
    data_rows = (csv_list[row_num] for row_num in range(1, nrows))
    return {row[0]: row[2:] for row in data_rows}
def load_groupData(group_filename):
    """Read a sample-to-group file and number the groups.

    The file is read with ``csvReader.csv_reader``.  Every row (the first
    one included) is used: column 0 is the sample name and column 1 the
    group label.  Labels are lower-cased before comparison.

    Returns:
        A pair ``(group, group_names)``.  ``group_names`` lists the distinct
        lower-cased labels in order of first appearance; ``group`` maps each
        sample name to the index of its label in ``group_names``.
    """
    csv_list, nrows, ncols = csvReader.csv_reader(group_filename)

    group = {}
    group_names = []
    for row_num in range(nrows):
        record = csv_list[row_num]
        label = record[1].lower()
        if label not in group_names:
            group_names.append(label)
        group[record[0]] = group_names.index(label)
    return group, group_names
def kinship(dir):
    """Collect paired father/son entries for the first ``dir`` indices.

    For every ``s`` in ``range(dir)`` the CSV located by
    ``getCSV(rootdir, s)`` is read with ``csv_reader``.  In each of its rows
    the father is at index 0 and the son at index 1; ``folderFind`` is asked
    for the entries belonging to each of them.  A pair is used only when
    both lookups return something.  The shorter of the two results is padded
    in place with copies of its own first element so that both have the same
    length.

    The number of collected father entries is printed.

    Returns:
        ``[fathers, sons]`` -- two flat lists of equal length, where
        position ``n`` in each list belongs to the same father/son pairing.
    """
    father_groups = []
    son_groups = []

    for s in range(dir):
        fam_list = csv_reader(getCSV(rootdir, s))
        for pair in fam_list:
            father_items = folderFind(rootdir, s, pair[0])
            son_items = folderFind(rootdir, s, pair[1])
            if not (len(father_items) > 0 and len(son_items) > 0):
                continue

            # Positive: the son side is short; negative: the father side is.
            surplus = len(father_items) - len(son_items)
            if surplus > 0:
                son_items.extend([son_items[0]] * surplus)
            elif surplus < 0:
                father_items.extend([father_items[0]] * -surplus)

            # NOTE(review): these appends are taken to be inside the
            # "both non-empty" guard; confirm against the original layout.
            father_groups.append(father_items)
            son_groups.append(son_items)

    # Flatten the per-pair lists.
    dads = [item for chunk in father_groups for item in chunk]
    sons = [item for chunk in son_groups for item in chunk]
    print(len(dads))

    # The son side is indexed by the father count, exactly like the
    # element-by-element copy it replaces.
    return [list(dads), [sons[n] for n in range(len(dads))]]
def load_speciesData(sfilename, gfilename):
    """Load a species table and scale it by the size of each genus.

    The species file is read with ``csvReader.csv_reader``; row 0 is the
    header and every later row holds the genus name in column 0, the species
    name in column 1 and one value per header label from column 2 onwards.
    Genus sizes come from ``get_genusSize(gfilename)``.

    Species are numbered from 1 within their own genus, in file order.  The
    number is derived from how many species of that genus have been seen so
    far, so rows of one genus do not have to be adjacent in the file.  (A
    single counter shared by all genera produced wrong indices -- overwritten
    entries or a ``KeyError`` on the previous stack level -- as soon as the
    rows of a genus were interleaved with those of another.)

    Returns:
        A 4-tuple ``(nrows, x_data, species_dic, bottom_data)`` where

        * ``nrows`` is the row count reported by the reader,
        * ``x_data`` is the header row from column 2 onwards,
        * ``species_dic[genus][n]`` is a dict with ``'rate'`` (row values
          multiplied element-wise by the genus size) and ``'name'``; it is
          left empty when the genus has no entry in the genus-size table,
        * ``bottom_data[genus][n]`` is the element-wise sum of the rates of
          species ``1 .. n`` of that genus; level ``0`` is a list of zeros.
    """
    csv_list, nrows, ncols = csvReader.csv_reader(sfilename)
    genus_size = get_genusSize(gfilename)
    x_data = csv_list[0][2:]

    species_dic = {}
    bottom_data = {}
    for row_num in range(1, nrows):
        row = csv_list[row_num]
        current_genus = row[0]
        current_species = row[1]

        if current_genus not in species_dic:
            species_dic[current_genus] = {}
            bottom_data[current_genus] = {0: [0] * len(x_data)}

        genus_species = species_dic[current_genus]
        # Per-genus index: one more than the species already stored for it.
        n = len(genus_species) + 1
        entry = {}
        genus_species[n] = entry

        sizes = genus_size.get(current_genus)
        if sizes is None:
            # Unknown genus: keep the empty placeholder, nothing to stack.
            continue

        # The size of a species bar is an absolute value, not a relative one.
        entry['rate'] = [share * size for share, size in zip(row[2:], sizes)]
        entry['name'] = current_species

        stack = bottom_data[current_genus]
        stack[n] = [base + height
                    for base, height in zip(stack[n - 1], entry['rate'])]

    return nrows, x_data, species_dic, bottom_data
def run(filename, group_filename, checked):
    """Plot the genera that differ significantly between two chosen groups.

    The table in ``filename`` is read with ``csvReader.csv_reader`` (row 0 is
    the header with sample names from column 2 onwards) and the sample-to-
    group mapping with ``load_groupData``.  ``checked[0]`` and ``checked[1]``
    name the two groups to compare; they are looked up in the group-name
    list returned by ``load_groupData``, which stores names lower-cased.

    For every data row an independent two-sample t-test (``sci.ttest_ind``
    with ``equal_var=True``) compares the values of the two groups.  Rows
    with a p-value below 0.05 are kept, drawn through ``bd.draw_graph`` and,
    if there are any, handed to ``biclustering``; otherwise ``no data`` is
    printed.  The function finishes with ``plt.show()``.

    Side effects:
        Rebinds the module-level lists ``group1`` and ``group2`` to the
        sample names of the two selected groups.

    Raises:
        ValueError: if a name in ``checked`` is not a known group, or if a
            selected sample is missing from the table header.
    """
    global group1
    global group2

    csv_list, nrows, ncols = csvReader.csv_reader(filename)
    x_data = csv_list[0][2:]
    group, group_names = load_groupData(group_filename)

    group1 = []
    group2 = []
    # Positions of the selected samples inside x_data.
    group1_ind = []
    group2_ind = []

    # Loop-invariant, so resolved once instead of on every iteration.
    group1_id = group_names.index(checked[0])
    group2_id = group_names.index(checked[1])

    # items() walks the mapping in the same order as keys()/values() and,
    # unlike indexing those, also works on Python 3 dict views.
    for sample, group_id in group.items():
        if group_id == group1_id:
            group1.append(sample)
            group1_ind.append(x_data.index(sample))
        elif group_id == group2_id:
            group2.append(sample)
            group2_ind.append(x_data.index(sample))

    filtered = {'data': [], 'genus': [], 'pvalue': []}

    # Keep only the rows whose p-value is low.
    for row_num in range(1, nrows):
        row = csv_list[row_num]
        current_genus = row[0]
        current_type = row[1]
        # x_data starts at column 2, hence the offset.
        test_group1 = [row[2 + i] for i in group1_ind]
        test_group2 = [row[2 + i] for i in group2_ind]

        pvalue = sci.ttest_ind(test_group1, test_group2, equal_var=True)[1]
        if pvalue < 0.05:
            filtered['data'].append(test_group1 + test_group2)
            # Label: genus name plus the first character of its type.
            filtered['genus'].append(
                current_genus + "(" + str(current_type)[0] + ")")
            filtered['pvalue'].append(round(pvalue, 4))

    n_samples = len(group1) + len(group2)
    d3 = bd.draw_graph(group1, group2, checked,
                       x_label=list(range(n_samples)),
                       y_label=list(range(len(filtered['data']))),
                       pvalue_label=filtered['pvalue'],
                       fit_data=filtered['data'],
                       genus_data=filtered['genus'],
                       title="Original dataset",
                       x=list(range(n_samples)))
    d3.draw()

    if filtered['data']:
        biclustering(filtered, checked)
    else:
        # Parenthesised form prints the same text on Python 2 and Python 3.
        print("no data")
    plt.show()
def run(genus_filename, group_filename):
    """Show an interactive two-component PCA scatter plot of the samples.

    The samples are loaded with ``load_bacterium`` and projected onto two
    principal components.  Each group is drawn in its own colour.  Three
    widgets are added below the plot:

    * a ``loading`` slider -- moving it redraws the loading arrows and genus
      labels for every component whose first or second loading exceeds the
      slider value (no arrows are drawn until the slider is moved),
    * a ``show`` button -- labels every point with its sample name,
    * a ``hide`` button -- removes the sample labels and the loading arrows.

    The function finishes with ``plt.show()``.

    Side effects:
        Rebinds the module-level lists ``ann``, ``txt`` and ``arr``, which
        hold the annotation, loading-label and loading-arrow artists.
    """
    global ann, txt, arr

    # The raw table is only needed for the genus names used as loading labels.
    csv_list, nrows, ncols = csvReader.csv_reader(genus_filename)

    bacterium = load_bacterium(genus_filename, group_filename)
    X = bacterium['data']
    y = bacterium['group_id']
    group_names = bacterium['group_names']

    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X)

    # NOTE(review): zip() stops at the shorter input, so groups beyond the
    # fifth are not plotted.
    colors = ['navy', 'turquoise', 'darkorange', 'green', 'yellow']

    f, ax1 = plt.subplots(1, figsize=(6, 6))
    plt.subplots_adjust(bottom=0.2)  # leave room for the widgets

    # Draw one scatter series per group.
    for i, (color, group_name) in enumerate(zip(colors, group_names)):
        members = (y == i)
        plt.scatter(X_pca[members, 0], X_pca[members, 1],
                    color=color, lw=2, label=group_name)
    plt.title("PCA" + " of bacterium dataset")
    plt.legend(loc="best", shadow=False, scatterpoints=1)
    plt.axis([-1, 1, -0.5, 0.5])

    ann = []
    txt = []
    arr = []

    def annotation():
        # Label every projected point with its sample name.
        for label, px, py in zip(bacterium['sample'],
                                 X_pca[:, 0], X_pca[:, 1]):
            ann.append(
                ax1.annotate(label, xy=(px, py), xytext=(-2, 2),
                             textcoords='offset points', ha='right',
                             va='bottom', fontsize=6, color='gray'))

    def loading(load):
        # Draw an arrow and a genus label for every selected loading.
        # NOTE(review): the comparison is signed, so strongly negative
        # loadings are never selected -- confirm this is intended.
        first, second = pca.components_[0], pca.components_[1]
        for i in range(len(first)):
            lx, ly = first[i], second[i]
            if lx > load or ly > load:
                arr.append(
                    ax1.arrow(0, 0, lx * 0.5, ly * 0.5, color='coral',
                              width=0.001, head_width=0.01))
                # Row i + 1 of the table: row 0 is the header.
                txt.append(
                    ax1.text(lx * 0.25, ly * 0.25, csv_list[i + 1][0],
                             color='coral', ha='center', va='center',
                             fontsize=7))
        f.canvas.draw()

    def clear_loadings():
        # Remove every loading artist and empty both lists in place, so the
        # module-level names keep pointing at the same list objects.
        for artist in txt + arr:
            artist.remove()
        txt[:] = []
        arr[:] = []

    # Matplotlib deprecated the ``axisbg`` keyword in 2.0 and later removed
    # it; ``facecolor`` is its replacement (requires Matplotlib >= 2.0).
    axloading = plt.axes([0.20, 0.10, 0.62, 0.03],
                         facecolor='lightgoldenrodyellow')
    sloading = Slider(axloading, 'loading', 0.05, 0.5, valinit=0.3,
                      color='coral')

    def update(val):
        clear_loadings()
        loading(sloading.val)
        f.canvas.draw_idle()

    sloading.on_changed(update)

    showax = plt.axes([0.7, 0.025, 0.1, 0.04])
    hideax = plt.axes([0.8, 0.025, 0.1, 0.04])
    bshow = Button(showax, 'show', color='lightgoldenrodyellow',
                   hovercolor='0.975')
    bhide = Button(hideax, 'hide', color='lightgoldenrodyellow',
                   hovercolor='0.975')

    def show(event):
        # Do not stack a second copy of the labels on repeated clicks.
        if not ann:
            annotation()
        f.canvas.draw()

    def hide(event):
        clear_loadings()
        for artist in ann:
            artist.remove()
        ann[:] = []
        f.canvas.draw()

    bshow.on_clicked(show)
    bhide.on_clicked(hide)
    plt.show()