def parse_data(data_descriptors, groups, factors=None, auto_labels=False, sep='\t'):
    """Read grouped data columns, optionally split by a factor column.

    ``data_descriptors`` and ``factors`` can be given in two formats:

    1) ``fn1?colA?colB?colC``    -> all the columns of the file are read at once
    2) ``fn1?colA fn2?colB ...`` -> the files are read column by column

    Parameters
    ----------
    data_descriptors : list of str
        ``filename?column`` descriptors (or a single multi-column descriptor).
    groups : iterable of int
        Sizes of consecutive groups of data descriptors; they must add up to
        the number of (expanded) data descriptors.
    factors : list of str, optional
        Factor descriptors in the same format. A single factor column is
        shared by all the groups. With factors, every group must hold exactly
        one data descriptor.
    auto_labels : bool
        When true, labels are obtained from ``read_labels`` instead of the
        descriptor text.
    sep : str
        Column separator forwarded to ``plot_utils.read_table``.

    Returns
    -------
    (data, labels) : tuple
        ``data`` is a list (one entry per group) of lists of value lists;
        ``labels`` is a flat list with one label per value list.

    Raises
    ------
    ValueError
        If the groups do not cover the descriptors, if a group has more than
        one descriptor while factors are used, or if a factor column and its
        data column differ in length.
    """
    df_data = None
    df_factors = None

    # Format 1 for the data: a single descriptor naming several columns.
    data_pre_parsed = (len(data_descriptors) == 1
                       and data_descriptors[0].count('?') > 1)
    if data_pre_parsed:
        descriptor_parts = data_descriptors[0].split('?')
        # A real list (not a lazy map object) so it behaves the same on
        # Python 2 and Python 3.
        df_data = plot_utils.read_table(
            descriptor_parts[0],
            usecols=[int(column) for column in descriptor_parts[1:]],
            sep=sep)
        data_descriptors = [descriptor_parts[0] + '?' + column
                            for column in descriptor_parts[1:]]

    # The special case of factors[0].count('?') == 1 is pre-processed as well,
    # because a single factors column may be shared by multiple data items.
    factors_pre_parsed = factors is not None and len(factors) == 1
    if factors_pre_parsed:
        factor_parts = factors[0].split('?')
        df_factors = plot_utils.read_table(
            factor_parts[0],
            usecols=[int(column) for column in factor_parts[1:]],
            sep=sep)
        factors = [factor_parts[0] + '?' + column
                   for column in factor_parts[1:]]

    # Cut the flat descriptor list into consecutive groups.
    grouped_data_descriptors, consumed = [], 0
    for group_size in groups:
        grouped_data_descriptors.append(
            data_descriptors[consumed:consumed + group_size])
        consumed += group_size
    if consumed != len(data_descriptors):
        raise ValueError(
            'the group sizes add up to %d but there are %d data descriptors'
            % (consumed, len(data_descriptors)))
    if factors is not None and any(
            len(group) != 1 for group in grouped_data_descriptors):
        raise ValueError(
            'when factors are used, every group must contain exactly one '
            'data descriptor')

    data, labels = [], []
    for group_index, data_descriptors_group in enumerate(grouped_data_descriptors):
        data_group = []
        for data_descriptor in data_descriptors_group:
            data_filename, data_column = data_descriptor.split('?')
            data_column = int(data_column)
            if not data_pre_parsed:
                df_data = plot_utils.read_table(
                    data_filename, usecols=[data_column], sep=sep)

            if auto_labels:
                base_label = read_labels([data_descriptor], sep=sep)[0]
            else:
                base_label = data_descriptor

            if factors is None:
                data_group.append(df_data[data_column].tolist())
                labels.append(base_label)
                continue

            # A single factor descriptor is shared by every group.
            factor_descriptor = factors[group_index if len(factors) != 1 else 0]
            factors_filename, factors_column = factor_descriptor.split('?')
            factors_column = int(factors_column)
            if not factors_pre_parsed:
                df_factors = plot_utils.read_table(
                    factors_filename, usecols=[factors_column], sep=sep)
            if len(df_factors[factors_column]) != len(df_data[data_column]):
                raise ValueError(
                    'factor column %s and data column %s differ in length'
                    % (factor_descriptor, data_descriptor))

            factorised = factorise(list(zip(df_factors[factors_column].tolist(),
                                            df_data[data_column].tolist())))
            for factor in sorted(factorised.keys(), reverse=True):
                # Keep only the data value of each (factor, value) pair.
                data_group.append([pair[1] for pair in factorised[factor]])
                labels.append(base_label + ' (' + str(factor) + ')')
        data.append(data_group)
    return data, labels
def parse_data(data_descriptors, auto_labels=False, sep='\t'):
    """Read one x column and one y column and build their axis labels.

    Parameters
    ----------
    data_descriptors : list of str
        Either one descriptor ``filename?x_col?y_col`` or two descriptors
        ``filename_x?x_col`` and ``filename_y?y_col``.
    auto_labels : bool
        When true, the labels are obtained from ``read_labels``; otherwise
        they are ``<file name>?<column>``.
    sep : str
        Column separator forwarded to ``plot_utils.read_table`` and
        ``read_labels``.

    Returns
    -------
    (x, y, x_label, y_label) : tuple
        The two columns as lists and their labels.

    Raises
    ------
    PlotsError
        If ``data_descriptors`` does not hold one or two descriptors.
    ValueError
        If a descriptor does not have the expected number of ``?`` parts or a
        column is not an integer.
    """
    import os

    if len(data_descriptors) == 1:
        data_filename_x, x_col, y_col = data_descriptors[0].split('?')
        data_filename_y = data_filename_x
    elif len(data_descriptors) == 2:
        data_filename_x, x_col = data_descriptors[0].split('?')
        data_filename_y, y_col = data_descriptors[1].split('?')
    else:
        raise PlotsError(message='unexpected format: ' + str(data_descriptors))
    x_col, y_col = int(x_col), int(y_col)

    if data_filename_x == data_filename_y:
        # Same file: read both columns in a single pass.
        df = plot_utils.read_table(data_filename_x, usecols=[x_col, y_col], sep=sep)
        x = df[x_col].tolist()
        y = df[y_col].tolist()
    else:
        x = plot_utils.read_table(data_filename_x, usecols=[x_col], sep=sep)[x_col].tolist()
        y = plot_utils.read_table(data_filename_y, usecols=[y_col], sep=sep)[y_col].tolist()

    if auto_labels:
        x_label = read_labels(data_filename_x, x_col, sep=sep)
        y_label = read_labels(data_filename_y, y_col, sep=sep)
    else:
        # The default label drops the directory part of the path. The previous
        # code searched for the column separator ``sep`` instead of the path
        # separator, so the directory was never actually removed.
        x_label = os.path.basename(data_filename_x) + '?' + str(x_col)
        y_label = os.path.basename(data_filename_y) + '?' + str(y_col)
    return x, y, x_label, y_label
def parse_data(data_descriptors, sep='\t'):
    """Read curve data as a list of ``(x, y)`` column pairs.

    Each descriptor has the form ``filename?col1?col2?...``. The columns of
    every file are handed to ``make_curve_tuples``. An x column carried over
    from the previous descriptor is paired with the first column of the
    current one.

    Parameters
    ----------
    data_descriptors : iterable of str
        ``filename?col?col...`` descriptors.
    sep : str
        Column separator forwarded to ``plot_utils.read_table``.

    Returns
    -------
    list
        ``(x, y)`` tuples, in the order the columns were given.

    Raises
    ------
    PlotsError
        If an x column is still pending after the last descriptor, i.e. an
        odd number of columns was given.
    """
    data, pending_x = [], None
    for data_descriptor in data_descriptors:
        dd_split = data_descriptor.split('?')
        data_filename = dd_split[0]
        # A real list: it is indexed and sliced below, which a Python 3 map
        # object does not support.
        columns = [int(column) for column in dd_split[1:]]
        df = plot_utils.read_table(data_filename, usecols=columns, sep=sep)

        if pending_x is not None:
            # Complete the pair left open by the previous descriptor.
            data.append((pending_x, df[columns[0]].tolist()))
            columns = columns[1:]

        # NOTE(review): make_curve_tuples presumably returns the paired-up
        # columns plus a leftover x (None when nothing is left over);
        # confirm against its definition.
        subdata, pending_x, _ = make_curve_tuples(df, columns)
        data.extend(subdata)

    if pending_x is not None:
        # The original built this exception but never raised it.
        raise PlotsError(message='odd number of columns to plot')
    return data
def _plot_distro_load_data(data, data_descriptor, grid_columns, x_tics, count_plot, sep):
    """Validate the plot_distro input and return it as an x/y/hue dataframe."""
    # PARSED DATA IS SUPPLIED AS pd.DataFrame({'x':[] , 'y':[] , 'hue':[] }) OR ([...], [...], [...])
    if data is not None:
        if data_descriptor is not None:
            raise PlotsError(
                message=
                'You can specify only one of the mutually exclusive arguments: "data" or "data_descriptor"'
            )
        if grid_columns is not None:
            raise PlotsError(
                message=
                'The grid_columns option is only used when the data is supplied as a filename'
            )
        if isinstance(data, pd.DataFrame):
            # pd.DataFrame({'x':[] , 'y':[] , 'hue':[] })
            # A valid dataframe is used as is. It must not fall through to the
            # tuple handling below, which is what the original code did.
            if ('x' if count_plot else 'y') not in data:
                raise PlotsError(
                    message=
                    'The dataframe has to have a "y" column, optionally also "x" and "hue" columns'
                    if not count_plot else
                    'The dataframe has to have a "x" column, optionally also a "hue" column'
                )
        else:
            # ([...], [...], [...])
            if len(data) != 3 or (data[0] if count_plot else data[1]) is None:
                raise PlotsError(
                    message=
                    'The data should be a pandas.DataFrame or a 3-tuple of lists (x, y, hue). '
                    +
                    ('The y list cannot be None, x and hue are optional (can be None).'
                     if not count_plot else
                     'The x list cannot be None, hue is optional (can be None), y is ignored (set it None).'
                     ))
            x, y, hue = data
            data = make_xyhue_dataframe(x, y, hue)

    # DATA IS SUPPLIED AS A FILENAME - THE FILE IS A THREE_COLUMN_FILE OR A GRID_LIKE_FILE
    else:
        if data_descriptor is None:
            raise PlotsError(
                message='You must specify "data" or "data_descriptor"')

        # GRID_LIKE_FILE
        if '?' not in data_descriptor:
            if grid_columns is None:
                raise PlotsError(
                    message=
                    'You must specify columns for y, or x and y, or x, y and hue, or specify grid-columns: filename?y or filename?x?y or filename?x?y?hue or filename --grid_columns x1 x2 x3'
                )
            if x_tics is None:
                raise PlotsError(
                    message=
                    'When specifying --grid_columns, you have to specify also the labels for xtics (--xtics)'
                )
            if len(x_tics) != len(grid_columns):
                raise PlotsError(
                    message=
                    'The number of columns (--grid_columns) differ from the number of xtics (--xtics)'
                )
            x, y, hue = read_grid_as_xyhue(data_descriptor,
                                           grid_columns,
                                           x_tics,
                                           sep=sep,
                                           comment='#')
            data = make_xyhue_dataframe(x, y, hue)
            # The xtics have already been used as the x categories; clearing
            # them prevents the single-category assignment below.
            x_tics = None

        # THREE_COLUMN_FILE
        else:
            if grid_columns is not None:
                raise PlotsError(
                    message=
                    'When specifying columns using ?, you cannot specify --grid_columns.'
                )
            data_descriptor_split = data_descriptor.split('?')
            filename = data_descriptor_split[0]
            given_columns = data_descriptor_split[1:]
            if len(given_columns) == 1:
                columns_names = {('x' if count_plot else 'y'): int(given_columns[0])}
            elif len(given_columns) == 2:
                columns_names = {
                    'x': int(given_columns[0]),
                    ('hue' if count_plot else 'y'): int(given_columns[1])
                }
            elif len(given_columns) == 3:
                if count_plot:
                    raise PlotsError(
                        message='For count plot, you can only specify x and hue'
                    )
                columns_names = {
                    'x': int(given_columns[0]),
                    'y': int(given_columns[1]),
                    'hue': int(given_columns[2])
                }
            else:
                raise PlotsError(
                    message=
                    'You can specify only up to 3 columns for x, y and hue: filename or filename?y or filename?x?y or filename?x?y?hue'
                )
            # The names are passed in the order of their column numbers.
            names, columns = zip(
                *sorted(columns_names.items(), key=lambda item: item[1]))
            data = plot_utils.read_table(filename,
                                         usecols=columns,
                                         names=names,
                                         sep=sep)

    # A single xtic names the one x category of data that has no x column.
    if x_tics is not None:
        if 'x' in data:
            raise PlotsError(
                message=
                'You specified the x-categories in your data, thus you cannot use the xtics option'
            )
        elif len(x_tics) != 1:
            raise PlotsError(
                message=
                'You can specify only one x-category using xtics (unless you use a grid-like file input)'
            )
        else:
            data['x'] = np.array([x_tics[0]] * len(data['y']))
    return data


def plot_distro(data, data_descriptor=None, ax=None, grid_columns=None, x_tics=None, y_tics=None, linewidth=2,
                box_plot=False, violin_plot=False, bar_plot=False, point_plot=False, count_plot=False, swarm_plot=False, strip_plot=False,
                y_range=None, x_log=False, y_log=False, sep=None, horizontal=False, order=None, hue_order=None,
                box_outlier=5, box_whis=1.5, box_notch=False,
                violin_scale='area', violin_inner='box', violin_cut=2, violin_split=False, violin_scale_hue=False,
                estimator=plot_utils.ESTIMATORS['mean'], ci=95, capsize=0.2,
                strip_jitter=True, points_colour=None,
                point_markers='o', point_marker_size=2,
                title=None, x_label=None, y_label=None, rotate_x_tics=None, bold_x_tics=False,
                hide_x_tick_marks=False, hide_y_tick_marks=False,
                hide_x_ticks=False, hide_y_ticks=False,
                label=None, show_legend=True, legend_out=False, legend_out_pad=None,
                despine=True, style='whitegrid_ticks', fontsize=16, colours=None, palette=None, reverse_palette=False, ncolours=None, figsize=None, fig_padding=0.1, dpi=None, output=None, out_format=None,
                box_kwargs=None, violin_kwargs=None, bar_kwargs=None, point_kwargs=None, count_kwargs=None, swarm_kwargs=None, strip_kwargs=None, legend_kwargs=None, kwargs=None, color=None):
    '''
    Draw one or more seaborn categorical plots (box, violin, bar, point,
    count, swarm, strip) of the same data, then save or show the figure.

    Parameters
    ----------
    data : pandas.DataFrame with indexes 'x', 'y', 'hue'
        or
    data : 3-tuple of lists x, y, and hue ([...], [...], [...]), thus for a single plot data=(None, [...], None)
        Mutually exclusive with data_descriptor. When a dataframe without an
        'x' column is combined with a single x_tics entry, the 'x' column is
        added to that dataframe in place.
    data_descriptor : str
        'filename?y', 'filename?x?y' or 'filename?x?y?hue' (for a count plot
        'filename?x' or 'filename?x?hue'), or a plain filename of a grid-like
        file together with grid_columns and x_tics.
    ax
        Passed to the seaborn plotting functions as 'ax'.
    grid_columns, x_tics
        Columns of a grid-like file and their x categories (same length).
        With other inputs a single x_tics entry names the only x category.
    box_plot, violin_plot, bar_plot, point_plot, count_plot, swarm_plot, strip_plot : bool
        The plots to draw; at least one is required. Count and point plots
        cannot be combined with any other plot.
    y_range : (min, max)
        Y axis limits; min must be smaller than max.
    order, hue_order
        Must contain exactly the distinct values of 'x' / 'hue'.
    ci
        None, 'std', a callable, or a number within 0 and 100.
    colours, palette, ncolours
        colours excludes both palette and ncolours.
    label : (text, x, y)
        Text drawn at (x, y) in axes coordinates.
    legend_out, legend_kwargs
        Only allowed when show_legend is true.
    output, out_format
        When output is given the figure is saved there, otherwise it is shown.
    box_kwargs, ..., strip_kwargs, kwargs
        Extra keyword arguments merged (via plot_utils.merged_kwargs) into the
        call of the respective seaborn function; kwargs applies to all.
    color
        Unused; kept for backward compatibility of the signature.

    The remaining parameters are forwarded to the corresponding seaborn /
    matplotlib calls in the body.

    Raises
    ------
    PlotsError
        For any invalid combination of arguments (see the messages below).
    '''
    requested_plots = [
        box_plot, violin_plot, bar_plot, point_plot, count_plot, swarm_plot,
        strip_plot
    ]
    num_plots = sum(bool(requested) for requested in requested_plots)
    if num_plots == 0:
        raise PlotsError(
            message=
            'Specify a plot to plot: box or violin or bar or count or swarm or strip'
        )
    if count_plot and num_plots > 1:
        raise PlotsError(
            message='Count plot cannot be combined with any other plot')
    if point_plot and num_plots > 1:
        raise PlotsError(
            message='Point plot cannot be combined with any other plot')

    data = _plot_distro_load_data(data, data_descriptor, grid_columns, x_tics,
                                  count_plot, sep)

    if not show_legend and (legend_out or legend_kwargs is not None):
        raise PlotsError(
            message=
            'If you hide the legend (--hide_legend or show_legend=False), you cannot set it outside (legend_out) or set it properties (legend_kwargs)'
        )

    if y_range is not None and (len(y_range) != 2
                                or y_range[0] >= y_range[1]):
        raise PlotsError(
            message=
            'You need to provide exactly two numbers to set yrange: "min max"')

    for my_order, variable in ((order, 'x'), (hue_order, 'hue')):
        if my_order is not None:
            if variable not in data:
                raise PlotsError(
                    message=
                    'You specified order for %s but your data does not contain %s'
                    % (variable, variable))
            set_variable = set(data[variable])
            if len(my_order) != len(set_variable) or set(
                    my_order) != set_variable:
                raise PlotsError(
                    message='The specified order does not match %s' % variable)

    if ci is not None and ci != 'std' and not callable(ci) and (ci < 0
                                                                or ci > 100):
        raise PlotsError(message='"ci" must be None or within 0 and 100')

    if colours is not None:
        if palette is not None:
            raise PlotsError(
                message=
                'You can specify only one of the mutually exclusive arguments: "colours" or "palette"'
            )
        if ncolours is not None:
            raise PlotsError(
                message=
                'You cannot specify "ncolours" when you specified "colours"')

    if figsize is not None and len(figsize) != 2:
        raise PlotsError(
            message=
            'You need to provide exactly two numbers to set figure size: "width height"'
        )

    ##
    font = plot_utils.init_plot_style(style,
                                      fontsize,
                                      colours,
                                      palette,
                                      reverse_palette,
                                      ncolours,
                                      hide_x_tick_marks=hide_x_tick_marks,
                                      hide_y_tick_marks=hide_y_tick_marks)
    fig = plt.figure()
    if figsize is not None:
        fig.set_figwidth(figsize[0])
        fig.set_figheight(figsize[1])
    if dpi is not None:
        fig.set_dpi(dpi)

    # Arguments shared by every seaborn call below.
    default_kwargs = {
        'ax': ax,
        'x': 'x' if 'x' in data else None,
        'y': 'y' if 'y' in data else None,
        'hue': 'hue' if 'hue' in data else None,
        'data': data,
        'orient': 'h' if horizontal else 'v',
        'order': order,
        'hue_order': hue_order,
        'linewidth': linewidth
    }

    if box_plot:
        axs = sb.boxplot(**plot_utils.merged_kwargs(
            default_kwargs,
            dict(fliersize=box_outlier,
                 whis=box_whis,
                 notch=box_notch,
                 flierprops={'marker': 'o'}), kwargs, box_kwargs))
    if violin_plot:
        axs = sb.violinplot(**plot_utils.merged_kwargs(
            default_kwargs,
            dict(scale=violin_scale,
                 inner=violin_inner,
                 cut=violin_cut,
                 split=violin_split,
                 scale_hue=violin_scale_hue), kwargs, violin_kwargs))
    if bar_plot:
        axs = sb.barplot(**plot_utils.merged_kwargs(
            default_kwargs, dict(estimator=estimator, ci=ci, capsize=capsize),
            kwargs, bar_kwargs))
    if point_plot:
        axs = sb.pointplot(**plot_utils.merged_kwargs(
            default_kwargs,
            dict(markers=point_markers,
                 estimator=estimator,
                 ci=ci,
                 capsize=capsize), kwargs, point_kwargs))
    if count_plot:
        axs = sb.countplot(
            **plot_utils.merged_kwargs(default_kwargs, kwargs, count_kwargs))
    if swarm_plot:
        axs = sb.swarmplot(**plot_utils.merged_kwargs(
            default_kwargs, dict(edgecolor='black', linewidth=1),
            dict(facecolor=points_colour) if points_colour is not None else {},
            kwargs, swarm_kwargs))
    if strip_plot:
        axs = sb.stripplot(**plot_utils.merged_kwargs(
            default_kwargs,
            dict(edgecolor='black',
                 linewidth=1,
                 jitter=strip_jitter,
                 split=True),
            dict(facecolor=points_colour) if points_colour is not None else {},
            kwargs, strip_kwargs))

    if point_plot:
        plt.setp(axs.collections, sizes=[point_marker_size])
        # NOTE(review): the original indentation was lost; the line-width
        # tweak is assumed to belong to the point plot only.
        plt.setp(axs.lines, linewidth=linewidth)

    if y_range is not None:
        axs.set_ylim(y_range[0], y_range[1])
        # NOTE(review): the original indentation was lost; the fixed
        # two-decimal formatter is assumed to apply only with an explicit
        # y_range.
        axs.yaxis.set_major_formatter(ticker.FormatStrFormatter('%.2f'))
    if y_tics is not None:
        axs.set_yticks(y_tics)

    if x_log:
        axs.set_xscale('log')
    if y_log:
        axs.set_yscale('log')

    if x_label is not None:
        axs.set_xlabel(x_label, labelpad=8, fontproperties=font.get('b'))
    else:
        axs.set_xlabel('', labelpad=8)
    if y_label is not None:
        axs.set_ylabel(y_label, labelpad=10, fontproperties=font.get('b'))
    else:
        axs.set_ylabel('', labelpad=10)

    if title is not None:
        ttl = axs.set_title(title, fontproperties=font.get('b'))
        ttl.set_position([.5, 1.05])

    if label is not None:
        axs.text(label[1],
                 label[2],
                 label[0],
                 horizontalalignment='left',
                 verticalalignment='top',
                 transform=axs.transAxes,
                 fontproperties=font.get('b'))

    plt.setp(axs.get_xticklabels(), rotation=rotate_x_tics)
    plot_utils.set_fontproperties(font.get('b' if bold_x_tics else 'n'),
                                  axs.get_xticklabels())
    plot_utils.set_fontproperties(font.get('n'), axs.get_yticklabels())

    if hide_x_ticks:
        axs.xaxis.set_major_locator(ticker.NullLocator())
    if hide_y_ticks:
        axs.yaxis.set_major_locator(ticker.NullLocator())

    if despine or style in ['despine_ticks', 'whitegrid_ticks']:
        sb.despine(top=True, right=True)

    # Legend is a little tricky if the option is to be outside of the plot
    artists = []
    if show_legend:
        legend_handles, legend_labels = axs.get_legend_handles_labels()
        if num_plots != 1:
            # Keep only the first plot's share of the entries. Floor division
            # keeps the slice bounds integral on Python 3 as well.
            legend_handles = legend_handles[:len(legend_handles) // num_plots]
            legend_labels = legend_labels[:len(legend_labels) // num_plots]
        legend_out_kwargs = dict(
            bbox_to_anchor=(1, 1), loc=2,
            borderpad=legend_out_pad) if legend_out else None
        legend_handles, legend_labels = legend_handles[::-1], legend_labels[::-1]
        legend = axs.legend(
            legend_handles, legend_labels,
            **plot_utils.merged_kwargs(dict(prop=font.get('n')),
                                       legend_out_kwargs, legend_kwargs))
        artists.append(legend)

    plt.tight_layout()

    if output is not None:
        plt.savefig(output,
                    format=out_format,
                    dpi=dpi,
                    additional_artists=artists,
                    bbox_inches='tight',
                    pad_inches=fig_padding)
    else:
        plt.show()
    plt.close()