Exemple #1
0
def parse_data(data_descriptors, groups, factors=None, auto_labels=False, sep='\t'):
    """Read data columns from tables and arrange them into groups.

    ``data_descriptors`` and ``factors`` can be given in two formats:

    1) ``['fn?colA?colB?colC']`` -- a single file; all listed columns are read
       from it in one go;
    2) ``['fn1?colA', 'fn2?colB', ...]`` -- every file/column pair is read
       separately, as it is needed.

    Parameters
    ----------
    data_descriptors : list of str
        Descriptors of the data columns (column parts must be integers).
    groups : iterable of int
        Sizes of consecutive groups of data columns; they have to sum up to the
        number of data columns.
    factors : list of str, optional
        Descriptors of the factor columns. When given, every group must contain
        exactly one data column, whose values are split by the factor values.
        A single factor column is shared by all groups; otherwise the factor
        column is picked by the group index.
    auto_labels : bool
        When true, the base label of a column is obtained from ``read_labels``
        instead of being the descriptor itself.
    sep : str
        Passed through to ``plot_utils.read_table`` and ``read_labels``.

    Returns
    -------
    (data, labels) : (list of list of list, list of str)
        ``data`` holds one list of value lists per group; ``labels`` is a flat
        list with one label per value list (``'<label> (<factor>)'`` when
        factors are used).

    Raises
    ------
    ValueError
        If the group sizes do not match the number of data columns, if a group
        has more than one column while factors are used, if a column index is
        not an integer, or if a factor column and its data column differ in
        length.
    """
    if len(data_descriptors) == 1 and data_descriptors[0].count('?') > 1:
        data_pre_parsed = True
        descriptor_parts = data_descriptors[0].split('?')
        # A real list (not a lazy ``map`` object) so that it is a valid ``usecols`` on Python 3 too.
        column_indices = [int(column) for column in descriptor_parts[1:]]
        df_data = plot_utils.read_table(descriptor_parts[0], usecols=column_indices, sep=sep)
        data_descriptors = [descriptor_parts[0] + '?' + column for column in descriptor_parts[1:]]
    else:
        data_pre_parsed = False

    factors_pre_parsed = False
    if factors is not None and len(factors) == 1:
        # The special case of factors[0].count('?') == 1 is pre-processed as well,
        # because a single factor column may be shared by multiple data items.
        factors_pre_parsed = True
        factor_parts = factors[0].split('?')
        factor_indices = [int(column) for column in factor_parts[1:]]
        df_factors = plot_utils.read_table(factor_parts[0], usecols=factor_indices, sep=sep)
        factors = [factor_parts[0] + '?' + column for column in factor_parts[1:]]

    grouped_data_descriptors, start = [], 0
    for group_size in groups:
        grouped_data_descriptors.append(data_descriptors[start:start + group_size])
        start += group_size
    if start != len(data_descriptors):
        raise ValueError('The group sizes sum up to %d, but %d data columns were specified'
                         % (start, len(data_descriptors)))

    data, labels = [], []
    for i, data_descriptors_group in enumerate(grouped_data_descriptors):
        if factors is not None and len(data_descriptors_group) != 1:
            raise ValueError('When factors are used, every group must contain exactly one data column (group %d has %d)'
                             % (i, len(data_descriptors_group)))
        data_group = []
        for data_descriptor in data_descriptors_group:
            data_filename, data_column = data_descriptor.split('?')
            data_column = int(data_column)
            if not data_pre_parsed:
                df_data = plot_utils.read_table(data_filename, usecols=[data_column], sep=sep)
            if factors is None:
                data_group.append(df_data[data_column].tolist())
                labels.append(read_labels([data_descriptor], sep=sep)[0] if auto_labels else data_descriptor)
                continue

            factor_descriptor = factors[0] if len(factors) == 1 else factors[i]
            factors_filename, factors_column = factor_descriptor.split('?')
            factors_column = int(factors_column)
            if not factors_pre_parsed:
                df_factors = plot_utils.read_table(factors_filename, usecols=[factors_column], sep=sep)
            if len(df_factors[factors_column]) != len(df_data[data_column]):
                raise ValueError('The factor column %s and the data column %s differ in length'
                                 % (factor_descriptor, data_descriptor))
            # Materialise the pairs so that ``factorise`` gets a list on Python 3 as well.
            factorised = factorise(list(zip(df_factors[factors_column].tolist(), df_data[data_column].tolist())))
            # The base label is the same for all factors, so it is resolved only once.
            base_label = read_labels([data_descriptor], sep=sep)[0] if auto_labels else data_descriptor
            for factor in sorted(factorised.keys(), reverse=True):
                # Each factorised item is a (factor, value) pair; keep only the value.
                data_group.append([pair[1] for pair in factorised[factor]])
                labels.append(base_label + ' (' + str(factor) + ')')
        data.append(data_group)

    return data, labels
Exemple #2
0
def parse_data(data_descriptors, auto_labels=False, sep='\t'):
    """Read one x column and one y column described by ``data_descriptors``.

    ``data_descriptors`` is either ``['fn?xcol?ycol']`` (both columns come from
    the same file) or ``['fn1?xcol', 'fn2?ycol']``. Column parts must be
    integers. Any other number of descriptors raises ``PlotsError``.

    Returns ``(x, y, x_label, y_label)`` where ``x`` and ``y`` are lists of the
    column values. The labels come from ``read_labels`` when ``auto_labels`` is
    true, otherwise they are built as ``'<filename>?<column>'``.
    """
    descriptor_count = len(data_descriptors)
    if descriptor_count == 1:
        data_filename_x, x_col, y_col = data_descriptors[0].split('?')
        data_filename_y = data_filename_x
    elif descriptor_count == 2:
        data_filename_x, x_col = data_descriptors[0].split('?')
        data_filename_y, y_col = data_descriptors[1].split('?')
    else:
        raise PlotsError(message='unexpected format: ' + str(data_descriptors))
    x_col = int(x_col)
    y_col = int(y_col)

    if data_filename_x != data_filename_y:
        x_table = plot_utils.read_table(data_filename_x, usecols=[x_col], sep=sep)
        x = x_table[x_col].tolist()
        y_table = plot_utils.read_table(data_filename_y, usecols=[y_col], sep=sep)
        y = y_table[y_col].tolist()
    else:
        # Same file: read both columns with a single pass over the table.
        table = plot_utils.read_table(data_filename_x, usecols=[x_col, y_col], sep=sep)
        x = table[x_col].tolist()
        y = table[y_col].tolist()

    if auto_labels:
        x_label = read_labels(data_filename_x, x_col, sep=sep)
        y_label = read_labels(data_filename_y, y_col, sep=sep)
    else:
        # NOTE(review): the filename is cut after the last occurrence of ``sep``
        # (the table separator), which looks like it was meant to be the path
        # separator -- confirm the intent before changing it.
        x_name_start = data_filename_x.rfind(sep) + 1
        x_label = data_filename_x[x_name_start:] + '?' + str(x_col)
        y_name_start = data_filename_y.rfind(sep) + 1
        y_label = data_filename_y[y_name_start:] + '?' + str(y_col)

    return x, y, x_label, y_label
Exemple #3
0
def parse_data(data_descriptors, sep='\t'):
    """Read curves (pairs of x and y value lists) from the described tables.

    Every descriptor has the form ``'filename?col1?col2?...'`` with integer
    columns. The columns of each file are handed to ``make_curve_tuples``,
    which returns the curves it built together with a pending ``x`` (``None``
    when nothing is pending). A pending ``x`` is paired with the first column
    of the next descriptor, so a curve may span two files.

    Parameters
    ----------
    data_descriptors : iterable of str
        Descriptors of the files and columns to read.
    sep : str
        Passed through to ``plot_utils.read_table``.

    Returns
    -------
    list of tuple
        The ``(x, y)`` curves in the order they were read.

    Raises
    ------
    PlotsError
        If an ``x`` column is left without a matching ``y`` column.
    ValueError
        If a column part of a descriptor is not an integer.
    """
    data, x, y = [], None, None
    for data_descriptor in data_descriptors:
        dd_split = data_descriptor.split('?')
        data_filename = dd_split[0]
        # A real list: it is indexed and sliced below, which a lazy ``map`` object does not support.
        columns = [int(column) for column in dd_split[1:]]
        df = plot_utils.read_table(data_filename, usecols=columns, sep=sep)
        if x is None:
            subdata, x, y = make_curve_tuples(df, columns)
        else:
            # Complete the curve whose x was left pending by the previous descriptor.
            y = df[columns[0]].tolist()
            data.append((x, y))
            subdata, x, y = make_curve_tuples(df, columns[1:])
        data.extend(subdata)

    if x is not None:
        # The exception used to be constructed but never raised, so the check had no effect.
        raise PlotsError(message='odd number of columns to plot')

    return data
def plot_distro(data, data_descriptor=None, ax=None, grid_columns=None, x_tics=None, y_tics=None, linewidth=2,
    box_plot=False, violin_plot=False, bar_plot=False, point_plot=False, count_plot=False, swarm_plot=False, strip_plot=False,
    y_range=None, x_log=False, y_log=False, sep=None, horizontal=False, order=None, hue_order=None,
    box_outlier=5, box_whis=1.5, box_notch=False,
    violin_scale='area', violin_inner='box', violin_cut=2, violin_split=False, violin_scale_hue=False,
    estimator=plot_utils.ESTIMATORS['mean'], ci=95, capsize=0.2,
    strip_jitter=True, points_colour=None,
    point_markers='o', point_marker_size=2,
    title=None, x_label=None, y_label=None, rotate_x_tics=None, bold_x_tics=False,
    hide_x_tick_marks=False, hide_y_tick_marks=False,
    hide_x_ticks=False, hide_y_ticks=False,
    label=None, show_legend=True, legend_out=False, legend_out_pad=None,
    despine=True, style='whitegrid_ticks', fontsize=16, colours=None, palette=None, reverse_palette=False, ncolours=None, figsize=None, fig_padding=0.1, dpi=None, output=None, out_format=None,
    box_kwargs=None, violin_kwargs=None, bar_kwargs=None, point_kwargs=None, count_kwargs=None, swarm_kwargs=None, strip_kwargs=None, legend_kwargs=None, kwargs=None,
    color=None):
    '''
    Draw one or more seaborn categorical plots (box, violin, bar, point, count,
    swarm, strip) of the same data and save the figure or show it.

    Parameters
    ----------
    data : pandas.DataFrame with columns 'x', 'y', 'hue'
        ('y' is mandatory, or 'x' for a count plot; the others are optional)
    or
    data : 3-tuple of lists x, y, and hue ([...], [...], [...]), thus for a single plot data=(None, [...], None)
    data_descriptor : str, used only when data is None
        'filename?y', 'filename?x?y' or 'filename?x?y?hue' (integer columns;
        'filename?x' or 'filename?x?hue' for a count plot), or a plain
        'filename' of a grid-like file together with grid_columns and x_tics.
    box_plot, violin_plot, bar_plot, point_plot, count_plot, swarm_plot, strip_plot : bool
        Which plots to draw. At least one is required; count and point plots
        cannot be combined with any other plot.
    x_tics : list
        Labels of the grid columns for a grid-like file; otherwise a single
        x-category that is assigned to data without an 'x' column.
    order, hue_order : list
        Have to contain exactly the distinct values of 'x' / 'hue'.
    kwargs, box_kwargs, violin_kwargs, ... : dict
        Extra keyword arguments merged (by plot_utils.merged_kwargs) into the
        call of every plot / of the respective plot.
    output : str
        Where to save the figure; when None, the figure is shown instead.

    The remaining parameters are styling options that are passed on to
    plot_utils, seaborn and matplotlib. The parameter ``color`` is accepted
    but not used.

    Returns None. Raises PlotsError when the arguments are inconsistent.
    '''

    if not (box_plot or violin_plot or bar_plot or point_plot or count_plot
            or swarm_plot or strip_plot):
        raise PlotsError(
            message=
            'Specify a plot to plot: box or violin or bar or count or swarm or strip'
        )
    if count_plot and (box_plot or violin_plot or bar_plot or point_plot
                       or swarm_plot or strip_plot):
        raise PlotsError(
            message='Count plot cannot be combined with any other plot')
    if point_plot and (box_plot or violin_plot or bar_plot or count_plot
                       or swarm_plot or strip_plot):
        raise PlotsError(
            message='Point plot cannot be combined with any other plot')

    # PARSED DATA IS SUPPLIED AS pd.DataFrame({'x':[] , 'y':[] , 'hue':[] }) OR ([...], [...], [...])
    if data is not None:
        if data_descriptor is not None:
            raise PlotsError(
                message=
                'You can specify only one of the mutually exclusive arguments: "data" or "data_descriptor"'
            )
        if grid_columns is not None:
            raise PlotsError(
                message=
                'The grid_columns option is only used when the data is supplied as a filename'
            )
        # pd.DataFrame({'x':[] , 'y':[] , 'hue':[] })
        # The DataFrame test has its own branch: a valid DataFrame must never
        # fall through to the tuple handling below.
        if isinstance(data, pd.DataFrame):
            required_column = 'x' if count_plot else 'y'
            if required_column not in data:
                raise PlotsError(
                    message=
                    'The dataframe has to have a "y" column, optionally also "x" and "hue" columns'
                    if not count_plot else
                    'The dataframe has to have a "x" column, optionally also a "hue" column'
                )
        # ([...], [...], [...])
        else:
            if len(data) != 3 or ((data[1] is None and not count_plot) or
                                  (data[0] is None and count_plot)):
                raise PlotsError(
                    message=
                    'The data should be a pandas.DataFrame or a 3-tuple of lists (x, y, hue). '
                    +
                    ('The y list cannot be None, x and hue are optional (can be None).'
                     if not count_plot else
                     'The x list cannot be None, hue is optional (can be None), y is ignored (set it None).'
                     ))
            x, y, hue = data
            data = make_xyhue_dataframe(x, y, hue)

    # DATA IS SUPPLIED AS A FILENAME - THE FILE IS A THREE_COLUMN_FILE OR A GRID_LIKE_FILE
    else:
        if data_descriptor is None:
            raise PlotsError(
                message='You must specify "data" or "data_descriptor"')
        # GRID_LIKE_FILE
        if '?' not in data_descriptor:
            if grid_columns is None:
                raise PlotsError(
                    message=
                    'You must specify columns for y, or x and y, or x, y and hue, or specify grid-columns: filename?y or filename?x?y or filename?x?y?hue or filename --grid_columns x1 x2 x3'
                )
            if x_tics is None:
                raise PlotsError(
                    message=
                    'When specifying --grid_columns, you have to specify also the labels for xtics (--xtics)'
                )
            if len(x_tics) != len(grid_columns):
                raise PlotsError(
                    message=
                    'The number of columns (--grid_columns) differ from the number of xtics (--xtics)'
                )
            x, y, hue = read_grid_as_xyhue(data_descriptor,
                                           grid_columns,
                                           x_tics,
                                           sep=sep,
                                           comment='#')
            data = make_xyhue_dataframe(x, y, hue)
            x_tics = None  # hack; they have been already assigned; this will prevent re-assigning again below
        # THREE_COLUMN_FILE
        else:
            if grid_columns is not None:
                raise PlotsError(
                    message=
                    'When specifying columns using ?, you cannot specify --grid_columns.'
                )
            data_descriptor_split = data_descriptor.split('?')
            filename = data_descriptor_split[0]
            if len(data_descriptor_split) == 2:
                columns_names = {
                    'y': int(data_descriptor_split[1])
                } if not count_plot else {
                    'x': int(data_descriptor_split[1])
                }
            elif len(data_descriptor_split) == 3:
                columns_names = {
                    'x': int(data_descriptor_split[1]),
                    'y': int(data_descriptor_split[2])
                } if not count_plot else {
                    'x': int(data_descriptor_split[1]),
                    'hue': int(data_descriptor_split[2])
                }
            elif len(data_descriptor_split) == 4:
                if count_plot:
                    raise PlotsError(
                        message='For count plot, you can only specify x and hue'
                    )
                columns_names = {
                    'x': int(data_descriptor_split[1]),
                    'y': int(data_descriptor_split[2]),
                    'hue': int(data_descriptor_split[3])
                }
            else:
                raise PlotsError(
                    message=
                    'You can specify only up to 3 columns for x, y and hue: filename or filename?y or filename?x?y or filename?x?y?hue'
                )
            # Names and column indices are passed sorted by the column index.
            names, columns = zip(
                *sorted(columns_names.items(), key=lambda x: x[1]))
            data = plot_utils.read_table(filename,
                                         usecols=columns,
                                         names=names,
                                         sep=sep)

    if x_tics is not None:
        if 'x' in data:
            raise PlotsError(
                message=
                'You specified the x-categories in your data, thus you cannot use the xtics option'
            )
        elif len(x_tics) != 1:
            raise PlotsError(
                message=
                'You can specify only one x-category using xtics (unless you use a grid-like file input)'
            )
        else:
            data['x'] = np.array([x_tics[0]] * len(data['y']))

    if not show_legend and (legend_out or legend_kwargs is not None):
        raise PlotsError(
            message=
            'If you hide the legend (--hide_legend or show_legend=False), you cannot set it outside (legend_out) or set it properties (legend_kwargs)'
        )
    if y_range is not None and (len(y_range) != 2 or y_range[0] >= y_range[1]):
        raise PlotsError(
            message=
            'You need to provide exactly two numbers to set yrange: "min max"')
    for my_order, variable in ((order, 'x'), (hue_order, 'hue')):
        if my_order is not None:
            if variable not in data:
                raise PlotsError(
                    message=
                    'You specified order for %s but your data does not contain %s'
                    % (variable, variable))
            set_variable = set(data[variable])
            if len(my_order) != len(set_variable) or set(
                    my_order) != set_variable:
                raise PlotsError(
                    message='The specified order does not match %s' % variable)
    # NOTE(review): only the string 'std' is exempted from the range check here;
    # confirm that this is the spelling the plotting backend expects.
    if ci is not None and ci != 'std' and not callable(ci) and (ci < 0
                                                                or ci > 100):
        raise PlotsError(message='"ci" must be None or within 0 and 100')

    #if (swarm or strip) and 'hue' in data: raise PlotsError(message='Swarmplot is not supported when plotting plots with hue.')
    if colours is not None:
        if palette is not None:
            raise PlotsError(
                message=
                'You can specify only one of the mutually exclusive arguments: "colours" or "palette"'
            )
        if ncolours is not None:
            raise PlotsError(
                message=
                'You cannot specify "ncolours" when you specified "colours"')
    if figsize is not None and len(figsize) != 2:
        raise PlotsError(
            message=
            'You need to provide exactly two numbers to set figure size: "width height"'
        )

    ##

    font = plot_utils.init_plot_style(style,
                                      fontsize,
                                      colours,
                                      palette,
                                      reverse_palette,
                                      ncolours,
                                      hide_x_tick_marks=hide_x_tick_marks,
                                      hide_y_tick_marks=hide_y_tick_marks)
    fig = plt.figure()
    if figsize is not None:
        fig.set_figwidth(figsize[0])
        fig.set_figheight(figsize[1])
    if dpi is not None: fig.set_dpi(dpi)

    # Keyword arguments shared by all the seaborn calls below.
    default_kwargs = {
        'ax': ax,
        'x': 'x' if 'x' in data else None,
        'y': 'y' if 'y' in data else None,
        'hue': 'hue' if 'hue' in data else None,
        'data': data,
        'orient': 'h' if horizontal else 'v',
        'order': order,
        'hue_order': hue_order,
        'linewidth': linewidth
    }

    if box_plot:
        axs = sb.boxplot(**plot_utils.merged_kwargs(
            default_kwargs,
            dict(fliersize=box_outlier,
                 whis=box_whis,
                 notch=box_notch,
                 flierprops={'marker': 'o'}), kwargs, box_kwargs))
    if violin_plot:
        axs = sb.violinplot(**plot_utils.merged_kwargs(
            default_kwargs,
            dict(scale=violin_scale,
                 inner=violin_inner,
                 cut=violin_cut,
                 split=violin_split,
                 scale_hue=violin_scale_hue), kwargs, violin_kwargs))
    if bar_plot:
        axs = sb.barplot(**plot_utils.merged_kwargs(
            default_kwargs, dict(estimator=estimator, ci=ci, capsize=capsize),
            kwargs, bar_kwargs))
    if point_plot:
        axs = sb.pointplot(**plot_utils.merged_kwargs(
            default_kwargs,
            dict(markers=point_markers,
                 estimator=estimator,
                 ci=ci,
                 capsize=capsize), kwargs, point_kwargs))
    if count_plot:
        axs = sb.countplot(
            **plot_utils.merged_kwargs(default_kwargs, kwargs, count_kwargs))
    if swarm_plot:
        axs = sb.swarmplot(**plot_utils.merged_kwargs(
            default_kwargs, dict(edgecolor='black', linewidth=1),
            dict(facecolor=points_colour) if points_colour is not None else {},
            kwargs, swarm_kwargs))
    if strip_plot:
        axs = sb.stripplot(**plot_utils.merged_kwargs(
            default_kwargs,
            dict(edgecolor='black',
                 linewidth=1,
                 jitter=strip_jitter,
                 split=True),
            dict(facecolor=points_colour) if points_colour is not None else {},
            kwargs, strip_kwargs))

    if point_plot:
        plt.setp(axs.collections, sizes=[point_marker_size])
        plt.setp(axs.lines, linewidth=linewidth)
    if y_range is not None: axs.set_ylim(y_range[0], y_range[1])
    axs.yaxis.set_major_formatter(ticker.FormatStrFormatter('%.2f'))
    if y_tics is not None: axs.set_yticks(y_tics)
    if x_log: axs.set_xscale('log')
    if y_log: axs.set_yscale('log')
    if x_label is not None:
        axs.set_xlabel(x_label, labelpad=8, fontproperties=font.get('b'))
    else:
        axs.set_xlabel('', labelpad=8)
    if y_label is not None:
        axs.set_ylabel(y_label, labelpad=10, fontproperties=font.get('b'))
    else:
        axs.set_ylabel('', labelpad=10)
    if title is not None:
        ttl = axs.set_title(title, fontproperties=font.get('b'))
        ttl.set_position([.5, 1.05])
    if label is not None:
        # label is (text, x position, y position), positions in axes coordinates
        axs.text(label[1],
                 label[2],
                 label[0],
                 horizontalalignment='left',
                 verticalalignment='top',
                 transform=axs.transAxes,
                 fontproperties=font.get('b'))
    plt.setp(axs.get_xticklabels(), rotation=rotate_x_tics)
    plot_utils.set_fontproperties(font.get('b' if bold_x_tics else 'n'),
                                  axs.get_xticklabels())
    plot_utils.set_fontproperties(font.get('n'), axs.get_yticklabels())
    if hide_x_ticks: axs.xaxis.set_major_locator(ticker.NullLocator())
    if hide_y_ticks: axs.yaxis.set_major_locator(ticker.NullLocator())
    if despine or style in ['despine_ticks', 'whitegrid_ticks']:
        sb.despine(top=True, right=True)

    # Legend is a little tricky if the option is to be outside of the plot
    artists = []
    if show_legend:
        legend_handles, legend_labels = axs.get_legend_handles_labels()
        num_plots = sum([
            box_plot, violin_plot, bar_plot, point_plot, count_plot,
            swarm_plot, strip_plot
        ])
        if num_plots != 1:
            # Keep only the first 1/num_plots of the legend entries. Floor
            # division keeps the slice bound an integer on Python 3 as well.
            legend_handles = legend_handles[:len(legend_handles) // num_plots]
            legend_labels = legend_labels[:len(legend_labels) // num_plots]
        legend_out_kwargs = dict(
            bbox_to_anchor=(1, 1), loc=2,
            borderpad=legend_out_pad) if legend_out else None
        legend_handles, legend_labels = legend_handles[::-1], legend_labels[::-1]
        legend = axs.legend(
            legend_handles, legend_labels,
            **plot_utils.merged_kwargs(dict(prop=font.get('n')),
                                       legend_out_kwargs, legend_kwargs))
        artists.append(legend)

    plt.tight_layout()

    if output is not None:
        # NOTE(review): confirm that the installed matplotlib still accepts the
        # "additional_artists" keyword of savefig.
        plt.savefig(output,
                    format=out_format,
                    dpi=dpi,
                    additional_artists=artists,
                    bbox_inches='tight',
                    pad_inches=fig_padding)
    else:
        plt.show()
    plt.close()