def test_df_from_input_json_bad(self, open_file, json):
    """df_from_input must exit with SystemExit when JSON input fails to parse."""
    open_file.return_value = MagicMock()
    # Make json.loads blow up to simulate malformed JSON on stdin
    json.loads = MagicMock(side_effect=ValueError())
    args = MagicMock(names=[], input_options=['json'])
    with self.assertRaises(SystemExit):
        io_lib.df_from_input(args, in_file=MagicMock())
def test_df_from_input_no_input(self, pd_mock):
    """df_from_input must exit with SystemExit when the csv reader fails."""
    def _fail(*a, **kw):
        raise ValueError()

    # Patch the reader itself so any attempt to parse input raises
    pd_mock.read_csv = _fail
    args = MagicMock(names=[], input_options=[])
    with self.assertRaises(SystemExit):
        io_lib.df_from_input(args, in_file=MagicMock())
def main():
    """Entry point: histogram tool — plot, or emit bin counts with --quiet."""
    args = get_input_args()
    df = io_lib.df_from_input(args)

    # pull plotting parameters off the parsed arguments
    nbins = args.nbins[0]
    range_tup = args.range
    layout_tup = args.layout
    alpha = args.alpha[0]
    do_density = args.density
    sharex, sharey = args.sharex, args.sharey

    # default to the first column when none were requested
    cols = args.cols or [df.columns[0]]
    validate_args(args, cols, df)

    plot_lib.set_plot_styling(args)

    if args.quiet:
        # no plotting: compute the histogram numerically and write it out
        counts, edges = np.histogram(
            df[cols[0]], bins=nbins, range=range_tup, density=do_density)
        centers = edges[:-1] + 0.5 * np.diff(edges)
        io_lib.df_to_output(
            args, pd.DataFrame({'bins': centers, 'counts': counts}))
        return

    # otherwise draw an interactive histogram
    df.hist(cols, bins=nbins, range=range_tup, alpha=alpha,
            sharex=sharex, sharey=sharey, layout=layout_tup,
            normed=do_density)
    plot_lib.refine_plot(args)
    plot_lib.show(args)
def main():
    """Entry point: render each dataframe row through a str.format template."""
    msg = textwrap.dedent(
        """
        Create strings from a dataframe using python str.format()
        template.  This tool is particularly useful for generating
        a list of commands that for piping into p.parallel.

        -----------------------------------------------------------------------
        Examples:

            * Create commands to touch a sequence of files in /tmp
                seq 10 | p.df --names n -i noheader\\
                | p.format -t 'touch /tmp/file{n:02d}.txt'
        -----------------------------------------------------------------------
        """
    )

    # build the argument parser
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=msg)
    arg_lib.add_args(parser, 'io_in')
    parser.add_argument('-t', '--template', required=True,
                        help='A python template string', nargs=1)
    args = parser.parse_args()

    # read the dataframe from stdin
    df = io_lib.df_from_input(args)

    # format every record through the template and write it out
    stream = OutStream(args.template[0])
    for record in df.to_dict('records'):
        stream.write(**record)
def test_df_from_input_create_names(self, pd_mock):
    """With 'noheader', columns are auto-named c0, c1, ..."""
    frame = pd.DataFrame(columns=[1, 2])
    pd_mock.read_csv = MagicMock(return_value=frame)
    pd_mock.Index = pd.Index
    args = MagicMock(names=[], input_options=['noheader'])
    result = io_lib.df_from_input(args, in_file=None)
    self.assertEqual(['c0', 'c1'], list(result.columns))
def main():
    """Entry point: Lomb-Scargle spectrogram tool (p.lomb_scargle).

    Reads a time series (possibly unevenly sampled) from stdin, computes its
    spectrum with gatspy's Lomb-Scargle implementation, and writes the
    resulting dataframe to stdout.
    """
    msg = textwrap.dedent(
        """
        Computes a spectrogram using the lomb-scargle algorithm provided by
        the gatspy module.  The input time series need not have evenly spaced
        time-stamps.  The FFT-based algorithm has complexity O[N*log(N)].

        -----------------------------------------------------------------------
        Examples:

            * Plot the spectrum of a simple sine wave
                p.linspace 0 10 100 \\
                | p.df 'df["value"] = 7 * np.sin(2*np.pi*df.time / 1.5)'\\
                      --names time\\
                | p.lomb_scargle -t time -y value --interp_exp 3\\
                | p.plot -x period -y amp --xlim 0 3

            * Show the annual and 59-day peaks in the sealevel spectrum
                p.example_data -d sealevel\\
                | p.df 'df["day"] = 365.25 * df.year'\\
                       'df["day"] = df.day - df.day.iloc[0]'\\
                | p.lomb_scargle -t day -y sealevel_mm --interp_exp 3\\
                | p.df 'df[df.period < 720]'\\
                | p.plot -x period -y amp --xlim 1 400\\
                     --title 'Sea-surface height spectrum'\\
                     --xlabel 'period (days)'
        -----------------------------------------------------------------------
        """
    )

    # read command line arguments
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter, description=msg)

    arg_lib.add_args(parser, 'io_in', 'io_out')

    parser.add_argument('-t', '--time_col', help='Time Column', nargs=1,
                        required=True, type=str)
    parser.add_argument('-y', '--observation_col', help='Observation column',
                        nargs=1, dest='val_col', required=True, type=str)
    parser.add_argument('--interp_exp', help='Interpolate by this power of 2',
                        nargs=1, type=int, default=[1])
    # typo fix: 'freqency' -> 'frequency'
    parser.add_argument(
        '--freq_order', action='store_true', dest='freq_order',
        default=False,
        help='Order output by frequency instead of period')

    # parse arguments
    args = parser.parse_args()

    # get the input dataframe and run the transform
    df = io_lib.df_from_input(args)
    df = lomb_scargle_lib.lomb_scargle(
        df, args.time_col[0], args.val_col[0], args.interp_exp[0],
        args.freq_order)

    # write dataframe to output
    io_lib.df_to_output(args, df)
def test_df_from_input_json_names(self, open_file, json): open_file.return_value = MagicMock() json.loads = MagicMock(return_value=[{'a': 1}, {'a': 2}]) args = MagicMock(names=['a'], input_options=['json']) in_file = MagicMock() df = io_lib.df_from_input(args, in_file=in_file) self.assertEqual(list(df.columns), ['a']) self.assertEqual(list(df.a), [1, 2])
def main():
    """Entry point: smooth selected dataframe columns and write the result."""
    args = get_input_args()
    df = io_lib.df_from_input(args)

    # optional abscissa column; default the value columns to the first column
    x_col = args.x[0] if args.x else None
    cols = args.y if args.y else [df.columns[0]]

    # validate every column we will touch, including the x column if given
    cols_to_check = cols + [x_col] if x_col else cols
    validate_args(args, cols_to_check, df)

    df = smooth(df, cols, x_col)
    io_lib.df_to_output(args, df)
def main():
    """Entry point: recursive sigma-edit outlier removal (p.sig_edit)."""
    msg = textwrap.dedent("""
        Remove outliers from DataFrame columns using a recursive sigma-edit
        algorithm.  The algorithm will recursively NaN out values greater than
        sigma_thresh standard deviations away from sample mean.

        -----------------------------------------------------------------------
        Examples:

            * Do a 2.5-sigma edit on a gamma distribution and show histogram
                p.rand -n 1000 -t gamma --alpha=3 --beta=.01\\
                | p.df 'df["c1"] = df.c0'\\
                | p.sig_edit -c c1 -t 2.5\\
                | p.df 'pd.melt(df)' --names raw edited\\
                | p.facet_grid --hue variable --map pl.hist\\
                  --args value --kwargs 'alpha=.2' 'range=[0, 1000]' 'bins=50'
        -----------------------------------------------------------------------
        """)

    # assemble the argument parser
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=msg)
    arg_lib.add_args(parser, 'io_in', 'io_out', 'example')
    parser.add_argument("-t", "--sigma_thresh", help="Sigma threshold",
                        nargs=1, required=True, type=float)
    parser.add_argument("-c", "--cols", required=True,
                        help="Column(s) to sigma-edit", nargs="+")
    parser.add_argument("--max_iter", help="Max number of recursions",
                        nargs=1, type=int, default=[20])
    args = parser.parse_args()

    # read input, run the recursive edit, write output
    frame = io_lib.df_from_input(args)
    frame = outlier_lib.sigma_edit_dataframe(
        args.sigma_thresh[0], args.cols, frame, max_iter=args.max_iter[0])
    io_lib.df_to_output(args, frame)
def main():
    """Entry point: interactive xy plotting tool (p.plot)."""
    msg = textwrap.dedent("""
        Creates interactive xy plots.  Loosely based around matplotlib's
        pyplot.plot command.

        -----------------------------------------------------------------------
        Examples:

            * Really simple plot
                p.linspace 1 10 7 | p.plot -x c0 -y c0

            * Plot two traces
                p.linspace 0 6.28 100\\
                | p.df 'df["cos"]=np.cos(df.t)' 'df["sin"]=np.sin(df.t)'\\
                      --names t\\
                | p.plot -x t -y sin cos\\
                         --style '.-' 'o-' --alpha 1 .2 --legend best

            * Plot sea-level time series
                p.example_data -d sealevel\\
                | p.plot -x year -y sealevel_mm --style '.'\\
                --xlabel year --ylabel 'relative sea level (mm)'\\
                --title 'Sea Level Rise' --legend best --xlim 1995 2015
        -----------------------------------------------------------------------
        """)

    # assemble the argument parser
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=msg)
    arg_lib.add_args(parser, 'io_in', 'xy_plotting', 'decorating')
    parser.add_argument("-a", "--alpha", help="Set opacity level(s)",
                        nargs='+', default=[1.], type=float,
                        metavar='alpha')
    args = parser.parse_args()

    # read input, apply theme, and draw
    frame = io_lib.df_from_input(args)
    plot_lib.set_plot_styling(args)
    plot_lib.draw_xy_plot(args, frame)
def main():
    """Entry point: interactive xy plotting tool (p.plot)."""
    msg = textwrap.dedent(
        """
        Creates interactive xy plots.  Loosely based around matplotlib's
        pyplot.plot command.

        -----------------------------------------------------------------------
        Examples:

            * Really simple plot
                p.linspace 1 10 7 | p.plot -x c0 -y c0

            * Plot two traces
                p.linspace 0 6.28 100\\
                | p.df 'df["cos"]=np.cos(df.t)' 'df["sin"]=np.sin(df.t)'\\
                      --names t\\
                | p.plot -x t -y sin cos\\
                         --style '.-' 'o-' --alpha 1 .2 --legend best

            * Plot sea-level time series
                p.example_data -d sealevel\\
                | p.plot -x year -y sealevel_mm --style '.'\\
                --xlabel year --ylabel 'relative sea level (mm)'\\
                --title 'Sea Level Rise' --legend best --xlim 1995 2015
        -----------------------------------------------------------------------
        """
    )

    # assemble the argument parser
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=msg)
    arg_lib.add_args(parser, 'io_in', 'xy_plotting', 'decorating')
    parser.add_argument(
        "-a", "--alpha", help="Set opacity level(s)", nargs='+',
        default=[1.], type=float, metavar='alpha')
    args = parser.parse_args()

    # read input, apply theme, and draw
    frame = io_lib.df_from_input(args)
    plot_lib.set_plot_styling(args)
    plot_lib.draw_xy_plot(args, frame)
def main():
    """Entry point: recursive sigma-edit outlier removal (p.sig_edit)."""
    msg = textwrap.dedent(
        """
        Remove outliers from DataFrame columns using a recursive sigma-edit
        algorithm.  The algorithm will recursively NaN out values greater than
        sigma_thresh standard deviations away from sample mean.

        -----------------------------------------------------------------------
        Examples:

            * Do a 2.5-sigma edit on a gamma distribution and show histogram
                p.rand -n 1000 -t gamma --alpha=3 --beta=.01\\
                | p.df 'df["c1"] = df.c0'\\
                | p.sig_edit -c c1 -t 2.5\\
                | p.df 'pd.melt(df)' --names raw edited\\
                | p.facet_grid --hue variable --map pl.hist\\
                  --args value --kwargs 'alpha=.2' 'range=[0, 1000]' 'bins=50'
        -----------------------------------------------------------------------
        """
    )

    # assemble the argument parser
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=msg)
    arg_lib.add_args(parser, 'io_in', 'io_out', 'example')
    parser.add_argument("-t", "--sigma_thresh", help="Sigma threshold",
                        nargs=1, required=True, type=float)
    parser.add_argument("-c", "--cols", required=True,
                        help="Column(s) to sigma-edit", nargs="+")
    parser.add_argument("--max_iter", help="Max number of recursions",
                        nargs=1, type=int, default=[20])
    args = parser.parse_args()

    # read input, run the recursive edit, write output
    frame = io_lib.df_from_input(args)
    frame = outlier_lib.sigma_edit_dataframe(
        args.sigma_thresh[0], args.cols, frame, max_iter=args.max_iter[0])
    io_lib.df_to_output(args, frame)
def main():
    """Entry point: histogram tool — plot, or emit bin counts with --quiet.

    matplotlib is imported lazily (only when plotting) so --quiet works on
    systems without a display.
    """
    args = get_input_args()
    df = io_lib.df_from_input(args)

    # pull plotting parameters off the parsed arguments
    nbins = args.nbins[0]
    range_tup = args.range
    layout_tup = args.layout
    alpha = args.alpha[0]
    do_density = args.density
    sharex, sharey = args.sharex, args.sharey

    # default to the first column when none were requested
    cols = args.cols or [df.columns[0]]
    validate_args(args, cols, df)

    if args.quiet:
        # no plotting: compute the histogram numerically and write it out
        counts, edges = np.histogram(
            df[cols[0]], bins=nbins, range=range_tup, density=do_density)
        centers = edges[:-1] + 0.5 * np.diff(edges)
        io_lib.df_to_output(
            args, pd.DataFrame({'bins': centers, 'counts': counts}))
        return

    # plotting requested: verify matplotlib exists, then import plot_lib lazily
    module_checker_lib.check_for_modules(['matplotlib'])
    plot_lib = get_imports('pandashells.lib.plot_lib')
    plot_lib.set_plot_styling(args)
    df.hist(cols, bins=nbins, range=range_tup, alpha=alpha,
            sharex=sharex, sharey=sharey, layout=layout_tup,
            normed=do_density)
    plot_lib.refine_plot(args)
    plot_lib.show(args)
def main():
    """Entry point: p.merge — join two datasets like a database join.

    Reads two files named on the command line, merges them with pd.merge,
    and writes the joined frame to stdout.
    """
    msg = textwrap.dedent("""
        Tool to merge datasets.  Similar functionality to database joins.
        The arguments closely parallel those of the pandas merge command.
        See the pandas merge documentation for more details.

        -----------------------------------------------------------------------
        Examples:

            * Merge election polls with electoral-college numbers
                p.merge <(p.example_data -d election) \\
                        <(p.example_data -d electoral_college) \\
                        --how left --on state \\
                | p.df -o table | head
        -----------------------------------------------------------------------
        """)

    # read command line arguments
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter, description=msg)

    arg_lib.add_args(parser, 'io_in', 'io_out', 'example')

    parser.add_argument('--how', choices=['left', 'right', 'inner', 'outer'],
                        dest='how', default=['inner'], nargs=1,
                        help="Type of join.  Default='inner'")

    # typo fix below: help strings previously read 'List of of columns'
    msg = 'List of columns on which to join'
    parser.add_argument('--on', nargs='+', metavar='col', type=str,
                        dest='on', help=msg)

    msg = 'List of columns from left file to join on. '
    parser.add_argument('--left_on', nargs='+', metavar='col', type=str,
                        dest='left_on', help=msg)

    msg = 'List of columns from right file to join on. '
    parser.add_argument('--right_on', nargs='+', metavar='col', type=str,
                        dest='right_on', help=msg)

    msg = 'List of suffixes appended to identically '
    msg += 'named columns'
    parser.add_argument('--suffixes', nargs=2, metavar='_x _y', type=str,
                        dest='suffixes', default=['_x', '_y'], help=msg)

    parser.add_argument("file", help="Files to join", nargs=2, type=str,
                        metavar='file')

    args = parser.parse_args()
    validate_args(args)

    # get merge options from cli
    how = args.how[0]
    on = args.on if args.on else None
    left_on = args.left_on if args.left_on else None
    right_on = args.right_on if args.right_on else None
    suffixes = args.suffixes

    # get file names
    left_name, right_name = tuple(args.file)

    # load the dataframes
    df_left = io_lib.df_from_input(args, left_name)
    df_right = io_lib.df_from_input(args, right_name)

    # perform the merge
    dfj = pd.merge(df_left, df_right, how=how, on=on, left_on=left_on,
                   right_on=right_on, sort=True, suffixes=suffixes)

    # output the joined frame
    io_lib.df_to_output(args, dfj)
def main():
    """Entry point: p.merge — join two datasets like a database join.

    Reads two files named on the command line, merges them with pd.merge,
    and writes the joined frame to stdout.
    """
    msg = textwrap.dedent(
        """
        Tool to merge datasets.  Similar functionality to database joins.
        The arguments closely parallel those of the pandas merge command.
        See the pandas merge documentation for more details.

        -----------------------------------------------------------------------
        Examples:

            * Merge election polls with electoral-college numbers
                p.merge <(p.example_data -d election) \\
                        <(p.example_data -d electoral_college) \\
                        --how left --on state \\
                | p.df -o table | head
        -----------------------------------------------------------------------
        """
    )

    # read command line arguments
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter, description=msg)

    arg_lib.add_args(parser, 'io_in', 'io_out', 'example')

    parser.add_argument('--how', choices=['left', 'right', 'inner', 'outer'],
                        dest='how', default=['inner'], nargs=1,
                        help="Type of join.  Default='inner'")

    # typo fix below: help strings previously read 'List of of columns'
    msg = 'List of columns on which to join'
    parser.add_argument('--on', nargs='+', metavar='col', type=str,
                        dest='on', help=msg)

    msg = 'List of columns from left file to join on. '
    parser.add_argument('--left_on', nargs='+', metavar='col', type=str,
                        dest='left_on', help=msg)

    msg = 'List of columns from right file to join on. '
    parser.add_argument('--right_on', nargs='+', metavar='col', type=str,
                        dest='right_on', help=msg)

    msg = 'List of suffixes appended to identically '
    msg += 'named columns'
    parser.add_argument('--suffixes', nargs=2, metavar='_x _y', type=str,
                        dest='suffixes', default=['_x', '_y'], help=msg)

    parser.add_argument("file", help="Files to join", nargs=2, type=str,
                        metavar='file')

    args = parser.parse_args()
    validate_args(args)

    # get merge options from cli
    how = args.how[0]
    on = args.on if args.on else None
    left_on = args.left_on if args.left_on else None
    right_on = args.right_on if args.right_on else None
    suffixes = args.suffixes

    # get file names
    left_name, right_name = tuple(args.file)

    # load the dataframes
    df_left = io_lib.df_from_input(args, left_name)
    df_right = io_lib.df_from_input(args, right_name)

    # perform the merge
    dfj = pd.merge(df_left, df_right, how=how, on=on, left_on=left_on,
                   right_on=right_on, sort=True, suffixes=suffixes)

    # output the joined frame
    io_lib.df_to_output(args, dfj)
def main():
    """Entry point: p.cdf — plot (or print) an empirical CDF for one column."""
    # typo fix: 'emperical' -> 'empirical'
    msg = textwrap.dedent(
        """
        Plots the empirical cumulative distribution function (ECDF).

        -----------------------------------------------------------------------
        Examples:

            * Plot ECDF for 10k samples from the standard normal distribution.
                p.rand -t normal -n 10000 | p.cdf -c c0

            * Instead of plotting, send ECDF values to stdout
                p.rand -t normal -n 10000 | p.cdf -c c0 -q | head
        -----------------------------------------------------------------------
        """
    )

    # read command line arguments
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter, description=msg)

    # specify column to use
    parser.add_argument(
        "-c", "--col", required=True, nargs=1,
        help="Column to plot distribution")
    parser.add_argument(
        '-n', '--n_points', nargs=1, type=int,
        help='Number of output points (default is twice input len)')
    # typo fix: 'Quiet mean no plots' -> 'Quiet means no plots'
    parser.add_argument(
        '-q', '--quiet', action='store_true', default=False,
        help='Quiet means no plots. Send numeric output to stdout instead')

    # parse arguments
    arg_lib.add_args(parser, 'decorating', 'io_in', 'io_out',)
    args = parser.parse_args()

    # get the input dataframe and extract column
    df = io_lib.df_from_input(args)
    x = df[args.col[0]].values

    # create the output distribution
    n_out = 2 * len(x) if args.n_points is None else args.n_points[0]
    x_out = np.linspace(min(x), max(x), n_out)
    y_out = ECDF(x)(x_out)

    # send values to stdout if quiet specified
    if args.quiet:
        df_out = pd.DataFrame(
            {'x': x_out, 'p_less': y_out, 'p_greater': 1 - y_out})
        df_out = df_out[['x', 'p_less', 'p_greater']]
        io_lib.df_to_output(args, df_out)
        return

    # set the appropriate theme and make plot
    plot_lib.set_plot_styling(args)
    pl.plot(x_out, y_out, label='P({} < x)'.format(args.col[0]))
    pl.plot(x_out, 1. - y_out, label='P({} > x)'.format(args.col[0]))
    pl.xlabel('x')
    pl.legend(loc='best')
    plot_lib.refine_plot(args)
    plot_lib.show(args)
def main():
    """Entry point: p.facet_grid — faceted plots via seaborn FacetGrid.

    Builds a FacetGrid from row/col/hue arguments and maps a user-named
    plotting function (looked up by exec) over the facets.
    """
    msg = textwrap.dedent(
        """
        Creates faceted plots using seaborn FacetGrid.

        With this tool, you can create a group of plots which show aspects
        of the same dataset broken down in different ways.  See the seaborn
        FacetGrid documentation for more detail.

        The --map argument to this function specifies a function to use
        for generating each of the plots.  The following modules are available
        in the namespace:

            pl = pylab
            sns = seaborn

        -----------------------------------------------------------------------
        Examples:

            * Scatterplot of tips vs bill for different combinations of sex,
              smoker, and day of the week:
                p.example_data -d tips | \\
                p.facet_grid --row smoker --col sex --hue day \\
                --map pl.scatter \\
                --args total_bill tip --kwargs 'alpha=.2' 's=100'

            * Histogram of tips broken down by sex, smoker and day
                p.example_data -d tips | p.facet_grid --col day \\
                --row sex --hue smoker --sharex --sharey --aspect 1 \\
                --map pl.hist --args tip \\
                --kwargs 'alpha=.2' 'range=[0, 10]' 'bins=20'
        -----------------------------------------------------------------------
        """
    )

    # read command line arguments
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter, description=msg)

    arg_lib.add_args(parser, 'io_in')

    # facet layout: which variable varies across rows / columns / colors
    msg = 'Different values of this variable in separate rows'
    parser.add_argument(
        '--row', nargs=1, type=str, dest='row', metavar='row', help=msg)

    msg = 'Different values of this variable in separate columns'
    parser.add_argument(
        '--col', nargs=1, type=str, dest='col', metavar='col', help=msg)

    msg = 'Different values of this variable in separate colors'
    parser.add_argument(
        '--hue', nargs=1, type=str, dest='hue', metavar='hue', help=msg)

    # per-facet geometry
    msg = 'The aspect ratio of each plot'
    parser.add_argument(
        '--aspect', nargs=1, type=float, dest='aspect', metavar='aspect',
        default=[2], help=msg)

    msg = 'The size of each plot (default=4)'
    parser.add_argument(
        '--size', nargs=1, type=float, dest='size', metavar='size', help=msg,
        default=[4])

    # the function mapped over each facet, plus its args/kwargs
    msg = 'The plotting function to use for each facet'
    parser.add_argument(
        '--map', nargs=1, type=str, dest='map', metavar='map',
        required=True, help=msg)

    msg = 'The args to pass to the plotting function'
    parser.add_argument(
        '--args', nargs='+', type=str, dest='args', metavar='args',
        required=True, help=msg)

    msg = 'Plotting function kwargs expressed as \'a=1\' \'b=2\' ... '
    parser.add_argument(
        '--kwargs', nargs='+', type=str, dest='kwargs', metavar='kwargs',
        help=msg)

    # axis sharing / limits across facets
    msg = 'Share x axis'
    parser.add_argument('--sharex', action='store_true', dest='sharex',
                        default=False, help=msg)

    msg = 'Share y axis'
    parser.add_argument('--sharey', action='store_true', dest='sharey',
                        default=False, help=msg)

    msg = 'x axis limits when sharex=True'
    parser.add_argument(
        '--xlim', nargs=2, type=float, dest='xlim', metavar='xlim',
        help=msg)

    # NOTE(review): help text says 'sharex' — presumably meant 'sharey'; confirm
    msg = 'y axis limits when sharex=True'
    parser.add_argument(
        '--ylim', nargs=2, type=float, dest='ylim', metavar='ylim',
        help=msg)

    msg = "Save the figure to this file"
    parser.add_argument('--savefig', nargs=1, type=str, help=msg)

    # silence seaborn/matplotlib warnings so they don't pollute stdout piping
    warnings.filterwarnings('ignore')

    # parse arguments
    args = parser.parse_args()

    # get the input dataframe
    df = io_lib.df_from_input(args)

    # translate parsed args (each a 1-element list or None) into FacetGrid kwargs
    facet_grid_kwargs = {
        'row': args.row[0] if args.row else None,
        'col': args.col[0] if args.col else None,
        'hue': args.hue[0] if args.hue else None,
        'aspect': args.aspect[0],
        'size': args.size[0],
        'sharex': args.sharex,
        'sharey': args.sharey,
        'xlim': args.xlim if args.xlim else None,
        'ylim': args.ylim if args.ylim else None,
    }
    grid = sns.FacetGrid(df, **facet_grid_kwargs)

    # resolve the plotting callable by name within a scope that exposes pl/sns.
    # SECURITY NOTE: exec on a user-supplied string — acceptable here only
    # because the string comes from the local CLI user, never untrusted input.
    map_func_name = args.map[0]
    scope = {'pl': pl, 'sns': sns, 'map_func_name': map_func_name}
    exec('map_func = {}'.format(map_func_name), scope)
    map_func = scope['map_func']

    map_args = args.args

    # build kwargs by exec'ing each 'key=value' expression (same caveat as above)
    map_kwargs = {}
    if args.kwargs:
        for kwarg in args.kwargs:
            exec('map_kwargs.update(dict({}))'.format(kwarg))

    # draw one plot per facet, attach the legend, and display
    grid.map(map_func, *map_args, **map_kwargs)  # noqa  defined in exec above
    grid.add_legend()
    plot_lib.show(args)
def test_df_from_input_tsv(self, pd_mock):
    """With 'tsv' input, read_csv is invoked on the supplied file handle."""
    pd_mock.read_csv = MagicMock(return_value=pd.DataFrame())
    args = MagicMock(names=[], input_options=['tsv'])
    in_file = MagicMock()
    io_lib.df_from_input(args, in_file=in_file)
    first_positional = pd_mock.read_csv.call_args_list[0][0][0]
    self.assertEqual(first_positional, in_file)
def test_df_from_input_with_infile(self, pd_mock):
    """An explicit in_file is handed straight to read_csv."""
    pd_mock.read_csv = MagicMock(return_value=pd.DataFrame())
    args = MagicMock(names=[], input_options=[])
    in_file = MagicMock()
    io_lib.df_from_input(args, in_file=in_file)
    first_positional = pd_mock.read_csv.call_args_list[0][0][0]
    self.assertEqual(first_positional, in_file)
def main():  # pragma: no cover
    """Entry point: p.df — run arbitrary pandas expressions over stdin.

    Reads a dataframe from stdin, applies each command-line statement in
    order, and writes the result to stdout.  Typo fixes in the help text:
    'agument' -> 'augment', 'accomodate' -> 'accommodate',
    'doesn not' -> 'does not'.
    """
    # read command line arguments
    msg = textwrap.dedent(
        """
        Enables pandas dataframe processing at the unix command line.

        This is the real workhorse of the pandashells toolkit.  It reads data
        from stdin as a dataframe, which is passed through any number of pandas
        operations provided on the command line.  Output is always to stdout.

        Each operation assumes data is in a dataframe named df.  Operations
        performed on this dataframe will overwrite the df variable with
        the results of that operation.  Special consideration is taken for
        assignments such as df['a'] = df.b + df.c.  These are understood to
        augment the input dataframe with a new column.  By way of example,
        this command:

            p.df 'df.groupby(by="a").b.count()' 'df.reset_index()'

        is equivalent to the python expressions:

            df = df.groupby(by="a").b.count()
            df = df.reset_index()

        In addition to providing access to pandas dataframes, a number of
        modules are loaded into the namespace so as to be accessible from the
        command line.  These modules are:

            pd = pandas
            np = numpy
            scp = scipy
            pl = pylab
            parse = dateutil.parser.parse
            datetime = datetime
            re = re

        ** Important **
        When creating chains of dataframe operations (see examples), it is
        important to express your chain of operations before any options. This
        is because some options can take multiple arguments and the parser
        won't be able to properly decode your meaning.

        For example:
            cat file.csv | p.df 'df["x"] = df.y + 1' -o table noheader # GOOD
            cat file.csv | p.df -o table noheader 'df["x"] = df.y + 1' # BAD

        Input can be read in different formats as specified by the -i switch.
        The most common formats are csv and table (white-space-delimited).  In
        either of these formats, p.df can accommodate input data that either
        does or does not have a header row.  When no header row is indicated,
        The columns of the Dataframe will be labeled as c0, c1, ..., cN.

        Plotting methods invoked on a Dataframe generate no output, but
        create an interactive plot instead.  There are a number of plot
        specific options available at the command line that govern the details
        of how these plots are rendered (e.g. --xlim, --legend, etc).

        -----------------------------------------------------------------------
        Examples:

            * Print a csv file in nice tabular format
                p.example_data -d tips | p.df -o table | head

            * Print a csv file to json
                p.example_data -d tips | head | p.df -o json

            * Transform csv to json then to table
                p.example_data -d tips | head | p.df -o json \\
                | p.df -i json -o table

            * Select by row
                p.example_data -d tips \\
                | p.df 'df[df.sex=="Female"]' 'df[df.smoker=="Yes"]' -o table

            * Extract columns
                p.example_data -d tips \\
                | p.df 'df[["total_bill", "tip"]].head()' -o table

            * Perform grouped aggregations
                p.example_data -d tips | p.df \\
                'df.groupby(by=["sex", "smoker"]).tip.sum()' -o table index

            * Use pandas plotting methods
                p.example_data -d tips | p.df \\
                'df.groupby(by="day").total_bill.sum().plot(kind="barh")'\\
                --xlabel 'Dollars' --title 'Total Bills by Day'

            * Convert between tabular and csv format with/without header rows
                seq 10 | awk '{print $1, 2*$1}'\\
                | p.df --names a b -i table noheader | p.df -o table noheader
        -----------------------------------------------------------------------
        """
    )

    from pandashells.lib import arg_lib

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter, description=msg)
    arg_lib.add_args(parser, 'io_in', 'io_out', 'decorating')

    msg = (
        '(MUST come before any options) '
        '[statement ...] Statement(s) to execute. '
    )
    parser.add_argument(
        "statement", help=msg, nargs="*")
    args = parser.parse_args()

    # import heavy optional modules only after the statements are known;
    # io_lib is imported afterwards so module checks happen first
    get_modules_and_shortcuts(args.statement)
    from pandashells.lib import io_lib

    # get the input dataframe
    df = io_lib.df_from_input(args)

    # execute the statements in order
    # plot commands are terminal statements so will call sys.exit()
    for cmd in args.statement:
        df = process_command(args, cmd, df)

    # write the output
    io_lib.df_to_output(args, df)
def main():  # pragma: no cover
    """Entry point: p.df — run arbitrary pandas expressions over stdin.

    Reads a dataframe from stdin, applies each command-line statement in
    order, and writes the result to stdout.  Typo fixes in the help text:
    'agument' -> 'augment', 'accomodate' -> 'accommodate',
    'doesn not' -> 'does not'.
    """
    # read command line arguments
    msg = textwrap.dedent(
        """
        Enables pandas dataframe processing at the unix command line.

        This is the real workhorse of the pandashells toolkit.  It reads data
        from stdin as a dataframe, which is passed through any number of pandas
        operations provided on the command line.  Output is always to stdout.

        Each operation assumes data is in a dataframe named df.  Operations
        performed on this dataframe will overwrite the df variable with
        the results of that operation.  Special consideration is taken for
        assignments such as df['a'] = df.b + df.c.  These are understood to
        augment the input dataframe with a new column.  By way of example,
        this command:

            p.df 'df.groupby(by="a").b.count()' 'df.reset_index()'

        is equivalent to the python expressions:

            df = df.groupby(by="a").b.count()
            df = df.reset_index()

        In addition to providing access to pandas dataframes, a number of
        modules are loaded into the namespace so as to be accessible from the
        command line.  These modules are:

            pd = pandas
            np = numpy
            scp = scipy
            pl = pylab
            parse = dateutil.parser.parse
            datetime = datetime
            re = re

        ** Important **
        When creating chains of dataframe operations (see examples), it is
        important to express your chain of operations before any options. This
        is because some options can take multiple arguments and the parser
        won't be able to properly decode your meaning.

        For example:
            cat file.csv | p.df 'df["x"] = df.y + 1' -o table noheader # GOOD
            cat file.csv | p.df -o table noheader 'df["x"] = df.y + 1' # BAD

        Input can be read in different formats as specified by the -i switch.
        The most common formats are csv and table (white-space-delimited).  In
        either of these formats, p.df can accommodate input data that either
        does or does not have a header row.  When no header row is indicated,
        The columns of the Dataframe will be labeled as c0, c1, ..., cN.

        Plotting methods invoked on a Dataframe generate no output, but
        create an interactive plot instead.  There are a number of plot
        specific options available at the command line that govern the details
        of how these plots are rendered (e.g. --xlim, --legend, etc).

        -----------------------------------------------------------------------
        Examples:

            * Print a csv file in nice tabular format
                p.example_data -d tips | p.df -o table | head

            * Select by row
                p.example_data -d tips \\
                | p.df 'df[df.sex=="Female"]' 'df[df.smoker=="Yes"]' -o table

            * Extract columns
                p.example_data -d tips \\
                | p.df 'df[["total_bill", "tip"]].head()' -o table

            * Perform grouped aggregations
                p.example_data -d tips | p.df \\
                'df.groupby(by=["sex", "smoker"]).tip.sum()' -o table index

            * Use pandas plotting methods
                p.example_data -d tips | p.df \\
                'df.groupby(by="day").total_bill.sum().plot(kind="barh")'\\
                --xlabel 'Dollars' --title 'Total Bills by Day'

            * Convert between tabular and csv format with/without header rows
                seq 10 | awk '{print $1, 2*$1}'\\
                | p.df --names a b -i table noheader | p.df -o table noheader
        -----------------------------------------------------------------------
        """
    )

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter, description=msg)
    arg_lib.add_args(parser, 'io_in', 'io_out', 'decorating', 'example')

    msg = (
        '(MUST come before any options) '
        '[statement ...] Statement(s) to execute. '
    )
    parser.add_argument(
        "statement", help=msg, nargs="*")
    args = parser.parse_args()

    # get the input dataframe
    df = io_lib.df_from_input(args)

    # execute the statements in order
    # plot commands are terminal statements so will call sys.exit()
    for cmd in args.statement:
        df = process_command(args, cmd, df)

    # write the output
    io_lib.df_to_output(args, df)
def main():
    """Entry point: Lomb-Scargle spectrogram tool (p.lomb_scargle).

    Reads a time series (possibly unevenly sampled) from stdin, computes its
    spectrum with gatspy's Lomb-Scargle implementation, and writes the
    resulting dataframe to stdout.
    """
    msg = textwrap.dedent("""
        Computes a spectrogram using the lomb-scargle algorithm provided by
        the gatspy module.  The input time series need not have evenly spaced
        time-stamps.  The FFT-based algorithm has complexity O[N*log(N)].

        -----------------------------------------------------------------------
        Examples:

            * Plot the spectrum of a simple sine wave
                p.linspace 0 10 100 \\
                | p.df 'df["value"] = 7 * np.sin(2*np.pi*df.time / 1.5)'\\
                      --names time\\
                | p.lomb_scargle -t time -y value --interp_exp 3\\
                | p.plot -x period -y amp --xlim 0 3

            * Show the annual and 59-day peaks in the sealevel spectrum
                p.example_data -d sealevel\\
                | p.df 'df["day"] = 365.25 * df.year'\\
                       'df["day"] = df.day - df.day.iloc[0]'\\
                | p.lomb_scargle -t day -y sealevel_mm --interp_exp 3\\
                | p.df 'df[df.period < 720]'\\
                | p.plot -x period -y amp --xlim 1 400\\
                     --title 'Sea-surface height spectrum'\\
                     --xlabel 'period (days)'
        -----------------------------------------------------------------------
        """)

    # read command line arguments
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter, description=msg)

    arg_lib.add_args(parser, 'io_in', 'io_out')

    parser.add_argument('-t', '--time_col', help='Time Column', nargs=1,
                        required=True, type=str)
    parser.add_argument('-y', '--observation_col', help='Observation column',
                        nargs=1, dest='val_col', required=True, type=str)
    parser.add_argument('--interp_exp', help='Interpolate by this power of 2',
                        nargs=1, type=int, default=[1])
    # typo fix: 'freqency' -> 'frequency'
    parser.add_argument('--freq_order', action='store_true', dest='freq_order',
                        default=False,
                        help='Order output by frequency instead of period')

    # parse arguments
    args = parser.parse_args()

    # get the input dataframe and run the transform
    df = io_lib.df_from_input(args)
    df = lomb_scargle_lib.lomb_scargle(df, args.time_col[0], args.val_col[0],
                                       args.interp_exp[0], args.freq_order)

    # write dataframe to output
    io_lib.df_to_output(args, df)
def main():
    """Entry point for the p.facet_grid command-line tool."""
    # argparse description text (rendered verbatim in --help output)
    msg = textwrap.dedent(
        """
        Creates faceted plots using seaborn FacetGrid.

        With this tool, you can create a group of plots which show aspects
        of the same dataset broken down in different ways.

        See the seaborn FacetGrid documentation for more detail.

        The --map argument to this function specifies a function to use
        for generating each of the plots.  The following modules are
        available in the namespace:
            pl = pylab
            sns = seaborn

        -----------------------------------------------------------------------
        Examples:

            * Scatterplot of tips vs bill for different combinations of
              sex, smoker, and day of the week:
                    p.example_data -d tips | \\
                    p.facet_grid --row smoker --col sex --hue day \\
                    --map pl.scatter \\
                    --args total_bill tip --kwargs 'alpha=.2' 's=100'

            * Histogram of tips broken down by sex, smoker and day
                p.example_data -d tips | p.facet_grid --col day \\
                --row sex --hue smoker --sharex --sharey --aspect 1 \\
                --map pl.hist --args tip \\
                --kwargs 'alpha=.2' 'range=[0, 10]' 'bins=20'
        -----------------------------------------------------------------------
        """)

    # read command line arguments
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter, description=msg)

    arg_lib.add_args(parser, 'io_in')

    msg = 'Different values of this variable in separate rows'
    parser.add_argument('--row', nargs=1, type=str, dest='row',
                        metavar='row', help=msg)

    msg = 'Different values of this variable in separate columns'
    parser.add_argument('--col', nargs=1, type=str, dest='col',
                        metavar='col', help=msg)

    msg = 'Different values of this variable in separate colors'
    parser.add_argument('--hue', nargs=1, type=str, dest='hue',
                        metavar='hue', help=msg)

    msg = 'The aspect ratio of each plot'
    parser.add_argument('--aspect', nargs=1, type=float, dest='aspect',
                        metavar='aspect', default=[2], help=msg)

    msg = 'The size of each plot (default=4)'
    parser.add_argument('--size', nargs=1, type=float, dest='size',
                        metavar='size', help=msg, default=[4])

    msg = 'The plotting function to use for each facet'
    parser.add_argument('--map', nargs=1, type=str, dest='map',
                        metavar='map', required=True, help=msg)

    msg = 'The args to pass to the plotting function'
    parser.add_argument('--args', nargs='+', type=str, dest='args',
                        metavar='args', required=True, help=msg)

    msg = 'Plotting function kwargs expressed as \'a=1\' \'b=2\' ... '
    parser.add_argument('--kwargs', nargs='+', type=str, dest='kwargs',
                        metavar='kwargs', help=msg)

    msg = 'Share x axis'
    parser.add_argument('--sharex', action='store_true', dest='sharex',
                        default=False, help=msg)

    msg = 'Share y axis'
    parser.add_argument('--sharey', action='store_true', dest='sharey',
                        default=False, help=msg)

    msg = 'x axis limits when sharex=True'
    parser.add_argument('--xlim', nargs=2, type=float, dest='xlim',
                        metavar='xlim', help=msg)

    msg = 'y axis limits when sharex=True'
    parser.add_argument('--ylim', nargs=2, type=float, dest='ylim',
                        metavar='ylim', help=msg)

    msg = "Save the figure to this file"
    parser.add_argument('--savefig', nargs=1, type=str, help=msg)

    warnings.filterwarnings('ignore')

    # parse arguments
    args = parser.parse_args()

    # get the input dataframe
    df = io_lib.df_from_input(args)

    # translate the nargs=1 lists / flags into FacetGrid constructor kwargs
    facet_grid_kwargs = {
        'row': args.row[0] if args.row else None,
        'col': args.col[0] if args.col else None,
        'hue': args.hue[0] if args.hue else None,
        'aspect': args.aspect[0],
        'size': args.size[0],
        'sharex': args.sharex,
        'sharey': args.sharey,
        'xlim': args.xlim if args.xlim else None,
        'ylim': args.ylim if args.ylim else None,
    }
    grid = sns.FacetGrid(df, **facet_grid_kwargs)

    # Resolve the plotting callable named by --map (e.g. 'pl.scatter').
    # NOTE: evaluating user-supplied text is deliberate in this tool (the
    # user controls the command line).  eval() replaces the original
    # exec('map_func = ...') + scope lookup: it returns the looked-up
    # object directly with identical results and none of the fragility of
    # exec-assignment.
    map_func_name = args.map[0]
    scope = {'pl': pl, 'sns': sns, 'map_func_name': map_func_name}
    map_func = eval(map_func_name, scope)

    map_args = args.args

    # Evaluate each --kwargs entry ('key=value') into a real kwarg.  eval
    # of 'dict(key=value)' replaces the original exec that mutated the
    # local map_kwargs from inside the exec'd string -- same behavior,
    # but without relying on exec reaching back into function locals.
    map_kwargs = {}
    if args.kwargs:
        for kwarg in args.kwargs:
            map_kwargs.update(eval('dict({})'.format(kwarg)))

    # draw one facet per (row, col, hue) combination, then show the figure
    grid.map(map_func, *map_args, **map_kwargs)
    grid.add_legend()
    plot_lib.show(args)
def main():
    """Entry point for the p.regplot command-line tool."""
    # argparse description text (rendered verbatim in --help output)
    msg = textwrap.dedent(
        """
        Create a single variable regression plot of specified order.

        -----------------------------------------------------------------------
        Examples:

            * Fit a line to synthetic data with boostrap errors.
                p.linspace 0 10 20 \\
                | p.df 'df["y_true"] = .2 * df.x' \\
                       'df["noise"] = np.random.randn(20)' \\
                       'df["y"] = df.y_true + df.noise' --names x \\
                | p.regplot -x x -y y

            * Fit a quadratic to synthetic data with boostrap errors.
                p.linspace 0 10 40 \\
                | p.df 'df["y_true"] = .5 * df.x + .3 * df.x ** 2'\\
                       'df["noise"] = np.random.randn(40)' \\
                       'df["y"] = df.y_true + df.noise' --names x \\
                | p.regplot -x x -y y --order 2

            * Fit sealevel data with no bootstrap
                p.example_data -d sealevel\\
                | p.regplot -x year -y sealevel_mm --n_boot 1
        -----------------------------------------------------------------------
        """
    )

    # read command line arguments
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter, description=msg)

    arg_lib.add_args(parser, 'io_in', 'io_out', 'decorating')

    # BUGFIX: the help strings for -x and -y were swapped.  In a
    # regression of y on x, x is the independent variable and y is the
    # dependent variable.
    msg = 'Column for independent variable'
    parser.add_argument('-x', nargs=1, type=str, dest='x', metavar='col',
                        help=msg, required=True)

    msg = 'Column for dependent variable'
    parser.add_argument('-y', nargs=1, type=str, dest='y', metavar='col',
                        help=msg, required=True)

    msg = 'The order of the polynomial to fit (default = 1)'
    parser.add_argument('--order', help=msg, nargs=1, default=[1], type=int)

    msg = 'Number of bootstrap samples for uncertainty region (default=1000)'
    parser.add_argument(
        '--n_boot', help=msg, nargs=1, default=[1000], type=int)

    parser.add_argument('-a', '--alpha', help='Set opacity', nargs=1,
                        default=[0.5], type=float)

    # parse arguments
    args = parser.parse_args()

    # get the input dataframe
    df = io_lib.df_from_input(args)

    # extract command line params
    x = df[args.x[0]].values
    y = df[args.y[0]].values

    # do a polyfit with the specified order to build the legend label
    coeffs = np.polyfit(x, y, args.order[0])
    label = make_label(coeffs, args.savefig)

    # scatter the data and overlay the fit with its bootstrap band
    sns.regplot(
        x, y, order=args.order[0], n_boot=args.n_boot[0],
        line_kws={'label': label, 'color': CC[2], 'alpha': .5},
        scatter_kws={'alpha': args.alpha[0], 'color': CC[0]})
    pl.legend(loc='best')
    pl.xlabel(args.x[0])
    pl.ylabel(args.y[0])

    plot_lib.refine_plot(args)
    plot_lib.show(args)
def main(): msg = textwrap.dedent( """ Performs (multivariable) linear regression. The fitting model is specified using the R-like, patsy syntax. Input is from stdin and output is either fitting information or the input data with columns added for the fit and residuals. ----------------------------------------------------------------------- Examples: * Fit a line to the sea-level data p.example_data -d sealevel | p.regress -m 'sealevel_mm ~ year' * Fit a trend plus annual cycle to sealevel data p.example_data -d sealevel \\ | p.df 'df["sin"] = np.sin(2 * np.pi * df.year)' \\ | p.df 'df["cos"] = np.cos(2 * np.pi * df.year)' \\ | p.regress -m 'sealevel_mm ~ year + cos + sin' * Examine residual ECDF of trend plus annual fit p.example_data -d sealevel \\ | p.df 'df["sin"] = np.sin(2 * np.pi * df.year)' \\ | p.df 'df["cos"] = np.cos(2 * np.pi * df.year)' \\ | p.regress -m 'sealevel_mm ~ year + cos + sin' --fit \\ | p.cdf -c 'resid_' --title 'ECDF of trend + annual' * Detrend sealevel data to more clearly reveal oscillations p.example_data -d sealevel \\ | p.regress -m 'sealevel_mm ~ year' --fit \\ | p.plot -x year -y resid_ --ylabel 'Trend removed (mm)' \\ --title 'Global Sea Surface Height' * Set origin of sealevel data to 0 and regress with no intercept p.example_data -d sealevel\\ | p.df 'df["year"] = df.year - df.year.iloc[0]'\\ 'df["sealevel_mm"] = df.sealevel_mm - df.sealevel_mm.iloc[0]'\\ | p.regress -m 'sealevel_mm ~ year - 1' --fit\\ | p.plot -x year -y sealevel_mm fit_ --style '.' 
'-'\\ --alpha .2 1 --legend best --title 'Force Zero Intercept' ----------------------------------------------------------------------- """ ) # read command line arguments parser = argparse.ArgumentParser( formatter_class=argparse.RawDescriptionHelpFormatter, description=msg) arg_lib.add_args(parser, 'io_in', 'io_out') # specify columns to histogram parser.add_argument("-m", "--model", type=str, nargs=1, required=True, help="The model expressed in patsy syntax") msg = "Return input with fit and residual appended" parser.add_argument("--fit", action="store_true", dest='retfit', default=False, help=msg) parser.add_argument("--plot", action="store_true", default=False, help="Make residual plots") # parse arguments args = parser.parse_args() # get the input dataframe df = io_lib.df_from_input(args) # fit the model and add fit, resid columns result = sm.ols(formula=args.model[0], data=df).fit() df['fit_'] = result.fittedvalues df['resid_'] = result.resid # add and output the fit results if requested if args.retfit: io_lib.df_to_output(args, df) return # print the fit summary sys.stdout.write('\n{}\n'.format(result.summary())) sys.stdout.flush() # do plots if requested if args.plot: module_checker_lib.check_for_modules(['matplotlib', 'seaborn']) plot_lib = get_module('pandashells.lib.plot_lib') mpl = get_module('matplotlib') pl = get_module('pylab') sns = get_module('seaborn') pl.subplot(211) pl.plot(df.fit_, df.resid_, '.', alpha=.5) pl.xlabel('Fit') pl.ylabel('Residual') pl.title(args.model[0]) pl.subplot(212) sns.distplot(df.resid_, bins=50) pl.xlabel('Residual with R^2 = {:0.4f}'.format(result.rsquared)) pl.ylabel('Counts') # annoying issue with osx backend forces if statement here if mpl.get_backend().lower() in ['agg', 'macosx']: pl.gcf().set_tight_layout(True) else: pl.gcf().tight_layout() plot_lib.show(args)
def main():
    """Entry point for the p.regress command-line tool."""
    # Description text rendered verbatim by argparse.
    msg = textwrap.dedent("""
        Performs (multivariable) linear regression.  The fitting model is
        specified using the R-like, patsy syntax.  Input is from stdin
        and output is either fitting information or the input data with
        columns added for the fit and residuals.

        -----------------------------------------------------------------------
        Examples:

            * Fit a line to the sea-level data
                p.example_data -d sealevel | p.regress -m 'sealevel_mm ~ year'

            * Fit a trend plus annual cycle to sealevel data
                p.example_data -d sealevel \\
                | p.df 'df["sin"] = np.sin(2 * np.pi * df.year)' \\
                | p.df 'df["cos"] = np.cos(2 * np.pi * df.year)' \\
                | p.regress -m 'sealevel_mm ~ year + cos + sin'

            * Examine residual ECDF of trend plus annual fit
                p.example_data -d sealevel \\
                | p.df 'df["sin"] = np.sin(2 * np.pi * df.year)' \\
                | p.df 'df["cos"] = np.cos(2 * np.pi * df.year)' \\
                | p.regress -m 'sealevel_mm ~ year + cos + sin' --fit \\
                | p.cdf -c 'resid_' --title 'ECDF of trend + annual'

            * Detrend sealevel data to more clearly reveal oscillations
                p.example_data -d sealevel \\
                | p.regress -m 'sealevel_mm ~ year' --fit \\
                | p.plot -x year -y resid_ --ylabel 'Trend removed (mm)' \\
                         --title 'Global Sea Surface Height'

            * Set origin of sealevel data to 0 and regress with no intercept
                p.example_data -d sealevel\\
                | p.df 'df["year"] = df.year - df.year.iloc[0]'\\
                       'df["sealevel_mm"] = df.sealevel_mm - df.sealevel_mm.iloc[0]'\\
                | p.regress -m 'sealevel_mm ~ year - 1' --fit\\
                | p.plot -x year -y sealevel_mm fit_ --style '.' '-'\\
                         --alpha .2 1 --legend best --title 'Force Zero Intercept'
        -----------------------------------------------------------------------
        """)

    # Build the command-line interface.
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=msg)
    arg_lib.add_args(parser, 'io_in', 'io_out', 'example')

    parser.add_argument(
        "-m", "--model", type=str, nargs=1, required=True,
        help="The model expressed in patsy syntax")
    parser.add_argument(
        "--fit", action="store_true", dest='retfit', default=False,
        help="Return input with fit and residual appended")
    parser.add_argument(
        "--plot", action="store_true", default=False,
        help="Make residual plots")

    args = parser.parse_args()

    # Read the dataframe from stdin, fit the requested patsy model, and
    # append fitted-value / residual columns.
    df = io_lib.df_from_input(args)
    fit = sm.ols(formula=args.model[0], data=df).fit()
    df['fit_'] = fit.fittedvalues
    df['resid_'] = fit.resid

    # With --fit, emit the augmented dataframe instead of the summary.
    if args.retfit:
        io_lib.df_to_output(args, df)
        return

    # Otherwise print the statsmodels fit summary.
    sys.stdout.write('\n{}\n'.format(fit.summary()))
    sys.stdout.flush()

    if not args.plot:
        return

    # Top panel: residual vs fitted value.
    pl.subplot(211)
    pl.plot(df.fit_, df.resid_, '.', alpha=.5)
    pl.xlabel('Fit')
    pl.ylabel('Residual')
    pl.title(args.model[0])

    # Bottom panel: residual distribution, annotated with R^2.
    pl.subplot(212)
    sns.distplot(df.resid_, bins=50)
    pl.xlabel('Residual with R^2 = {:0.4f}'.format(fit.rsquared))
    pl.ylabel('Counts')

    # annoying issue with osx backend forces if statement here
    backend = mpl.get_backend().lower()
    if backend in ('agg', 'macosx'):
        pl.gcf().set_tight_layout(True)
    else:
        pl.gcf().tight_layout()

    plot_lib.show(args)
def main():
    """Entry point for the p.facet_grid command-line tool."""
    # argparse description text (rendered verbatim in --help output)
    msg = textwrap.dedent(
        """
        Creates faceted plots using seaborn FacetGrid.

        With this tool, you can create a group of plots which show aspects
        of the same dataset broken down in different ways.

        See the seaborn FacetGrid documentation for more detail.

        The --map argument to this function specifies a function to use
        for generating each of the plots.  The following modules are
        available in the namespace:
            pl = pylab
            sns = seaborn

        -----------------------------------------------------------------------
        Examples:

            * Scatterplot of tips vs bill for different combinations of
              sex, smoker, and day of the week:
                    p.example_data -d tips | \\
                    p.facet_grid --row smoker --col sex --hue day \\
                    --map pl.scatter \\
                    --args total_bill tip --kwargs 'alpha=.2' 's=100'

            * Histogram of tips broken down by sex, smoker and day
                p.example_data -d tips | p.facet_grid --col day \\
                --row sex --hue smoker --sharex --sharey --aspect 1 \\
                --map pl.hist --args tip \\
                --kwargs 'alpha=.2' 'range=[0, 10]' 'bins=20'
        -----------------------------------------------------------------------
        """
    )

    # read command line arguments
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=msg)

    arg_lib.add_args(parser, "io_in")

    msg = "Different values of this variable in separate rows"
    parser.add_argument("--row", nargs=1, type=str, dest="row",
                        metavar="row", help=msg)

    msg = "Different values of this variable in separate columns"
    parser.add_argument("--col", nargs=1, type=str, dest="col",
                        metavar="col", help=msg)

    msg = "Different values of this variable in separate colors"
    parser.add_argument("--hue", nargs=1, type=str, dest="hue",
                        metavar="hue", help=msg)

    msg = "The aspect ratio of each plot"
    parser.add_argument("--aspect", nargs=1, type=float, dest="aspect",
                        metavar="aspect", default=[2], help=msg)

    msg = "The size of each plot (default=4)"
    parser.add_argument("--size", nargs=1, type=float, dest="size",
                        metavar="size", help=msg, default=[4])

    msg = "The plotting function to use for each facet"
    parser.add_argument("--map", nargs=1, type=str, dest="map",
                        metavar="map", required=True, help=msg)

    msg = "The args to pass to the plotting function"
    parser.add_argument("--args", nargs="+", type=str, dest="args",
                        metavar="args", required=True, help=msg)

    msg = "Plotting function kwargs expressed as 'a=1' 'b=2' ... "
    parser.add_argument("--kwargs", nargs="+", type=str, dest="kwargs",
                        metavar="kwargs", help=msg)

    msg = "Share x axis"
    parser.add_argument("--sharex", action="store_true", dest="sharex",
                        default=False, help=msg)

    msg = "Share y axis"
    parser.add_argument("--sharey", action="store_true", dest="sharey",
                        default=False, help=msg)

    msg = "x axis limits when sharex=True"
    parser.add_argument("--xlim", nargs=2, type=float, dest="xlim",
                        metavar="xlim", help=msg)

    msg = "y axis limits when sharex=True"
    parser.add_argument("--ylim", nargs=2, type=float, dest="ylim",
                        metavar="ylim", help=msg)

    msg = "Save the figure to this file"
    parser.add_argument("--savefig", nargs=1, type=str, help=msg)

    # parse arguments
    args = parser.parse_args()

    # get the input dataframe
    df = io_lib.df_from_input(args)

    # translate the nargs=1 lists / flags into FacetGrid constructor kwargs
    facet_grid_kwargs = {
        "row": args.row[0] if args.row else None,
        "col": args.col[0] if args.col else None,
        "hue": args.hue[0] if args.hue else None,
        "aspect": args.aspect[0],
        "size": args.size[0],
        "sharex": args.sharex,
        "sharey": args.sharey,
        "xlim": args.xlim if args.xlim else None,
        "ylim": args.ylim if args.ylim else None,
    }
    grid = sns.FacetGrid(df, **facet_grid_kwargs)

    # Resolve the plotting callable named by --map (e.g. 'pl.scatter').
    # NOTE: evaluating user-supplied text is deliberate in this tool (the
    # user controls the command line).  eval() replaces the original
    # exec("map_func = ...") + scope lookup: it returns the looked-up
    # object directly with identical results and none of the fragility of
    # exec-assignment.
    map_func_name = args.map[0]
    scope = {"pl": pl, "sns": sns, "map_func_name": map_func_name}
    map_func = eval(map_func_name, scope)

    map_args = args.args

    # Evaluate each --kwargs entry ('key=value') into a real kwarg.  eval
    # of "dict(key=value)" replaces the original exec that mutated the
    # local map_kwargs from inside the exec'd string -- same behavior,
    # but without relying on exec reaching back into function locals.
    map_kwargs = {}
    if args.kwargs:
        for kwarg in args.kwargs:
            map_kwargs.update(eval("dict({})".format(kwarg)))

    # draw one facet per (row, col, hue) combination, then show the figure
    grid.map(map_func, *map_args, **map_kwargs)
    grid.add_legend()
    plot_lib.show(args)