def main():
    """Command-line entry point that emits linearly spaced data points.

    Three positional values (start, end, npoints) are read from the command
    line, a one-column dataframe (column ``c0``) is built with
    ``np.linspace`` and the frame is handed to ``io_lib.df_to_output``.
    """
    description = textwrap.dedent(
        """
        Generate a linearly spaced set of data points.

        -----------------------------------------------------------------------
        Examples:

            * Generate 7 points between 1 and 10
                p.linspace 1 10 7

        -----------------------------------------------------------------------
        """
    )

    # read command line arguments
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=description)

    arg_lib.add_args(parser, 'io_out')

    parser.add_argument(
        "numbers", help='start end npoints', type=str, nargs=3, metavar='')

    # parse arguments
    args = parser.parse_args()
    start_txt, end_txt, count_txt = args.numbers

    # Report malformed values through argparse (usage message, exit status 2)
    # rather than letting a bare ValueError traceback reach the user.
    try:
        min_val, max_val = float(start_txt), float(end_txt)
        N = int(count_txt)
    except ValueError:
        parser.error(
            'start and end must be numbers and npoints must be an integer')

    df = pd.DataFrame({'c0': np.linspace(min_val, max_val, N)})

    # write dataframe to output
    io_lib.df_to_output(args, df)
def test_df_to_output_broken_stdout(self, sys_mock):
    # A small two-by-two frame is enough to trigger output.
    frame = pd.DataFrame(
        data=[[1, 2], [3, 4]], columns=['a', 'b'], index=[0, 1])
    table_args = MagicMock(output_options=['table'])

    # Every write to the patched stdout raises IOError.
    sys_mock.stdout.write = MagicMock(side_effect=IOError)

    io_lib.df_to_output(table_args, frame)

    # The write must have been attempted; reaching this line also shows the
    # IOError did not propagate out of df_to_output.
    self.assertTrue(sys_mock.stdout.write.called)
def main():
    """Command-line entry point that emits linearly spaced data points.

    Three positional values (start, end, npoints) are read from the command
    line, a one-column dataframe (column ``c0``) is built with
    ``np.linspace`` and the frame is handed to ``io_lib.df_to_output``.
    """
    description = textwrap.dedent("""
        Generate a linearly spaced set of data points.

        -----------------------------------------------------------------------
        Examples:

            * Generate 7 points between 1 and 10
                p.linspace 1 10 7

        -----------------------------------------------------------------------
        """)

    # read command line arguments
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=description)

    arg_lib.add_args(parser, 'io_out', 'example')

    parser.add_argument(
        "numbers", help='start end npoints', type=str, nargs=3, metavar='')

    # parse arguments
    args = parser.parse_args()
    start_txt, end_txt, count_txt = args.numbers

    # Report malformed values through argparse (usage message, exit status 2)
    # rather than letting a bare ValueError traceback reach the user.
    try:
        min_val, max_val = float(start_txt), float(end_txt)
        N = int(count_txt)
    except ValueError:
        parser.error(
            'start and end must be numbers and npoints must be an integer')

    df = pd.DataFrame({'c0': np.linspace(min_val, max_val, N)})

    # write dataframe to output
    io_lib.df_to_output(args, df)
def main():
    """Histogram dataframe columns, as a plot or as numeric output.

    Arguments come from ``get_input_args`` and the data from
    ``io_lib.df_from_input``.  With ``args.quiet`` set, bin centers and
    counts for the first selected column are written through
    ``io_lib.df_to_output``; otherwise the selected columns are plotted.
    """
    args = get_input_args()
    df = io_lib.df_from_input(args)

    # extract parameters from arg parser
    nbins = args.nbins[0]
    range_tup = args.range
    layout_tup = args.layout
    alpha = args.alpha[0]
    do_density = args.density
    sharex = args.sharex
    sharey = args.sharey
    # fall back to the first column when none were requested
    cols = args.cols if args.cols else [df.columns[0]]

    validate_args(args, cols, df)
    plot_lib.set_plot_styling(args)

    # no plotting if output requested
    if args.quiet:
        # only the first selected column is histogrammed here
        counts, edges = np.histogram(
            df[cols[0]], bins=nbins, range=range_tup, density=do_density)
        # bin centers: left edges shifted by half of each bin width
        centers = edges[:-1] + 0.5 * np.diff(edges)
        df_out = pd.DataFrame({'bins': centers, 'counts': counts})
        io_lib.df_to_output(args, df_out)

    # otherwise do plotting
    else:
        # ``density`` replaces the ``normed`` keyword, which matplotlib
        # deprecated in 2.1 and removed in 3.1; extra keywords given to
        # DataFrame.hist are forwarded to matplotlib's hist.
        df.hist(cols, bins=nbins, range=range_tup,
                alpha=alpha, sharex=sharex, sharey=sharey, layout=layout_tup,
                density=do_density)

        plot_lib.refine_plot(args)
        plot_lib.show(args)
def main():
    """Command-line entry point for the Lomb-Scargle spectrum tool.

    Parses the options, reads the input dataframe, passes it to
    ``lomb_scargle_lib.lomb_scargle`` and writes the returned frame out.
    """
    description = textwrap.dedent(
        """
        Computes a spectrogram using the lomb-scargle algorithm provided by the
        gatspy module. The input time series need not have evenly spaced
        time-stamps. The FFT-based algorithm has complexity O[N*log(N)].

        -----------------------------------------------------------------------
        Examples:

            * Plot the spectrum of a simple sine wave
                  p.linspace 0 10 100 \\
                  | p.df 'df["value"] = 7 * np.sin(2*np.pi*df.time / 1.5)'\\
                        --names time\\
                  | p.lomb_scargle -t time -y value --interp_exp 3\\
                  | p.plot -x period -y amp --xlim 0 3

            * Show the annual and 59-day peaks in the sealevel spectrum
                p.example_data -d sealevel\\
                | p.df 'df["day"] = 365.25 * df.year'\\
                        'df["day"] = df.day - df.day.iloc[0]'\\
                | p.lomb_scargle -t day -y sealevel_mm --interp_exp 3\\
                | p.df 'df[df.period < 720]'\\
                | p.plot -x period -y amp --xlim 1 400\\
                         --title 'Sea-surface height spectrum'\\
                         --xlabel 'period (days)'

        -----------------------------------------------------------------------
        """
    )

    # build the command line interface
    cli = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    arg_lib.add_args(cli, 'io_in', 'io_out')

    cli.add_argument(
        '-t', '--time_col', nargs=1, type=str, required=True,
        help='Time Column')
    cli.add_argument(
        '-y', '--observation_col', nargs=1, type=str, required=True,
        dest='val_col', help='Observation column')
    cli.add_argument(
        '--interp_exp', nargs=1, type=int, default=[1],
        help='Interpolate by this power of 2')
    cli.add_argument(
        '--freq_order', dest='freq_order', action='store_true', default=False,
        help='Order output by freqency instead of period')

    args = cli.parse_args()

    # read the input, then unwrap the single-element option lists
    frame = io_lib.df_from_input(args)
    time_col = args.time_col[0]
    value_col = args.val_col[0]
    interp_exp = args.interp_exp[0]

    spectrum = lomb_scargle_lib.lomb_scargle(
        frame, time_col, value_col, interp_exp, args.freq_order)

    # write dataframe to output
    io_lib.df_to_output(args, spectrum)
def main():
    """Read a dataframe, run ``smooth`` on the chosen columns, write it out."""
    args = get_input_args()
    frame = io_lib.df_from_input(args)

    # the first column is used when args.y names no columns
    y_cols = args.y or [frame.columns[0]]
    x_col = args.x[0] if args.x else None

    # the x column, when given, is validated together with the y columns
    if x_col:
        cols_to_check = y_cols + [x_col]
    else:
        cols_to_check = y_cols
    validate_args(args, cols_to_check, frame)

    io_lib.df_to_output(args, smooth(frame, y_cols, x_col))
def main():
    """Command-line entry point for the sigma-edit outlier tool.

    Parses the options, reads the input dataframe, passes it through
    ``outlier_lib.sigma_edit_dataframe`` and writes the result.
    """
    description = textwrap.dedent("""
        Remove outliers from DataFrame columns using a recursive sigma-edit
        algorithm. The algorithm will recursively NaN out values greater than
        sigma_thresh standard deviations away from sample mean.

        -----------------------------------------------------------------------
        Examples:

            * Do a 2.5-sigma edit on a gamma distribution and show histogram
                p.rand -n 1000 -t gamma --alpha=3 --beta=.01\\
                | p.df 'df["c1"] = df.c0'\\
                | p.sig_edit -c c1 -t 2.5\\
                | p.df 'pd.melt(df)' --names raw edited\\
                | p.facet_grid --hue variable --map pl.hist\\
                   --args value --kwargs 'alpha=.2' 'range=[0, 1000]' 'bins=50'

        -----------------------------------------------------------------------
        """)

    # build the command line interface
    cli = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    arg_lib.add_args(cli, 'io_in', 'io_out', 'example')

    cli.add_argument(
        "-t", "--sigma_thresh", nargs=1, type=float, required=True,
        help="Sigma threshold")
    cli.add_argument(
        "-c", "--cols", nargs="+", required=True,
        help="Column(s) to sigma-edit")
    cli.add_argument(
        "--max_iter", nargs=1, type=int, default=[20],
        help="Max number of recursions")

    args = cli.parse_args()

    # read the input, then unwrap the single-element option lists
    frame = io_lib.df_from_input(args)
    threshold = args.sigma_thresh[0]
    iteration_cap = args.max_iter[0]

    edited = outlier_lib.sigma_edit_dataframe(
        threshold, args.cols, frame, max_iter=iteration_cap)

    # write dataframe to output
    io_lib.df_to_output(args, edited)
def main():
    """Command-line entry point for the sigma-edit outlier tool.

    Parses the options, reads the input dataframe, passes it through
    ``outlier_lib.sigma_edit_dataframe`` and writes the result.
    """
    description = textwrap.dedent(
        """
        Remove outliers from DataFrame columns using a recursive sigma-edit
        algorithm. The algorithm will recursively NaN out values greater than
        sigma_thresh standard deviations away from sample mean.

        -----------------------------------------------------------------------
        Examples:

            * Do a 2.5-sigma edit on a gamma distribution and show histogram
                p.rand -n 1000 -t gamma --alpha=3 --beta=.01\\
                | p.df 'df["c1"] = df.c0'\\
                | p.sig_edit -c c1 -t 2.5\\
                | p.df 'pd.melt(df)' --names raw edited\\
                | p.facet_grid --hue variable --map pl.hist\\
                   --args value --kwargs 'alpha=.2' 'range=[0, 1000]' 'bins=50'

        -----------------------------------------------------------------------
        """
    )

    # build the command line interface
    cli = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    arg_lib.add_args(cli, 'io_in', 'io_out', 'example')

    cli.add_argument(
        "-t", "--sigma_thresh", nargs=1, type=float, required=True,
        help="Sigma threshold")
    cli.add_argument(
        "-c", "--cols", nargs="+", required=True,
        help="Column(s) to sigma-edit")
    cli.add_argument(
        "--max_iter", nargs=1, type=int, default=[20],
        help="Max number of recursions")

    args = cli.parse_args()

    # read the input, then unwrap the single-element option lists
    frame = io_lib.df_from_input(args)
    threshold = args.sigma_thresh[0]
    iteration_cap = args.max_iter[0]

    edited = outlier_lib.sigma_edit_dataframe(
        threshold, args.cols, frame, max_iter=iteration_cap)

    # write dataframe to output
    io_lib.df_to_output(args, edited)
def main():
    """Histogram dataframe columns, as a plot or as numeric output.

    Arguments come from ``get_input_args`` and the data from
    ``io_lib.df_from_input``.  With ``args.quiet`` set, bin centers and
    counts for the first selected column are written through
    ``io_lib.df_to_output``; otherwise the plotting library is loaded on
    demand and the selected columns are plotted.
    """
    args = get_input_args()
    df = io_lib.df_from_input(args)

    # extract parameters from arg parser
    nbins = args.nbins[0]
    range_tup = args.range
    layout_tup = args.layout
    alpha = args.alpha[0]
    do_density = args.density
    sharex = args.sharex
    sharey = args.sharey
    # fall back to the first column when none were requested
    cols = args.cols if args.cols else [df.columns[0]]

    validate_args(args, cols, df)

    # no plotting if output requested
    if args.quiet:
        # only the first selected column is histogrammed here
        counts, edges = np.histogram(
            df[cols[0]], bins=nbins, range=range_tup, density=do_density)
        # bin centers: left edges shifted by half of each bin width
        centers = edges[:-1] + 0.5 * np.diff(edges)
        df_out = pd.DataFrame({'bins': centers, 'counts': counts})
        io_lib.df_to_output(args, df_out)

    # otherwise do plotting
    else:
        # plotting modules are only checked for and loaded on this branch
        module_checker_lib.check_for_modules(['matplotlib'])
        plot_lib = get_imports('pandashells.lib.plot_lib')

        plot_lib.set_plot_styling(args)

        # ``density`` replaces the ``normed`` keyword, which matplotlib
        # deprecated in 2.1 and removed in 3.1; extra keywords given to
        # DataFrame.hist are forwarded to matplotlib's hist.
        df.hist(cols, bins=nbins, range=range_tup,
                alpha=alpha, sharex=sharex, sharey=sharey, layout=layout_tup,
                density=do_density)

        plot_lib.refine_plot(args)
        plot_lib.show(args)
def main():
    """Command-line entry point for the random-sample generator.

    Builds the argument parser, fills in the distribution-dependent default
    for ``--mu`` via ``fill_default_mu``, draws the samples with
    ``get_samples`` and writes the resulting dataframe out.
    """
    msg = textwrap.dedent("""
        Return random samples from common probability distrubtions.

        -----------------------------------------------------------------------
        Examples:

            uniform: p.rand -n 1000 -t uniform --min=0 --max=1 | p.hist
            normal: p.rand -n 1000 -t normal --mu=0 --sigma=1 | p.hist
            poisson: p.rand -n 1000 -t poisson --mu=1 | p.hist
            beta: p.rand -n 1000 -t beta --alpha=2 --beta=6 | p.hist
            gamma: p.rand -n 1000 -t gamma --alpha=1 --beta=1 | p.hist
            binomial: p.rand -n 1000 -t binomial --N=10 --p=0.4 | p.hist

        -----------------------------------------------------------------------
        """)

    # read command line arguments
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter, description=msg)

    parser.add_argument(
        '-t', '--type', nargs=1, type=str, default=['uniform'],
        choices=['uniform', 'normal', 'beta', 'gamma', 'binomial', 'poisson'],
        help='type of distribution (default=\'uniform\')')
    parser.add_argument(
        '-n', '--num_samples', nargs=1, default=[10], type=int,
        help='The number of rows to generate (default=10)')
    parser.add_argument(
        '-c', '--columns', nargs=1, default=[1], type=int,
        help='The number of columns to generate per row (default=1)')
    parser.add_argument(
        '--N', nargs=1, default=[10], type=int,
        help=('(Binomial Dist) Largest possible value for random variable. '
              '(default=10)'))
    # The first fragment needs its trailing space: adjacent literals are
    # joined verbatim, which previously rendered "trial(default=.5)".
    parser.add_argument(
        '--p', nargs=1, default=[.5], type=float,
        help=('(Binomial Dist) Bernoulli probability for each trial '
              '(default=.5)'))
    # No argparse default here; fill_default_mu() supplies it after parsing.
    parser.add_argument(
        '--mu', nargs=1, type=float,
        help='(Normal, Poisson) Mean (defaults: normal:0, poisson:1)')
    parser.add_argument(
        '--sigma', nargs=1, default=[1.], type=float,
        help='(Normal) standard deviation, (default: 1)')
    parser.add_argument(
        '--min', nargs=1, default=[0.], type=float,
        help='(Uniform) Minimum value of range, (default: 0)')
    parser.add_argument(
        '--max', nargs=1, default=[1.], type=float,
        help='(Uniform) Maximum value of range, (default: 1)')
    parser.add_argument(
        '--alpha', nargs=1, default=[2.], type=float,
        help='(Beta, Gamma) (default: 2)')
    parser.add_argument(
        '--beta', nargs=1, default=[2.], type=float,
        help='(Beta, Gamma) (default: 2)')

    arg_lib.add_args(parser, 'io_out')

    # parse arguments
    args = parser.parse_args()

    # set some defaults
    args = fill_default_mu(args)

    # get the samples
    df = get_samples(args)

    # write dataframe to output
    io_lib.df_to_output(args, df)
def main():  # pragma: no cover
    """Command-line entry point for p.df.

    Parses the statements and options, loads the input dataframe, applies
    each statement in order through ``process_command`` and writes the final
    dataframe out.
    """
    usage_text = textwrap.dedent(
        """
        Enables pandas dataframe processing at the unix command line.

        This is the real workhorse of the pandashells toolkit. It reads data
        from stdin as a dataframe, which is passed through any number of pandas
        operations provided on the command line. Output is always to stdout.

        Each operation assumes data is in a dataframe named df. Operations
        performed on this dataframe will overwrite the df variable with
        the results of that operation. Special consideration is taken for
        assignments such as df['a'] = df.b + df.c. These are understood
        to agument the input dataframe with a new column. By way of example,
        this command:
            p.df 'df.groupby(by="a").b.count()' 'df.reset_index()'
        is equivalent to the python expressions:
            df = df.groupby(by="a").b.count()
            df = df.reset_index()

        In addition to providing access to pandas dataframes, a number of
        modules are loaded into the namespace so as to be accessible from the
        command line. These modules are:
            pd = pandas
            np = numpy
            scp = scipy
            pl = pylab
            parse = dateutil.parser.parse
            datetime = datetime
            re = re

        ** Important **
        When creating chains of dataframe operations (see examples), it is
        important to express your chain of operations before any options. This
        is because some options can take multiple arguments and the parser
        won't be able to properly decode your meaning.
        For example:
            cat file.csv | p.df 'df["x"] = df.y + 1' -o table noheader # GOOD
            cat file.csv | p.df -o table noheader 'df["x"] = df.y + 1' # BAD

        Input can be read in different formats as specified by the -i switch.
        The most common formats are csv and table (white-space-delimited). In
        either of these formats, p.df can accomodate input data that either
        does or doesn not have a header row. When no header row is indicated,
        The columns of the Dataframe will be labeled as c0, c1, ..., cN.

        Plotting methods invoked on a Dataframe generate no output, but
        create an interactive plot instead. There are a number of plot
        specific options available at the command line that govern the details
        of how these plots are rendered (e.g. --xlim, --legend, etc).

        -----------------------------------------------------------------------
        Examples:

            * Print a csv file in nice tabular format
                p.example_data -d tips | p.df -o table | head

            * Print a csv file to json
                p.example_data -d tips | head | p.df -o json

            * Transform csv to json then to table
                p.example_data -d tips | head | p.df -o json \\
                | p.df -i json -o table

            * Select by row
                p.example_data -d tips \\
                | p.df 'df[df.sex=="Female"]' 'df[df.smoker=="Yes"]' -o table

            * Extract columns
                p.example_data -d tips \\
                | p.df 'df[["total_bill", "tip"]].head()' -o table

            * Perform grouped aggregations
                p.example_data -d tips | p.df \\
                'df.groupby(by=["sex", "smoker"]).tip.sum()' -o table index

            * Use pandas plotting methods
                p.example_data -d tips | p.df \\
                'df.groupby(by="day").total_bill.sum().plot(kind="barh")'\\
                --xlabel 'Dollars' --title 'Total Bills by Day'

            * Convert between tabular and csv format with/without header rows
                seq 10 | awk '{print $1, 2*$1}'\\
                | p.df --names a b -i table noheader | p.df -o table noheader

        -----------------------------------------------------------------------
        """
    )

    from pandashells.lib import arg_lib

    # build the command line interface
    cli = argparse.ArgumentParser(
        description=usage_text,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    arg_lib.add_args(cli, 'io_in', 'io_out', 'decorating')

    statement_help = (
        '(MUST come before any options) '
        '[statement ...] Statement(s) to execute. '
    )
    cli.add_argument("statement", nargs="*", help=statement_help)

    args = cli.parse_args()

    # io_lib is imported only after this call, as in the original ordering
    get_modules_and_shortcuts(args.statement)
    from pandashells.lib import io_lib

    # get the input dataframe
    frame = io_lib.df_from_input(args)

    # execute the statements in order
    # plot commands are terminal statements so will call sys.exit()
    for statement in args.statement:
        frame = process_command(args, statement, frame)

    # write the output
    io_lib.df_to_output(args, frame)
def main():  # pragma: no cover
    """Command-line entry point for p.df.

    Parses the statements and options, loads the input dataframe, applies
    each statement in order through ``process_command`` and writes the final
    dataframe out.
    """
    usage_text = textwrap.dedent(
        """
        Enables pandas dataframe processing at the unix command line.

        This is the real workhorse of the pandashells toolkit. It reads data
        from stdin as a dataframe, which is passed through any number of pandas
        operations provided on the command line. Output is always to stdout.

        Each operation assumes data is in a dataframe named df. Operations
        performed on this dataframe will overwrite the df variable with
        the results of that operation. Special consideration is taken for
        assignments such as df['a'] = df.b + df.c. These are understood
        to agument the input dataframe with a new column. By way of example,
        this command:
            p.df 'df.groupby(by="a").b.count()' 'df.reset_index()'
        is equivalent to the python expressions:
            df = df.groupby(by="a").b.count()
            df = df.reset_index()

        In addition to providing access to pandas dataframes, a number of
        modules are loaded into the namespace so as to be accessible from the
        command line. These modules are:
            pd = pandas
            np = numpy
            scp = scipy
            pl = pylab
            parse = dateutil.parser.parse
            datetime = datetime
            re = re

        ** Important **
        When creating chains of dataframe operations (see examples), it is
        important to express your chain of operations before any options. This
        is because some options can take multiple arguments and the parser
        won't be able to properly decode your meaning.
        For example:
            cat file.csv | p.df 'df["x"] = df.y + 1' -o table noheader # GOOD
            cat file.csv | p.df -o table noheader 'df["x"] = df.y + 1' # BAD

        Input can be read in different formats as specified by the -i switch.
        The most common formats are csv and table (white-space-delimited). In
        either of these formats, p.df can accomodate input data that either
        does or doesn not have a header row. When no header row is indicated,
        The columns of the Dataframe will be labeled as c0, c1, ..., cN.

        Plotting methods invoked on a Dataframe generate no output, but
        create an interactive plot instead. There are a number of plot
        specific options available at the command line that govern the details
        of how these plots are rendered (e.g. --xlim, --legend, etc).

        -----------------------------------------------------------------------
        Examples:

            * Print a csv file in nice tabular format
                p.example_data -d tips | p.df -o table | head

            * Select by row
                p.example_data -d tips \\
                | p.df 'df[df.sex=="Female"]' 'df[df.smoker=="Yes"]' -o table

            * Extract columns
                p.example_data -d tips \\
                | p.df 'df[["total_bill", "tip"]].head()' -o table

            * Perform grouped aggregations
                p.example_data -d tips | p.df \\
                'df.groupby(by=["sex", "smoker"]).tip.sum()' -o table index

            * Use pandas plotting methods
                p.example_data -d tips | p.df \\
                'df.groupby(by="day").total_bill.sum().plot(kind="barh")'\\
                --xlabel 'Dollars' --title 'Total Bills by Day'

            * Convert between tabular and csv format with/without header rows
                seq 10 | awk '{print $1, 2*$1}'\\
                | p.df --names a b -i table noheader | p.df -o table noheader

        -----------------------------------------------------------------------
        """
    )

    # build the command line interface
    cli = argparse.ArgumentParser(
        description=usage_text,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    arg_lib.add_args(cli, 'io_in', 'io_out', 'decorating', 'example')

    statement_help = (
        '(MUST come before any options) '
        '[statement ...] Statement(s) to execute. '
    )
    cli.add_argument("statement", nargs="*", help=statement_help)

    args = cli.parse_args()

    # get the input dataframe
    frame = io_lib.df_from_input(args)

    # execute the statements in order
    # plot commands are terminal statements so will call sys.exit()
    for statement in args.statement:
        frame = process_command(args, statement, frame)

    # write the output
    io_lib.df_to_output(args, frame)
def test_df_to_output_bad_type(self, csv_writer_mock):
    # 'bad' is the output option under test; the assertion below expects the
    # patched csv writer to be the one that gets called.
    frame = pd.DataFrame(
        data=[[1, 2], [3, 4]], columns=['a', 'b'], index=[0, 1])
    bad_type_args = MagicMock(output_options=['bad'])

    io_lib.df_to_output(bad_type_args, frame)

    csv_writer_mock.assert_called_with(frame, True, False, 'nan')
def main():
    """Command-line entry point for the linear regression tool.

    Fits the model given by ``--model`` to the input dataframe with
    ``sm.ols``.  With ``--fit`` the input plus ``fit_`` and ``resid_``
    columns is written out and the function returns; otherwise the fit
    summary is printed to stdout and, with ``--plot``, residual plots are
    drawn.
    """
    description = textwrap.dedent(
        """
        Performs (multivariable) linear regression. The fitting model
        is specified using the R-like, patsy syntax. Input is from stdin
        and output is either fitting information or the input data
        with columns added for the fit and residuals.

        -----------------------------------------------------------------------
        Examples:
            * Fit a line to the sea-level data
                p.example_data -d sealevel | p.regress -m 'sealevel_mm ~ year'

            * Fit a trend plus annual cycle to sealevel data
                p.example_data -d sealevel \\
                | p.df 'df["sin"] = np.sin(2 * np.pi * df.year)' \\
                | p.df 'df["cos"] = np.cos(2 * np.pi * df.year)' \\
                | p.regress -m 'sealevel_mm ~ year + cos + sin'

            * Examine residual ECDF of trend plus annual fit
                p.example_data -d sealevel \\
                | p.df 'df["sin"] = np.sin(2 * np.pi * df.year)' \\
                | p.df 'df["cos"] = np.cos(2 * np.pi * df.year)' \\
                | p.regress -m 'sealevel_mm ~ year + cos + sin' --fit \\
                | p.cdf -c 'resid_' --title 'ECDF of trend + annual'

            * Detrend sealevel data to more clearly reveal oscillations
                p.example_data -d sealevel \\
                | p.regress -m 'sealevel_mm ~ year' --fit \\
                | p.plot -x year -y resid_ --ylabel 'Trend removed (mm)' \\
                         --title 'Global Sea Surface Height'

            * Set origin of sealevel data to 0 and regress with no intercept
                p.example_data -d sealevel\\
                | p.df 'df["year"] = df.year - df.year.iloc[0]'\\
                'df["sealevel_mm"] = df.sealevel_mm - df.sealevel_mm.iloc[0]'\\
                | p.regress -m 'sealevel_mm ~ year - 1' --fit\\
                | p.plot -x year -y sealevel_mm fit_ --style '.' '-'\\
                     --alpha .2 1 --legend best --title 'Force Zero Intercept'

        -----------------------------------------------------------------------
        """
    )

    # build the command line interface
    cli = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    arg_lib.add_args(cli, 'io_in', 'io_out')

    # the model to fit
    cli.add_argument(
        "-m", "--model", nargs=1, type=str, required=True,
        help="The model expressed in patsy syntax")

    cli.add_argument(
        "--fit", dest='retfit', action="store_true", default=False,
        help="Return input with fit and residual appended")

    cli.add_argument(
        "--plot", action="store_true", default=False,
        help="Make residual plots")

    args = cli.parse_args()
    formula = args.model[0]

    # get the input dataframe
    frame = io_lib.df_from_input(args)

    # fit the model and add fit, resid columns
    fit_result = sm.ols(formula=formula, data=frame).fit()
    frame['fit_'] = fit_result.fittedvalues
    frame['resid_'] = fit_result.resid

    # when the fitted frame is requested, emit it and stop: no summary,
    # no plots
    if args.retfit:
        io_lib.df_to_output(args, frame)
        return

    # print the fit summary
    sys.stdout.write('\n{}\n'.format(fit_result.summary()))
    sys.stdout.flush()

    # nothing more to do unless plots were requested
    if not args.plot:
        return

    module_checker_lib.check_for_modules(['matplotlib', 'seaborn'])
    plot_lib = get_module('pandashells.lib.plot_lib')
    mpl = get_module('matplotlib')
    pl = get_module('pylab')
    sns = get_module('seaborn')

    # upper panel: residual against fitted value
    pl.subplot(211)
    pl.plot(frame.fit_, frame.resid_, '.', alpha=.5)
    pl.xlabel('Fit')
    pl.ylabel('Residual')
    pl.title(formula)

    # lower panel: distribution of the residuals
    pl.subplot(212)
    sns.distplot(frame.resid_, bins=50)
    pl.xlabel('Residual with R^2 = {:0.4f}'.format(fit_result.rsquared))
    pl.ylabel('Counts')

    # annoying issue with osx backend forces if statement here
    figure = pl.gcf()
    if mpl.get_backend().lower() in ['agg', 'macosx']:
        figure.set_tight_layout(True)
    else:
        figure.tight_layout()

    plot_lib.show(args)
def main():
    """Command-line entry point for the dataset merge tool.

    Parses the join options, loads the two named files as dataframes,
    joins them with ``pd.merge`` and writes the joined frame out.
    """
    description = textwrap.dedent("""
        Tool to merge datasets. Similar functionality to database
        joins. The arguments closely parallel those of the pandas merge
        command. See the pandas merge documentation for more details.

        -----------------------------------------------------------------------
        Examples:

            * Merge election polls with electoral-college numbers
                p.merge <(p.example_data -d election) \\
                        <(p.example_data -d electoral_college) \\
                        --how left --on state \\
                | p.df -o table | head

        -----------------------------------------------------------------------
        """)

    # build the command line interface
    cli = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    arg_lib.add_args(cli, 'io_in', 'io_out', 'example')

    cli.add_argument(
        '--how', dest='how', nargs=1, default=['inner'],
        choices=['left', 'right', 'inner', 'outer'],
        help="Type of join. Default='inner'")

    cli.add_argument(
        '--on', dest='on', nargs='+', metavar='col', type=str,
        help='List of of columns on which to join')

    cli.add_argument(
        '--left_on', dest='left_on', nargs='+', metavar='col', type=str,
        help='List of of columns from left file to join on. ')

    cli.add_argument(
        '--right_on', dest='right_on', nargs='+', metavar='col', type=str,
        help='List of of columns from right file to join on. ')

    cli.add_argument(
        '--suffixes', dest='suffixes', nargs=2, metavar='_x _y', type=str,
        default=['_x', '_y'],
        help='List of suffixes appended to identically ' + 'named columns')

    cli.add_argument(
        "file", nargs=2, type=str, metavar='file', help="Files to join")

    args = cli.parse_args()
    validate_args(args)

    # file names, left first
    left_name, right_name = tuple(args.file)

    # load the dataframes
    left_frame = io_lib.df_from_input(args, left_name)
    right_frame = io_lib.df_from_input(args, right_name)

    # join options; missing or empty key lists are passed on as None
    merge_options = {
        'how': args.how[0],
        'on': args.on or None,
        'left_on': args.left_on or None,
        'right_on': args.right_on or None,
        'sort': True,
        'suffixes': args.suffixes,
    }
    joined = pd.merge(left_frame, right_frame, **merge_options)

    # output the joined frame
    io_lib.df_to_output(args, joined)
def main():
    """Command-line entry point for the dataset merge tool.

    Parses the join options, loads the two named files as dataframes,
    joins them with ``pd.merge`` and writes the joined frame out.
    """
    description = textwrap.dedent(
        """
        Tool to merge datasets. Similar functionality to database
        joins. The arguments closely parallel those of the pandas merge
        command. See the pandas merge documentation for more details.

        -----------------------------------------------------------------------
        Examples:

            * Merge election polls with electoral-college numbers
                p.merge <(p.example_data -d election) \\
                        <(p.example_data -d electoral_college) \\
                        --how left --on state \\
                | p.df -o table | head

        -----------------------------------------------------------------------
        """
    )

    # build the command line interface
    cli = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    arg_lib.add_args(cli, 'io_in', 'io_out', 'example')

    cli.add_argument(
        '--how', dest='how', nargs=1, default=['inner'],
        choices=['left', 'right', 'inner', 'outer'],
        help="Type of join. Default='inner'")

    cli.add_argument(
        '--on', dest='on', nargs='+', metavar='col', type=str,
        help='List of of columns on which to join')

    cli.add_argument(
        '--left_on', dest='left_on', nargs='+', metavar='col', type=str,
        help='List of of columns from left file to join on. ')

    cli.add_argument(
        '--right_on', dest='right_on', nargs='+', metavar='col', type=str,
        help='List of of columns from right file to join on. ')

    cli.add_argument(
        '--suffixes', dest='suffixes', nargs=2, metavar='_x _y', type=str,
        default=['_x', '_y'],
        help='List of suffixes appended to identically ' + 'named columns')

    cli.add_argument(
        "file", nargs=2, type=str, metavar='file', help="Files to join")

    args = cli.parse_args()
    validate_args(args)

    # file names, left first
    left_name, right_name = tuple(args.file)

    # load the dataframes
    left_frame = io_lib.df_from_input(args, left_name)
    right_frame = io_lib.df_from_input(args, right_name)

    # join options; missing or empty key lists are passed on as None
    merge_options = {
        'how': args.how[0],
        'on': args.on or None,
        'left_on': args.left_on or None,
        'right_on': args.right_on or None,
        'sort': True,
        'suffixes': args.suffixes,
    }
    joined = pd.merge(left_frame, right_frame, **merge_options)

    # output the joined frame
    io_lib.df_to_output(args, joined)
def main():
    """Command-line entry point for the empirical CDF tool.

    Evaluates ``ECDF`` of one input column on an evenly spaced grid between
    the column's minimum and maximum.  With ``--quiet`` the grid and the
    probabilities are written out as a dataframe; otherwise both
    ``P(col < x)`` and ``P(col > x)`` are plotted.
    """
    description = textwrap.dedent(
        """
        Plots the emperical cumulative distribution function (ECDF).

        -----------------------------------------------------------------------
        Examples:

            * Plot ECDF for 10k samples from the standard normal distribution.
                p.rand -t normal -n 10000 | p.cdf -c c0

            * Instead of plotting, send ECDF values to stdout
                p.rand -t normal -n 10000 | p.cdf -c c0 -q | head

        -----------------------------------------------------------------------
        """
    )

    # build the command line interface
    cli = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # column whose distribution is wanted
    cli.add_argument(
        "-c", "--col", nargs=1, required=True,
        help="Column to plot distribution")

    cli.add_argument(
        '-n', '--n_points', nargs=1, type=int,
        help='Number of output points (default is twice input len)')

    cli.add_argument(
        '-q', '--quiet', action='store_true', default=False,
        help='Quiet mean no plots. Send numeric output to stdout instead')

    arg_lib.add_args(cli, 'decorating', 'io_in', 'io_out',)
    args = cli.parse_args()

    # get the input dataframe and extract column
    col_name = args.col[0]
    frame = io_lib.df_from_input(args)
    x = frame[col_name].values

    # output grid: twice the input length unless a point count was given
    if args.n_points is None:
        n_out = 2 * len(x)
    else:
        n_out = args.n_points[0]
    x_out = np.linspace(min(x), max(x), n_out)
    y_out = ECDF(x)(x_out)

    # send values to stdout if quiet specified
    if args.quiet:
        column_order = ['x', 'p_less', 'p_greater']
        df_out = pd.DataFrame(
            {'x': x_out, 'p_less': y_out, 'p_greater': 1 - y_out}
        )[column_order]
        io_lib.df_to_output(args, df_out)
        return

    # set the appropriate theme and make plot
    plot_lib.set_plot_styling(args)

    less_label = 'P({} < x)'.format(col_name)
    greater_label = 'P({} > x)'.format(col_name)
    pl.plot(x_out, y_out, label=less_label)
    pl.plot(x_out, 1. - y_out, label=greater_label)
    pl.xlabel('x')
    pl.legend(loc='best')

    plot_lib.refine_plot(args)
    plot_lib.show(args)
def main():
    """Command-line entry point for the Lomb-Scargle spectrum tool.

    Parses the options, reads the input dataframe, passes it to
    ``lomb_scargle_lib.lomb_scargle`` and writes the returned frame out.
    """
    description = textwrap.dedent("""
        Computes a spectrogram using the lomb-scargle algorithm provided by the
        gatspy module. The input time series need not have evenly spaced
        time-stamps. The FFT-based algorithm has complexity O[N*log(N)].

        -----------------------------------------------------------------------
        Examples:

            * Plot the spectrum of a simple sine wave
                  p.linspace 0 10 100 \\
                  | p.df 'df["value"] = 7 * np.sin(2*np.pi*df.time / 1.5)'\\
                        --names time\\
                  | p.lomb_scargle -t time -y value --interp_exp 3\\
                  | p.plot -x period -y amp --xlim 0 3

            * Show the annual and 59-day peaks in the sealevel spectrum
                p.example_data -d sealevel\\
                | p.df 'df["day"] = 365.25 * df.year'\\
                        'df["day"] = df.day - df.day.iloc[0]'\\
                | p.lomb_scargle -t day -y sealevel_mm --interp_exp 3\\
                | p.df 'df[df.period < 720]'\\
                | p.plot -x period -y amp --xlim 1 400\\
                         --title 'Sea-surface height spectrum'\\
                         --xlabel 'period (days)'

        -----------------------------------------------------------------------
        """)

    # build the command line interface
    cli = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    arg_lib.add_args(cli, 'io_in', 'io_out')

    cli.add_argument(
        '-t', '--time_col', nargs=1, type=str, required=True,
        help='Time Column')
    cli.add_argument(
        '-y', '--observation_col', nargs=1, type=str, required=True,
        dest='val_col', help='Observation column')
    cli.add_argument(
        '--interp_exp', nargs=1, type=int, default=[1],
        help='Interpolate by this power of 2')
    cli.add_argument(
        '--freq_order', dest='freq_order', action='store_true', default=False,
        help='Order output by freqency instead of period')

    args = cli.parse_args()

    # read the input, then unwrap the single-element option lists
    frame = io_lib.df_from_input(args)
    time_col = args.time_col[0]
    value_col = args.val_col[0]
    interp_exp = args.interp_exp[0]

    spectrum = lomb_scargle_lib.lomb_scargle(
        frame, time_col, value_col, interp_exp, args.freq_order)

    # write dataframe to output
    io_lib.df_to_output(args, spectrum)
def main():
    """Command-line entry point for the random-sample generator (``p.rand``).

    Builds the argument parser, parses the command line, lets
    ``fill_default_mu`` fill in the ``--mu`` value when it was not given,
    obtains the samples from ``get_samples`` and writes the resulting
    dataframe with ``io_lib.df_to_output``.

    Every option is declared with ``nargs=1``, so each parsed value is a
    one-element list; the defaults are written as lists for the same reason.
    """
    msg = textwrap.dedent(
        """
        Return random samples from common probability distributions.

        -----------------------------------------------------------------------
        Examples:

            uniform:  p.rand -n 1000 -t uniform  --min=0   --max=1   | p.hist
            normal:   p.rand -n 1000 -t normal   --mu=0    --sigma=1 | p.hist
            poisson:  p.rand -n 1000 -t poisson  --mu=1              | p.hist
            beta:     p.rand -n 1000 -t beta     --alpha=2 --beta=6  | p.hist
            gamma:    p.rand -n 1000 -t gamma    --alpha=1 --beta=1  | p.hist
            binomial: p.rand -n 1000 -t binomial --N=10    --p=0.4   | p.hist

        -----------------------------------------------------------------------
        """
    )

    # read command line arguments
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter, description=msg)

    parser.add_argument(
        '-t', '--type', nargs=1, type=str, default=['uniform'],
        choices=['uniform', 'normal', 'beta', 'gamma', 'binomial', 'poisson'],
        help='type of distribution (default=\'uniform\')')
    parser.add_argument(
        '-n', '--num_samples', nargs=1, default=[10], type=int,
        help='The number of rows to generate (default=10)')
    parser.add_argument(
        '-c', '--columns', nargs=1, default=[1], type=int,
        help='The number of columns to generate per row (default=1)')
    parser.add_argument(
        '--N', nargs=1, default=[10], type=int,
        help=(
            '(Binomial Dist) Largest possible value for random variable. '
            '(default=10)'
        )
    )
    parser.add_argument(
        '--p', nargs=1, default=[.5], type=float,
        help=(
            '(Binomial Dist) Bernoulli probability for each trial '
            '(default=.5)'
        )
    )
    # no argparse default here: the value depends on the distribution and is
    # filled in after parsing by fill_default_mu
    parser.add_argument(
        '--mu', nargs=1, type=float,
        help='(Normal, Poisson) Mean (defaults: normal:0, poisson:1)')
    parser.add_argument(
        '--sigma', nargs=1, default=[1.], type=float,
        help='(Normal) standard deviation, (default: 1)')
    parser.add_argument(
        '--min', nargs=1, default=[0.], type=float,
        help='(Uniform) Minimum value of range, (default: 0)')
    parser.add_argument(
        '--max', nargs=1, default=[1.], type=float,
        help='(Uniform) Maximum value of range, (default: 1)')
    parser.add_argument(
        '--alpha', nargs=1, default=[2.], type=float,
        help='(Beta, Gamma) (default: 2)')
    parser.add_argument(
        '--beta', nargs=1, default=[2.], type=float,
        help='(Beta, Gamma) (default: 2)')

    arg_lib.add_args(parser, 'io_out', 'example')

    # parse arguments
    args = parser.parse_args()

    # set some defaults
    args = fill_default_mu(args)

    # get the samples
    df = get_samples(args)

    # write dataframe to output
    io_lib.df_to_output(args, df)
def main():
    """Command-line entry point for the linear-regression tool.

    Reads a dataframe via ``io_lib.df_from_input``, fits the patsy formula
    given with ``--model`` using ``sm.ols`` and appends ``fit_`` and
    ``resid_`` columns to the frame.  Then:

    * with ``--fit`` the augmented frame is written through
      ``io_lib.df_to_output`` and nothing else happens;
    * otherwise the fit summary is written to stdout and, when ``--plot``
      is given, residual diagnostics are drawn and shown.
    """
    description = textwrap.dedent("""
        Performs (multivariable) linear regression.  The fitting model
        is specified using the R-like, patsy syntax.  Input is from stdin
        and output is either fitting information or the input data
        with columns added for the fit and residuals.

        -----------------------------------------------------------------------
        Examples:
            * Fit a line to the sea-level data
                p.example_data -d sealevel | p.regress -m 'sealevel_mm ~ year'

            * Fit a trend plus annual cycle to sealevel data
                p.example_data -d sealevel \\
                | p.df 'df["sin"] = np.sin(2 * np.pi * df.year)' \\
                | p.df 'df["cos"] = np.cos(2 * np.pi * df.year)' \\
                | p.regress -m 'sealevel_mm ~ year + cos + sin'

            * Examine residual ECDF of trend plus annual fit
                p.example_data -d sealevel \\
                | p.df 'df["sin"] = np.sin(2 * np.pi * df.year)' \\
                | p.df 'df["cos"] = np.cos(2 * np.pi * df.year)' \\
                | p.regress -m 'sealevel_mm ~ year + cos + sin' --fit \\
                | p.cdf -c 'resid_' --title 'ECDF of trend + annual'

            * Detrend sealevel data to more clearly reveal oscillations
                p.example_data -d sealevel \\
                | p.regress -m 'sealevel_mm ~ year' --fit \\
                | p.plot -x year -y resid_ --ylabel 'Trend removed (mm)' \\
                         --title 'Global Sea Surface Height'

            * Set origin of sealevel data to 0 and regress with no intercept
                p.example_data -d sealevel\\
                | p.df 'df["year"] = df.year - df.year.iloc[0]'\\
                'df["sealevel_mm"] = df.sealevel_mm - df.sealevel_mm.iloc[0]'\\
                | p.regress -m 'sealevel_mm ~ year - 1' --fit\\
                | p.plot -x year -y sealevel_mm fit_ --style '.' '-'\\
                     --alpha .2 1 --legend best --title 'Force Zero Intercept'

        -----------------------------------------------------------------------
        """)

    # build the command line interface
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=description,
    )
    arg_lib.add_args(parser, 'io_in', 'io_out', 'example')

    # the regression model itself
    parser.add_argument(
        "-m", "--model", type=str, nargs=1, required=True,
        help="The model expressed in patsy syntax")

    # switches selecting what gets produced
    parser.add_argument(
        "--fit", action="store_true", dest='retfit', default=False,
        help="Return input with fit and residual appended")
    parser.add_argument(
        "--plot", action="store_true", default=False,
        help="Make residual plots")

    args = parser.parse_args()
    formula = args.model[0]

    # load the data, run the regression, attach fit and residual columns
    df = io_lib.df_from_input(args)
    fitted = sm.ols(formula=formula, data=df).fit()
    df['fit_'] = fitted.fittedvalues
    df['resid_'] = fitted.resid

    # --fit: emit the augmented frame and stop
    if args.retfit:
        io_lib.df_to_output(args, df)
        return

    # otherwise report the fit summary on stdout
    sys.stdout.write('\n{}\n'.format(fitted.summary()))
    sys.stdout.flush()

    # nothing more to do unless plots were requested
    if not args.plot:
        return

    # upper panel: residuals against fitted values
    pl.subplot(211)
    pl.plot(df.fit_, df.resid_, '.', alpha=.5)
    pl.xlabel('Fit')
    pl.ylabel('Residual')
    pl.title(formula)

    # lower panel: distribution of the residuals
    pl.subplot(212)
    sns.distplot(df.resid_, bins=50)
    pl.xlabel('Residual with R^2 = {:0.4f}'.format(fitted.rsquared))
    pl.ylabel('Counts')

    # the agg and macosx backends get set_tight_layout(True) instead of a
    # direct tight_layout() call
    figure = pl.gcf()
    backend = mpl.get_backend().lower()
    if backend not in ['agg', 'macosx']:
        figure.tight_layout()
    else:
        figure.set_tight_layout(True)

    plot_lib.show(args)