def main():
    """
    Entry point for the histogram tool.

    Parses the command line, loads the input dataframe through io_lib and
    then does one of two things:

    * when ``args.quiet`` is set, computes a histogram of the first selected
      column with numpy and writes a dataframe with columns ``bins`` (the
      bin centers) and ``counts`` to the output;
    * otherwise draws a histogram for every selected column with
      ``DataFrame.hist`` and hands the figure to plot_lib.

    When no columns are given, the first column of the dataframe is used.
    """
    args = get_input_args()
    df = io_lib.df_from_input(args)

    # extract parameters from arg parser
    nbins = args.nbins[0]
    range_tup = args.range
    layout_tup = args.layout
    alpha = args.alpha[0]
    do_density = args.density
    sharex = args.sharex
    sharey = args.sharey
    cols = args.cols if args.cols else [df.columns[0]]

    validate_args(args, cols, df)

    # no plotting if output requested
    if args.quiet:
        counts, edges = np.histogram(
            df[cols[0]], bins=nbins, range=range_tup, density=do_density)
        # shift each left edge by half a bin width to get the bin center
        centers = edges[:-1] + 0.5 * np.diff(edges)
        df_out = pd.DataFrame({'bins': centers, 'counts': counts})
        io_lib.df_to_output(args, df_out)

    # otherwise do plotting
    else:
        module_checker_lib.check_for_modules(['matplotlib'])
        plot_lib = get_imports('pandashells.lib.plot_lib')
        plot_lib.set_plot_styling(args)
        # `density` replaces the `normed` keyword, which matplotlib
        # deprecated and later removed from hist()
        df.hist(cols, bins=nbins, range=range_tup,
                alpha=alpha, sharex=sharex, sharey=sharey, layout=layout_tup,
                density=do_density)

        plot_lib.refine_plot(args)
        plot_lib.show(args)
def get_modules_and_shortcuts(command_list):
    """
    Build the list of (module_name, shortcut) pairs needed by the commands.

    pandas and dateutil are always included.  Each optional module is
    included when its shortcut followed by a dot (e.g. ``np.``) appears
    anywhere in the space-joined command text.  pylab is additionally
    included when ``needs_plots(command_list)`` is truthy.

    :type command_list: list of str
    :param command_list: The commands that will be executed

    :rtype: list of tuple
    :returns: (module_name, shortcut) pairs, base requirements first, with
              no duplicates.  Every listed module has been passed through
              module_checker_lib.check_for_modules before returning.
    """
    names_shortcuts = [
        ('datetime', 'datetime'),
        ('numpy', 'np'),
        ('scipy', 'scp'),
        ('pylab', 'pl'),
        ('seaborn', 'sns'),
    ]
    base_requirements = [
        ('pandas', 'pd'),
        ('dateutil', 'dateutil'),
    ]

    # catch_warnings restores the caller's filters on exit, even when the
    # module check raises; resetwarnings() would wipe them all instead
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore')

        # join once instead of once per candidate shortcut
        command_text = ' '.join(command_list)
        out = base_requirements + [
            tup for tup in names_shortcuts
            if '{}.'.format(tup[1]) in command_text
        ]

        # append pylab only when missing so the result stays duplicate-free
        # and keeps a deterministic order
        pylab_entry = ('pylab', 'pl')
        if needs_plots(command_list) and pylab_entry not in out:
            out.append(pylab_entry)

        # make sure required modules are installed
        module_checker_lib.check_for_modules([m for (m, s) in out])

    return out
def main():
    """
    Entry point for the histogram tool.

    Parses the command line, loads the input dataframe through io_lib and
    then does one of two things:

    * when ``args.quiet`` is set, computes a histogram of the first selected
      column with numpy and writes a dataframe with columns ``bins`` (the
      bin centers) and ``counts`` to the output;
    * otherwise draws a histogram for every selected column with
      ``DataFrame.hist`` and hands the figure to plot_lib.

    When no columns are given, the first column of the dataframe is used.
    """
    args = get_input_args()
    df = io_lib.df_from_input(args)

    # extract parameters from arg parser
    nbins = args.nbins[0]
    range_tup = args.range
    layout_tup = args.layout
    alpha = args.alpha[0]
    do_density = args.density
    sharex = args.sharex
    sharey = args.sharey
    cols = args.cols if args.cols else [df.columns[0]]

    validate_args(args, cols, df)

    # no plotting if output requested
    if args.quiet:
        counts, edges = np.histogram(df[cols[0]],
                                     bins=nbins,
                                     range=range_tup,
                                     density=do_density)
        # shift each left edge by half a bin width to get the bin center
        centers = edges[:-1] + 0.5 * np.diff(edges)
        df_out = pd.DataFrame({'bins': centers, 'counts': counts})
        io_lib.df_to_output(args, df_out)

    # otherwise do plotting
    else:
        module_checker_lib.check_for_modules(['matplotlib'])
        plot_lib = get_imports('pandashells.lib.plot_lib')
        plot_lib.set_plot_styling(args)
        # `density` replaces the `normed` keyword, which matplotlib
        # deprecated and later removed from hist()
        df.hist(cols,
                bins=nbins,
                range=range_tup,
                alpha=alpha,
                sharex=sharex,
                sharey=sharey,
                layout=layout_tup,
                density=do_density)

        plot_lib.refine_plot(args)
        plot_lib.show(args)
#! /usr/bin/env python # standard library imports import argparse import textwrap import sys # noqa from pandashells.lib import arg_lib, module_checker_lib module_checker_lib.check_for_modules(['pandas', 'gatspy']) from pandashells.lib import io_lib, lomb_scargle_lib def main(): msg = textwrap.dedent(""" Computes a spectrogram using the lomb-scargle algorithm provided by the gatspy module. The input time series need not have evenly spaced time-stamps. The FFT-based algorithm has complexity O[N*log(N)]. ----------------------------------------------------------------------- Examples: * Plot the spectrum of a simple sine wave p.linspace 0 10 100 \\ | p.df 'df["value"] = 7 * np.sin(2*np.pi*df.time / 1.5)'\\ --names time\\ | p.lomb_scargle -t time -y value --interp_exp 3\\ | p.plot -x period -y amp --xlim 0 3 * Show the annual and 59-day peaks in the sealevel spectrum p.example_data -d sealevel\\
#! /usr/bin/env python # standard library imports import sys import argparse import textwrap from pandashells.lib import module_checker_lib, arg_lib, io_lib, plot_lib # import required dependencies module_checker_lib.check_for_modules(['numpy', 'pandas', 'matplotlib']) import numpy as np import pandas as pd def get_input_args(): msg = textwrap.dedent(""" Plot histograms from input data. Can either plot just a single histogram or a grid of histograms with different columns of data. When multiple columns are specified, creates a grid of histograms, one for each specified column. ----------------------------------------------------------------------- Examples: * Plot histogram of a beta distriubtion p.rand -t beta --alpha 3 --beta 10 -n 10000\\ | p.hist --names beta -n 50 * Plot a sid-by-side comparison of a gamma and normal distriubtion
#! /usr/bin/env python # standard library imports import argparse import textwrap import sys # NOQA need this for mock testig from pandashells.lib import module_checker_lib, arg_lib, io_lib, plot_lib # import required dependencies module_checker_lib.check_for_modules([ 'pandas', 'numpy', 'matplotlib', 'statsmodels' ]) import pandas as pd import numpy as np import pylab as pl from statsmodels.distributions.empirical_distribution import ECDF def main(): msg = textwrap.dedent( """ Plots the emperical cumulative distribution function (ECDF). ----------------------------------------------------------------------- Examples:
#! /usr/bin/env python import sys import re from pandashells.lib import module_checker_lib module_checker_lib.check_for_modules( ['matplotlib', 'dateutil', 'mpld3', 'seaborn']) from dateutil.parser import parse import matplotlib as mpl import pylab as pl import seaborn as sns import mpld3 def show(args): # if figure saving requested if hasattr(args, 'savefig') and args.savefig: # save html if requested rex_html = re.compile('.*?\.html$') if rex_html.match(args.savefig[0]): fig = pl.gcf() html = mpld3.fig_to_html(fig) with open(args.savefig[0], 'w') as outfile: outfile.write(html) return # save image types pl.savefig(args.savefig[0]) # otherwise show to screen
def test_check_for_modules_unrecognized(self):
    """
    An unrecognized module name makes check_for_modules() raise ValueError
    """
    unknown_modules = ['not_a_module']
    self.assertRaises(ValueError, check_for_modules, unknown_modules)
def test_check_for_modules_bad(self):
    """
    Missing modules make check_for_modules() raise ImportError
    """
    missing_modules = ['fakemodule1', 'fakemodule2']
    self.assertRaises(ImportError, check_for_modules, missing_modules)
#! /usr/bin/env python # standard library imports import sys import argparse import textwrap from pandashells.lib import module_checker_lib, arg_lib, io_lib, plot_lib # import required dependencies module_checker_lib.check_for_modules(['numpy', 'pandas', 'matplotlib']) import numpy as np import pandas as pd def get_input_args(): msg = textwrap.dedent( """ Plot histograms from input data. Can either plot just a single histogram or a grid of histograms with different columns of data. When multiple columns are specified, creates a grid of histograms, one for each specified column. ----------------------------------------------------------------------- Examples: * Plot histogram of a beta distriubtion p.rand -t beta --alpha 3 --beta 10 -n 10000\\ | p.hist --names beta -n 50
#! /usr/bin/env python # standard library imports import sys import argparse import importlib import textwrap from pandashells.lib import module_checker_lib, arg_lib module_checker_lib.check_for_modules(['pandas', 'supersmoother']) from supersmoother import SuperSmoother from pandashells.lib import io_lib import numpy as np # this silly function makes mock testing easier def get_imports(name): # pragma no cover return importlib.import_module(name) def get_input_args(): msg = textwrap.dedent( """ Smooths data in specified column. Uses algorithm[1] from the supersmoother python package for smoothing with cross validation to determine best span. [1] Friedman, J. H. (1984) A variable span scatterplot smoother. Laboratory for Computational Statistics, Stanford University
def test_check_for_modules_no_modules(self, import_module_mock):
    """
    With an empty module list, check_for_modules() must never invoke
    the injected import_module_mock
    """
    nothing_to_check = []
    check_for_modules(nothing_to_check)

    was_called = import_module_mock.called
    self.assertFalse(was_called)
#! /usr/bin/env python # standard library imports import sys import argparse import importlib import textwrap from pandashells.lib import module_checker_lib, arg_lib module_checker_lib.check_for_modules(['pandas', 'supersmoother']) from supersmoother import SuperSmoother from pandashells.lib import io_lib import numpy as np # this silly function makes mock testing easier def get_imports(name): # pragma no cover return importlib.import_module(name) def get_input_args(): msg = textwrap.dedent(""" Smooths data in specified column. Uses algorithm[1] from the supersmoother python package for smoothing with cross validation to determine best span. [1] Friedman, J. H. (1984) A variable span scatterplot smoother. Laboratory for Computational Statistics, Stanford University Technical Report No. 5.
def lomb_scargle(df, time_col, val_col, interp_exponent=0, freq_order=False):
    """
    Compute a Lomb-Scargle spectrum of one column of a dataframe using
    gatspy's LombScargleFast model.

    Rows with a missing timestamp or value are dropped, the mean is removed
    from the values, and the series is sorted by time (and padded with
    whatever _compute_pad returns) before the model is fit.

    :type df: pandas.DataFrame
    :param df: An input dataframe

    :type time_col: str
    :param time_col: The column of the dataframe holding the timestamps

    :type val_col: str
    :param val_col: The column of the dataframe holding the observations

    :type interp_exponent: int
    :param interp_exponent: Interpolate the spectrum by this power of two

    :type freq_order: bool
    :param freq_order: If set to True spectrum is returned in frequency order
                       instead of period order (default=False)

    :rtype: Pandas DataFrame
    :returns: A dataframe with columns: period, freq, power, amp
    """
    # do imports here to avoid loading plot libraries when this
    # module is loaded in __init__.py
    # which then doesn't allow for doing matplotlib.use() later
    from pandashells.lib import module_checker_lib
    module_checker_lib.check_for_modules(['gatspy', 'pandas', 'numpy'])
    import gatspy

    # only care about timestamped values
    series = df[[time_col, val_col]].dropna()

    # standardize column names, remove mean from values, and sort by time
    # (sort_values replaces the removed sort_index(by=...) spelling)
    series = series.rename(
        columns={time_col: 't', val_col: 'y'}).sort_values(by=['t'])
    series['y'] = series['y'] - series.y.mean()

    #  compute total energy in the time series
    E_in = np.sum((series.y * series.y))

    # appropriately zero-pad the timeseries before taking spectrum
    pre_pad_length = len(series)
    t_pad, y_pad = _compute_pad(
        series.t.values, interp_exponent=interp_exponent)
    if len(t_pad) > 0:
        # pd.concat replaces DataFrame.append, which pandas has removed
        padding = pd.DataFrame({'t': t_pad, 'y': y_pad})
        series = pd.concat([series, padding], ignore_index=True)

    # fit the lomb scargle model to the time series
    model = gatspy.periodic.LombScargleFast()
    model.fit(series.t.values, series.y.values, 1)

    # compute params for getting results out of lomb scargle fit;
    # the frequency step gets its own name so it no longer shadows the data
    f0, delta_f, N = _compute_params(series.t.values)
    f = f0 + delta_f * np.arange(N)
    p = 1. / f

    # retrieve the lomb scargle fit and normalize for power / amplitude
    yf = model.score_frequency_grid(f0, delta_f, N)
    yf_power = 2 * yf * E_in * len(yf) / float(pre_pad_length)**2
    yf_amp = np.sqrt(yf_power)

    # generate the output dataframe
    spectrum = pd.DataFrame(
        {'freq': f, 'period': p, 'power': yf_power, 'amp': yf_amp}
    )[['period', 'freq', 'power', 'amp']]

    # order by period if desired
    if not freq_order:
        spectrum = spectrum.sort_values(by='period')
    return spectrum
def main():
    """
    Entry point for the regression tool.

    Fits the patsy-style model given with -m/--model to the input dataframe
    using statsmodels OLS.  With --fit the input is written back out with
    ``fit_`` and ``resid_`` columns appended and nothing else happens.
    Otherwise the fit summary is written to stdout and, with --plot,
    residual plots are produced.
    """
    description = textwrap.dedent("""
        Performs (multivariable) linear regression. The fitting model
        is specified using the R-like, patsy syntax. Input is from stdin
        and output is either fitting information or the input data
        with columns added for the fit and residuals.

        -----------------------------------------------------------------------
        Examples:
            * Fit a line to the sea-level data
                p.example_data -d sealevel | p.regress -m 'sealevel_mm ~ year'

            * Fit a trend plus annual cycle to sealevel data
                p.example_data -d sealevel \\
                | p.df 'df["sin"] = np.sin(2 * np.pi * df.year)' \\
                | p.df 'df["cos"] = np.cos(2 * np.pi * df.year)' \\
                | p.regress -m 'sealevel_mm ~ year + cos + sin'

            * Examine residual ECDF of trend plus annual fit
                p.example_data -d sealevel \\
                | p.df 'df["sin"] = np.sin(2 * np.pi * df.year)' \\
                | p.df 'df["cos"] = np.cos(2 * np.pi * df.year)' \\
                | p.regress -m 'sealevel_mm ~ year + cos + sin' --fit \\
                | p.cdf -c 'resid_' --title 'ECDF of trend + annual'

            * Detrend sealevel data to more clearly reveal oscillations
                p.example_data -d sealevel \\
                | p.regress -m 'sealevel_mm ~ year' --fit \\
                | p.plot -x year -y resid_ --ylabel 'Trend removed (mm)' \\
                --title 'Global Sea Surface Height'

            * Set origin of sealevel data to 0 and regress with no intercept
                p.example_data -d sealevel\\
                | p.df 'df["year"] = df.year - df.year.iloc[0]'\\
                'df["sealevel_mm"] = df.sealevel_mm - df.sealevel_mm.iloc[0]'\\
                | p.regress -m 'sealevel_mm ~ year - 1' --fit\\
                | p.plot -x year -y sealevel_mm fit_ --style '.' '-'\\
                --alpha .2 1 --legend best --title 'Force Zero Intercept'

        -----------------------------------------------------------------------
        """)

    # build the command line parser
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=description)
    arg_lib.add_args(parser, 'io_in', 'io_out')

    # the regression model itself
    parser.add_argument(
        "-m", "--model", type=str, nargs=1, required=True,
        help="The model expressed in patsy syntax")

    # output / plotting switches
    parser.add_argument(
        "--fit", action="store_true", dest='retfit', default=False,
        help="Return input with fit and residual appended")
    parser.add_argument(
        "--plot", action="store_true", default=False,
        help="Make residual plots")

    args = parser.parse_args()

    # load the data and run the regression
    df = io_lib.df_from_input(args)
    result = sm.ols(formula=args.model[0], data=df).fit()
    df['fit_'] = result.fittedvalues
    df['resid_'] = result.resid

    # when the fitted frame was requested, emit it and stop
    if args.retfit:
        io_lib.df_to_output(args, df)
        return

    # otherwise report the fit summary
    sys.stdout.write('\n{}\n'.format(result.summary()))
    sys.stdout.flush()

    # nothing more to do unless plots were requested
    if not args.plot:
        return

    module_checker_lib.check_for_modules(['matplotlib', 'seaborn'])
    plot_lib = get_module('pandashells.lib.plot_lib')
    mpl = get_module('matplotlib')
    pl = get_module('pylab')
    sns = get_module('seaborn')

    # top panel: residual versus fitted value
    pl.subplot(211)
    pl.plot(df.fit_, df.resid_, '.', alpha=.5)
    pl.xlabel('Fit')
    pl.ylabel('Residual')
    pl.title(args.model[0])

    # bottom panel: distribution of the residuals
    pl.subplot(212)
    sns.distplot(df.resid_, bins=50)
    pl.xlabel('Residual with R^2 = {:0.4f}'.format(result.rsquared))
    pl.ylabel('Counts')

    # annoying issue with osx backend forces the two code paths here
    use_layout_flag = mpl.get_backend().lower() in ['agg', 'macosx']
    figure = pl.gcf()
    if use_layout_flag:
        figure.set_tight_layout(True)
    else:
        figure.tight_layout()

    plot_lib.show(args)
#! /usr/bin/env python # standard library imports import sys import argparse import textwrap import importlib from pandashells.lib import module_checker_lib, arg_lib # import required dependencies module_checker_lib.check_for_modules(['pandas', 'statsmodels', 'scipy']) from pandashells.lib import io_lib import scipy as scp # NOQA import statsmodels.formula.api as sm # this silly function helps use side_effect in mocking tests def get_module(name): # pragma nocover return importlib.import_module(name) def main(): msg = textwrap.dedent(""" Performs (multivariable) linear regression. The fitting model is specified using the R-like, patsy syntax. Input is from stdin and output is either fitting information or the input data with columns added for the fit and residuals. -----------------------------------------------------------------------
def test_check_for_modules_bad(self):
    """
    Missing modules make check_for_modules() raise SystemExit
    """
    missing_modules = ['fakemodule1', 'fakemodule2']
    self.assertRaises(SystemExit, check_for_modules, missing_modules)
def test_check_for_modules_existing_module(self):
    """
    check_for_modules() completes without raising for the 'os' module
    """
    present_modules = ['os']
    check_for_modules(present_modules)
#! /usr/bin/env python # standard library imports import sys # NOQA import sys to allow for mocking sys.argv in tests import argparse import textwrap from pandashells.lib import module_checker_lib, arg_lib module_checker_lib.check_for_modules(['pandas']) from pandashells.lib import io_lib import numpy as np import pandas as pd def main(): msg = "Generate a linearly spaced set of data points." msg = textwrap.dedent( """ Generate a linearly spaced set of data points. ----------------------------------------------------------------------- Examples: * Generate 7 points between 1 and 10 p.linspace 1 10 7 ----------------------------------------------------------------------- """
#! /usr/bin/env python # standard library imports import sys import argparse import textwrap import importlib from pandashells.lib import module_checker_lib, arg_lib # import required dependencies module_checker_lib.check_for_modules(['pandas', 'statsmodels', 'scipy']) from pandashells.lib import io_lib import scipy as scp # NOQA import statsmodels.formula.api as sm # this silly function helps use side_effect in mocking tests def get_module(name): # pragma nocover return importlib.import_module(name) def main(): msg = textwrap.dedent( """ Performs (multivariable) linear regression. The fitting model is specified using the R-like, patsy syntax. Input is from stdin and output is either fitting information or the input data with columns added for the fit and residuals.
def main():
    """
    Entry point for the regression tool.

    Fits the patsy-style model given with -m/--model to the input dataframe
    using statsmodels OLS.  With --fit the input is written back out with
    ``fit_`` and ``resid_`` columns appended and nothing else happens.
    Otherwise the fit summary is written to stdout and, with --plot,
    residual plots are produced.
    """
    description = textwrap.dedent(
        """
        Performs (multivariable) linear regression. The fitting model
        is specified using the R-like, patsy syntax. Input is from stdin
        and output is either fitting information or the input data
        with columns added for the fit and residuals.

        -----------------------------------------------------------------------
        Examples:
            * Fit a line to the sea-level data
                p.example_data -d sealevel | p.regress -m 'sealevel_mm ~ year'

            * Fit a trend plus annual cycle to sealevel data
                p.example_data -d sealevel \\
                | p.df 'df["sin"] = np.sin(2 * np.pi * df.year)' \\
                | p.df 'df["cos"] = np.cos(2 * np.pi * df.year)' \\
                | p.regress -m 'sealevel_mm ~ year + cos + sin'

            * Examine residual ECDF of trend plus annual fit
                p.example_data -d sealevel \\
                | p.df 'df["sin"] = np.sin(2 * np.pi * df.year)' \\
                | p.df 'df["cos"] = np.cos(2 * np.pi * df.year)' \\
                | p.regress -m 'sealevel_mm ~ year + cos + sin' --fit \\
                | p.cdf -c 'resid_' --title 'ECDF of trend + annual'

            * Detrend sealevel data to more clearly reveal oscillations
                p.example_data -d sealevel \\
                | p.regress -m 'sealevel_mm ~ year' --fit \\
                | p.plot -x year -y resid_ --ylabel 'Trend removed (mm)' \\
                --title 'Global Sea Surface Height'

            * Set origin of sealevel data to 0 and regress with no intercept
                p.example_data -d sealevel\\
                | p.df 'df["year"] = df.year - df.year.iloc[0]'\\
                'df["sealevel_mm"] = df.sealevel_mm - df.sealevel_mm.iloc[0]'\\
                | p.regress -m 'sealevel_mm ~ year - 1' --fit\\
                | p.plot -x year -y sealevel_mm fit_ --style '.' '-'\\
                --alpha .2 1 --legend best --title 'Force Zero Intercept'

        -----------------------------------------------------------------------
        """
    )

    # build the command line parser
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=description,
    )
    arg_lib.add_args(parser, 'io_in', 'io_out')

    # the regression model itself
    parser.add_argument(
        "-m", "--model", type=str, nargs=1, required=True,
        help="The model expressed in patsy syntax",
    )

    # output / plotting switches
    parser.add_argument(
        "--fit", action="store_true", dest='retfit', default=False,
        help="Return input with fit and residual appended",
    )
    parser.add_argument(
        "--plot", action="store_true", default=False,
        help="Make residual plots",
    )

    args = parser.parse_args()

    # load the data and run the regression
    df = io_lib.df_from_input(args)
    result = sm.ols(formula=args.model[0], data=df).fit()
    df['fit_'] = result.fittedvalues
    df['resid_'] = result.resid

    # when the fitted frame was requested, emit it and stop
    if args.retfit:
        io_lib.df_to_output(args, df)
        return

    # otherwise report the fit summary
    sys.stdout.write('\n{}\n'.format(result.summary()))
    sys.stdout.flush()

    # nothing more to do unless plots were requested
    if not args.plot:
        return

    module_checker_lib.check_for_modules(['matplotlib', 'seaborn'])
    plot_lib = get_module('pandashells.lib.plot_lib')
    mpl = get_module('matplotlib')
    pl = get_module('pylab')
    sns = get_module('seaborn')

    # top panel: residual versus fitted value
    pl.subplot(211)
    pl.plot(df.fit_, df.resid_, '.', alpha=.5)
    pl.xlabel('Fit')
    pl.ylabel('Residual')
    pl.title(args.model[0])

    # bottom panel: distribution of the residuals
    pl.subplot(212)
    sns.distplot(df.resid_, bins=50)
    pl.xlabel('Residual with R^2 = {:0.4f}'.format(result.rsquared))
    pl.ylabel('Counts')

    # annoying issue with osx backend forces the two code paths here
    use_layout_flag = mpl.get_backend().lower() in ['agg', 'macosx']
    figure = pl.gcf()
    if use_layout_flag:
        figure.set_tight_layout(True)
    else:
        figure.tight_layout()

    plot_lib.show(args)
#! /usr/bin/env python # standard library imports import argparse import textwrap import re import sys # NOQA just use this for patching in tests from pandashells.lib import module_checker_lib, arg_lib # import required dependencies module_checker_lib.check_for_modules(["pandas", "matplotlib", "seaborn"]) from pandashells.lib import plot_lib, io_lib import numpy as np import matplotlib as mpl import pylab as pl import seaborn as sns sns.set_context("talk") CC = mpl.rcParams["axes.color_cycle"] def make_label(coeffs, savefig): label_plain = "y = " for nn, coeff in enumerate(coeffs[::-1]): if nn > 0: label_plain += " + " if nn == 0: label_plain += "(%0.4g)" % coeff elif nn == 1:
#! /usr/bin/env python # standard library imports import argparse import textwrap import re import sys # NOQA just use this for patching in tests import warnings warnings.filterwarnings('ignore') from pandashells.lib import module_checker_lib, arg_lib # import required dependencies module_checker_lib.check_for_modules( ['pandas', 'matplotlib', 'seaborn']) from pandashells.lib import plot_lib, io_lib import numpy as np import matplotlib as mpl import pylab as pl import seaborn as sns sns.set_context('talk') CC = mpl.rcParams['axes.color_cycle'] def make_label(coeffs, savefig): label_plain = 'y = ' for nn, coeff in enumerate(coeffs[::-1]): if nn > 0: label_plain += ' + '
#! /usr/bin/env python import sys import re from pandashells.lib import module_checker_lib module_checker_lib.check_for_modules( ['matplotlib', 'dateutil', 'mpld3', 'seaborn']) from dateutil.parser import parse import matplotlib as mpl import pylab as pl import seaborn as sns import mpld3 def show(args): # if figure saving requested if hasattr(args, 'savefig') and args.savefig: # save html if requested rex_html = re.compile('.*?\.html$') if rex_html.match(args.savefig[0]): fig = pl.gcf() html = mpld3.fig_to_html(fig) with open(args.savefig[0], 'w') as outfile: outfile.write(html) return # save image types pl.savefig(args.savefig[0]) # otherwise show to screen else:
#! /usr/bin/env python # standard library imports import argparse import textwrap import sys # NOQA importing sys so I can mock sys.argv in tests from pandashells.lib import module_checker_lib, arg_lib module_checker_lib.check_for_modules(['pandas']) from pandashells.lib import io_lib import pandas as pd import numpy as np # want different default mu values for normal and poisson distributions def fill_default_mu(args): if args.type[0] == 'normal': args.mu = [0.] if args.mu is None else args.mu elif args.type[0] == 'poisson': args.mu = [1.] if args.mu is None else args.mu return args def get_samples(args): """ Return samples from selected distribution """ # dictionary to hold numpy arguments for different distributions
#! /usr/bin/env python # standard library imports import argparse import textwrap import sys # noqa from pandashells.lib import arg_lib, module_checker_lib module_checker_lib.check_for_modules(["pandas", "gatspy"]) from pandashells.lib import io_lib, lomb_scargle_lib def main(): msg = textwrap.dedent( """ Computes a spectrogram using the lomb-scargle algorithm provided by the gatspy module. The input time series need not have evenly spaced time-stamps. The FFT-based algorithm has complexity O[N*log(N)]. ----------------------------------------------------------------------- Examples: * Plot the spectrum of a simple sine wave p.linspace 0 10 100 \\ | p.df 'df["value"] = 7 * np.sin(2*np.pi*df.time / 1.5)'\\ --names time\\ | p.lomb_scargle -t time -y value --interp_exp 3\\ | p.plot -x period -y amp --xlim 0 3