コード例 #1
0
ファイル: p_linspace.py プロジェクト: alayassir/pandashells
def main():
    """Command-line entry point for generating linearly spaced points.

    Parses three positional values (start, end, npoints), builds a
    one-column DataFrame (column ``c0``) with ``np.linspace`` and passes it
    to ``io_lib.df_to_output`` along with the parsed arguments.

    Malformed or negative values are reported through ``parser.error``
    (usage message, exit status 2) rather than an uncaught ``ValueError``.
    """
    msg = textwrap.dedent(
        """
        Generate a linearly spaced set of data points.

        -----------------------------------------------------------------------
        Examples:

            * Generate 7 points between 1 and 10
                p.linspace 1 10 7

        -----------------------------------------------------------------------
        """
    )

    # read command line arguments
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter, description=msg)

    arg_lib.add_args(parser, 'io_out')

    msg = 'start end npoints'
    parser.add_argument("numbers", help=msg, type=str, nargs=3, metavar='')

    # parse arguments
    args = parser.parse_args()
    start_str, end_str, count_str = args.numbers
    try:
        min_val, max_val = float(start_str), float(end_str)
        npoints = int(count_str)
    except ValueError:
        # parser.error prints the usage line and exits, so nothing below runs
        parser.error(
            'start and end must be numbers and npoints must be an integer; '
            'got {!r}'.format(' '.join(args.numbers)))
    if npoints < 0:
        # np.linspace rejects a negative sample count; fail with a clear message
        parser.error('npoints must be non-negative; got {}'.format(npoints))

    df = pd.DataFrame({'c0': np.linspace(min_val, max_val, npoints)})

    # write dataframe to output
    io_lib.df_to_output(args, df)
コード例 #2
0
ファイル: io_lib_tests.py プロジェクト: iiSeymour/pandashells
    def test_df_to_output_broken_stdout(self, sys_mock):
        # Every write on the patched stdout raises IOError; df_to_output is
        # expected to attempt the write without letting the error escape.
        sys_mock.stdout.write = MagicMock(side_effect=IOError)
        frame = pd.DataFrame({'a': [1, 3], 'b': [2, 4]}, index=[0, 1])
        fake_args = MagicMock(output_options=['table'])

        io_lib.df_to_output(fake_args, frame)

        write_was_attempted = sys_mock.stdout.write.called
        self.assertTrue(write_was_attempted)
コード例 #3
0
def main():
    """Command-line entry point for generating linearly spaced points.

    Parses three positional values (start, end, npoints), builds a
    one-column DataFrame (column ``c0``) with ``np.linspace`` and passes it
    to ``io_lib.df_to_output`` along with the parsed arguments.

    Malformed or negative values are reported through ``parser.error``
    (usage message, exit status 2) rather than an uncaught ``ValueError``.
    """
    msg = textwrap.dedent("""
        Generate a linearly spaced set of data points.

        -----------------------------------------------------------------------
        Examples:

            * Generate 7 points between 1 and 10
                p.linspace 1 10 7

        -----------------------------------------------------------------------
        """)

    # read command line arguments
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter, description=msg)

    arg_lib.add_args(parser, 'io_out', 'example')

    msg = 'start end npoints'
    parser.add_argument("numbers", help=msg, type=str, nargs=3, metavar='')

    # parse arguments
    args = parser.parse_args()
    start_str, end_str, count_str = args.numbers
    try:
        min_val, max_val = float(start_str), float(end_str)
        npoints = int(count_str)
    except ValueError:
        # parser.error prints the usage line and exits, so nothing below runs
        parser.error(
            'start and end must be numbers and npoints must be an integer; '
            'got {!r}'.format(' '.join(args.numbers)))
    if npoints < 0:
        # np.linspace rejects a negative sample count; fail with a clear message
        parser.error('npoints must be non-negative; got {}'.format(npoints))

    df = pd.DataFrame({'c0': np.linspace(min_val, max_val, npoints)})

    # write dataframe to output
    io_lib.df_to_output(args, df)
コード例 #4
0
ファイル: p_hist.py プロジェクト: richwu/pandashells
def main():
    """Command-line entry point for the histogram tool.

    Reads the input DataFrame, then either writes histogram bin centers and
    counts (when ``args.quiet`` is set) or draws the histogram with
    ``DataFrame.hist``.  When no columns are requested the first column of
    the input is used.
    """
    args = get_input_args()
    df = io_lib.df_from_input(args)

    # extract parameters from arg parser
    nbins = args.nbins[0]
    range_tup = args.range
    layout_tup = args.layout
    alpha = args.alpha[0]
    do_density = args.density
    sharex = args.sharex
    sharey = args.sharey
    cols = args.cols if args.cols else [df.columns[0]]

    validate_args(args, cols, df)
    plot_lib.set_plot_styling(args)

    # no plotting if output requested
    if args.quiet:
        # only the first requested column is histogrammed in this mode
        counts, edges = np.histogram(
            df[cols[0]], bins=nbins, range=range_tup, density=do_density)
        centers = edges[:-1] + 0.5 * np.diff(edges)
        df_out = pd.DataFrame({'bins': centers, 'counts': counts})
        io_lib.df_to_output(args, df_out)

    # otherwise do plotting
    else:
        # `density` replaces matplotlib's removed `normed` keyword and
        # matches the keyword already used for np.histogram above
        df.hist(cols, bins=nbins, range=range_tup,
                alpha=alpha, sharex=sharex, sharey=sharey, layout=layout_tup,
                density=do_density)

        plot_lib.refine_plot(args)
        plot_lib.show(args)
コード例 #5
0
ファイル: p_lomb_scargle.py プロジェクト: richwu/pandashells
def main():
    """Command-line entry point for the Lomb-Scargle spectrum tool.

    Builds the argument parser, reads the input DataFrame through
    ``io_lib.df_from_input``, passes it with the time column, observation
    column, interpolation exponent and ordering flag to
    ``lomb_scargle_lib.lomb_scargle`` and writes the returned DataFrame with
    ``io_lib.df_to_output``.
    """
    msg = textwrap.dedent(
        """
        Computes a spectrogram using the lomb-scargle algorithm provided by
        the gatspy module.  The input time series need not have evenly spaced
        time-stamps.  The FFT-based algorithm has complexity O[N*log(N)].

        -----------------------------------------------------------------------
        Examples:

            * Plot the spectrum of a simple sine wave
                  p.linspace 0 10 100 \\
                  | p.df 'df["value"] = 7 * np.sin(2*np.pi*df.time / 1.5)'\\
                        --names time\\
                  | p.lomb_scargle -t time -y value --interp_exp 3\\
                  | p.plot -x period -y amp --xlim 0 3

            * Show the annual and 59-day peaks in the sealevel spectrum
                p.example_data -d sealevel\\
                | p.df 'df["day"] = 365.25 * df.year'\\
                        'df["day"] = df.day - df.day.iloc[0]'\\
                | p.lomb_scargle -t day -y sealevel_mm --interp_exp 3\\
                | p.df 'df[df.period < 720]'\\
                | p.plot -x period -y amp --xlim 1 400\\
                         --title 'Sea-surface height spectrum'\\
                         --xlabel 'period (days)'

        -----------------------------------------------------------------------
        """
    )

    # read command line arguments
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter, description=msg)

    arg_lib.add_args(parser, 'io_in', 'io_out')

    parser.add_argument('-t', '--time_col', help='Time Column',
                        nargs=1, required=True, type=str)

    # stored on the namespace as `val_col`, not `observation_col`
    parser.add_argument('-y', '--observation_col', help='Observation column',
                        nargs=1, dest='val_col', required=True, type=str)

    parser.add_argument('--interp_exp', help='Interpolate by this power of 2',
                        nargs=1, type=int, default=[1])
    parser.add_argument(
        '--freq_order', action='store_true', dest='freq_order', default=False,
        help='Order output by frequency instead of period')

    # parse arguments
    args = parser.parse_args()

    # get the input dataframe
    df = io_lib.df_from_input(args)
    # nargs=1 options arrive as one-element lists, hence the [0] indexing
    df = lomb_scargle_lib.lomb_scargle(
        df, args.time_col[0], args.val_col[0], args.interp_exp[0],
        args.freq_order)

    # write dataframe to output
    io_lib.df_to_output(args, df)
コード例 #6
0
    def test_df_to_output_broken_stdout(self, sys_mock):
        # Simulate a broken pipe: any write to the patched stdout raises.
        failing_write = MagicMock(side_effect=IOError)
        sys_mock.stdout.write = failing_write

        table_args = MagicMock(output_options=['table'])
        data = pd.DataFrame(
            [[1, 2], [3, 4]],
            index=[0, 1],
            columns=['a', 'b'],
        )

        # The call itself must not raise, yet the write must have been tried.
        io_lib.df_to_output(table_args, data)
        self.assertTrue(sys_mock.stdout.write.called)
コード例 #7
0
def main():
    """Command-line entry point: smooth the selected columns of the input.

    Reads the input DataFrame, validates the y columns (plus the x column
    when one is given), applies ``smooth`` and writes the result with
    ``io_lib.df_to_output``.
    """
    args = get_input_args()
    df = io_lib.df_from_input(args)

    # default to the first input column when no y columns were requested
    y_cols = args.y or [df.columns[0]]

    # the x column is optional
    x_name = None
    if args.x:
        x_name = args.x[0]

    # only include the x column in validation when it was supplied
    if x_name:
        required_cols = y_cols + [x_name]
    else:
        required_cols = y_cols
    validate_args(args, required_cols, df)

    smoothed = smooth(df, y_cols, x_name)
    io_lib.df_to_output(args, smoothed)
コード例 #8
0
ファイル: p_smooth.py プロジェクト: alayassir/pandashells
def main():
    """Command-line entry point: smooth the selected columns of the input.

    Reads the input DataFrame, validates the y columns (plus the x column
    when one is given), applies ``smooth`` and writes the result with
    ``io_lib.df_to_output``.
    """
    args = get_input_args()
    frame = io_lib.df_from_input(args)

    # optional x column; y columns default to the first column of the input
    x_name = args.x[0] if args.x else None
    y_names = args.y or [frame.columns[0]]

    # validate the x column as well, but only when one was given
    to_validate = y_names
    if x_name:
        to_validate = y_names + [x_name]
    validate_args(args, to_validate, frame)

    io_lib.df_to_output(args, smooth(frame, y_names, x_name))
コード例 #9
0
def main():
    """Command-line entry point for sigma-edit outlier removal.

    Builds the argument parser, reads the input DataFrame through
    ``io_lib.df_from_input``, runs ``outlier_lib.sigma_edit_dataframe`` on
    the requested columns and writes the result with
    ``io_lib.df_to_output``.
    """
    description = textwrap.dedent("""
        Remove outliers from DataFrame columns using a recursive sigma-edit
        algorithm.  The algorithm will recursively NaN out values greater than
        sigma_thresh standard deviations away from sample mean.

        -----------------------------------------------------------------------
        Examples:

            * Do a 2.5-sigma edit on a gamma distribution and show histogram
                p.rand -n 1000 -t gamma --alpha=3 --beta=.01\\
                | p.df 'df["c1"] = df.c0'\\
                | p.sig_edit -c c1 -t 2.5\\
                | p.df 'pd.melt(df)' --names raw edited\\
                | p.facet_grid --hue variable --map pl.hist\\
                   --args value --kwargs 'alpha=.2' 'range=[0, 1000]' 'bins=50'
        -----------------------------------------------------------------------
        """)

    # command line interface
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    arg_lib.add_args(parser, 'io_in', 'io_out', 'example')

    parser.add_argument(
        "-t", "--sigma_thresh", type=float, nargs=1, required=True,
        help="Sigma threshold")
    parser.add_argument(
        "-c", "--cols", nargs="+", required=True,
        help="Column(s) to sigma-edit")
    parser.add_argument(
        "--max_iter", type=int, nargs=1, default=[20],
        help="Max number of recursions")

    args = parser.parse_args()

    # load the data, then unwrap the single-element nargs=1 option lists
    frame = io_lib.df_from_input(args)
    threshold = args.sigma_thresh[0]
    iteration_cap = args.max_iter[0]
    edited = outlier_lib.sigma_edit_dataframe(
        threshold, args.cols, frame, max_iter=iteration_cap)

    # emit the edited frame
    io_lib.df_to_output(args, edited)
コード例 #10
0
ファイル: p_sig_edit.py プロジェクト: richwu/pandashells
def main():
    """Command-line entry point for sigma-edit outlier removal.

    Declares the options, loads the input DataFrame via
    ``io_lib.df_from_input``, applies ``outlier_lib.sigma_edit_dataframe``
    to the selected columns and sends the result to ``io_lib.df_to_output``.
    """
    description = textwrap.dedent(
        """
        Remove outliers from DataFrame columns using a recursive sigma-edit
        algorithm.  The algorithm will recursively NaN out values greater than
        sigma_thresh standard deviations away from sample mean.

        -----------------------------------------------------------------------
        Examples:

            * Do a 2.5-sigma edit on a gamma distribution and show histogram
                p.rand -n 1000 -t gamma --alpha=3 --beta=.01\\
                | p.df 'df["c1"] = df.c0'\\
                | p.sig_edit -c c1 -t 2.5\\
                | p.df 'pd.melt(df)' --names raw edited\\
                | p.facet_grid --hue variable --map pl.hist\\
                   --args value --kwargs 'alpha=.2' 'range=[0, 1000]' 'bins=50'
        -----------------------------------------------------------------------
        """
    )

    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    arg_lib.add_args(parser, 'io_in', 'io_out', 'example')

    # (flags, keyword options) for each tool-specific option, in help order
    option_specs = (
        (("-t", "--sigma_thresh"),
         dict(help="Sigma threshold", nargs=1, required=True, type=float)),
        (("-c", "--cols"),
         dict(required=True, help="Column(s) to sigma-edit", nargs="+")),
        (("--max_iter",),
         dict(help="Max number of recursions", nargs=1, type=int,
              default=[20])),
    )
    for flags, options in option_specs:
        parser.add_argument(*flags, **options)

    args = parser.parse_args()

    # read, edit, write
    source_df = io_lib.df_from_input(args)
    result_df = outlier_lib.sigma_edit_dataframe(
        args.sigma_thresh[0],
        args.cols,
        source_df,
        max_iter=args.max_iter[0],
    )
    io_lib.df_to_output(args, result_df)
コード例 #11
0
ファイル: p_hist.py プロジェクト: subodhchhabra/pandashells
def main():
    """Command-line entry point for the histogram tool.

    Reads the input DataFrame, then either writes histogram bin centers and
    counts (when ``args.quiet`` is set) or draws the histogram with
    ``DataFrame.hist``.  The plotting library is only loaded on the plotting
    path.  When no columns are requested the first column of the input is
    used.
    """
    args = get_input_args()
    df = io_lib.df_from_input(args)

    # extract parameters from arg parser
    nbins = args.nbins[0]
    range_tup = args.range
    layout_tup = args.layout
    alpha = args.alpha[0]
    do_density = args.density
    sharex = args.sharex
    sharey = args.sharey
    cols = args.cols if args.cols else [df.columns[0]]

    validate_args(args, cols, df)

    # no plotting if output requested
    if args.quiet:
        # only the first requested column is histogrammed in this mode
        counts, edges = np.histogram(df[cols[0]],
                                     bins=nbins,
                                     range=range_tup,
                                     density=do_density)
        centers = edges[:-1] + 0.5 * np.diff(edges)
        df_out = pd.DataFrame({'bins': centers, 'counts': counts})
        io_lib.df_to_output(args, df_out)

    # otherwise do plotting
    else:
        module_checker_lib.check_for_modules(['matplotlib'])
        plot_lib = get_imports('pandashells.lib.plot_lib')
        plot_lib.set_plot_styling(args)
        # `density` replaces matplotlib's removed `normed` keyword and
        # matches the keyword already used for np.histogram above
        df.hist(cols,
                bins=nbins,
                range=range_tup,
                alpha=alpha,
                sharex=sharex,
                sharey=sharey,
                layout=layout_tup,
                density=do_density)

        plot_lib.refine_plot(args)
        plot_lib.show(args)
コード例 #12
0
def main():
    """Command-line entry point for the random-sample generator.

    Declares the distribution options, parses the command line, fills in the
    distribution-dependent default for ``--mu`` via ``fill_default_mu``,
    draws the samples with ``get_samples`` and writes them with
    ``io_lib.df_to_output``.
    """
    msg = textwrap.dedent("""
        Return random samples from common probability distributions.

        -----------------------------------------------------------------------
        Examples:

            uniform:  p.rand -n 1000 -t uniform  --min=0    --max=1   | p.hist
            normal:   p.rand -n 1000 -t normal   --mu=0     --sigma=1 | p.hist
            poisson:  p.rand -n 1000 -t poisson  --mu=1               | p.hist
            beta:     p.rand -n 1000 -t beta     --alpha=2  --beta=6  | p.hist
            gamma:    p.rand -n 1000 -t gamma    --alpha=1  --beta=1  | p.hist
            binomial: p.rand -n 1000 -t binomial --N=10     --p=0.4   | p.hist
        -----------------------------------------------------------------------
        """)

    # read command line arguments
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter, description=msg)

    parser.add_argument(
        '-t',
        '--type',
        nargs=1,
        type=str,
        default=['uniform'],
        choices=['uniform', 'normal', 'beta', 'gamma', 'binomial', 'poisson'],
        help='type of distribution (default=\'uniform\')')
    parser.add_argument('-n',
                        '--num_samples',
                        nargs=1,
                        default=[10],
                        type=int,
                        help='The number of rows to generate (default=10)')
    parser.add_argument(
        '-c',
        '--columns',
        nargs=1,
        default=[1],
        type=int,
        help='The number of columns to generate per row (default=1)')
    parser.add_argument(
        '--N',
        nargs=1,
        default=[10],
        type=int,
        help=('(Binomial Dist) Largest possible value for random variable. '
              '(default=10)'))
    parser.add_argument(
        '--p',
        nargs=1,
        default=[.5],
        type=float,
        help=('(Binomial Dist) Bernoulli probability for each trial '
              '(default=.5)'))
    # no default here: it depends on the distribution and is filled in after
    # parsing by fill_default_mu
    parser.add_argument(
        '--mu',
        nargs=1,
        type=float,
        help='(Normal, Poisson) Mean (defaults: normal:0, poisson:1)')
    parser.add_argument('--sigma',
                        nargs=1,
                        default=[1.],
                        type=float,
                        help='(Normal) standard deviation, (default: 1)')
    parser.add_argument('--min',
                        nargs=1,
                        default=[0.],
                        type=float,
                        help='(Uniform) Minimum value of range, (default: 0)')
    parser.add_argument('--max',
                        nargs=1,
                        default=[1.],
                        type=float,
                        help='(Uniform) Maximum value of range, (default: 1)')
    parser.add_argument('--alpha',
                        nargs=1,
                        default=[2.],
                        type=float,
                        help='(Beta, Gamma)  (default: 2)')
    parser.add_argument('--beta',
                        nargs=1,
                        default=[2.],
                        type=float,
                        help='(Beta, Gamma)  (default: 2)')

    arg_lib.add_args(parser, 'io_out')

    # parse arguments
    args = parser.parse_args()

    # set some defaults
    args = fill_default_mu(args)

    # get the samples
    df = get_samples(args)

    # write dataframe to output
    io_lib.df_to_output(args, df)
コード例 #13
0
ファイル: p_df.py プロジェクト: subodhchhabra/pandashells
def main():  # pragma: no cover
    """Command-line entry point for the dataframe-processing tool.

    Parses the command line, loads the modules needed by the given
    statements, reads the input DataFrame, applies each statement in order
    through ``process_command`` and writes the final DataFrame with
    ``io_lib.df_to_output``.
    """
    # read command line arguments
    msg = textwrap.dedent(
        """
        Enables pandas dataframe processing at the unix command line.

        This is the real workhorse of the pandashells toolkit.  It reads data
        from stdin as a dataframe, which is passed through any number of pandas
        operations provided on the command line.  Output is always to stdout.

        Each operation assumes data is in a dataframe named df.  Operations
        performed on this dataframe will overwrite the df variable with
        the results of that operation.  Special consideration is taken for
        assignments such as df['a'] = df.b + df.c.  These are understood
        to augment the input dataframe with a new column. By way of example,
        this command:
            p.df 'df.groupby(by="a").b.count()' 'df.reset_index()'
        is equivalent to the python expressions:
            df = df.groupby(by="a").b.count()
            df = df.reset_index()

        In addition to providing access to pandas dataframes, a number of
        modules are loaded into the namespace so as to be accessible from the
        command line.  These modules are:
            pd = pandas
            np = numpy
            scp = scipy
            pl = pylab
            parse = dateutil.parser.parse
            datetime = datetime
            re = re

        ** Important **
        When creating chains of dataframe operations (see examples), it is
        important to express your chain of operations before any options. This
        is because some options can take multiple arguments and the parser
        won't be able to properly decode your meaning.
        For example:
            cat file.csv | p.df 'df["x"] = df.y + 1' -o table noheader  # GOOD
            cat file.csv | p.df -o table noheader 'df["x"] = df.y + 1'  # BAD

        Input can be read in different formats as specified by the -i switch.
        The most common formats are csv and table (white-space-delimited).  In
        either of these formats, p.df can accommodate input data that either
        does or does not have a header row.  When no header row is indicated,
        the columns of the Dataframe will be labeled as c0, c1, ..., cN.

        Plotting methods invoked on a Dataframe generate no output, but
        create an interactive plot instead.  There are a number of plot
        specific options available at the command line that govern the details
        of how these plots are rendered (e.g. --xlim, --legend, etc).

        -----------------------------------------------------------------------
        Examples:

            * Print a csv file in nice tabular format
                p.example_data -d tips | p.df -o table | head

            * Print a csv file to json
                p.example_data -d tips | head | p.df -o json

            * Transform csv to json then to table
                p.example_data -d tips | head | p.df -o json \\
                | p.df -i json -o table

            * Select by row
                p.example_data -d tips \\
                | p.df 'df[df.sex=="Female"]' 'df[df.smoker=="Yes"]' -o table

            * Extract columns
                p.example_data -d tips \\
                | p.df 'df[["total_bill", "tip"]].head()' -o table

            * Perform grouped aggregations
                p.example_data -d tips | p.df \\
                'df.groupby(by=["sex", "smoker"]).tip.sum()' -o table index

            * Use pandas plotting methods
                p.example_data -d tips | p.df \\
                'df.groupby(by="day").total_bill.sum().plot(kind="barh")'\\
                --xlabel 'Dollars' --title 'Total Bills by Day'

            * Convert between tabular and csv format with/without header rows
                seq 10 | awk '{print $1, 2*$1}'\\
                | p.df --names a b -i table noheader | p.df -o table noheader

        -----------------------------------------------------------------------
        """
    )
    from pandashells.lib import arg_lib

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter, description=msg)
    arg_lib.add_args(parser, 'io_in', 'io_out', 'decorating')
    msg = (
        '(MUST come before any options) '
        '[statement ...] Statement(s) to execute. '
    )
    parser.add_argument(
        "statement", help=msg, nargs="*")
    args = parser.parse_args()

    # the statements are inspected before io_lib is imported below
    get_modules_and_shortcuts(args.statement)
    from pandashells.lib import io_lib

    # get the input dataframe
    df = io_lib.df_from_input(args)

    # execute the statements in order
    # plot commands are terminal statements so will call sys.exit()
    for cmd in args.statement:
        df = process_command(args, cmd, df)

    # write the output
    io_lib.df_to_output(args, df)
コード例 #14
0
ファイル: p_df.py プロジェクト: richwu/pandashells
def main():  # pragma: no cover
    """Command-line entry point for the dataframe-processing tool.

    Parses the command line, reads the input DataFrame, applies each
    statement in order through ``process_command`` and writes the final
    DataFrame with ``io_lib.df_to_output``.
    """
    # read command line arguments
    msg = textwrap.dedent(
        """
        Enables pandas dataframe processing at the unix command line.

        This is the real workhorse of the pandashells toolkit.  It reads data
        from stdin as a dataframe, which is passed through any number of pandas
        operations provided on the command line.  Output is always to stdout.

        Each operation assumes data is in a dataframe named df.  Operations
        performed on this dataframe will overwrite the df variable with
        the results of that operation.  Special consideration is taken for
        assignments such as df['a'] = df.b + df.c.  These are understood
        to augment the input dataframe with a new column. By way of example,
        this command:
            p.df 'df.groupby(by="a").b.count()' 'df.reset_index()'
        is equivalent to the python expressions:
            df = df.groupby(by="a").b.count()
            df = df.reset_index()

        In addition to providing access to pandas dataframes, a number of
        modules are loaded into the namespace so as to be accessible from the
        command line.  These modules are:
            pd = pandas
            np = numpy
            scp = scipy
            pl = pylab
            parse = dateutil.parser.parse
            datetime = datetime
            re = re

        ** Important **
        When creating chains of dataframe operations (see examples), it is
        important to express your chain of operations before any options. This
        is because some options can take multiple arguments and the parser
        won't be able to properly decode your meaning.
        For example:
            cat file.csv | p.df 'df["x"] = df.y + 1' -o table noheader  # GOOD
            cat file.csv | p.df -o table noheader 'df["x"] = df.y + 1'  # BAD

        Input can be read in different formats as specified by the -i switch.
        The most common formats are csv and table (white-space-delimited).  In
        either of these formats, p.df can accommodate input data that either
        does or does not have a header row.  When no header row is indicated,
        the columns of the Dataframe will be labeled as c0, c1, ..., cN.

        Plotting methods invoked on a Dataframe generate no output, but
        create an interactive plot instead.  There are a number of plot
        specific options available at the command line that govern the details
        of how these plots are rendered (e.g. --xlim, --legend, etc).

        -----------------------------------------------------------------------
        Examples:

            * Print a csv file in nice tabular format
                p.example_data -d tips | p.df -o table | head

            * Select by row
                p.example_data -d tips \\
                | p.df 'df[df.sex=="Female"]' 'df[df.smoker=="Yes"]' -o table

            * Extract columns
                p.example_data -d tips \\
                | p.df 'df[["total_bill", "tip"]].head()' -o table

            * Perform grouped aggregations
                p.example_data -d tips | p.df \\
                'df.groupby(by=["sex", "smoker"]).tip.sum()' -o table index

            * Use pandas plotting methods
                p.example_data -d tips | p.df \\
                'df.groupby(by="day").total_bill.sum().plot(kind="barh")'\\
                --xlabel 'Dollars' --title 'Total Bills by Day'

            * Convert between tabular and csv format with/without header rows
                seq 10 | awk '{print $1, 2*$1}'\\
                | p.df --names a b -i table noheader | p.df -o table noheader

        -----------------------------------------------------------------------
        """
    )

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter, description=msg)
    arg_lib.add_args(parser, 'io_in', 'io_out', 'decorating', 'example')
    msg = (
        '(MUST come before any options) '
        '[statement ...] Statement(s) to execute. '
    )
    parser.add_argument(
        "statement", help=msg, nargs="*")
    args = parser.parse_args()

    # get the input dataframe
    df = io_lib.df_from_input(args)

    # execute the statements in order
    # plot commands are terminal statements so will call sys.exit()
    for cmd in args.statement:
        df = process_command(args, cmd, df)

    # write the output
    io_lib.df_to_output(args, df)
コード例 #15
0
ファイル: io_lib_tests.py プロジェクト: iiSeymour/pandashells
 def test_df_to_output_bad_type(self, csv_writer_mock):
     # An unrecognised output option ('bad') is expected to end up in the
     # patched csv writer, called with the frame and these arguments.
     frame = pd.DataFrame(
         [[1, 2], [3, 4]],
         index=[0, 1],
         columns=['a', 'b'],
     )
     unknown_format_args = MagicMock(output_options=['bad'])

     io_lib.df_to_output(unknown_format_args, frame)

     csv_writer_mock.assert_called_with(frame, True, False, 'nan')
コード例 #16
0
ファイル: p_regress.py プロジェクト: alayassir/pandashells
def main():
    """Command-line entry point for the linear-regression tool.

    Fits the patsy-style model given with ``-m`` to the input DataFrame and
    adds ``fit_`` and ``resid_`` columns.  With ``--fit`` the augmented frame
    is written with ``io_lib.df_to_output`` and nothing else happens.
    Otherwise the fit summary is written to stdout and, with ``--plot``,
    residual plots are drawn.
    """
    description = textwrap.dedent(
        """
        Performs (multivariable) linear regression.  The fitting model
        is specified using the R-like, patsy syntax.  Input is from stdin
        and output is either fitting information or the input data
        with columns added for the fit and residuals.

        -----------------------------------------------------------------------
        Examples:
            * Fit a line to the sea-level data
                p.example_data -d sealevel | p.regress -m 'sealevel_mm ~ year'

            * Fit a trend plus annual cycle to sealevel data
                p.example_data -d sealevel \\
                | p.df 'df["sin"] =  np.sin(2 * np.pi * df.year)' \\
                | p.df 'df["cos"] = np.cos(2 * np.pi * df.year)' \\
                | p.regress -m 'sealevel_mm ~ year + cos + sin'

            * Examine residual ECDF of trend plus annual fit
                p.example_data -d sealevel \\
                | p.df 'df["sin"] =  np.sin(2 * np.pi * df.year)' \\
                | p.df 'df["cos"] = np.cos(2 * np.pi * df.year)' \\
                | p.regress -m 'sealevel_mm ~ year + cos + sin' --fit \\
                | p.cdf -c 'resid_' --title 'ECDF of trend + annual'

            * Detrend sealevel data to more clearly reveal oscillations
                p.example_data -d sealevel \\
                | p.regress -m 'sealevel_mm ~ year' --fit \\
                | p.plot -x year -y resid_ --ylabel 'Trend removed (mm)' \\
                         --title 'Global Sea Surface Height'

            * Set origin of sealevel data to 0 and regress with no intercept
                p.example_data -d sealevel\\
                | p.df 'df["year"] = df.year - df.year.iloc[0]'\\
                'df["sealevel_mm"] = df.sealevel_mm - df.sealevel_mm.iloc[0]'\\
                | p.regress -m 'sealevel_mm ~ year - 1' --fit\\
                | p.plot -x year -y sealevel_mm fit_ --style '.' '-'\\
                     --alpha .2 1 --legend best --title 'Force Zero Intercept'

        -----------------------------------------------------------------------
        """
    )

    # command line interface
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    arg_lib.add_args(parser, 'io_in', 'io_out')

    # the regression formula
    parser.add_argument(
        "-m", "--model", required=True, nargs=1, type=str,
        help="The model expressed in patsy syntax")
    parser.add_argument(
        "--fit", dest='retfit', action="store_true", default=False,
        help="Return input with fit and residual appended")
    parser.add_argument(
        "--plot", action="store_true", default=False,
        help="Make residual plots")

    args = parser.parse_args()
    formula = args.model[0]

    frame = io_lib.df_from_input(args)

    # fit the model and attach the fitted values and residuals
    fitted = sm.ols(formula=formula, data=frame).fit()
    frame['fit_'] = fitted.fittedvalues
    frame['resid_'] = fitted.resid

    # --fit: emit the augmented data and stop (no summary, no plots)
    if args.retfit:
        io_lib.df_to_output(args, frame)
        return

    # otherwise report the fit summary
    sys.stdout.write('\n{}\n'.format(fitted.summary()))
    sys.stdout.flush()

    if not args.plot:
        return

    # plotting libraries are loaded only when plots were requested
    module_checker_lib.check_for_modules(['matplotlib', 'seaborn'])
    plot_lib = get_module('pandashells.lib.plot_lib')
    mpl = get_module('matplotlib')
    pl = get_module('pylab')
    sns = get_module('seaborn')

    # top panel: residual against fitted value
    pl.subplot(211)
    pl.plot(frame.fit_, frame.resid_, '.', alpha=.5)
    pl.xlabel('Fit')
    pl.ylabel('Residual')
    pl.title(formula)

    # bottom panel: distribution of the residuals
    pl.subplot(212)
    sns.distplot(frame.resid_, bins=50)
    pl.xlabel('Residual with R^2 = {:0.4f}'.format(fitted.rsquared))
    pl.ylabel('Counts')

    # the agg and macosx backends get set_tight_layout instead of tight_layout
    figure = pl.gcf()
    backend_name = mpl.get_backend().lower()
    if backend_name in ('agg', 'macosx'):
        figure.set_tight_layout(True)
    else:
        figure.tight_layout()

    plot_lib.show(args)
コード例 #17
0
ファイル: p_merge.py プロジェクト: djangosporti/pandashells
def main():
    """Merge two input files into one dataframe and write out the result.

    The ``--how``, ``--on``, ``--left_on``, ``--right_on`` and ``--suffixes``
    options are forwarded to ``pd.merge``; the two positional ``file``
    arguments name the left and right inputs, in that order.
    """
    msg = textwrap.dedent("""
        Tool to merge datasets.  Similar functionality to database
        joins. The arguments closely parallel those of the pandas merge
        command.  See the pandas merge documentation for more details.

        -----------------------------------------------------------------------
        Examples:

            * Merge election polls with electoral-college numbers
                p.merge <(p.example_data -d election) \\
                        <(p.example_data -d electoral_college) \\
                        --how left --on state \\
                | p.df -o table | head
        -----------------------------------------------------------------------
        """)

    # read command line arguments
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter, description=msg)

    arg_lib.add_args(parser, 'io_in', 'io_out', 'example')

    # nargs=1 stores the value as a one-element list, hence the list default
    parser.add_argument('--how',
                        choices=['left', 'right', 'inner', 'outer'],
                        dest='how',
                        default=['inner'],
                        nargs=1,
                        help="Type of join.  Default='inner'")

    msg = 'List of columns on which to join'
    parser.add_argument('--on',
                        nargs='+',
                        metavar='col',
                        type=str,
                        dest='on',
                        help=msg)

    msg = 'List of columns from left file to join on. '
    parser.add_argument('--left_on',
                        nargs='+',
                        metavar='col',
                        type=str,
                        dest='left_on',
                        help=msg)

    msg = 'List of columns from right file to join on. '
    parser.add_argument('--right_on',
                        nargs='+',
                        metavar='col',
                        type=str,
                        dest='right_on',
                        help=msg)

    msg = 'List of suffixes appended to identically '
    msg += 'named columns'
    parser.add_argument('--suffixes',
                        nargs=2,
                        metavar='_x _y',
                        type=str,
                        dest='suffixes',
                        default=['_x', '_y'],
                        help=msg)

    parser.add_argument("file",
                        help="Files to join",
                        nargs=2,
                        type=str,
                        metavar='file')

    args = parser.parse_args()
    # validate_args is defined outside this block; it receives the parsed
    # namespace before any data is loaded
    validate_args(args)

    # get merge options from cli (unset or empty column lists become None)
    how = args.how[0]
    on = args.on or None
    left_on = args.left_on or None
    right_on = args.right_on or None
    suffixes = args.suffixes

    # get file names
    left_name, right_name = tuple(args.file)

    # load the dataframes
    df_left = io_lib.df_from_input(args, left_name)
    df_right = io_lib.df_from_input(args, right_name)

    # perform the merge
    dfj = pd.merge(df_left,
                   df_right,
                   how=how,
                   on=on,
                   left_on=left_on,
                   right_on=right_on,
                   sort=True,
                   suffixes=suffixes)

    # output the joined frame
    io_lib.df_to_output(args, dfj)
コード例 #18
0
ファイル: p_merge.py プロジェクト: richwu/pandashells
def main():
    """Merge two input files into one dataframe and write out the result.

    The ``--how``, ``--on``, ``--left_on``, ``--right_on`` and ``--suffixes``
    options are forwarded to ``pd.merge``; the two positional ``file``
    arguments name the left and right inputs, in that order.
    """
    msg = textwrap.dedent(
        """
        Tool to merge datasets.  Similar functionality to database
        joins. The arguments closely parallel those of the pandas merge
        command.  See the pandas merge documentation for more details.

        -----------------------------------------------------------------------
        Examples:

            * Merge election polls with electoral-college numbers
                p.merge <(p.example_data -d election) \\
                        <(p.example_data -d electoral_college) \\
                        --how left --on state \\
                | p.df -o table | head
        -----------------------------------------------------------------------
        """
    )

    # read command line arguments
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter, description=msg)

    arg_lib.add_args(parser, 'io_in', 'io_out', 'example')

    # nargs=1 stores the value as a one-element list, hence the list default
    parser.add_argument('--how', choices=['left', 'right', 'inner', 'outer'],
                        dest='how', default=['inner'], nargs=1,
                        help="Type of join.  Default='inner'")

    msg = 'List of columns on which to join'
    parser.add_argument('--on', nargs='+', metavar='col',
                        type=str, dest='on', help=msg)

    msg = 'List of columns from left file to join on. '
    parser.add_argument('--left_on', nargs='+', metavar='col',
                        type=str, dest='left_on', help=msg)

    msg = 'List of columns from right file to join on. '
    parser.add_argument('--right_on', nargs='+', metavar='col',
                        type=str, dest='right_on', help=msg)

    msg = 'List of suffixes appended to identically '
    msg += 'named columns'
    parser.add_argument('--suffixes', nargs=2, metavar='_x _y',
                        type=str, dest='suffixes', default=['_x', '_y'],
                        help=msg)

    parser.add_argument("file", help="Files to join", nargs=2, type=str,
                        metavar='file')

    args = parser.parse_args()
    # validate_args is defined outside this block; it receives the parsed
    # namespace before any data is loaded
    validate_args(args)

    # get merge options from cli (unset or empty column lists become None)
    how = args.how[0]
    on = args.on or None
    left_on = args.left_on or None
    right_on = args.right_on or None
    suffixes = args.suffixes

    # get file names
    left_name, right_name = tuple(args.file)

    # load the dataframes
    df_left = io_lib.df_from_input(args, left_name)
    df_right = io_lib.df_from_input(args, right_name)

    # perform the merge
    dfj = pd.merge(df_left, df_right, how=how, on=on, left_on=left_on,
                   right_on=right_on, sort=True, suffixes=suffixes)

    # output the joined frame
    io_lib.df_to_output(args, dfj)
コード例 #19
0
def main():
    """Plot, or with ``-q`` print, the empirical CDF of one input column.

    The column named by ``-c`` is evaluated with ``ECDF`` on an evenly
    spaced grid spanning its minimum and maximum.  In quiet mode the grid
    and both tail probabilities are written out as a dataframe with columns
    ``x``, ``p_less`` and ``p_greater``; otherwise both curves are plotted.
    """
    msg = textwrap.dedent(
        """
        Plots the empirical cumulative distribution function (ECDF).

        -----------------------------------------------------------------------
        Examples:

            * Plot ECDF for 10k samples from the standard normal distribution.
                p.rand -t normal -n 10000 | p.cdf -c c0

            * Instead of plotting, send ECDF values to stdout
                p.rand -t normal -n 10000 | p.cdf -c c0 -q | head
        -----------------------------------------------------------------------
        """
    )

    # read command line arguments
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter, description=msg)

    # specify column to use
    parser.add_argument(
        "-c", "--col", required=True, nargs=1,
        help="Column to plot distribution")
    parser.add_argument(
        '-n', '--n_points', nargs=1, type=int,
        help='Number of output points (default is twice input len)')
    parser.add_argument(
        '-q', '--quiet', action='store_true', default=False,
        help='Quiet means no plots. Send numeric output to stdout instead')

    # add the shared option groups, then parse arguments
    arg_lib.add_args(parser, 'decorating', 'io_in', 'io_out',)
    args = parser.parse_args()

    # get the input dataframe and extract column
    df = io_lib.df_from_input(args)
    x = df[args.col[0]].values

    # create the output distribution
    n_out = 2 * len(x) if args.n_points is None else args.n_points[0]
    x_out = np.linspace(min(x), max(x), n_out)
    y_out = ECDF(x)(x_out)

    # send values to stdout if quiet specified
    if args.quiet:
        df_out = pd.DataFrame(
            {'x': x_out, 'p_less': y_out, 'p_greater': 1 - y_out})
        # re-select to pin the column order of the output
        df_out = df_out[['x', 'p_less', 'p_greater']]
        io_lib.df_to_output(args, df_out)
        return

    # set the appropriate theme and make plot
    plot_lib.set_plot_styling(args)
    pl.plot(x_out, y_out, label='P({} < x)'.format(args.col[0]))
    pl.plot(x_out, 1. - y_out, label='P({} > x)'.format(args.col[0]))
    pl.xlabel('x')
    pl.legend(loc='best')

    plot_lib.refine_plot(args)
    plot_lib.show(args)
コード例 #20
0
def main():
    """Run the lomb-scargle spectrogram tool from the command line.

    Reads the input dataframe, passes it with the chosen time column,
    observation column, interpolation exponent and ordering flag to
    ``lomb_scargle_lib.lomb_scargle``, and writes the returned dataframe
    to the output.
    """
    msg = textwrap.dedent("""
        Computes a spectrogram using the lomb-scargle algorithm provided by
        the gatspy module.  The input time series need not have evenly spaced
        time-stamps.  The FFT-based algorithm has complexity O[N*log(N)].

        -----------------------------------------------------------------------
        Examples:

            * Plot the spectrum of a simple sine wave
                  p.linspace 0 10 100 \\
                  | p.df 'df["value"] = 7 * np.sin(2*np.pi*df.time / 1.5)'\\
                        --names time\\
                  | p.lomb_scargle -t time -y value --interp_exp 3\\
                  | p.plot -x period -y amp --xlim 0 3

            * Show the annual and 59-day peaks in the sealevel spectrum
                p.example_data -d sealevel\\
                | p.df 'df["day"] = 365.25 * df.year'\\
                        'df["day"] = df.day - df.day.iloc[0]'\\
                | p.lomb_scargle -t day -y sealevel_mm --interp_exp 3\\
                | p.df 'df[df.period < 720]'\\
                | p.plot -x period -y amp --xlim 1 400\\
                         --title 'Sea-surface height spectrum'\\
                         --xlabel 'period (days)'

        -----------------------------------------------------------------------
        """)

    # read command line arguments
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter, description=msg)

    arg_lib.add_args(parser, 'io_in', 'io_out')

    parser.add_argument('-t',
                        '--time_col',
                        help='Time Column',
                        nargs=1,
                        required=True,
                        type=str)

    parser.add_argument('-y',
                        '--observation_col',
                        help='Observation column',
                        nargs=1,
                        dest='val_col',
                        required=True,
                        type=str)

    # nargs=1 stores the value as a one-element list, hence the list default
    parser.add_argument('--interp_exp',
                        help='Interpolate by this power of 2',
                        nargs=1,
                        type=int,
                        default=[1])
    parser.add_argument('--freq_order',
                        action='store_true',
                        dest='freq_order',
                        default=False,
                        help='Order output by frequency instead of period')

    # parse arguments
    args = parser.parse_args()

    # get the input dataframe
    df = io_lib.df_from_input(args)
    df = lomb_scargle_lib.lomb_scargle(df, args.time_col[0], args.val_col[0],
                                       args.interp_exp[0], args.freq_order)

    # write dataframe to output
    io_lib.df_to_output(args, df)
コード例 #21
0
 def test_df_to_output_bad_type(self, csv_writer_mock):
     """Check the csv writer mock call made for output option 'bad'."""
     frame = pd.DataFrame([[1, 2], [3, 4]], columns=['a', 'b'], index=[0, 1])
     cli_args = MagicMock(output_options=['bad'])

     io_lib.df_to_output(cli_args, frame)

     # the very same frame object must be handed to the writer mock
     csv_writer_mock.assert_called_with(frame, True, False, 'nan')
コード例 #22
0
ファイル: p_rand.py プロジェクト: richwu/pandashells
def main():
    """Generate random samples from the command line and write them out.

    Parses the distribution type, sample/column counts and distribution
    parameters, fills in the default for ``--mu`` via ``fill_default_mu``,
    builds the samples with ``get_samples`` and writes the resulting
    dataframe to the output.
    """
    msg = textwrap.dedent(
        """
        Return random samples from common probability distributions.

        -----------------------------------------------------------------------
        Examples:

            uniform:  p.rand -n 1000 -t uniform  --min=0    --max=1   | p.hist
            normal:   p.rand -n 1000 -t normal   --mu=0     --sigma=1 | p.hist
            poisson:  p.rand -n 1000 -t poisson  --mu=1               | p.hist
            beta:     p.rand -n 1000 -t beta     --alpha=2  --beta=6  | p.hist
            gamma:    p.rand -n 1000 -t gamma    --alpha=1  --beta=1  | p.hist
            binomial: p.rand -n 1000 -t binomial --N=10     --p=0.4   | p.hist
        -----------------------------------------------------------------------
        """
    )

    # read command line arguments
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter, description=msg)

    # every option uses nargs=1, so values (and defaults) are 1-element lists
    parser.add_argument(
        '-t', '--type', nargs=1, type=str, default=['uniform'],
        choices=['uniform', 'normal', 'beta', 'gamma', 'binomial', 'poisson'],
        help='type of distribution (default=\'uniform\')')
    parser.add_argument(
        '-n', '--num_samples', nargs=1, default=[10], type=int,
        help='The number of rows to generate (default=10)')
    parser.add_argument(
        '-c', '--columns', nargs=1, default=[1], type=int,
        help='The number of columns to generate per row (default=1)')
    parser.add_argument(
        '--N', nargs=1, default=[10], type=int,
        help=(
            '(Binomial Dist) Largest possible value for random variable. '
            '(default=10)'
        )
    )
    parser.add_argument(
        '--p', nargs=1, default=[.5], type=float,
        help=(
            '(Binomial Dist) Bernoulli probability for each trial '
            '(default=.5)'
        )
    )
    # no default here: it depends on the distribution and is filled in below
    parser.add_argument(
        '--mu', nargs=1, type=float,
        help='(Normal, Poisson) Mean (defaults: normal:0, poisson:1)')
    parser.add_argument(
        '--sigma', nargs=1, default=[1.], type=float,
        help='(Normal) standard deviation, (default: 1)')
    parser.add_argument(
        '--min', nargs=1, default=[0.], type=float,
        help='(Uniform) Minimum value of range, (default: 0)')
    parser.add_argument(
        '--max', nargs=1, default=[1.], type=float,
        help='(Uniform) Maximum value of range, (default: 1)')
    parser.add_argument(
        '--alpha', nargs=1, default=[2.], type=float,
        help='(Beta, Gamma)  (default: 2)')
    parser.add_argument(
        '--beta', nargs=1, default=[2.], type=float,
        help='(Beta, Gamma)  (default: 2)')

    arg_lib.add_args(parser, 'io_out', 'example')

    # parse arguments
    args = parser.parse_args()

    # set some defaults
    args = fill_default_mu(args)

    # get the samples
    df = get_samples(args)

    # write dataframe to output
    io_lib.df_to_output(args, df)
コード例 #23
0
def main():
    """Fit an OLS model given in patsy syntax and report the result.

    With ``--fit`` the input frame, extended by ``fit_`` and ``resid_``
    columns, is written to the output and nothing else happens.  Otherwise
    the fit summary is written to stdout and, with ``--plot``, residual
    plots are shown.
    """
    description = textwrap.dedent(
        """
        Performs (multivariable) linear regression.  The fitting model
        is specified using the R-like, patsy syntax.  Input is from stdin
        and output is either fitting information or the input data
        with columns added for the fit and residuals.

        -----------------------------------------------------------------------
        Examples:
            * Fit a line to the sea-level data
                p.example_data -d sealevel | p.regress -m 'sealevel_mm ~ year'

            * Fit a trend plus annual cycle to sealevel data
                p.example_data -d sealevel \\
                | p.df 'df["sin"] =  np.sin(2 * np.pi * df.year)' \\
                | p.df 'df["cos"] = np.cos(2 * np.pi * df.year)' \\
                | p.regress -m 'sealevel_mm ~ year + cos + sin'

            * Examine residual ECDF of trend plus annual fit
                p.example_data -d sealevel \\
                | p.df 'df["sin"] =  np.sin(2 * np.pi * df.year)' \\
                | p.df 'df["cos"] = np.cos(2 * np.pi * df.year)' \\
                | p.regress -m 'sealevel_mm ~ year + cos + sin' --fit \\
                | p.cdf -c 'resid_' --title 'ECDF of trend + annual'

            * Detrend sealevel data to more clearly reveal oscillations
                p.example_data -d sealevel \\
                | p.regress -m 'sealevel_mm ~ year' --fit \\
                | p.plot -x year -y resid_ --ylabel 'Trend removed (mm)' \\
                         --title 'Global Sea Surface Height'

            * Set origin of sealevel data to 0 and regress with no intercept
                p.example_data -d sealevel\\
                | p.df 'df["year"] = df.year - df.year.iloc[0]'\\
                'df["sealevel_mm"] = df.sealevel_mm - df.sealevel_mm.iloc[0]'\\
                | p.regress -m 'sealevel_mm ~ year - 1' --fit\\
                | p.plot -x year -y sealevel_mm fit_ --style '.' '-'\\
                     --alpha .2 1 --legend best --title 'Force Zero Intercept'

        -----------------------------------------------------------------------
        """
    )

    # build the command line interface
    cli = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=description)

    arg_lib.add_args(cli, 'io_in', 'io_out', 'example')

    # the regression formula
    cli.add_argument("-m", "--model", type=str, nargs=1, required=True,
                     help="The model expressed in patsy syntax")

    # output mode switches
    cli.add_argument("--fit", action="store_true", dest='retfit',
                     default=False,
                     help="Return input with fit and residual appended")
    cli.add_argument("--plot", action="store_true", default=False,
                     help="Make residual plots")

    args = cli.parse_args()
    formula = args.model[0]

    # load the data, run the regression and attach its outputs as columns
    frame = io_lib.df_from_input(args)
    result = sm.ols(formula=formula, data=frame).fit()
    frame['fit_'] = result.fittedvalues
    frame['resid_'] = result.resid

    # --fit: emit the augmented frame and stop
    if args.retfit:
        io_lib.df_to_output(args, frame)
        return

    # otherwise report the fit summary
    sys.stdout.write('\n{}\n'.format(result.summary()))
    sys.stdout.flush()

    # nothing more to do unless plots were requested
    if not args.plot:
        return

    # top panel: residual against fitted value
    pl.subplot(211)
    pl.plot(frame.fit_, frame.resid_, '.', alpha=.5)
    pl.xlabel('Fit')
    pl.ylabel('Residual')
    pl.title(formula)

    # bottom panel: distribution of the residuals
    pl.subplot(212)
    sns.distplot(frame.resid_, bins=50)
    pl.xlabel('Residual with R^2 = {:0.4f}'.format(result.rsquared))
    pl.ylabel('Counts')

    # the agg and macosx backends get set_tight_layout(True) rather than a
    # direct tight_layout() call
    backend = mpl.get_backend().lower()
    if backend not in ('agg', 'macosx'):
        pl.gcf().tight_layout()
    else:
        pl.gcf().set_tight_layout(True)

    plot_lib.show(args)