# Example no. 1
0
def main():
    r"""
    Reads a csv file, keeping only certain columns.  Prints to stdout.

    Examples
    ---------
    Read a comma delimited csv file, keeping the 'name' and 'age' columns
    $ python cut.py -l name,age test/commafile.csv

    Use a tab delimited dataset 
    $ python cut.py -d'\t' -l name  test/tabfile.csv
    Note that -dt  -dtab -d\t -d'\t' -d\\t  also work
    """
    # NOTE: the docstring above is appended to the usage string, so it is
    # shown verbatim by --help and on argument errors.  Keep it user-facing.
    usage = "usage: %prog [options] dataset"
    usage += '\n'+main.__doc__
    parser = OptionParser(usage=usage)
    parser.add_option(
        "-l", "--keep_list",
        help="Only keep variables in this (comma delimited) list."
        " [default: %default] ",
        action="store", dest='keep_list', default=None)
    parser.add_option(
        "-d", "--delimiter",
        help="Use DELIMITER as the column delimiter.  [default: %default]",
        action="store", dest='delimiter', default=',')
    parser.add_option(
        "-o", "--outfilename",
        help="Write to this file rather than stdout.  [default: %default]",
        action="store", dest='outfilename', default=None)

    (options, args) = parser.parse_args()

    ### Parse args
    # At most one positional argument (the dataset) is accepted.  Use
    # parser.error rather than assert: asserts are stripped under -O and
    # would otherwise surface as a bare traceback instead of a usage message.
    if len(args) > 1:
        parser.error(
            "expected at most one dataset, got %d: %s"
            % (len(args), ' '.join(args)))
    # If an argument is given, then it is the 'infilename'
    # If no arguments are given, set infilename equal to None
    infilename = args[0] if args else None

    ## Handle the options
    # Change keep_list to a Python list (None when -l was not supplied)
    keep_list = options.keep_list.split(',') if options.keep_list else None

    # Deal with tabs: accept several spellings of "tab" from the shell
    if options.delimiter in ['t', '\\t', '\t', 'tab']:
        options.delimiter = '\t'

    ## Get the infile/outfile
    infile, outfile = common.get_inout_files(infilename, options.outfilename)

    try:
        ## Call the function that does the real work
        cut_file(
            infile, outfile, delimiter=options.delimiter, keep_list=keep_list)
    finally:
        ## Close the files iff not stdin, stdout -- in a finally block so the
        ## handles are released even when cut_file raises.
        common.close_files(infile, outfile)
def main():
    r"""
    DESCRIPTION
    -----------
    Subsample files or stdin and write to stdout.


    NOTES
    -----
    Assumes the first row is a header.


    EXAMPLES
    ---------
    Subsample a comma delimited dataset and redirect output to a new file
    $ python subsample.py data.csv > subsampled_data.csv

    Subsample, keeping only 10% of rows
    $ python subsample.py -r 0.1 data.csv

    """
    # NOTE: the docstring above is appended to the usage string, so it is
    # shown verbatim by --help and on argument errors.  Keep it user-facing.
    usage = "usage: %prog [options] dataset"
    usage += '\n'+main.__doc__
    parser = OptionParser(usage=usage)
    parser.add_option(
        "-r", "--subsample_rate",
        help="Subsample subsample_rate, 0 <= r <= 1.  E.g. r = 0.1 keeps 10% "
        "of rows. [default: %default] ",
        action="store", dest='subsample_rate', type=float, default=0.01)
    parser.add_option(
        "-n", "--subsample_number",
        # The trailing space matters: adjacent literals are concatenated, and
        # without it the help text reads "specifiedand".
        help="Subsample subsample_number, It's latent unless -r not specified "
        "and -n specified. E.g. n = 100 keeps 100 of rows. [default: %default] ",
        action="store", dest='subsample_number', type=int, default=1)
    parser.add_option(
        "-d", "--delimiter",
        help="Use DELIMITER as the column delimiter.  [default: %default]",
        action="store", dest='delimiter', default=',')
    parser.add_option(
        "-s", "--seed",
        help="Integer to seed the random number generator with. "
        "[default: %default] ",
        action="store", dest='seed', type=int, default=None)
    parser.add_option(
        "-o", "--outfilename",
        help="Write to this file rather than stdout.  [default: %default]",
        action="store", dest='outfilename', default=None)

    (options, args) = parser.parse_args()

    ### Parse args
    # At most one positional argument (the dataset) is accepted.  Use
    # parser.error rather than assert: asserts are stripped under -O and
    # would otherwise surface as a bare traceback instead of a usage message.
    if len(args) > 1:
        parser.error(
            "expected at most one dataset, got %d: %s"
            % (len(args), ' '.join(args)))
    # If an argument is given, then it is the 'infilename'
    # If no arguments are given, set infilename equal to None
    infilename = args[0] if args else None

    ## Handle the options
    # Deal with tabs: accept several spellings of "tab" from the shell
    if options.delimiter in ['t', '\\t', '\t', 'tab']:
        options.delimiter = '\t'

    ## Get the infile/outfile
    infile, outfile = common.get_inout_files(infilename, options.outfilename)

    try:
        ## Call the function that does the real work
        subsample(
            infile, outfile, options.subsample_rate,
            options.subsample_number, options.delimiter, options.seed)
    finally:
        ## Close the files iff not stdin, stdout -- in a finally block so the
        ## handles are released even when subsample raises.
        common.close_files(infile, outfile)