def main():
    r"""
    Reads a csv file, keeping only certain columns.  Prints to stdout
    (or to the file given by -o).

    Examples
    ---------
    Read a comma delimited csv file, keeping the 'name' and 'age' columns
    $ python cut.py -l name,age test/commafile.csv

    Use a tab delimited dataset
    $ python cut.py -d'\t' -l name test/tabfile.csv

    Note that -dt -dtab -d\t -d'\t' -d\\t also work
    """
    # NOTE: the docstring above doubles as the body of the --help text, so
    # keep it user-facing.  Under ``python -OO`` docstrings are stripped and
    # ``main.__doc__`` is None; fall back to an empty string so that building
    # the usage message does not raise TypeError.
    usage = "usage: %prog [options] dataset"
    usage += '\n' + (main.__doc__ or '')
    parser = OptionParser(usage=usage)
    parser.add_option(
        "-l", "--keep_list",
        help="Only keep variables in this (comma delimited) list."
        " [default: %default] ",
        action="store", dest='keep_list', default=None)
    parser.add_option(
        "-d", "--delimiter",
        help="Use DELIMITER as the column delimiter.  [default: %default]",
        action="store", dest='delimiter', default=',')
    parser.add_option(
        "-o", "--outfilename",
        help="Write to this file rather than stdout.  [default: %default]",
        action="store", dest='outfilename', default=None)

    (options, args) = parser.parse_args()

    ### Parse args
    # At most one positional argument (the dataset) is accepted.  Report a
    # violation through the parser (usage message + exit status 2) rather
    # than with ``assert``, which is removed under ``python -O`` and would
    # otherwise surface as a bare traceback.
    if len(args) > 1:
        parser.error(
            "expected at most one dataset argument, got %d" % len(args))

    # If an argument is given, then it is the 'infilename'.
    # If no arguments are given, infilename is None and the choice of input
    # stream is left to common.get_inout_files.
    infilename = args[0] if args else None

    ## Handle the options
    # Change keep_list to a Python list (None when -l was not supplied)
    keep_list = options.keep_list.split(',') if options.keep_list else None

    # Deal with tabs: accept the common spellings a shell may hand us
    delimiter = options.delimiter
    if delimiter in ('t', '\\t', '\t', 'tab'):
        delimiter = '\t'

    ## Get the infile/outfile
    infile, outfile = common.get_inout_files(infilename, options.outfilename)

    try:
        ## Call the function that does the real work
        cut_file(infile, outfile, delimiter=delimiter, keep_list=keep_list)
    finally:
        ## Always release the files, even if cut_file raises.
        ## (close_files is expected to skip stdin/stdout.)
        common.close_files(infile, outfile)
def main():
    r"""
    DESCRIPTION
    -----------
    Subsample files or stdin and write to stdout.

    NOTES
    -----
    Assumes the first row is a header.

    EXAMPLES
    ---------
    Subsample a comma delimited dataset and redirect output to a new file
    $ python subsample.py data.csv > subsampled_data.csv

    Subsample, keeping only 10% of rows
    $ python subsample.py -r 0.1 data.csv
    """
    # NOTE: the docstring above doubles as the body of the --help text, so
    # keep it user-facing.  Under ``python -OO`` docstrings are stripped and
    # ``main.__doc__`` is None; fall back to an empty string so that building
    # the usage message does not raise TypeError.
    usage = "usage: %prog [options] dataset"
    usage += '\n' + (main.__doc__ or '')
    parser = OptionParser(usage=usage)
    parser.add_option(
        "-r", "--subsample_rate",
        help="Subsample subsample_rate, 0 <= r <= 1.  E.g. r = 0.1 keeps 10% "
        "of rows. [default: %default] ",
        action="store", dest='subsample_rate', type=float, default=0.01)
    parser.add_option(
        "-n", "--subsample_number",
        # The two fragments are concatenated; the leading space on the second
        # one keeps the rendered help from reading "specifiedand".
        help="Subsample subsample_number, It's latent unless -r not specified"
        " and -n specified.  E.g. n = 100 keeps 100 of rows. "
        "[default: %default] ",
        action="store", dest='subsample_number', type=int, default=1)
    parser.add_option(
        "-d", "--delimiter",
        help="Use DELIMITER as the column delimiter.  [default: %default]",
        action="store", dest='delimiter', default=',')
    parser.add_option(
        "-s", "--seed",
        help="Integer to seed the random number generator with. "
        "[default: %default] ",
        action="store", dest='seed', type=int, default=None)
    parser.add_option(
        "-o", "--outfilename",
        help="Write to this file rather than stdout.  [default: %default]",
        action="store", dest='outfilename', default=None)

    (options, args) = parser.parse_args()

    ### Parse args
    # At most one positional argument (the dataset) is accepted.  Report a
    # violation through the parser (usage message + exit status 2) rather
    # than with ``assert``, which is removed under ``python -O`` and would
    # otherwise surface as a bare traceback.
    if len(args) > 1:
        parser.error(
            "expected at most one dataset argument, got %d" % len(args))

    # If an argument is given, then it is the 'infilename'.
    # If no arguments are given, infilename is None and the choice of input
    # stream is left to common.get_inout_files.
    infilename = args[0] if args else None

    ## Handle the options
    # Deal with tabs: accept the common spellings a shell may hand us
    delimiter = options.delimiter
    if delimiter in ('t', '\\t', '\t', 'tab'):
        delimiter = '\t'

    ## Get the infile/outfile
    infile, outfile = common.get_inout_files(infilename, options.outfilename)

    try:
        ## Call the function that does the real work
        subsample(
            infile, outfile, options.subsample_rate,
            options.subsample_number, delimiter, options.seed)
    finally:
        ## Always release the files, even if subsample raises.
        ## (close_files is expected to skip stdin/stdout.)
        common.close_files(infile, outfile)