Ejemplo n.º 1
0
def parse_args(args=None):
    """Build the command-line interface and parse ``args``.

    A top-level ``--logging`` option is defined together with five
    sub-commands: ``simulate``, ``cluster``, ``analyze``, ``mapper`` and
    ``reducer``.  Each sub-command records its handler in the ``func``
    attribute of the returned namespace.

    :param args: argument strings to parse.  Forwarded to the parser's
        ``parse_args``; with ``None`` the parser applies its own default
        (``sys.argv[1:]`` for argparse-style parsers).
    :return: the namespace produced by the parser.
    """
    parser = PathArgumentParser(
        description="Simulate data and/or run analysis")

    # ``logging._levelNames`` was removed in Python 3.4, where the
    # name -> level table lives in ``logging._nameToLevel``.  Prefer the newer
    # attribute and fall back to the legacy one so both interpreters work.
    level_table = getattr(logging, '_nameToLevel', None)
    if level_table is None:
        level_table = logging._levelNames
    parser.add_argument(
        '--logging', type=str, default='WARN', help="Logging level",
        choices=[key for key in level_table if isinstance(key, str)])

    subparsers = parser.add_subparsers()

    # simulate: simulation options, output only
    p_simul = subparsers.add_parser('simulate', help='generate simulation')
    add_simul_args(p_simul)
    p_simul.add_argument(
        '--output', type=GzipFileType('w'), default=sys.stdout, help='File output')
    p_simul.set_defaults(func=do_simulation)

    # cluster: input -> clustering options -> output
    p_clust = subparsers.add_parser('cluster', help='run clustering')
    p_clust.add_argument(
        '--input', type=GzipFileType('r'), default=sys.stdin, help='File input')
    add_clust_args(p_clust)
    p_clust.add_argument(
        '--output', type=GzipFileType('w'), default=sys.stdout, help='File output')
    p_clust.set_defaults(func=do_cluster)

    # analyze: input -> analysis options -> output
    p_analy = subparsers.add_parser('analyze', help='run analysis')
    p_analy.add_argument(
        '--input', type=GzipFileType('r'), default=sys.stdin, help='File input')
    add_analy_args(p_analy)
    p_analy.add_argument(
        '--output', type=GzipFileType('w'), default=sys.stdout, help='File output')
    p_analy.set_defaults(func=do_analyze)

    # mapper: accepts the options of all three steps at once
    p_mapper = subparsers.add_parser(
        'mapper', help='Perform multiple steps')
    add_simul_args(p_mapper)
    add_clust_args(p_mapper)
    add_analy_args(p_mapper)
    p_mapper.add_argument(
        '--output', type=GzipFileType('w'), default=sys.stdout, help='File output')
    p_mapper.set_defaults(func=do_mapper)

    # reducer: unlike the other commands, ``--output`` is a plain string
    # naming a directory rather than an opened file.
    p_reducer = subparsers.add_parser('reducer', help='summarize analysis results')
    add_analy_args(p_reducer)
    p_reducer.add_argument(
        '--input', type=GzipFileType('r'), default=sys.stdin, help='File input')
    p_reducer.add_argument(
        '--fig_title', type=str, default=None, help='Title (for figures generated)')
    p_reducer.add_argument(
        '--fig_format', type=str, default='svg', help='Figure format')
    p_reducer.add_argument(
        '--output', type=str, metavar='DIR', help='Output directory')
    p_reducer.set_defaults(func=do_reducer)

    # Forward ``args`` so callers can supply an explicit argument list; the
    # parameter used to be accepted but silently ignored.
    namespace = parser.parse_args(args)
    return namespace
Ejemplo n.º 2
0
def parse_args(args=None):
    """Build the ``mapper`` / ``reducer`` command-line parser and parse ``args``.

    A top-level ``--logging`` option is defined together with two
    sub-commands.  Each sub-command records its handler (``do_mapper`` or
    ``do_reducer``) in the ``func`` attribute of the returned namespace.
    Both sub-commands require ``--metrics``.

    :param args: argument strings to parse.  Forwarded to the parser's
        ``parse_args``; with ``None`` the parser applies its own default
        (``sys.argv[1:]`` for argparse-style parsers).
    :return: the namespace produced by the parser.
    """
    parser = PathArgumentParser()

    # ``logging._levelNames`` was removed in Python 3.4, where the
    # name -> level table lives in ``logging._nameToLevel``.  Prefer the newer
    # attribute and fall back to the legacy one so both interpreters work.
    level_table = getattr(logging, "_nameToLevel", None)
    if level_table is None:
        level_table = logging._levelNames
    parser.add_argument(
        "--logging",
        type=str,
        default="WARN",
        help="Logging level",
        choices=[key for key in level_table if isinstance(key, str)],
    )

    subparsers = parser.add_subparsers()

    p_mapper = subparsers.add_parser("mapper")
    p_mapper.add_argument("--h0_err", type=float, default=1.0, help="H0 error rate")
    p_mapper.add_argument("--h1_err", type=float, default=0.5, help="H1 error rate")
    p_mapper.add_argument("--population_size", type=int, default=2000, help="population size")
    p_mapper.add_argument("--sim_size", type=int, default=1000, help="Simulation size")
    p_mapper.add_argument("--nclusters", type=int, default=20, help="number of clusters to generate")
    # The two flags below are integers used as booleans (0 = off).
    p_mapper.add_argument("--join_negatives", type=int, default=0, help="whether to join negatives (if split_join<0)")
    p_mapper.add_argument(
        "--split_join", type=int, default=0, help="number of splits (if positive) or joins (if negative) to perform"
    )
    p_mapper.add_argument("--sampling_warnings", type=int, default=0, help="if true, show sampling warnings")
    p_mapper.add_argument("--output", type=GzipFileType("w"), default=sys.stdout, help="Output file")
    p_mapper.add_argument("--metrics", type=str, required=True, nargs="*", help="Which metrics to compute")
    p_mapper.set_defaults(func=do_mapper)

    p_reducer = subparsers.add_parser("reducer")
    p_reducer.add_argument("--group_by", type=str, default=None, help="Field to group by")
    p_reducer.add_argument("--x_axis", type=str, default=None, help="Which column to plot as X axis")
    p_reducer.add_argument("--metrics", type=str, required=True, nargs="*", help="Which metrics to compute")
    p_reducer.add_argument("--input", type=GzipFileType("r"), default=sys.stdin, help="File input")
    # Unlike the mapper, the reducer's ``--output`` is a plain string naming
    # a directory rather than an opened file.
    p_reducer.add_argument("--output", type=str, metavar="DIR", help="Output directory")
    p_reducer.add_argument("--fig_title", type=str, default=None, help="Title (for figures generated)")
    p_reducer.add_argument("--fig_format", type=str, default="svg", help="Figure format")
    p_reducer.add_argument("--legend_loc", type=str, default="lower left", help="legend location")
    p_reducer.set_defaults(func=do_reducer)

    namespace = parser.parse_args(args)
    return namespace
Ejemplo n.º 3
0
def parse_args(args=None):
    """Parse the shared options plus the ``tokenize`` / ``train`` sub-commands.

    The chosen sub-command stores its handler (``run_tokenize`` or
    ``train_sentence_tokenizer``) in the ``func`` attribute of the result.

    :param args: argument strings handed to the parser's ``parse_args``
        (``None`` lets the parser use its own default).
    :return: the namespace produced by the parser.
    """
    top = PathArgumentParser()

    # Options common to every sub-command, registered in display order.
    shared_options = (
        ('--input', dict(
            type=GzipFileType('r'), default=[sys.stdin], nargs='*',
            help='Input file (in TSV format, optionally compressed)')),
        ('--field', dict(
            type=str, default='review',
            help='Field name (Default: review)')),
        ('--limit', dict(
            type=int, default=None,
            help='Only process this many lines (for testing)')),
        ('--n_jobs', dict(
            type=int, default=-1,
            help="Number of jobs to run")),
        ('--output', dict(
            type=GzipFileType('w'), default=sys.stdout,
            help='Output file')),
    )
    for flag, settings in shared_options:
        top.add_argument(flag, **settings)

    commands = top.add_subparsers()

    tokenize_cmd = commands.add_parser('tokenize')
    tokenize_cmd.add_argument(
        '--sentences', action='store_true', help='split on sentences')
    tokenize_cmd.set_defaults(func=run_tokenize)

    train_cmd = commands.add_parser('train')
    train_cmd.add_argument(
        '--verbose', action='store_true', help='be verbose')
    train_cmd.set_defaults(func=train_sentence_tokenizer)

    return top.parse_args(args)
Ejemplo n.º 4
0
def parse_args(args=None):
    """Parse the command-line options for this script.

    ``--output`` is the only mandatory option; ``--workers`` defaults to the
    CPU count reported by ``multiprocessing``.

    :param args: argument strings handed to the parser's ``parse_args``
        (``None`` lets the parser use its own default).
    :return: the namespace produced by the parser.
    """
    cli = PathArgumentParser()
    add = cli.add_argument  # short alias; every option goes on the same parser

    add('--input', nargs='+', metavar='FILE', type=str,
        help='Input files')
    add('--field', default='review', type=str,
        help='Field name (Default: review)')
    add('--verbose', action='store_true',
        help='be verbose')
    add('--output', required=True, type=str,
        help='where to save the model to')
    add('--limit', default=None, type=int,
        help='(for debugging) limit input to n lines')
    add('--corpus_model', default=None, type=str,
        help='where corpus model lives (GloVe)')
    add('--workers', default=multiprocessing.cpu_count(), type=int,
        help='Number of workers to use (default: same as number of CPUs)')
    add('--doc2vec', action='store_true',
        help='use Doc2Vec instead of Word2Vec model')

    return cli.parse_args(args)
Ejemplo n.º 5
0
def parse_args(args=None):
    """Parse the command-line options for this script.

    Every option is optional and defaults to ``None``.

    :param args: argument strings handed to the parser's ``parse_args``
        (``None`` lets the parser use its own default).
    :return: the namespace produced by the parser.
    """
    cli = PathArgumentParser()

    # (flag, keyword arguments) pairs, registered in display order.
    option_table = [
        ('--embedding', {
            'type': str, 'metavar': 'FILE', 'default': None,
            'help': 'Input word2vec (or doc2vec) model'}),
        ('--train', {
            'type': str, 'metavar': 'FILE', 'default': None,
            'help': '(Labeled) training set'}),
        ('--plot_features', {
            'type': str, 'default': None,
            'help': 'file to save feature comparison to'}),
        ('--sentences', {
            'type': GzipFileType('r'), 'default': None,
            'help': 'File containing sentences in JSON format (implies doc2vec)'}),
        ('--vectors', {
            'metavar': 'FILE', 'type': str, 'default': None,
            'help': 'File containing sentence vectors in Pickle format'}),
    ]
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)

    return cli.parse_args(args)
Ejemplo n.º 6
0
def parse_args(args=None):
    """Parse the command-line options for this script.

    All five options are optional and default to ``None``.

    :param args: argument strings handed to the parser's ``parse_args``
        (``None`` lets the parser use its own default).
    :return: the namespace produced by the parser.
    """
    cli = PathArgumentParser()
    register = cli.add_argument

    register(
        '--embedding',
        default=None,
        metavar='FILE',
        type=str,
        help='Input word2vec (or doc2vec) model',
    )
    register(
        '--train',
        default=None,
        metavar='FILE',
        type=str,
        help='(Labeled) training set',
    )
    register(
        '--plot_features',
        default=None,
        type=str,
        help='file to save feature comparison to',
    )
    register(
        '--sentences',
        default=None,
        type=GzipFileType('r'),
        help='File containing sentences in JSON format (implies doc2vec)',
    )
    register(
        '--vectors',
        default=None,
        metavar='FILE',
        type=str,
        help='File containing sentence vectors in Pickle format',
    )

    parsed = cli.parse_args(args)
    return parsed
Ejemplo n.º 7
0
def parse_args(args=None):
    """Parse the command-line options for this script.

    ``--output`` is the only mandatory option; ``--workers`` defaults to the
    CPU count reported by ``multiprocessing``.

    :param args: argument strings handed to the parser's ``parse_args``
        (``None`` lets the parser use its own default).
    :return: the namespace produced by the parser.
    """
    cli = PathArgumentParser()

    # (flag, keyword arguments) pairs, registered in display order.
    option_table = (
        ('--input', dict(
            type=str, metavar='FILE', nargs='+',
            help='Input files')),
        ('--verbose', dict(
            action='store_true',
            help='be verbose')),
        ('--output', dict(
            type=str, required=True,
            help='where to save the model to')),
        ('--limit', dict(
            type=int, default=None,
            help='(for debugging) limit input to n lines')),
        ('--corpus_model', dict(
            type=str, default=None,
            help='where corpus model lives (GloVe)')),
        ('--workers', dict(
            type=int, default=multiprocessing.cpu_count(),
            help='Number of workers to use (default: same as number of CPUs)')),
        ('--doc2vec', dict(
            action='store_true',
            help='use Doc2Vec instead of Word2Vec model')),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)

    return cli.parse_args(args)
Ejemplo n.º 8
0
def parse_args(args=None):
    """Parse the shared options plus the ``tokenize`` / ``train`` sub-commands.

    The chosen sub-command stores its handler (``run_tokenize`` or
    ``train_sentence_tokenizer``) in the ``func`` attribute of the result.

    :param args: argument strings handed to the parser's ``parse_args``
        (``None`` lets the parser use its own default).
    :return: the namespace produced by the parser.
    """
    top = PathArgumentParser()

    # The two input options differ only in flag name and help text.
    input_options = (
        ('--input_labeled',
         'Labeled input files (TSV format, optionally compressed)'),
        ('--input_unlabeled',
         'Unlabeled input files (TSV format, optionally compressed)'),
    )
    for flag, description in input_options:
        top.add_argument(
            flag, type=GzipFileType('r'), default=(), required=False,
            nargs='*', help=description)

    top.add_argument(
        '--n_jobs', type=int, default=-1, help="Number of jobs to run")
    top.add_argument(
        '--output', type=GzipFileType('w'), default=sys.stdout,
        help='Output file')

    commands = top.add_subparsers()

    tokenize_cmd = commands.add_parser('tokenize')
    tokenize_cmd.add_argument(
        '--sentences', action='store_true', help='split on sentences')
    tokenize_cmd.set_defaults(func=run_tokenize)

    train_cmd = commands.add_parser('train')
    train_cmd.add_argument(
        '--verbose', action='store_true', help='be verbose')
    train_cmd.set_defaults(func=train_sentence_tokenizer)

    return top.parse_args(args)
Ejemplo n.º 9
0
def parse_args(args=None):
    """Build the ``mapper`` / ``reducer`` command-line parser and parse ``args``.

    A top-level ``--logging`` option is defined together with two
    sub-commands.  Each sub-command records its handler (``do_mapper`` or
    ``do_reducer``) in the ``func`` attribute of the returned namespace.
    Both sub-commands require ``--metrics``.

    :param args: argument strings to parse.  Forwarded to the parser's
        ``parse_args``; with ``None`` the parser applies its own default
        (``sys.argv[1:]`` for argparse-style parsers).
    :return: the namespace produced by the parser.
    """
    parser = PathArgumentParser()

    # ``logging._levelNames`` was removed in Python 3.4, where the
    # name -> level table lives in ``logging._nameToLevel``.  Prefer the newer
    # attribute and fall back to the legacy one so both interpreters work.
    level_table = getattr(logging, '_nameToLevel', None)
    if level_table is None:
        level_table = logging._levelNames
    parser.add_argument('--logging',
                        type=str,
                        default='WARN',
                        help="Logging level",
                        choices=[
                            key for key in level_table
                            if isinstance(key, str)
                        ])

    subparsers = parser.add_subparsers()

    p_mapper = subparsers.add_parser('mapper')
    p_mapper.add_argument('--h0_err',
                          type=float,
                          default=1.0,
                          help='H0 error rate')
    p_mapper.add_argument('--h1_err',
                          type=float,
                          default=0.5,
                          help='H1 error rate')
    p_mapper.add_argument('--population_size',
                          type=int,
                          default=2000,
                          help='population size')
    p_mapper.add_argument('--sim_size',
                          type=int,
                          default=1000,
                          help='Simulation size')
    p_mapper.add_argument('--nclusters',
                          type=int,
                          default=20,
                          help='number of clusters to generate')
    # The flags ``--join_negatives`` and ``--sampling_warnings`` are integers
    # used as booleans (0 = off).
    p_mapper.add_argument('--join_negatives',
                          type=int,
                          default=0,
                          help='whether to join negatives (if split_join<0)')
    p_mapper.add_argument(
        '--split_join',
        type=int,
        default=0,
        help='number of splits (if positive) or joins (if negative) to perform'
    )
    p_mapper.add_argument('--sampling_warnings',
                          type=int,
                          default=0,
                          help='if true, show sampling warnings')
    p_mapper.add_argument('--output',
                          type=GzipFileType('w'),
                          default=sys.stdout,
                          help='Output file')
    p_mapper.add_argument('--metrics',
                          type=str,
                          required=True,
                          nargs='*',
                          help='Which metrics to compute')
    p_mapper.set_defaults(func=do_mapper)

    p_reducer = subparsers.add_parser('reducer')
    p_reducer.add_argument('--group_by',
                           type=str,
                           default=None,
                           help='Field to group by')
    p_reducer.add_argument('--x_axis',
                           type=str,
                           default=None,
                           help='Which column to plot as X axis')
    p_reducer.add_argument('--metrics',
                           type=str,
                           required=True,
                           nargs='*',
                           help='Which metrics to compute')
    p_reducer.add_argument('--input',
                           type=GzipFileType('r'),
                           default=sys.stdin,
                           help='File input')
    # Unlike the mapper, the reducer's ``--output`` is a plain string naming
    # a directory rather than an opened file.
    p_reducer.add_argument('--output',
                           type=str,
                           metavar='DIR',
                           help='Output directory')
    p_reducer.add_argument('--fig_title',
                           type=str,
                           default=None,
                           help='Title (for figures generated)')
    p_reducer.add_argument('--fig_format',
                           type=str,
                           default='svg',
                           help='Figure format')
    p_reducer.add_argument('--legend_loc',
                           type=str,
                           default='lower left',
                           help='legend location')
    p_reducer.set_defaults(func=do_reducer)

    namespace = parser.parse_args(args)
    return namespace