import argparse

from pyspark import SparkContext


def main(argv=None):
    """Entry point when run from the command line."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input_dir', required=True)
    parser.add_argument('--file_format', default='sequence',
                        choices=('text', 'sequence'))
    parser.add_argument('-o', '--output_dir', required=True)
    parser.add_argument('--output_file_format', default='sequence',
                        choices=('text', 'sequence'))
    parser.add_argument('--config', default=None)
    parser.add_argument('-l', '--limit', default=None, type=int)
    # NOTE: with no type= given, an explicit value for -e arrives as a string,
    # and any non-empty string is truthy; only the default is a real bool.
    parser.add_argument('-e', '--emptylines', default=True)
    args = parser.parse_args()

    # Default the configuration to an empty dict here rather than in
    # add_argument (avoid a mutable container as a default value).
    args.config = args.config or {}

    sparkName = "testTokenizer"
    sc = SparkContext(appName=sparkName)

    # Remove the positional arguments; everything else is passed verbatim.
    kwargs = dict_minus(as_dict(args), "input_dir", "output_dir", "config")
    print('Got options:', kwargs)
    testTokenizer(sc, args.input_dir, args.output_dir, args.config, **kwargs)
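# as_dict and dict_minus are project helpers that are not defined in this
# file. main() only relies on the behavior sketched below; these bodies are
# an assumption, not the project's actual implementation.
def as_dict(obj):
    # Assumed behavior: expose the attributes of an argparse Namespace
    # (or an optparse Values object) as a plain dict; vars() does this.
    return dict(vars(obj))


def dict_minus(d, *keys):
    # Assumed behavior: return a copy of d without the named keys.
    return {k: v for k, v in d.items() if k not in keys}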
dest="threshold", default=0.0, help="similarity threshold") parser.add_option("-e", "--base", dest="base", type="string", help="base file", default="") parser.add_option("-o", "--outputformat", dest="outputformat", type="string", help="output file format: text/sequence", default="text") parser.add_option("-y", "--outputtype", dest="outputtype", type="string", help="output type: csv/json", default="json") parser.add_option("-k", "--topk", dest="topk", type="int", help="top n matches", default=3) parser.add_option("-x", "--numPartitions", dest="numPartitions", type="int", help="number of partitions", default=10) parser.add_option("-z", "--candidatesName", dest="candidates_name", type="string", help="name for json element for matching candidates", default="candidates") (c_options, args) = parser.parse_args() print "Got options:", c_options inputFilename = args[0] outputFilename = args[1] print "Save to:", outputFilename kwargs = as_dict(c_options) clusterer = Clusterer(**kwargs) fileUtil = FileUtil(sc) rdd = fileUtil.load_file(inputFilename,file_format='text') cluster_rdd = clusterer.compute_clusters(rdd) fileUtil.save_file(cluster_rdd,outputFilename,file_format='text')
    # Constructor of the tokenizer class (the class statement itself is not
    # shown in the source). All keyword options are kept; the JSON config
    # file drives tokenization.
    def __init__(self, config_filename, **p_options):
        self.options = as_dict(p_options)
        self.config = FileUtil.get_json_config(config_filename)
        print('In tokenizer')
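# FileUtil.get_json_config is not shown in this file. A minimal sketch of the
# assumed behavior, written as a hypothetical standalone equivalent: read a
# JSON config file from the local filesystem and return the parsed dict.
import json

def get_json_config_sketch(config_filename):
    with open(config_filename) as f:
        return json.load(f)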