def main(argv=None):
    '''this is called if run from command line'''
    parser = argparse.ArgumentParser()
    parser.add_argument('-i','--input_dir', required=True)
    parser.add_argument('--file_format', default='sequence', choices=('text', 'sequence'))

    parser.add_argument('-o','--output_dir', required=True)
    parser.add_argument('--output_file_format', default='sequence', choices=('text', 'sequence'))

    parser.add_argument('--config', default=None)

    parser.add_argument('-l','--limit', required=False, default=None, type=int)
    parser.add_argument('-e','--emptylines',required=False,default=True)
    args=parser.parse_args()
    # Default configuration to empty config
    # (avoid mutable container as default)
    args.config = args.config or {}

    sparkName = "testTokenizer"
    sc = SparkContext(appName=sparkName)

    # remove positional args, everything else passed verbatim
    kwargs = dict_minus(as_dict(args), "input_dir", "output_dir", "config")
    print 'Got Options : ',kwargs
    testTokenizer(sc, args.input_dir, args.output_dir, args.config, **kwargs)
def main(argv=None):
    '''this is called if run from command line'''
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input_dir', required=True)
    parser.add_argument('--file_format',
                        default='sequence',
                        choices=('text', 'sequence'))

    parser.add_argument('-o', '--output_dir', required=True)
    parser.add_argument('--output_file_format',
                        default='sequence',
                        choices=('text', 'sequence'))

    parser.add_argument('--config', default=None)

    parser.add_argument('-l',
                        '--limit',
                        required=False,
                        default=None,
                        type=int)
    parser.add_argument('-e', '--emptylines', required=False, default=True)
    args = parser.parse_args()
    # Default configuration to empty config
    # (avoid mutable container as default)
    args.config = args.config or {}

    sparkName = "testTokenizer"
    sc = SparkContext(appName=sparkName)

    # remove positional args, everything else passed verbatim
    kwargs = dict_minus(as_dict(args), "input_dir", "output_dir", "config")
    print 'Got Options : ', kwargs
    testTokenizer(sc, args.input_dir, args.output_dir, args.config, **kwargs)
    # NOTE(review): this span is a fragment — the enclosing function header and
    # the opening `parser.add_option("-t", ...` line of the first option are
    # missing from this view, so the code below cannot run as-is. It appears to
    # be an optparse-based CLI for the Clusterer — TODO confirm against the
    # original file.
                      dest="threshold", default=0.0, help="similarity threshold")
    parser.add_option("-e", "--base", dest="base", type="string",
                      help="base file", default="")
    parser.add_option("-o", "--outputformat", dest="outputformat", type="string",
                      help="output file format: text/sequence", default="text")
    parser.add_option("-y", "--outputtype", dest="outputtype", type="string",
                      help="output type: csv/json", default="json")
    parser.add_option("-k", "--topk", dest="topk", type="int",
                      help="top n matches", default=3)
    parser.add_option("-x", "--numPartitions", dest="numPartitions", type="int",
                      help="number of partitions", default=10)
    parser.add_option("-z", "--candidatesName", dest="candidates_name", type="string",
                        help="name for json element for matching candidates", default="candidates")
    # optparse returns (options, leftover positional args).
    (c_options, args) = parser.parse_args()
    print "Got options:", c_options
    # Positional arguments: input path first, output path second.
    inputFilename = args[0]
    outputFilename = args[1]
    print "Save to:", outputFilename

    # Every parsed option is forwarded verbatim to the Clusterer constructor.
    kwargs = as_dict(c_options)
    clusterer = Clusterer(**kwargs)
    fileUtil = FileUtil(sc)

    rdd = fileUtil.load_file(inputFilename,file_format='text')

    cluster_rdd = clusterer.compute_clusters(rdd)

    fileUtil.save_file(cluster_rdd,outputFilename,file_format='text')

Example #4 (0 votes)
 def __init__(self, config_filename, **p_options):
     """Load the tokenizer's JSON configuration and keep extra keyword options."""
     self.config = FileUtil.get_json_config(config_filename)
     self.options = as_dict(p_options)
     print('In tokenizer')
Example #5 (0 votes)
 def __init__(self, config_filename, **p_options):
     """Initialize the tokenizer from a JSON config file plus keyword options."""
     # Keep caller-supplied keyword options as a plain dict for later lookup.
     self.options = as_dict(p_options)
     # Parse the JSON configuration file via the project FileUtil helper.
     self.config = FileUtil.get_json_config(config_filename)
     # Debug trace confirming construction.
     print('In tokenizer')