# Three command-line entry points, collected from make_autocomplete_list.py,
# cooler_to_tiles.py, and make_tiles.py (each script defines its own main()).
import argparse
import collections as col
import math
import multiprocessing as mpr
import sys

import cooler
import h5py

# Module aliases inferred from how they are used below; the exact import
# paths depend on the repo layout:
import clodius.save_tiles as cst   # ElasticSearchTileSaver, ColumnFileTileSaver, tile_saver_worker
import clodius.fpark as cfp        # FakeSparkContext
import negspy.coordinates as nc    # get_chrominfo
# `chg` provides getInfo() for multi-resolution cooler files (repo-local helper).


# --- make_autocomplete_list.py ---
def main():
    parser = argparse.ArgumentParser(description="""
    python make_autocomplete_list.py processed-ski-area-elevations.json

    Create jsons for searching for ski areas. These will consist of
    all the n-grams found in the ski area names.
""")
    parser.add_argument('input_file', nargs=1)
    parser.add_argument('-n', '--name', default='name',
                        help='The field in the json entry which specifies its name')
    parser.add_argument('-i', '--importance', default='importance',
                        help='The field in the json entry which specifies how '
                             'important it is (more important entries are '
                             'displayed higher up in the autocomplete suggestions)')
    parser.add_argument('-m', '--max-entries-per-autocomplete', default=10,
                        help='The maximum number of entries to be displayed '
                             'in the autocomplete suggestions')
    parser.add_argument('-r', '--reverse-importance', default=False,
                        action='store_true',
                        help='Use the reverse sorting of the importance value '
                             'to gauge the worth of individual entries')
    parser.add_argument('-c', '--column-names', default=None,
                        help='The column names for the input tsv file')
    parser.add_argument('-d', '--delimiter', default=None,
                        help='The delimiter separating columns in the input file')
    parser.add_argument('--elasticsearch-url', default=None,
                        help='Specify elasticsearch nodes to push the completions to')
    parser.add_argument('--print-status', action='store_true')

    args = parser.parse_args()

    tile_saver = cst.ElasticSearchTileSaver(es_path=args.elasticsearch_url,
                                            print_status=args.print_status)
    dataFile = cfp.FakeSparkContext.textFile(args.input_file[0])

    if args.column_names is not None:
        args.column_names = args.column_names.split(',')

    # parse each line into a {column_name: value} dict
    if args.delimiter is None:
        dataFile = (dataFile.map(lambda x: x.split())
                            .map(lambda x: dict(zip(args.column_names, x))))
    else:
        print("delimiter:", args.delimiter)
        dataFile = (dataFile.map(lambda x: x.split(args.delimiter))
                            .map(lambda x: dict(zip(args.column_names, x))))

    print("one:", dataFile.take(1))

    make_autocomplete_list(dataFile, args, tile_saver)
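
# `make_autocomplete_list` is defined elsewhere in the repo. As a rough
# illustration of the n-gram indexing the usage string above describes, a
# minimal sketch might look like the following (the helper `name_ngrams` is
# hypothetical, not part of this repo, and the real script may tokenize
# differently, e.g. into character n-grams):
def name_ngrams(name):
    """Return the word-suffix n-grams of a name, so that a search for any
    word in the name can surface the entry, e.g.
    'Alta Ski Area' -> ['Alta Ski Area', 'Ski Area', 'Area']."""
    words = name.split()
    return [" ".join(words[i:]) for i in range(len(words))]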

# --- cooler_to_tiles.py ---
def main():
    parser = argparse.ArgumentParser(description="""
    python cooler_to_tiles.py cooler_file

    Requires the cooler package.
""")
    parser.add_argument("filepath")
    parser.add_argument("-e", "--elasticsearch-url", default=None,
                        help="The url of the elasticsearch database where to save the tiles")
    parser.add_argument("-b", "--bins-per-dimension", default=1, type=int,
                        help="The number of bins to consider in each dimension")
    parser.add_argument("-f", "--columnfile-path", default=None,
                        help="The path to the column file where to save the tiles")
    parser.add_argument("--assembly", default=None)
    parser.add_argument("--log-file", default=None)
    parser.add_argument("--resolution", default=1000, type=int)
    parser.add_argument("--max-zoom", default=None, type=int)
    parser.add_argument("--num-threads", default=4, type=int)

    args = parser.parse_args()

    tileset_info = chg.getInfo(args.filepath)
    num_dimensions = 2
    bins_per_dimension = tileset_info["bins_per_dimension"]
    max_data_in_sparse = bins_per_dimension ** num_dimensions // 10

    if args.elasticsearch_url is not None:
        tile_saver = cst.ElasticSearchTileSaver(
            max_data_in_sparse,
            bins_per_dimension,
            es_path=args.elasticsearch_url,
            log_file=args.log_file,
            num_dimensions=num_dimensions,
        )
    else:
        tile_saver = cst.ColumnFileTileSaver(
            max_data_in_sparse,
            bins_per_dimension,
            file_path=args.columnfile_path,
            log_file=args.log_file,
            num_dimensions=num_dimensions,
        )

    if args.max_zoom is not None and args.max_zoom < tileset_info["max_zoom"]:
        max_zoom_to_generate = args.max_zoom
    else:
        max_zoom_to_generate = tileset_info["max_zoom"]

    coolers_matrix = {}
    queue = mpr.Queue()
    tilesaver_processes = []
    finished = mpr.Value("b", False)

    print("num_threads:", args.num_threads)

    # start the worker processes that drain the tile queue and save tiles
    for i in range(args.num_threads):
        p = mpr.Process(target=cst.tile_saver_worker,
                        args=(queue, tile_saver, finished))
        p.daemon = True
        p.start()
        tilesaver_processes += [(tile_saver, p)]

    tileset_info["max_value"] = 0
    tileset_info["min_value"] = 0

    tile_saver.save_tile({"tile_id": "tileset_info", "tile_value": tileset_info})
    tile_saver.flush()

    try:
        # open the multi-resolution cooler file once and cache a cooler and a
        # matrix selector for every zoom level; the file must stay open while
        # the tiles are being generated
        with h5py.File(args.filepath, "r") as f:
            for i in range(max_zoom_to_generate + 1):
                c = cooler.Cooler(f[str(i)])
                matrix = c.matrix(balance=True, as_pixels=True, join=True)
                coolers_matrix[i] = {"cooler": c, "matrix": matrix}

            recursive_generate_tiles(
                col.deque([(0, 0, 0)]),
                coolers_matrix,
                tileset_info,
                args.resolution,
                max_zoom_to_generate,
                queue,
            )
    except KeyboardInterrupt:
        print("kb interrupt:")
        for (ts, p) in tilesaver_processes:
            p.terminate()
            p.join()
        print("finished")
        raise

    finished.value = True

    # wait for the worker processes to finish
    for (ts, p) in tilesaver_processes:
        p.join()

    print("tileset_info:", tileset_info)
    tile_saver.save_tile({"tile_id": "tileset_info", "tile_value": tileset_info})
    tile_saver.flush()
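
# `cst.tile_saver_worker` above is the project's queue-draining worker; its
# implementation isn't shown in this file. As a minimal sketch of the
# producer/consumer shape it has to have (assumed behavior, not the actual
# implementation), a compatible worker could look like this:
import queue as std_queue

def sketch_tile_saver_worker(q, tile_saver, finished):
    # keep draining tiles until the producer flips `finished` AND the queue
    # is empty, so no tile enqueued before the flag was set is lost
    while True:
        try:
            tile = q.get(timeout=1)
            tile_saver.save_tile(tile)
        except std_queue.Empty:
            if finished.value:
                break
    tile_saver.flush()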
def main(): """ python make_tiles.py input_file Create tiles for all of the entries in the JSON file. """ parser = argparse.ArgumentParser() # parser.add_argument('-o', '--options', dest='some_option', default='yo', help="Place holder for a real option", type='str') # parser.add_argument('-u', '--useless', dest='uselesss', default=False, action='store_true', help='Another useless option') parser.add_argument("--min-pos", help="The minimum range for the tiling") parser.add_argument("--max-pos", help="The maximum range for the tiling") parser.add_argument("--assembly", default=None) parser.add_argument("-r", "--resolution", help="The resolution of the data", default=None, type=int) parser.add_argument( "-k", "--position-cols", help="The position columns (defaults to all but the last, 1-based)", default=None, ) parser.add_argument( "-v", "--value-pos", help="The value column (defaults to the last one, 1-based)", default=None, type=str, ) parser.add_argument("-z", "--max-zoom", help="The maximum zoom value", default=None, type=int) parser.add_argument("--expand-range", help="Expand ranges of values") parser.add_argument( "--ignore-0", help="Ignore ranges with a zero value", default=False, action="store_true", ) parser.add_argument( "-b", "--bins-per-dimension", default=1, help="The number of bins to consider in each dimension", type=int, ) parser.add_argument( "-e", "--elasticsearch-url", default=None, help="The url of the elasticsearch database where to save the tiles", ) parser.add_argument( "-f", "--columnfile-path", default=None, help="The path to the column file where to save the tiles", ) parser.add_argument("-n", "--num-threads", default=4, type=int) parser.add_argument("--triangular", default=False, action="store_true") parser.add_argument("--log-file", default=None) parser.add_argument("--max-queue-size", default=40000, type=int) parser.add_argument("--print-status", default=None, type=int) args = parser.parse_args() if args.resolution is None and args.max_zoom is None: print("One of --resolution and --max-zoom must be set", file=sys.stderr) sys.exit(1) first_line = sys.stdin.readline() first_line_parts = first_line.strip().split() if len(first_line_parts) == 0: print("ERROR: no input") return if args.position_cols is not None: position_cols = list(map(int, args.position_cols.split(","))) else: position_cols = None # if specific position columns aren't specified, use all but the last column if position_cols is None: position_cols = list(range(1, len(first_line_parts))) if args.assembly is not None: mins = [1 for p in position_cols] maxs = [ nc.get_chrominfo(args.assembly).total_length for p in position_cols ] else: mins = [float(p) for p in args.min_pos.split(",")] maxs = [float(p) for p in args.max_pos.split(",")] max_width = max([b - a for (a, b) in zip(mins, maxs)]) if args.expand_range is not None: expand_range = list(map(int, args.expand_range.split(","))) else: expand_range = None if args.max_zoom is None: # determine the maximum zoom level based on the domain of the data # and the resolution bins_to_display_at_max_resolution = (max_width // args.resolution // args.bins_per_dimension) max_max_zoom = math.ceil( math.log(bins_to_display_at_max_resolution) / math.log(2.0)) if max_max_zoom < 0: max_max_zoom = 0 max_zoom = int(max_max_zoom) else: max_zoom = args.max_zoom # print("max_zoom:", max_zoom) max_width = args.resolution * args.bins_per_dimension * 2**max_zoom value_pos = args.value_pos # if there's not column designated as the value column, use the last column if value_pos 
is None: value_pos = [len(first_line_parts) - 1] else: value_pos = [int(vp) - 1 for vp in value_pos.split(",")] max_data_in_sparse = args.bins_per_dimension**len(position_cols) // 10 """ if args.elasticsearch_url is not None: tile_saver = cst.ElasticSearchTileSaver(max_data_in_sparse, args.bins_per_dimension, num_dimensions = len(position_cols), es_path = args.elasticsearch_url) else: tile_saver = cst.EmptyTileSaver(max_data_in_sparse, args.bins_per_dimension, num_dimensions = len(position_cols)) """ print( "maxs:", maxs, "max_zoom:", max_zoom, "max_data_in_sparse:", max_data_in_sparse, "url:", args.elasticsearch_url, ) # bin_counts = col.defaultdict(col.defaultdict(int)) q = mpr.Queue(maxsize=args.max_queue_size) tilesaver_processes = [] finished = mpr.Value("b", False) if args.elasticsearch_url is not None: tile_saver = cst.ElasticSearchTileSaver( max_data_in_sparse, args.bins_per_dimension, len(position_cols), args.elasticsearch_url, args.log_file, args.print_status, initial_value=[0.0 for vp in value_pos], ) else: tile_saver = cst.ColumnFileTileSaver( max_data_in_sparse, args.bins_per_dimension, len(position_cols), args.columnfile_path, args.log_file, args.print_status, initial_value=[0.0 for vp in value_pos], ) for i in range(args.num_threads): p = mpr.Process(target=cst.tile_saver_worker, args=(q, tile_saver, finished)) p.daemon = True p.start() tilesaver_processes += [(tile_saver, p)] tileset_info = { "max_value": [0 for vp in value_pos], "min_value": [0 for vp in value_pos], "min_pos": mins, "max_pos": maxs, "max_zoom": max_zoom, "bins_per_dimension": args.bins_per_dimension, "max_width": max_width, } tile_saver.save_tile({ "tile_id": "tileset_info", "tile_value": tileset_info }) tile_saver.flush() try: tileset_info = create_tiles( q, [first_line], sys.stdin, position_cols, value_pos, max_zoom, args.bins_per_dimension, tile_saver, expand_range, args.ignore_0, tileset_info, max_width, args.triangular, args.max_queue_size, print_status=args.print_status, ) except KeyboardInterrupt: for (ts, p) in tilesaver_processes: ts.flush() p.terminate() p.join() raise finished.value = True # wait for the worker processes to finish for (ts, p) in tilesaver_processes: p.join() print("tileset_info:", tileset_info) tile_saver.save_tile({ "tile_id": "tileset_info", "tile_value": tileset_info }) tile_saver.flush()
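
# Worked example of the max-zoom computation in main() above (the numbers are
# illustrative, not from the repo): with max_width = 3_000_000_000 (roughly a
# human-genome-sized extent), resolution = 1000 and bins_per_dimension = 256,
# the finest zoom level has to cover 3e9 // 1000 // 256 = 11718 bins, so
# max_zoom = ceil(log2(11718)) = 14, and the padded extent becomes
# 1000 * 256 * 2**14 = 4_194_304_000 -- the smallest power-of-two tile
# pyramid that covers the data.
assert math.ceil(math.log(3_000_000_000 // 1000 // 256) / math.log(2.0)) == 14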