Example 1
import argparse

# Project-local modules from the clodius repo (assumed aliases):
import clodius.fpark as cfp       # FakeSparkContext
import clodius.save_tiles as cst  # ElasticSearchTileSaver


def main():
    parser = argparse.ArgumentParser(description="""

    python make_autocomplete_list.py processed-ski-area-elevations.json

    Create jsons for searching for ski areas. These will consist
    of all the n-grams found in the ski area names.
""")

    parser.add_argument('input_file', nargs=1)
    parser.add_argument('-n', '--name', default='name',
                        help='The field in the json entry which specifies its name')
    parser.add_argument('-i', '--importance', default='importance',
                        help='The field in the json entry which specifies how '
                             'important it is (more important entries are '
                             'displayed higher up in the autocomplete '
                             'suggestions)')
    parser.add_argument('-m', '--max-entries-per-autocomplete', default=10,
                        type=int,
                        help='The maximum number of entries to be displayed '
                             'in the autocomplete suggestions')
    parser.add_argument('-r', '--reverse-importance', default=False,
                        action='store_true',
                        help='Use the reverse sorting of the importance value '
                             'to gauge the worth of individual entries')
    parser.add_argument('-c', '--column-names',
                        help="The column names for the input tsv file",
                        default=None)
    parser.add_argument('-d', '--delimiter', default=None,
                        help='The delimiter separating columns in the input file')

    parser.add_argument('--elasticsearch-url',
                        help='Specify elasticsearch nodes to push the completions to',
                        default=None)
    parser.add_argument('--print-status', action="store_true")

    args = parser.parse_args()

    tile_saver = cst.ElasticSearchTileSaver(es_path=args.elasticsearch_url,
                                            print_status=args.print_status)

    dataFile = cfp.FakeSparkContext.textFile(args.input_file[0])

    if args.column_names is not None:
        args.column_names = args.column_names.split(',')
    else:
        # zip() with None below would raise a TypeError; the column names
        # are required to map each row's fields to dict keys
        parser.error('--column-names is required to name the input columns')

    if args.delimiter is None:
        dataFile = (dataFile.map(lambda x: x.split())
                            .map(lambda x: dict(zip(args.column_names, x))))
    else:
        print("delimiter:", args.delimiter)
        dataFile = (dataFile.map(lambda x: x.split(args.delimiter))
                            .map(lambda x: dict(zip(args.column_names, x))))

    print("one:", dataFile.take(1))

    make_autocomplete_list(dataFile, args, tile_saver)
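
The docstring above says the output consists of all n-grams found in the ski area names; `make_autocomplete_list` itself is not shown in this example. As a rough sketch of that idea, using word-level n-grams and the name/importance fields from the argparse options (all function names here are hypothetical, not the project's actual implementation):

from collections import defaultdict

def ngrams(name):
    # every contiguous word sequence in a name, lower-cased
    words = name.lower().split()
    for i in range(len(words)):
        for j in range(i + 1, len(words) + 1):
            yield ' '.join(words[i:j])

def build_autocomplete(entries, name_field='name',
                       importance_field='importance', max_entries=10,
                       reverse_importance=False):
    # map each n-gram to its top matching entries, best first
    index = defaultdict(list)
    for entry in entries:
        for gram in ngrams(entry[name_field]):
            index[gram].append(entry)
    return {gram: sorted(matches,
                         key=lambda e: float(e[importance_field]),
                         reverse=not reverse_importance)[:max_entries]
            for gram, matches in index.items()}

print(build_autocomplete([{'name': 'Aspen Highlands', 'importance': '5'},
                          {'name': 'Aspen Mountain', 'importance': '7'}])['aspen'])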
Example 2
import argparse
import collections as col
import multiprocessing as mpr

import cooler
import h5py

# Project-local modules from the clodius repo (assumed aliases):
import clodius.higlass_getter as chg  # getInfo (assumed module name)
import clodius.save_tiles as cst      # tile savers and tile_saver_worker

# (recursive_generate_tiles is assumed to be defined elsewhere in the
# same script)


def main():
    parser = argparse.ArgumentParser(description="""
    python cooler_to_tiles.py cooler_file

    Requires the cooler package.
""")

    parser.add_argument("filepath")
    parser.add_argument(
        "-e",
        "--elasticsearch-url",
        default=None,
        help="The url of the elasticsearch database where to save the tiles",
    )
    parser.add_argument(
        "-b",
        "--bins-per-dimension",
        default=1,
        help="The number of bins to consider in each dimension",
        type=int,
    )
    parser.add_argument(
        "-f",
        "--columnfile-path",
        default=None,
        help="The path to the column file where to save the tiles",
    )
    parser.add_argument("--assembly", default=None)
    parser.add_argument("--log-file", default=None)
    parser.add_argument("--resolution", default=1000)
    parser.add_argument("--max-zoom", default=None, type=int)
    parser.add_argument("--num-threads", default=4, type=int)

    args = parser.parse_args()
    tileset_info = chg.getInfo(args.filepath)

    num_dimensions = 2
    bins_per_dimension = tileset_info["bins_per_dimension"]
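    # threshold below which a tile is (presumably) worth storing in sparse
    # form; a dense tile would hold bins_per_dimension ** num_dimensions values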
    max_data_in_sparse = bins_per_dimension**num_dimensions / 10

    if args.elasticsearch_url is not None:
        tile_saver = cst.ElasticSearchTileSaver(
            max_data_in_sparse,
            bins_per_dimension,
            es_path=args.elasticsearch_url,
            log_file=args.log_file,
            num_dimensions=num_dimensions,
        )
    else:
        tile_saver = cst.ColumnFileTileSaver(
            max_data_in_sparse,
            bins_per_dimension,
            file_path=args.columnfile_path,
            log_file=args.log_file,
            num_dimensions=num_dimensions,
        )

    ############################################################################

    if args.max_zoom is not None and args.max_zoom < tileset_info["max_zoom"]:
        max_zoom_to_generate = args.max_zoom
    else:
        max_zoom_to_generate = tileset_info["max_zoom"]

    coolers_matrix = {}
    queue = mpr.Queue()

    tilesaver_processes = []
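    # shared flag ('b' = signed char) that the saver workers poll to know
    # when no more tiles will be enqueued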
    finished = mpr.Value("b", False)

    print("num_threads:", args.num_threads)
    for i in range(args.num_threads):
        p = mpr.Process(target=cst.tile_saver_worker,
                        args=(queue, tile_saver, finished))

        p.daemon = True
        p.start()
        tilesaver_processes += [(tile_saver, p)]

    tileset_info["max_value"] = 0
    tileset_info["min_value"] = 0

    tile_saver.save_tile({
        "tile_id": "tileset_info",
        "tile_value": tileset_info
    })
    tile_saver.flush()

    try:
        # open the multi-resolution cooler file once, read-only (the
        # original reopened it on every iteration, shadowing `f`)
        with h5py.File(args.filepath, "r") as f:
            for i in range(max_zoom_to_generate + 1):
                c = cooler.Cooler(f[str(i)])
                matrix = c.matrix(balance=True, as_pixels=True, join=True)

                coolers_matrix[i] = {"cooler": c, "matrix": matrix}

            recursive_generate_tiles(
                col.deque([(0, 0, 0)]),
                coolers_matrix,
                tileset_info,
                args.resolution,
                max_zoom_to_generate,
                queue,
            )
    except KeyboardInterrupt:
        print("kb interrupt:")
        for (ts, p) in tilesaver_processes:
            p.terminate()
            p.join()
            print("finished")
        raise

    finished.value = True
    # wait for the worker processes to finish
    for (ts, p) in tilesaver_processes:
        p.join()

    print("tileset_info:", tileset_info)
    tile_saver.save_tile({
        "tile_id": "tileset_info",
        "tile_value": tileset_info
    })
    tile_saver.flush()
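
`cst.tile_saver_worker` is invoked here (and in the next example) with a queue, a tile saver, and a shared `finished` flag, but its body is not shown. A minimal worker loop consistent with that call signature might look like the following sketch; the real clodius implementation may differ:

import queue

def tile_saver_worker(q, tile_saver, finished):
    # keep draining even after `finished` is set, so tiles enqueued just
    # before shutdown are not lost
    while True:
        try:
            tile = q.get(timeout=0.5)
            tile_saver.save_tile(tile)
        except queue.Empty:
            if finished.value:
                break
    tile_saver.flush()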
Example 3
import argparse
import math
import multiprocessing as mpr
import sys

# Project-local modules (assumed aliases):
import clodius.save_tiles as cst  # tile savers and tile_saver_worker
import negspy.coordinates as nc   # assembly chromosome coordinates

# (create_tiles is assumed to be defined elsewhere in the same script)


def main():
    """
    python make_tiles.py input_file

    Create tiles for all of the entries in the JSON file.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("--min-pos", help="The minimum range for the tiling")
    parser.add_argument("--max-pos", help="The maximum range for the tiling")
    parser.add_argument("--assembly", default=None)
    parser.add_argument("-r",
                        "--resolution",
                        help="The resolution of the data",
                        default=None,
                        type=int)
    parser.add_argument(
        "-k",
        "--position-cols",
        help="The position columns (defaults to all but the last, 1-based)",
        default=None,
    )
    parser.add_argument(
        "-v",
        "--value-pos",
        help="The value column (defaults to the last one, 1-based)",
        default=None,
        type=str,
    )
    parser.add_argument("-z",
                        "--max-zoom",
                        help="The maximum zoom value",
                        default=None,
                        type=int)
    parser.add_argument("--expand-range", help="Expand ranges of values")
    parser.add_argument(
        "--ignore-0",
        help="Ignore ranges with a zero value",
        default=False,
        action="store_true",
    )
    parser.add_argument(
        "-b",
        "--bins-per-dimension",
        default=1,
        help="The number of bins to consider in each dimension",
        type=int,
    )
    parser.add_argument(
        "-e",
        "--elasticsearch-url",
        default=None,
        help="The url of the elasticsearch database where to save the tiles",
    )
    parser.add_argument(
        "-f",
        "--columnfile-path",
        default=None,
        help="The path to the column file where to save the tiles",
    )
    parser.add_argument("-n", "--num-threads", default=4, type=int)
    parser.add_argument("--triangular", default=False, action="store_true")
    parser.add_argument("--log-file", default=None)
    parser.add_argument("--max-queue-size", default=40000, type=int)
    parser.add_argument("--print-status", default=None, type=int)

    args = parser.parse_args()

    if args.resolution is None and args.max_zoom is None:
        print("One of --resolution and --max-zoom must be set",
              file=sys.stderr)
        sys.exit(1)

    first_line = sys.stdin.readline()
    first_line_parts = first_line.strip().split()
    if len(first_line_parts) == 0:
        print("ERROR: no input")
        return

    if args.position_cols is not None:
        position_cols = list(map(int, args.position_cols.split(",")))
    else:
        # if specific position columns aren't specified, use all but the last column
        position_cols = list(range(1, len(first_line_parts)))

    if args.assembly is not None:
        mins = [1 for p in position_cols]
        maxs = [
            nc.get_chrominfo(args.assembly).total_length for p in position_cols
        ]
    else:
        # without an assembly, explicit ranges are required
        if args.min_pos is None or args.max_pos is None:
            print("Either --assembly or both --min-pos and --max-pos must be set",
                  file=sys.stderr)
            sys.exit(1)

        mins = [float(p) for p in args.min_pos.split(",")]
        maxs = [float(p) for p in args.max_pos.split(",")]

    # the widest extent across all position dimensions; the tiling must
    # cover at least this span
    max_width = max([b - a for (a, b) in zip(mins, maxs)])

    if args.expand_range is not None:
        expand_range = list(map(int, args.expand_range.split(",")))
    else:
        expand_range = None

    if args.max_zoom is None:
        # determine the maximum zoom level based on the domain of the data
        # and the resolution
        bins_to_display_at_max_resolution = (max_width // args.resolution //
                                             args.bins_per_dimension)
        max_max_zoom = math.ceil(
            math.log(bins_to_display_at_max_resolution) / math.log(2.0))

        if max_max_zoom < 0:
            max_max_zoom = 0

        max_zoom = int(max_max_zoom)
    else:
        max_zoom = args.max_zoom

    # print("max_zoom:", max_zoom)
    max_width = args.resolution * args.bins_per_dimension * 2**max_zoom

    value_pos = args.value_pos

    # if no column is designated as the value column, use the last one
    if value_pos is None:
        value_pos = [len(first_line_parts) - 1]
    else:
        value_pos = [int(vp) - 1 for vp in value_pos.split(",")]

    max_data_in_sparse = args.bins_per_dimension**len(position_cols) // 10
    """
    if args.elasticsearch_url is not None:
        tile_saver = cst.ElasticSearchTileSaver(max_data_in_sparse,
                                                args.bins_per_dimension,
                                                num_dimensions = len(position_cols),
                                                es_path = args.elasticsearch_url)
    else:
        tile_saver = cst.EmptyTileSaver(max_data_in_sparse,
                                        args.bins_per_dimension,
                                        num_dimensions = len(position_cols))
    """

    print(
        "maxs:",
        maxs,
        "max_zoom:",
        max_zoom,
        "max_data_in_sparse:",
        max_data_in_sparse,
        "url:",
        args.elasticsearch_url,
    )

    # bin_counts = col.defaultdict(col.defaultdict(int))
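    # bounded queue: the stdin parser blocks when the saver workers fall
    # behind instead of buffering tiles without limit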
    q = mpr.Queue(maxsize=args.max_queue_size)

    tilesaver_processes = []
    finished = mpr.Value("b", False)
    if args.elasticsearch_url is not None:
        tile_saver = cst.ElasticSearchTileSaver(
            max_data_in_sparse,
            args.bins_per_dimension,
            len(position_cols),
            args.elasticsearch_url,
            args.log_file,
            args.print_status,
            initial_value=[0.0 for vp in value_pos],
        )
    else:
        tile_saver = cst.ColumnFileTileSaver(
            max_data_in_sparse,
            args.bins_per_dimension,
            len(position_cols),
            args.columnfile_path,
            args.log_file,
            args.print_status,
            initial_value=[0.0 for vp in value_pos],
        )

    for i in range(args.num_threads):
        p = mpr.Process(target=cst.tile_saver_worker,
                        args=(q, tile_saver, finished))

        p.daemon = True
        p.start()
        tilesaver_processes += [(tile_saver, p)]

    tileset_info = {
        "max_value": [0 for vp in value_pos],
        "min_value": [0 for vp in value_pos],
        "min_pos": mins,
        "max_pos": maxs,
        "max_zoom": max_zoom,
        "bins_per_dimension": args.bins_per_dimension,
        "max_width": max_width,
    }

    tile_saver.save_tile({
        "tile_id": "tileset_info",
        "tile_value": tileset_info
    })
    tile_saver.flush()

    try:
        tileset_info = create_tiles(
            q,
            [first_line],
            sys.stdin,
            position_cols,
            value_pos,
            max_zoom,
            args.bins_per_dimension,
            tile_saver,
            expand_range,
            args.ignore_0,
            tileset_info,
            max_width,
            args.triangular,
            args.max_queue_size,
            print_status=args.print_status,
        )
    except KeyboardInterrupt:
        for (ts, p) in tilesaver_processes:
            ts.flush()
            p.terminate()
            p.join()
        raise

    finished.value = True
    # wait for the worker processes to finish
    for (ts, p) in tilesaver_processes:
        p.join()

    print("tileset_info:", tileset_info)
    tile_saver.save_tile({
        "tile_id": "tileset_info",
        "tile_value": tileset_info
    })
    tile_saver.flush()
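
The max-zoom arithmetic above (bins_to_display_at_max_resolution and max_max_zoom) is easy to check in isolation. Here is the same formula as a self-contained function with a worked example, assuming max_width >= resolution * bins_per_dimension so the log is defined:

import math

def compute_max_zoom(max_width, resolution, bins_per_dimension):
    # at zoom z the tiling spans 2**z * bins_per_dimension * resolution,
    # so the smallest adequate z satisfies that product >= max_width
    bins_at_max_resolution = max_width // resolution // bins_per_dimension
    return max(0, int(math.ceil(math.log(bins_at_max_resolution, 2))))

# a ~3.1 Gb genome at 1 kb resolution with 256-bin tiles:
# 3_100_000_000 // 1000 // 256 = 12109 bins, ceil(log2(12109)) = 14
print(compute_max_zoom(3_100_000_000, 1000, 256))  # 14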