def chromsizes_negspy_to_zarr(assembly, output, has_header):
    chrom_order = nc.get_chromorder(assembly)
    chrom_info = nc.get_chrominfo(assembly)

    chrom_rows = [{
        0: chrom_name,
        1: chrom_info.chrom_lengths[chrom_name]
    } for chrom_name in chrom_order]

    df = pd.DataFrame(columns=[0, 1], data=chrom_rows)

    num_chroms = df.shape[0]

    columns = df.columns.values.tolist()
    chrom_names = df[columns[0]].values
    chrom_sizes = df[columns[1]].values

    df["name_len"] = df[columns[0]].apply(lambda name: len(name))
    max_name_len = int(df["name_len"].max())

    z = zarr.open(output, mode='w')
    compressor = Zlib(level=1)

    z.create_dataset("names",
                     shape=(num_chroms, ),
                     dtype=f"S{max_name_len}",
                     compressor=compressor)
    z.create_dataset("sizes",
                     shape=(num_chroms, ),
                     dtype="u4",
                     compressor=compressor)
    z["names"][:] = chrom_names
    z["sizes"][:] = chrom_sizes
Example 2
def load_chromsizes(chromsizes_filename, assembly=None):
    """
    Load a set of chromosomes from a file or using an assembly
    identifier. If using just an assembly identifier the chromsizes
    will be loaded from the negspy repository.

    Parameters:
    -----------
    chromsizes_filename: string
        The file containing the tab-delimited chromosome sizes
    assembly: string
        Assembly name (e.g. 'hg19'). Not necessary if a chromsizes_filename is passed in
    """
    if chromsizes_filename is not None:
        chrom_info = nc.get_chrominfo_from_file(chromsizes_filename)
        chrom_names = chrom_info.chrom_order
        chrom_sizes = [
            chrom_info.chrom_lengths[c] for c in chrom_info.chrom_order
        ]
    else:
        if assembly is None:
            raise ValueError("No assembly or chromsizes specified")

        chrom_info = nc.get_chrominfo(assembly)
        chrom_names = nc.get_chromorder(assembly)
        chrom_sizes = nc.get_chromsizes(assembly)

    return (chrom_info, chrom_names, chrom_sizes)
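
A usage sketch, assuming negspy is imported as nc at module level; the chromsizes path is hypothetical:

# from an assembly identifier (chromsizes are loaded from the negspy repository)
chrom_info, chrom_names, chrom_sizes = load_chromsizes(None, assembly='hg19')

# from a tab-delimited chromsizes file (the assembly argument is not needed)
chrom_info, chrom_names, chrom_sizes = load_chromsizes('hg19.chrom.sizes')  # hypothetical path

print(chrom_names[0], chrom_sizes[0])  # first chromosome in the ordering and its length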
Example 3
def main():
    parser = argparse.ArgumentParser(description="""
    
    python chrom_sizes.py assembly

    Print the chromosome sizes for the given assembly.
""")

    parser.add_argument('assembly')
    #parser.add_argument('argument', nargs=1)
    #parser.add_argument('-o', '--options', default='yo',
    #					 help="Some option", type='str')
    #parser.add_argument('-u', '--useless', action='store_true', 
    #					 help='Another useless option')

    args = parser.parse_args()

    for chr in nc.get_chromorder(args.assembly):
        print(chr + "\t" + str(nc.get_chrominfo(args.assembly).chrom_lengths[chr]))
Example 5
def _bigwig(filepath,
            chunk_size=14,
            zoom_step=8,
            tile_size=1024,
            output_file=None,
            assembly='hg19',
            chromsizes_filename=None,
            chromosome=None):
    last_end = 0
    data = []

    if output_file is None:
        if chromosome is None:
            output_file = op.splitext(filepath)[0] + '.hitile'
        else:
            output_file = op.splitext(
                filepath)[0] + '.' + chromosome + '.hitile'

    # Overwrite the output file if it exists
    if op.exists(output_file):
        os.remove(output_file)
    f = h5py.File(output_file, 'w')

    if chromsizes_filename is not None:
        chrom_info = nc.get_chrominfo_from_file(chromsizes_filename)
        chrom_order = [
            a for a in nc.get_chromorder_from_file(chromsizes_filename)
        ]
        chrom_sizes = nc.get_chromsizes_from_file(chromsizes_filename)
    else:
        print("there")
        chrom_info = nc.get_chrominfo(assembly)
        chrom_order = [a for a in nc.get_chromorder(assembly)]
        chrom_sizes = nc.get_chromsizes(assembly)

    print("chrom_order:", chrom_order)
    assembly_size = chrom_info.total_length

    tile_size = tile_size
    chunk_size = tile_size * 2**chunk_size  # how many values to read in at once while tiling

    dsets = []  # data sets at each zoom level
    nan_dsets = []

    # initialize the arrays which will store the values at each stored zoom level
    z = 0
    positions = []  # store where we are at the current dataset
    data_buffers = [[]]
    nan_data_buffers = [[]]

    while assembly_size / 2**z > tile_size:
        dset_length = math.ceil(assembly_size / 2**z)
        dsets += [
            f.create_dataset('values_' + str(z), (dset_length, ),
                             dtype='f',
                             compression='gzip')
        ]
        nan_dsets += [
            f.create_dataset('nan_values_' + str(z), (dset_length, ),
                             dtype='f',
                             compression='gzip')
        ]

        data_buffers += [[]]
        nan_data_buffers += [[]]

        positions += [0]
        z += zoom_step

    # load the bigWig file
    bwf = pbw.open(filepath)

    # store some meta data
    d = f.create_dataset('meta', (1, ), dtype='f')

    if chromosome is not None:
        d.attrs['min-pos'] = chrom_info.cum_chrom_lengths[chromosome]
        d.attrs['max-pos'] = chrom_info.cum_chrom_lengths[
            chromosome] + bwf.chroms()[chromosome]
    else:
        d.attrs['min-pos'] = 0
        d.attrs['max-pos'] = assembly_size
    '''
    print("chroms.keys:", bwf.chroms().keys())
    print("chroms.values:", bwf.chroms().values())
    '''

    d.attrs['zoom-step'] = zoom_step
    d.attrs['max-length'] = assembly_size
    d.attrs['assembly'] = assembly
    d.attrs['chrom-names'] = [a.encode('utf-8') for a in chrom_order]
    d.attrs['chrom-sizes'] = chrom_sizes
    d.attrs['chrom-order'] = [a.encode('utf-8') for a in chrom_order]
    d.attrs['tile-size'] = tile_size
    d.attrs['max-zoom'] = max_zoom = math.ceil(
        math.log(d.attrs['max-length'] / tile_size) / math.log(2))
    d.attrs['max-width'] = tile_size * 2**max_zoom
    d.attrs['max-position'] = 0

    print("assembly size (max-length)", d.attrs['max-length'])
    print("max-width", d.attrs['max-width'])
    print("max_zoom:", d.attrs['max-zoom'])
    print("chunk-size:", chunk_size)
    print("chrom-order", d.attrs['chrom-order'])

    t1 = time.time()

    curr_zoom = 0

    def add_values_to_data_buffers(buffers_to_add, nan_buffers_to_add):
        curr_zoom = 0

        data_buffers[0] += buffers_to_add
        nan_data_buffers[0] += nan_buffers_to_add

        curr_time = time.time() - t1
        percent_progress = (positions[curr_zoom] + 1) / float(assembly_size)
        print(
            "position: {} progress: {:.2f} elapsed: {:.2f} remaining: {:.2f}".
            format(positions[curr_zoom] + 1, percent_progress, curr_time,
                   curr_time / (percent_progress) - curr_time))

        while len(data_buffers[curr_zoom]) >= chunk_size:
            # get the current chunk and store it, converting nans to 0
            print("len(data_buffers[curr_zoom])", len(data_buffers[curr_zoom]))
            curr_chunk = np.array(data_buffers[curr_zoom][:chunk_size])
            nan_curr_chunk = np.array(nan_data_buffers[curr_zoom][:chunk_size])
            #curr_chunk[np.isnan(curr_chunk)] = 0
            '''
            print("1cc:", sum(curr_chunk))
            print("1db:", data_buffers[curr_zoom][:chunk_size])
            print("1curr_chunk:", nan_curr_chunk)
            '''
            print("positions[curr_zoom]:", positions[curr_zoom])

            dsets[curr_zoom][positions[curr_zoom]:positions[curr_zoom] +
                             chunk_size] = curr_chunk
            nan_dsets[curr_zoom][positions[curr_zoom]:positions[curr_zoom] +
                                 chunk_size] = nan_curr_chunk

            # aggregate nan values
            #nan_curr_chunk[np.isnan(curr_chunk)] = 0
            #print("1na_cc:", sum(nan_curr_chunk))

            # aggregate and store aggregated values in the next zoom_level's data
            data_buffers[curr_zoom + 1] += list(
                ct.aggregate(curr_chunk, 2**zoom_step))
            nan_data_buffers[curr_zoom + 1] += list(
                ct.aggregate(nan_curr_chunk, 2**zoom_step))

            data_buffers[curr_zoom] = data_buffers[curr_zoom][chunk_size:]
            nan_data_buffers[curr_zoom] = nan_data_buffers[curr_zoom][
                chunk_size:]

            data = data_buffers[curr_zoom + 1]
            nan_data = nan_data_buffers[curr_zoom + 1]

            # do the same for the nan values buffers

            positions[curr_zoom] += chunk_size
            curr_zoom += 1

            if curr_zoom * zoom_step >= max_zoom:
                break

    # Do we only want values from a single chromosome?
    if chromosome is not None:
        chroms_to_use = [chromosome]
    else:
        chroms_to_use = chrom_order

    for chrom in chroms_to_use:
        print("chrom:", chrom)
        '''
        if chrom not in bwf.chroms():
            print("skipping chrom (not in bigWig file):",
            chrom, chrom_info.chrom_lengths[chrom])
            continue
        '''

        counter = 0
        # chrom_size = bwf.chroms()[chrom]
        chrom_size = chrom_info.chrom_lengths[chrom]

        # print("chrom_size:", chrom_size, bwf.chroms()[chrom])
        d.attrs['max-position'] += chrom_size

        while counter < chrom_size:
            remaining = min(chunk_size, chrom_size - counter)

            if chrom not in bwf.chroms():
                values = [np.nan] * remaining
                nan_values = [1] * remaining
            else:
                values = bwf.values(chrom, counter, counter + remaining)
                nan_values = np.isnan(values).astype('i4')

            # print("counter:", counter, "remaining:", remaining,
            # "counter + remaining:", counter + remaining)
            counter += remaining
            curr_zoom = 0

            add_values_to_data_buffers(list(values), list(nan_values))

    while True:
        # get the current chunk and store it
        chunk_size = len(data_buffers[curr_zoom])
        curr_chunk = np.array(data_buffers[curr_zoom][:chunk_size])
        nan_curr_chunk = np.array(nan_data_buffers[curr_zoom][:chunk_size])

        dsets[curr_zoom][positions[curr_zoom]:positions[curr_zoom] +
                         chunk_size] = curr_chunk
        nan_dsets[curr_zoom][positions[curr_zoom]:positions[curr_zoom] +
                             chunk_size] = nan_curr_chunk

        # aggregate and store aggregated values in the next zoom_level's data
        data_buffers[curr_zoom + 1] += list(
            ct.aggregate(curr_chunk, 2**zoom_step))
        nan_data_buffers[curr_zoom + 1] += list(
            ct.aggregate(nan_curr_chunk, 2**zoom_step))

        data_buffers[curr_zoom] = data_buffers[curr_zoom][chunk_size:]
        nan_data_buffers[curr_zoom] = nan_data_buffers[curr_zoom][chunk_size:]

        data = data_buffers[curr_zoom + 1]
        nan_data = nan_data_buffers[curr_zoom + 1]

        positions[curr_zoom] += chunk_size
        curr_zoom += 1

        # we've created enough tile levels to cover the entire maximum width
        if curr_zoom * zoom_step >= max_zoom:
            break

    # still need to take care of the last chunk

    data = np.array(data)
    t1 = time.time()
    pass
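
A minimal numpy sketch of the per-level aggregation step, under the assumption that ct.aggregate(chunk, 2**zoom_step) sums each run of 2**zoom_step consecutive values; the numbers are purely illustrative:

import numpy as np

zoom_step = 2                        # each stored level is 2**zoom_step times coarser
chunk = np.arange(16, dtype=float)   # values written at the finer level

# sum every 2**zoom_step consecutive values to get the next stored level
coarser = chunk.reshape(-1, 2**zoom_step).sum(axis=1)
print(coarser)                       # [ 6. 22. 38. 54.]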
Example 6
def _bedfile(filepath, output_file, assembly, importance_column, has_header,
             chromosome, max_per_tile, tile_size, delimiter,
             chromsizes_filename, offset):
    if output_file is None:
        output_file = filepath + ".multires"
    else:
        output_file = output_file

    if op.exists(output_file):
        os.remove(output_file)

    bed_file = open(filepath, 'r')

    if chromsizes_filename is not None:
        chrom_info = nc.get_chrominfo_from_file(chromsizes_filename)
        chrom_names = chrom_info.chrom_order
        chrom_sizes = [
            chrom_info.chrom_lengths[c] for c in chrom_info.chrom_order
        ]
    else:
        chrom_info = nc.get_chrominfo(assembly)
        chrom_names = nc.get_chromorder(assembly)
        chrom_sizes = nc.get_chromsizes(assembly)

    print("chrom_names:", chrom_info.chrom_order)
    print("chrom_sizes:", chrom_sizes)

    def line_to_np_array(line):
        '''
        Convert a bed file line to a numpy array which can later
        be used as an entry in an h5py file.
        '''
        try:
            start = int(line[1])
            stop = int(line[2])
        except ValueError:
            raise ValueError(
                "Error parsing the position, line: {}".format(line))

        chrom = line[0]

        if importance_column is None:
            importance = stop - start
        elif importance_column == 'random':
            importance = random.random()
        else:
            importance = int(line[int(importance_column) - 1])

        # convert chromosome coordinates to genome coordinates

        genome_start = chrom_info.cum_chrom_lengths[chrom] + start + offset
        #nc.chr_pos_to_genome_pos(str(chrom), start, assembly)
        genome_end = chrom_info.cum_chrom_lengths[chrom] + stop + offset
        #nc.chr_pos_to_genome_pos(chrom, stop, assembly)

        pos_offset = genome_start - start
        parts = {
            'startPos': genome_start,
            'endPos': genome_end,
            'uid': slugid.nice().decode('utf-8'),
            'chrOffset': pos_offset,
            'fields': '\t'.join(line),
            'importance': importance,
            'chromosome': str(chrom)
        }

        return parts

    dset = []

    if has_header:
        line = bed_file.readline()
        header = line.strip().split(delimiter)
    else:
        line = bed_file.readline().strip()
        dset += [line_to_np_array(line.strip().split(delimiter))]
        header = list(map(str, range(1,
                                     len(line.strip().split(delimiter)) + 1)))
    print("header:", header)

    for line in bed_file:
        dset += [line_to_np_array(line.strip().split(delimiter))]

    if chromosome is not None:
        dset = [d for d in dset if d['chromosome'] == chromosome]

    # We need chromosome information as well as the assembly size to properly
    # tile this data
    tile_size = tile_size

    #if chromosome is None:
    assembly_size = chrom_info.total_length + 1
    '''
    else:
        try:
            assembly_size = chrom_info.chrom_lengths[chromosome]
        except KeyError:
            print("ERROR: Chromosome {} not found in assembly {}.".format(chromosome, assembly), file=sys.stderr)
            return 1
    '''

    #max_zoom = int(math.ceil(math.log(assembly_size / min_feature_width) / math.log(2)))
    max_zoom = int(math.ceil(
        math.log(assembly_size / tile_size) / math.log(2)))
    '''
    if max_zoom is not None and max_zoom < max_zoom:
        max_zoom = max_zoom
    '''

    # this script stores data in a sqlite database
    import sqlite3
    sqlite3.register_adapter(np.int64, lambda val: int(val))
    print("output_file:", output_file)
    conn = sqlite3.connect(output_file)

    # store some meta data
    store_meta_data(conn,
                    1,
                    max_length=assembly_size,
                    assembly=assembly,
                    chrom_names=chrom_names,
                    chrom_sizes=chrom_sizes,
                    tile_size=tile_size,
                    max_zoom=max_zoom,
                    max_width=tile_size * 2**max_zoom,
                    header=header)

    max_width = tile_size * 2**max_zoom
    uid_to_entry = {}

    intervals = []

    # store each bed file entry as an interval
    for d in dset:
        uid = d['uid']
        uid_to_entry[uid] = d
        intervals += [(d['startPos'], d['endPos'], uid)]

    tile_width = tile_size

    removed = set()

    c = conn.cursor()
    c.execute('''
    CREATE TABLE intervals
    (
        id int PRIMARY KEY,
        zoomLevel int,
        importance real,
        startPos int,
        endPos int,
        chrOffset int,
        uid text,
        fields text
    )
    ''')

    c.execute('''
        CREATE VIRTUAL TABLE position_index USING rtree(
            id,
            rStartPos, rEndPos
        )
        ''')

    curr_zoom = 0
    counter = 0

    max_viewable_zoom = max_zoom

    # NOTE: comparing max_zoom with itself is always False, so this clamp never
    # fires; it presumably meant to compare against a user-supplied maximum zoom.
    if max_zoom is not None and max_zoom < max_zoom:
        max_viewable_zoom = max_zoom

    while curr_zoom <= max_viewable_zoom and len(intervals) > 0:
        # at each zoom level, add the top genes
        tile_width = tile_size * 2**(max_zoom - curr_zoom)

        for tile_num in range(max_width // tile_width):
            # go over each tile and distribute the remaining values
            #values = interval_tree[tile_num * tile_width: (tile_num+1) * tile_width]
            from_value = tile_num * tile_width
            to_value = (tile_num + 1) * tile_width
            entries = [
                i for i in intervals if (i[0] < to_value and i[1] > from_value)
            ]
            values_in_tile = sorted(
                entries, key=lambda x: -uid_to_entry[x[-1]]['importance']
            )[:max_per_tile]  # the importance is always the last column
            # take the negative because we want to prioritize
            # higher values

            if len(values_in_tile) > 0:
                for v in values_in_tile:
                    counter += 1

                    value = uid_to_entry[v[-1]]

                    # one extra question mark for the primary key
                    exec_statement = 'INSERT INTO intervals VALUES (?,?,?,?,?,?,?,?)'
                    #print("value:", value['startPos'])

                    ret = c.execute(
                        exec_statement,
                        # primary key, zoomLevel, startPos, endPos, chrOffset, line
                        (counter, curr_zoom, value['importance'],
                         value['startPos'], value['endPos'],
                         value['chrOffset'], value['uid'], value['fields']))
                    conn.commit()

                    exec_statement = 'INSERT INTO position_index VALUES (?,?,?)'
                    ret = c.execute(
                        exec_statement,
                        (counter, value['startPos'], value['endPos']
                         )  #add counter as a primary key
                    )
                    conn.commit()
                    intervals.remove(v)
        #print ("curr_zoom:", curr_zoom, file=sys.stderr)
        curr_zoom += 1

    conn.commit()
    conn.close()

    return
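
A sketch of how one tile's entries could be read back from the resulting sqlite file using the rtree position_index created above; the file name and tile extent are hypothetical, and the actual downstream retrieval code is not shown here:

import sqlite3

conn = sqlite3.connect('annotations.bed.multires')    # hypothetical output file
c = conn.cursor()

tile_start, tile_end = 0, 1000000                     # genome coordinates covered by one tile
c.execute(
    'SELECT i.fields, i.importance FROM intervals i '
    'JOIN position_index p ON i.id = p.id '
    'WHERE p.rStartPos <= ? AND p.rEndPos >= ? AND i.zoomLevel = ?',
    (tile_end, tile_start, 0))
for fields, importance in c.fetchall():
    print(importance, fields)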
Example 7
def main():
    parser = argparse.ArgumentParser(description="""
    
    python chr_pos_to_genome_pos.py -t 1,2:3,4

    Convert chromosome,position pairs to genome_positions. Assumes that the
    coordinates refer to the hg19 assembly (unless otherwise specified).

    Example:

    2       NM_000014       chr12   -       9220303 9268825

    -> python scripts/chr_pos_to_genome_pos.py -c 3:5,3:6

    2       NM_000014       genome  -       2115405269      2115453791

    --------------------------------

    This also works with space-delimited fields:

    chr5    56765,56766

    ->python scripts/chr_pos_to_genome_pos.py -c 1:2

    genome  881683465,881683466

""")

    parser.add_argument('-a', '--assembly', default='hg19')
    parser.add_argument('-s', '--chromsizes-file', default=None)
    parser.add_argument('-n', '--new-chrom', default=None)
    parser.add_argument(
        '-c',
        '--columns',
        default='1,2',
        help="Which columns to translate to genome positions. "
        "Column pairs should be 1-based and separated by colons")

    #parser.add_argument('-u', '--useless', action='store_true',
    #                     help='Another useless option')
    args = parser.parse_args()

    if args.chromsizes_file is not None:
        chrom_info = nc.get_chrominfo_from_file(args.chromsizes_file)
    else:
        chrom_info = nc.get_chrominfo(args.assembly)

    for line in sys.stdin:
        try:
            line_output = []
            line_parts = line.strip().split()
            translated_positions = {}
            translated_chroms = {}

            for translate_pair in [[int(y) for y in x.split(':')]
                                   for x in args.columns.split(',')]:
                # go through the pairs of columns that need to be translated to genome position
                # assume that the position column is a comma-separated list of values (although it doesn't
                # actually need to be)
                chrom, poss = line_parts[translate_pair[0] - 1], line_parts[
                    translate_pair[1] - 1].strip(",").split(',')
                genome_pos = ",".join(
                    map(str, [
                        nc.chr_pos_to_genome_pos(chrom, int(pos), chrom_info)
                        for pos in poss
                    ]))
                #line_output += [genome_pos]

                # note that we've translated these columns and shouldn't include them in the output
                translated_positions[translate_pair[1] - 1] = genome_pos
                translated_chroms[translate_pair[0] - 1] = chrom

            for i, part in enumerate(line_parts):
                if i in translated_chroms:
                    # replace chromosome identifiers (e.g. 'chr1') with 'genome' to indicate genome-wide positions
                    if args.new_chrom is None:
                        line_output += ['genome({})'.format(chrom)]
                    else:
                        line_output += [args.new_chrom]
                elif i in translated_positions:
                    # this column used to contain a position so we need to replace it with a translated
                    # position
                    line_output += [translated_positions[i]]
                else:
                    # if this column didn't contain a translated position output it as is
                    line_output += [part]

            try:
                print("\t".join(map(str, line_output)))
            except BrokenPipeError:
                # Output is probably being run through "head" or something similar
                break
        except KeyError as ke:
            print("KeyError:", ke, line.strip(), file=sys.stderr)
Example 8
 def END_ABS(self, CHROM, END):
     chrom_info = nc.get_chrominfo("hg38")
     return nc.chr_pos_to_genome_pos("chr" + CHROM, END, chrom_info)
Example 9
 def START_ABS(self, CHROM, START):
     chrom_info = nc.get_chrominfo("hg38")
     return nc.chr_pos_to_genome_pos("chr" + CHROM, START, chrom_info)
def bigwigs_to_multivec(input_bigwig_files, input_metadata_files, output_file,
                        starting_resolution):

    f = h5py.File(output_file, 'w')

    num_samples = len(input_bigwig_files)

    # Zip the input to create (bw, metadata) tuples
    zipped_input = zip(input_bigwig_files, input_metadata_files)

    # Create level zero groups
    info_group = f.create_group("info")
    resolutions_group = f.create_group("resolutions")
    chroms_group = f.create_group("chroms")

    # Set info attributes
    info_group.attrs['tile-size'] = 256

    # Prepare to fill in chroms dataset
    chromosomes = nc.get_chromorder('hg38')
    chromosomes = chromosomes[:25]  # TODO: should more than chr1-chrM be used?
    num_chromosomes = len(chromosomes)
    chroms_length_arr = np.array(
        [nc.get_chrominfo('hg38').chrom_lengths[x] for x in chromosomes],
        dtype="i8")
    chroms_name_arr = np.array(chromosomes, dtype="S23")

    chromosomes_set = set(chromosomes)
    chrom_name_to_length = dict(zip(chromosomes, chroms_length_arr))

    # Fill in chroms dataset entries "length" and "name"
    chroms_group.create_dataset("length", data=chroms_length_arr)
    chroms_group.create_dataset("name", data=chroms_name_arr)

    # Prepare to fill in resolutions dataset
    resolutions = [starting_resolution * (2**x) for x in range(16)]

    # Create each resolution group.
    for resolution in resolutions:
        resolution_group = resolutions_group.create_group(str(resolution))
        # TODO: remove the unnecessary "values" layer
        resolution_values_group = resolution_group.create_group("values")

        # Create each chromosome dataset.
        for chr_name, chr_len in zip(chromosomes, chroms_length_arr):
            chr_shape = (math.ceil(chr_len / resolution), num_samples)
            resolution_values_group.create_dataset(chr_name,
                                                   chr_shape,
                                                   dtype="f4",
                                                   fillvalue=np.nan,
                                                   compression='gzip')

    # Fill in data for each bigwig file.
    for bw_index, bw_file in tqdm(list(enumerate(input_bigwig_files)),
                                  desc='bigwigs'):
        if bbi.is_bigwig(bw_file):
            chromsizes = bbi.chromsizes(bw_file)
            matching_chromosomes = set(
                chromsizes.keys()).intersection(chromosomes_set)

            # Fill in data for each resolution of a bigwig file.
            for resolution in resolutions:
                # Fill in data for each chromosome of a resolution of a bigwig file.
                for chr_name in matching_chromosomes:
                    chr_len = chrom_name_to_length[chr_name]
                    chr_shape = (math.ceil(chr_len / resolution), num_samples)
                    arr = bbi.fetch(bw_file,
                                    chr_name,
                                    0,
                                    chr_len,
                                    chr_shape[0],
                                    summary="sum")
                    resolutions_group[str(
                        resolution)]["values"][chr_name][:, bw_index] = arr
        else:
            print(f"{bw_file} not is_bigwig")

        f.flush()

    f.close()

    max_mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    print(max_mem)

    # Append metadata to the top resolution row_infos attribute.
    row_infos = []
    for metadata_index, metadata_file in enumerate(input_metadata_files):
        with open(metadata_file) as mf:
            try:
                metadata_json = json.load(mf)
            except Exception as e:
                print(f"Error loading metadata file: {metadata_file}")
                print(e)
                metadata_json = None
        row_info = metadata_json_to_row_info(metadata_json)
        row_infos.append(row_info)

    row_infos_encoded = str(json.dumps(row_infos))

    f = h5py.File(output_file, 'r+')

    info_group = f["info"]
    info_group["row_infos"] = row_infos_encoded

    f.close()
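
A read-back sketch for the multivec HDF5 layout created above, assuming h5py and a starting resolution of 1000; the file name is hypothetical:

import h5py

with h5py.File('samples.multivec.hdf5', 'r') as f:        # hypothetical output file
    values = f['resolutions']['1000']['values']['chr1']   # shape: (num_bins, num_samples)
    print(values.shape)
    print(f['info'].attrs['tile-size'])                   # 256, set above
    print(f['chroms']['name'][:3], f['chroms']['length'][:3])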
def bigwigs_to_multivec(
    input_bigwig_files,
    output_file,
    starting_resolution
):

    f = h5py.File(output_file, 'w')

    num_samples = len(input_bigwig_files)

    # Create level zero groups
    info_group = f.create_group("info")
    resolutions_group = f.create_group("resolutions")
    chroms_group = f.create_group("chroms")

    # Set info attributes
    info_group.attrs['tile-size'] = 256

    # Prepare to fill in chroms dataset
    chromosomes = nc.get_chromorder(GENOME_BUILD)
    chromosomes = chromosomes[:25] # TODO: should more than chr1-chrM be used?
    chroms_length_arr = np.array([ nc.get_chrominfo('hg19').chrom_lengths[x] for x in chromosomes ], dtype="i8")
    chroms_name_arr = np.array(chromosomes, dtype="S23")

    chromosomes_set = set(chromosomes)
    chrom_name_to_length = dict(zip(chromosomes, chroms_length_arr))

    # Fill in chroms dataset entries "length" and "name"
    chroms_group.create_dataset("length", data=chroms_length_arr)
    chroms_group.create_dataset("name", data=chroms_name_arr)

    num_zoom_levels = math.floor(math.log2(GENOME_LENGTH / starting_resolution))

    # Prepare to fill in resolutions dataset
    resolutions = [starting_resolution * (2 ** x) for x in range(num_zoom_levels)]

    # Create each resolution group.
    for resolution in resolutions:
        resolution_group = resolutions_group.create_group(str(resolution))
        # TODO: remove the unnecessary "values" layer
        resolution_values_group = resolution_group.create_group("values")

        # Create each chromosome dataset.
        for chr_name, chr_len in zip(chromosomes, chroms_length_arr):
            chr_shape = (math.ceil(chr_len / resolution), num_samples)
            resolution_values_group.create_dataset(chr_name, chr_shape, dtype="f4", fillvalue=np.nan, compression='gzip')

    # Fill in data for each bigwig file.
    for bw_index, bw_file in enumerate(input_bigwig_files):
        if bbi.is_bigwig(bw_file):
            chromsizes = bbi.chromsizes(bw_file)
            matching_chromosomes = set(chromsizes.keys()).intersection(chromosomes_set)

            # Fill in data for each resolution of a bigwig file.
            for resolution in resolutions:
                # Fill in data for each chromosome of a resolution of a bigwig file.
                for chr_name in matching_chromosomes:
                    chr_len = chrom_name_to_length[chr_name]
                    num_bins = math.ceil(chr_len / resolution)
                    arr = bbi.fetch(bw_file, chr_name, 0, chr_len, num_bins, summary="sum")
                    resolutions_group[str(resolution)]["values"][chr_name][:,bw_index] = arr
        else:
            print(f"{bw_file} not is_bigwig")

        f.flush()

    f.close()

    max_mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    print(max_mem)

    # Append metadata to the top resolution row_infos attribute.
    row_infos = []
    for input_bigwig_file in input_bigwig_files:
        _, filename = os.path.split(input_bigwig_file)
        name, _ = os.path.splitext(filename)
        row_infos.append({
            'id': name
        })

    row_infos_encoded = str(json.dumps(row_infos))

    f = h5py.File(output_file, 'r+')

    info_group = f["info"]
    info_group["row_infos"] = row_infos_encoded

    f.close()
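
This variant depends on two module-level constants that are not shown in the snippet. A plausible definition, assuming the hg19 lookups used inside the function:

import negspy.coordinates as nc

GENOME_BUILD = 'hg19'   # assumed; matches the nc.get_chrominfo('hg19') call above
GENOME_LENGTH = nc.get_chrominfo(GENOME_BUILD).total_length   # assumed to be the assembly's total length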
Example 12
def main():
    """
    python make_tiles.py input_file

    Create tiles for all of the entries in the JSON file.
    """
    parser = argparse.ArgumentParser()

    # parser.add_argument('-o', '--options', dest='some_option', default='yo', help="Place holder for a real option", type='str')
    # parser.add_argument('-u', '--useless', dest='uselesss', default=False, action='store_true', help='Another useless option')
    parser.add_argument("--min-pos", help="The minimum range for the tiling")
    parser.add_argument("--max-pos", help="The maximum range for the tiling")
    parser.add_argument("--assembly", default=None)
    parser.add_argument("-r",
                        "--resolution",
                        help="The resolution of the data",
                        default=None,
                        type=int)
    parser.add_argument(
        "-k",
        "--position-cols",
        help="The position columns (defaults to all but the last, 1-based)",
        default=None,
    )
    parser.add_argument(
        "-v",
        "--value-pos",
        help="The value column (defaults to the last one, 1-based)",
        default=None,
        type=str,
    )
    parser.add_argument("-z",
                        "--max-zoom",
                        help="The maximum zoom value",
                        default=None,
                        type=int)
    parser.add_argument("--expand-range", help="Expand ranges of values")
    parser.add_argument(
        "--ignore-0",
        help="Ignore ranges with a zero value",
        default=False,
        action="store_true",
    )
    parser.add_argument(
        "-b",
        "--bins-per-dimension",
        default=1,
        help="The number of bins to consider in each dimension",
        type=int,
    )
    parser.add_argument(
        "-e",
        "--elasticsearch-url",
        default=None,
        help="The url of the elasticsearch database where to save the tiles",
    )
    parser.add_argument(
        "-f",
        "--columnfile-path",
        default=None,
        help="The path to the column file where to save the tiles",
    )
    parser.add_argument("-n", "--num-threads", default=4, type=int)
    parser.add_argument("--triangular", default=False, action="store_true")
    parser.add_argument("--log-file", default=None)
    parser.add_argument("--max-queue-size", default=40000, type=int)
    parser.add_argument("--print-status", default=None, type=int)

    args = parser.parse_args()

    if args.resolution is None and args.max_zoom is None:
        print("One of --resolution and --max-zoom must be set",
              file=sys.stderr)
        sys.exit(1)

    first_line = sys.stdin.readline()
    first_line_parts = first_line.strip().split()
    if len(first_line_parts) == 0:
        print("ERROR: no input")
        return

    if args.position_cols is not None:
        position_cols = list(map(int, args.position_cols.split(",")))
    else:
        position_cols = None

    # if specific position columns aren't specified, use all but the last column
    if position_cols is None:
        position_cols = list(range(1, len(first_line_parts)))

    if args.assembly is not None:
        mins = [1 for p in position_cols]
        maxs = [
            nc.get_chrominfo(args.assembly).total_length for p in position_cols
        ]
    else:
        mins = [float(p) for p in args.min_pos.split(",")]
        maxs = [float(p) for p in args.max_pos.split(",")]

    max_width = max([b - a for (a, b) in zip(mins, maxs)])

    if args.expand_range is not None:
        expand_range = list(map(int, args.expand_range.split(",")))
    else:
        expand_range = None

    if args.max_zoom is None:
        # determine the maximum zoom level based on the domain of the data
        # and the resolution
        bins_to_display_at_max_resolution = (max_width // args.resolution //
                                             args.bins_per_dimension)
        max_max_zoom = math.ceil(
            math.log(bins_to_display_at_max_resolution) / math.log(2.0))

        if max_max_zoom < 0:
            max_max_zoom = 0

        max_zoom = int(max_max_zoom)
    else:
        max_zoom = args.max_zoom

    # print("max_zoom:", max_zoom)
    max_width = args.resolution * args.bins_per_dimension * 2**max_zoom

    value_pos = args.value_pos

    # if there's no column designated as the value column, use the last column
    if value_pos is None:
        value_pos = [len(first_line_parts) - 1]
    else:
        value_pos = [int(vp) - 1 for vp in value_pos.split(",")]

    max_data_in_sparse = args.bins_per_dimension**len(position_cols) // 10
    """
    if args.elasticsearch_url is not None:
        tile_saver = cst.ElasticSearchTileSaver(max_data_in_sparse,
                                                args.bins_per_dimension,
                                                num_dimensions = len(position_cols),
                                                es_path = args.elasticsearch_url)
    else:
        tile_saver = cst.EmptyTileSaver(max_data_in_sparse,
                                        args.bins_per_dimension,
                                        num_dimensions = len(position_cols))
    """

    print(
        "maxs:",
        maxs,
        "max_zoom:",
        max_zoom,
        "max_data_in_sparse:",
        max_data_in_sparse,
        "url:",
        args.elasticsearch_url,
    )

    # bin_counts = col.defaultdict(col.defaultdict(int))
    q = mpr.Queue(maxsize=args.max_queue_size)

    tilesaver_processes = []
    finished = mpr.Value("b", False)
    if args.elasticsearch_url is not None:
        tile_saver = cst.ElasticSearchTileSaver(
            max_data_in_sparse,
            args.bins_per_dimension,
            len(position_cols),
            args.elasticsearch_url,
            args.log_file,
            args.print_status,
            initial_value=[0.0 for vp in value_pos],
        )
    else:
        tile_saver = cst.ColumnFileTileSaver(
            max_data_in_sparse,
            args.bins_per_dimension,
            len(position_cols),
            args.columnfile_path,
            args.log_file,
            args.print_status,
            initial_value=[0.0 for vp in value_pos],
        )

    for i in range(args.num_threads):
        p = mpr.Process(target=cst.tile_saver_worker,
                        args=(q, tile_saver, finished))

        p.daemon = True
        p.start()
        tilesaver_processes += [(tile_saver, p)]

    tileset_info = {
        "max_value": [0 for vp in value_pos],
        "min_value": [0 for vp in value_pos],
        "min_pos": mins,
        "max_pos": maxs,
        "max_zoom": max_zoom,
        "bins_per_dimension": args.bins_per_dimension,
        "max_width": max_width,
    }

    tile_saver.save_tile({
        "tile_id": "tileset_info",
        "tile_value": tileset_info
    })
    tile_saver.flush()

    try:
        tileset_info = create_tiles(
            q,
            [first_line],
            sys.stdin,
            position_cols,
            value_pos,
            max_zoom,
            args.bins_per_dimension,
            tile_saver,
            expand_range,
            args.ignore_0,
            tileset_info,
            max_width,
            args.triangular,
            args.max_queue_size,
            print_status=args.print_status,
        )
    except KeyboardInterrupt:
        for (ts, p) in tilesaver_processes:
            ts.flush()
            p.terminate()
            p.join()
        raise

    finished.value = True
    # wait for the worker processes to finish
    for (ts, p) in tilesaver_processes:
        p.join()

    print("tileset_info:", tileset_info)
    tile_saver.save_tile({
        "tile_id": "tileset_info",
        "tile_value": tileset_info
    })
    tile_saver.flush()
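
A worked sketch of the --resolution path used above to pick max_zoom, with an illustrative assembly length roughly the size of hg19:

import math

assembly_size = 3100000000          # roughly the hg19 total length, for illustration
resolution = 1000
bins_per_dimension = 256

max_width = assembly_size
bins_at_max_res = max_width // resolution // bins_per_dimension
max_zoom = max(0, math.ceil(math.log(bins_at_max_res) / math.log(2.0)))
max_width = resolution * bins_per_dimension * 2**max_zoom
print(max_zoom, max_width)          # 14 and 4194304000 for these numbers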
Example 13
def main():
    usage = """
    python make_tiles.py input_file

    Create tiles for all of the entries in the JSON file.
    """
    num_args = 1
    parser = argparse.ArgumentParser()

    #parser.add_argument('-o', '--options', dest='some_option', default='yo', help="Place holder for a real option", type='str')
    #parser.add_argument('-u', '--useless', dest='uselesss', default=False, action='store_true', help='Another useless option')
    parser.add_argument('input_file')
    parser.add_argument('-b',
                        '--bins-per-dimension',
                        help='The number of bins to divide the data into',
                        default=1,
                        type=int)
    parser.add_argument('--use-spark',
                        default=False,
                        action='store_true',
                        help='Use spark to distribute the workload')

    parser.add_argument(
        '-r',
        '--resolution',
        help='The resolution of the data (applies only to matrix data)',
        type=int)

    parser.add_argument('--importance',
                        action='store_true',
                        help='Create tiles by importance')

    parser.add_argument(
        '-i',
        '--importance-field',
        dest='importance_field',
        default='importance_field',
        help=
        'The field in each JSON entry that indicates how important that entry is',
        type=str)
    parser.add_argument(
        '-v',
        '--value',
        dest='value_field',
        default='count',
        help=
        'The field that has the value of each point. Used for aggregation and display'
    )

    group = parser.add_mutually_exclusive_group()

    group.add_argument('-p',
                       '--position',
                       dest='position',
                       default='position',
                       help='Where this entry would be placed on the x axis',
                       type=str)
    group.add_argument('-s',
                       '--sort-by',
                       default=None,
                       help='Sort by a field and use as the position')

    parser.add_argument(
        '--end-position',
        default=None,
        help=
        "Use a field to indicate the end of a particular element so that it appears in all tiles that intersect it"
    )
    parser.add_argument(
        '-e',
        '--max-entries-per-tile',
        dest='max_entries_per_tile',
        default=15,
        help=
        'The maximum number of entries that can be displayed on a single tile',
        type=int)
    parser.add_argument('-c',
                        '--column-names',
                        dest='column_names',
                        default=None)
    parser.add_argument('-m',
                        '--max-zoom',
                        dest='max_zoom',
                        help='The maximum zoom level',
                        type=int,
                        required=True)
    parser.add_argument('--min-pos',
                        dest='min_pos',
                        default=None,
                        help='The minimum x position',
                        type=float)
    parser.add_argument('--max-pos',
                        dest='max_pos',
                        default=None,
                        help='The maximum x position',
                        type=float)
    parser.add_argument('--assembly', default=None)

    parser.add_argument(
        '--min-value',
        help=
        'The field which will be used to determine the minimum value for any data point',
        default='min_y')
    parser.add_argument(
        '--max-value',
        help=
        'The field which will be used to determine the maximum value for any data point',
        default='max_y')
    parser.add_argument(
        '--range',
        help="Use two columns to create a range (i.e. pos1,pos2",
        default=None)
    parser.add_argument('--range-except-0',
                        help="Don't expand rows which have values less than 0",
                        default=None)
    parser.add_argument('--gzip',
                        help='Compress the output JSON files using gzip',
                        action='store_true')
    parser.add_argument(
        '--output-format',
        help=
        'The format for the output matrix, can be either "dense" or "sparse"',
        default='sparse')
    parser.add_argument('--add-uuid',
                        help='Add a uuid to each element',
                        action='store_true',
                        default=False)
    parser.add_argument('--reverse-importance',
                        help='Reverse the ordering of the importance',
                        action='store_true',
                        default=False)

    output_group = parser.add_mutually_exclusive_group(required=True)

    output_group.add_argument(
        '--elasticsearch-path',
        help='Send the output to an elasticsearch instance',
        default=None)
    output_group.add_argument('-o',
                              '--output-dir',
                              help='The directory to place the tiles',
                              default=None)

    parser.add_argument(
        '--delimiter',
        help=
        "The delimiter separating the different columns in the input files",
        default=None)

    parser.add_argument(
        '--elasticsearch-nodes',
        help='Specify elasticsearch nodes to push the completions to',
        default=None)
    parser.add_argument('--elasticsearch-index',
                        help="The index to place the results in",
                        default='test')
    parser.add_argument('--elasticsearch-doctype',
                        help="The type of document to index",
                        default="autocomplete")
    parser.add_argument('--print-status',
                        action="store_true",
                        help="Print status messages")

    args = parser.parse_args()

    if not args.importance:
        if args.output_format not in ['sparse', 'dense']:
            print(
                'ERROR: The output format must be one of "dense" or "sparse"',
                file=sys.stderr)

    dim_names = args.position.split(',')
    position_cols = dim_names

    sc = None

    if args.use_spark:
        from pyspark import SparkContext
        sc = SparkContext()
    else:
        sys.stderr.write("setting sc:")
        sc = cfp.FakeSparkContext

    if args.column_names is not None:
        args.column_names = args.column_names.split(',')

    if args.assembly is not None:
        mins = [1 for p in position_cols]
        maxs = [
            nc.get_chrominfo(args.assembly).total_length for p in position_cols
        ]
    else:
        mins = [float(p) for p in args.min_pos.split(',')]
        maxs = [float(p) for p in args.max_pos.split(',')]

    max_width = max([b - a for (a, b) in zip(mins, maxs)])

    print("start time:", strftime("%Y-%m-%d %H:%M:%S", gmtime()))
    entries = cti.load_entries_from_file(
        sc,
        args.input_file,
        args.column_names,
        delimiter=args.delimiter,
        elasticsearch_path=args.elasticsearch_path)
    print("load entries time:", strftime("%Y-%m-%d %H:%M:%S", gmtime()))

    if args.range is not None:
        # if a pair of columns specifies a range of values, then create multiple
        # entries for each value within that range (e.g. bed files)
        range_cols = args.range.split(',')
        entries = entries.flatMap(lambda x: cti.expand_range(
            x, *range_cols, range_except_0=args.range_except_0))

    if args.importance:
        # Data will be aggregated by importance. Only more "important" pieces of information will
        # be passed onto the lower resolution tiles if they are too crowded
        tileset = cti.make_tiles_by_importance(
            sc,
            entries,
            dim_names=args.position.split(','),
            end_dim_names=args.end_position.split(','),
            max_zoom=args.max_zoom,
            importance_field=args.importance_field,
            output_dir=args.output_dir,
            max_entries_per_tile=args.max_entries_per_tile,
            gzip_output=args.gzip,
            add_uuid=args.add_uuid,
            reverse_importance=args.reverse_importance,
            adapt_zoom=False,
            mins=mins,
            maxs=maxs)
    else:
        # Data will be aggregated by binning. This means that two adjacent bins should be able
        # to be reduced into one using some function (i.e. 'sum', 'min', 'max')
        tileset = cti.make_tiles_by_binning(
            sc,
            entries,
            args.position.split(','),
            args.max_zoom,
            args.value_field,
            args.importance_field,
            bins_per_dimension=args.bins_per_dimension,
            resolution=args.resolution)

    all_tiles = tileset['tiles']

    if args.elasticsearch_nodes is not None:
        # save the tiles to an elasticsearch database
        save_tile_to_elasticsearch = ft.partial(
            cst.save_tile_to_elasticsearch,
            elasticsearch_nodes=args.elasticsearch_nodes,
            elasticsearch_path=args.elasticsearch_path,
            print_status=args.print_status)

        (all_tiles.map(lambda x: {
            "tile_id": ".".join(map(str, x[0])),
            "tile_value": x[1]
        }).foreachPartition(save_tile_to_elasticsearch))

        dataset_info = cdd.describe_dataset(sys.argv, args)
        print("saving tileset_info to:", args.elasticsearch_path)
        (sc.parallelize([{
            "tile_value": tileset['tileset_info'],
            "tile_id": "tileset_info"
        }]).foreachPartition(save_tile_to_elasticsearch))

        (sc.parallelize([{
            "tile_value": dataset_info,
            "tile_id": "dataset_info"
        }]).foreachPartition(save_tile_to_elasticsearch))

        if 'histogram' in tileset:
            histogram_rdd = sc.parallelize([{
                "tile_value": tileset['histogram'],
                "tile_id": "histogram"
            }])

            histogram_rdd.foreachPartition(save_tile_to_elasticsearch)
    else:
        # dump tiles to a directory structure
        all_tiles.foreach(
            ft.partial(cst.save_tile,
                       output_dir=args.output_dir,
                       gzip_output=args.gzip))

        dataset_info = cdd.describe_dataset(sys.argv, args)

        with open(op.join(args.output_dir, 'dataset_info'), 'w') as f:
            json.dump(
                {
                    "_source": {
                        "tile_id": "dataset_info",
                        "tile_value": dataset_info
                    }
                },
                f,
                indent=2)

        with open(op.join(args.output_dir, 'tileset_info'), 'w') as f:
            json.dump(
                {
                    "_source": {
                        "tile_id": "tileset_info",
                        "tile_value": tileset['tileset_info']
                    }
                },
                f,
                indent=2)

        if 'histogram' in tileset:
            with open(op.join(args.output_dir, 'value_histogram'), 'w') as f:
                json.dump(
                    {
                        "_source": {
                            "tile_id": "histogram",
                            "tile_value": tileset['histogram']
                        }
                    },
                    f,
                    indent=2)
Example 14
 def POS_ABS(self, CHROM, POS):
     chrom_info = nc.get_chrominfo('hg38')
     return nc.chr_pos_to_genome_pos('chr'+CHROM, POS, chrom_info)
def bigwigs_to_zarr(input_bigwig_files, output_file, starting_resolution,
                    name):
    # Short-hand for creating a DirectoryStore with a root group.
    f = zarr.open(output_file, mode='w')
    compressor = Zlib(level=1)

    num_samples = len(input_bigwig_files)

    # Create level zero groups
    chromosomes_group = f.create_group("chromosomes")

    # Prepare to fill in chroms dataset
    chromosomes = nc.get_chromorder('hg38')
    chromosomes = [str(chr_name) for chr_name in chromosomes[:25]
                   ]  # TODO: should more than chr1-chrM be used?
    num_chromosomes = len(chromosomes)
    chroms_length_arr = np.array(
        [nc.get_chrominfo('hg38').chrom_lengths[x] for x in chromosomes],
        dtype="i8")
    chroms_cumsum_arr = np.concatenate(
        (np.array([0]), np.cumsum(chroms_length_arr)))

    chromosomes_set = set(chromosomes)
    chrom_name_to_length = dict(zip(chromosomes, chroms_length_arr))
    chrom_name_to_cumsum = dict(zip(chromosomes, chroms_cumsum_arr))

    # Prepare to fill in resolutions dataset
    resolutions = [starting_resolution * (2**x) for x in range(16)]

    # Create each chromosome dataset.
    for chr_name, chr_len in chrom_name_to_length.items():
        chr_group = chromosomes_group.create_group(chr_name)
        # Create each resolution group.
        for resolution in resolutions:
            chr_shape = (num_samples, math.ceil(chr_len / resolution))
            chr_group.create_dataset(str(resolution),
                                     shape=chr_shape,
                                     dtype="f4",
                                     fill_value=np.nan,
                                     compressor=compressor)

    # Fill in data for each bigwig file.
    for bw_index, bw_file in tqdm(list(enumerate(input_bigwig_files)),
                                  desc='bigwigs'):
        if bbi.is_bigwig(bw_file):
            chromsizes = bbi.chromsizes(bw_file)
            matching_chromosomes = set(
                chromsizes.keys()).intersection(chromosomes_set)

            # Fill in data for each resolution of a bigwig file.
            for resolution in resolutions:
                # Fill in data for each chromosome of a resolution of a bigwig file.
                for chr_name in matching_chromosomes:
                    chr_len = chrom_name_to_length[chr_name]
                    chr_shape = (num_samples, math.ceil(chr_len / resolution))
                    arr = bbi.fetch(bw_file,
                                    chr_name,
                                    0,
                                    chr_len,
                                    chr_shape[1],
                                    summary="sum")
                    chromosomes_group[chr_name][str(resolution)][
                        bw_index, :] = arr
        else:
            print(f"{bw_file} not is_bigwig")

    max_mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    print(max_mem)

    # Append metadata to the top resolution row_infos attribute.
    row_infos = []
    for bw_index, bw_file in enumerate(input_bigwig_files):
        row_infos.append({
            "cluster": int(bw_index + 1),
            "file": os.path.basename(bw_file)
        })

    # f.attrs should contain all tileset_info properties
    # For zarr, more attributes are used here to allow "serverless"
    f.attrs['row_infos'] = row_infos
    f.attrs['resolutions'] = sorted(resolutions, reverse=True)
    f.attrs['shape'] = [num_samples, 256]
    f.attrs['name'] = name
    f.attrs['coordSystem'] = "hg38"

    # https://github.com/zarr-developers/zarr-specs/issues/50
    f.attrs['multiscales'] = [{
        "version":
        "0.1",
        "name":
        chr_name,
        "datasets": [{
            "path": f"chromosomes/{chr_name}/{resolution}"
        } for resolution in sorted(resolutions, reverse=True)],
        "type":
        "zarr-multivec",
        "metadata": {
            "chromoffset": int(chrom_name_to_cumsum[chr_name]),
            "chromsize": int(chr_len),
        }
    } for (chr_name, chr_len) in list(zip(chromosomes, chroms_length_arr))]
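
A read-back sketch for the zarr store created above; the path is hypothetical and the attributes match what the function writes:

import zarr

z = zarr.open('samples.multivec.zarr', mode='r')      # hypothetical output path
coarsest = z.attrs['resolutions'][0]                  # resolutions are stored coarsest-first
chr1 = z['chromosomes']['chr1'][str(coarsest)]        # shape: (num_samples, num_bins)
print(chr1.shape)
print(z.attrs['row_infos'][0], z.attrs['coordSystem'])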
Example 16
def _bedgraph(filepath, output_file, assembly, chrom_col, from_pos_col,
              to_pos_col, value_col, has_header, chromosome, tile_size,
              chunk_size, method, nan_value, transform, count_nan,
              closed_interval, chromsizes_filename, zoom_step):
    last_end = 0
    data = []

    if output_file is None:
        output_file = op.splitext(filepath)[0] + '.hitile'

    print("output file:", output_file)

    # Overwrite the output file if it exists
    if op.exists(output_file):
        os.remove(output_file)
    f = h5py.File(output_file, 'w')

    # get the information about the chromosomes in this assembly
    if chromsizes_filename is not None:
        chrom_info = nc.get_chrominfo_from_file(chromsizes_filename)
        chrom_order = [
            a.encode('utf-8')
            for a in nc.get_chromorder_from_file(chromsizes_filename)
        ]
        chrom_sizes = nc.get_chromsizes_from_file(chromsizes_filename)
    else:
        chrom_info = nc.get_chrominfo(assembly)
        chrom_order = [a.encode('utf-8') for a in nc.get_chromorder(assembly)]
        chrom_sizes = nc.get_chromsizes(assembly)

    assembly_size = chrom_info.total_length
    print('assembly_size:', assembly_size)

    tile_size = tile_size
    chunk_size = tile_size * 2**chunk_size  # how many values to read in at once while tiling

    dsets = []  # data sets at each zoom level
    nan_dsets = []  # store nan values

    # initialize the arrays which will store the values at each stored zoom level
    z = 0
    positions = []  # store where we are at the current dataset
    data_buffers = [[]]
    nan_data_buffers = [[]]

    while assembly_size / 2**z > tile_size:
        dset_length = math.ceil(assembly_size / 2**z)
        dsets += [
            f.create_dataset('values_' + str(z), (dset_length, ),
                             dtype='f',
                             compression='gzip')
        ]
        nan_dsets += [
            f.create_dataset('nan_values_' + str(z), (dset_length, ),
                             dtype='f',
                             compression='gzip')
        ]

        data_buffers += [[]]
        nan_data_buffers += [[]]

        positions += [0]
        z += zoom_step

    #print("dsets[0][-10:]", dsets[0][-10:])

    # load the bigWig file
    #print("filepath:", filepath)

    # store some meta data
    d = f.create_dataset('meta', (1, ), dtype='f')

    print("assembly:", assembly)
    #print("chrom_info:", nc.get_chromorder(assembly))

    d.attrs['zoom-step'] = zoom_step
    d.attrs['max-length'] = assembly_size
    d.attrs['assembly'] = assembly
    d.attrs['chrom-names'] = chrom_order
    d.attrs['chrom-sizes'] = chrom_sizes
    d.attrs['chrom-order'] = chrom_order
    d.attrs['tile-size'] = tile_size
    d.attrs['max-zoom'] = max_zoom = math.ceil(
        math.log(d.attrs['max-length'] / tile_size) / math.log(2))
    d.attrs['max-width'] = tile_size * 2**max_zoom
    d.attrs['max-position'] = 0

    print("assembly size (max-length)", d.attrs['max-length'])
    print("max-width", d.attrs['max-width'])
    print("max_zoom:", d.attrs['max-zoom'])
    print("chunk-size:", chunk_size)
    print("chrom-order", d.attrs['chrom-order'])

    t1 = time.time()

    # are we reading the input from stdin or from a file?
    # use a separate handle so we don't shadow the open hdf5 file `f`
    if filepath == '-':
        infile = sys.stdin
    else:
        if filepath.endswith('.gz'):
            import gzip
            infile = gzip.open(filepath, 'rt')
        else:
            infile = open(filepath, 'r')

    curr_zoom = 0

    def add_values_to_data_buffers(buffers_to_add, nan_buffers_to_add):
        curr_zoom = 0

        data_buffers[0] += buffers_to_add
        nan_data_buffers[0] += nan_buffers_to_add

        curr_time = time.time() - t1
        percent_progress = (positions[curr_zoom] + 1) / float(assembly_size)
        print(
            "position: {} progress: {:.2f} elapsed: {:.2f} remaining: {:.2f}".
            format(positions[curr_zoom] + 1, percent_progress, curr_time,
                   curr_time / (percent_progress) - curr_time))

        while len(data_buffers[curr_zoom]) >= chunk_size:
            # get the current chunk and store it, converting nans to 0
            print("len(data_buffers[curr_zoom])", len(data_buffers[curr_zoom]))
            curr_chunk = np.array(data_buffers[curr_zoom][:chunk_size])
            nan_curr_chunk = np.array(nan_data_buffers[curr_zoom][:chunk_size])
            #curr_chunk[np.isnan(curr_chunk)] = 0
            '''
            print("1cc:", sum(curr_chunk))
            print("1db:", data_buffers[curr_zoom][:chunk_size])
            print("1curr_chunk:", nan_curr_chunk)
            '''
            print("positions[curr_zoom]:", positions[curr_zoom])

            dsets[curr_zoom][positions[curr_zoom]:positions[curr_zoom] +
                             chunk_size] = curr_chunk
            nan_dsets[curr_zoom][positions[curr_zoom]:positions[curr_zoom] +
                                 chunk_size] = nan_curr_chunk

            # aggregate nan values
            #nan_curr_chunk[np.isnan(curr_chunk)] = 0
            #print("1na_cc:", sum(nan_curr_chunk))

            # aggregate and store aggregated values in the next zoom_level's data
            data_buffers[curr_zoom + 1] += list(
                ct.aggregate(curr_chunk, 2**zoom_step))
            nan_data_buffers[curr_zoom + 1] += list(
                ct.aggregate(nan_curr_chunk, 2**zoom_step))

            data_buffers[curr_zoom] = data_buffers[curr_zoom][chunk_size:]
            nan_data_buffers[curr_zoom] = nan_data_buffers[curr_zoom][
                chunk_size:]

            data = data_buffers[curr_zoom + 1]
            nan_data = nan_data_buffers[curr_zoom + 1]

            # do the same for the nan values buffers

            positions[curr_zoom] += chunk_size
            curr_zoom += 1

            if curr_zoom * zoom_step >= max_zoom:
                break

    values = []
    nan_values = []

    if has_header:
        infile.readline()

    # the genome position up to which we've filled in values
    curr_genome_pos = 0

    # keep track of the previous value so that we can use it to fill in NAN values
    prev_value = 0

    for line in infile:
        # each line should indicate a chromosome, start position and end position
        parts = line.strip().split()

        start_genome_pos = chrom_info.cum_chrom_lengths[parts[
            chrom_col - 1]] + int(parts[from_pos_col - 1])
        #print("len(values):", len(values), curr_genome_pos, start_genome_pos)
        #print("line:", line)

        if start_genome_pos - curr_genome_pos > 1:
            values += [np.nan] * (start_genome_pos - curr_genome_pos - 1)
            nan_values += [1] * (start_genome_pos - curr_genome_pos - 1)

            curr_genome_pos += (start_genome_pos - curr_genome_pos - 1)

        # count how many nan values there are in the dataset
        nan_count = 1 if parts[value_col - 1] == nan_value else 0

        # if the provided values are log2 transformed, we have to un-transform them
        if transform == 'exp2':
            value = 2**float(
                parts[value_col -
                      1]) if not parts[value_col - 1] == nan_value else np.nan
        else:
            value = float(
                parts[value_col -
                      1]) if not parts[value_col - 1] == nan_value else np.nan

        # print("pos:", int(parts[to_pos_col-1]) - int(parts[from_pos_col-1]))
        # we're going to add as many values as are specified in the bedgraph line
        values_to_add = [value] * (int(parts[to_pos_col - 1]) -
                                   int(parts[from_pos_col - 1]))
        nan_counts_to_add = [nan_count] * (int(parts[to_pos_col - 1]) -
                                           int(parts[from_pos_col - 1]))

        if closed_interval:
            values_to_add += [value]
            nan_counts_to_add += [nan_count]

        # print("values_to_add", values_to_add)

        values += values_to_add
        nan_values += nan_counts_to_add

        d.attrs['max-position'] = start_genome_pos + len(values_to_add)

        #print("values:", values[:30])

        curr_genome_pos += len(values_to_add)

        while len(values) > chunk_size:
            print("len(values):", len(values), chunk_size)
            print("line:", line)
            add_values_to_data_buffers(values[:chunk_size],
                                       nan_values[:chunk_size])
            values = values[chunk_size:]
            nan_values = nan_values[chunk_size:]

    add_values_to_data_buffers(values, nan_values)

    # store the remaining data
    while True:
        # get the current chunk and store it
        chunk_size = len(data_buffers[curr_zoom])
        curr_chunk = np.array(data_buffers[curr_zoom][:chunk_size])
        nan_curr_chunk = np.array(nan_data_buffers[curr_zoom][:chunk_size])
        '''
        print("2curr_chunk", curr_chunk)
        print("2curr_zoom:", curr_zoom)
        print("2db", data_buffers[curr_zoom][:100])
        '''

        dsets[curr_zoom][positions[curr_zoom]:positions[curr_zoom] +
                         chunk_size] = curr_chunk
        nan_dsets[curr_zoom][positions[curr_zoom]:positions[curr_zoom] +
                             chunk_size] = nan_curr_chunk

        #print("chunk_size:", chunk_size, "len(curr_chunk):", len(curr_chunk), "len(nan_curr_chunk)", len(nan_curr_chunk))

        # aggregate and store aggregated values in the next zoom_level's data
        data_buffers[curr_zoom + 1] += list(
            ct.aggregate(curr_chunk, 2**zoom_step))
        nan_data_buffers[curr_zoom + 1] += list(
            ct.aggregate(nan_curr_chunk, 2**zoom_step))

        data_buffers[curr_zoom] = data_buffers[curr_zoom][chunk_size:]
        nan_data_buffers[curr_zoom] = nan_data_buffers[curr_zoom][chunk_size:]

        data = data_buffers[curr_zoom + 1]
        nan_data = nan_data_buffers[curr_zoom + 1]

        positions[curr_zoom] += chunk_size
        curr_zoom += 1

        # we've created enough tile levels to cover the entire maximum width
        if curr_zoom * zoom_step >= max_zoom:
            break
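To make the multi-zoom layout above concrete, here is a toy stand-in for the binning step (this is not the library's ct.aggregate, just an illustration of how each zoom level collapses 2**zoom_step adjacent positions into one bin):

import numpy as np

def aggregate_sketch(values, num_to_agg):
    # sum consecutive groups of num_to_agg values; assumes the length is divisible
    values = np.asarray(values, dtype=float)
    return values.reshape(-1, num_to_agg).sum(axis=1)

print(aggregate_sketch([1, 1, 2, 2, 3, 3, 4, 4], 2))  # -> [2. 4. 6. 8.]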
Esempio n. 17
def _bedpe(filepath,
           output_file,
           assembly,
           importance_column,
           has_header,
           max_per_tile,
           tile_size,
           max_zoom=None,
           chromosome=None,
           chr1_col=0,
           from1_col=1,
           to1_col=2,
           chr2_col=3,
           from2_col=4,
           to2_col=5):
    print('output_file:', output_file)

    if filepath.endswith('.gz'):
        print("gzip")
        f = gzip.open(filepath, 'rt')
    else:
        print("plain")
        f = open(filepath, 'r')

    if output_file is None:
        output_file = filepath + ".multires.db"

    if op.exists(output_file):
        os.remove(output_file)

    def line_to_dict(line):
        parts = line.split()
        d = {}
        try:
            d['xs'] = [
                nc.chr_pos_to_genome_pos(parts[chr1_col],
                                         int(parts[from1_col]), assembly),
                nc.chr_pos_to_genome_pos(parts[chr1_col], int(parts[to1_col]),
                                         assembly)
            ]
            d['ys'] = [
                nc.chr_pos_to_genome_pos(parts[chr2_col],
                                         int(parts[from2_col]), assembly),
                nc.chr_pos_to_genome_pos(parts[chr2_col], int(parts[to2_col]),
                                         assembly)
            ]
        except KeyError:
            error_str = (
                "ERROR converting chromosome position to genome position. "
                "Please make sure you've specified the correct assembly "
                "using the --assembly option. "
                "Current assembly: {}, chromosomes: {},{}".format(
                    assembly, parts[chr1_col], parts[chr2_col]))
            raise KeyError(error_str)

        d['uid'] = slugid.nice().decode('utf-8')

        d['chrOffset'] = d['xs'][0] - int(parts[from1_col])

        if importance_column is None:
            d['importance'] = max(d['xs'][1] - d['xs'][0],
                                  d['ys'][1] - d['ys'][0])
        elif importance_column == 'random':
            d['importance'] = random.random()
        else:
            d['importance'] = float(d[importance_column])

        d['fields'] = line

        return d

    entries = []

    if has_header:
        f.readline()
    else:
        first_line = f.readline().strip()
        try:
            parts = first_line.split()
            '''
            print("chr1_col", chr1_col, "chr2_col", chr2_col, 
                  "from1_col:", from1_col, "from2_col", from2_col, 
                  "to1_col", to1_col, "to2_col", to2_col)
            '''

            pos = int(parts[from1_col])
            pos = int(parts[to1_col])
            pos = int(parts[from2_col])
            pos = int(parts[to2_col])
        except ValueError:
            error_str = ("Couldn't convert one of the bedpe coordinates to an "
                         "integer. If the input file contains a header, make "
                         "sure to indicate that with the --has-header option. "
                         "Line: {}".format(first_line))
            raise ValueError(error_str)
        entries = [line_to_dict(first_line)]

    entries += [line_to_dict(line.strip()) for line in f]

    # We need chromosome information as well as the assembly size to properly
    # tile this data
    chrom_info = nc.get_chrominfo(assembly)
    assembly_size = chrom_info.total_length + 1
    # The caller may pass a max_zoom to cap the viewable zoom level; the value
    # computed here is the zoom level needed to cover the whole assembly.
    requested_max_zoom = max_zoom
    max_zoom = int(math.ceil(
        math.log(assembly_size / tile_size) / math.log(2)))

    # this script stores data in a sqlite database
    sqlite3.register_adapter(np.int64, lambda val: int(val))
    conn = sqlite3.connect(output_file)

    # store some meta data
    store_meta_data(conn,
                    1,
                    max_length=assembly_size,
                    assembly=assembly,
                    chrom_names=nc.get_chromorder(assembly),
                    chrom_sizes=nc.get_chromsizes(assembly),
                    tile_size=tile_size,
                    max_zoom=max_zoom,
                    max_width=tile_size * 2**max_zoom)

    max_width = tile_size * 2**max_zoom
    uid_to_entry = {}

    c = conn.cursor()
    c.execute('''
    CREATE TABLE intervals
    (
        id int PRIMARY KEY,
        zoomLevel int,
        importance real,
        fromX int,
        toX int,
        fromY int,
        toY int,
        chrOffset int,
        uid text,
        fields text
    )
    ''')

    print("creating rtree")
    c.execute('''
        CREATE VIRTUAL TABLE position_index USING rtree(
            id,
            rFromX, rToX,
            rFromY, rToY
        )
        ''')

    curr_zoom = 0
    counter = 0

    max_viewable_zoom = max_zoom

    if requested_max_zoom is not None and requested_max_zoom < max_zoom:
        max_viewable_zoom = requested_max_zoom

    tile_counts = col.defaultdict(
        lambda: col.defaultdict(lambda: col.defaultdict(int)))
    entries = sorted(entries, key=lambda x: -x['importance'])

    counter = 0
    for d in entries:
        curr_zoom = 0

        while curr_zoom <= max_zoom:
            tile_width = tile_size * 2**(max_zoom - curr_zoom)
            #print("d:", d)
            tile_from = list(
                map(lambda x: x / tile_width, [d['xs'][0], d['ys'][0]]))
            tile_to = list(
                map(lambda x: x / tile_width, [d['xs'][1], d['ys'][1]]))

            empty_tiles = True

            # go through and check if any of the tiles at this zoom level are full

            for i in range(int(tile_from[0]), int(tile_to[0]) + 1):
                if not empty_tiles:
                    break

                for j in range(int(tile_from[1]), int(tile_to[1]) + 1):
                    if tile_counts[curr_zoom][i][j] > max_per_tile:

                        empty_tiles = False
                        break

            if empty_tiles:
                # none of the covered tiles are full yet, so add this interval
                # at this zoom level
                for i in range(int(tile_from[0]), int(tile_to[0]) + 1):
                    for j in range(int(tile_from[1]), int(tile_to[1]) + 1):
                        tile_counts[curr_zoom][i][j] += 1

                #print("adding:", curr_zoom, d)
                exec_statement = 'INSERT INTO intervals VALUES (?,?,?,?,?,?,?,?,?,?)'
                ret = c.execute(
                    exec_statement,
                    (counter, curr_zoom, d['importance'], d['xs'][0],
                     d['xs'][1], d['ys'][0], d['ys'][1], d['chrOffset'],
                     d['uid'], d['fields']))
                conn.commit()

                exec_statement = 'INSERT INTO position_index VALUES (?,?,?,?,?)'
                ret = c.execute(
                    exec_statement,
                    (counter, d['xs'][0], d['xs'][1], d['ys'][0], d['ys'][1]
                     )  #add counter as a primary key
                )
                conn.commit()

                counter += 1
                break

            curr_zoom += 1

    return
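A minimal query sketch (the database path and tile extent below are hypothetical) showing how the intervals and position_index tables written above could be used to fetch everything overlapping one tile:

import sqlite3

conn = sqlite3.connect("interactions.multires.db")
c = conn.cursor()
x_start, x_end = 0, 1_000_000
y_start, y_end = 0, 1_000_000
rows = c.execute(
    "SELECT i.zoomLevel, i.fromX, i.toX, i.fromY, i.toY, i.fields "
    "FROM intervals i JOIN position_index p ON i.id = p.id "
    "WHERE p.rFromX <= ? AND p.rToX >= ? AND p.rFromY <= ? AND p.rToY >= ?",
    (x_end, x_start, y_end, y_start)).fetchall()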
Esempio n. 18
def test_clodius_aggregate_bedgraph1():
    input_file = op.join(testdir, 'sample_data', 'dm3_values.tsv')
    output_file = '/tmp/dm3_values.hitile'

    runner = clt.CliRunner()
    result = runner.invoke(
        cca.bedgraph,
        [input_file, '--output-file', output_file, '--assembly', 'dm3'])

    a, b, tb = result.exc_info
    """
    print("exc_info:", result.exc_info)
    print("result:", result)
    print("result.output", result.output)
    print("result.error", traceback.print_tb(tb))
    print("Exception:", a,b)
    """

    # print("result.output", result.output)

    f = h5py.File('/tmp/dm3_values.hitile', 'r')
    # max_zoom = f['meta'].attrs['max-zoom']
    # TODO: Make assertions about result
    values = f['values_0']

    import numpy as np
    # print("values:", values[8])
    # genome positions are 0 based as stored in hitile files
    assert (np.isnan(values[8]))
    assert (values[9] == 1)
    assert (values[10] == 1)
    assert (values[13] == 1)
    assert (np.isnan(values[14]))
    assert (np.isnan(values[15]))

    chrom_info = nc.get_chrominfo('dm3')
    chr_2r_pos = nc.chr_pos_to_genome_pos('chr2R', 0, chrom_info)
    # print('chr_2r_pos:', chr_2r_pos)

    assert (np.isnan(values[chr_2r_pos + 28]))
    assert (values[chr_2r_pos + 29] == 77)
    assert (values[chr_2r_pos + 38] == 77)
    assert (values[chr_2r_pos + 39] == 0)

    assert (result.exit_code == 0)

    d = cht.get_data(f, 0, 0)
    # print("d[:10]", d[:10])
    # print("sum(d):", sum([x for x in d if not np.isnan(x)]))
    assert (np.nansum(d) > 1.0 and np.nansum(d) < 10.0)

    # NOTE: this early return skips the second half of the test below
    return

    input_file = op.join(testdir, 'sample_data', 'test3chroms_values.tsv')
    output_file = '/tmp/test3chroms_values.hitile'

    runner = clt.CliRunner()
    result = runner.invoke(cca.bedgraph, [
        input_file, '--output-file', output_file, '--assembly', 'test3chroms'
    ])

    # print('output:', result.output, result)

    f = h5py.File('/tmp/test3chroms_values.hitile', 'r')
    # f['meta'].attrs['max-zoom']
    # TODO: Make assertions about result

    # print('max_zoom:', max_zoom)
    # print("len", len(f['values_0']))

    values = f['values_0']

    # print('values', values[:100])

    # genome positions are 0 based as stored in hitile files
    assert (values[8] == 0)
    assert (values[9] == 1)
    assert (values[10] == 1)
    assert (values[13] == 1)
    assert (values[14] == 0)
    assert (values[15] == 0)

    chr2_pos = nc.chr_pos_to_genome_pos('chr2', 0, 'test3chroms')

    assert (values[chr2_pos + 28] == 0)
    assert (values[chr2_pos + 29] == 77)
    assert (values[chr2_pos + 38] == 77)
    assert (values[chr2_pos + 39] == 0)

    assert (result.exit_code == 0)

    d = cht.get_data(f, 0, 0)
    assert (sum(d) == 770 + 880 + 5)
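A small follow-up sketch (paths as in the first half of the test; the relationship described is an assumption based on the aggregator above): 'values_0' stores base-pair values and 'values_{zoom-step}' stores the same signal aggregated into bins of 2**zoom-step positions:

import h5py

with h5py.File('/tmp/dm3_values.hitile', 'r') as f:
    zoom_step = int(f['meta'].attrs['zoom-step'])
    base = f['values_0'][:2 ** zoom_step]
    coarse_bin = f['values_' + str(zoom_step)][0]
    print(base, coarse_bin)  # the first coarse bin covers the first 2**zoom_step base positions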
Esempio n. 19
    def __init__(self,
                 f,
                 profile_paths,
                 assembly='hg38',
                 starting_resolution=5000,
                 name="Genomic Profiles"):
        """
    Constructor method

    :param f: The opened Zarr store object.
    :type f: zarr.Group
    :param list[list[str]] profile_paths: A list of cell set paths, one path for each profile.
    :param str assembly: The genome assembly to use for chromosome lengths, passed to negspy. By default, 'hg38'.
    :param int starting_resolution: The starting resolution. By default, 5000.
    :param str name: The name for this set of profiles. By default, 'Genomic Profiles'.
    """

        self.f = f

        num_profiles = len(profile_paths)

        compressor = 'default'

        chromosomes = [
            str(chr_name) for chr_name in nc.get_chromorder(assembly)[:25]
        ]  # TODO: should more than chr1-chrM be used?
        num_chromosomes = len(chromosomes)
        chroms_length_arr = np.array(
            [nc.get_chrominfo(assembly).chrom_lengths[x] for x in chromosomes],
            dtype="i8")
        chroms_cumsum_arr = np.concatenate(
            (np.array([0]), np.cumsum(chroms_length_arr)))

        chromosomes_set = set(chromosomes)
        chrom_name_to_length = dict(zip(chromosomes, chroms_length_arr))
        chrom_name_to_cumsum = dict(zip(chromosomes, chroms_cumsum_arr))

        # Prepare to fill in resolutions datasets.
        resolutions = [starting_resolution * (2**x) for x in range(16)]

        chromosomes_group = f.create_group("chromosomes")
        for chr_name, chr_len in chrom_name_to_length.items():
            chr_group = chromosomes_group.create_group(chr_name)
            # Create each resolution group.
            for resolution in resolutions:
                chr_shape = (num_profiles, math.ceil(chr_len / resolution))
                chr_group.create_dataset(str(resolution),
                                         shape=chr_shape,
                                         dtype="f4",
                                         fill_value=np.nan,
                                         compressor=compressor)

        # f.attrs should contain the properties required for HiGlass's "tileset_info" requests.
        f.attrs['row_infos'] = [{
            "path": profile_path
        } for profile_path in profile_paths]
        f.attrs['resolutions'] = sorted(resolutions, reverse=True)
        f.attrs['shape'] = [num_profiles, 256]
        f.attrs['name'] = name
        f.attrs['coordSystem'] = assembly

        self.resolutions = resolutions
        self.chromosomes = chromosomes
        self.chromosomes_group = chromosomes_group
        self.chrom_name_to_length = chrom_name_to_length
        self.num_profiles = num_profiles

        # https://github.com/zarr-developers/zarr-specs/issues/50
        f.attrs['multiscales'] = [{
            "version": "0.1",
            "name": chr_name,
            "datasets": [{
                "path": f"chromosomes/{chr_name}/{resolution}"
            } for resolution in sorted(resolutions, reverse=True)],
            "type": "zarr-multivec",
            "metadata": {
                "chromoffset": int(chrom_name_to_cumsum[chr_name]),
                "chromsize": int(chr_len),
            }
        } for (chr_name, chr_len) in list(zip(chromosomes, chroms_length_arr))]
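A usage sketch, assuming this __init__ belongs to a class named GenomicProfiles (the class name is not shown in this excerpt) and that the store path and profile paths below are hypothetical:

import zarr

store = zarr.open("profiles.zarr", mode="w")
profiles = GenomicProfiles(
    store,
    profile_paths=[["clusters", "cluster-1"], ["clusters", "cluster-2"]],
    assembly="hg38",
    starting_resolution=5000,
)
# values can then be written per profile and chromosome at the starting resolution,
# e.g. store["chromosomes/chr1/5000"][0, :] = ...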
Esempio n. 20
def main():
    parser = argparse.ArgumentParser(description="""
    
    python chr_pos_to_genome_pos.py -t 1,2:3,4

    Convert chromosome,position pairs to genome_positions. Assumes that the
    coordinates refer to the hg19 assembly (unless otherwise specified).

    Example:

    2       NM_000014       chr12   -       9220303 9268825

    -> python scripts/chr_pos_to_genome_pos.py -c 3:5,3:6

    2       NM_000014       genome  -       2115405269      2115453791

    --------------------------------

    This also works with space-delimited fields:

    chr5    56765,56766

    ->python scripts/chr_pos_to_genome_pos.py -c 1:2

    genome  881683465,881683466

""")

    parser.add_argument('-a', '--assembly', default='hg19')
    parser.add_argument('-s', '--chromsizes-file', default=None)
    parser.add_argument('-n', '--new-chrom', default=None)
    parser.add_argument('-c', '--columns', default='1,2', 
            help="Which columns to translate to genome positions. "
            "Column pairs should be 1-based and separated by colons")

    #parser.add_argument('-u', '--useless', action='store_true', 
    #                     help='Another useless option')
    args = parser.parse_args()

    if args.chromsizes_file is not None:
        chrom_info = nc.get_chrominfo_from_file(args.chromsizes_file)
    else:
        chrom_info = nc.get_chrominfo(args.assembly)

    for line in sys.stdin:
        try:
            line_output = []
            line_parts = line.strip().split()
            translated_positions = {}
            translated_chroms = {}

            for translate_pair in [[int(y) for y in x.split(':')] for x in args.columns.split(',')]:
                # go through the pairs of columns that need to be translated to genome position
                # assume that the position column is comma separated list of values (although it doesn't
                # actually need to be)
                chrom, poss = (line_parts[translate_pair[0] - 1],
                               line_parts[translate_pair[1] - 1].strip(",").split(','))
                genome_pos = ",".join(
                    map(str, [nc.chr_pos_to_genome_pos(chrom, int(pos), chrom_info)
                              for pos in poss]))
                #line_output += [genome_pos]

                # note that we've translated these columns and shouldn't include them in the output
                translated_positions[translate_pair[1]-1] = genome_pos
                translated_chroms[translate_pair[0]-1] = chrom

            for i,part in enumerate(line_parts):
                if i in translated_chroms:
                    # replace chromosome identifiers (e.g. 'chr1') with 'genome' to indicate the positions
                    if args.new_chrom is None:
                        line_output += ['genome({})'.format(translated_chroms[i])]
                    else:
                        line_output += [args.new_chrom]
                elif i in translated_positions:
                    # this column used to contain a position so we need to replace it with a translated
                    # position
                    line_output += [translated_positions[i]]
                else:
                    # if this column didn't contain a translated position output it as is
                    line_output += [part]

            try:
                print("\t".join(map(str, line_output)))
            except BrokenPipeError:
                # Output is probably being run through "head" or something similar
                break
        except KeyError as ke:
            print("KeyError:", ke, line.strip(), file=sys.stderr)