Ejemplo n.º 1
0
def pos(argv):
    """Print the interpolated 3D position of every leg in a LEG file.

    Command-line entry point. ``argv[0]`` is skipped; the remaining items are
    parsed with getopt:

      -l <in.leg>   LEG file whose legs are converted to positions (required)
      -O            print ``None`` for legs reported as out-of-bound

    The single positional argument is the 3DG structure file. For each line
    of the LEG file one line is written to stdout: the tab-separated
    coordinates returned by ``interpolate_leg``, or the literal ``None`` when
    no position is available (or the leg is out-of-bound and ``-O`` is set).

    Returns 0 on success and 1 on any usage error.
    """
    # default parameters
    leg_file_name = None
    in_only = False

    # read arguments
    try:
        opts, args = getopt.getopt(argv[1:], "l:O")
    except getopt.GetoptError:
        sys.stderr.write("[E::" + __name__ + "] unknown command\n")
        return 1
    if not args:
        for usage_line in (
                "Usage: dip-c pos [options] -l <in.leg> <in.3dg>\n",
                "Options:\n",
                "  -l <in.leg>    LEG file to convert to 3D positions (required)\n",
                "  -O             exclude out-of-bound legs\n",
        ):
            sys.stderr.write(usage_line)
        return 1
    for o, a in opts:
        if o == "-l":
            leg_file_name = a
        if o == "-O":
            in_only = True
    if leg_file_name is None:
        sys.stderr.write("[E::" + __name__ + "] -l is required\n")
        return 1

    # read 3DG file; the handle is released as soon as parsing is done
    # NOTE(review): assumes file_to_g3d_data consumes the file eagerly
    with open(args[0], "rb") as g3d_file:
        g3d_data = file_to_g3d_data(g3d_file)
    g3d_data.sort_g3d_particles()
    g3d_resolution = g3d_data.resolution()
    sys.stderr.write(
        "[M::" + __name__ + "] read a 3D structure with " +
        str(g3d_data.num_g3d_particles()) + " particles at " +
        ("N.A." if g3d_resolution is None else str(g3d_resolution)) +
        " bp resolution\n")
    g3d_data.prepare_interpolate()

    # convert LEG file to 3DG particles, one output line per input line
    with open(leg_file_name, "rb") as leg_file:
        for leg_file_line in leg_file:
            is_out, position = g3d_data.interpolate_leg(
                string_to_leg(leg_file_line.strip()))
            if position is None or (is_out and in_only):
                sys.stdout.write("None\n")
            else:
                sys.stdout.write("\t".join(map(str, position)) + "\n")

    return 0
Ejemplo n.º 2
0
def _ard_percent(part, total):
    """Return ``part`` as a percentage of ``total``, rounded to 2 decimals.

    Gives 0.0 instead of raising ZeroDivisionError when ``total`` is zero
    (an empty contact file, or no reference point left after filtering).
    """
    if total == 0:
        return 0.0
    return round(100.0 * part / total, 2)


def _ard_open(file_name):
    """Open ``file_name`` for binary reading, through gzip if it ends in ".gz"."""
    if file_name.endswith(".gz"):
        return gzip.open(file_name, "rb")
    return open(file_name, "rb")


def _ard_read_legs(leg_file_name):
    """Parse every line of a LEG file with ``string_to_leg``; return a list."""
    with open(leg_file_name, "rb") as leg_file:
        return [
            string_to_leg(leg_file_line.strip()) for leg_file_line in leg_file
        ]


def _ard_cons_near(con_data, ref_con, max_distance, superellipse_mode):
    """Return the contacts near ``ref_con`` using the selected neighborhood."""
    if superellipse_mode:
        return con_data.get_cons_near(ref_con, max_distance)
    return con_data.get_cons_near_inf(ref_con, max_distance)


def ard(argv):
    """Analyze the contacts found around a set of reference points.

    Command-line entry point. ``argv[0]`` is skipped; the positional argument
    is the CON file (read through gzip when its name ends in ".gz").

    Regular mode (no ``-1``): reference points come from ``-c`` or from the
    input itself. Without ``-s`` only inter-chromosomal reference points are
    kept; with ``-s`` only intra-chromosomal ones at or above the given
    separation. For every reference point the nearby contacts (within ``-d``)
    are either printed relative to it, accumulated into a 2D histogram
    (``-h``, symmetrized unless ``-S``), or merely counted (``-n``,
    optionally normalized by the total number of contacts with ``-t``).

    Pairwise leg mode (``-1`` and optionally ``-2``): a matrix of nearby
    contact counts is written for every pair of reference legs; pairs that
    are filtered out keep the value -1.

    Returns 0 on success and 1 on a usage error.
    """
    # default parameters
    reference_file_name = None
    min_separation = None
    max_distance = 10000000
    grid_size = None
    is_symmetrical = True
    superellipse_mode = False
    count_mode = False
    normalize_by_num_cons = False
    leg_file_1_name = None
    leg_file_2_name = None

    # progress display parameters
    display_num_ref_cons = 1000

    # read arguments
    try:
        opts, args = getopt.getopt(argv[1:], "c:s:d:h:Sent1:2:")
    except getopt.GetoptError:
        sys.stderr.write("[E::" + __name__ + "] unknown command\n")
        return 1
    if not args:
        for usage_line in (
                "Usage: dip-c ard [options] <in.con>\n",
                "Options:\n",
                "  -c <ref.con>    contact file for reference points [<in.con> itself]\n",
                "  -s INT          only use intra-chromosomal reference points, min separation (bp) [only use inter-chromosomal] \n",
                "  -d INT          max distance (bp, L-inf norm) around reference points ["
                + str(max_distance) + "]\n",
                "  -h INT          output 2D histogram, grid size (bp) (useful for too many contacts)\n",
                "  -e              use L-1/2 norm (superellipse) instead\n",
                "  -S              does not symmetrize for \"-h\"\n\n",
                "  -n              output the number of nearby contacts for each reference point\n",
                "  -t              normalize by the total number of contacts for \"-n\"\n\n",
                "  -1 <in1.leg>    generate a pairwise count matrix between reference legs\n",
                # without "-2" the legs of "-1" are used on both axes
                "  -2 <in2.leg>    generate a pairwise count matrix between two sets of reference legs [<in1.leg>]\n",
        ):
            sys.stderr.write(usage_line)
        return 1
    for o, a in opts:
        if o == "-c":
            reference_file_name = a
        elif o == "-s":
            min_separation = int(a)
        elif o == "-d":
            max_distance = int(a)
        elif o == "-h":
            grid_size = int(a)
        elif o == "-S":
            is_symmetrical = False
        elif o == "-e":
            superellipse_mode = True
        elif o == "-n":
            count_mode = True
        elif o == "-t":
            normalize_by_num_cons = True
        elif o == "-1":
            leg_file_1_name = a
        elif o == "-2":
            leg_file_2_name = a

    # read CON file
    # NOTE(review): assumes file_to_con_data consumes the file eagerly
    with _ard_open(args[0]) as con_file:
        con_data = file_to_con_data(con_file)
    sys.stderr.write(
        "[M::" + __name__ + "] read " + str(con_data.num_cons()) +
        " contacts (" +
        str(_ard_percent(con_data.num_intra_chr(), con_data.num_cons())) +
        "% intra-chromosomal, " +
        # every contact has two legs, hence the doubled denominator
        str(_ard_percent(con_data.num_phased_legs(),
                         2 * con_data.num_cons())) + "% legs phased)\n")

    if leg_file_1_name is None:
        # regular mode
        # read reference CON file
        if reference_file_name is None:
            # use itself
            ref_con_data = copy.deepcopy(con_data)
        else:
            # open another file
            with _ard_open(reference_file_name) as ref_con_file:
                ref_con_data = file_to_con_data(ref_con_file)
        sys.stderr.write(
            "[M::" + __name__ + "] read " + str(ref_con_data.num_cons()) +
            " reference points (" + str(
                _ard_percent(ref_con_data.num_intra_chr(),
                             ref_con_data.num_cons())) +
            "% intra-chromosomal)\n")

        # keep only desired reference points
        if min_separation is None:
            # inter-chromosomal only
            ref_con_data.clean_intra_chr()
        else:
            # intra-chromosomal only, remove small separations
            ref_con_data.clean_inter_chr()
            ref_con_data.clean_separation(min_separation)
        # the filter may leave nothing; _ard_percent tolerates that
        sys.stderr.write(
            "[M::" + __name__ + "] kept " + str(ref_con_data.num_cons()) +
            " reference points (" + str(
                _ard_percent(ref_con_data.num_intra_chr(),
                             ref_con_data.num_cons())) +
            "% intra-chromosomal)\n")

        # initialize 2D histogram
        if grid_size is not None:
            # floor division keeps the bin count an integer on Python 3 too
            grid_num = 2 * max_distance // grid_size
            around_hist = np.zeros((grid_num, grid_num), dtype=int)

        # find relative positions
        con_data.sort_cons()
        num_ref_cons = 0
        for ref_con in ref_con_data.get_cons():
            num_ref_cons += 1
            if num_ref_cons % display_num_ref_cons == 0:
                sys.stderr.write("[M::" + __name__ + "] analyzed " +
                                 str(num_ref_cons) + " reference points\n")
            num_nearby_cons = 0
            for con in _ard_cons_near(con_data, ref_con, max_distance,
                                      superellipse_mode):
                num_nearby_cons += 1
                if count_mode:
                    continue
                if grid_size is None:
                    # output relative positions
                    sys.stdout.write(con.to_string_around(ref_con) + "\n")
                    continue

                # calculate histogram
                rel_locus = con.to_rel_locus_around(ref_con)
                if not is_symmetrical:
                    add_ref_locus_to_hist(around_hist,
                                          (rel_locus[0], rel_locus[1]),
                                          max_distance, grid_size)
                elif min_separation is None:
                    # inter-chromosomal: 8 copies
                    for sign_1 in [-1, 1]:
                        for sign_2 in [-1, 1]:
                            add_ref_locus_to_hist(
                                around_hist,
                                (sign_1 * rel_locus[0], sign_2 * rel_locus[1]),
                                max_distance, grid_size)
                            add_ref_locus_to_hist(
                                around_hist,
                                (sign_2 * rel_locus[1], sign_1 * rel_locus[0]),
                                max_distance, grid_size)
                else:
                    # intra-chromosomal: 2 copies
                    add_ref_locus_to_hist(around_hist,
                                          (rel_locus[0], rel_locus[1]),
                                          max_distance, grid_size)
                    add_ref_locus_to_hist(
                        around_hist, (-1 * rel_locus[1], -1 * rel_locus[0]),
                        max_distance, grid_size)
            if count_mode:
                if normalize_by_num_cons:
                    sys.stdout.write(
                        str(float(num_nearby_cons) / con_data.num_cons()) +
                        "\n")
                else:
                    sys.stdout.write(str(num_nearby_cons) + "\n")

        # output 2D histogram
        if grid_size is not None:
            sys.stderr.write("[M::" + __name__ +
                             "] writing output for 2D histogram\n")
            np.savetxt(sys.stdout, around_hist, delimiter='\t')
    else:
        # pairwise leg mode
        # read legs
        legs_1 = _ard_read_legs(leg_file_1_name)
        if leg_file_2_name is None:
            legs_2 = legs_1
        else:
            legs_2 = _ard_read_legs(leg_file_2_name)

        # initialize pairwise count matrix; -1 marks pairs never counted
        num_legs_1 = len(legs_1)
        num_legs_2 = len(legs_2)
        count_matrix = np.full([num_legs_1, num_legs_2], -1, dtype=int)

        # for each pair of legs
        num_ref_cons = 0
        for i in range(num_legs_1):
            # with a single leg set only the upper triangle is visited and
            # the result is mirrored below
            if leg_file_2_name is None:
                column_ids = range(i + 1, num_legs_2)
            else:
                column_ids = range(num_legs_2)
            for j in column_ids:
                ref_con = Con(legs_1[i], legs_2[j])
                if min_separation is None:
                    # inter-chromosomal only
                    if ref_con.is_intra_chr():
                        continue
                else:
                    # intra-chromosomal only, remove small separations
                    if (not ref_con.is_intra_chr()
                            or ref_con.separation() < min_separation):
                        continue
                num_ref_cons += 1
                if num_ref_cons % display_num_ref_cons == 0:
                    sys.stderr.write("[M::" + __name__ + "] analyzed " +
                                     str(num_ref_cons) + " reference points\n")

                # count
                num_nearby_cons = sum(
                    1 for _ in _ard_cons_near(con_data, ref_con, max_distance,
                                              superellipse_mode))
                count_matrix[i, j] = num_nearby_cons
                if leg_file_2_name is None:
                    count_matrix[j, i] = num_nearby_cons

        # write pairwise count matrix
        sys.stderr.write("[M::" + __name__ +
                         "] writing output for pairwise count matrix\n")
        np.savetxt(sys.stdout, count_matrix, fmt='%i', delimiter='\t')

    return 0
Ejemplo n.º 3
0
def color(argv):
    """Compute one scalar "color" per particle of a 3DG structure.

    Command-line entry point. ``argv[0]`` is skipped; the positional argument
    is the 3DG file. Exactly one color scheme option must be given (see the
    usage text printed when no positional argument is supplied). Optional
    post-processing: ``-s`` smooths colors with ``smooth_color`` over nearby
    particles, and ``-R`` replaces the per-particle output by average colors
    binned on the distance to the mean particle position, normalized so that
    the average distance is 1.0.

    Output (stdout): tab-delimited ``homolog, locus, color`` lines sorted by
    (homolog, locus); with ``-R``: ``radial distance, average color,
    #particles``, one line per radial bin.

    Returns 0 on success and 1 on a usage error (including a ``-D`` locus
    whose position cannot be interpolated).
    """
    # default parameters
    color_file_name = None
    color_mode = None
    max_distance = None
    smooth_distance = None
    max_separation = None
    radial_mode = False
    radial_min_num_particles = 10
    radial_missing_value = -1.0
    radial_max_r = 3.0
    radial_bin_r = 0.05

    # display parameters
    disp_num_particles = 1000

    # schemes implemented below; getopt also accepts "-m", which has no
    # implementation and is therefore rejected explicitly
    supported_color_modes = ("c", "n", "l", "L", "h", "i", "I", "d", "r", "C",
                             "D")

    # read arguments
    try:
        opts, args = getopt.getopt(
            argv[1:], "c:n:l:m:L:i:s:S:hd:r:I:CD:R",
            ["min-num=", "missing=", "max-r=", "bin-size="])
    except getopt.GetoptError:
        sys.stderr.write("[E::" + __name__ + "] unknown command\n")
        return 1
    if not args:
        for usage_line in (
                "Usage: dip-c color [options] <in.3dg>\n",
                "Options:\n",
                "  -c <color.txt>    color by a list of locus-color pairs (tab-delimited: chr, locus, color)\n",
                "  -n <chr.txt>      color by chromosome name (one chromosome per line)\n",
                "  -l <chr.len>      color by locus divided by chromosome length (tab-delimited: chr, len)\n",
                "  -L <chr.cen>      color by arm locus divided by arm length (tab-delimited: chr, len, center of centromere)\n",
                "  -h                color by distance to homologous locus\n\n",
                "  -i FLOAT          color by percentage of intra-homologous neighbors within a given distance\n",
                "  -I FLOAT          color by number of intra-homologous neighbors within a given distance\n",
                "  -S INT            (with \"-i\" or \"-I\") max separation (bp) for intra-homologous neighbors\n\n",
                "  -d FLOAT          color by homolog diversity within a given distance\n",
                "  -r FLOAT          color by homolog richness within a given distance\n\n",
                "  -C                color by distance to the nuclear center of mass\n",
                "  -D <in.leg>       color by distance to a given locus (only the first line of the LEG file will be used)\n\n",
                "  -s FLOAT          smooth color by averaging over a ball\n\n",
                "  -R                special: output average color for different radial distances (normalized to 1.0)\n",
                "  --min-num=INT     (with \"-R\") min number of particles per bin ["
                + str(radial_min_num_particles) + "]\n",
                "  --missing=FLOAT   (with \"-R\") output value when \"--min-num\" is not met ["
                + str(radial_missing_value) + "]\n",
                "  --max-r=FLOAT     (with \"-R\") max radial distance [" +
                str(radial_max_r) + "]\n",
                "  --bin-size=FLOAT  (with \"-R\") bin size of radial distances ["
                + str(radial_bin_r) + "]\n\n",
                "Output:\n",
                "  tab-delimited: homolog, locus, color\n",
                "  (with \"-R\") tab-delimited: radial distance, average color, #particles\n",
        ):
            sys.stderr.write(usage_line)
        return 1

    num_color_schemes = 0
    for o, a in opts:
        if o in ("-i", "-I", "-d", "-r"):
            # neighborhood-based schemes carry their radius as the argument
            num_color_schemes += 1
            color_mode = o[1:]
            max_distance = float(a)
        elif o == "-s":
            smooth_distance = float(a)
        elif o == "-S":
            max_separation = int(a)
        elif o == "--min-num":
            radial_min_num_particles = int(a)
        elif o == "--missing":
            radial_missing_value = float(a)
        elif o == "--max-r":
            radial_max_r = float(a)
        elif o == "--bin-size":
            radial_bin_r = float(a)
        elif o == "-R":
            radial_mode = True
        else:
            # every remaining option selects a scheme; a non-empty argument
            # is the name of its input file
            num_color_schemes += 1
            color_mode = o[1:]
            if a != "":
                color_file_name = a
    # "-S" is forwarded to both intra_hom_fraction ("-i") and
    # intra_hom_count ("-I"), so both schemes are valid companions
    if max_separation is not None and color_mode not in ("i", "I"):
        sys.stderr.write("[E::" + __name__ +
                         "] \"-S\" must be used with \"-i\" or \"-I\"\n")
        return 1
    if num_color_schemes != 1:
        sys.stderr.write("[E::" + __name__ +
                         "] exactly one color scheme is needed\n")
        return 1
    if color_mode not in supported_color_modes:
        sys.stderr.write("[E::" + __name__ + "] unsupported color scheme \"-" +
                         color_mode + "\"\n")
        return 1

    # read 3DG file
    # NOTE(review): assumes file_to_g3d_data consumes the file eagerly
    with open(args[0], "rb") as g3d_file:
        g3d_data = file_to_g3d_data(g3d_file)
    g3d_data.sort_g3d_particles()
    g3d_resolution = g3d_data.resolution()
    sys.stderr.write(
        "[M::" + __name__ + "] read a 3D structure with " +
        str(g3d_data.num_g3d_particles()) + " particles at " +
        ("N.A." if g3d_resolution is None else str(g3d_resolution)) +
        " bp resolution\n")

    def percent_of_particles(count):
        # share of all particles, safe for an empty structure
        num_particles = g3d_data.num_g3d_particles()
        if num_particles == 0:
            return 0.0
        return round(100.0 * count / num_particles, 2)

    def report_progress(verb, num_done):
        sys.stderr.write("[M::" + __name__ + "] " + verb + " " +
                         str(num_done) + " particles (" +
                         str(percent_of_particles(num_done)) + "%)\n")

    # open color file
    color_file = None
    if color_file_name is not None:
        color_file = open(color_file_name, "rb")

    # prepare
    try:
        if color_mode == "c":
            ref_name_ref_locus_colors = {}
            for color_file_line in color_file:
                ref_name, ref_locus, locus_color = (
                    color_file_line.strip().split("\t"))
                ref_name_ref_locus_colors[(ref_name,
                                           int(ref_locus))] = float(locus_color)
        elif color_mode == "n":
            # chromosomes are numbered 1, 2, ... in file order
            ref_name_colors = {}
            for color_counter, color_file_line in enumerate(color_file, 1):
                ref_name_colors[color_file_line.strip()] = color_counter
        elif color_mode == "l":
            ref_lens = {}
            for color_file_line in color_file:
                ref_name, ref_len = color_file_line.strip().split("\t")
                ref_lens[ref_name] = int(ref_len)
        elif color_mode == "L":
            ref_lens = {}
            ref_cens = {}
            for color_file_line in color_file:
                ref_name, ref_len, ref_cen = (
                    color_file_line.strip().split("\t"))
                ref_lens[ref_name] = int(ref_len)
                ref_cens[ref_name] = int(ref_cen)
        elif color_mode in ("i", "I", "d", "r"):
            g3d_data.prepare_nearby()
        elif color_mode == "C":
            hom_names, loci_np_array, position_np_array = (
                g3d_data.to_np_arrays())
            ref_pos = np.mean(position_np_array, axis=0)
            sys.stderr.write("[M::" + __name__ +
                             "] reference point (center of mass) is at (" +
                             ", ".join(map(str, ref_pos)) + ")\n")
        elif color_mode == "D":
            # find reference point position (first line of the LEG file only)
            ref_leg = string_to_leg(color_file.readline().strip())
            g3d_data.prepare_interpolate()
            is_out, ref_pos = g3d_data.interpolate_leg(ref_leg)
            if ref_pos is None:
                # interpolate_leg may yield no position; fail cleanly
                sys.stderr.write(
                    "[E::" + __name__ +
                    "] cannot interpolate the position of reference point (" +
                    ref_leg.to_string() + ")\n")
                return 1
            sys.stderr.write("[M::" + __name__ + "] reference point (" +
                             ref_leg.to_string() + ") is at (" +
                             ", ".join(map(str, ref_pos)) + ")\n")
    finally:
        if color_file is not None:
            color_file.close()

    # calculate colors for each particle; particles without a color (missing
    # key, no homolog, ...) are simply left out of color_data
    color_data = {}
    atom_id = 0
    for g3d_particle in g3d_data.get_g3d_particles():
        atom_id += 1
        if atom_id % disp_num_particles == 0:
            report_progress("analyzed", atom_id)

        # color
        if color_mode == "c":
            try:
                color_value = ref_name_ref_locus_colors[(
                    g3d_particle.get_ref_name(), g3d_particle.get_ref_locus())]
            except KeyError:
                continue
        elif color_mode == "n":
            try:
                color_value = ref_name_colors[g3d_particle.get_ref_name()]
            except KeyError:
                continue
        elif color_mode == "l":
            try:
                color_value = float(g3d_particle.get_ref_locus()) / ref_lens[
                    g3d_particle.get_ref_name()]
            except KeyError:
                continue
        elif color_mode == "L":
            try:
                arm_locus = g3d_particle.get_ref_locus() - ref_cens[
                    g3d_particle.get_ref_name()]
                if arm_locus > 0:
                    # past the centromere: remaining length of the chromosome
                    arm_len = ref_lens[g3d_particle.get_ref_name()] - ref_cens[
                        g3d_particle.get_ref_name()]
                else:
                    arm_len = ref_cens[g3d_particle.get_ref_name()]
                color_value = float(abs(arm_locus)) / arm_len
            except KeyError:
                continue
        elif color_mode == "i":
            color_value = intra_hom_fraction(
                g3d_particle,
                g3d_data.get_g3d_particles_near(g3d_particle.get_position(),
                                                max_distance), max_separation)
            if color_value is None:
                continue
        elif color_mode == "I":
            color_value = intra_hom_count(
                g3d_particle,
                g3d_data.get_g3d_particles_near(g3d_particle.get_position(),
                                                max_distance), max_separation)
        elif color_mode == "h":
            homologous_g3d_particle = (
                g3d_data.get_g3d_particle_from_hom_name_ref_locus(
                    homologous_hom_name(g3d_particle.get_hom_name()),
                    g3d_particle.get_ref_locus()))
            if homologous_g3d_particle is None:
                continue
            color_value = math.sqrt(
                (g3d_particle.get_x() - homologous_g3d_particle.get_x())**2 +
                (g3d_particle.get_y() - homologous_g3d_particle.get_y())**2 +
                (g3d_particle.get_z() - homologous_g3d_particle.get_z())**2)
        elif color_mode == "d":
            color_value = hom_diversity(
                g3d_data.get_g3d_particles_near(g3d_particle.get_position(),
                                                max_distance))
        elif color_mode == "r":
            color_value = hom_richness(
                g3d_data.get_g3d_particles_near(g3d_particle.get_position(),
                                                max_distance))
        else:
            # "C" or "D": Euclidean distance to the reference point
            color_value = math.sqrt((g3d_particle.get_x() - ref_pos[0])**2 +
                                    (g3d_particle.get_y() - ref_pos[1])**2 +
                                    (g3d_particle.get_z() - ref_pos[2])**2)
        color_data[g3d_particle.get_hom_name(),
                   g3d_particle.get_ref_locus()] = color_value

    # smoothing
    if smooth_distance is not None:
        g3d_data.prepare_nearby()
        smooth_color_data = {}
        atom_id = 0
        for g3d_particle in g3d_data.get_g3d_particles():
            atom_id += 1
            if atom_id % disp_num_particles == 0:
                report_progress("smoothed", atom_id)
            color_value = smooth_color(
                g3d_particle,
                g3d_data.get_g3d_particles_near(g3d_particle.get_position(),
                                                smooth_distance), color_data)
            if color_value is not None:
                smooth_color_data[g3d_particle.get_hom_name(),
                                  g3d_particle.get_ref_locus()] = color_value
        color_data = smooth_color_data

    # radial
    if radial_mode:
        num_radial_bins = int(radial_max_r / radial_bin_r) + 1
        radial_color_sums = [0.0] * num_radial_bins
        radial_color_nums = [0] * num_radial_bins

        # calculate center of mass, and normalization factor
        hom_names, loci_np_array, position_np_array = g3d_data.to_np_arrays()
        ref_pos = np.mean(position_np_array, axis=0)
        mean_radial = np.mean(np.sum((position_np_array - ref_pos)**2,
                                     axis=-1)**0.5,
                              axis=0)
        sys.stderr.write("[M::" + __name__ +
                         "] radial mode: average radial distance = " +
                         str(mean_radial) +
                         ", which will be normalize to 1.0\n")

        # examine each particle; restart the counter so the reported
        # progress does not continue from the previous pass
        atom_id = 0
        for g3d_particle in g3d_data.get_g3d_particles():
            atom_id += 1
            if atom_id % disp_num_particles == 0:
                report_progress("radial mode for", atom_id)
            particle_key = (g3d_particle.get_hom_name(),
                            g3d_particle.get_ref_locus())
            if particle_key not in color_data:
                continue
            color_value = color_data[particle_key]
            radial = math.sqrt(
                (g3d_particle.get_x() - ref_pos[0])**2 +
                (g3d_particle.get_y() - ref_pos[1])**2 +
                (g3d_particle.get_z() - ref_pos[2])**2) / mean_radial
            # bins are centered on multiples of the bin size
            radial_bin_id = int(radial / radial_bin_r + 0.5)
            if radial_bin_id >= num_radial_bins:
                continue  # out of bound, skip
            radial_color_sums[radial_bin_id] += color_value
            radial_color_nums[radial_bin_id] += 1

        # output
        sys.stderr.write("[M::" + __name__ + "] writing radial mode output\n")
        for radial_bin_id in range(num_radial_bins):
            if radial_color_nums[radial_bin_id] < radial_min_num_particles:
                output_value = radial_missing_value
            else:
                output_value = (radial_color_sums[radial_bin_id] /
                                radial_color_nums[radial_bin_id])
            sys.stdout.write("\t".join([
                str(radial_bin_id * radial_bin_r),
                str(output_value),
                str(radial_color_nums[radial_bin_id])
            ]) + "\n")

        return 0

    # output
    sys.stderr.write("[M::" + __name__ + "] writing " + str(len(color_data)) +
                     " colors (" + str(percent_of_particles(len(color_data))) +
                     "%)\n")
    for hom_name, ref_locus in sorted(color_data.keys()):
        sys.stdout.write("\t".join(
            [hom_name,
             str(ref_locus),
             str(color_data[(hom_name, ref_locus)])]) + "\n")

    return 0
Ejemplo n.º 4
0
def _bincon_percent(part, total):
    """Return ``part`` as a percentage of ``total``, rounded to 2 decimals.

    Gives 0.0 instead of raising ZeroDivisionError when ``total`` is zero
    (e.g. no contact survives the separation filter).
    """
    if total == 0:
        return 0.0
    return round(100.0 * part / total, 2)


def bincon(argv):
    """Bin contacts (or legs) into a genome-wide matrix and print it.

    Command-line entry point. ``argv[0]`` is skipped; the positional argument
    is a CON file (read through gzip when its name ends in ".gz") or, with
    ``-L``, a LEG file. ``-l`` names the required chromosome length file
    (tab-delimited: chr, len); every chromosome contributes
    ``round(len / bin size) + 1`` bins per haplotype (a single set with
    ``-H``), laid out in file order.

    With ``-i`` only the bin description lines are written. Otherwise the
    matrix is written to stdout with ``np.savetxt``: a one-column count
    vector in LEG mode, or the result of ``con_data_to_matrix`` in CON mode.

    Returns 0 on success and 1 on a usage error.
    """
    # default parameters
    chr_len_file_name = None
    matrix_bin_size = 1000000
    merge_haplotypes = False
    info_mode = False
    leg_mode = False
    min_separation = 0

    # progress display parameters
    display_num_cons = 1e4

    # read arguments
    try:
        opts, args = getopt.getopt(argv[1:], "l:b:HiLs:")
    except getopt.GetoptError:
        sys.stderr.write("[E::" + __name__ + "] unknown command\n")
        return 1
    if not args:
        for usage_line in (
                # the input is parsed as CON (or LEG with "-L"), not 3DG
                "Usage: dip-c bincon [options] -l <chr.len> <in.con>\n",
                "Options:\n",
                "  -l <chr.len>   file containing chromosome lengths (tab-delimited: chr, len)\n",
                "  -L             analyze LEG instead of CON\n",
                "  -b INT         bin size (bp) (bins are centered around multiples of bin size) ["
                + str(matrix_bin_size) + "]\n",
                "  -H             merge the two haplotypes\n",
                "  -s INT         min separation (bp) for intra-chromosomal contacts ["
                + str(min_separation) + "]\n",
                "  -i             output bin info (tab-delimited: homolog or chr if \"-H\", bin center) instead\n",
        ):
            sys.stderr.write(usage_line)
        return 1

    for o, a in opts:
        if o == "-l":
            chr_len_file_name = a
        elif o == "-s":
            min_separation = int(a)
        elif o == "-b":
            matrix_bin_size = int(a)
        elif o == "-H":
            merge_haplotypes = True
        elif o == "-i":
            info_mode = True
        elif o == "-L":
            leg_mode = True
    if chr_len_file_name is None:
        sys.stderr.write("[E::" + __name__ + "] -l is required\n")
        return 1

    # read chromosome lengths; hom_offsets maps each homolog to the index of
    # its first bin, matrix_size accumulates the total number of bins
    if merge_haplotypes:
        haplotypes = [Haplotypes.paternal]
    else:
        haplotypes = [Haplotypes.paternal, Haplotypes.maternal]
    hom_offsets = {}
    matrix_size = 0
    with open(chr_len_file_name, "rb") as chr_len_file:
        for chr_len_file_line in chr_len_file:
            ref_name, ref_len = chr_len_file_line.strip().split("\t")
            hom_bin_len = int(round(float(int(ref_len)) / matrix_bin_size)) + 1
            for haplotype in haplotypes:
                hom_name = ref_name_haplotype_to_hom_name((ref_name,
                                                           haplotype))
                hom_offsets[hom_name] = matrix_size
                matrix_size += hom_bin_len

                if info_mode:
                    for bin_id in range(hom_bin_len):
                        sys.stdout.write("\t".join(
                            [(ref_name if merge_haplotypes else hom_name),
                             str(bin_id * matrix_bin_size)]) + "\n")

    # generate matrix
    if not info_mode:
        if leg_mode:
            matrix_data = np.zeros((matrix_size, 1), dtype=int)
            with open(args[0], "rb") as leg_file:
                for leg_file_line in leg_file:
                    leg = string_to_leg(leg_file_line.strip())
                    matrix_data[leg_to_matrix_index(leg, hom_offsets,
                                                    matrix_bin_size,
                                                    merge_haplotypes)] += 1
        else:
            # NOTE(review): assumes file_to_con_data consumes the file eagerly
            if args[0].endswith(".gz"):
                con_file = gzip.open(args[0], "rb")
            else:
                con_file = open(args[0], "rb")
            try:
                con_data = file_to_con_data(con_file)
            finally:
                con_file.close()
            con_data.clean_separation(min_separation)
            sys.stderr.write(
                "[M::" + __name__ + "] read " + str(con_data.num_cons()) +
                " putative contacts (" + str(
                    _bincon_percent(con_data.num_intra_chr(),
                                    con_data.num_cons())) +
                "% intra-chromosomal, " +
                # every contact has two legs, hence the doubled denominator
                str(
                    _bincon_percent(con_data.num_phased_legs(),
                                    2 * con_data.num_cons())) +
                "% legs phased)\n")
            matrix_data = con_data_to_matrix(con_data, hom_offsets,
                                             matrix_bin_size, matrix_size,
                                             merge_haplotypes,
                                             display_num_cons)
        np.savetxt(sys.stdout, matrix_data, fmt='%i', delimiter='\t')

    return 0
Ejemplo n.º 5
0
def _pd_leg_file_to_positions(g3d_data, leg_file_name):
    """Interpolate every leg of a LEG file into an (n, 3) float array.

    Legs for which ``interpolate_leg`` yields no position become NaN rows.
    Rows are collected in a list and converted once, instead of growing an
    array with a fresh copy per line.
    """
    rows = []
    with open(leg_file_name, "rb") as leg_file:
        for leg_file_line in leg_file:
            is_out, position = g3d_data.interpolate_leg(
                string_to_leg(leg_file_line.strip()))
            if position is None:
                position = [np.nan, np.nan, np.nan]
            rows.append(position)
    # reshape keeps the (0, 3) shape for an empty LEG file
    return np.array(rows, dtype=float).reshape(-1, 3)


def pd(argv):
    """Print the pairwise 3D distance matrix between two sets of legs.

    Command-line entry point. ``argv[0]`` is skipped; the positional argument
    is the 3DG structure file. ``-1`` names the required first LEG file and
    ``-2`` an optional second one (the first is used for both axes when it is
    omitted). Each leg is interpolated to a 3D position and the matrix from
    ``distance.cdist`` (rows: legs of ``-1``, columns: legs of ``-2``) is
    written tab-delimited to stdout; legs without a position give NaN.

    Returns 0 on success and 1 on a usage error.
    """
    # default parameters
    leg_file_1_name = None
    leg_file_2_name = None

    # read arguments
    try:
        opts, args = getopt.getopt(argv[1:], "1:2:")
    except getopt.GetoptError:
        sys.stderr.write("[E::" + __name__ + "] unknown command\n")
        return 1
    if not args:
        for usage_line in (
                "Usage: dip-c pd [options] -1 <in1.leg> [-2 <in2.leg>] <in.3dg>\n",
                "Options:\n",
                "  -1 <in1.leg>    LEG file (required)\n",
                "  -2 <in2.leg>    LEG file [<in1.leg>]\n",
        ):
            sys.stderr.write(usage_line)
        return 1
    for o, a in opts:
        if o == "-1":
            leg_file_1_name = a
        elif o == "-2":
            leg_file_2_name = a
    if leg_file_1_name is None:
        sys.stderr.write("[E::" + __name__ + "] -1 is required\n")
        return 1

    # read 3DG file
    # NOTE(review): assumes file_to_g3d_data consumes the file eagerly
    with open(args[0], "rb") as g3d_file:
        g3d_data = file_to_g3d_data(g3d_file)
    g3d_data.sort_g3d_particles()
    g3d_resolution = g3d_data.resolution()
    sys.stderr.write(
        "[M::" + __name__ + "] read a 3D structure with " +
        str(g3d_data.num_g3d_particles()) + " particles at " +
        ("N.A." if g3d_resolution is None else str(g3d_resolution)) +
        " bp resolution\n")
    g3d_data.prepare_interpolate()

    # convert LEG files to 3D positions
    positions_1 = _pd_leg_file_to_positions(g3d_data, leg_file_1_name)
    if leg_file_2_name is None:
        # same legs on both axes: no need to parse and interpolate twice
        positions_2 = positions_1
    else:
        positions_2 = _pd_leg_file_to_positions(g3d_data, leg_file_2_name)

    # calculate pairwise distances
    distances = distance.cdist(positions_1, positions_2)
    np.savetxt(sys.stdout, distances, delimiter='\t')

    return 0