Ejemplo n.º 1
0
def pos(argv):
    """Interpolate the 3D position of every leg listed in a LEG file.

    Command-line entry point. ``argv[0]`` is skipped; the remaining items
    are parsed with getopt (``-l <in.leg>``, ``-O``) followed by the 3DG
    file name as the first positional argument.

    For each line of the LEG file one line is written to stdout: the
    tab-joined position returned by ``g3d_data.interpolate_leg``, or the
    literal ``None`` when no position is returned or when ``-O`` was given
    and the leg is flagged as out of bounds.

    Returns 0 on success and 1 on a usage or argument error.
    """
    # default parameters
    leg_file_name = None
    in_only = False

    # read arguments
    try:
        opts, args = getopt.getopt(argv[1:], "l:O")
    except getopt.GetoptError:
        sys.stderr.write("[E::" + __name__ + "] unknown command\n")
        return 1
    if not args:
        # NOTE(review): the usage line names the sub-command "leg" although
        # this function is called pos -- confirm which one is intended.
        sys.stderr.write("Usage: dip-c leg [options] -l <in.leg> <in.3dg>\n")
        sys.stderr.write("Options:\n")
        sys.stderr.write(
            "  -l <in.leg>    LEG file to convert to 3D positions (required)\n"
        )
        sys.stderr.write("  -O             exclude out-of-bound legs\n")
        return 1
    for o, a in opts:
        if o == "-l":
            leg_file_name = a
        elif o == "-O":
            in_only = True
    if leg_file_name is None:
        sys.stderr.write("[E::" + __name__ + "] -l is required\n")
        return 1

    # read 3DG file (closed as soon as it has been parsed)
    with open(args[0], "rb") as g3d_file:
        g3d_data = file_to_g3d_data(g3d_file)
    g3d_data.sort_g3d_particles()
    g3d_resolution = g3d_data.resolution()
    sys.stderr.write(
        "[M::" + __name__ + "] read a 3D structure with " +
        str(g3d_data.num_g3d_particles()) + " particles at " +
        ("N.A." if g3d_resolution is None else str(g3d_resolution)) +
        " bp resolution\n")
    g3d_data.prepare_interpolate()

    # convert LEG file to 3D positions, one output line per input line
    with open(leg_file_name, "rb") as leg_file:
        for leg_file_line in leg_file:
            is_out, position = g3d_data.interpolate_leg(
                string_to_leg(leg_file_line.strip()))
            if position is None or (is_out and in_only):
                sys.stdout.write("None\n")
            else:
                sys.stdout.write("\t".join(map(str, position)) + "\n")

    return 0
Ejemplo n.º 2
0
def vis(argv):
    """Convert a 3DG structure to mmCIF text on stdout for visualization.

    Command-line entry point. ``argv[0]`` is skipped; options are
    ``-c <color.txt>`` (tab-delimited homolog, locus, color), ``-m FLOAT``
    (color used for particles absent from the color file) and ``-M``
    (drop such particles instead). The first positional argument is the
    3DG file.

    One ``atom_site`` row is written per kept particle and one
    ``struct_conn`` row per adjacent particle pair reported by
    ``get_adjacent_g3d_particle_tuples``.

    Returns 0 on success and 1 on a usage or argument error.
    """
    # default parameters
    color_file_name = None
    missing_value = -1.0
    discard_missing = False

    # read arguments
    try:
        opts, args = getopt.getopt(argv[1:], "c:m:M")
    except getopt.GetoptError:
        sys.stderr.write("[E::" + __name__ + "] unknown command\n")
        return 1
    if not args:
        sys.stderr.write("Usage: dip-c vis [options] <in.3dg>\n")
        sys.stderr.write("Options:\n")
        sys.stderr.write(
            "  -c <color.txt>    color by a list of locus-color pairs (tab-delimited: homolog, locus, color)\n"
        )
        sys.stderr.write(
            "  -m FLOAT          color for particles that are missing from the color scheme ["
            + str(missing_value) + "]\n")
        sys.stderr.write(
            "  -M                discard particles that are missing from the color scheme\n\n"
        )
        sys.stderr.write("Output mmCIF format:\n")
        sys.stderr.write(
            "  label_asym_id     homolog name (e.g. \"1(mat)\")\n")
        sys.stderr.write(
            "  label_comp_id     locus // 1 Mb, 3 digits with leading zeros\n")
        sys.stderr.write("  label_seq_id      1\n")
        sys.stderr.write(
            "  label_atom_id     locus % 1 Mb // 1 kb, 3 digits with leading zeros\n"
        )
        sys.stderr.write("  B_iso_or_equiv    scalar color\n")
        sys.stderr.write("  covale            backbone bond\n")
        return 1

    for o, a in opts:
        if o == "-m":
            missing_value = float(a)
        elif o == "-c":
            color_file_name = a
        elif o == "-M":
            discard_missing = True

    # read 3DG file (closed as soon as it has been parsed)
    with open(args[0], "rb") as g3d_file:
        g3d_data = file_to_g3d_data(g3d_file)
    g3d_data.sort_g3d_particles()
    g3d_resolution = g3d_data.resolution()
    sys.stderr.write(
        "[M::" + __name__ + "] read a 3D structure with " +
        str(g3d_data.num_g3d_particles()) + " particles at " +
        ("N.A." if g3d_resolution is None else str(g3d_resolution)) +
        " bp resolution\n")

    # read color file: (homolog name, locus) -> scalar color
    color_data = {}
    if color_file_name is not None:
        with open(color_file_name, "rb") as color_file:
            for color_file_line in color_file:
                hom_name, ref_locus, color = color_file_line.strip().split(
                    "\t")
                color_data[(hom_name, int(ref_locus))] = float(color)

    # set up the mmCIF categories
    myDataList = []
    curContainer = DataContainer("myblock")
    aCat = DataCategory("atom_site")
    for attribute in ("group_PDB", "type_symbol", "id", "label_asym_id",
                      "label_comp_id", "label_seq_id", "label_atom_id",
                      "Cartn_x", "Cartn_y", "Cartn_z", "B_iso_or_equiv"):
        aCat.appendAttribute(attribute)

    sCat = DataCategory("struct_conn")
    for attribute in ("id", "conn_type_id", "ptnr1_label_asym_id",
                      "ptnr1_label_comp_id", "ptnr1_label_seq_id",
                      "ptnr1_label_atom_id", "ptnr2_label_asym_id",
                      "ptnr2_label_comp_id", "ptnr2_label_seq_id",
                      "ptnr2_label_atom_id"):
        sCat.appendAttribute(attribute)

    # write atoms; atom_id counts every particle, including discarded
    # ones, so the ids of the kept atoms may have gaps
    for atom_id, g3d_particle in enumerate(g3d_data.get_g3d_particles(), 1):
        try:
            color = color_data[(g3d_particle.get_hom_name(),
                                g3d_particle.get_ref_locus())]
        except KeyError:
            if discard_missing:
                continue
            color = missing_value
        aCat.append(g3d_particle_to_atom_data(g3d_particle, atom_id, color))

    # write backbone bonds
    adjacent_tuples = g3d_data.get_adjacent_g3d_particle_tuples(
        g3d_resolution)
    for conn_id, g3d_particle_tuple in enumerate(adjacent_tuples, 1):
        sCat.append(
            g3d_particle_tuple_to_conn_data(g3d_particle_tuple, conn_id))

    # write output
    curContainer.append(sCat)
    curContainer.append(aCat)
    myDataList.append(curContainer)
    pdbxW = PdbxWriter(sys.stdout)
    pdbxW.write(myDataList)

    return 0
Ejemplo n.º 3
0
def exp(argv):
    """Translate each homolog away from the nuclear center ("exploded view").

    Command-line entry point. ``argv[0]`` is skipped; options are
    ``-f FLOAT`` (expansion factor) and ``-c`` (output one particle per
    homolog at its expanded center instead of all particles). The first
    positional argument is the 3DG file.

    Every homolog is shifted by ``(homolog center - nuclear center) *
    expansion_factor``. Without ``-c`` the function also writes
    ``extract`` / ``translate`` / ``mview store`` command lines to stderr
    (presumably for replaying the same move in a molecular viewer --
    confirm). The resulting structure is written to stdout.

    Returns 0 on success and 1 on a usage or argument error.
    """
    # default parameters
    expansion_factor = 3.0
    centers_only = False

    # read arguments
    try:
        opts, args = getopt.getopt(argv[1:], "f:c")
    except getopt.GetoptError:
        sys.stderr.write("[E::" + __name__ + "] unknown command\n")
        return 1
    if not args:
        sys.stderr.write("Usage: dip-c exp [options] <in.3dg>\n")
        sys.stderr.write("Options:\n")
        sys.stderr.write(
            "  -f FLOAT     expansion factor for translating away from nuclear center ["
            + str(expansion_factor) + "]\n")
        sys.stderr.write("  -c           output centers of mass\n")
        return 1
    for o, a in opts:
        if o == "-f":
            expansion_factor = float(a)
        elif o == "-c":
            centers_only = True

    # read 3DG file (closed as soon as it has been parsed)
    with open(args[0], "rb") as g3d_file:
        g3d_data = file_to_g3d_data(g3d_file)
    g3d_data.sort_g3d_particles()
    g3d_resolution = g3d_data.resolution()
    sys.stderr.write(
        "[M::" + __name__ + "] read a 3D structure with " +
        str(g3d_data.num_g3d_particles()) + " particles at " +
        ("N.A." if g3d_resolution is None else str(g3d_resolution)) +
        " bp resolution\n")

    # center of nucleus
    nuc_center = center_g3d_particles(g3d_data.get_g3d_particles())

    # process data
    if centers_only:
        # replace the structure by one particle per homolog, placed at the
        # homolog's center after expansion
        center_g3d_data = G3dData()
        for hom_name in g3d_data.get_hom_names():
            center_position = center_g3d_particles(
                g3d_data.get_g3d_particles_from_hom_name(hom_name))
            center_position += (center_position -
                                nuc_center) * expansion_factor
            center_g3d_data.add_g3d_particle(
                G3dParticle(hom_name, 0, center_position.tolist()))
        g3d_data = center_g3d_data
    else:
        # center of each homolog, computed before any particle is moved
        hom_centers = {}
        for hom_name in g3d_data.get_hom_names():
            hom_centers[hom_name] = center_g3d_particles(
                g3d_data.get_g3d_particles_from_hom_name(hom_name))
            sys.stderr.write("extract " + hom_name_to_object_name(hom_name) +
                             ", chain \"" + hom_name + "\"\n")
        # translate
        for hom_name in g3d_data.get_hom_names():
            translation_vector = (hom_centers[hom_name] -
                                  nuc_center) * expansion_factor
            for g3d_particle in g3d_data.get_g3d_particles_from_hom_name(
                    hom_name):
                g3d_particle.set_position(
                    (np.array(g3d_particle.get_position()) +
                     translation_vector).tolist())
            sys.stderr.write("translate [" +
                             ",".join(map(str, translation_vector)) +
                             "], object=" + hom_name_to_object_name(hom_name) +
                             ", camera=0\n")
        for hom_name in g3d_data.get_hom_names():
            sys.stderr.write("mview store, object=" +
                             hom_name_to_object_name(hom_name) + "\n")

    # output
    sys.stderr.write("[M::" + __name__ + "] writing output for " +
                     str(g3d_data.num_g3d_particles()) + " particles\n")
    sys.stdout.write(g3d_data.to_string() + "\n")

    return 0
Ejemplo n.º 4
0
def impute3(argv):
    """Impute contact haplotypes from a 3D structure, then clean the result.

    Command-line entry point. ``argv[0]`` is skipped; options are parsed
    with getopt (see the usage text below) and the first positional
    argument is the CON file, read through gzip when its name ends in
    ``.gz``. ``-3 <in.3dg>`` is required.

    Steps: load the 3DG structure, load the contacts, call
    ``con_data.impute_from_g3d_data``, drop unphased contacts, drop
    isolated phased contacts, and write the remaining contacts to stdout.
    Progress messages go to stderr.

    Returns 0 on success and 1 on a usage or argument error.
    """
    # default parameters
    g3d_file_name = None
    vio_file_name = None
    max_impute3_distance = 20
    max_impute3_ratio = 0.5
    min_impute3_separation_factor = 1.0
    max_clean_distance = 10000000
    min_clean_count = 2
    is_male = False
    par_data = None

    preset_descriptions = {
        "f": "female",
        "hf": "human female (same as f)",
        "mf": "mouse female (same as f)",
        "hm": "human male (hg19, no \"chr\" prefix)",
        "mm": "mouse male (mm10)"
    }

    def percent(part, whole):
        # Progress messages must not abort the run with ZeroDivisionError
        # when there are no contacts at all.
        return round(100.0 * part / whole, 2) if whole else 0.0

    # read arguments
    try:
        opts, args = getopt.getopt(argv[1:], "3:v:d:r:s:D:C:p:")
    except getopt.GetoptError:
        sys.stderr.write("[E::" + __name__ + "] unknown command\n")
        return 1
    if not args:
        sys.stderr.write(
            "Usage: dip-c impute3 [options] -3 <in.3dg> [-v <out.vio>] <in.con>\n"
        )
        sys.stderr.write("Options:\n")
        sys.stderr.write(
            "  -3 <in.3dg>    3D genome file for imputing haplotypes (required)\n"
        )
        sys.stderr.write(
            "  -v <out.vio>   output statistics to a contact violation file:\n"
        )
        sys.stderr.write(
            "                   tab-delimited: leg 1, leg 2, num of compatible haplotypes,\n"
        )
        sys.stderr.write(
            "                   shortest 3D distance, ratio between the shortest and the 2nd shortest distance\n\n"
        )
        sys.stderr.write(
            "  -d FLOAT       max 3D distance for imputing haplotypes [" +
            str(max_impute3_distance) + "]\n")
        sys.stderr.write(
            "  -r FLOAT       max ratio between 3D distances for the best and 2nd best haplotypes ["
            + str(max_impute3_ratio) + "]\n")
        sys.stderr.write(
            "  -s FLOAT       min separation (unit: 3D genome resolution) for imputing\n"
        )
        sys.stderr.write(
            "                   completely unphased, intra-chromosomal contacts ["
            + str(min_impute3_separation_factor) + "]\n\n")
        sys.stderr.write(
            "  -D INT         max distance (bp, L-1/2 norm) for removing isolated contacts ["
            + str(max_clean_distance) + "]\n")
        sys.stderr.write(
            "  -C INT         min neighbor count for an unisolated contact [" +
            str(min_clean_count) + "]\n\n")
        sys.stderr.write("  -p STR         presets for PARs and sex: [f]\n")
        for preset in sorted(preset_descriptions):
            sys.stderr.write("                   " + preset + " = " +
                             preset_descriptions[preset] + "\n")
        return 1

    # presets: name -> [is_male, PAR data]; keys match preset_descriptions
    h_par = ParData("X", "Y")
    h_par.add_par(Par("X", 60000, 2699520, "Y", 10000))
    h_par.add_par(Par("X", 154931043, 155260560, "Y", 59034049))

    m_par = ParData("chrX", "chrY")
    m_par.add_par(Par("chrX", 169969758, 170931299, "chrY", 90745844))

    presets = {
        "f": [False, None],
        "hf": [False, None],
        "mf": [False, None],
        "hm": [True, h_par],
        "mm": [True, m_par]
    }

    for o, a in opts:
        if o == "-3":
            g3d_file_name = a
        elif o == "-v":
            vio_file_name = a
        elif o == "-d":
            max_impute3_distance = float(a)
        elif o == "-r":
            max_impute3_ratio = float(a)
        elif o == "-s":
            min_impute3_separation_factor = float(a)
        elif o == "-D":
            max_clean_distance = int(a)
        elif o == "-C":
            min_clean_count = int(a)
        elif o == "-p":
            if a not in presets:
                sys.stderr.write("[E::" + __name__ + "] unknown preset\n")
                return 1
            is_male, par_data = presets[a]
            sys.stderr.write("[M::" + __name__ + "] use preset " + a +
                             " = " + preset_descriptions[a] + "\n")
    if g3d_file_name is None:
        sys.stderr.write("[E::" + __name__ + "] -3 is required\n")
        return 1

    # read 3DG file (closed as soon as it has been parsed)
    with open(g3d_file_name, "rb") as g3d_file:
        g3d_data = file_to_g3d_data(g3d_file)
    g3d_data.sort_g3d_particles()
    g3d_resolution = g3d_data.resolution()
    sys.stderr.write(
        "[M::" + __name__ + "] read a 3D structure with " +
        str(g3d_data.num_g3d_particles()) + " particles at " +
        ("N.A." if g3d_resolution is None else str(g3d_resolution)) +
        " bp resolution\n")
    if g3d_resolution is None:
        # the minimum separation below is expressed in units of the
        # resolution, so it cannot be computed without one
        sys.stderr.write("[E::" + __name__ +
                         "] cannot determine 3D genome resolution\n")
        return 1
    g3d_data.prepare_interpolate()

    # read CON file
    con_opener = gzip.open if args[0].endswith(".gz") else open
    with con_opener(args[0], "rb") as con_file:
        con_data = file_to_con_data(con_file)
    sys.stderr.write(
        "[M::" + __name__ + "] read " + str(con_data.num_cons()) +
        " contacts (" +
        str(percent(con_data.num_intra_chr(), con_data.num_cons())) +
        "% intra-chromosomal, " +
        str(percent(con_data.num_phased_legs(), 2 * con_data.num_cons())) +
        "% legs phased)\n")

    # impute3
    # The minimum separation is the "-s" factor times the resolution, as
    # described in the usage text. The previous code passed
    # max_impute3_ratio here, which silently ignored "-s".
    # NOTE(review): assumes the 4th argument of impute_from_g3d_data is the
    # minimum separation in bp -- confirm against its definition.
    vio_file = None
    if vio_file_name is not None:
        vio_file = open(vio_file_name, "wb")
    try:
        con_data.impute_from_g3d_data(
            g3d_data, max_impute3_distance, max_impute3_ratio,
            min_impute3_separation_factor * g3d_resolution, is_male,
            par_data, vio_file)
    finally:
        if vio_file is not None:
            vio_file.close()
    sys.stderr.write(
        "[M::" + __name__ + "] imputed " + str(con_data.num_phased_cons()) +
        " contacts (" +
        str(percent(con_data.num_phased_cons(), con_data.num_cons())) +
        "%)\n")

    # clean imputed
    con_data.sort_cons()
    con_data.clean_unphased()
    before_clean_num_cons = con_data.num_cons()
    # a deep copy is passed so that the reference set is not the object
    # being modified
    con_data.clean_isolated_phased(copy.deepcopy(con_data), max_clean_distance,
                                   min_clean_count)
    after_clean_num_cons = con_data.num_cons()
    num_removed = before_clean_num_cons - after_clean_num_cons
    sys.stderr.write("[M::" + __name__ + "] removed " + str(num_removed) +
                     " isolated contacts (" +
                     str(percent(num_removed, before_clean_num_cons)) +
                     "%)\n")

    # write output
    sys.stderr.write(
        "[M::" + __name__ + "] writing output for " +
        str(con_data.num_cons()) + " contacts (" +
        str(percent(con_data.num_intra_chr(), con_data.num_cons())) +
        "% intra-chromosomal, " +
        str(percent(con_data.num_phased_legs(), 2 * con_data.num_cons())) +
        "% legs phased)\n")
    sys.stdout.write(con_data.to_string() + "\n")

    return 0
Ejemplo n.º 5
0
def color(argv):
    """Compute one scalar color per particle of a 3DG structure.

    Command-line entry point. ``argv[0]`` is skipped; exactly one color
    scheme option must be given (see the usage text below), optionally
    combined with ``-s`` (smoothing) and, for ``-i`` / ``-I``, with ``-S``
    (max separation). The first positional argument is the 3DG file.

    Output on stdout is tab-delimited: homolog, locus, color, sorted by
    (homolog, locus). Particles for which the chosen scheme yields no
    color are omitted.

    Returns 0 on success and 1 on a usage or argument error.
    """
    # default parameters
    color_file_name = None
    color_mode = None
    max_distance = None
    smooth_distance = None
    max_separation = None

    # display parameters
    disp_num_particles = 1000

    # schemes that read a color file / that need a neighbor search
    file_modes = ("c", "n", "l", "L")
    nearby_modes = ("i", "I", "d", "r")
    supported_modes = file_modes + nearby_modes + ("h", "C")

    # read arguments
    try:
        opts, args = getopt.getopt(argv[1:], "c:n:l:m:L:i:s:S:hd:r:I:C")
    except getopt.GetoptError:
        sys.stderr.write("[E::" + __name__ + "] unknown command\n")
        return 1
    if not args:
        sys.stderr.write("Usage: dip-c color [options] <in.3dg>\n")
        sys.stderr.write("Options:\n")
        sys.stderr.write("  -c <color.txt>    color by a list of locus-color pairs (tab-delimited: chr, locus, color)\n")
        sys.stderr.write("  -n <chr.txt>      color by chromosome name (one chromosome per line)\n")
        sys.stderr.write("  -l <chr.len>      color by locus divided by chromosome length (tab-delimited: chr, len)\n")
        sys.stderr.write("  -L <chr.cen>      color by arm locus divided by arm length (tab-delimited: chr, len, center of centromere)\n")
        sys.stderr.write("  -h                color by distance to homologous locus\n\n")
        sys.stderr.write("  -i FLOAT          color by percentage of intra-homologous neighbors within a given distance\n")
        sys.stderr.write("  -I FLOAT          color by number of intra-homologous neighbors within a given distance\n")
        sys.stderr.write("  -S INT            (with \"-i\" or \"-I\") max separation (bp) for intra-homologous neighbors\n\n")
        sys.stderr.write("  -d FLOAT          color by homolog diversity within a given distance\n")
        sys.stderr.write("  -r FLOAT          color by homolog richness within a given distance\n\n")
        sys.stderr.write("  -C                color by distance to the nuclear center of mass\n")
        sys.stderr.write("  -s FLOAT          smooth color by averaging over a ball\n")
        sys.stderr.write("Output:\n")
        sys.stderr.write("  tab-delimited: homolog, locus, color\n")
        return 1

    num_color_schemes = 0
    for o, a in opts:
        if o in ("-i", "-I", "-d", "-r"):
            num_color_schemes += 1
            color_mode = o[1:]
            max_distance = float(a)
        elif o == "-s":
            smooth_distance = float(a)
        elif o == "-S":
            max_separation = int(a)
        else:
            # every remaining option selects a scheme; those that take an
            # argument name the color file
            num_color_schemes += 1
            color_mode = o[1:]
            if a != "":
                color_file_name = a
    # "-S" applies to both "-i" and "-I": both schemes pass max_separation
    # to their helper, and the usage text documents it for both.
    if max_separation is not None and color_mode not in ("i", "I"):
        sys.stderr.write("[E::" + __name__ + "] \"-S\" must be used with \"-i\" or \"-I\"\n")
        return 1
    if num_color_schemes != 1:
        sys.stderr.write("[E::" + __name__ + "] exactly one color scheme is needed\n")
        return 1
    if color_mode not in supported_modes:
        # "-m" is accepted by the option string but has no implementation
        # here; reject it instead of failing later on an undefined color.
        sys.stderr.write("[E::" + __name__ + "] unsupported color scheme \"-" + color_mode + "\"\n")
        return 1

    # read 3DG file (closed as soon as it has been parsed)
    with open(args[0], "rb") as g3d_file:
        g3d_data = file_to_g3d_data(g3d_file)
    g3d_data.sort_g3d_particles()
    g3d_resolution = g3d_data.resolution()
    sys.stderr.write("[M::" + __name__ + "] read a 3D structure with " + str(g3d_data.num_g3d_particles()) + " particles at " + ("N.A." if g3d_resolution is None else str(g3d_resolution)) + " bp resolution\n")

    # prepare
    if color_mode in file_modes:
        with open(color_file_name, "rb") as color_file:
            if color_mode == "c":
                ref_name_ref_locus_colors = {}
                for color_file_line in color_file:
                    ref_name, ref_locus, value = color_file_line.strip().split("\t")
                    ref_name_ref_locus_colors[(ref_name, int(ref_locus))] = float(value)
            elif color_mode == "n":
                # color = 1-based line number of the chromosome name
                ref_name_colors = {}
                for color_counter, color_file_line in enumerate(color_file, 1):
                    ref_name_colors[color_file_line.strip()] = color_counter
            elif color_mode == "l":
                ref_lens = {}
                for color_file_line in color_file:
                    ref_name, ref_len = color_file_line.strip().split("\t")
                    ref_lens[ref_name] = int(ref_len)
            else:  # "L"
                ref_lens = {}
                ref_cens = {}
                for color_file_line in color_file:
                    ref_name, ref_len, ref_cen = color_file_line.strip().split("\t")
                    ref_lens[ref_name] = int(ref_len)
                    ref_cens[ref_name] = int(ref_cen)
    elif color_mode in nearby_modes:
        g3d_data.prepare_nearby()
    elif color_mode == "C":
        hom_names, loci_np_array, position_np_array = g3d_data.to_np_arrays()
        center_mass = np.mean(position_np_array, axis=0)
        sys.stderr.write("[M::" + __name__ + "] center of mass is at (" + ", ".join(map(str, center_mass)) + ")\n")

    # calculate colors for each particle; "continue" skips particles for
    # which the scheme has no value
    color_data = {}
    for atom_id, g3d_particle in enumerate(g3d_data.get_g3d_particles(), 1):
        if atom_id % disp_num_particles == 0:
            sys.stderr.write("[M::" + __name__ + "] analyzed " + str(atom_id) + " particles (" + str(round(100.0 * atom_id / g3d_data.num_g3d_particles(), 2)) + "%)\n")

        if color_mode == "c":
            try:
                value = ref_name_ref_locus_colors[(g3d_particle.get_ref_name(), g3d_particle.get_ref_locus())]
            except KeyError:
                continue
        elif color_mode == "n":
            try:
                value = ref_name_colors[g3d_particle.get_ref_name()]
            except KeyError:
                continue
        elif color_mode == "l":
            try:
                value = float(g3d_particle.get_ref_locus()) / ref_lens[g3d_particle.get_ref_name()]
            except KeyError:
                continue
        elif color_mode == "L":
            try:
                ref_name = g3d_particle.get_ref_name()
                arm_locus = g3d_particle.get_ref_locus() - ref_cens[ref_name]
                # loci past the centromere center are measured against the
                # remaining length, the others against the centromere
                # position itself
                if arm_locus > 0:
                    arm_len = ref_lens[ref_name] - ref_cens[ref_name]
                else:
                    arm_len = ref_cens[ref_name]
                value = float(abs(arm_locus)) / arm_len
            except KeyError:
                continue
        elif color_mode == "i":
            value = intra_hom_fraction(g3d_particle, g3d_data.get_g3d_particles_near(g3d_particle.get_position(), max_distance), max_separation)
            if value is None:
                continue
        elif color_mode == "I":
            value = intra_hom_count(g3d_particle, g3d_data.get_g3d_particles_near(g3d_particle.get_position(), max_distance), max_separation)
        elif color_mode == "h":
            homologous_g3d_particle = g3d_data.get_g3d_particle_from_hom_name_ref_locus(homologous_hom_name(g3d_particle.get_hom_name()), g3d_particle.get_ref_locus())
            if homologous_g3d_particle is None:
                continue
            value = math.sqrt((g3d_particle.get_x() - homologous_g3d_particle.get_x()) ** 2 + (g3d_particle.get_y() - homologous_g3d_particle.get_y()) ** 2 + (g3d_particle.get_z() - homologous_g3d_particle.get_z()) ** 2)
        elif color_mode == "d":
            value = hom_diversity(g3d_data.get_g3d_particles_near(g3d_particle.get_position(), max_distance))
        elif color_mode == "r":
            value = hom_richness(g3d_data.get_g3d_particles_near(g3d_particle.get_position(), max_distance))
        else:  # "C"
            value = math.sqrt((g3d_particle.get_x() - center_mass[0]) ** 2 + (g3d_particle.get_y() - center_mass[1]) ** 2 + (g3d_particle.get_z() - center_mass[2]) ** 2)
        color_data[g3d_particle.get_hom_name(), g3d_particle.get_ref_locus()] = value

    # smoothing
    if smooth_distance is not None:
        g3d_data.prepare_nearby()
        smooth_color_data = {}
        for atom_id, g3d_particle in enumerate(g3d_data.get_g3d_particles(), 1):
            if atom_id % disp_num_particles == 0:
                sys.stderr.write("[M::" + __name__ + "] smoothed " + str(atom_id) + " particles (" + str(round(100.0 * atom_id / g3d_data.num_g3d_particles(), 2)) + "%)\n")
            value = smooth_color(g3d_particle, g3d_data.get_g3d_particles_near(g3d_particle.get_position(), smooth_distance), color_data)
            if value is not None:
                smooth_color_data[g3d_particle.get_hom_name(), g3d_particle.get_ref_locus()] = value
        color_data = smooth_color_data

    # output
    sys.stderr.write("[M::" + __name__ + "] writing " + str(len(color_data)) + " colors (" + str(round(100.0 * len(color_data) / g3d_data.num_g3d_particles(), 2)) + "%)\n")
    for hom_name, ref_locus in sorted(color_data.keys()):
        sys.stdout.write("\t".join([hom_name, str(ref_locus), str(color_data[(hom_name, ref_locus)])]) + "\n")

    return 0
Ejemplo n.º 6
0
Archivo: rg.py Proyecto: zhuakexi/dip-c
def rg(argv):
    """Write a radius-of-gyration (or distance) matrix for each region.

    Command-line entry point. ``argv[0]`` is skipped; options are
    ``-o STR`` (output prefix, default ``<in.3dg>.``), ``-r <in.reg>``
    (restrict to the regions in this file, read through gzip when the name
    ends in ``.gz``) and ``-d`` (pairwise distances instead of radii of
    gyration). The first positional argument is the 3DG file.

    Without ``-r`` one region is created per homolog. For every region two
    files are written: ``<prefix><region_name>.loc`` with the loci and
    ``<prefix><region_name>.rg`` with the matrix.

    Returns 0 on success and 1 on a usage or argument error.
    """
    # default parameters
    output_prefix = None
    reg_file_name = None
    reg_list = []
    distance_mode = False

    # display parameters
    disp_num_particles = 100

    # read arguments
    try:
        opts, args = getopt.getopt(argv[1:], "o:r:d")
    except getopt.GetoptError:
        sys.stderr.write("[E::" + __name__ + "] unknown command\n")
        return 1
    if not args:
        sys.stderr.write("Usage: dip-c rg [options] <in.3dg>\n")
        sys.stderr.write("Options:\n")
        sys.stderr.write("  -o STR        output prefix [<in.3dg>.]\n")
        sys.stderr.write("  -r <in.reg>   only analyze certain regions\n")
        sys.stderr.write("                  (will output two regions if haplotype is \".\")\n")
        sys.stderr.write("  -d            output pairwise distances instead\n")
        sys.stderr.write("Output:\n")
        sys.stderr.write("  <prefix><region_name>.rg    an N x N matrix of radii of gyration\n")
        sys.stderr.write("  <prefix><region_name>.loc   a list of N chromosomal loci\n")
        return 1
    for o, a in opts:
        if o == "-o":
            output_prefix = a
        elif o == "-r":
            reg_file_name = a
        elif o == "-d":
            distance_mode = True

    if output_prefix is None:
        output_prefix = args[0] + "."

    # read 3DG file (closed as soon as it has been parsed)
    with open(args[0], "rb") as g3d_file:
        g3d_data = file_to_g3d_data(g3d_file)
    g3d_data.sort_g3d_particles()
    g3d_resolution = g3d_data.resolution()
    sys.stderr.write("[M::" + __name__ + "] read a 3D structure with " + str(g3d_data.num_g3d_particles()) + " particles at " + str(g3d_resolution) + " bp resolution\n")

    # prepare regions to analyze
    if reg_file_name is None:
        # analyze all homologs
        for hom_name in g3d_data.get_hom_names():
            ref_name, haplotype = hom_name_to_ref_name_haplotype(hom_name)
            reg = Reg(ref_name)
            reg.add_haplotype(haplotype)
            reg_list.append(reg)
    else:
        reg_opener = gzip.open if reg_file_name.endswith(".gz") else open
        with reg_opener(reg_file_name, "rb") as reg_file:
            reg_list.extend(file_to_reg_list(reg_file))
        reg_list = list(get_phased_regs(reg_list))

    sys.stderr.write("[M::" + __name__ + "] will analyze the following regions:\n")
    sys.stderr.write("name\tchr\thap\tstart\tend\n")
    for reg in reg_list:
        sys.stderr.write(reg.to_name_string() + "\t" + reg.to_string() + "\n")

    # calculate Rg matrix for each region
    for reg in reg_list:
        reg_name = reg.to_name_string()
        g3d_list = G3dList()
        for g3d_particle in g3d_data.get_g3d_particles_in_reg(reg):
            g3d_list.add_g3d_particle(g3d_particle)
        g3d_list.sort_g3d_particles()
        sys.stderr.write("[M::" + __name__ + "] processing region " + reg_name + ", with " + str(g3d_list.num_g3d_particles()) + " particles\n")

        loci_np_array, position_np_array = g3d_list.to_np_arrays()

        # write loci file
        loci_file_name = output_prefix + reg_name + ".loc"
        np.savetxt(loci_file_name, loci_np_array, fmt='%i', delimiter='\t')

        # calculate the matrix: pairwise distances with "-d", otherwise
        # radii of gyration
        rg_file_name = output_prefix + reg_name + ".rg"
        if distance_mode:
            output_matrix = squareform(pdist(position_np_array))
        else:
            output_matrix = position_np_array_to_rg_np_array(position_np_array, disp_num_particles)
        np.savetxt(rg_file_name, output_matrix, delimiter='\t')

    return 0
Ejemplo n.º 7
0
def dist(argv):
    """Tabulate 3D distance statistics as a function of genomic separation.

    Command-line entry point. ``argv[0]`` is skipped; the first positional
    argument is the 3DG file. For every homolog, all particle pairs are
    grouped by the pairwise distance between their loci, and one line per
    distinct separation is written to stdout, tab-delimited: homolog,
    separation, number of pairs, mean 3D distance, r.m.s. 3D distance.

    NOTE(review): ``-d`` ("diploid mode") is accepted by the option parser
    and described in the usage text, but nothing in this function acts on
    it -- confirm whether the implementation is missing.

    Returns 0 on success and 1 on a usage or argument error.
    """
    # read arguments
    try:
        opts, args = getopt.getopt(argv[1:], "d")
    except getopt.GetoptError:
        sys.stderr.write("[E::" + __name__ + "] unknown command\n")
        return 1
    if not args:
        sys.stderr.write("Usage: dip-c dist [options] <in.3dg>\n")
        sys.stderr.write("Options:\n")
        sys.stderr.write("  -d            diploid mode\n")
        sys.stderr.write("Output:\n")
        sys.stderr.write(
            "  tab-delimited: homolog (chr if \"-d\"), separation (in bp), #pairs, mean distance, r.m.s. distance\n"
        )

        return 1

    # read 3DG file (closed as soon as it has been parsed)
    with open(args[0], "rb") as g3d_file:
        g3d_data = file_to_g3d_data(g3d_file)
    g3d_data.sort_g3d_particles()
    g3d_resolution = g3d_data.resolution()
    sys.stderr.write("[M::" + __name__ + "] read a 3D structure with " +
                     str(g3d_data.num_g3d_particles()) + " particles at " +
                     str(g3d_resolution) + " bp resolution\n")

    # analyze each homolog
    for hom_name in g3d_data.get_hom_names():
        sys.stderr.write("analyzing " + hom_name + "\n")
        loci_np_array, position_np_array = g3d_data.get_g3d_list_from_hom_name(
            hom_name).to_np_arrays()
        # pdist returns both arrays in the same pair order, so element i of
        # each describes the same pair of particles
        sep_np_array = pdist(loci_np_array)
        dist_np_array = pdist(position_np_array)
        uniq_seps, uniq_indices = np.unique(sep_np_array, return_inverse=True)
        num_seps = len(uniq_seps)

        # per-separation pair count, sum of distances and sum of squared
        # distances, accumulated in one pass each
        nums_pairs = np.bincount(uniq_indices, minlength=num_seps)
        sums = np.bincount(
            uniq_indices, weights=dist_np_array, minlength=num_seps)
        sums_sq = np.bincount(
            uniq_indices, weights=dist_np_array**2, minlength=num_seps)

        # print
        for i in range(num_seps):
            sys.stdout.write("\t".join([
                hom_name,
                str(int(uniq_seps[i])),
                str(int(nums_pairs[i])),
                str(sums[i] / nums_pairs[i]),
                str(math.sqrt(sums_sq[i] / nums_pairs[i]))
            ]) + "\n")

    return 0
Ejemplo n.º 8
0
def _color_percent(part, total):
    """Return ``part / total`` as a percentage rounded to two decimals.

    Returns 0.0 when ``total`` is zero so that log messages never raise
    ZeroDivisionError on an empty structure.
    """
    if total == 0:
        return 0.0
    return round(100.0 * part / total, 2)


def _color_report_progress(verb, num_done, num_total):
    """Write one progress line ("<verb> N particles (P%)") to stderr."""
    sys.stderr.write("[M::" + __name__ + "] " + verb + " " + str(num_done) +
                     " particles (" +
                     str(_color_percent(num_done, num_total)) + "%)\n")


def _color_distance_to_point(g3d_particle, point):
    """Return the Euclidean distance from a particle to an (x, y, z) point."""
    return math.sqrt((g3d_particle.get_x() - point[0])**2 +
                     (g3d_particle.get_y() - point[1])**2 +
                     (g3d_particle.get_z() - point[2])**2)


def color(argv):
    """Entry point of the ``color`` sub-command.

    Assigns one numeric "color" to each particle of a 3DG structure according
    to exactly one color scheme chosen on the command line, optionally
    smooths the colors over a ball (``-s``), and writes either one
    ``homolog, locus, color`` line per colored particle or, with ``-R``, the
    average color per normalized radial-distance bin to stdout.

    Args:
        argv: argument list; ``argv[0]`` is skipped and ``argv[1:]`` is parsed
            with getopt. The first positional argument is the 3DG file.

    Returns:
        0 on success; 1 when the usage text is printed or the arguments are
        invalid (unknown option, no or several color schemes, misplaced
        ``-S``, or a ``-D`` reference locus that cannot be interpolated).
    """
    # default parameters
    color_file_name = None
    color_mode = None
    max_distance = None
    smooth_distance = None
    max_separation = None
    radial_mode = False
    radial_min_num_particles = 10
    radial_missing_value = -1.0
    radial_max_r = 3.0
    radial_bin_r = 0.05

    # display parameters
    disp_num_particles = 1000

    # read arguments
    try:
        opts, args = getopt.getopt(
            argv[1:], "c:n:l:m:L:i:s:S:hd:r:I:CD:R",
            ["min-num=", "missing=", "max-r=", "bin-size="])
    except getopt.GetoptError:
        sys.stderr.write("[E::" + __name__ + "] unknown command\n")
        return 1
    if len(args) == 0:
        sys.stderr.write("Usage: dip-c color [options] <in.3dg>\n")
        sys.stderr.write("Options:\n")
        sys.stderr.write(
            "  -c <color.txt>    color by a list of locus-color pairs (tab-delimited: chr, locus, color)\n"
        )
        sys.stderr.write(
            "  -n <chr.txt>      color by chromosome name (one chromosome per line)\n"
        )
        sys.stderr.write(
            "  -l <chr.len>      color by locus divided by chromosome length (tab-delimited: chr, len)\n"
        )
        sys.stderr.write(
            "  -L <chr.cen>      color by arm locus divided by arm length (tab-delimited: chr, len, center of centromere)\n"
        )
        sys.stderr.write(
            "  -h                color by distance to homologous locus\n\n")
        sys.stderr.write(
            "  -i FLOAT          color by percentage of intra-homologous neighbors within a given distance\n"
        )
        sys.stderr.write(
            "  -I FLOAT          color by number of intra-homologous neighbors within a given distance\n"
        )
        sys.stderr.write(
            "  -S INT            (with \"-i\" or \"-I\") max separation (bp) for intra-homologous neighbors\n\n"
        )
        sys.stderr.write(
            "  -d FLOAT          color by homolog diversity within a given distance\n"
        )
        sys.stderr.write(
            "  -r FLOAT          color by homolog richness within a given distance\n\n"
        )
        sys.stderr.write(
            "  -C                color by distance to the nuclear center of mass\n"
        )
        sys.stderr.write(
            "  -D <in.leg>       color by distance to a given locus (only the first line of the LEG file will be used)\n\n"
        )
        sys.stderr.write(
            "  -s FLOAT          smooth color by averaging over a ball\n\n")
        sys.stderr.write(
            "  -R                special: output average color for different radial distances (normalized to 1.0)\n"
        )
        sys.stderr.write(
            "  --min-num=INT     (with \"-R\") min number of particles per bin ["
            + str(radial_min_num_particles) + "]\n")
        sys.stderr.write(
            "  --missing=FLOAT   (with \"-R\") output value when \"--min-num\" is not met ["
            + str(radial_missing_value) + "]\n")
        sys.stderr.write(
            "  --max-r=FLOAT     (with \"-R\") max radial distance [" +
            str(radial_max_r) + "]\n")
        sys.stderr.write(
            "  --bin-size=FLOAT  (with \"-R\") bin size of radial distances ["
            + str(radial_bin_r) + "]\n\n")
        sys.stderr.write("Output:\n")
        sys.stderr.write("  tab-delimited: homolog, locus, color\n")
        sys.stderr.write(
            "  (with \"-R\") tab-delimited: radial distance, average color, #particles\n"
        )
        return 1

    # options that select a color scheme, grouped by the kind of argument
    distance_schemes = ("-i", "-I", "-d", "-r")  # take a FLOAT distance
    file_schemes = ("-c", "-n", "-l", "-L", "-D")  # take an input file
    flag_schemes = ("-h", "-C")  # take no argument

    num_color_schemes = 0
    for o, a in opts:
        if o in distance_schemes:
            num_color_schemes += 1
            color_mode = o[1:]
            max_distance = float(a)
        elif o in file_schemes:
            num_color_schemes += 1
            color_mode = o[1:]
            color_file_name = a
        elif o in flag_schemes:
            num_color_schemes += 1
            color_mode = o[1:]
        elif o == "-s":
            smooth_distance = float(a)
        elif o == "-S":
            max_separation = int(a)
        elif o == "--min-num":
            radial_min_num_particles = int(a)
        elif o == "--missing":
            radial_missing_value = float(a)
        elif o == "--max-r":
            radial_max_r = float(a)
        elif o == "--bin-size":
            radial_bin_r = float(a)
        elif o == "-R":
            radial_mode = True
        else:
            # getopt accepts "-m", but no color scheme implements it; reject
            # it here instead of failing later with an unbound color value
            sys.stderr.write("[E::" + __name__ + "] unsupported option \"" +
                             o + "\"\n")
            return 1
    # both "-i" and "-I" pass the max separation on to their helper
    if max_separation is not None and color_mode not in ("i", "I"):
        sys.stderr.write("[E::" + __name__ +
                         "] \"-S\" must be used with \"-i\" or \"-I\"\n")
        return 1
    if num_color_schemes != 1:
        sys.stderr.write("[E::" + __name__ +
                         "] exactly one color scheme is needed\n")
        return 1

    # read 3DG file
    with open(args[0], "rb") as g3d_file:
        g3d_data = file_to_g3d_data(g3d_file)
    g3d_data.sort_g3d_particles()
    g3d_resolution = g3d_data.resolution()
    num_particles = g3d_data.num_g3d_particles()
    sys.stderr.write(
        "[M::" + __name__ + "] read a 3D structure with " +
        str(num_particles) + " particles at " +
        ("N.A." if g3d_resolution is None else str(g3d_resolution)) +
        " bp resolution\n")

    # prepare the lookup tables / reference point needed by the chosen scheme
    if color_mode == "c":
        ref_name_ref_locus_colors = {}
        with open(color_file_name, "rb") as color_file:
            for color_file_line in color_file:
                ref_name, ref_locus, color_value = (
                    color_file_line.strip().split("\t"))
                ref_name_ref_locus_colors[(ref_name, int(ref_locus))] = float(
                    color_value)
    elif color_mode == "n":
        # color is the 1-based line number of the chromosome in the file
        ref_name_colors = {}
        with open(color_file_name, "rb") as color_file:
            for color_counter, color_file_line in enumerate(color_file, 1):
                ref_name_colors[color_file_line.strip()] = color_counter
    elif color_mode == "l":
        ref_lens = {}
        with open(color_file_name, "rb") as color_file:
            for color_file_line in color_file:
                ref_name, ref_len = color_file_line.strip().split("\t")
                ref_lens[ref_name] = int(ref_len)
    elif color_mode == "L":
        ref_lens = {}
        ref_cens = {}
        with open(color_file_name, "rb") as color_file:
            for color_file_line in color_file:
                ref_name, ref_len, ref_cen = (
                    color_file_line.strip().split("\t"))
                ref_lens[ref_name] = int(ref_len)
                ref_cens[ref_name] = int(ref_cen)
    elif color_mode in ("i", "I", "d", "r"):
        g3d_data.prepare_nearby()
    elif color_mode == "C":
        _, _, position_np_array = g3d_data.to_np_arrays()
        ref_pos = np.mean(position_np_array, axis=0)
        sys.stderr.write("[M::" + __name__ +
                         "] reference point (center of mass) is at (" +
                         ", ".join(map(str, ref_pos)) + ")\n")
    elif color_mode == "D":
        # find reference point position from the first line of the LEG file
        with open(color_file_name, "rb") as color_file:
            ref_leg = string_to_leg(color_file.readline().strip())
        g3d_data.prepare_interpolate()
        is_out, ref_pos = g3d_data.interpolate_leg(ref_leg)
        if ref_pos is None:
            # interpolation can fail; report it instead of crashing below
            sys.stderr.write("[E::" + __name__ + "] reference point (" +
                             ref_leg.to_string() +
                             ") cannot be located in the 3D structure\n")
            return 1
        sys.stderr.write("[M::" + __name__ + "] reference point (" +
                         ref_leg.to_string() + ") is at (" +
                         ", ".join(map(str, ref_pos)) + ")\n")

    # calculate colors for each particle; particles without a color are skipped
    color_data = {}
    for atom_id, g3d_particle in enumerate(g3d_data.get_g3d_particles(), 1):
        if atom_id % disp_num_particles == 0:
            _color_report_progress("analyzed", atom_id, num_particles)

        if color_mode == "c":
            color_key = (g3d_particle.get_ref_name(),
                         g3d_particle.get_ref_locus())
            if color_key not in ref_name_ref_locus_colors:
                continue
            color_value = ref_name_ref_locus_colors[color_key]
        elif color_mode == "n":
            ref_name = g3d_particle.get_ref_name()
            if ref_name not in ref_name_colors:
                continue
            color_value = ref_name_colors[ref_name]
        elif color_mode == "l":
            ref_name = g3d_particle.get_ref_name()
            if ref_name not in ref_lens:
                continue
            color_value = float(
                g3d_particle.get_ref_locus()) / ref_lens[ref_name]
        elif color_mode == "L":
            ref_name = g3d_particle.get_ref_name()
            if ref_name not in ref_cens:
                continue
            # signed offset from the centromere picks the arm
            arm_locus = g3d_particle.get_ref_locus() - ref_cens[ref_name]
            if arm_locus > 0:
                arm_len = ref_lens[ref_name] - ref_cens[ref_name]
            else:
                arm_len = ref_cens[ref_name]
            color_value = float(abs(arm_locus)) / arm_len
        elif color_mode == "i":
            color_value = intra_hom_fraction(
                g3d_particle,
                g3d_data.get_g3d_particles_near(g3d_particle.get_position(),
                                                max_distance), max_separation)
            if color_value is None:
                continue
        elif color_mode == "I":
            color_value = intra_hom_count(
                g3d_particle,
                g3d_data.get_g3d_particles_near(g3d_particle.get_position(),
                                                max_distance), max_separation)
        elif color_mode == "h":
            homologous_g3d_particle = g3d_data.get_g3d_particle_from_hom_name_ref_locus(
                homologous_hom_name(g3d_particle.get_hom_name()),
                g3d_particle.get_ref_locus())
            if homologous_g3d_particle is None:
                continue
            color_value = _color_distance_to_point(
                g3d_particle, (homologous_g3d_particle.get_x(),
                               homologous_g3d_particle.get_y(),
                               homologous_g3d_particle.get_z()))
        elif color_mode == "d":
            color_value = hom_diversity(
                g3d_data.get_g3d_particles_near(g3d_particle.get_position(),
                                                max_distance))
        elif color_mode == "r":
            color_value = hom_richness(
                g3d_data.get_g3d_particles_near(g3d_particle.get_position(),
                                                max_distance))
        else:  # "C" or "D": distance to the reference point
            color_value = _color_distance_to_point(g3d_particle, ref_pos)
        color_data[g3d_particle.get_hom_name(),
                   g3d_particle.get_ref_locus()] = color_value

    # smoothing
    if smooth_distance is not None:
        g3d_data.prepare_nearby()
        smooth_color_data = {}
        for atom_id, g3d_particle in enumerate(g3d_data.get_g3d_particles(),
                                               1):
            if atom_id % disp_num_particles == 0:
                _color_report_progress("smoothed", atom_id, num_particles)
            color_value = smooth_color(
                g3d_particle,
                g3d_data.get_g3d_particles_near(g3d_particle.get_position(),
                                                smooth_distance), color_data)
            if color_value is not None:
                smooth_color_data[g3d_particle.get_hom_name(),
                                  g3d_particle.get_ref_locus()] = color_value
        color_data = smooth_color_data

    # radial
    if radial_mode:
        num_radial_bins = int(radial_max_r / radial_bin_r) + 1
        radial_color_sums = [0.0] * num_radial_bins
        radial_color_nums = [0] * num_radial_bins

        # calculate center of mass, and normalization factor
        _, _, position_np_array = g3d_data.to_np_arrays()
        ref_pos = np.mean(position_np_array, axis=0)
        mean_radial = np.mean(np.sum((position_np_array - ref_pos)**2,
                                     axis=-1)**0.5,
                              axis=0)
        sys.stderr.write("[M::" + __name__ +
                         "] radial mode: average radial distance = " +
                         str(mean_radial) +
                         ", which will be normalize to 1.0\n")

        # examine each particle (counter restarts at 1 so that the progress
        # percentage is relative to this pass only)
        for atom_id, g3d_particle in enumerate(g3d_data.get_g3d_particles(),
                                               1):
            if atom_id % disp_num_particles == 0:
                _color_report_progress("radial mode for", atom_id,
                                       num_particles)
            color_key = (g3d_particle.get_hom_name(),
                         g3d_particle.get_ref_locus())
            if color_key not in color_data:
                continue
            radial = _color_distance_to_point(g3d_particle,
                                              ref_pos) / mean_radial
            # bins are centered on multiples of the bin size
            radial_bin_id = int(radial / radial_bin_r + 0.5)
            if radial_bin_id >= num_radial_bins:
                continue  # out of bound, skip
            radial_color_sums[radial_bin_id] += color_data[color_key]
            radial_color_nums[radial_bin_id] += 1

        # output
        sys.stderr.write("[M::" + __name__ + "] writing radial mode output\n")
        for radial_bin_id in range(num_radial_bins):
            if radial_color_nums[radial_bin_id] < radial_min_num_particles:
                output_value = radial_missing_value
            else:
                output_value = radial_color_sums[
                    radial_bin_id] / radial_color_nums[radial_bin_id]
            sys.stdout.write("\t".join([
                str(radial_bin_id * radial_bin_r),
                str(output_value),
                str(radial_color_nums[radial_bin_id])
            ]) + "\n")

        return 0

    # output
    sys.stderr.write(
        "[M::" + __name__ + "] writing " + str(len(color_data)) + " colors (" +
        str(_color_percent(len(color_data), num_particles)) + "%)\n")
    for hom_name, ref_locus in sorted(color_data.keys()):
        sys.stdout.write("\t".join(
            [hom_name,
             str(ref_locus),
             str(color_data[(hom_name, ref_locus)])]) + "\n")

    return 0
Ejemplo n.º 9
0
def _con3_contact_summary(con_data):
    """Return the "N contacts (P% intra-chromosomal, Q% legs phased)" text.

    Percentages are reported as 0.0 when there are no contacts, so an empty
    result does not raise ZeroDivisionError.
    """
    num_cons = con_data.num_cons()
    if num_cons == 0:
        intra_percent = 0.0
        phased_percent = 0.0
    else:
        intra_percent = round(100.0 * con_data.num_intra_chr() / num_cons, 2)
        phased_percent = round(
            100.0 * con_data.num_phased_legs() / num_cons / 2, 2)
    return (str(num_cons) + " contacts (" + str(intra_percent) +
            "% intra-chromosomal, " + str(phased_percent) + "% legs phased)")


def con3(argv):
    """Entry point of the ``con3`` sub-command.

    Derives contacts from a 3DG structure: every pair of particles within
    ``-d`` of each other yields a contact. Without ``-m`` the contacts are
    written to stdout as text; with ``-m`` a tab-delimited matrix of binned
    counts is written instead, or (with ``-i``) only the list of bins.

    Args:
        argv: argument list; ``argv[0]`` is skipped and ``argv[1:]`` is parsed
            with getopt. The first positional argument is the 3DG file (it is
            not read when ``-i`` is given).

    Returns:
        0 on success; 1 when the usage text is printed, an option is unknown,
        or ``-i`` is given without ``-m``.
    """
    # default parameters
    max_distance = 3.0
    matrix_mode = False
    chr_len_file_name = None
    matrix_bin_size = 1000000
    merge_haplotypes = False
    info_mode = False

    # read arguments
    try:
        opts, args = getopt.getopt(argv[1:], "d:m:b:Hi")
    except getopt.GetoptError:
        sys.stderr.write("[E::" + __name__ + "] unknown command\n")
        return 1
    if len(args) == 0:
        sys.stderr.write("Usage: dip-c con3 [options] <in.3dg>\n")
        sys.stderr.write("Options:\n")
        sys.stderr.write(
            "  -d FLOAT       max distance for generating a contact [" +
            str(max_distance) + "]\n")
        sys.stderr.write(
            "  -m <chr.len>   output a matrix of binned counts based on chromosome lengths (tab-delimited: chr, len)\n"
        )
        sys.stderr.write(
            "  -b INT         bin size (bp) for \"-m\" (bins are centered around multiples of bin size) ["
            + str(matrix_bin_size) + "]\n")
        sys.stderr.write(
            "  -H             merge the two haplotypes (for \"-m\")\n")
        sys.stderr.write(
            "  -i             output bin info (tab-delimited: homolog or chr if \"-H\", bin center) instead (for \"-m\")\n"
        )
        return 1

    for o, a in opts:
        if o == "-d":
            max_distance = float(a)
        elif o == "-m":
            matrix_mode = True
            chr_len_file_name = a
        elif o == "-b":
            matrix_bin_size = int(a)
        elif o == "-H":
            merge_haplotypes = True
        elif o == "-i":
            info_mode = True

    # bin info only exists in matrix mode; without "-m" there would be
    # nothing to output, so report the mistake instead of silently succeeding
    if info_mode and not matrix_mode:
        sys.stderr.write("[E::" + __name__ +
                         "] \"-i\" must be used with \"-m\"\n")
        return 1

    # read 3DG file (not needed when only bin info is requested)
    if not info_mode:
        with open(args[0], "rb") as g3d_file:
            g3d_data = file_to_g3d_data(g3d_file)
        g3d_data.sort_g3d_particles()
        g3d_resolution = g3d_data.resolution()
        sys.stderr.write(
            "[M::" + __name__ + "] read a 3D structure with " +
            str(g3d_data.num_g3d_particles()) + " particles at " +
            ("N.A." if g3d_resolution is None else str(g3d_resolution)) +
            " bp resolution\n")

    if not matrix_mode:
        # contact list output
        con_data = g3d_data_to_con_data(g3d_data, max_distance)
        con_data.sort_cons()
        sys.stderr.write("[M::" + __name__ + "] writing output for " +
                         _con3_contact_summary(con_data) + "\n")
        sys.stdout.write(con_data.to_string() + "\n")
        return 0

    # matrix mode: lay the homologs out one after another along the matrix
    # axis, in the order of the chromosome length file
    hom_offsets = {}  # homolog name -> index of its first bin
    matrix_size = 0
    if merge_haplotypes:
        haplotypes = [Haplotypes.paternal]
    else:
        haplotypes = [Haplotypes.paternal, Haplotypes.maternal]
    with open(chr_len_file_name, "rb") as chr_len_file:
        for chr_len_file_line in chr_len_file:
            ref_name, ref_len = chr_len_file_line.strip().split("\t")
            ref_len = int(ref_len)
            # bins are centered on multiples of the bin size, hence the +1
            hom_bin_len = int(round(float(ref_len) / matrix_bin_size)) + 1
            for haplotype in haplotypes:
                hom_name = ref_name_haplotype_to_hom_name(
                    (ref_name, haplotype))
                hom_offsets[hom_name] = matrix_size
                matrix_size += hom_bin_len

                if info_mode:
                    bin_label = ref_name if merge_haplotypes else hom_name
                    for bin_id in range(hom_bin_len):
                        sys.stdout.write("\t".join(
                            [bin_label,
                             str(bin_id * matrix_bin_size)]) + "\n")

    # generate matrix
    if not info_mode:
        matrix_data = g3d_data_to_matrix(g3d_data, max_distance, hom_offsets,
                                         matrix_bin_size, matrix_size,
                                         merge_haplotypes)
        np.savetxt(sys.stdout, matrix_data, fmt='%i', delimiter='\t')

    return 0
Ejemplo n.º 10
0
def _pd_leg_file_to_positions(g3d_data, leg_file_name):
    """Interpolate the 3D position of every leg listed in a LEG file.

    Returns a float array with one row of three coordinates per input line;
    legs for which ``interpolate_leg`` yields no position get a row of NaN.
    An empty file yields an array of shape (0, 3).
    """
    rows = []
    with open(leg_file_name, "rb") as leg_file:
        for leg_file_line in leg_file:
            is_out, position = g3d_data.interpolate_leg(
                string_to_leg(leg_file_line.strip()))
            if position is None:
                position = [np.nan, np.nan, np.nan]
            rows.append(position)
    # build the array once; stacking row by row copies it on every line
    return np.array(rows, dtype=float).reshape(-1, 3)


def pd(argv):
    """Entry point of the ``pd`` sub-command.

    Interpolates the 3D positions of the legs in one or two LEG files within
    a 3DG structure and writes the tab-delimited matrix of pairwise distances
    (rows: legs of ``-1``, columns: legs of ``-2``) to stdout. When ``-2`` is
    omitted, the ``-1`` file is used for both.

    Args:
        argv: argument list; ``argv[0]`` is skipped and ``argv[1:]`` is parsed
            with getopt. The first positional argument is the 3DG file.

    Returns:
        0 on success; 1 when the usage text is printed, an option is unknown,
        or ``-1`` is missing.
    """
    # default parameters
    leg_file_1_name = None
    leg_file_2_name = None

    # read arguments
    try:
        opts, args = getopt.getopt(argv[1:], "1:2:")
    except getopt.GetoptError:
        sys.stderr.write("[E::" + __name__ + "] unknown command\n")
        return 1
    if len(args) == 0:
        sys.stderr.write(
            "Usage: dip-c pd [options] -1 <in1.leg> [-2 <in2.leg>] <in.3dg>\n")
        sys.stderr.write("Options:\n")
        sys.stderr.write("  -1 <in1.leg>    LEG file (required)\n")
        sys.stderr.write("  -2 <in2.leg>    LEG file [<in1.leg>]\n")
        return 1
    for o, a in opts:
        if o == "-1":
            leg_file_1_name = a
        elif o == "-2":
            leg_file_2_name = a
    if leg_file_1_name is None:
        sys.stderr.write("[E::" + __name__ + "] -1 is required\n")
        return 1
    if leg_file_2_name is None:
        leg_file_2_name = leg_file_1_name

    # read 3DG file
    with open(args[0], "rb") as g3d_file:
        g3d_data = file_to_g3d_data(g3d_file)
    g3d_data.sort_g3d_particles()
    g3d_resolution = g3d_data.resolution()
    sys.stderr.write(
        "[M::" + __name__ + "] read a 3D structure with " +
        str(g3d_data.num_g3d_particles()) + " particles at " +
        ("N.A." if g3d_resolution is None else str(g3d_resolution)) +
        " bp resolution\n")
    g3d_data.prepare_interpolate()

    # convert LEG files to 3D positions
    positions_1 = _pd_leg_file_to_positions(g3d_data, leg_file_1_name)
    positions_2 = _pd_leg_file_to_positions(g3d_data, leg_file_2_name)

    # calculate pairwise distances
    distances = distance.cdist(positions_1, positions_2)
    np.savetxt(sys.stdout, distances, delimiter='\t')

    return 0
Ejemplo n.º 11
0
def _clean3_contact_summary(con_data):
    """Return the "N contacts (P% intra-chromosomal, Q% legs phased)" text.

    Percentages are reported as 0.0 when there are no contacts, so an empty
    CON file does not raise ZeroDivisionError.
    """
    num_cons = con_data.num_cons()
    if num_cons == 0:
        intra_percent = 0.0
        phased_percent = 0.0
    else:
        intra_percent = round(100.0 * con_data.num_intra_chr() / num_cons, 2)
        phased_percent = round(
            100.0 * con_data.num_phased_legs() / num_cons / 2, 2)
    return (str(num_cons) + " contacts (" + str(intra_percent) +
            "% intra-chromosomal, " + str(phased_percent) + "% legs phased)")


def clean3(argv):
    """Entry point of the ``clean3`` sub-command.

    Counts, for each particle of a 3DG structure, the contact legs from a CON
    file that lie within ``-d`` bp of it, logs the distribution of these
    counts, removes the particles whose count falls below the ``-q`` quantile
    and writes the cleaned structure to stdout.

    Args:
        argv: argument list; ``argv[0]`` is skipped and ``argv[1:]`` is parsed
            with getopt. The first positional argument is the 3DG file.

    Returns:
        0 on success; 1 when the usage text is printed, an option is unknown,
        or ``-c`` is missing.
    """
    # default parameters
    con_file_name = None
    max_clean_distance = 500000
    clean_quantile = 0.06

    # display parameters
    display_quantiles = np.arange(0.0, 1.01, 0.01)

    # read arguments
    try:
        opts, args = getopt.getopt(argv[1:], "c:d:q:")
    except getopt.GetoptError:
        sys.stderr.write("[E::" + __name__ + "] unknown command\n")
        return 1
    if len(args) == 0:
        sys.stderr.write(
            "Usage: dip-c clean3 [options] -c <in.con> <in.3dg>\n")
        sys.stderr.write("Options:\n")
        sys.stderr.write(
            "  -c <in.con>    contact file for cleaning (required)\n")
        sys.stderr.write(
            "  -d INT         max distance (bp) from a contact leg to a 3D genome particle ["
            + str(max_clean_distance) + "]\n")
        sys.stderr.write("  -q FLOAT       quantile of particles to remove [" +
                         str(clean_quantile) + "]\n")
        return 1
    for o, a in opts:
        if o == "-c":
            con_file_name = a
        elif o == "-d":
            max_clean_distance = int(a)
        elif o == "-q":
            clean_quantile = float(a)

    if con_file_name is None:
        sys.stderr.write("[E::" + __name__ + "] -c is required\n")
        return 1

    # read 3DG file
    with open(args[0], "rb") as g3d_file:
        g3d_data = file_to_g3d_data(g3d_file)
    g3d_data.sort_g3d_particles()
    g3d_resolution = g3d_data.resolution()
    sys.stderr.write(
        "[M::" + __name__ + "] read a 3D structure with " +
        str(g3d_data.num_g3d_particles()) + " particles at " +
        ("N.A." if g3d_resolution is None else str(g3d_resolution)) +
        " bp resolution\n")

    # read legs from CON file (transparently handles gzip-compressed input)
    con_opener = gzip.open if con_file_name.endswith(".gz") else open
    with con_opener(con_file_name, "rb") as con_file:
        con_data = file_to_con_data(con_file)
    sys.stderr.write("[M::" + __name__ + "] read " +
                     _clean3_contact_summary(con_data) + "\n")
    leg_data = LegData()
    leg_data.add_con_data(con_data)
    leg_data.sort_phased_legs()
    sys.stderr.write("[M::" + __name__ + "] sorted " +
                     str(leg_data.num_legs()) + " legs\n")

    # find cut-off
    leg_counts = g3d_data.leg_counts(leg_data, max_clean_distance)
    sys.stderr.write("[M::" + __name__ + "] statistics:\n")
    sys.stderr.write("quantile\t#legs\n")
    for display_quantile in display_quantiles:
        sys.stderr.write(
            str(display_quantile) + "\t" + str(
                int(
                    round(np.percentile(leg_counts, display_quantile *
                                        100.0), 0))) + "\n")
    # round the cut-off up to a whole number of legs
    min_leg_count = int(
        math.ceil(np.percentile(leg_counts, clean_quantile * 100.0)))
    sys.stderr.write("[M::" + __name__ + "] min leg count: " +
                     str(min_leg_count) + "\n")

    # clean
    g3d_data.clean_leg_poor(leg_data, max_clean_distance, min_leg_count)
    # NOTE(review): the recomputed counts are never read afterwards; the call
    # is kept in case leg_counts() has side effects -- confirm and drop it
    leg_counts = g3d_data.leg_counts(leg_data, max_clean_distance)
    g3d_data.sort_g3d_particles()
    sys.stderr.write("[M::" + __name__ + "] writing output for " +
                     str(g3d_data.num_g3d_particles()) + " particles\n")
    sys.stdout.write(g3d_data.to_string() + "\n")

    return 0