def pos(argv):
    """Convert the legs of a LEG file to 3D positions using a 3DG structure.

    One line is written to standard output per input leg: the tab-delimited
    interpolated position, or the word None when no position is available
    (or when the leg is out of bound and "-O" was given).

    Args:
        argv: command-line arguments; argv[0] is the sub-command name and is
            skipped, the rest are options followed by the input 3DG file.

    Returns:
        0 on success, 1 on a usage or argument error.
    """
    # default parameters
    leg_file_name = None
    in_only = False

    # read arguments
    try:
        opts, args = getopt.getopt(argv[1:], "l:O")
    except getopt.GetoptError:
        sys.stderr.write("[E::" + __name__ + "] unknown command\n")
        return 1
    if len(args) == 0:
        # this sub-command is "pos"; the usage line used to advertise "leg"
        sys.stderr.write("Usage: dip-c pos [options] -l <in.leg> <in.3dg>\n")
        sys.stderr.write("Options:\n")
        sys.stderr.write(
            " -l <in.leg> LEG file to convert to 3D positions (required)\n")
        sys.stderr.write(" -O exclude out-of-bound legs\n")
        return 1
    for o, a in opts:
        if o == "-l":
            leg_file_name = a
        elif o == "-O":
            in_only = True
    if leg_file_name is None:
        sys.stderr.write("[E::" + __name__ + "] -l is required\n")
        return 1

    # read 3DG file (closed as soon as it has been parsed)
    with open(args[0], "rb") as g3d_file:
        g3d_data = file_to_g3d_data(g3d_file)
    g3d_data.sort_g3d_particles()
    g3d_resolution = g3d_data.resolution()
    sys.stderr.write(
        "[M::" + __name__ + "] read a 3D structure with " +
        str(g3d_data.num_g3d_particles()) + " particles at " +
        ("N.A." if g3d_resolution is None else str(g3d_resolution)) +
        " bp resolution\n")
    g3d_data.prepare_interpolate()

    # convert LEG file to 3D positions, one output line per leg
    with open(leg_file_name, "rb") as leg_file:
        for leg_file_line in leg_file:
            is_out, position = g3d_data.interpolate_leg(
                string_to_leg(leg_file_line.strip()))
            if position is None or (is_out and in_only):
                sys.stdout.write("None\n")
            else:
                sys.stdout.write("\t".join(map(str, position)) + "\n")

    return 0
def vis(argv):
    """Write a 3DG structure to standard output in mmCIF format.

    Every particle becomes one "atom_site" row and every pair of adjacent
    particles becomes one "struct_conn" row. The scalar color of a particle
    is looked up by (homolog name, locus) in an optional color file.

    Args:
        argv: command-line arguments; argv[0] is the sub-command name and is
            skipped, the rest are options followed by the input 3DG file.

    Returns:
        0 on success, 1 on a usage or argument error.
    """
    # default parameters
    color_file_name = None
    missing_value = -1.0
    discard_missing = False

    # read arguments
    try:
        opts, args = getopt.getopt(argv[1:], "c:m:M")
    except getopt.GetoptError:
        sys.stderr.write("[E::" + __name__ + "] unknown command\n")
        return 1
    if len(args) == 0:
        sys.stderr.write("Usage: dip-c vis [options] <in.3dg>\n")
        sys.stderr.write("Options:\n")
        sys.stderr.write(
            " -c <color.txt> color by a list of locus-color pairs (tab-delimited: homolog, locus, color)\n"
        )
        sys.stderr.write(
            " -m FLOAT color for particles that are missing from the color scheme ["
            + str(missing_value) + "]\n")
        sys.stderr.write(
            " -M discard particles that are missing from the color scheme\n\n"
        )
        sys.stderr.write("Output mmCIF format:\n")
        sys.stderr.write(
            " label_asym_id homolog name (e.g. \"1(mat)\")\n")
        sys.stderr.write(
            " label_comp_id locus // 1 Mb, 3 digits with leading zeros\n")
        sys.stderr.write(" label_seq_id 1\n")
        sys.stderr.write(
            " label_atom_id locus % 1 Mb // 1 kb, 3 digits with leading zeros\n"
        )
        sys.stderr.write(" B_iso_or_equiv scalar color\n")
        sys.stderr.write(" covale backbone bond\n")
        return 1
    for o, a in opts:
        if o == "-m":
            missing_value = float(a)
        elif o == "-c":
            color_file_name = a
        elif o == "-M":
            discard_missing = True

    # read 3DG file (closed as soon as it has been parsed)
    with open(args[0], "rb") as g3d_file:
        g3d_data = file_to_g3d_data(g3d_file)
    g3d_data.sort_g3d_particles()
    g3d_resolution = g3d_data.resolution()
    sys.stderr.write(
        "[M::" + __name__ + "] read a 3D structure with " +
        str(g3d_data.num_g3d_particles()) + " particles at " +
        ("N.A." if g3d_resolution is None else str(g3d_resolution)) +
        " bp resolution\n")

    # read color file: (homolog name, locus) -> color
    color_data = {}
    if color_file_name is not None:
        with open(color_file_name, "rb") as color_file:
            for color_file_line in color_file:
                hom_name, ref_locus, color = color_file_line.strip().split("\t")
                color_data[(hom_name, int(ref_locus))] = float(color)

    # prepare mmCIF categories
    curContainer = DataContainer("myblock")
    aCat = DataCategory("atom_site")
    for attribute in ("group_PDB", "type_symbol", "id", "label_asym_id",
                      "label_comp_id", "label_seq_id", "label_atom_id",
                      "Cartn_x", "Cartn_y", "Cartn_z", "B_iso_or_equiv"):
        aCat.appendAttribute(attribute)
    sCat = DataCategory("struct_conn")
    for attribute in ("id", "conn_type_id", "ptnr1_label_asym_id",
                      "ptnr1_label_comp_id", "ptnr1_label_seq_id",
                      "ptnr1_label_atom_id", "ptnr2_label_asym_id",
                      "ptnr2_label_comp_id", "ptnr2_label_seq_id",
                      "ptnr2_label_atom_id"):
        sCat.appendAttribute(attribute)

    # write atoms; atom ids count every particle, including discarded ones
    for atom_id, g3d_particle in enumerate(g3d_data.get_g3d_particles(), 1):
        try:
            color = color_data[(g3d_particle.get_hom_name(),
                                g3d_particle.get_ref_locus())]
        except KeyError:
            if discard_missing:
                continue
            color = missing_value
        aCat.append(g3d_particle_to_atom_data(g3d_particle, atom_id, color))

    # write backbone bonds
    adjacent_tuples = g3d_data.get_adjacent_g3d_particle_tuples(g3d_resolution)
    for conn_id, g3d_particle_tuple in enumerate(adjacent_tuples, 1):
        sCat.append(
            g3d_particle_tuple_to_conn_data(g3d_particle_tuple, conn_id))

    # write output (bonds first, then atoms, as before)
    curContainer.append(sCat)
    curContainer.append(aCat)
    pdbxW = PdbxWriter(sys.stdout)
    pdbxW.write([curContainer])

    return 0
def exp(argv):
    """Translate each homolog of a 3DG structure away from the nuclear center.

    Every homolog is shifted by (homolog center - nuclear center) times the
    expansion factor and the resulting structure is written to standard
    output. With "-c", only one particle per homolog (its shifted center) is
    written instead. In the default mode, "extract", "translate" and
    "mview store" command lines are also written to standard error.

    Args:
        argv: command-line arguments; argv[0] is the sub-command name and is
            skipped, the rest are options followed by the input 3DG file.

    Returns:
        0 on success, 1 on a usage or argument error.
    """
    # default parameters
    expansion_factor = 3.0
    centers_only = False

    # read arguments
    try:
        opts, args = getopt.getopt(argv[1:], "f:c")
    except getopt.GetoptError:
        sys.stderr.write("[E::" + __name__ + "] unknown command\n")
        return 1
    if len(args) == 0:
        sys.stderr.write("Usage: dip-c exp [options] <in.3dg>\n")
        sys.stderr.write("Options:\n")
        sys.stderr.write(
            " -f FLOAT expansion factor for translating away from nuclear center ["
            + str(expansion_factor) + "]\n")
        sys.stderr.write(" -c output centers of mass\n")
        return 1
    for o, a in opts:
        if o == "-f":
            expansion_factor = float(a)
        elif o == "-c":
            centers_only = True

    # read 3DG file (closed as soon as it has been parsed)
    with open(args[0], "rb") as g3d_file:
        g3d_data = file_to_g3d_data(g3d_file)
    g3d_data.sort_g3d_particles()
    g3d_resolution = g3d_data.resolution()
    sys.stderr.write(
        "[M::" + __name__ + "] read a 3D structure with " +
        str(g3d_data.num_g3d_particles()) + " particles at " +
        ("N.A." if g3d_resolution is None else str(g3d_resolution)) +
        " bp resolution\n")

    # center of nucleus
    nuc_center = center_g3d_particles(g3d_data.get_g3d_particles())

    # process data
    if centers_only:
        # one particle per homolog, placed at its translated center
        center_g3d_data = G3dData()
        for hom_name in g3d_data.get_hom_names():
            center_position = center_g3d_particles(
                g3d_data.get_g3d_particles_from_hom_name(hom_name))
            center_position += (center_position - nuc_center) * expansion_factor
            center_g3d_data.add_g3d_particle(
                G3dParticle(hom_name, 0, center_position.tolist()))
        g3d_data = center_g3d_data
    else:
        # center of each homolog (computed before anything is moved)
        hom_centers = {}
        for hom_name in g3d_data.get_hom_names():
            hom_centers[hom_name] = center_g3d_particles(
                g3d_data.get_g3d_particles_from_hom_name(hom_name))
            sys.stderr.write("extract " + hom_name_to_object_name(hom_name) +
                             ", chain \"" + hom_name + "\"\n")

        # translate every particle of a homolog by the same vector
        for hom_name in g3d_data.get_hom_names():
            translation_vector = (
                hom_centers[hom_name] - nuc_center) * expansion_factor
            for g3d_particle in g3d_data.get_g3d_particles_from_hom_name(
                    hom_name):
                g3d_particle.set_position(
                    (np.array(g3d_particle.get_position()) +
                     translation_vector).tolist())
            sys.stderr.write("translate [" +
                             ",".join(map(str, translation_vector)) +
                             "], object=" + hom_name_to_object_name(hom_name) +
                             ", camera=0\n")
        for hom_name in g3d_data.get_hom_names():
            sys.stderr.write("mview store, object=" +
                             hom_name_to_object_name(hom_name) + "\n")

    # output
    sys.stderr.write("[M::" + __name__ + "] writing output for " +
                     str(g3d_data.num_g3d_particles()) + " particles\n")
    sys.stdout.write(g3d_data.to_string() + "\n")

    return 0
def impute3(argv):
    """Impute contact haplotypes from a 3D structure and clean the result.

    Reads a 3DG file ("-3") and a CON file, imputes haplotypes of the
    contacts from the 3D structure, removes unphased and isolated contacts,
    and writes the remaining contacts to standard output. Per-contact
    statistics are optionally written to a violation file ("-v").

    Args:
        argv: command-line arguments; argv[0] is the sub-command name and is
            skipped, the rest are options followed by the input CON file
            (gzip-compressed when its name ends with ".gz").

    Returns:
        0 on success, 1 on a usage or argument error.
    """

    def percent(numerator, denominator):
        # formatted percentage for progress messages; an empty input must not
        # abort the run with a ZeroDivisionError
        if denominator == 0:
            return "0.0"
        return str(round(100.0 * numerator / denominator, 2))

    # default parameters
    g3d_file_name = None
    vio_file_name = None
    max_impute3_distance = 20
    max_impute3_ratio = 0.5
    min_impute3_separation_factor = 1.0
    max_clean_distance = 10000000
    min_clean_count = 2
    is_male = False
    par_data = None

    # presets: name -> [is_male, par_data]
    h_par = ParData("X", "Y")
    h_par.add_par(Par("X", 60000, 2699520, "Y", 10000))
    h_par.add_par(Par("X", 154931043, 155260560, "Y", 59034049))
    m_par = ParData("chrX", "chrY")
    m_par.add_par(Par("chrX", 169969758, 170931299, "chrY", 90745844))
    presets = {
        "f": [False, None],
        "hf": [False, None],
        "mf": [False, None],
        "hm": [True, h_par],
        "mm": [True, m_par]
    }
    preset_descriptions = {
        "f": "female",
        "hf": "human female (same as f)",
        "mf": "mouse female (same as f)",
        "hm": "human male (hg19, no \"chr\" prefix)",
        "mm": "mouse male (mm10)"
    }

    # read arguments
    try:
        opts, args = getopt.getopt(argv[1:], "3:v:d:r:s:D:C:p:")
    except getopt.GetoptError:
        sys.stderr.write("[E::" + __name__ + "] unknown command\n")
        return 1
    if len(args) == 0:
        sys.stderr.write(
            "Usage: dip-c impute3 [options] -3 <in.3dg> [-v <out.vio>] <in.con>\n"
        )
        sys.stderr.write("Options:\n")
        sys.stderr.write(
            " -3 <in.3dg> 3D genome file for imputing haplotypes (required)\n"
        )
        sys.stderr.write(
            " -v <out.vio> output statistics to a contact violation file:\n"
        )
        sys.stderr.write(
            " tab-delimited: leg 1, leg 2, num of compatible haplotypes,\n"
        )
        sys.stderr.write(
            " shortest 3D distance, ratio between the shortest and the 2nd shortest distance\n\n"
        )
        sys.stderr.write(
            " -d FLOAT max 3D distance for imputing haplotypes [" +
            str(max_impute3_distance) + "]\n")
        sys.stderr.write(
            " -r FLOAT max ratio between 3D distances for the best and 2nd best haplotypes ["
            + str(max_impute3_ratio) + "]\n")
        sys.stderr.write(
            " -s FLOAT min separation (unit: 3D genome resolution) for imputing\n"
        )
        sys.stderr.write(
            " completely unphased, intra-chromosomal contacts [" +
            str(min_impute3_separation_factor) + "]\n\n")
        sys.stderr.write(
            " -D INT max distance (bp, L-1/2 norm) for removing isolated contacts ["
            + str(max_clean_distance) + "]\n")
        sys.stderr.write(
            " -C INT min neighbor count for an unisolated contact [" +
            str(min_clean_count) + "]\n\n")
        sys.stderr.write(" -p STR presets for PARs and sex: [f]\n")
        for preset in sorted(presets.keys()):
            sys.stderr.write(" " + preset + " = " +
                             preset_descriptions[preset] + "\n")
        return 1
    for o, a in opts:
        if o == "-3":
            g3d_file_name = a
        elif o == "-v":
            vio_file_name = a
        elif o == "-d":
            max_impute3_distance = float(a)
        elif o == "-r":
            max_impute3_ratio = float(a)
        elif o == "-s":
            min_impute3_separation_factor = float(a)
        elif o == "-D":
            max_clean_distance = int(a)
        elif o == "-C":
            min_clean_count = int(a)
        elif o == "-p":
            try:
                is_male, par_data = presets[a]
                sys.stderr.write("[M::" + __name__ + "] use preset " + a +
                                 " = " + preset_descriptions[a] + "\n")
            except KeyError:
                sys.stderr.write("[E::" + __name__ + "] unknown preset\n")
                return 1
    if g3d_file_name is None:
        sys.stderr.write("[E::" + __name__ + "] -3 is required\n")
        return 1

    # read 3DG file (closed as soon as it has been parsed)
    with open(g3d_file_name, "rb") as g3d_file:
        g3d_data = file_to_g3d_data(g3d_file)
    g3d_data.sort_g3d_particles()
    g3d_resolution = g3d_data.resolution()
    sys.stderr.write(
        "[M::" + __name__ + "] read a 3D structure with " +
        str(g3d_data.num_g3d_particles()) + " particles at " +
        ("N.A." if g3d_resolution is None else str(g3d_resolution)) +
        " bp resolution\n")
    if g3d_resolution is None:
        # the minimum separation is expressed in units of the resolution, so
        # it cannot be computed without one
        sys.stderr.write("[E::" + __name__ +
                         "] cannot determine the 3D genome resolution\n")
        return 1
    g3d_data.prepare_interpolate()

    # read CON file
    if args[0].endswith(".gz"):
        con_file = gzip.open(args[0], "rb")
    else:
        con_file = open(args[0], "rb")
    try:
        con_data = file_to_con_data(con_file)
    finally:
        con_file.close()
    sys.stderr.write(
        "[M::" + __name__ + "] read " + str(con_data.num_cons()) +
        " contacts (" +
        percent(con_data.num_intra_chr(), con_data.num_cons()) +
        "% intra-chromosomal, " +
        percent(con_data.num_phased_legs(), 2 * con_data.num_cons()) +
        "% legs phased)\n")

    # impute3
    vio_file = None
    if vio_file_name is not None:
        vio_file = open(vio_file_name, "wb")
    try:
        # the minimum separation comes from "-s"; it used to be computed from
        # the "-r" ratio by mistake, which made "-s" a no-op
        con_data.impute_from_g3d_data(
            g3d_data, max_impute3_distance, max_impute3_ratio,
            min_impute3_separation_factor * g3d_resolution, is_male, par_data,
            vio_file)
    finally:
        if vio_file is not None:
            vio_file.close()
    sys.stderr.write(
        "[M::" + __name__ + "] imputed " + str(con_data.num_phased_cons()) +
        " contacts (" +
        percent(con_data.num_phased_cons(), con_data.num_cons()) + "%)\n")

    # clean imputed
    con_data.sort_cons()
    con_data.clean_unphased()
    before_clean_num_cons = con_data.num_cons()
    con_data.clean_isolated_phased(
        copy.deepcopy(con_data), max_clean_distance, min_clean_count)
    after_clean_num_cons = con_data.num_cons()
    num_removed = before_clean_num_cons - after_clean_num_cons
    sys.stderr.write("[M::" + __name__ + "] removed " + str(num_removed) +
                     " isolated contacts (" +
                     percent(num_removed, before_clean_num_cons) + "%)\n")

    # write output
    sys.stderr.write(
        "[M::" + __name__ + "] writing output for " +
        str(con_data.num_cons()) + " contacts (" +
        percent(con_data.num_intra_chr(), con_data.num_cons()) +
        "% intra-chromosomal, " +
        percent(con_data.num_phased_legs(), 2 * con_data.num_cons()) +
        "% legs phased)\n")
    sys.stdout.write(con_data.to_string() + "\n")

    return 0
def color(argv):
    """Compute one scalar color per particle of a 3DG structure.

    Exactly one color scheme must be chosen on the command line. The colors
    are written to standard output as tab-delimited lines (homolog, locus,
    color), optionally after smoothing ("-s").

    NOTE(review): another function named ``color`` appears later in this
    source; if both live in the same module the later definition replaces
    this one -- confirm which is meant to be used.

    Args:
        argv: command-line arguments; argv[0] is the sub-command name and is
            skipped, the rest are options followed by the input 3DG file.

    Returns:
        0 on success, 1 on a usage or argument error.
    """
    # default parameters
    color_file_name = None
    color_mode = None
    max_distance = None
    smooth_distance = None
    max_separation = None

    # color schemes that the per-particle loop below knows how to compute
    supported_modes = ("c", "n", "l", "L", "h", "i", "I", "d", "r", "C")

    # display parameters
    disp_num_particles = 1000

    # read arguments
    try:
        opts, args = getopt.getopt(argv[1:], "c:n:l:m:L:i:s:S:hd:r:I:C")
    except getopt.GetoptError:
        sys.stderr.write("[E::" + __name__ + "] unknown command\n")
        return 1
    if len(args) == 0:
        sys.stderr.write("Usage: dip-c color [options] <in.3dg>\n")
        sys.stderr.write("Options:\n")
        sys.stderr.write(" -c <color.txt> color by a list of locus-color pairs (tab-delimited: chr, locus, color)\n")
        sys.stderr.write(" -n <chr.txt> color by chromosome name (one chromosome per line)\n")
        sys.stderr.write(" -l <chr.len> color by locus divided by chromosome length (tab-delimited: chr, len)\n")
        sys.stderr.write(" -L <chr.cen> color by arm locus divided by arm length (tab-delimited: chr, len, center of centromere)\n")
        sys.stderr.write(" -h color by distance to homologous locus\n\n")
        sys.stderr.write(" -i FLOAT color by percentage of intra-homologous neighbors within a given distance\n")
        sys.stderr.write(" -I FLOAT color by number of intra-homologous neighbors within a given distance\n")
        sys.stderr.write(" -S INT (with \"-i\" or \"-I\") max separation (bp) for intra-homologous neighbors\n\n")
        sys.stderr.write(" -d FLOAT color by homolog diversity within a given distance\n")
        sys.stderr.write(" -r FLOAT color by homolog richness within a given distance\n\n")
        sys.stderr.write(" -C color by distance to the nuclear center of mass\n")
        sys.stderr.write(" -s FLOAT smooth color by averaging over a ball\n")
        sys.stderr.write("Output:\n")
        sys.stderr.write(" tab-delimited: homolog, locus, color\n")
        return 1

    num_color_schemes = 0
    for o, a in opts:
        if o in ("-i", "-I", "-d", "-r"):
            # neighborhood-based schemes take the distance as their argument
            num_color_schemes += 1
            color_mode = o[1:]
            max_distance = float(a)
        elif o == "-s":
            smooth_distance = float(a)
        elif o == "-S":
            max_separation = int(a)
        else:
            # every other option selects a scheme, possibly with an input file
            num_color_schemes += 1
            color_mode = o[1:]
            if a != "":
                color_file_name = a
    # "-S" is meaningful for both "-i" and "-I" (both pass max_separation on)
    if max_separation is not None and color_mode not in ("i", "I"):
        sys.stderr.write("[E::" + __name__ + "] \"-S\" must be used with \"-i\" or \"-I\"\n")
        return 1
    if num_color_schemes != 1:
        sys.stderr.write("[E::" + __name__ + "] exactly one color scheme is needed\n")
        return 1
    if color_mode not in supported_modes:
        # e.g. "-m", which the option string accepts but no scheme implements
        sys.stderr.write("[E::" + __name__ + "] unsupported color scheme \"-" + color_mode + "\"\n")
        return 1

    # read 3DG file (closed as soon as it has been parsed)
    with open(args[0], "rb") as g3d_file:
        g3d_data = file_to_g3d_data(g3d_file)
    g3d_data.sort_g3d_particles()
    g3d_resolution = g3d_data.resolution()
    sys.stderr.write("[M::" + __name__ + "] read a 3D structure with " + str(g3d_data.num_g3d_particles()) + " particles at " + ("N.A." if g3d_resolution is None else str(g3d_resolution)) + " bp resolution\n")

    # open color file
    color_file = None
    if color_file_name is not None:
        color_file = open(color_file_name, "rb")

    # prepare
    try:
        if color_mode == "c":
            ref_name_ref_locus_colors = {}
            for color_file_line in color_file:
                ref_name, ref_locus, color = color_file_line.strip().split("\t")
                ref_name_ref_locus_colors[(ref_name, int(ref_locus))] = float(color)
        elif color_mode == "n":
            # color is the 1-based line number of the chromosome name
            ref_name_colors = {}
            color_counter = 0
            for color_file_line in color_file:
                color_counter += 1
                ref_name_colors[color_file_line.strip()] = color_counter
        elif color_mode == "l":
            ref_lens = {}
            for color_file_line in color_file:
                ref_name, ref_len = color_file_line.strip().split("\t")
                ref_lens[ref_name] = int(ref_len)
        elif color_mode == "L":
            ref_lens = {}
            ref_cens = {}
            for color_file_line in color_file:
                ref_name, ref_len, ref_cen = color_file_line.strip().split("\t")
                ref_lens[ref_name] = int(ref_len)
                ref_cens[ref_name] = int(ref_cen)
        elif color_mode in ("i", "I", "d", "r"):
            g3d_data.prepare_nearby()
        elif color_mode == "C":
            hom_names, loci_np_array, position_np_array = g3d_data.to_np_arrays()
            center_mass = np.mean(position_np_array, axis=0)
            sys.stderr.write("[M::" + __name__ + "] center of mass is at (" + ", ".join(map(str, center_mass)) + ")\n")
    finally:
        if color_file is not None:
            color_file.close()

    # calculate colors for each particle; particles without a color are skipped
    color_data = {}
    atom_id = 0
    for g3d_particle in g3d_data.get_g3d_particles():
        atom_id += 1
        if atom_id % disp_num_particles == 0:
            sys.stderr.write("[M::" + __name__ + "] analyzed " + str(atom_id) + " particles (" + str(round(100.0 * atom_id / g3d_data.num_g3d_particles(), 2)) + "%)\n")

        # color
        if color_mode == "c":
            try:
                color = ref_name_ref_locus_colors[(g3d_particle.get_ref_name(), g3d_particle.get_ref_locus())]
            except KeyError:
                continue
        elif color_mode == "n":
            try:
                color = ref_name_colors[g3d_particle.get_ref_name()]
            except KeyError:
                continue
        elif color_mode == "l":
            try:
                color = float(g3d_particle.get_ref_locus()) / ref_lens[g3d_particle.get_ref_name()]
            except KeyError:
                continue
        elif color_mode == "L":
            try:
                ref_name = g3d_particle.get_ref_name()
                ref_cen = ref_cens[ref_name]
                arm_locus = g3d_particle.get_ref_locus() - ref_cen
                # arm length is measured from the centromere to the arm's end
                if arm_locus > 0:
                    arm_len = ref_lens[ref_name] - ref_cen
                else:
                    arm_len = ref_cen
                color = float(abs(arm_locus)) / arm_len
            except KeyError:
                continue
        elif color_mode == "i":
            color = intra_hom_fraction(g3d_particle, g3d_data.get_g3d_particles_near(g3d_particle.get_position(), max_distance), max_separation)
            if color is None:
                continue
        elif color_mode == "I":
            color = intra_hom_count(g3d_particle, g3d_data.get_g3d_particles_near(g3d_particle.get_position(), max_distance), max_separation)
        elif color_mode == "h":
            homologous_g3d_particle = g3d_data.get_g3d_particle_from_hom_name_ref_locus(homologous_hom_name(g3d_particle.get_hom_name()), g3d_particle.get_ref_locus())
            if homologous_g3d_particle is None:
                continue
            color = math.sqrt((g3d_particle.get_x() - homologous_g3d_particle.get_x()) ** 2 + (g3d_particle.get_y() - homologous_g3d_particle.get_y()) ** 2 + (g3d_particle.get_z() - homologous_g3d_particle.get_z()) ** 2)
        elif color_mode == "d":
            color = hom_diversity(g3d_data.get_g3d_particles_near(g3d_particle.get_position(), max_distance))
        elif color_mode == "r":
            color = hom_richness(g3d_data.get_g3d_particles_near(g3d_particle.get_position(), max_distance))
        elif color_mode == "C":
            color = math.sqrt((g3d_particle.get_x() - center_mass[0]) ** 2 + (g3d_particle.get_y() - center_mass[1]) ** 2 + (g3d_particle.get_z() - center_mass[2]) ** 2)
        color_data[g3d_particle.get_hom_name(), g3d_particle.get_ref_locus()] = color

    # smoothing
    if smooth_distance is not None:
        g3d_data.prepare_nearby()
        smooth_color_data = {}
        atom_id = 0
        for g3d_particle in g3d_data.get_g3d_particles():
            atom_id += 1
            if atom_id % disp_num_particles == 0:
                sys.stderr.write("[M::" + __name__ + "] smoothed " + str(atom_id) + " particles (" + str(round(100.0 * atom_id / g3d_data.num_g3d_particles(), 2)) + "%)\n")
            color = smooth_color(g3d_particle, g3d_data.get_g3d_particles_near(g3d_particle.get_position(), smooth_distance), color_data)
            if color is not None:
                smooth_color_data[g3d_particle.get_hom_name(), g3d_particle.get_ref_locus()] = color
        color_data = smooth_color_data

    # output
    sys.stderr.write("[M::" + __name__ + "] writing " + str(len(color_data)) + " colors (" + str(round(100.0 * len(color_data) / g3d_data.num_g3d_particles(), 2)) + "%)\n")
    for hom_name, ref_locus in sorted(color_data.keys()):
        sys.stdout.write("\t".join([hom_name, str(ref_locus), str(color_data[(hom_name, ref_locus)])]) + "\n")

    return 0
def rg(argv):
    """Write a radius-of-gyration (or pairwise-distance) matrix per region.

    For every analyzed region two files are written: "<prefix><region>.loc"
    with the loci of the region's particles and "<prefix><region>.rg" with
    the matrix. Without "-r", one region per homolog of the structure is
    analyzed.

    Args:
        argv: command-line arguments; argv[0] is the sub-command name and is
            skipped, the rest are options followed by the input 3DG file.

    Returns:
        0 on success, 1 on a usage or argument error.
    """
    # default parameters
    output_prefix = None
    reg_file_name = None
    reg_list = []
    distance_mode = False

    # display parameters
    disp_num_particles = 100

    # read arguments
    try:
        opts, args = getopt.getopt(argv[1:], "o:r:d")
    except getopt.GetoptError:
        sys.stderr.write("[E::" + __name__ + "] unknown command\n")
        return 1
    if len(args) == 0:
        sys.stderr.write("Usage: dip-c rg [options] <in.3dg>\n")
        sys.stderr.write("Options:\n")
        sys.stderr.write(" -o STR output prefix [<in.3dg>.]\n")
        sys.stderr.write(" -r <in.reg> only analyze certain regions\n")
        sys.stderr.write(" (will output two regions if haplotype is \".\")\n")
        sys.stderr.write(" -d output pairwise distances instead\n")
        sys.stderr.write("Output:\n")
        sys.stderr.write(" <prefix><region_name>.rg an N x N matrix of radii of gyration\n")
        sys.stderr.write(" <prefix><region_name>.loc a list of N chromosomal loci\n")
        return 1
    for o, a in opts:
        if o == "-o":
            output_prefix = a
        elif o == "-r":
            reg_file_name = a
        elif o == "-d":
            distance_mode = True
    if output_prefix is None:
        output_prefix = args[0] + "."

    # read 3DG file (closed as soon as it has been parsed)
    with open(args[0], "rb") as g3d_file:
        g3d_data = file_to_g3d_data(g3d_file)
    g3d_data.sort_g3d_particles()
    g3d_resolution = g3d_data.resolution()
    # report a missing resolution as "N.A.", like the other sub-commands
    sys.stderr.write("[M::" + __name__ + "] read a 3D structure with " + str(g3d_data.num_g3d_particles()) + " particles at " + ("N.A." if g3d_resolution is None else str(g3d_resolution)) + " bp resolution\n")

    # prepare regions to analyze
    if reg_file_name is None:
        # analyze all homologs
        for hom_name in g3d_data.get_hom_names():
            ref_name, haplotype = hom_name_to_ref_name_haplotype(hom_name)
            reg = Reg(ref_name)
            reg.add_haplotype(haplotype)
            reg_list.append(reg)
    else:
        if reg_file_name.endswith(".gz"):
            reg_file = gzip.open(reg_file_name, "rb")
        else:
            reg_file = open(reg_file_name, "rb")
        try:
            reg_list.extend(file_to_reg_list(reg_file))
        finally:
            reg_file.close()
        reg_list = list(get_phased_regs(reg_list))
    sys.stderr.write("[M::" + __name__ + "] will analyze the following regions:\n")
    sys.stderr.write("name\tchr\thap\tstart\tend\n")
    for reg in reg_list:
        sys.stderr.write(reg.to_name_string() + "\t" + reg.to_string() + "\n")

    # calculate Rg matrix for each region
    for reg in reg_list:
        reg_name = reg.to_name_string()
        g3d_list = G3dList()
        for g3d_particle in g3d_data.get_g3d_particles_in_reg(reg):
            g3d_list.add_g3d_particle(g3d_particle)
        g3d_list.sort_g3d_particles()
        sys.stderr.write("[M::" + __name__ + "] processing region " + reg_name + ", with " + str(g3d_list.num_g3d_particles()) + " particles\n")
        loci_np_array, position_np_array = g3d_list.to_np_arrays()

        # write loci file
        loci_file_name = output_prefix + reg_name + ".loc"
        np.savetxt(loci_file_name, loci_np_array, fmt='%i', delimiter='\t')

        # calculate Rg (or plain pairwise distances with "-d")
        rg_file_name = output_prefix + reg_name + ".rg"
        if distance_mode:
            output_matrix = squareform(pdist(position_np_array))
        else:
            output_matrix = position_np_array_to_rg_np_array(position_np_array, disp_num_particles)
        np.savetxt(rg_file_name, output_matrix, delimiter='\t')

    return 0
def dist(argv):
    """Summarize 3D distance as a function of genomic separation per homolog.

    For every homolog and every distinct separation between two of its
    particles, one tab-delimited line is written to standard output:
    homolog, separation, number of pairs, mean distance, r.m.s. distance.

    Args:
        argv: command-line arguments; argv[0] is the sub-command name and is
            skipped, the rest are options followed by the input 3DG file.

    Returns:
        0 on success, 1 on a usage or argument error.
    """
    # read arguments
    try:
        opts, args = getopt.getopt(argv[1:], "d")
    except getopt.GetoptError:
        sys.stderr.write("[E::" + __name__ + "] unknown command\n")
        return 1
    if len(args) == 0:
        sys.stderr.write("Usage: dip-c dist [options] <in.3dg>\n")
        sys.stderr.write("Options:\n")
        sys.stderr.write(" -d diploid mode\n")
        sys.stderr.write("Output:\n")
        sys.stderr.write(
            " tab-delimited: homolog (chr if \"-d\"), separation (in bp), #pairs, mean distance, r.m.s. distance\n"
        )
        return 1
    for o, a in opts:
        if o == "-d":
            # The option loop used to test "-o"/"-r", which getopt can never
            # return here, so "-d" was silently ignored. Diploid aggregation
            # is still not implemented; say so instead of staying silent.
            sys.stderr.write(
                "[W::" + __name__ +
                "] diploid mode (\"-d\") is not implemented; reporting each homolog separately\n"
            )

    # read 3DG file (closed as soon as it has been parsed)
    with open(args[0], "rb") as g3d_file:
        g3d_data = file_to_g3d_data(g3d_file)
    g3d_data.sort_g3d_particles()
    g3d_resolution = g3d_data.resolution()
    # report a missing resolution as "N.A.", like the other sub-commands
    sys.stderr.write(
        "[M::" + __name__ + "] read a 3D structure with " +
        str(g3d_data.num_g3d_particles()) + " particles at " +
        ("N.A." if g3d_resolution is None else str(g3d_resolution)) +
        " bp resolution\n")

    # analyze each homolog
    for hom_name in g3d_data.get_hom_names():
        sys.stderr.write("analyzing " + hom_name + "\n")
        loci_np_array, position_np_array = g3d_data.get_g3d_list_from_hom_name(
            hom_name).to_np_arrays()
        sep_np_array = pdist(loci_np_array)
        dist_np_array = pdist(position_np_array)
        uniq_seps, uniq_indices = np.unique(sep_np_array, return_inverse=True)
        num_seps = len(uniq_seps)

        # calculate statistics: group the pairs by separation and accumulate
        # count, sum and sum of squares per group without a Python-level loop
        nums_pairs = np.bincount(uniq_indices, minlength=num_seps)
        sums = np.bincount(
            uniq_indices, weights=dist_np_array, minlength=num_seps)
        sums_sq = np.bincount(
            uniq_indices, weights=dist_np_array**2, minlength=num_seps)

        # print
        for i in range(num_seps):
            sys.stdout.write("\t".join([
                hom_name,
                str(int(uniq_seps[i])),
                str(nums_pairs[i]),
                str(sums[i] / nums_pairs[i]),
                str(math.sqrt(sums_sq[i] / nums_pairs[i]))
            ]) + "\n")

    return 0
def color(argv):
    """Compute one scalar color per particle of a 3DG structure.

    Exactly one color scheme must be chosen on the command line. By default
    the colors are written to standard output as tab-delimited lines
    (homolog, locus, color), optionally after smoothing ("-s"). With "-R",
    the colors are instead averaged in bins of normalized radial distance
    from the center of mass and one line per bin is written (radial
    distance, average color, number of particles).

    Args:
        argv: command-line arguments; argv[0] is the sub-command name and is
            skipped, the rest are options followed by the input 3DG file.

    Returns:
        0 on success, 1 on a usage or argument error.
    """
    # default parameters
    color_file_name = None
    color_mode = None
    max_distance = None
    smooth_distance = None
    max_separation = None
    radial_mode = False
    radial_min_num_particles = 10
    radial_missing_value = -1.0
    radial_max_r = 3.0
    radial_bin_r = 0.05

    # color schemes that the per-particle loop below knows how to compute
    supported_modes = ("c", "n", "l", "L", "h", "i", "I", "d", "r", "C", "D")

    # display parameters
    disp_num_particles = 1000

    # read arguments
    try:
        opts, args = getopt.getopt(
            argv[1:], "c:n:l:m:L:i:s:S:hd:r:I:CD:R",
            ["min-num=", "missing=", "max-r=", "bin-size="])
    except getopt.GetoptError:
        sys.stderr.write("[E::" + __name__ + "] unknown command\n")
        return 1
    if len(args) == 0:
        sys.stderr.write("Usage: dip-c color [options] <in.3dg>\n")
        sys.stderr.write("Options:\n")
        sys.stderr.write(
            " -c <color.txt> color by a list of locus-color pairs (tab-delimited: chr, locus, color)\n"
        )
        sys.stderr.write(
            " -n <chr.txt> color by chromosome name (one chromosome per line)\n"
        )
        sys.stderr.write(
            " -l <chr.len> color by locus divided by chromosome length (tab-delimited: chr, len)\n"
        )
        sys.stderr.write(
            " -L <chr.cen> color by arm locus divided by arm length (tab-delimited: chr, len, center of centromere)\n"
        )
        sys.stderr.write(
            " -h color by distance to homologous locus\n\n")
        sys.stderr.write(
            " -i FLOAT color by percentage of intra-homologous neighbors within a given distance\n"
        )
        sys.stderr.write(
            " -I FLOAT color by number of intra-homologous neighbors within a given distance\n"
        )
        sys.stderr.write(
            " -S INT (with \"-i\" or \"-I\") max separation (bp) for intra-homologous neighbors\n\n"
        )
        sys.stderr.write(
            " -d FLOAT color by homolog diversity within a given distance\n"
        )
        sys.stderr.write(
            " -r FLOAT color by homolog richness within a given distance\n\n"
        )
        sys.stderr.write(
            " -C color by distance to the nuclear center of mass\n"
        )
        sys.stderr.write(
            " -D <in.leg> color by distance to a given locus (only the first line of the LEG file will be used)\n\n"
        )
        sys.stderr.write(
            " -s FLOAT smooth color by averaging over a ball\n\n")
        sys.stderr.write(
            " -R special: output average color for different radial distances (normalized to 1.0)\n"
        )
        sys.stderr.write(
            " --min-num=INT (with \"-R\") min number of particles per bin ["
            + str(radial_min_num_particles) + "]\n")
        sys.stderr.write(
            " --missing=FLOAT (with \"-R\") output value when \"--min-num\" is not met ["
            + str(radial_missing_value) + "]\n")
        sys.stderr.write(
            " --max-r=FLOAT (with \"-R\") max radial distance [" +
            str(radial_max_r) + "]\n")
        sys.stderr.write(
            " --bin-size=FLOAT (with \"-R\") bin size of radial distances ["
            + str(radial_bin_r) + "]\n\n")
        sys.stderr.write("Output:\n")
        sys.stderr.write(" tab-delimited: homolog, locus, color\n")
        sys.stderr.write(
            " (with \"-R\") tab-delimited: radial distance, average color, #particles\n"
        )
        return 1

    num_color_schemes = 0
    for o, a in opts:
        if o in ("-i", "-I", "-d", "-r"):
            # neighborhood-based schemes take the distance as their argument
            num_color_schemes += 1
            color_mode = o[1:]
            max_distance = float(a)
        elif o == "-s":
            smooth_distance = float(a)
        elif o == "-S":
            max_separation = int(a)
        elif o == "--min-num":
            radial_min_num_particles = int(a)
        elif o == "--missing":
            radial_missing_value = float(a)
        elif o == "--max-r":
            radial_max_r = float(a)
        elif o == "--bin-size":
            radial_bin_r = float(a)
        elif o == "-R":
            radial_mode = True
        else:
            # every other option selects a scheme, possibly with an input file
            num_color_schemes += 1
            color_mode = o[1:]
            if a != "":
                color_file_name = a
    # "-S" is meaningful for both "-i" and "-I" (both pass max_separation on)
    if max_separation is not None and color_mode not in ("i", "I"):
        sys.stderr.write("[E::" + __name__ +
                         "] \"-S\" must be used with \"-i\" or \"-I\"\n")
        return 1
    if num_color_schemes != 1:
        sys.stderr.write("[E::" + __name__ +
                         "] exactly one color scheme is needed\n")
        return 1
    if color_mode not in supported_modes:
        # e.g. "-m", which the option string accepts but no scheme implements
        sys.stderr.write("[E::" + __name__ + "] unsupported color scheme \"-" +
                         color_mode + "\"\n")
        return 1

    # read 3DG file (closed as soon as it has been parsed)
    with open(args[0], "rb") as g3d_file:
        g3d_data = file_to_g3d_data(g3d_file)
    g3d_data.sort_g3d_particles()
    g3d_resolution = g3d_data.resolution()
    sys.stderr.write(
        "[M::" + __name__ + "] read a 3D structure with " +
        str(g3d_data.num_g3d_particles()) + " particles at " +
        ("N.A." if g3d_resolution is None else str(g3d_resolution)) +
        " bp resolution\n")

    # open color file
    color_file = None
    if color_file_name is not None:
        color_file = open(color_file_name, "rb")

    # prepare
    try:
        if color_mode == "c":
            ref_name_ref_locus_colors = {}
            for color_file_line in color_file:
                ref_name, ref_locus, color = color_file_line.strip().split("\t")
                ref_name_ref_locus_colors[(ref_name,
                                           int(ref_locus))] = float(color)
        elif color_mode == "n":
            # color is the 1-based line number of the chromosome name
            ref_name_colors = {}
            color_counter = 0
            for color_file_line in color_file:
                color_counter += 1
                ref_name_colors[color_file_line.strip()] = color_counter
        elif color_mode == "l":
            ref_lens = {}
            for color_file_line in color_file:
                ref_name, ref_len = color_file_line.strip().split("\t")
                ref_lens[ref_name] = int(ref_len)
        elif color_mode == "L":
            ref_lens = {}
            ref_cens = {}
            for color_file_line in color_file:
                ref_name, ref_len, ref_cen = color_file_line.strip().split("\t")
                ref_lens[ref_name] = int(ref_len)
                ref_cens[ref_name] = int(ref_cen)
        elif color_mode in ("i", "I", "d", "r"):
            g3d_data.prepare_nearby()
        elif color_mode == "C":
            hom_names, loci_np_array, position_np_array = g3d_data.to_np_arrays()
            ref_pos = np.mean(position_np_array, axis=0)
            sys.stderr.write("[M::" + __name__ +
                             "] reference point (center of mass) is at (" +
                             ", ".join(map(str, ref_pos)) + ")\n")
        elif color_mode == "D":
            # find the reference point from the first line of the LEG file
            ref_leg = string_to_leg(color_file.readline().strip())
            g3d_data.prepare_interpolate()
            is_out, ref_pos = g3d_data.interpolate_leg(ref_leg)
            if ref_pos is None:
                # without a position there is nothing to measure distances to
                sys.stderr.write(
                    "[E::" + __name__ +
                    "] cannot interpolate the position of the reference leg\n")
                return 1
            sys.stderr.write("[M::" + __name__ + "] reference point (" +
                             ref_leg.to_string() + ") is at (" +
                             ", ".join(map(str, ref_pos)) + ")\n")
    finally:
        if color_file is not None:
            color_file.close()

    # calculate colors for each particle; particles without a color are skipped
    color_data = {}
    atom_id = 0
    for g3d_particle in g3d_data.get_g3d_particles():
        atom_id += 1
        if atom_id % disp_num_particles == 0:
            sys.stderr.write(
                "[M::" + __name__ + "] analyzed " + str(atom_id) +
                " particles (" +
                str(round(100.0 * atom_id / g3d_data.num_g3d_particles(), 2)) +
                "%)\n")

        # color
        if color_mode == "c":
            try:
                color = ref_name_ref_locus_colors[(
                    g3d_particle.get_ref_name(), g3d_particle.get_ref_locus())]
            except KeyError:
                continue
        elif color_mode == "n":
            try:
                color = ref_name_colors[g3d_particle.get_ref_name()]
            except KeyError:
                continue
        elif color_mode == "l":
            try:
                color = float(g3d_particle.get_ref_locus()) / ref_lens[
                    g3d_particle.get_ref_name()]
            except KeyError:
                continue
        elif color_mode == "L":
            try:
                ref_name = g3d_particle.get_ref_name()
                ref_cen = ref_cens[ref_name]
                arm_locus = g3d_particle.get_ref_locus() - ref_cen
                # arm length is measured from the centromere to the arm's end
                if arm_locus > 0:
                    arm_len = ref_lens[ref_name] - ref_cen
                else:
                    arm_len = ref_cen
                color = float(abs(arm_locus)) / arm_len
            except KeyError:
                continue
        elif color_mode == "i":
            color = intra_hom_fraction(
                g3d_particle,
                g3d_data.get_g3d_particles_near(g3d_particle.get_position(),
                                                max_distance), max_separation)
            if color is None:
                continue
        elif color_mode == "I":
            color = intra_hom_count(
                g3d_particle,
                g3d_data.get_g3d_particles_near(g3d_particle.get_position(),
                                                max_distance), max_separation)
        elif color_mode == "h":
            homologous_g3d_particle = g3d_data.get_g3d_particle_from_hom_name_ref_locus(
                homologous_hom_name(g3d_particle.get_hom_name()),
                g3d_particle.get_ref_locus())
            if homologous_g3d_particle is None:
                continue
            color = math.sqrt(
                (g3d_particle.get_x() - homologous_g3d_particle.get_x())**2 +
                (g3d_particle.get_y() - homologous_g3d_particle.get_y())**2 +
                (g3d_particle.get_z() - homologous_g3d_particle.get_z())**2)
        elif color_mode == "d":
            color = hom_diversity(
                g3d_data.get_g3d_particles_near(g3d_particle.get_position(),
                                                max_distance))
        elif color_mode == "r":
            color = hom_richness(
                g3d_data.get_g3d_particles_near(g3d_particle.get_position(),
                                                max_distance))
        elif color_mode == "C" or color_mode == "D":
            color = math.sqrt((g3d_particle.get_x() - ref_pos[0])**2 +
                              (g3d_particle.get_y() - ref_pos[1])**2 +
                              (g3d_particle.get_z() - ref_pos[2])**2)
        color_data[g3d_particle.get_hom_name(),
                   g3d_particle.get_ref_locus()] = color

    # smoothing
    if smooth_distance is not None:
        g3d_data.prepare_nearby()
        smooth_color_data = {}
        atom_id = 0
        for g3d_particle in g3d_data.get_g3d_particles():
            atom_id += 1
            if atom_id % disp_num_particles == 0:
                sys.stderr.write(
                    "[M::" + __name__ + "] smoothed " + str(atom_id) +
                    " particles (" + str(
                        round(100.0 * atom_id / g3d_data.num_g3d_particles(),
                              2)) + "%)\n")
            color = smooth_color(
                g3d_particle,
                g3d_data.get_g3d_particles_near(g3d_particle.get_position(),
                                                smooth_distance), color_data)
            if color is not None:
                smooth_color_data[g3d_particle.get_hom_name(),
                                  g3d_particle.get_ref_locus()] = color
        color_data = smooth_color_data

    # radial
    if radial_mode:
        num_radial_bins = int(radial_max_r / radial_bin_r) + 1
        radial_color_sums = [0.0] * num_radial_bins
        radial_color_nums = [0] * num_radial_bins

        # calculate center of mass, and normalization factor
        hom_names, loci_np_array, position_np_array = g3d_data.to_np_arrays()
        ref_pos = np.mean(position_np_array, axis=0)
        mean_radial = np.mean(
            np.sum((position_np_array - ref_pos)**2, axis=-1)**0.5, axis=0)
        sys.stderr.write("[M::" + __name__ +
                         "] radial mode: average radial distance = " +
                         str(mean_radial) +
                         ", which will be normalize to 1.0\n")

        # examine each particle; restart the progress counter so that the
        # reported percentage does not continue from the previous pass
        atom_id = 0
        for g3d_particle in g3d_data.get_g3d_particles():
            atom_id += 1
            if atom_id % disp_num_particles == 0:
                sys.stderr.write(
                    "[M::" + __name__ + "] radial mode for " + str(atom_id) +
                    " particles (" + str(
                        round(100.0 * atom_id / g3d_data.num_g3d_particles(),
                              2)) + "%)\n")
            particle_key = (g3d_particle.get_hom_name(),
                            g3d_particle.get_ref_locus())
            if particle_key not in color_data:
                continue
            color = color_data[particle_key]
            radial = math.sqrt(
                (g3d_particle.get_x() - ref_pos[0])**2 +
                (g3d_particle.get_y() - ref_pos[1])**2 +
                (g3d_particle.get_z() - ref_pos[2])**2) / mean_radial
            # bins are centered on multiples of the bin size
            radial_bin_id = int(radial / radial_bin_r + 0.5)
            if radial_bin_id >= num_radial_bins:
                continue  # out of bound, skip
            radial_color_sums[radial_bin_id] += color
            radial_color_nums[radial_bin_id] += 1

        # output
        sys.stderr.write("[M::" + __name__ + "] writing radial mode output\n")
        for radial_bin_id in range(num_radial_bins):
            if radial_color_nums[radial_bin_id] < radial_min_num_particles:
                output_value = radial_missing_value
            else:
                output_value = radial_color_sums[
                    radial_bin_id] / radial_color_nums[radial_bin_id]
            sys.stdout.write("\t".join([
                str(radial_bin_id * radial_bin_r),
                str(output_value),
                str(radial_color_nums[radial_bin_id])
            ]) + "\n")
        return 0

    # output
    sys.stderr.write(
        "[M::" + __name__ + "] writing " + str(len(color_data)) + " colors (" +
        str(round(100.0 * len(color_data) / g3d_data.num_g3d_particles(), 2)) +
        "%)\n")
    for hom_name, ref_locus in sorted(color_data.keys()):
        sys.stdout.write("\t".join(
            [hom_name,
             str(ref_locus),
             str(color_data[(hom_name, ref_locus)])]) + "\n")

    return 0
def con3(argv):
    """Entry point of the ``con3`` sub-command.

    ``argv[1:]`` is parsed with getopt (options ``-d``, ``-m``, ``-b``,
    ``-H``, ``-i``); the first positional argument is the input 3DG file.

    * Without ``-m``: contacts are generated from the structure with
      ``g3d_data_to_con_data`` and written to stdout.
    * With ``-m``: a binned matrix is built with ``g3d_data_to_matrix`` and
      written to stdout as tab-delimited integers.
    * With ``-m -i``: only the bin info lines are written; the 3DG file is
      not read.  ``-i`` without ``-m`` produces no output.

    Returns 0 on success and 1 on a usage or option error (a message is
    written to stderr in that case).
    """
    # default parameters
    max_distance = 3.0
    matrix_mode = False
    chr_len_file_name = None
    matrix_bin_size = 1000000
    merge_haplotypes = False
    info_mode = False

    # read arguments
    try:
        opts, args = getopt.getopt(argv[1:], "d:m:b:Hi")
    except getopt.GetoptError:
        sys.stderr.write("[E::" + __name__ + "] unknown command\n")
        return 1
    if len(args) == 0:
        sys.stderr.write("Usage: dip-c con3 [options] <in.3dg>\n")
        sys.stderr.write("Options:\n")
        sys.stderr.write(
            " -d FLOAT max distance for generating a contact [" +
            str(max_distance) + "]\n")
        sys.stderr.write(
            " -m <chr.len> output a matrix of binned counts based on chromosome lengths (tab-delimited: chr, len)\n"
        )
        sys.stderr.write(
            " -b INT bin size (bp) for \"-m\" (bins are centered around multiples of bin size) ["
            + str(matrix_bin_size) + "]\n")
        sys.stderr.write(
            " -H merge the two haplotypes (for \"-m\")\n")
        sys.stderr.write(
            " -i output bin info (tab-delimited: homolog or chr if \"-H\", bin center) instead (for \"-m\")\n"
        )
        return 1
    try:
        for o, a in opts:
            if o == "-d":
                max_distance = float(a)
            elif o == "-m":
                matrix_mode = True
                chr_len_file_name = a
            elif o == "-b":
                matrix_bin_size = int(a)
            elif o == "-H":
                merge_haplotypes = True
            elif o == "-i":
                info_mode = True
    except ValueError:
        # a non-numeric -d / -b value is a user error, not a crash
        sys.stderr.write("[E::" + __name__ + "] invalid value for " + o +
                         ": " + a + "\n")
        return 1
    if matrix_bin_size <= 0:
        # the bin size is used as a divisor below
        sys.stderr.write("[E::" + __name__ +
                         "] -b must be a positive integer\n")
        return 1

    # read 3DG file (not needed when only bin info is requested)
    g3d_data = None
    if not info_mode:
        with open(args[0], "rb") as g3d_file:
            g3d_data = file_to_g3d_data(g3d_file)
        g3d_data.sort_g3d_particles()
        g3d_resolution = g3d_data.resolution()
        sys.stderr.write(
            "[M::" + __name__ + "] read a 3D structure with " +
            str(g3d_data.num_g3d_particles()) + " particles at " +
            ("N.A." if g3d_resolution is None else str(g3d_resolution)) +
            " bp resolution\n")

    if matrix_mode:
        # read chromosome lengths; each homolog gets a contiguous run of
        # bins starting at its offset
        hom_offsets = {}
        matrix_size = 0
        if merge_haplotypes:
            haplotypes = [Haplotypes.paternal]
        else:
            haplotypes = [Haplotypes.paternal, Haplotypes.maternal]
        with open(chr_len_file_name, "rb") as chr_len_file:
            for chr_len_file_line in chr_len_file:
                chr_len_file_line = chr_len_file_line.strip()
                if not chr_len_file_line:
                    continue  # tolerate blank (e.g. trailing) lines
                ref_name, ref_len = chr_len_file_line.split("\t")
                ref_len = int(ref_len)
                for haplotype in haplotypes:
                    hom_name = ref_name_haplotype_to_hom_name(
                        (ref_name, haplotype))
                    hom_bin_len = int(
                        round(float(ref_len) / matrix_bin_size)) + 1
                    hom_offsets[hom_name] = matrix_size
                    matrix_size += hom_bin_len
                    if info_mode:
                        bin_label = ref_name if merge_haplotypes else hom_name
                        for bin_id in range(hom_bin_len):
                            sys.stdout.write("\t".join(
                                [bin_label,
                                 str(bin_id * matrix_bin_size)]) + "\n")

        # generate matrix
        if not info_mode:
            matrix_data = g3d_data_to_matrix(g3d_data, max_distance,
                                             hom_offsets, matrix_bin_size,
                                             matrix_size, merge_haplotypes)
            np.savetxt(sys.stdout, matrix_data, fmt='%i', delimiter='\t')
    elif not info_mode:
        con_data = g3d_data_to_con_data(g3d_data, max_distance)
        con_data.sort_cons()
        num_cons = con_data.num_cons()
        if num_cons > 0:
            intra_percent = round(
                100.0 * con_data.num_intra_chr() / num_cons, 2)
            phased_percent = round(
                100.0 * con_data.num_phased_legs() / num_cons / 2, 2)
        else:
            # no contacts: report 0% instead of dividing by zero
            intra_percent = phased_percent = 0.0
        sys.stderr.write("[M::" + __name__ + "] writing output for " +
                         str(num_cons) + " contacts (" + str(intra_percent) +
                         "% intra-chromosomal, " + str(phased_percent) +
                         "% legs phased)\n")
        sys.stdout.write(con_data.to_string() + "\n")
    return 0
def pd(argv):
    """Entry point of the ``pd`` sub-command: pairwise distances of legs.

    ``argv[1:]`` is parsed with getopt (options ``-1`` and ``-2``); the first
    positional argument is the input 3DG file.  Every leg of the two LEG
    files is interpolated on the structure, and the matrix of distances
    between the positions of file 1 (rows) and file 2 (columns), computed by
    ``distance.cdist``, is written tab-delimited to stdout.  Legs whose
    position cannot be interpolated are represented by NaN coordinates.
    When ``-2`` is omitted, file 1 is compared against itself.

    Returns 0 on success and 1 on a usage or option error (a message is
    written to stderr in that case).
    """
    # default parameters
    leg_file_1_name = None
    leg_file_2_name = None

    # read arguments
    try:
        opts, args = getopt.getopt(argv[1:], "1:2:")
    except getopt.GetoptError:
        sys.stderr.write("[E::" + __name__ + "] unknown command\n")
        return 1
    if len(args) == 0:
        sys.stderr.write(
            "Usage: dip-c pd [options] -1 <in1.leg> [-2 <in2.leg>] <in.3dg>\n")
        sys.stderr.write("Options:\n")
        sys.stderr.write(" -1 <in1.leg> LEG file (required)\n")
        sys.stderr.write(" -2 <in2.leg> LEG file [<in1.leg>]\n")
        return 1
    for o, a in opts:
        if o == "-1":
            leg_file_1_name = a
        elif o == "-2":
            leg_file_2_name = a
    if leg_file_1_name is None:
        sys.stderr.write("[E::" + __name__ + "] -1 is required\n")
        return 1

    # read 3DG file
    with open(args[0], "rb") as g3d_file:
        g3d_data = file_to_g3d_data(g3d_file)
    g3d_data.sort_g3d_particles()
    g3d_resolution = g3d_data.resolution()
    sys.stderr.write(
        "[M::" + __name__ + "] read a 3D structure with " +
        str(g3d_data.num_g3d_particles()) + " particles at " +
        ("N.A." if g3d_resolution is None else str(g3d_resolution)) +
        " bp resolution\n")
    g3d_data.prepare_interpolate()

    # convert LEG files to 3D positions
    positions_1 = _pd_leg_positions(g3d_data, leg_file_1_name)
    if leg_file_2_name is None:
        # same input on both axes: no need to read and interpolate it twice
        positions_2 = positions_1
    else:
        positions_2 = _pd_leg_positions(g3d_data, leg_file_2_name)

    # calculate pairwise distances
    distances = distance.cdist(positions_1, positions_2)
    np.savetxt(sys.stdout, distances, delimiter='\t')
    return 0


def _pd_leg_positions(g3d_data, leg_file_name):
    """Interpolate each line of a LEG file; return the positions as rows.

    The result is a float array reshaped to three columns (zero rows for an
    empty file).  Rows are collected in a list and converted once, instead of
    re-copying the whole array with ``np.vstack`` for every line.
    """
    rows = []
    with open(leg_file_name, "rb") as leg_file:
        for leg_file_line in leg_file:
            _, position = g3d_data.interpolate_leg(
                string_to_leg(leg_file_line.strip()))
            if position is None:
                position = [np.nan, np.nan, np.nan]
            rows.append(position)
    return np.array(rows, dtype=float).reshape(-1, 3)
def clean3(argv):
    """Entry point of the ``clean3`` sub-command.

    ``argv[1:]`` is parsed with getopt (options ``-c``, ``-d``, ``-q``); the
    first positional argument is the input 3DG file.  The legs of the CON
    file given with ``-c`` (gzip-compressed if its name ends in ``.gz``) are
    counted per particle with ``g3d_data.leg_counts``; the ``-q`` quantile of
    those counts, rounded up, becomes the minimum leg count passed to
    ``g3d_data.clean_leg_poor``.  The resulting structure is written to
    stdout; a quantile table and progress messages go to stderr.

    Returns 0 on success and 1 on a usage or option error (a message is
    written to stderr in that case).
    """
    # default parameters
    con_file_name = None
    max_clean_distance = 500000
    clean_quantile = 0.06

    # display parameters
    display_quantiles = np.arange(0.0, 1.01, 0.01)

    # read arguments
    try:
        opts, args = getopt.getopt(argv[1:], "c:d:q:")
    except getopt.GetoptError:
        sys.stderr.write("[E::" + __name__ + "] unknown command\n")
        return 1
    if len(args) == 0:
        sys.stderr.write(
            "Usage: dip-c clean3 [options] -c <in.con> <in.3dg>\n")
        sys.stderr.write("Options:\n")
        sys.stderr.write(
            " -c <in.con> contact file for cleaning (required)\n")
        sys.stderr.write(
            " -d INT max distance (bp) from a contact leg to a 3D genome particle ["
            + str(max_clean_distance) + "]\n")
        sys.stderr.write(" -q FLOAT quantile of particles to remove [" +
                         str(clean_quantile) + "]\n")
        return 1
    try:
        for o, a in opts:
            if o == "-c":
                con_file_name = a
            elif o == "-d":
                max_clean_distance = int(a)
            elif o == "-q":
                clean_quantile = float(a)
    except ValueError:
        # a non-numeric -d / -q value is a user error, not a crash
        sys.stderr.write("[E::" + __name__ + "] invalid value for " + o +
                         ": " + a + "\n")
        return 1
    if not 0.0 <= clean_quantile <= 1.0:
        # np.percentile below only accepts 0..100; fail before reading input
        sys.stderr.write("[E::" + __name__ +
                         "] -q must be between 0 and 1\n")
        return 1
    if con_file_name is None:
        sys.stderr.write("[E::" + __name__ + "] -c is required\n")
        return 1

    # read 3DG file
    with open(args[0], "rb") as g3d_file:
        g3d_data = file_to_g3d_data(g3d_file)
    g3d_data.sort_g3d_particles()
    g3d_resolution = g3d_data.resolution()
    sys.stderr.write(
        "[M::" + __name__ + "] read a 3D structure with " +
        str(g3d_data.num_g3d_particles()) + " particles at " +
        ("N.A." if g3d_resolution is None else str(g3d_resolution)) +
        " bp resolution\n")

    # read legs from CON file
    if con_file_name.endswith(".gz"):
        con_file = gzip.open(con_file_name, "rb")
    else:
        con_file = open(con_file_name, "rb")
    try:
        con_data = file_to_con_data(con_file)
    finally:
        con_file.close()
    num_cons = con_data.num_cons()
    if num_cons > 0:
        intra_percent = round(100.0 * con_data.num_intra_chr() / num_cons, 2)
        phased_percent = round(
            100.0 * con_data.num_phased_legs() / num_cons / 2, 2)
    else:
        # no contacts: report 0% instead of dividing by zero
        intra_percent = phased_percent = 0.0
    sys.stderr.write("[M::" + __name__ + "] read " + str(num_cons) +
                     " contacts (" + str(intra_percent) +
                     "% intra-chromosomal, " + str(phased_percent) +
                     "% legs phased)\n")
    leg_data = LegData()
    leg_data.add_con_data(con_data)
    leg_data.sort_phased_legs()
    sys.stderr.write("[M::" + __name__ + "] sorted " +
                     str(leg_data.num_legs()) + " legs\n")

    # find cut-off
    leg_counts = g3d_data.leg_counts(leg_data, max_clean_distance)
    sys.stderr.write("[M::" + __name__ + "] statistics:\n")
    sys.stderr.write("quantile\t#legs\n")
    for display_quantile in display_quantiles:
        sys.stderr.write(
            str(display_quantile) + "\t" + str(
                int(
                    round(
                        np.percentile(leg_counts, display_quantile * 100.0),
                        0))) + "\n")
    min_leg_count = int(
        math.ceil(np.percentile(leg_counts, clean_quantile * 100.0)))
    sys.stderr.write("[M::" + __name__ + "] min leg count: " +
                     str(min_leg_count) + "\n")

    # clean
    g3d_data.clean_leg_poor(leg_data, max_clean_distance, min_leg_count)
    # NOTE(review): the result of this call was never used; the call itself
    # is kept in case leg_counts() has side effects - confirm and drop it.
    g3d_data.leg_counts(leg_data, max_clean_distance)
    g3d_data.sort_g3d_particles()
    sys.stderr.write("[M::" + __name__ + "] writing output for " +
                     str(g3d_data.num_g3d_particles()) + " particles\n")
    sys.stdout.write(g3d_data.to_string() + "\n")
    return 0