def main():
    """Wrapper for HiC-Spector: load two contact maps, symmetrize them, and
    write the HiC-Spector reproducibility score to --out.

    get_reproducibility() prints its result to stdout, so stdout is
    temporarily redirected into the output file.
    """
    parser = argparse.ArgumentParser(description='Wrapper for HiC-Spector')
    parser.add_argument('--m1')
    parser.add_argument('--m2')
    parser.add_argument('--node_file')
    parser.add_argument('--num_evec', type=int, default=20)
    parser.add_argument('--out')
    args = parser.parse_args()

    # 'NA' disables blacklisting for this wrapper.
    nodes, nodes_idx, blacklist_nodes = processing.read_nodes_from_bed(
        args.node_file, 'NA')
    m1_csr = processing.construct_csr_matrix_from_data_and_nodes(
        args.m1, nodes, blacklist_nodes, False)
    m2_csr = processing.construct_csr_matrix_from_data_and_nodes(
        args.m2, nodes, blacklist_nodes, False)

    m1 = _symmetrize_upper(m1_csr)
    m2 = _symmetrize_upper(m2_csr)

    # FIX: the original assigned sys.stdout = open(...) and never restored
    # or closed it, leaking the handle and hijacking stdout for the rest of
    # the process.  Save/restore stdout and close the file deterministically.
    out_fh = open(args.out, 'w')
    saved_stdout = sys.stdout
    sys.stdout = out_fh
    try:
        get_reproducibility(m1, m2, args.num_evec)
    finally:
        sys.stdout = saved_stdout
        out_fh.close()


def _symmetrize_upper(m_upper):
    """Mirror an upper-triangular CSR contact matrix into a full symmetric
    matrix, counting the diagonal only once (same operation sequence as the
    original inline code)."""
    m_lower = m_upper.transpose()
    m_lower.setdiag(0)  # avoid double-counting the diagonal in the sum
    return m_upper + m_lower
def main():
    """Compute random-walk (RW) transformations of a 3D-genome contact map
    and write one matrix per random-walk step t in [tmin, tmax]."""
    parser = argparse.ArgumentParser(
        description='Compute RW transformation of 3D data')
    parser.add_argument('--datatype', default='hic')
    parser.add_argument('--m', type=str)
    parser.add_argument('--matrix_format', type=str, default='n1n2val',
                        help='c1n1c2n2val')
    parser.add_argument('--node_file', type=str)
    parser.add_argument('--remove_diagonal', action='store_true')
    parser.add_argument('--mname', type=str)
    parser.add_argument('--outdir', type=str, default='OUT')
    parser.add_argument('--outpref', type=str, default='outpref')
    parser.add_argument('--norm', type=str, default='uniform')
    parser.add_argument('--method', type=str, default='RandomWalks')
    parser.add_argument('--tmin', type=int, default=1)
    parser.add_argument('--tmax', type=int, default=3)
    parser.add_argument('--transition', action='store_true')
    parser.add_argument('--blacklist', default='NA')
    args = parser.parse_args()

    # FIX: create the output directory without spawning a shell --
    # os.system('mkdir -p ' + outdir) is injection-prone and non-portable.
    if not os.path.isdir(args.outdir):
        os.makedirs(args.outdir)

    nodes, nodes_idx, blacklist_nodes = processing.read_nodes_from_bed(
        args.node_file, args.blacklist)
    m = processing.construct_csr_matrix_from_data_and_nodes(
        args.m, nodes, blacklist_nodes, args.remove_diagonal)
    m_norm = data_operations.process_matrix(m, args.norm)

    # Mirror the upper-triangular matrix into a full symmetric matrix,
    # counting the diagonal only once.
    mup = m_norm
    mdown = mup.transpose()
    mdown.setdiag(0)
    m_full = mup + mdown
    if args.transition:
        m_full = to_transition(m_full)

    # Successive matrix powers give the t-step random-walk matrices.
    # FIX: the original initialized `rw` only when t == 1, so any
    # --tmin > 1 raised a NameError.  Always build the power sequence from
    # t = 1 and write results only for t >= tmin (identical output for the
    # default tmin of 1).
    outname = args.outdir + '/' + args.outpref
    rw = None
    for t in range(1, args.tmax + 1):
        if t == 1:
            rw = copy.deepcopy(m_full)
        else:
            rw = rw.dot(m_full)
        if t >= args.tmin:
            processing.write_matrix_from_csr_and_nodes(
                rw, nodes_idx, outname + '.rw_t' + str(t) + '.gz')
def main():
    """GenomeDISCO: compute the reproducibility of two 3D-genome contact maps.

    Loads maps m1 and m2, optionally subsamples them to a common sequencing
    depth, normalizes them, (unless --concise_analysis) compares their
    distance-dependence curves, scores them with random walks, and writes:
      <outpref>.<m1name>.vs.<m2name>.scores.txt        overall score
      <outpref>.<m1name>.vs.<m2name>.scoresByStep.txt  score per RW step
      <outpref>.<m1name>.vs.<m2name>.datastats.txt     depth statistics
    """
    parser = argparse.ArgumentParser(
        description='Compute reproducibility of 3D genome data')
    parser.add_argument('--datatype', default='hic')
    parser.add_argument(
        '--m1', type=str, default=
        '/srv/gsfs0/projects/kundaje/users/oursu/3d/LA/merged_nodups/processed_data/HIC014.res40000.byChr.chr21.gz'
    )
    parser.add_argument(
        '--m2', type=str, default=
        '/srv/gsfs0/projects/kundaje/users/oursu/3d/LA/merged_nodups/processed_data/HIC001.res40000.byChr.chr21.gz'
    )
    parser.add_argument('--matrix_format', type=str, default='n1n2val',
                        help='c1n1c2n2val')
    parser.add_argument(
        '--node_file', type=str, default=
        '/srv/gsfs0/projects/kundaje/users/oursu/3d/LA/merged_nodups/nodes/Nodes.w40000.chr21.gz'
    )
    parser.add_argument('--remove_diagonal', action='store_true')
    parser.add_argument('--m1name', type=str, default='HIC014')
    parser.add_argument('--m2name', type=str, default='HIC001')
    parser.add_argument('--outdir', type=str, default='OUT')
    parser.add_argument('--outpref', type=str, default='outpref')
    parser.add_argument('--m_subsample', type=str, default='lowest')
    parser.add_argument(
        '--concise_analysis', action='store_true', help=
        'Add this flag to only output the reproducibility score, and not perform the distance dependence analyses.'
    )
    parser.add_argument('--norm', type=str, default='uniform')
    parser.add_argument('--method', type=str, default='RandomWalks')
    parser.add_argument('--tmin', type=int, default=1)
    parser.add_argument('--tmax', type=int, default=3)
    parser.add_argument('--approximation', type=int, default=40000)
    parser.add_argument('--transition', action='store_true')
    parser.add_argument('--blacklist', default='NA')
    args = parser.parse_args()

    # FIX: create the output directory without spawning a shell --
    # os.system('mkdir -p ' + outdir) is injection-prone and non-portable.
    if not os.path.isdir(args.outdir):
        os.makedirs(args.outdir)

    # All prints use the single-argument parenthesized form, which behaves
    # identically under Python 2 and Python 3.
    print("GenomeDISCO | " + strftime("%c") +
          " | Starting reproducibility analysis")

    nodes, nodes_idx, blacklist_nodes = processing.read_nodes_from_bed(
        args.node_file, args.blacklist)

    print("GenomeDISCO | " + strftime("%c") + " | Loading contact maps")
    m1 = processing.construct_csr_matrix_from_data_and_nodes(
        args.m1, nodes, blacklist_nodes, args.remove_diagonal)
    m2 = processing.construct_csr_matrix_from_data_and_nodes(
        args.m2, nodes, blacklist_nodes, args.remove_diagonal)

    # Sequencing depth (total contact count) per sample.
    stats = {}
    stats[args.m1name] = {}
    stats[args.m2name] = {}
    stats[args.m1name]['depth'] = m1.sum()
    stats[args.m2name]['depth'] = m2.sum()

    # Subsample both maps to a common depth so the comparison is not
    # confounded by sequencing depth.
    m1_subsample = copy.deepcopy(m1)
    m2_subsample = copy.deepcopy(m2)
    if args.m_subsample != 'NA':
        if args.m_subsample == 'lowest':
            # Target the shallower of the two datasets (ties pick m2).
            if stats[args.m1name]['depth'] >= stats[args.m2name]['depth']:
                m_subsample = copy.deepcopy(m2)
            if stats[args.m1name]['depth'] < stats[args.m2name]['depth']:
                m_subsample = copy.deepcopy(m1)
        else:
            # Subsample both maps to the depth of a third, user-supplied map.
            m_subsample = processing.construct_csr_matrix_from_data_and_nodes(
                args.m_subsample, nodes, blacklist_nodes, args.remove_diagonal)
        print("GenomeDISCO | " + strftime("%c") +
              " | Subsampling to the depth of " + args.m_subsample)
        print("GenomeDISCO | " + strftime("%c") +
              " | Subsampling depth = " + str(m_subsample.sum()))
        desired_depth = m_subsample.sum()
        if m1.sum() > desired_depth:
            m1_subsample = data_operations.subsample_to_depth(
                m1, desired_depth)
        if m2.sum() > desired_depth:
            m2_subsample = data_operations.subsample_to_depth(
                m2, desired_depth)
    stats[args.m1name]['subsampled_depth'] = m1_subsample.sum()
    stats[args.m2name]['subsampled_depth'] = m2_subsample.sum()

    print("GenomeDISCO | " + strftime("%c") +
          ' | Normalizing with ' + args.norm)
    m1_norm = data_operations.process_matrix(m1_subsample, args.norm)
    m2_norm = data_operations.process_matrix(m2_subsample, args.norm)

    if not args.concise_analysis:
        # Distance dependence analysis: compare the contact-probability vs
        # genomic-distance curves of the two (subsampled) maps.
        print("GenomeDISCO | " + strftime("%c") +
              " | Distance dependence analysis")
        if args.datatype == 'hic':
            m1dd = data_operations.get_distance_dep(m1_subsample)
            m2dd = data_operations.get_distance_dep(m2_subsample)
        if args.datatype == 'capturec':
            m1dd = data_operations.get_distance_dep_using_nodes_capturec(
                m1_subsample, nodes, nodes_idx, args.approximation)
            m2dd = data_operations.get_distance_dep_using_nodes_capturec(
                m2_subsample, nodes, nodes_idx, args.approximation)
        dd_diff = get_dd_diff(m1dd, m2dd)
        visualization.plot_dds([m1dd, m2dd], [args.m1name, args.m2name],
                               args.outdir + '/' + args.outpref + '.' +
                               args.m1name + '.vs.' + args.m2name +
                               '.distDep', args.approximation)

    print("GenomeDISCO | " + strftime("%c") +
          " | Computing reproducibility score")
    # NOTE(review): only 'RandomWalks' is implemented here; any other
    # --method leaves `score`/`scores` undefined and fails below.
    if args.method == 'RandomWalks':
        comparer = DiscoRandomWalks(args)
        reproducibility_text, score, scores = comparer.compute_reproducibility(
            m1_norm, m2_norm, args)

    # Overall score file.
    out = open(
        args.outdir + '/' + args.outpref + '.' + args.m1name + '.vs.' +
        args.m2name + '.scores.txt', 'w')
    out.write(args.m1name + '\t' + args.m2name + '\t' +
              str('{:.3f}'.format(score)) + '\n')
    out.close()

    # Per-random-walk-step scores ('NA' for steps below tmin).
    out = open(
        args.outdir + '/' + args.outpref + '.' + args.m1name + '.vs.' +
        args.m2name + '.scoresByStep.txt', 'w')
    t_strings = []
    score_strings = []
    t_counter = 0
    for t in range(1, (args.tmax + 1)):
        if t >= args.tmin:
            score_strings.append(str('{:.3f}'.format(scores[t_counter])))
            t_counter += 1
        else:
            score_strings.append('NA')
        t_strings.append(str(t))
    out.write('#m1' + '\t' + 'm2' + '\t' + '\t'.join(t_strings) + '\n')
    out.write(args.m1name + '\t' + args.m2name + '\t' +
              '\t'.join(score_strings) + '\n')
    out.close()

    # Depth / distance-dependence statistics.
    out = open(
        args.outdir + '/' + args.outpref + '.' + args.m1name + '.vs.' +
        args.m2name + '.datastats.txt', 'w')
    out.write('#m1name' + '\t' + 'm2name' + '\t' + 'SeqDepth.m1' + '\t' +
              'SeqDepth.m2' + '\t' + 'SubsampledSeqDepth.m1' + '\t' +
              'SubsampledSeqDepth.m2' + '\t' + 'DistDepDiff' + '\n')
    dd_value = 'NA'
    if not args.concise_analysis:
        dd_value = str('{:.10f}'.format(dd_diff))
    out.write(args.m1name + '\t' + args.m2name + '\t' +
              str(stats[args.m1name]['depth']) + '\t' +
              str(stats[args.m2name]['depth']) + '\t' +
              str(stats[args.m1name]['subsampled_depth']) + '\t' +
              str(stats[args.m2name]['subsampled_depth']) + '\t' +
              dd_value + '\n')
    out.close()

    print("GenomeDISCO | Differences by random walk step: " +
          '\t'.join(score_strings))
    print("GenomeDISCO | " + strftime("%c") + " | DONE")
def main():
    """Simulate Hi-C contact maps from real datasets.

    For every input matrix, sweeps edge noise x node noise x boundary noise
    x distance-dependence file, producing two replicates ('a' and 'b') per
    combination and writing one sampled matrix file per replicate.
    """
    parser = argparse.ArgumentParser(
        description='Simulate Hi-C data based on real datasets.')
    parser.add_argument(
        '--matrices', default=
        '/ifs/scratch/oursu/3d/paper/2017-06-08/LA/reproducibility/res40000/data/edges/HIC001/HIC001.chr21.gz'
    )
    parser.add_argument('--matrix_names', default='HIC001')
    parser.add_argument(
        '--nodes', default=
        '/ifs/scratch/oursu/3d/paper/2017-06-08/LA/reproducibility/res40000/data/nodes/nodes.chr21.gz'
    )
    parser.add_argument(
        '--distDepData', default=
        '/ifs/scratch/oursu/3d/paper/2017-06-08/LA/reproducibility/res40000/data/edges/HIC001/HIC001.chr21.gz,/ifs/scratch/oursu/3d/paper/2017-06-08/LA/reproducibility/res40000/data/edges/HIC002/HIC002.chr21.gz'
    )
    parser.add_argument('--edgenoise', default='0.0')
    parser.add_argument('--nodenoise', default='0.0')
    parser.add_argument('--boundarynoise', default='0')
    parser.add_argument('--depth', type=int, default=1000000)
    parser.add_argument('--outdir', default='/ifs/scratch/oursu/test/testmat')
    parser.add_argument('--resolution', type=int, default=40000)
    args = parser.parse_args()

    # Set up nodes.  NOTE(review): this variant unpacks 2 return values from
    # read_nodes_from_bed while the sibling script unpacks 3 -- confirm which
    # version of the helper this script is paired with.
    nodes, nodes_idx = processing.read_nodes_from_bed(args.nodes)

    # Simulate from each input matrix in turn.
    matrices = args.matrices.split(',')
    matrix_names = args.matrix_names.split(',')
    for m_idx in range(len(matrices)):
        mname = matrix_names[m_idx]
        mfile = matrices[m_idx]
        my_matrix_orig = read_in_data(mfile, nodes)
        for edgenoise in args.edgenoise.split(','):
            for nodenoise in args.nodenoise.split(','):
                for boundarynoise in args.boundarynoise.split(','):
                    ablist = ['a', 'b']
                    for ab_idx in range(len(ablist)):
                        ab = ablist[ab_idx]
                        # Apply boundary noise by shifting the dataset.
                        my_matrix = shift_dataset(
                            my_matrix_orig, int(boundarynoise))
                        ddfiles = args.distDepData.split(',')
                        # Leftover debug output; kept to preserve behavior.
                        print(my_matrix)
                        for ddfile_idx in range(len(ddfiles)):
                            ddfile = ddfiles[ddfile_idx]
                            dd = read_in_data(ddfile, nodes)
                            prob_matrix = get_probability_matrix(
                                my_matrix, dd, float(edgenoise),
                                float(nodenoise))
                            # Leftover debug output; kept to preserve behavior.
                            print(prob_matrix)
                            intro = args.outdir + '/Depth_' + str(
                                args.depth) + '.' + mname
                            sampled_matrix = sample_interactions(
                                prob_matrix, args.depth)
                            ftowrite = (intro + '.EN_' + str(edgenoise) +
                                        '.NN_' + str(nodenoise) +
                                        '.BN_' + str(boundarynoise) +
                                        '.' + ab +
                                        '.dd_' + str(ddfile_idx) + '.gz')
                            print(ftowrite)
                            write_matrix(sampled_matrix, ftowrite, args)
def main():
    """Simulate Hi-C contact maps from real datasets (seeded variant).

    Like the simpler simulator, but restricted to the [mini, maxi) node
    window and driven by explicit numpy RandomState seeds so that
    replicates 'a' and 'b' differ deterministically within one run.
    """
    parser = argparse.ArgumentParser(
        description='Simulate Hi-C data based on real datasets.')
    parser.add_argument(
        '--matrices', default=
        '/ifs/scratch/oursu/3d/paper/2017-06-08/LA/reproducibility/res40000/data/edges/HIC001/HIC001.chr21.gz'
    )
    parser.add_argument('--matrix_names', default='HIC001')
    parser.add_argument(
        '--nodes', default=
        '/ifs/scratch/oursu/3d/paper/2017-06-08/LA/reproducibility/res40000/data/nodes/nodes.chr21.gz'
    )
    parser.add_argument(
        '--distDepData', default=
        '/ifs/scratch/oursu/3d/paper/2017-06-08/LA/reproducibility/res40000/data/edges/HIC001/HIC001.chr21.gz,/ifs/scratch/oursu/3d/paper/2017-06-08/LA/reproducibility/res40000/data/edges/HIC002/HIC002.chr21.gz'
    )
    parser.add_argument('--edgenoise', default='0.0')
    parser.add_argument('--nodenoise', default='0.0')
    parser.add_argument('--boundarynoise', default='0')
    parser.add_argument('--depth', type=int, default=1000000)
    parser.add_argument('--outdir', default='/ifs/scratch/oursu/test/testmat')
    parser.add_argument('--resolution', type=int, default=40000)
    parser.add_argument('--mini', type=int, default=-1)
    parser.add_argument('--maxi', type=int, default=-1)
    args = parser.parse_args()

    # Set up nodes.
    nodes, nodes_idx, blacklisted_nodes = processing.read_nodes_from_bed(
        args.nodes)

    # Default the simulation window to the full node range when unset.
    if args.mini <= -1:
        args.mini = 0
    if args.maxi <= -1:
        args.maxi = len(nodes)  # FIX: len(nodes) avoids building keys() list

    # Simulate from each input matrix in turn.
    matrices = args.matrices.split(',')
    matrix_names = args.matrix_names.split(',')
    for m_idx in range(len(matrices)):
        mname = matrix_names[m_idx]
        mfile = matrices[m_idx]
        my_matrix_orig = read_in_data(mfile, nodes)
        for edgenoise in args.edgenoise.split(','):
            for nodenoise in args.nodenoise.split(','):
                for boundarynoise in args.boundarynoise.split(','):
                    # Apply boundary noise by shifting the dataset.
                    my_matrix = shift_dataset(my_matrix_orig,
                                              int(boundarynoise))
                    ddfiles = args.distDepData.split(',')
                    for ddfile_idx in range(len(ddfiles)):
                        ddfile = ddfiles[ddfile_idx]
                        dd = read_in_data(ddfile, nodes)
                        prob_matrix = get_probability_matrix(
                            my_matrix, dd, float(edgenoise),
                            float(nodenoise), args.mini, args.maxi,
                            np.random.RandomState(
                                hash('probability') % 10000))
                        ablist = ['a', 'b']
                        for ab_idx in range(len(ablist)):
                            ab = ablist[ab_idx]
                            # NOTE(review): str hash() is randomized per
                            # process on Python 3 (PYTHONHASHSEED), so these
                            # seeds are only stable within a single run --
                            # confirm whether cross-run reproducibility is
                            # required before porting.
                            if ab == 'a':
                                s = hash(mname + ddfile) % 10000
                            if ab == 'b':
                                s = hash(mname + ddfile) % 10000 + 101
                            intro = args.outdir + '/Depth_' + str(
                                args.depth) + '.' + mname
                            # Deep-copy so sampling cannot mutate the shared
                            # probability matrix between replicates.
                            sampled_matrix = sample_interactions(
                                copy.deepcopy(prob_matrix), args.depth,
                                np.random.RandomState(s))
                            ftowrite = (intro + '.EN_' + str(edgenoise) +
                                        '.NN_' + str(nodenoise) +
                                        '.BN_' + str(boundarynoise) +
                                        '.' + ab +
                                        '.dd_' + str(ddfile_idx) + '.gz')
                            print(ftowrite)
                            write_matrix(sampled_matrix, ftowrite, args)