def main():
    """Command-line wrapper that runs HiC-Spector on two contact maps.

    Reads the node file (``--node_file``) and the two contact-map files
    (``--m1`` and ``--m2``), adds each loaded matrix to its own transpose, and
    calls ``get_reproducibility`` on the two results with ``--num_evec``.
    While ``get_reproducibility`` runs, ``sys.stdout`` is redirected to the
    file named by ``--out``; the redirection is undone and the file is closed
    afterwards, even if the call raises.
    """
    parser = argparse.ArgumentParser(description='Wrapper for HiC-Spector')
    parser.add_argument('--m1')
    parser.add_argument('--m2')
    parser.add_argument('--node_file')
    parser.add_argument('--num_evec', type=int, default=20)
    parser.add_argument('--out')
    args = parser.parse_args()

    nodes, nodes_idx, blacklist_nodes = processing.read_nodes_from_bed(
        args.node_file, 'NA')

    def load_full_matrix(path):
        # Load one contact map, then add its transpose with the transpose's
        # diagonal set to zero before the addition.
        # NOTE(review): depending on the SciPy version, transpose() may share
        # its data buffer with the matrix it was called on, in which case
        # setdiag(0) could also clear the diagonal of `upper` -- confirm.
        upper = processing.construct_csr_matrix_from_data_and_nodes(
            path, nodes, blacklist_nodes, False)
        lower = upper.transpose()
        lower.setdiag(0)
        return upper + lower

    m1 = load_full_matrix(args.m1)
    m2 = load_full_matrix(args.m2)

    # Redirect stdout only for the duration of the call so the handle is not
    # leaked and later output is not silently sent to the results file.
    original_stdout = sys.stdout
    out_file = open(args.out, 'w')
    sys.stdout = out_file
    try:
        get_reproducibility(m1, m2, args.num_evec)
    finally:
        sys.stdout = original_stdout
        out_file.close()
# Example no. 2
# 0
def main():
    """Write successive random-walk powers of a normalized contact map.

    The matrix named by ``--m`` is loaded against the nodes in
    ``--node_file``, normalized with ``data_operations.process_matrix`` using
    ``--norm``, and added to its own transpose. With ``--transition`` the
    result is additionally passed through ``to_transition``. For every step
    ``t`` from ``--tmin`` to ``--tmax`` (inclusive) the t-th matrix power is
    written to ``<outdir>/<outpref>.rw_t<t>.gz``.

    ``--datatype``, ``--matrix_format``, ``--mname`` and ``--method`` are
    accepted but not read by this function.
    """
    parser = argparse.ArgumentParser(
        description='Compute RW transformation of 3D data')
    parser.add_argument('--datatype', default='hic')
    parser.add_argument('--m', type=str)
    parser.add_argument('--matrix_format',
                        type=str,
                        default='n1n2val',
                        help='c1n1c2n2val')
    parser.add_argument('--node_file', type=str)
    parser.add_argument('--remove_diagonal', action='store_true')
    parser.add_argument('--mname', type=str)
    parser.add_argument('--outdir', type=str, default='OUT')
    parser.add_argument('--outpref', type=str, default='outpref')
    parser.add_argument('--norm', type=str, default='uniform')
    parser.add_argument('--method', type=str, default='RandomWalks')
    parser.add_argument('--tmin', type=int, default=1)
    parser.add_argument('--tmax', type=int, default=3)
    parser.add_argument('--transition', action='store_true')
    parser.add_argument('--blacklist', default='NA')
    args = parser.parse_args()

    # Create the output directory directly instead of building a shell
    # command from a user-supplied string. An already existing directory
    # (including one created concurrently by another job) is not an error.
    try:
        os.makedirs(args.outdir)
    except OSError:
        if not os.path.isdir(args.outdir):
            raise

    nodes, nodes_idx, blacklist_nodes = processing.read_nodes_from_bed(
        args.node_file, args.blacklist)

    m = processing.construct_csr_matrix_from_data_and_nodes(
        args.m, nodes, blacklist_nodes, args.remove_diagonal)

    m_norm = data_operations.process_matrix(m, args.norm)

    # Add the transpose (with its diagonal zeroed first) to the normalized
    # matrix.
    mup = m_norm
    mdown = mup.transpose()
    mdown.setdiag(0)
    m_full = mup + mdown

    if args.transition:
        m_full = to_transition(m_full)

    outname = args.outdir + '/' + args.outpref
    # Always build the powers starting from step 1: each step needs the
    # previous one, so starting the loop at tmin > 1 would use `rw` before it
    # has been assigned. Only steps inside [tmin, tmax] are written out.
    for t in range(1, args.tmax + 1):
        if t == 1:
            rw = copy.deepcopy(m_full)
        else:
            rw = rw.dot(m_full)
        if t >= args.tmin:
            processing.write_matrix_from_csr_and_nodes(
                rw, nodes_idx, outname + '.rw_t' + str(t) + '.gz')
def main():
    """Run the GenomeDISCO reproducibility analysis on two contact maps.

    Steps, in order:

    1. Load the nodes (``--node_file``, ``--blacklist``) and both contact
       maps (``--m1``, ``--m2``).
    2. Unless ``--m_subsample`` is ``NA``, subsample whichever map is deeper
       than the target depth. The target is the smaller of the two map sums
       for ``lowest``; otherwise ``--m_subsample`` is treated as a path to a
       matrix whose sum is the target.
    3. Normalize both maps with ``data_operations.process_matrix``.
    4. Unless ``--concise_analysis`` is given, compute the distance
       dependence of both maps, their difference (``get_dd_diff``) and call
       ``visualization.plot_dds``.
    5. Compute the score with ``DiscoRandomWalks`` and write
       ``<outdir>/<outpref>.<m1name>.vs.<m2name>.scores.txt``,
       ``...scoresByStep.txt`` and ``...datastats.txt``.

    Progress messages are printed to standard output. An unsupported
    ``--method``, or an unsupported ``--datatype`` when the distance
    dependence analysis is requested, terminates the program through
    ``parser.error`` before any data is read.
    """
    parser = argparse.ArgumentParser(
        description='Compute reproducibility of 3D genome data')
    parser.add_argument('--datatype', default='hic')
    parser.add_argument(
        '--m1',
        type=str,
        default=
        '/srv/gsfs0/projects/kundaje/users/oursu/3d/LA/merged_nodups/processed_data/HIC014.res40000.byChr.chr21.gz'
    )
    parser.add_argument(
        '--m2',
        type=str,
        default=
        '/srv/gsfs0/projects/kundaje/users/oursu/3d/LA/merged_nodups/processed_data/HIC001.res40000.byChr.chr21.gz'
    )
    parser.add_argument('--matrix_format',
                        type=str,
                        default='n1n2val',
                        help='c1n1c2n2val')
    parser.add_argument(
        '--node_file',
        type=str,
        default=
        '/srv/gsfs0/projects/kundaje/users/oursu/3d/LA/merged_nodups/nodes/Nodes.w40000.chr21.gz'
    )
    parser.add_argument('--remove_diagonal', action='store_true')
    parser.add_argument('--m1name', type=str, default='HIC014')
    parser.add_argument('--m2name', type=str, default='HIC001')
    parser.add_argument('--outdir', type=str, default='OUT')
    parser.add_argument('--outpref', type=str, default='outpref')
    parser.add_argument('--m_subsample', type=str, default='lowest')
    parser.add_argument(
        '--concise_analysis',
        action='store_true',
        help=
        'Add this flag to only output the reproducibility score, and not perform the distance dependence analyses.'
    )
    parser.add_argument('--norm', type=str, default='uniform')
    parser.add_argument('--method', type=str, default='RandomWalks')
    parser.add_argument('--tmin', type=int, default=1)
    parser.add_argument('--tmax', type=int, default=3)
    parser.add_argument('--approximation', type=int, default=40000)
    parser.add_argument('--transition', action='store_true')
    parser.add_argument('--blacklist', default='NA')
    args = parser.parse_args()

    # Reject option values that no branch below handles. Without these checks
    # the run only fails after all the expensive work, with a NameError on
    # `comparer` or `m1dd`.
    if args.method != 'RandomWalks':
        parser.error("unsupported --method '" + args.method +
                     "' (only 'RandomWalks' is implemented)")
    if not args.concise_analysis and args.datatype not in ('hic', 'capturec'):
        parser.error("unsupported --datatype '" + args.datatype +
                     "' (expected 'hic' or 'capturec')")

    def log(message):
        # Timestamped progress line; single-argument print() behaves the same
        # as the print statement.
        print("GenomeDISCO | " + strftime("%c") + " | " + message)

    # Create the output directory without building a shell command from a
    # user-supplied string; an existing directory is not an error.
    try:
        os.makedirs(args.outdir)
    except OSError:
        if not os.path.isdir(args.outdir):
            raise

    log("Starting reproducibility analysis")
    nodes, nodes_idx, blacklist_nodes = processing.read_nodes_from_bed(
        args.node_file, args.blacklist)

    log("Loading contact maps")
    m1 = processing.construct_csr_matrix_from_data_and_nodes(
        args.m1, nodes, blacklist_nodes, args.remove_diagonal)
    m2 = processing.construct_csr_matrix_from_data_and_nodes(
        args.m2, nodes, blacklist_nodes, args.remove_diagonal)

    m1_depth = m1.sum()
    m2_depth = m2.sum()
    stats = {}
    stats[args.m1name] = {}
    stats[args.m2name] = {}
    stats[args.m1name]['depth'] = m1_depth
    stats[args.m2name]['depth'] = m2_depth

    # By default the maps are used as loaded; a map is replaced only when it
    # is deeper than the requested depth.
    m1_subsample = copy.deepcopy(m1)
    m2_subsample = copy.deepcopy(m2)
    if args.m_subsample != 'NA':
        if args.m_subsample == 'lowest':
            # Only the sum of the shallower map is needed, so take it
            # directly rather than copying a whole matrix to read its sum.
            desired_depth = min(m1_depth, m2_depth)
        else:
            desired_depth = (
                processing.construct_csr_matrix_from_data_and_nodes(
                    args.m_subsample, nodes, blacklist_nodes,
                    args.remove_diagonal).sum())
        log("Subsampling to the depth of " + args.m_subsample)
        log("Subsampling depth = " + str(desired_depth))
        if m1_depth > desired_depth:
            m1_subsample = data_operations.subsample_to_depth(
                m1, desired_depth)
        if m2_depth > desired_depth:
            m2_subsample = data_operations.subsample_to_depth(
                m2, desired_depth)

    stats[args.m1name]['subsampled_depth'] = m1_subsample.sum()
    stats[args.m2name]['subsampled_depth'] = m2_subsample.sum()

    log('Normalizing with ' + args.norm)
    m1_norm = data_operations.process_matrix(m1_subsample, args.norm)
    m2_norm = data_operations.process_matrix(m2_subsample, args.norm)

    # Common prefix of every output path produced for this pair of samples.
    pair_prefix = (args.outdir + '/' + args.outpref + '.' + args.m1name +
                   '.vs.' + args.m2name)

    dd_value = 'NA'
    if not args.concise_analysis:
        log("Distance dependence analysis")
        if args.datatype == 'hic':
            m1dd = data_operations.get_distance_dep(m1_subsample)
            m2dd = data_operations.get_distance_dep(m2_subsample)
        else:
            # 'capturec' -- the only other value accepted by the check above.
            m1dd = data_operations.get_distance_dep_using_nodes_capturec(
                m1_subsample, nodes, nodes_idx, args.approximation)
            m2dd = data_operations.get_distance_dep_using_nodes_capturec(
                m2_subsample, nodes, nodes_idx, args.approximation)
        dd_diff = get_dd_diff(m1dd, m2dd)
        visualization.plot_dds([m1dd, m2dd], [args.m1name, args.m2name],
                               pair_prefix + '.distDep', args.approximation)
        dd_value = '{:.10f}'.format(dd_diff)

    log("Computing reproducibility score")
    comparer = DiscoRandomWalks(args)
    reproducibility_text, score, scores = comparer.compute_reproducibility(
        m1_norm, m2_norm, args)
    # HTML report generation is currently disabled:
    #   write_html_report(stats, args, reproducibility_text, score)

    with open(pair_prefix + '.scores.txt', 'w') as out:
        out.write(args.m1name + '\t' + args.m2name + '\t' +
                  '{:.3f}'.format(score) + '\n')

    # One column per step 1..tmax; steps below tmin have no score and are
    # reported as 'NA'. `scores` is consumed in order, starting at tmin.
    t_strings = []
    score_strings = []
    t_counter = 0
    for t in range(1, args.tmax + 1):
        if t >= args.tmin:
            score_strings.append('{:.3f}'.format(scores[t_counter]))
            t_counter += 1
        else:
            score_strings.append('NA')
        t_strings.append(str(t))
    with open(pair_prefix + '.scoresByStep.txt', 'w') as out:
        out.write('#m1' + '\t' + 'm2' + '\t' + '\t'.join(t_strings) + '\n')
        out.write(args.m1name + '\t' + args.m2name + '\t' +
                  '\t'.join(score_strings) + '\n')

    with open(pair_prefix + '.datastats.txt', 'w') as out:
        out.write('#m1name' + '\t' + 'm2name' + '\t' + 'SeqDepth.m1' + '\t' +
                  'SeqDepth.m2' + '\t' + 'SubsampledSeqDepth.m1' + '\t' +
                  'SubsampledSeqDepth.m2' + '\t' + 'DistDepDiff' + '\n')
        out.write(args.m1name + '\t' + args.m2name + '\t' +
                  str(stats[args.m1name]['depth']) + '\t' +
                  str(stats[args.m2name]['depth']) + '\t' +
                  str(stats[args.m1name]['subsampled_depth']) + '\t' +
                  str(stats[args.m2name]['subsampled_depth']) + '\t' +
                  dd_value + '\n')

    print("GenomeDISCO | Differences by random walk step: " +
          '\t'.join(score_strings))
    log("DONE")
# Example no. 4
# 0
def main():
    """Simulate Hi-C matrices from real datasets and write them to disk.

    Every file in ``--matrices`` (comma separated, named by the matching entry
    of ``--matrix_names``) is combined with every comma-separated value of
    ``--edgenoise``, ``--nodenoise`` and ``--boundarynoise``, two replicates
    ('a' and 'b'), and every file in ``--distDepData``. For each combination
    the source matrix is passed through ``shift_dataset``, turned into a
    probability matrix with ``get_probability_matrix``, sampled to ``--depth``
    with ``sample_interactions`` and written with ``write_matrix`` to

        <outdir>/Depth_<depth>.<name>.EN_<e>.NN_<n>.BN_<b>.<a|b>.dd_<i>.gz

    The shifted matrix, the probability matrix and the output path are
    printed along the way.
    """
    cli = argparse.ArgumentParser(
        description='Simulate Hi-C data based on real datasets.')
    cli.add_argument(
        '--matrices',
        default=
        '/ifs/scratch/oursu/3d/paper/2017-06-08/LA/reproducibility/res40000/data/edges/HIC001/HIC001.chr21.gz'
    )
    cli.add_argument('--matrix_names', default='HIC001')
    cli.add_argument(
        '--nodes',
        default=
        '/ifs/scratch/oursu/3d/paper/2017-06-08/LA/reproducibility/res40000/data/nodes/nodes.chr21.gz'
    )
    cli.add_argument(
        '--distDepData',
        default=
        '/ifs/scratch/oursu/3d/paper/2017-06-08/LA/reproducibility/res40000/data/edges/HIC001/HIC001.chr21.gz,/ifs/scratch/oursu/3d/paper/2017-06-08/LA/reproducibility/res40000/data/edges/HIC002/HIC002.chr21.gz'
    )
    cli.add_argument('--edgenoise', default='0.0')
    cli.add_argument('--nodenoise', default='0.0')
    cli.add_argument('--boundarynoise', default='0')
    cli.add_argument('--depth', type=int, default=1000000)
    cli.add_argument('--outdir', default='/ifs/scratch/oursu/test/testmat')
    cli.add_argument('--resolution', type=int, default=40000)
    args = cli.parse_args()

    # Node table shared by every matrix that is read below.
    nodes, nodes_idx = processing.read_nodes_from_bed(args.nodes)

    matrix_files = args.matrices.split(',')
    labels = args.matrix_names.split(',')
    for position, matrix_file in enumerate(matrix_files):
        label = labels[position]
        base_matrix = read_in_data(matrix_file, nodes)
        for edgenoise in args.edgenoise.split(','):
            for nodenoise in args.nodenoise.split(','):
                for boundarynoise in args.boundarynoise.split(','):
                    for replicate in ('a', 'b'):
                        # The shift is recomputed for each replicate.
                        shifted = shift_dataset(base_matrix,
                                                int(boundarynoise))
                        dd_paths = args.distDepData.split(',')
                        print(shifted)
                        for dd_number, dd_path in enumerate(dd_paths):
                            dd = read_in_data(dd_path, nodes)
                            probabilities = get_probability_matrix(
                                shifted, dd, float(edgenoise),
                                float(nodenoise))
                            print(probabilities)

                            sampled = sample_interactions(
                                probabilities, args.depth)
                            destination = ''.join([
                                args.outdir + '/Depth_' + str(args.depth),
                                '.' + label,
                                '.EN_' + str(edgenoise),
                                '.NN_' + str(nodenoise),
                                '.BN_' + str(boundarynoise),
                                '.' + replicate,
                                '.dd_' + str(dd_number),
                                '.gz',
                            ])
                            print(destination)
                            write_matrix(sampled, destination, args)
# Example no. 5
# 0
def main():
    """Simulate Hi-C matrices from real datasets using seeded random states.

    Every file in ``--matrices`` (comma separated, named by the matching entry
    of ``--matrix_names``) is combined with every comma-separated value of
    ``--edgenoise``, ``--nodenoise`` and ``--boundarynoise`` and every file in
    ``--distDepData``. For each combination the source matrix is passed
    through ``shift_dataset`` and turned into a probability matrix with
    ``get_probability_matrix`` (restricted by ``--mini``/``--maxi``). Two
    replicates, 'a' and 'b', are then drawn from a copy of that probability
    matrix with ``sample_interactions`` and written with ``write_matrix`` to

        <outdir>/Depth_<depth>.<name>.EN_<e>.NN_<n>.BN_<b>.<a|b>.dd_<i>.gz

    A negative ``--mini`` is replaced by 0 and a negative ``--maxi`` by the
    number of nodes. Each output path is printed before it is written.
    """
    cli = argparse.ArgumentParser(
        description='Simulate Hi-C data based on real datasets.')
    cli.add_argument(
        '--matrices',
        default=
        '/ifs/scratch/oursu/3d/paper/2017-06-08/LA/reproducibility/res40000/data/edges/HIC001/HIC001.chr21.gz'
    )
    cli.add_argument('--matrix_names', default='HIC001')
    cli.add_argument(
        '--nodes',
        default=
        '/ifs/scratch/oursu/3d/paper/2017-06-08/LA/reproducibility/res40000/data/nodes/nodes.chr21.gz'
    )
    cli.add_argument(
        '--distDepData',
        default=
        '/ifs/scratch/oursu/3d/paper/2017-06-08/LA/reproducibility/res40000/data/edges/HIC001/HIC001.chr21.gz,/ifs/scratch/oursu/3d/paper/2017-06-08/LA/reproducibility/res40000/data/edges/HIC002/HIC002.chr21.gz'
    )
    cli.add_argument('--edgenoise', default='0.0')
    cli.add_argument('--nodenoise', default='0.0')
    cli.add_argument('--boundarynoise', default='0')
    cli.add_argument('--depth', type=int, default=1000000)
    cli.add_argument('--outdir', default='/ifs/scratch/oursu/test/testmat')
    cli.add_argument('--resolution', type=int, default=40000)
    cli.add_argument('--mini', type=int, default=-1)
    cli.add_argument('--maxi', type=int, default=-1)
    args = cli.parse_args()

    # Node table shared by every matrix that is read below.
    nodes, nodes_idx, blacklisted_nodes = processing.read_nodes_from_bed(
        args.nodes)

    # Negative bounds mean "not given": fall back to the full node range.
    if args.mini < 0:
        args.mini = 0
    if args.maxi < 0:
        args.maxi = len(nodes.keys())

    matrix_files = args.matrices.split(',')
    labels = args.matrix_names.split(',')
    for position, matrix_file in enumerate(matrix_files):
        label = labels[position]
        base_matrix = read_in_data(matrix_file, nodes)
        for edgenoise in args.edgenoise.split(','):
            for nodenoise in args.nodenoise.split(','):
                for boundarynoise in args.boundarynoise.split(','):
                    shifted = shift_dataset(base_matrix, int(boundarynoise))
                    dd_paths = args.distDepData.split(',')
                    for dd_number, dd_path in enumerate(dd_paths):
                        dd = read_in_data(dd_path, nodes)
                        # A fresh RandomState with the same seed is built for
                        # every probability matrix.
                        # NOTE(review): hash() of a str is only stable across
                        # runs when string-hash randomization is off (the
                        # Python 2 default); under Python 3 these seeds change
                        # per process unless PYTHONHASHSEED is fixed.
                        probabilities = get_probability_matrix(
                            shifted, dd, float(edgenoise), float(nodenoise),
                            args.mini, args.maxi,
                            np.random.RandomState(hash('probability') % 10000))
                        # Replicate 'b' uses the seed of 'a' shifted by 101.
                        for replicate, seed_offset in (('a', 0), ('b', 101)):
                            seed = hash(label + dd_path) % 10000 + seed_offset

                            # Sample from a copy of the probability matrix so
                            # both replicates start from the same object state.
                            sampled = sample_interactions(
                                copy.deepcopy(probabilities), args.depth,
                                np.random.RandomState(seed))
                            destination = ''.join([
                                args.outdir + '/Depth_' + str(args.depth),
                                '.' + label,
                                '.EN_' + str(edgenoise),
                                '.NN_' + str(nodenoise),
                                '.BN_' + str(boundarynoise),
                                '.' + replicate,
                                '.dd_' + str(dd_number),
                                '.gz',
                            ])
                            print(destination)
                            write_matrix(sampled, destination, args)