Example #1
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    fname1, fname2 = load_parameters_fromdb(opts)

    param_hash = digest_parameters(opts)

    reads = path.join(opts.workdir, '03_filtered_reads',
                      'all_r1-r2_intersection_%s.tsv' % param_hash)
    mreads = path.join(opts.workdir, '03_filtered_reads',
                       'valid_r1-r2_intersection_%s.tsv' % param_hash)

    if not opts.resume:
        mkdir(path.join(opts.workdir, '03_filtered_reads'))

        # compute the intersection of the two read ends
        print('Getting intersection between read 1 and read 2')
        count, multiples = get_intersection(fname1, fname2, reads)

        # compute insert size
        print('Get insert size...')
        hist_path = path.join(opts.workdir,
                              'histogram_fragment_sizes_%s.pdf' % param_hash)
        median, max_f, mad = insert_sizes(
            reads, nreads=1000000, stats=('median', 'first_decay', 'MAD'),
            savefig=hist_path)
        
        print('  - median insert size =', median)
        print('  - median absolute deviation (MAD) of insert size =', mad)
        print('  - max insert size (when a gap in continuity of > 10 bp is '
              'found in fragment lengths) =', max_f)
    
        max_mole = max_f # pseudo DEs
        min_dist = max_f + mad # random breaks
        print('   Using the maximum continuous fragment size '
              '(%d bp) to check '
              'for pseudo-dangling ends' % max_mole)
        print('   Using maximum continuous fragment size plus the MAD '
              '(%d bp) to check for random breaks' % min_dist)
    
        print("identify pairs to filter...")
        masked = filter_reads(reads, max_molecule_length=max_mole,
                              over_represented=opts.over_represented,
                              max_frag_size=opts.max_frag_size,
                              min_frag_size=opts.min_frag_size,
                              re_proximity=opts.re_proximity,
                              min_dist_to_re=min_dist, fast=True)

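    # NOTE: when resuming (opts.resume), `masked` and the insert-size
    # statistics are assumed to survive from a previous run of this step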
    n_valid_pairs = apply_filter(reads, mreads, masked,
                                 filters=opts.apply)

    finish_time = time.localtime()
    print(median, max_f, mad)
    # save all job information to sqlite DB
    save_to_db(opts, count, multiples, reads, mreads, n_valid_pairs, masked,
               hist_path, median, max_f, mad, launch_time, finish_time)
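
For reference, a minimal sketch of how this `run` step might be driven stand-alone, assuming an argparse-style options object; the attribute names below mirror the ones the example reads, while the values are purely hypothetical:

from argparse import Namespace

opts = Namespace(
    workdir='sample_hic',           # hypothetical working directory
    resume=False,                   # recompute intersection and filters
    over_represented=0.001,         # parameters forwarded to filter_reads()
    max_frag_size=100000,
    min_frag_size=50,
    re_proximity=5,
    apply=[1, 2, 3, 4, 9, 10],      # filter ids forwarded to apply_filter()
)
run(opts)
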
Example #2
def run(opts):
    check_options(opts)

    launch_time = time.localtime()

    # hash that will be appended to output file names
    param_hash = digest_parameters(opts, get_md5=True)

    if opts.quality_plot:
        logging.info('Generating Hi-C QC plot at:\n  ' +
               path.join(opts.workdir, path.split(opts.fastq)[-1] + '.pdf'))
        dangling_ends, ligated = quality_plot(opts.fastq, r_enz=opts.renz,
                                              nreads=100000, paired=False,
                                              savefig=path.join(
                                                  opts.workdir,
                                                  path.split(opts.fastq)[-1] + '.pdf'))
        logging.info('  - Dangling-ends (sensu-stricto): %.3f%%', dangling_ends)
        logging.info('  - Ligation sites: %.3f%%', ligated)
        return

    logging.info('mapping %s read %s to %s', opts.fastq, opts.read, opts.workdir)

    outfiles = full_mapping(opts.index, opts.fastq,
                            path.join(opts.workdir,
                                      '01_mapped_r%d' % (opts.read)),
                            r_enz=opts.renz, temp_dir=opts.tmp, nthreads=opts.cpus,
                            frag_map=not opts.iterative, clean=not opts.keep_tmp,
                            windows=opts.windows, get_nread=True, skip=opts.skip,
                            suffix=param_hash, **opts.gem_param)

    # adjust line count
    if opts.skip:
        for i, (out, _) in enumerate(outfiles[1:], 1):
            outfiles[i] = out, outfiles[i-1][1] - sum(1 for _ in open(outfiles[i-1][0]))

    finish_time = time.localtime()

    # save all job information to sqlite DB
    save_to_db(opts, outfiles, launch_time, finish_time)

    # write machine log
    while path.exists(path.join(opts.workdir, '__lock_log')):
        time.sleep(0.5)
    open(path.join(opts.workdir, '__lock_log'), 'a').close()
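    # NOTE: this check-then-create lock is not atomic; two concurrent jobs
    # could still pass the check at the same time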
    with open(path.join(opts.workdir, 'trace.log'), "a") as mlog:
        mlog.write('\n'.join([
            ('# MAPPED READ%s\t%d\t%s' % (opts.read, num, out))
            for out, num in outfiles]) + '\n')
    # release lock
    try:
        remove(path.join(opts.workdir, '__lock_log'))
    except OSError:
        pass
Example #3
def run(opts):
    check_options(opts)

    launch_time = time.localtime()

    # hash that will be appended to output file names
    param_hash = digest_parameters(opts, get_md5=True)

    if opts.quality_plot:
        logging.info('Generating Hi-C QC plot at:\n  ' +
               path.join(opts.workdir, path.split(opts.fastq)[-1] + '.pdf'))
        dangling_ends, ligated = quality_plot(opts.fastq, r_enz=opts.renz,
                                              nreads=100000, paired=False,
                                              savefig=path.join(
                                                  opts.workdir,
                                                  path.split(opts.fastq)[-1] + '.pdf'))
        logging.info('  - Dangling-ends (sensu-stricto): %.3f%%', dangling_ends)
        logging.info('  - Ligation sites: %.3f%%', ligated)
        return

    logging.info('mapping %s read %s to %s', opts.fastq, opts.read, opts.workdir)
    outfiles = full_mapping(opts.index, opts.fastq,
                            path.join(opts.workdir,
                                      '01_mapped_r%d' % (opts.read)),
                            opts.renz, temp_dir=opts.tmp, nthreads=opts.cpus,
                            frag_map=not opts.iterative, clean=not opts.keep_tmp,
                            windows=opts.windows, get_nread=True, skip=opts.skip,
                            suffix=param_hash, **opts.gem_param)

    # adjust line count
    if opts.skip:
        for i, (out, _) in enumerate(outfiles[1:], 1):
            outfiles[i] = out, outfiles[i-1][1] - sum(1 for _ in open(outfiles[i-1][0]))
    
    finish_time = time.localtime()

    # save all job information to sqlite DB
    save_to_db(opts, outfiles, launch_time, finish_time)
    
    # write machine log
    with open(path.join(opts.workdir, 'trace.log'), "a") as mlog:
        fcntl.flock(mlog, fcntl.LOCK_EX)
        mlog.write('\n'.join([
            ('# MAPPED READ%s\t%d\t%s' % (opts.read, num, out))
            for out, num in outfiles]) + '\n')
        fcntl.flock(mlog, fcntl.LOCK_UN)
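    # NOTE: fcntl.flock gives advisory, POSIX-only locking, in contrast to
    # the lock-file approach used in Example #2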

    # clean
    if not opts.keep_tmp:
        logging.info('cleaning temporary files')
        system('rm -rf ' + opts.tmp)
Example #4
def run(opts):
    check_options(opts)

    launch_time = time.localtime()

    reads = [1] if opts.read == 1 else [2] if opts.read == 2 else [1, 2]
    if not opts.mapped1 and not opts.mapped2:
        f_names1, f_names2, renz = load_parameters_fromdb(
            opts, reads, opts.jobids)
    else:
        if opts.mapped1:
            f_names1 = opts.mapped1
        if opts.mapped2:
            f_names2 = opts.mapped2
        renz = opts.renz

    renz = renz.split('-')

    opts.workdir = path.abspath(opts.workdir)

    name = path.split(opts.workdir)[-1]

    param_hash = digest_parameters(opts)

    outdir = '02_parsed_reads'

    mkdir(path.join(opts.workdir, outdir))

    if not opts.read:
        out_file1 = path.join(opts.workdir, outdir,
                              '%s_r1_%s.tsv' % (name, param_hash))
        out_file2 = path.join(opts.workdir, outdir,
                              '%s_r2_%s.tsv' % (name, param_hash))
    elif opts.read == 1:
        out_file1 = path.join(opts.workdir, outdir,
                              '%s_r1_%s.tsv' % (name, param_hash))
        out_file2 = None
        f_names2 = None
    elif opts.read == 2:
        out_file2 = None
        f_names1 = f_names2
        f_names2 = None
        out_file1 = path.join(opts.workdir, outdir,
                              '%s_r2_%s.tsv' % (name, param_hash))

    logging.info('parsing genomic sequence')
    try:
        # allow use of a pickled genome, which is faster to load
        genome = load(open(opts.genome[0], 'rb'))
    except (UnpicklingError, KeyError):
        genome = parse_fasta(opts.genome, chr_regexp=opts.filter_chrom)

    if not opts.skip:
        logging.info('parsing reads in %s project', name)
        if opts.mapped1 or opts.mapped2:
            counts, multis = parse_sam(f_names1,
                                       f_names2,
                                       out_file1=out_file1,
                                       out_file2=out_file2,
                                       re_name=renz,
                                       verbose=True,
                                       genome_seq=genome,
                                       compress=opts.compress_input)
        else:
            counts, multis = parse_map(f_names1,
                                       f_names2,
                                       out_file1=out_file1,
                                       out_file2=out_file2,
                                       re_name=renz,
                                       verbose=True,
                                       genome_seq=genome,
                                       compress=opts.compress_input)
    else:
        counts = {}
        counts[0] = {}
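        # the parser writes header lines of the form '# MAPPED <item> <count>'
        # at the top of each output TSV; re-read them instead of re-parsing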
        fhandler = open(out_file1)
        for line in fhandler:
            if line.startswith('# MAPPED '):
                _, _, item, value = line.split()
                counts[0][item] = int(value)
            elif not line.startswith('#'):
                break
        multis = {}
        multis[0] = {}
        for line in fhandler:
            if '|||' in line:
                try:
                    multis[0][line.count('|||')] += 1
                except KeyError:
                    multis[0][line.count('|||')] = 1
        if out_file2:
            counts[1] = {}
            fhandler = open(out_file2)
            for line in fhandler:
                if line.startswith('# MAPPED '):
                    _, _, item, value = line.split()
                    counts[1][item] = int(value)
                elif not line.startswith('#'):
                    break
            multis[1] = 0
            for line in fhandler:
                if '|||' in line:
                    multis[1] += line.count('|||')

    # write machine log
    while path.exists(path.join(opts.workdir, '__lock_log')):
        time.sleep(0.5)
    open(path.join(opts.workdir, '__lock_log'), 'a').close()
    with open(path.join(opts.workdir, 'trace.log'), "a") as mlog:
        for read in counts:
            for item in counts[read]:
                mlog.write('# PARSED READ%s PATH\t%d\t%s\n' %
                           (read, counts[read][item],
                            out_file1 if read == 1 else out_file2))
    # release lock
    try:
        remove(path.join(opts.workdir, '__lock_log'))
    except OSError:
        pass

    finish_time = time.localtime()

    # save all job information to sqlite DB
    save_to_db(opts, counts, multis, f_names1, f_names2, out_file1, out_file2,
               launch_time, finish_time)
Example #5
def save_to_db(opts, count, multiples, reads, mreads, n_valid_pairs, masked,
               hist_path, median, max_f, mad, launch_time, finish_time):
    con = lite.connect(path.join(opts.workdir, 'trace.db'))
    with con:
        cur = con.cursor()
        cur.execute("""SELECT name FROM sqlite_master WHERE
                       type='table' AND name='INTERSECTION_OUTPUTs'""")
        if not cur.fetchall():
            cur.execute("""
        create table INTERSECTION_OUTPUTs
           (Id integer primary key,
            PATHid int,
            Total_interactions int,
            Multiple_interactions text,
            Median_fragment_length,
            MAD_fragment_length,
            Max_fragment_length,
            unique (PATHid))""")
            cur.execute("""
        create table FILTER_OUTPUTs
           (Id integer primary key,
            PATHid int,
            Name text,
            Count int,
            JOBid int,
            unique (PATHid))""")
        try:
            parameters = digest_parameters(opts, get_md5=False)
            param_hash = digest_parameters(opts, get_md5=True)
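            # NOTE: SQL here is built by string interpolation, as in the rest
            # of this module; values are assumed to contain no quotes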
            cur.execute("""
    insert into JOBs
     (Id  , Parameters, Launch_time, Finish_time,    Type, Parameters_md5)
    values
     (NULL,       '%s',        '%s',        '%s', 'Filter',           '%s')
     """ % (parameters, time.strftime("%d/%m/%Y %H:%M:%S", launch_time),
            time.strftime("%d/%m/%Y %H:%M:%S", finish_time), param_hash))
        except lite.IntegrityError:
            pass

        jobid = get_jobid(cur)

        add_path(cur, mreads, '2D_BED', jobid, opts.workdir)
        add_path(cur, reads, '2D_BED', jobid, opts.workdir)
        add_path(cur, hist_path, 'FIGURE', jobid, opts.workdir)
        try:
            cur.execute("""
            insert into INTERSECTION_OUTPUTs
            (Id  , PATHid, Total_interactions, Multiple_interactions, Median_fragment_length, MAD_fragment_length, Max_fragment_length)
            values
            (NULL,    %d,                  %d,                  '%s',                     %d,                  %d,                  %d)
            """ % (get_path_id(cur, mreads, opts.workdir), count, ' '.join(
                ['%s:%d' % (k, multiples[k])
                 for k in sorted(multiples)]), median, mad, max_f))
        except lite.IntegrityError:
            print('WARNING: already filtered')
            if opts.force:
                cur.execute(
                    'delete from INTERSECTION_OUTPUTs where PATHid = %d' %
                    (get_path_id(cur, mreads, opts.workdir)))
                cur.execute("""
                insert into INTERSECTION_OUTPUTs
                (Id  , PATHid, Total_interactions, Multiple_interactions, Median_fragment_length, MAD_fragment_length, Max_fragment_length)
                values
                (NULL,    %d,                  %d,                  '%s',                     %d,                  %d,                  %d)
                """ % (get_path_id(cur, mreads, opts.workdir), count, ' '.join(
                    ['%s:%d' % (k, multiples[k])
                     for k in sorted(multiples)]), median, mad, max_f))
        for f in masked:
            add_path(cur, masked[f]['fnam'], 'FILTER', jobid, opts.workdir)
            try:
                cur.execute("""
            insert into FILTER_OUTPUTs
            (Id  , PATHid, Name, Count, JOBid)
            values
            (NULL,    %d,     '%s',      '%s', %d)
                """ % (get_path_id(cur, masked[f]['fnam'], opts.workdir),
                       masked[f]['name'], masked[f]['reads'], jobid))
            except lite.IntegrityError:
                print('WARNING: already filtered')
                if opts.force:
                    cur.execute(
                        'delete from FILTER_OUTPUTs where PATHid = %d' %
                        (get_path_id(cur, masked[f]['fnam'], opts.workdir)))
                    cur.execute("""
                insert into FILTER_OUTPUTs
                (Id  , PATHid, Name, Count, JOBid)
                values
                (NULL,    %d,     '%s',      '%s', %d)
                    """ % (get_path_id(cur, masked[f]['fnam'], opts.workdir),
                           masked[f]['name'], masked[f]['reads'], jobid))
        try:
            cur.execute("""
        insert into FILTER_OUTPUTs
        (Id  , PATHid, Name, Count, JOBid)
        values
        (NULL,    %d,     '%s',      '%s', %d)
            """ % (get_path_id(cur, mreads, opts.workdir), 'valid-pairs',
                   n_valid_pairs, jobid))
        except lite.IntegrityError:
            print('WARNING: already filtered')
            if opts.force:
                cur.execute('delete from FILTER_OUTPUTs where PATHid = %d' %
                            (get_path_id(cur, mreads, opts.workdir)))
                cur.execute("""
                insert into FILTER_OUTPUTs
                (Id  , PATHid, Name, Count, JOBid)
                values
                (NULL,    %d,     '%s',      '%s', %d)
                """ % (get_path_id(cur, mreads, opts.workdir), 'valid-pairs',
                       n_valid_pairs, jobid))
        print_db(cur, 'MAPPED_INPUTs')
        print_db(cur, 'PATHs')
        print_db(cur, 'MAPPED_OUTPUTs')
        print_db(cur, 'PARSED_OUTPUTs')
        print_db(cur, 'JOBs')
        print_db(cur, 'INTERSECTION_OUTPUTs')
        print_db(cur, 'FILTER_OUTPUTs')
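
As an aside, the JOBs insert above could equally use SQLite parameter binding, which avoids the quoting assumption of % interpolation; a minimal sketch with the same table and values:

cur.execute("insert into JOBs "
            "(Id, Parameters, Launch_time, Finish_time, Type, Parameters_md5) "
            "values (NULL, ?, ?, ?, 'Filter', ?)",
            (parameters, time.strftime("%d/%m/%Y %H:%M:%S", launch_time),
             time.strftime("%d/%m/%Y %H:%M:%S", finish_time), param_hash))
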
Example #6
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    fname1, fname2 = load_parameters_fromdb(opts)

    param_hash = digest_parameters(opts)

    reads = path.join(opts.workdir, '03_filtered_reads',
                      'all_r1-r2_intersection_%s.tsv' % param_hash)
    mreads = path.join(opts.workdir, '03_filtered_reads',
                       'valid_r1-r2_intersection_%s.tsv' % param_hash)

    if not opts.resume:
        mkdir(path.join(opts.workdir, '03_filtered_reads'))

        if opts.fast_fragment:
            reads = fname1
            counts_multis = [
                '#' in line.split('\t')[0] for line in open(reads)
            ]
            count = len(counts_multis)
            multiples = {}
            multiples[1] = sum(
                [count_mult for count_mult in counts_multis if count_mult])
            del counts_multis
        else:
            # compute the intersection of the two read ends
            print('Getting intersection between read 1 and read 2')
            count, multiples = get_intersection(fname1,
                                                fname2,
                                                reads,
                                                compress=opts.compress_input)

        # compute insert size
        print('Get insert size...')
        hist_path = path.join(opts.workdir,
                              'histogram_fragment_sizes_%s.pdf' % param_hash)
        try:
            median, max_f, mad = fragment_size(reads,
                                               nreads=1000000,
                                               stats=('median', 'first_decay',
                                                      'MAD'),
                                               savefig=hist_path)
        except ZeroDivisionError:
            warn('WARNING: cannot compute fragment length, too few '
                 'dangling-ends. Setting median length to 400 nt.')
            median = max_f = mad = 0
        if median < 50:
            warn('WARNING: fragment length too short ({}). '
                 'Setting median length to 400 nt.'.format(median))
            median, max_f, mad = 400, 100, 40
        if opts.median:
            median = opts.median
        if opts.max_f:
            max_f = opts.max_f
        if opts.mad:
            mad = opts.mad

        print('  - median insert size =', median)
        print('  - median absolute deviation (MAD) of insert size =', mad)
        print(
            '  - max insert size (when a gap in continuity of > 10 bp is found in fragment lengths) =',
            max_f)

        max_mole = max_f  # pseudo DEs
        min_dist = max_f + mad  # random breaks
        print('   Using the maximum continuous fragment size '
              '(%d bp) to check '
              'for pseudo-dangling ends' % max_mole)
        print('   Using maximum continuous fragment size plus the MAD '
              '(%d bp) to check for random breaks' % min_dist)

        print("identify pairs to filter...")
        masked = filter_reads(reads,
                              max_molecule_length=max_mole,
                              over_represented=opts.over_represented,
                              max_frag_size=opts.max_frag_size,
                              min_frag_size=opts.min_frag_size,
                              re_proximity=opts.re_proximity,
                              strict_duplicates=opts.strict_duplicates,
                              min_dist_to_re=min_dist,
                              fast=True)

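    # NOTE: as in Example #1, resuming assumes `masked`, `count`, `multiples`
    # and the insert-size statistics survive from a previous run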
    n_valid_pairs = apply_filter(reads, mreads, masked, filters=opts.apply)

    outbam = path.join(opts.workdir, '03_filtered_reads',
                       'intersection_%s' % param_hash)

    if opts.valid:
        infile = mreads
    else:
        infile = reads
    bed2D_to_BAMhic(infile,
                    opts.valid,
                    opts.cpus,
                    outbam,
                    opts.format,
                    masked,
                    samtools=opts.samtools)

    finish_time = time.localtime()
    print(median, max_f, mad)
    # save all job information to sqlite DB
    save_to_db(opts, count, multiples, reads, mreads, n_valid_pairs, masked,
               outbam + '.bam', hist_path, median, max_f, mad, launch_time,
               finish_time)
Example #7
def run(opts):
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts)

    if not opts.nosql:
        (bad_co, bad_co_id, biases, biases_id,
         mreads, mreads_id, reso) = load_parameters_fromdb(opts)
        # store path ids to be saved in database
        inputs = bad_co_id, biases_id, mreads_id
    else:
        bad_co = opts.bad_co
        biases = opts.biases
        mreads = opts.mreads
        reso   = opts.reso

    mreads = path.join(opts.workdir, mreads)
    bad_co = path.join(opts.workdir, bad_co)
    biases = path.join(opts.workdir, biases)

    mkdir(path.join(opts.workdir, '05_segmentation'))

    print('loading %s at resolution %s' % (mreads, nice(reso)))
    hic_data = load_hic_data_from_reads(mreads, reso)
    hic_data.bads = dict((int(l.strip()), True) for l in open(bad_co))
    hic_data.bias = dict((int(l.split()[0]), float(l.split()[1]))
                         for l in open(biases))

    # compartments
    cmp_result = {}
    if not opts.only_tads:
        print('Searching compartments')
        hic_data.find_compartments(crms=opts.crms)

        cmprt_dir = path.join(opts.workdir, '05_segmentation',
                              'compartments_%s' % (nice(reso)))
        mkdir(cmprt_dir)
        for crm in opts.crms or hic_data.chromosomes:
            cmprt_file = path.join(cmprt_dir, '%s_%s.tsv' % (crm, param_hash))
            hic_data.write_compartments(cmprt_file,
                                        chroms=[crm])
            cmp_result[crm] = {'path': cmprt_file,
                               'num' : len(hic_data.compartments[crm])}

    # TADs
    tad_result = {}
    if not opts.only_compartments:
        print('Searching TADs')
        tad_dir = path.join(opts.workdir, '05_segmentation',
                             'tads_%s' % (nice(reso)))
        mkdir(tad_dir)
        for crm in hic_data.chromosomes:
            if opts.crms and crm not in opts.crms:
                continue
            print('  - %s' % crm)
            matrix = hic_data.get_matrix(focus=crm)
            beg, end = hic_data.section_pos[crm]
            size = len(matrix)
            if size < 10:
                print("     Chromosome too short (%d bins), skipping..." % size)
                continue
            # transform bad column in chromosome referential
            to_rm = tuple([1 if i in hic_data.bads else 0 for i in range(beg, end)])
            # maximum size of a TAD
            max_tad_size = size if opts.max_tad_size is None else opts.max_tad_size
            result = tadbit([matrix], remove=to_rm,
                            n_cpus=opts.cpus, verbose=True,
                            max_tad_size=max_tad_size,
                            no_heuristic=True)
            tads = load_tad_height(result, size, beg, end, hic_data)
            table = ''
            table += '%s\t%s\t%s\t%s\t%s\n' % ('#', 'start', 'end', 'score', 'density')
            for tad in tads:
                table += '%s\t%s\t%s\t%s%s\n' % (
                    tad, int(tads[tad]['start'] + 1), int(tads[tad]['end'] + 1),
                    abs(tads[tad]['score']), '\t%s' % (round(
                        float(tads[tad]['height']), 3)))
            out_tad = path.join(tad_dir, '%s_%s.tsv' % (crm, param_hash))
            with open(out_tad, 'w') as out:
                out.write(table)
            tad_result[crm] = {'path' : out_tad,
                               'num': len(tads)}

    finish_time = time.localtime()

    if not opts.nosql:
        save_to_db(opts, cmp_result, tad_result, reso, inputs, 
                   launch_time, finish_time)
Example #8
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)
    if opts.bed:
        mreads = path.realpath(opts.bed)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))

    print('loading', mreads)
    hic_data = load_hic_data_from_reads(mreads, opts.reso)

    mkdir(path.join(opts.workdir, '04_normalization'))

    print('Get poor bins...')
    try:
        hic_data.filter_columns(perc_zero=opts.perc_zeros, min_count=opts.min_count,
                                draw_hist=True,
                                by_mean=not opts.fast_filter, savefig=path.join(
                                    opts.workdir, '04_normalization',
                                    'bad_columns_%s_%d_%d_%s.pdf' % (
                                        opts.reso, opts.perc_zeros, opts.min_count,
                                        param_hash)) if
                                not opts.fast_filter else None)
    except ValueError:
        raise ValueError('ERROR: probably all columns filtered out...')
    # bad columns
    bad_columns_file = path.join(opts.workdir, '04_normalization',
                                 'bad_columns_%s_%d_%d_%s.tsv' % (
                                     opts.reso, opts.perc_zeros, opts.min_count, param_hash))
    with open(bad_columns_file, 'w') as out_bad:
        out_bad.write('\n'.join([str(i) for i in hic_data.bads.keys()]))

    # Identify biases
    if not opts.filter_only:
        print('Get biases using ICE...')
        hic_data.normalize_hic(silent=False, max_dev=0.1, iterations=0,
                               factor=opts.factor)

    print('Getting cis/trans...')
    cis_trans_N_D = cis_trans_N_d = float('nan')
    if not opts.filter_only:
        cis_trans_N_D = hic_data.cis_trans_ratio(normalized=True , diagonal=True )
        cis_trans_N_d = hic_data.cis_trans_ratio(normalized=True , diagonal=False)
    cis_trans_n_D = hic_data.cis_trans_ratio(normalized=False, diagonal=True )
    cis_trans_n_d = hic_data.cis_trans_ratio(normalized=False, diagonal=False)
        
    if not opts.filter_only:
        print('Cis/Trans ratio of normalized matrix including the diagonal', cis_trans_N_D)
        print('Cis/Trans ratio of normalized matrix excluding the diagonal', cis_trans_N_d)
    print('Cis/Trans ratio of raw matrix including the diagonal', cis_trans_n_D)
    print('Cis/Trans ratio of raw matrix excluding the diagonal', cis_trans_n_d)

    # Plot genomic distance vs interactions
    print('Plot genomic distance vs interactions...')
    inter_vs_gcoord = path.join(opts.workdir, '04_normalization',
                                'interactions_vs_genomic-coords_%s_%s.pdf' % (
                                    opts.reso, param_hash))
    (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
        hic_data, max_diff=10000, resolution=opts.reso, normalized=not opts.filter_only,
        savefig=inter_vs_gcoord)
    
    print('Decay slope 0.7-10 Mb\t%s' % a2)

    # write biases
    bias_file = path.join(opts.workdir, '04_normalization',
                          'bias_%s_%s.tsv' % (opts.reso, param_hash))
    out_bias = 'NA'
    if not opts.filter_only:
        out_bias = open(bias_file, 'w')
        out_bias.write('\n'.join(['%d\t%f' % (i, hic_data.bias[i])
                                  for i in hic_data.bias])
                       + '\n')
        out_bias.close()


    # pickle the HiC-data object
    print('Saving genomic matrix')
    pickle_path = path.join(opts.workdir, '04_normalization',
                            'hic-data_%s_%s.pickle' % (nice(opts.reso), param_hash))
    with open(pickle_path, 'wb') as out:  # pickle requires binary mode
        dump(hic_data, out)
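    # the matrix can be reloaded later with, e.g., load(open(pickle_path, 'rb'))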

    # to feed the save_to_db function
    intra_dir_nrm_fig = intra_dir_nrm_txt = None
    inter_dir_nrm_fig = inter_dir_nrm_txt = None
    genom_map_nrm_fig = genom_map_nrm_txt = None
    intra_dir_raw_fig = intra_dir_raw_txt = None
    inter_dir_raw_fig = inter_dir_raw_txt = None
    genom_map_raw_fig = genom_map_raw_txt = None

    if "intra" in opts.keep:
        print("  Saving intra chromosomal raw and normalized matrices...")
        if opts.only_txt:
            intra_dir_nrm_fig = None
            intra_dir_raw_fig = None
        else:
            intra_dir_nrm_fig = path.join(opts.workdir, '04_normalization',
                                          'intra_chromosome_nrm_images_%s_%s' % (opts.reso, param_hash))
            intra_dir_raw_fig = path.join(opts.workdir, '04_normalization',
                                          'intra_chromosome_raw_images_%s_%s' % (opts.reso, param_hash))
        intra_dir_nrm_txt = path.join(opts.workdir, '04_normalization',
                                      'intra_chromosome_nrm_matrices_%s_%s' % (opts.reso, param_hash))
        intra_dir_raw_txt = path.join(opts.workdir, '04_normalization',
                                      'intra_chromosome_raw_matrices_%s_%s' % (opts.reso, param_hash))
        if not opts.filter_only:
            hic_map(hic_data, normalized=True, by_chrom='intra', cmap='jet',
                    name=path.split(opts.workdir)[-1],
                    savefig=intra_dir_nrm_fig, savedata=intra_dir_nrm_txt)
        hic_map(hic_data, normalized=False, by_chrom='intra', cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=intra_dir_raw_fig, savedata=intra_dir_raw_txt)

    if "inter" in opts.keep:
        print("  Saving inter chromosomal raw and normalized matrices...")
        if opts.only_txt:
            inter_dir_nrm_fig = None
            inter_dir_raw_fig = None
        else:
            if not opts.filter_only:
                inter_dir_nrm_fig = path.join(opts.workdir, '04_normalization',
                                              'inter_chromosome_nrm_images_%s_%s' % (opts.reso, param_hash))
            inter_dir_raw_fig = path.join(opts.workdir, '04_normalization',
                                      'inter_chromosome_raw_images_%s_%s' % (opts.reso, param_hash))
        if not opts.filter_only:
            inter_dir_nrm_txt = path.join(opts.workdir, '04_normalization',
                                          'inter_chromosome_nrm_matrices_%s_%s' % (opts.reso, param_hash))
        inter_dir_raw_txt = path.join(opts.workdir, '04_normalization',
                                  'inter_chromosome_raw_matrices_%s_%s' % (opts.reso, param_hash))
        if not opts.filter_only:
            hic_map(hic_data, normalized=True, by_chrom='inter', cmap='jet',
                    name=path.split(opts.workdir)[-1],
                    savefig=inter_dir_nrm_fig, savedata=inter_dir_nrm_txt)
        hic_map(hic_data, normalized=False, by_chrom='inter', cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=inter_dir_raw_fig, savedata=inter_dir_raw_txt)

    if "genome" in opts.keep:
        print("  Saving raw and normalized genomic matrices...")
        if opts.only_txt:
            genom_map_nrm_fig = None
            genom_map_raw_fig = None
        else:
            if not opts.filter_only:
                genom_map_nrm_fig = path.join(opts.workdir, '04_normalization',
                                              'genomic_maps_nrm_%s_%s.pdf' % (opts.reso, param_hash))
            genom_map_raw_fig = path.join(opts.workdir, '04_normalization',
                                          'genomic_maps_raw_%s_%s.pdf' % (opts.reso, param_hash))
        if not opts.filter_only:
            genom_map_nrm_txt = path.join(opts.workdir, '04_normalization',
                                          'genomic_nrm_%s_%s.tsv' % (opts.reso, param_hash))
        genom_map_raw_txt = path.join(opts.workdir, '04_normalization',
                                      'genomic_raw_%s_%s.tsv' % (opts.reso, param_hash))
        if not opts.filter_only:
            hic_map(hic_data, normalized=True, cmap='jet',
                    name=path.split(opts.workdir)[-1],
                    savefig=genom_map_nrm_fig, savedata=genom_map_nrm_txt)
        hic_map(hic_data, normalized=False, cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=genom_map_raw_fig, savedata=genom_map_raw_txt)

    finish_time = time.localtime()

    save_to_db (opts, cis_trans_N_D, cis_trans_N_d, cis_trans_n_D, cis_trans_n_d,
                a2, bad_columns_file, bias_file, inter_vs_gcoord, mreads,
                len(hic_data.bads.keys()), len(hic_data),
                intra_dir_nrm_fig, intra_dir_nrm_txt,
                inter_dir_nrm_fig, inter_dir_nrm_txt,
                genom_map_nrm_fig, genom_map_nrm_txt,
                intra_dir_raw_fig, intra_dir_raw_txt,
                inter_dir_raw_fig, inter_dir_raw_txt,
                genom_map_raw_fig, genom_map_raw_txt,
                pickle_path, launch_time, finish_time)
Example #9
def run(opts):
    check_options(opts)

    launch_time = time.localtime()

    # hash that will be appended to output file names
    param_hash = digest_parameters(opts, get_md5=True)

    # create tmp directory
    if not opts.tmp:
        temp_dir = opts.workdir + '_tmp_r%d_%s' % (opts.read, param_hash)
    else:
        temp_dir = path.join(opts.tmp,
                             'TADbit_tmp_r%d_%s' % (opts.read, param_hash))

    # QC plot
    fig_path = path.join(
        opts.workdir, '%s_%s_%s.png' % (path.split(opts.fastq)[-1], '-'.join(
            map(str, opts.renz)), param_hash))
    logging.info('Generating Hi-C QC plot')

    dangling_ends, ligated = quality_plot(opts.fastq,
                                          r_enz=opts.renz,
                                          nreads=100000,
                                          paired=False,
                                          savefig=fig_path)
    for renz in dangling_ends:
        logging.info('  - Dangling-ends (sensu-stricto): %.3f%%',
                     dangling_ends[renz])
    for renz in ligated:
        logging.info('  - Ligation sites: %.3f%%', ligated[renz])
    if opts.skip_mapping:
        save_to_db(opts, dangling_ends, ligated, fig_path, [], launch_time,
                   time.localtime())
        return

    # Mapping
    if opts.fast_fragment:
        mkdir(path.join(opts.workdir, '03_filtered_reads'))
        logging.info('parsing genomic sequence')
        try:
            # allow use of a pickled genome, which is faster to load
            genome_seq = load(open(opts.genome[0], 'rb'))
        except (UnpicklingError, KeyError):
            genome_seq = parse_fasta(opts.genome)

        logging.info('mapping %s and %s to %s', opts.fastq, opts.fastq2,
                     opts.workdir)
        outfiles = fast_fragment_mapping(
            opts.index,
            opts.fastq,
            opts.fastq2,
            opts.renz,
            genome_seq,
            path.join(opts.workdir, '03_filtered_reads',
                      'all_r1-r2_intersection_%s.tsv' % param_hash),
            clean=not opts.keep_tmp,
            get_nread=True,
            mapper_binary=opts.mapper_binary,
            mapper_params=opts.mapper_param,
            suffix=param_hash,
            temp_dir=temp_dir,
            nthreads=opts.cpus)
    else:
        logging.info('mapping %s read %s to %s', opts.fastq, opts.read,
                     opts.workdir)
        outfiles = full_mapping(opts.index,
                                opts.fastq,
                                path.join(opts.workdir,
                                          '01_mapped_r%d' % (opts.read)),
                                mapper=opts.mapper,
                                r_enz=opts.renz,
                                temp_dir=temp_dir,
                                nthreads=opts.cpus,
                                frag_map=not opts.iterative,
                                clean=not opts.keep_tmp,
                                windows=opts.windows,
                                get_nread=True,
                                skip=opts.skip,
                                suffix=param_hash,
                                mapper_binary=opts.mapper_binary,
                                mapper_params=opts.mapper_param)

    # adjust line count
    if opts.skip:
        for i, (out, _) in enumerate(outfiles[1:], 1):
            outfiles[i] = out, outfiles[i - 1][1] - sum(
                1 for _ in open(outfiles[i - 1][0]))

    finish_time = time.localtime()

    # save all job information to sqlite DB
    try:
        save_to_db(opts, dangling_ends, ligated, fig_path, outfiles,
                   launch_time, finish_time)
    except Exception:
        # release lock and abort
        remove(path.join(opts.workdir, '__lock_db'))
        print_exc()
        exit(1)

    # write machine log
    try:
        while path.exists(path.join(opts.workdir, '__lock_log')):
            time.sleep(0.5)
        open(path.join(opts.workdir, '__lock_log'), 'a').close()
        with open(path.join(opts.workdir, 'trace.log'), "a") as mlog:
            mlog.write('\n'.join([('# MAPPED READ%s\t%d\t%s' %
                                   (opts.read, num, out))
                                  for out, num in outfiles]) + '\n')
        # release lock
        try:
            remove(path.join(opts.workdir, '__lock_log'))
        except OSError:
            pass
    except Exception:
        # release lock and abort
        remove(path.join(opts.workdir, '__lock_log'))
        print_exc()
        exit(1)

    # clean
    if not opts.keep_tmp:
        logging.info('cleaning temporary files')
        system('rm -rf ' + temp_dir)
Example #10
def run(opts):
    check_options(opts)
    samtools = which(opts.samtools)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)

    reso1 = reso2 = None
    if opts.bam1:
        mreads1 = path.realpath(opts.bam1)
        biases1 = opts.biases1
    else:
        biases1, mreads1, reso1 = load_parameters_fromdb(
            opts.workdir1, opts.jobid1, opts, opts.tmpdb1)
        mreads1 = path.join(opts.workdir1, mreads1)
        try:
            biases1 = path.join(opts.workdir1, biases1)
        except AttributeError:
            biases1 = None

    if opts.bam2:
        mreads2 = path.realpath(opts.bam2)
        biases2 = opts.biases2
    else:
        biases2, mreads2, reso2 = load_parameters_fromdb(
            opts.workdir2, opts.jobid2, opts, opts.tmpdb2)
        mreads2 = path.join(opts.workdir2, mreads2)
        try:
            biases2 = path.join(opts.workdir2, biases2)
        except AttributeError:
            biases2 = None

    filter_exclude = opts.filter

    if reso1 != reso2:
        raise Exception('ERROR: differing resolutions between experiments to '
                        'be merged')
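    # (when both BAMs come straight from the command line, reso1 and reso2
    #  stay None, so this check passes and opts.reso is used below)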

    mkdir(path.join(opts.workdir, '00_merge'))

    if not opts.skip_comparison:
        printime('  - loading first sample %s' % (mreads1))
        hic_data1 = load_hic_data_from_bam(mreads1, opts.reso, biases=biases1,
                                           tmpdir=path.join(opts.workdir, '00_merge'),
                                           ncpus=opts.cpus,
                                           filter_exclude=filter_exclude)

        printime('  - loading second sample %s' % (mreads2))
        hic_data2 = load_hic_data_from_bam(mreads2, opts.reso, biases=biases2,
                                           tmpdir=path.join(opts.workdir, '00_merge'),
                                           ncpus=opts.cpus,
                                           filter_exclude=filter_exclude)
        decay_corr_dat = path.join(opts.workdir, '00_merge', 'decay_corr_dat_%s_%s.txt' % (opts.reso, param_hash))
        decay_corr_fig = path.join(opts.workdir, '00_merge', 'decay_corr_dat_%s_%s.png' % (opts.reso, param_hash))
        eigen_corr_dat = path.join(opts.workdir, '00_merge', 'eigen_corr_dat_%s_%s.txt' % (opts.reso, param_hash))
        eigen_corr_fig = path.join(opts.workdir, '00_merge', 'eigen_corr_dat_%s_%s.png' % (opts.reso, param_hash))

        printime('  - comparing experiments')
        printime('    => correlation between equidistant loci')
        corr, _, scc, std, bads = correlate_matrices(
            hic_data1, hic_data2, normalized=opts.norm,
            remove_bad_columns=True, savefig=decay_corr_fig,
            savedata=decay_corr_dat, get_bads=True)
        print('         - correlation score (SCC): %.4f (+- %.7f)' % (scc, std))
        printime('    => correlation between eigenvectors')
        eig_corr = eig_correlate_matrices(hic_data1, hic_data2, normalized=opts.norm,
                                          remove_bad_columns=True, nvect=6,
                                          savefig=eigen_corr_fig,
                                          savedata=eigen_corr_dat)

        printime('    => reproducibility score')
        reprod = get_reproducibility(hic_data1, hic_data2, num_evec=20, normalized=opts.norm,
                                     verbose=False, remove_bad_columns=True)
        print('         - reproducibility score: %.4f' % reprod)
        ncols = len(hic_data1)
    else:
        ncols = 0
        decay_corr_dat = 'None'
        decay_corr_fig = 'None'
        eigen_corr_dat = 'None'
        eigen_corr_fig = 'None'

        corr = eig_corr = 0
        scc = std = reprod = 0
        bads = {}

    # merge inputs
    mkdir(path.join(opts.workdir, '03_filtered_reads'))
    outbam = path.join(opts.workdir, '03_filtered_reads',
                       'intersection_%s.bam' % (param_hash))

    printime('  - Merging experiments')
    system(samtools  + ' merge -@ %d %s %s %s' % (opts.cpus, outbam, mreads1, mreads2))
    printime('  - Indexing new BAM file')
    # check samtools version number and modify command line
    version = LooseVersion([l.split()[1]
                            for l in Popen(samtools, stderr=PIPE,
                                           universal_newlines=True).communicate()[1].split('\n')
                            if 'Version' in l][0])
    if version >= LooseVersion('1.3.1'):
        system(samtools  + ' index -@ %d %s' % (opts.cpus, outbam))
    else:
        system(samtools  + ' index %s' % (outbam))

    finish_time = time.localtime()
    save_to_db (opts, mreads1, mreads2, decay_corr_dat, decay_corr_fig,
                len(bads.keys()), ncols, scc, std, reprod,
                eigen_corr_dat, eigen_corr_fig, outbam, corr, eig_corr,
                biases1, biases2, launch_time, finish_time)
    printime('\nDone.')
Example #11
def run(opts):
    check_options(opts)

    launch_time = time.localtime()

    print(
        '''
%s%s

  - Region: Chromosome %s from %d to %d at resolution %s (%d particles)
    ''' % ('Preparing ' if opts.job_list else '',
           ('Optimization\n' + '*' *
            (21 if opts.job_list else 11)) if opts.optimize else
           ('Modeling\n' + '*' * (18 if opts.job_list else 8)), opts.crm,
           opts.ori_beg, opts.ori_end, nicer(opts.reso), opts.end - opts.beg))

    # load data
    if opts.matrix:
        crm = load_hic_data(opts)
    else:
        # FIXME: copied from somewhere else
        (bad_co, bad_co_id, biases, biases_id, mreads, mreads_id,
         reso) = load_parameters_fromdb(opts)
        hic_data = load_hic_data_from_reads(mreads, reso)
        hic_data.bads = dict((int(l.strip()), True) for l in open(bad_co))
        hic_data.bias = dict(
            (int(l.split()[0]), float(l.split()[1])) for l in open(biases))
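        # note: this branch fills `hic_data`, but `crm` (used just below) is
        # only defined in the opts.matrix branch, hence the FIXME above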

    exp = crm.experiments[0]
    opts.beg, opts.end = opts.beg or 1, opts.end or exp.size

    # prepare output folders
    batch_job_hash = digest_parameters(
        opts,
        get_md5=True,
        extra=[
            'maxdist', 'upfreq', 'lowfreq', 'scale', 'dcutoff', 'nmodels_run',
            'job_list', 'rand', 'nmodels', 'nkeep', 'optimize',
            'optimization_id', 'cpus', 'workdir', 'matrix', 'ori_beg',
            'ori_end'
        ])

    mkdir(path.join(opts.workdir, '06_model'))
    outdir = path.join(
        opts.workdir, '06_model',
        '%s_chr%s_%s-%s' % (batch_job_hash, opts.crm, opts.beg, opts.end))
    mkdir(outdir)

    # in case we are not going to run
    if opts.job_list:
        job_file_handler = open(
            path.join(
                outdir, 'job_list_%s.q' %
                ('optimization' if opts.optimize else 'modeling')), 'w')
    else:
        job_file_handler = None

    ###############
    # Optimization
    if opts.optimize:
        print('     o Optimizing parameters')
        optimization(exp, opts, job_file_handler, outdir)
        finish_time = time.localtime()
        print('\n optimization done')
        # correlate all optimization and get best set of parameters

    if not (opts.optimize and opts.job_list):
        optpar, results = correlate_models(opts, outdir, exp)
    else:
        results = []

    ###########
    # Modeling
    if not opts.optimize:
        big_run(exp, opts, job_file_handler, outdir, optpar)

    finish_time = time.localtime()

    # save all job information to sqlite DB
    save_to_db(opts, outdir, results, batch_job_hash, launch_time, finish_time)
Example #12
def run(opts):
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts, extra=['quiet'])

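    # opts.zrange is expected as a 'vmin,vmax' string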
    if opts.zrange:
        vmin = float(opts.zrange.split(',')[0])
        vmax = float(opts.zrange.split(',')[1])
    else:
        vmin = vmax = None

    clean = True  # change for debug

    if opts.bam:
        mreads = path.realpath(opts.bam)
        if not opts.biases and all(v != 'raw' for v in opts.normalizations):
            raise Exception('ERROR: external BAM input, should provide path to'
                            ' biases file.')
        biases = opts.biases
    else:
        biases, mreads = load_parameters_fromdb(opts)
        mreads = path.join(opts.workdir, mreads)
        biases = path.join(opts.workdir, biases) if biases else None
    if opts.biases:
        biases = opts.biases

    coord1 = opts.coord1
    coord2 = opts.coord2

    if coord2 and not coord1:
        coord1, coord2 = coord2, coord1

    if not coord1:
        region1 = None
        start1 = None
        end1 = None
        region2 = None
        start2 = None
        end2 = None
    else:
        try:
            crm1, pos1 = coord1.split(':')
            start1, end1 = pos1.split('-')
            region1 = crm1
            start1 = int(start1)
            end1 = int(end1)
        except ValueError:
            region1 = coord1
            start1 = None
            end1 = None
        if coord2:
            try:
                crm2, pos2 = coord2.split(':')
                start2, end2 = pos2.split('-')
                region2 = crm2
                start2 = int(start2)
                end2 = int(end2)
            except ValueError:
                region2 = coord2
                start2 = None
                end2 = None
        else:
            region2 = None
            start2 = None
            end2 = None

    outdir = path.join(opts.workdir, '05_sub-matrices')
    mkdir(outdir)
    tmpdir = path.join(opts.workdir, '05_sub-matrices',
                       '_tmp_sub-matrices_%s' % param_hash)
    mkdir(tmpdir)

    if not opts.quiet:
        if region1:
            msg = '\nExtraction of %s' % region1
            msg += (':%s-%s' % (start1, end1)) if start1 else ' (full chromosome)'
            if region2:
                msg += ' intersection with %s' % region2
                msg += (':%s-%s\n' % (start2, end2)) if start2 else ' (full chromosome)\n'
            else:
                msg += '\n'
            stdout.write(msg)
        else:
            stdout.write('\nExtraction of full genome\n')

    out_files = {}
    out_plots = {}

    if opts.matrix or opts.plot:
        bamfile = AlignmentFile(mreads, 'rb')
        sections = OrderedDict(
            zip(bamfile.references, [x for x in bamfile.lengths]))
        total = 0
        section_pos = dict()
        for crm in sections:
            section_pos[crm] = (total, total + sections[crm])
            total += sections[crm]
        for norm in opts.normalizations:
            norm_string = ('RAW' if norm == 'raw' else
                           'NRM' if norm == 'norm' else 'DEC')
            printime('Getting %s matrices' % norm)
            try:
                matrix, bads1, bads2, regions, name, bin_coords = get_matrix(
                    mreads,
                    opts.reso,
                    load(open(biases)) if biases and norm != 'raw' else None,
                    normalization=norm,
                    region1=region1,
                    start1=start1,
                    end1=end1,
                    region2=region2,
                    start2=start2,
                    end2=end2,
                    tmpdir=tmpdir,
                    ncpus=opts.cpus,
                    return_headers=True,
                    nchunks=opts.nchunks,
                    verbose=not opts.quiet,
                    clean=clean)
            except NotImplementedError:
                if norm == "raw&decay":
                    warn('WARNING: raw&decay normalization not implemeted for '
                         'matrices\n... skipping\n')
                    continue
                raise
            b1, e1, b2, e2 = bin_coords
            b1, e1 = 0, e1 - b1
            b2, e2 = 0, e2 - b2
            if opts.row_names:
                starts = [start1, start2]
                ends = [end1, end2]
                row_names = ((reg, p + 1, p + opts.reso)
                             for r, reg in enumerate(regions) for p in range(
                                 starts[r] if r < len(starts) and starts[r]
                                 else 0, ends[r] if r < len(ends) and ends[r]
                                 else sections[reg], opts.reso))
            if opts.matrix:
                printime(' - Writing: %s' % norm)
                fnam = '%s_%s_%s%s.mat' % (norm, name, nicer(
                    opts.reso).replace(' ', ''), ('_' + param_hash))
                out_files[norm_string] = path.join(outdir, fnam)
                out = open(path.join(outdir, fnam), 'w')
                for reg in regions:
                    out.write('# CRM %s\t%d\n' % (reg, sections[reg]))
                if region2:
                    out.write('# BADROWS %s\n' %
                              (','.join([str(b) for b in bads1])))
                    out.write('# BADCOLS %s\n' %
                              (','.join([str(b) for b in bads2])))
                else:
                    out.write('# MASKED %s\n' %
                              (','.join([str(b) for b in bads1])))
                if opts.row_names:
                    out.write('\n'.join('%s\t%d\t%d\t' %
                                        (next(row_names)) + '\t'.join(
                                            str(matrix.get((i, j), 0))
                                            for i in range(b1, e1))
                                        for j in range(b2, e2)) + '\n')
                else:
                    out.write('\n'.join('\t'.join(
                        str(matrix.get((i, j), 0)) for i in range(b1, e1))
                                        for j in range(b2, e2)) + '\n')
                out.close()
            if opts.plot:
                cmap = plt.get_cmap(opts.cmap)
                if norm != 'raw':
                    cmap.set_bad('grey', 1.)
                printime(' - Plotting: %s' % norm)
                fnam = '%s_%s_%s%s.%s' % (norm, name, nicer(opts.reso).replace(
                    ' ', ''), ('_' + param_hash), opts.format)
                out_plots[norm_string] = path.join(outdir, fnam)
                if opts.interactive:
                    _ = plt.figure(figsize=(8, 7))
                else:
                    _ = plt.figure(figsize=(16, 14))
                # ax1 = plt.subplot(111)
                ax1 = plt.axes([0.1, 0.1, 0.7, 0.8])
                ax2 = plt.axes([0.82, 0.1, 0.07, 0.8])
                matrix = array([
                    array([matrix.get((i, j), 0) for i in range(b1, e1)])
                    for j in range(b2, e2)
                ])
                mini = np_min(matrix[nonzero(matrix)]) / 2.
                matrix[matrix == 0] = mini
                m = zeros_like(matrix)
                for bad1 in bads1:
                    m[:, bad1] = 1
                for bad2 in bads2:
                    m[bad2, :] = 1
                matrix = log2(ma.masked_array(matrix, m))
                ax1.imshow(matrix,
                           interpolation='None',
                           origin='lower',
                           cmap=cmap,
                           vmin=vmin,
                           vmax=vmax)

                if len(regions) <= 2:
                    pltbeg1 = 0 if start1 is None else start1
                    pltend1 = sections[regions[0]] if end1 is None else end1
                    pltbeg2 = pltbeg1 if len(
                        regions) == 1 else 0 if start2 is None else start2
                    pltend2 = pltend1 if len(regions) == 1 else sections[
                        regions[-1]] if end2 is None else end2

                    ax1.set_xlabel('{}:{:,}-{:,}'.format(
                        regions[0], pltbeg1 if pltbeg1 else 1, pltend1))
                    ax1.set_ylabel('{}:{:,}-{:,}'.format(
                        regions[-1], pltbeg2 if pltbeg2 else 1, pltend2))

                    def format_xticks(tickstring, _=None):
                        tickstring = int(tickstring * opts.reso + pltbeg1)
                        return nicer(tickstring if tickstring else 1,
                                     coma=True)

                    def format_yticks(tickstring, _=None):
                        tickstring = int(tickstring * opts.reso + pltbeg2)
                        return nicer(tickstring if tickstring else 1,
                                     coma=True)

                    ax1.xaxis.set_major_formatter(FuncFormatter(format_xticks))
                    ax1.yaxis.set_major_formatter(FuncFormatter(format_yticks))

                    labels = ax1.get_xticklabels()
                    plt.setp(labels, rotation=-25, ha='left')

                    ax1.set_xlim(-0.5, len(matrix[0]) - 0.5)
                    ax1.set_ylim(-0.5, len(matrix) - 0.5)
                else:
                    vals = [0]
                    keys = ['']
                    for crm in regions:
                        vals.append(section_pos[crm][0] / opts.reso)
                        keys.append(crm)
                    vals.append(section_pos[crm][1] / opts.reso)
                    ax1.set_yticks(vals)
                    ax1.set_yticklabels('')
                    ax1.set_yticks([
                        float(vals[i] + vals[i + 1]) / 2
                        for i in range(len(vals) - 1)
                    ],
                                   minor=True)
                    ax1.set_yticklabels(keys, minor=True)
                    for t in ax1.yaxis.get_minor_ticks():
                        t.tick1On = False
                        t.tick2On = False

                    ax1.set_xticks(vals)
                    ax1.set_xticklabels('')
                    ax1.set_xticks([
                        float(vals[i] + vals[i + 1]) / 2
                        for i in range(len(vals) - 1)
                    ],
                                   minor=True)
                    ax1.set_xticklabels(keys, minor=True)
                    for t in ax1.xaxis.get_minor_ticks():
                        t.tick1On = False
                        t.tick2On = False
                    ax1.set_xlabel('Chromosomes')
                    ax1.set_ylabel('Chromosomes')
                    ax1.set_xlim(-0.5, len(matrix[0]) - 0.5)
                    ax1.set_ylim(-0.5, len(matrix) - 0.5)
                data = [i for d in matrix for i in d if isfinite(i)]
                mindata = nanmin(data)
                maxdata = nanmax(data)
                gradient = linspace(maxdata, mindata,
                                    max((len(matrix), len(matrix[0]))))
                gradient = dstack((gradient, gradient))[0]
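                # hand-made colorbar: a vertical colormap gradient (ax2) with
                # the distribution of matrix values drawn on top of it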
                h = ax2.hist(data,
                             color='darkgrey',
                             linewidth=2,
                             orientation='horizontal',
                             bins=50,
                             histtype='step',
                             normed=True)
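                # ('normed' is the Py2-era matplotlib argument for hist; it
                # was renamed 'density' in later matplotlib releases)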
                _ = ax2.imshow(gradient,
                               aspect='auto',
                               cmap=cmap,
                               extent=(0, max(h[0]), mindata, maxdata))
                ax2.yaxis.tick_right()
                ax2.yaxis.set_label_position("right")
                ax2.set_xticks([])
                ax1.set_title('Region: %s, normalization: %s, resolution: %s' %
                              (name, norm, nicer(opts.reso)))
                ax2.set_ylabel('Hi-C Log2 interactions', rotation=-90)
                ax2.set_xlabel('Count')
                if opts.interactive:
                    plt.show()
                    plt.close('all')
                else:
                    tadbit_savefig(path.join(outdir, fnam))
    if not opts.matrix and not opts.only_plot:
        printime('Getting and writing matrices')
        out_files.update(
            write_matrix(mreads,
                         opts.reso,
                         load(open(biases)) if biases else None,
                         outdir,
                         filter_exclude=opts.filter,
                         normalizations=opts.normalizations,
                         region1=region1,
                         start1=start1,
                         end1=end1,
                         region2=region2,
                         start2=start2,
                         end2=end2,
                         tmpdir=tmpdir,
                         append_to_tar=None,
                         ncpus=opts.cpus,
                         nchunks=opts.nchunks,
                         verbose=not opts.quiet,
                         extra=param_hash,
                         clean=clean))

    if clean:
        printime('Cleaning')
        system('rm -rf %s ' % tmpdir)

    if not opts.interactive:
        printime('Saving to DB')
        finish_time = time.localtime()
        save_to_db(opts, launch_time, finish_time, out_files, out_plots)
Example No. 13
0
def save_to_db(opts, mreads1, mreads2, decay_corr_dat, decay_corr_fig,
               nbad_columns, ncolumns, scc, std, reprod,
               eigen_corr_dat, eigen_corr_fig, outbed, corr, eig_corr,
               biases1, biases2, launch_time, finish_time):
    if 'tmpdb' in opts and opts.tmpdb:
        # check lock
        while path.exists(path.join(opts.workdir, '__lock_db')):
            time.sleep(0.5)
        # close lock
        open(path.join(opts.workdir, '__lock_db'), 'a').close()
        # tmp file
        dbfile = opts.tmpdb
        try:  # copy the existing DB (e.g. if read1 was already mapped)
            copyfile(path.join(opts.workdir, 'trace.db'), dbfile)
        except IOError:
            pass
    else:
        dbfile = path.join(opts.workdir, 'trace.db')
    con = lite.connect(dbfile)
    with con:
        cur = con.cursor()
        cur.execute("""SELECT name FROM sqlite_master WHERE
                       type='table' AND name='MERGE_OUTPUTs'""")
        if not cur.fetchall():
            cur.execute("""
            create table PATHs
               (Id integer primary key,
                JOBid int, Path text, Type text,
                unique (Path))""")
            cur.execute("""
            create table JOBs
               (Id integer primary key,
                Parameters text,
                Launch_time text,
                Finish_time text,
                Type text,
                Parameters_md5 text,
                unique (Parameters_md5))""")
            cur.execute("""
            create table FILTER_OUTPUTs
               (Id integer primary key,
                PATHid int,
                Name text,
                Count int,
                JOBid int,
                unique (PATHid))""")
            cur.execute("""
            create table MERGE_OUTPUTs
               (Id integer primary key,
                JOBid int,
                Wrkd1Path int,
                Wrkd2Path int,
                Bed1Path int,
                Bed2Path int,
                MergePath int,
                unique (JOBid))""")
            cur.execute("""
            create table MERGE_STATs
               (Id integer primary key,
                JOBid int,
                Inputs text,
                decay_corr text,
                eigen_corr text,
                reprod real,
                scc real,
                std_scc real,
                N_columns int,
                N_filtered int,
                Resolution int,
                bias1Path int,
                bias2Path int,
                unique (JOBid))""")
        try:
            parameters = digest_parameters(opts, get_md5=False)
            param_hash = digest_parameters(opts, get_md5=True )
            cur.execute("""
            insert into JOBs
            (Id  , Parameters, Launch_time, Finish_time, Type   , Parameters_md5)
            values
            (NULL,       '%s',        '%s',        '%s', 'Merge',           '%s')
            """ % (parameters,
                   time.strftime("%d/%m/%Y %H:%M:%S", launch_time),
                   time.strftime("%d/%m/%Y %H:%M:%S", finish_time), param_hash))
        except lite.IntegrityError:
            pass

        jobid = get_jobid(cur)
        add_path(cur, decay_corr_dat, 'CORR'      , jobid, opts.workdir)
        add_path(cur, decay_corr_fig, 'FIGURE'    , jobid, opts.workdir)
        add_path(cur, eigen_corr_dat, 'CORR'      , jobid, opts.workdir)
        add_path(cur, eigen_corr_fig, 'FIGURE'    , jobid, opts.workdir)

        add_path(cur, opts.workdir , 'WORKDIR'    , jobid)
        add_path(cur, opts.workdir1, 'WORKDIR1'   , jobid, opts.workdir)
        add_path(cur, opts.workdir2, 'WORKDIR2'   , jobid, opts.workdir)
        add_path(cur, mreads1      , 'EXT_HIC_BAM', jobid, opts.workdir)
        add_path(cur, mreads2      , 'EXT_HIC_BAM', jobid, opts.workdir)
        add_path(cur, outbed       , 'HIC_BAM'    , jobid, opts.workdir)

        if opts.norm:
            add_path(cur, biases1      , 'BIASES'     , jobid, opts.workdir)
            add_path(cur, biases2      , 'BIASES'     , jobid, opts.workdir)

            biasid1 = get_path_id(cur, biases1, opts.workdir)
            biasid2 = get_path_id(cur, biases2, opts.workdir)
        else:
            biasid1 = 0
            biasid2 = 0

        cur.execute("select id from paths where path = '%s'" % (
            path.relpath(mreads1, opts.workdir)))
        bed1 = cur.fetchall()[0][0]
        if opts.workdir1:
            cur.execute("select id from paths where path = '%s'" % (
                path.relpath(opts.workdir1, opts.workdir)))
            w1path = cur.fetchall()[0][0]
        else:
            w1path = 0
        cur.execute("select id from paths where path = '%s'" % (
            path.relpath(mreads2, opts.workdir)))
        bed2 = cur.fetchall()[0][0]
        if opts.workdir2:
            cur.execute("select id from paths where path = '%s'" % (
                path.relpath(opts.workdir2, opts.workdir)))
            w2path = cur.fetchall()[0][0]
        else:
            w2path = 0
        cur.execute("select id from paths where path = '%s'" % (
            path.relpath(outbed, opts.workdir)))
        outbedid = cur.fetchall()[0][0]
        if not opts.skip_comparison:
            decay_corr = '-'.join(['%.1f' % (v)
                                   for v in corr[:10:2]]).replace('0.', '.')
            eigen_corr = '-'.join(['%.2f' % (max(v))
                                   for v in eig_corr[:4]]).replace('0.', '.')
        else:
            decay_corr = eigen_corr = None
        cur.execute("""
        insert into MERGE_OUTPUTs
        (Id  , JOBid, Wrkd1Path, Wrkd2Path, Bed1Path, Bed2Path, MergePath)
        values
        (NULL,    %d,        %d,        %d,       %d,       %d,        %d)
        """ % (jobid,    w1path,    w2path,     bed1,     bed2,  outbedid))

        if not opts.skip_comparison:
            cur.execute("""
            insert into MERGE_STATs
            (Id  , JOBid, N_columns,   N_filtered, Resolution, reprod, scc, std_scc, decay_corr, eigen_corr, bias1Path, bias2Path)
            values
            (NULL,    %d,        %d,           %d,         %d,     %f,  %f,      %f,       '%s',       '%s',        %d,        %d)
            """ % (jobid,  ncolumns, nbad_columns, opts.reso , reprod, scc,     std, decay_corr, eigen_corr,   biasid1,   biasid2))

        masked1 = {'valid-pairs': {'count': 0}}
        if opts.workdir1:
            if 'tmpdb' in opts and opts.tmpdb:
                # tmp file
                dbfile1 = opts.tmpdb1
                try:  # copy the existing DB (e.g. if read1 was already mapped)
                    copyfile(path.join(opts.workdir1, 'trace.db'), dbfile1)
                except IOError:
                    pass
            else:
                dbfile1 = path.join(opts.workdir1, 'trace.db')
            tmpcon = lite.connect(dbfile1)
            with tmpcon:
                tmpcur = tmpcon.cursor()
                tmpcur.execute("select Name, PATHid, Count from filter_outputs")
                for name, pathid, count in tmpcur.fetchall():
                    res = tmpcur.execute("select Path from PATHs where Id = %d" % (pathid))
                    tmppath = res.fetchall()[0][0]
                    masked1[name] = {'path': tmppath, 'count': count}
            if 'tmpdb' in opts and opts.tmpdb:
                remove(dbfile1)
        masked2 = {'valid-pairs': {'count': 0}}
        if opts.workdir2:
            if 'tmpdb' in opts and opts.tmpdb:
                # tmp file
                dbfile2 = opts.tmpdb2
                try:  # copy the existing DB (e.g. if read2 was already mapped)
                    copyfile(path.join(opts.workdir2, 'trace.db'), dbfile2)
                except IOError:
                    pass
            else:
                dbfile2 = path.join(opts.workdir2, 'trace.db')
            tmpcon = lite.connect(dbfile2)
            with tmpcon:
                tmpcur = tmpcon.cursor()
                tmpcur.execute("select Name, PATHid, Count from filter_outputs")
                for name, pathid, count in tmpcur.fetchall():
                    res = tmpcur.execute("select Path from PATHs where Id = %d" % (pathid))
                    tmppath = res.fetchall()[0][0]
                    masked2[name] = {'path': tmppath, 'count': count}
            if 'tmpdb' in opts and opts.tmpdb:
                remove(dbfile2)

        for f in masked1:
            if f != 'valid-pairs':
                outmask = path.join(opts.workdir, '03_filtered_reads',
                                    'all_r1-r2_intersection_%s.tsv_%s.tsv' % (
                                        param_hash, f))
                out = open(outmask, 'w')
                for line in open(path.join(opts.workdir1, masked1[f]['path'])):
                    out.write(line)
                for line in open(path.join(opts.workdir2, masked2[f]['path'])):
                    out.write(line)
                add_path(cur, outmask, 'FILTER', jobid, opts.workdir)
            else:
                outmask = outbed

            cur.execute("""
            insert into FILTER_OUTPUTs
            (Id  , PATHid, Name, Count, JOBid)
            values
            (NULL,     %d, '%s',  '%s',    %d)
            """ % (get_path_id(cur, outmask, opts.workdir),
                   f, masked1[f]['count'] + masked2[f]['count'], jobid))

        print_db(cur, 'PATHs')
        print_db(cur, 'JOBs')
        print_db(cur, 'MERGE_OUTPUTs')
        print_db(cur, 'MERGE_STATs')
        print_db(cur, 'FILTER_OUTPUTs')

    if 'tmpdb' in opts and opts.tmpdb:
        # copy back file
        copyfile(dbfile, path.join(opts.workdir, 'trace.db'))
        remove(dbfile)
    # release lock
    try:
        remove(path.join(opts.workdir, '__lock_db'))
    except OSError:
        pass
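A note on the SQL style in this example: values are interpolated into the
statements with Python's '%' operator, which breaks as soon as a parameter
contains a quote. A minimal sketch of the same JOBs insert using sqlite3
parameter binding instead (a hypothetical refactoring, reusing the cur,
parameters and timestamp variables already in scope):

cur.execute("insert into JOBs "
            "(Id, Parameters, Launch_time, Finish_time, Type, Parameters_md5) "
            "values (NULL, ?, ?, ?, 'Merge', ?)",
            (parameters,
             time.strftime("%d/%m/%Y %H:%M:%S", launch_time),
             time.strftime("%d/%m/%Y %H:%M:%S", finish_time),
             param_hash))

With '?' placeholders the sqlite3 driver handles quoting, and the
IntegrityError on a duplicate Parameters_md5 behaves exactly as above.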
Example No. 14
0
def save_to_db(opts, count, multiples, reads, mreads, n_valid_pairs, masked,
               outbam, hist_path, median, max_f, mad, launch_time, finish_time):
    if 'tmpdb' in opts and opts.tmpdb:
        # check lock
        while path.exists(path.join(opts.workdir, '__lock_db')):
            time.sleep(0.5)
        # close lock
        open(path.join(opts.workdir, '__lock_db'), 'a').close()
        # tmp file
        dbfile = opts.tmpdb
        try:  # copy the existing DB (e.g. if read1 was already mapped)
            copyfile(path.join(opts.workdir, 'trace.db'), dbfile)
        except IOError:
            pass
    else:
        dbfile = path.join(opts.workdir, 'trace.db')
    con = lite.connect(dbfile)
    with con:
        cur = con.cursor()
        cur.execute("""SELECT name FROM sqlite_master WHERE
                       type='table' AND name='INTERSECTION_OUTPUTs'""")
        if not cur.fetchall():
            cur.execute("""
        create table INTERSECTION_OUTPUTs
           (Id integer primary key,
            PATHid int,
            Total_interactions int,
            Multiple_interactions text,
            Median_fragment_length,
            MAD_fragment_length,
            Max_fragment_length,
            unique (PATHid))""")
            cur.execute("""
        create table FILTER_OUTPUTs
           (Id integer primary key,
            PATHid int,
            Name text,
            Count int,
            Applied text,
            JOBid int,
            unique (PATHid))""")
        try:
            parameters = digest_parameters(opts, get_md5=False)
            param_hash = digest_parameters(opts, get_md5=True )
            cur.execute("""
    insert into JOBs
     (Id  , Parameters, Launch_time, Finish_time,    Type, Parameters_md5)
    values
     (NULL,       '%s',        '%s',        '%s', 'Filter',           '%s')
     """ % (parameters,
            time.strftime("%d/%m/%Y %H:%M:%S", launch_time),
            time.strftime("%d/%m/%Y %H:%M:%S", finish_time), param_hash))
        except lite.IntegrityError:
            pass

        jobid = get_jobid(cur)

        add_path(cur, mreads, '2D_BED', jobid, opts.workdir)
        add_path(cur, outbam, 'HIC_BAM', jobid, opts.workdir)
        add_path(cur, outbam + '.bai', 'HIC_BAI', jobid, opts.workdir)
        add_path(cur,  reads, '2D_BED', jobid, opts.workdir)
        add_path(cur, hist_path, 'FIGURE', jobid, opts.workdir)
        try:
            cur.execute("""
            insert into INTERSECTION_OUTPUTs
            (Id  , PATHid, Total_interactions, Multiple_interactions, Median_fragment_length, MAD_fragment_length, Max_fragment_length)
            values
            (NULL,    %d,                  %d,                  '%s',                     %d,                  %d,                  %d)
            """ % (get_path_id(cur, mreads, opts.workdir),
                   count, ' '.join(['%s:%d' % (k, multiples[k])
                                    for k in sorted(multiples)]),
                   median, mad, max_f))
        except lite.IntegrityError:
            print 'WARNING: already filtered'
            if opts.force:
                cur.execute(
                    'delete from INTERSECTION_OUTPUTs where PATHid = %d' % (
                        get_path_id(cur, mreads, opts.workdir)))
                cur.execute("""
                insert into INTERSECTION_OUTPUTs
                (Id  , PATHid, Total_interactions, Multiple_interactions, Median_fragment_length, MAD_fragment_length, Max_fragment_length)
                values
                (NULL,    %d,                  %d,                  '%s',                     %d,                  %d,                  %d)
                """ % (get_path_id(cur, mreads, opts.workdir),
                       count, ' '.join(['%s:%d' % (k, multiples[k])
                                        for k in sorted(multiples)]),
                       median, mad, max_f))
        for nf, f in enumerate(masked, 1):
            try:
                add_path(cur, masked[f]['fnam'], 'FILTER', jobid, opts.workdir)
            except KeyError:
                continue
            try:
                cur.execute("""
            insert into FILTER_OUTPUTs
                (Id  , PATHid, Name, Count, Applied, JOBid)
            values
                (NULL,     %d, '%s',  '%s',    '%s',    %d)
                """ % (get_path_id(cur, masked[f]['fnam'], opts.workdir),
                       masked[f]['name'], masked[f]['reads'],
                       'True' if nf in opts.apply else 'False', jobid))
            except lite.IntegrityError:
                print 'WARNING: already filtered'
                if opts.force:
                    cur.execute(
                        'delete from FILTER_OUTPUTs where PATHid = %d' % (
                            get_path_id(cur, masked[f]['fnam'], opts.workdir)))
                    cur.execute("""
                insert into FILTER_OUTPUTs
                    (Id  , PATHid, Name, Count, Applied, JOBid)
                values
                    (NULL,     %d, '%s',  '%s',    '%s',    %d)
                    """ % (get_path_id(cur, masked[f]['fnam'], opts.workdir),
                           masked[f]['name'], masked[f]['reads'],
                           'True' if nf in opts.apply else 'False', jobid))
        try:
            cur.execute("""
        insert into FILTER_OUTPUTs
            (Id  , PATHid, Name, Count, Applied, JOBid)
        values
            (NULL,     %d, '%s',  '%s',    '%s',    %d)
            """ % (get_path_id(cur, mreads, opts.workdir),
                   'valid-pairs', n_valid_pairs, '', jobid))
        except lite.IntegrityError:
            print 'WARNING: already filtered'
            if opts.force:
                cur.execute(
                    'delete from FILTER_OUTPUTs where PATHid = %d' % (
                        get_path_id(cur, mreads, opts.workdir)))
                cur.execute("""
                insert into FILTER_OUTPUTs
                (Id  , PATHid, Name, Count, Applied, JOBid)
                values
                (NULL,     %d, '%s',  '%s',    '%s',    %d)
                """ % (get_path_id(cur, mreads, opts.workdir),
                       'valid-pairs', n_valid_pairs, '', jobid))
        print_db(cur, 'MAPPED_INPUTs')
        print_db(cur, 'PATHs')
        print_db(cur, 'MAPPED_OUTPUTs')
        print_db(cur, 'PARSED_OUTPUTs')
        print_db(cur, 'JOBs')
        print_db(cur, 'INTERSECTION_OUTPUTs')
        print_db(cur, 'FILTER_OUTPUTs')
    if 'tmpdb' in opts and opts.tmpdb:
        # copy back file
        copyfile(dbfile, path.join(opts.workdir, 'trace.db'))
        remove(dbfile)
    # release lock
    try:
        remove(path.join(opts.workdir, '__lock_db'))
    except OSError:
        pass
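The lock/copy/release dance around trace.db is repeated verbatim in every
save_to_db variant of these examples. A minimal sketch of the same protocol
wrapped in a reusable context manager (a hypothetical helper, not part of
the original code):

from contextlib import contextmanager
from os import path, remove
import time

@contextmanager
def db_lock(workdir):
    # spin until the previous writer releases the lock, then take it
    lock = path.join(workdir, '__lock_db')
    while path.exists(lock):
        time.sleep(0.5)
    open(lock, 'a').close()
    try:
        yield
    finally:
        # always release the lock, even if the DB write raised
        try:
            remove(lock)
        except OSError:
            pass

Used as "with db_lock(opts.workdir): ...", the lock is released on success
and on error alike, something Example No. 20 below has to do by hand with a
try/except around save_to_db.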
Example No. 15
0
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)

    reso1 = reso2 = None
    if opts.bed1:
        mreads1 = path.realpath(opts.bed1)
        bad_co1 = opts.bad_co1
        biases1 = opts.biases1
    else:
        bad_co1, biases1, mreads1, reso1 = load_parameters_fromdb(
                opts.workdir1, opts.jobid1, opts, opts.tmpdb1)
        mreads1 = path.join(opts.workdir1, mreads1)

    if opts.bed2:
        mreads2 = path.realpath(opts.bed2)
        bad_co2 = opts.bad_co2
        biases2 = opts.biases2
    else:
        bad_co2, biases2, mreads2, reso2 = load_parameters_fromdb(
                opts.workdir2, opts.jobid2, opts, opts.tmpdb2)
        mreads2 = path.join(opts.workdir2, mreads2)

    if reso1 != reso2:
        raise Exception('ERROR: differing resolutions between experiments to '
                        'be merged')

    print 'loading first sample', mreads1
    hic_data1 = load_hic_data_from_reads(mreads1, opts.reso)

    print 'loading second sample', mreads2
    hic_data2 = load_hic_data_from_reads(mreads2, opts.reso)

    if opts.norm and biases1:
        bad_co1 = path.join(opts.workdir1, bad_co1)
        print 'loading bad columns from first sample', bad_co1
        hic_data1.bads = dict((int(l.strip()), True) for l in open(bad_co1))
        biases1 = path.join(opts.workdir1, biases1)
        print 'loading biases from first sample', biases1
        hic_data1.bias = dict((int(l.split()[0]), float(l.split()[1]))
                              for l in open(biases1))
    elif opts.norm:
        raise Exception('ERROR: biases or filtered-columns not found')
    if opts.norm and biases2:
        bad_co2 = path.join(opts.workdir2, bad_co2)
        print 'loading bad columns from second sample', bad_co2
        hic_data2.bads = dict((int(l.strip()), True) for l in open(bad_co2))
        biases2 = path.join(opts.workdir2, biases2)
        print 'loading biases from second sample', biases2
        hic_data2.bias = dict((int(l.split()[0]), float(l.split()[1]))
                              for l in open(biases2))
    elif opts.norm:
        raise Exception('ERROR: biases or filtered-columns not found')

    mkdir(path.join(opts.workdir, '00_merge'))

    if not opts.skip_comparison:
        decay_corr_dat = path.join(opts.workdir, '00_merge', 'decay_corr_dat_%s_%s.txt' % (opts.reso, param_hash))
        decay_corr_fig = path.join(opts.workdir, '00_merge', 'decay_corr_dat_%s_%s.png' % (opts.reso, param_hash))
        eigen_corr_dat = path.join(opts.workdir, '00_merge', 'eigen_corr_dat_%s_%s.txt' % (opts.reso, param_hash))
        eigen_corr_fig = path.join(opts.workdir, '00_merge', 'eigen_corr_dat_%s_%s.png' % (opts.reso, param_hash))
    else:
        decay_corr_dat = 'None'
        decay_corr_fig = 'None'
        eigen_corr_dat = 'None'
        eigen_corr_fig = 'None'
        
    # if opts.norm:
        # has bias file

    if not opts.skip_comparison:
        print 'correlation between equidistant loci'
        corr, _, bads = correlate_matrices(hic_data1, hic_data2, normalized=opts.norm,
                                           remove_bad_columns=True,
                                           savefig=decay_corr_fig,
                                           savedata=decay_corr_dat, get_bads=True)
        print 'correlation between eigenvectors'
        eig_corr = eig_correlate_matrices(hic_data1, hic_data2, normalized=opts.norm,
                                          remove_bad_columns=True, nvect=6,
                                          savefig=eigen_corr_fig,
                                          savedata=eigen_corr_dat)
    else:
        corr = eig_corr = None
        bads = {}

    # merge inputs
    mkdir(path.join(opts.workdir, '03_filtered_reads'))
    outbed = path.join(opts.workdir, '03_filtered_reads', 'valid_r1-r2_intersection_%s.tsv' % (
        param_hash))

    nreads = merge_2d_beds(mreads1, mreads2, outbed)

    finish_time = time.localtime()
    save_to_db (opts, mreads1, mreads2, decay_corr_dat, decay_corr_fig,
                len(bads.keys()), len(hic_data1), nreads,
                eigen_corr_dat, eigen_corr_fig, outbed, corr, eig_corr,
                biases1, bad_co1, biases2, bad_co2, launch_time, finish_time)
Example No. 16
0
def save_to_db(opts, outdir, results, batch_job_hash, launch_time,
               finish_time):
    if 'tmpdb' in opts and opts.tmpdb:
        # check lock
        while path.exists(path.join(opts.workdir, '__lock_db')):
            time.sleep(0.5)
        # close lock
        open(path.join(opts.workdir, '__lock_db'), 'a').close()
        # tmp file
        dbfile = opts.tmpdb
        try:  # copy the existing DB (e.g. if read1 was already mapped)
            copyfile(path.join(opts.workdir, 'trace.db'), dbfile)
        except IOError:
            pass
    else:
        dbfile = path.join(opts.workdir, 'trace.db')

    con = lite.connect(dbfile)
    with con:
        cur = con.cursor()
        cur.execute("""SELECT name FROM sqlite_master WHERE
                       type='table' AND name='JOBs'""")
        if not cur.fetchall():
            cur.execute("""
            create table PATHs
               (Id integer primary key,
                JOBid int, Path text, Type text,
                unique (Path))""")
            cur.execute("""
            create table JOBs
               (Id integer primary key,
                Parameters text,
                Launch_time text,
                Finish_time text,
                Type text,
                Parameters_md5 text,
                unique (Parameters_md5))""")
        cur.execute("""SELECT name FROM sqlite_master WHERE
                       type='table' AND name='MODELED_REGIONs'""")
        if not cur.fetchall():
            cur.execute("""
        create table MODELED_REGIONs
           (Id integer primary key,
            PATHid int,
            PARAM_md5 text,
            RESO int,
            BEG int,
            END int,
            unique (PARAM_md5))""")
        cur.execute("""SELECT name FROM sqlite_master WHERE
                       type='table' AND name='MODELs'""")
        if not cur.fetchall():
            cur.execute("""
        create table MODELs
           (Id integer primary key,
            REGIONid int,
            JOBid int,
            OPTPAR_md5 text,
            MaxDist int,
            UpFreq int,
            LowFreq int,
            Scale int,
            Cutoff int,
            Nmodels int,
            Kept int,
            Correlation int)""")
        try:
            parameters = digest_parameters(opts, get_md5=False)
            # in case optimization or modeling is split across different computers
            param_hash = digest_parameters(opts, get_md5=True)
            cur.execute("""
    insert into JOBs
     (Id  , Parameters, Launch_time, Finish_time,    Type, Parameters_md5)
    values
     (NULL,       '%s',        '%s',        '%s',    '%s',           '%s')
     """ % ((parameters, time.strftime("%d/%m/%Y %H:%M:%S", launch_time),
             time.strftime("%d/%m/%Y %H:%M:%S", finish_time),
             (('PRE_' if opts.job_list else '') +
              ('OPTIM' if opts.optimize else 'MODEL')), param_hash)))
        except lite.IntegrityError:
            pass
        ##### STORE OPTIMIZATION RESULT
        jobid = get_jobid(cur)
        add_path(cur, outdir, 'DIR', jobid, opts.workdir)
        pathid = get_path_id(cur, outdir, opts.workdir)
        # models = compile_models(opts, outdir, exp=exp, ngood=opts.nkeep)
        ### STORE GENERAL OPTIMIZATION INFO
        try:
            cur.execute("""
            insert into MODELED_REGIONs
            (Id  , PATHid, PARAM_md5, RESO, BEG, END)
            values
            (NULL,     %d,      "%s",   %d,  %d,  %d)
            """ % (pathid, batch_job_hash, opts.reso, opts.beg, opts.end))
        except lite.IntegrityError:
            pass
        ### STORE EACH OPTIMIZATION
        cur.execute("SELECT Id from MODELED_REGIONs where PARAM_md5='%s'" %
                    (batch_job_hash))
        optimid = cur.fetchall()[0][0]
        for m, u, l, d, s in results:
            optpar_md5 = md5('%s%s%s%s%s' % (m, u, l, d, s)).hexdigest()[:12]
            cur.execute(
                ("SELECT Id from MODELs where "
                 "OPTPAR_md5='%s' and REGIONid='%s'") % (optpar_md5, optimid))
            if not cur.fetchall():
                cur.execute("""
                insert into MODELs
                (Id  , REGIONid, JOBid, OPTPAR_md5, MaxDist, UpFreq, LowFreq, Cutoff, Scale, Nmodels, Kept, Correlation)
                values
                (NULL,             %d,    %d,      '%s',      %s,     %s,      %s,     %s,    %s,      %d,   %d,          %f)
                """ % ((optimid, jobid, optpar_md5, m, u, l, d, s,
                        results[(m, u, l, d, s)]['nmodels'],
                        results[(m, u, l, d, s)]['kept'],
                        results[(m, u, l, d, s)]['corr'])))
            else:
                cur.execute(
                    ("update MODELs "
                     "set Nmodels = %d, Kept = %d, Correlation = %f "
                     "where "
                     "OPTPAR_md5='%s' and REGIONid='%s'") %
                    (results[(m, u, l, d, s)]['nmodels'], results[(m, u, l, d,
                                                                   s)]['kept'],
                     results[(m, u, l, d, s)]['corr'], optpar_md5, optimid))

        ### MODELING
        if not opts.optimization_id:
            cur.execute("SELECT Id from MODELED_REGIONs")
            optimids = cur.fetchall()
            if len(optimids) > 1:
                raise IndexError("ERROR: more than 1 optimization in folder, "
                                 "choose with 'tadbit describe' and "
                                 "--optimization_id")
            optimid = optimids[0][0]
        else:
            cur.execute("SELECT Id from MODELED_REGIONs where Id=%d" %
                        (opts.optimization_id))
            optimid = cur.fetchall()[0][0]

    if 'tmpdb' in opts and opts.tmpdb:
        # copy back file
        copyfile(dbfile, path.join(opts.workdir, 'trace.db'))
        remove(dbfile)
    # release lock
    try:
        remove(path.join(opts.workdir, '__lock_db'))
    except OSError:
        pass
Example No. 17
0
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)

    reso1 = reso2 = None
    if opts.bed1:
        mreads1 = path.realpath(opts.bed1)
        bad_co1 = opts.bad_co1
        biases1 = opts.biases1
    else:
        bad_co1, biases1, mreads1, reso1 = load_parameters_fromdb(
            opts.workdir1, opts.jobid1, opts, opts.tmpdb1)
        mreads1 = path.join(opts.workdir1, mreads1)

    if opts.bed2:
        mreads2 = path.realpath(opts.bed2)
        bad_co2 = opts.bad_co2
        biases2 = opts.biases2
    else:
        bad_co2, biases2, mreads2, reso2 = load_parameters_fromdb(
            opts.workdir2, opts.jobid2, opts, opts.tmpdb2)
        mreads2 = path.join(opts.workdir2, mreads2)

    if reso1 != reso2:
        raise Exception('ERROR: differing resolutions between experiments to '
                        'be merged')

    mkdir(path.join(opts.workdir, '00_merge'))

    if not opts.skip_comparison:
        print 'Comparison'
        print ' - loading first sample', mreads1
        hic_data1 = load_hic_data_from_reads(mreads1, opts.reso)

        print ' - loading second sample', mreads2
        hic_data2 = load_hic_data_from_reads(mreads2, opts.reso)

        if opts.norm and biases1:
            bad_co1 = path.join(opts.workdir1, bad_co1)
            print ' - loading bad columns from first sample', bad_co1
            hic_data1.bads = dict(
                (int(l.strip()), True) for l in open(bad_co1))
            biases1 = path.join(opts.workdir1, biases1)
            print ' - loading biases from first sample', biases1
            hic_data1.bias = dict((int(l.split()[0]), float(l.split()[1]))
                                  for l in open(biases1))
        elif opts.norm:
            raise Exception('ERROR: biases or filtered-columns not found')
        if opts.norm and biases2:
            bad_co2 = path.join(opts.workdir2, bad_co2)
            print ' - loading bad columns from second sample', bad_co2
            hic_data2.bads = dict(
                (int(l.strip()), True) for l in open(bad_co2))
            biases2 = path.join(opts.workdir2, biases2)
            print ' - loading biases from second sample', biases2
            hic_data2.bias = dict((int(l.split()[0]), float(l.split()[1]))
                                  for l in open(biases2))
        elif opts.norm:
            raise Exception('ERROR: biases or filtered-columns not found')
        decay_corr_dat = path.join(
            opts.workdir, '00_merge',
            'decay_corr_dat_%s_%s.txt' % (opts.reso, param_hash))
        decay_corr_fig = path.join(
            opts.workdir, '00_merge',
            'decay_corr_dat_%s_%s.png' % (opts.reso, param_hash))
        eigen_corr_dat = path.join(
            opts.workdir, '00_merge',
            'eigen_corr_dat_%s_%s.txt' % (opts.reso, param_hash))
        eigen_corr_fig = path.join(
            opts.workdir, '00_merge',
            'eigen_corr_dat_%s_%s.png' % (opts.reso, param_hash))
    else:
        hic_data1 = {}
        hic_data2 = {}
        decay_corr_dat = 'None'
        decay_corr_fig = 'None'
        eigen_corr_dat = 'None'
        eigen_corr_fig = 'None'

    # if opts.norm:
    # has bias file

    if not opts.skip_comparison:
        print '  => correlation between equidistant loci'
        corr, _, bads = correlate_matrices(hic_data1,
                                           hic_data2,
                                           normalized=opts.norm,
                                           remove_bad_columns=True,
                                           savefig=decay_corr_fig,
                                           savedata=decay_corr_dat,
                                           get_bads=True)
        print '  => correlation between eigenvectors'
        eig_corr = eig_correlate_matrices(hic_data1,
                                          hic_data2,
                                          normalized=opts.norm,
                                          remove_bad_columns=True,
                                          nvect=6,
                                          savefig=eigen_corr_fig,
                                          savedata=eigen_corr_dat)
    else:
        corr = eig_corr = 0
        bads = {}

    # merge inputs
    mkdir(path.join(opts.workdir, '03_filtered_reads'))
    outbed = path.join(opts.workdir, '03_filtered_reads',
                       'valid_r1-r2_intersection_%s.tsv' % (param_hash))

    print '\nMerging...'
    nreads = merge_2d_beds(mreads1, mreads2, outbed)

    finish_time = time.localtime()
    save_to_db(opts, mreads1, mreads2, decay_corr_dat, decay_corr_fig,
               len(bads.keys()), len(hic_data1), nreads, eigen_corr_dat,
               eigen_corr_fig, outbed, corr, eig_corr, biases1, bad_co1,
               biases2, bad_co2, launch_time, finish_time)
    print '\n\nDone.'
Example No. 18
0
def save_to_db(opts, bias_file, mreads, bad_col_image,
               nbad_columns, ncolumns, raw_cisprc, norm_cisprc,
               inter_vs_gcoord, a2, bam_filter,
               launch_time, finish_time):
    if 'tmpdb' in opts and opts.tmpdb:
        # check lock
        while path.exists(path.join(opts.workdir, '__lock_db')):
            time.sleep(0.5)
        # close lock
        open(path.join(opts.workdir, '__lock_db'), 'a').close()
        # tmp file
        dbfile = opts.tmpdb
        try:  # copy the existing DB (e.g. if read1 was already mapped)
            copyfile(path.join(opts.workdir, 'trace.db'), dbfile)
        except IOError:
            pass
    else:
        dbfile = path.join(opts.workdir, 'trace.db')
    con = lite.connect(dbfile)
    with con:
        cur = con.cursor()
        cur.execute("""SELECT name FROM sqlite_master WHERE
                       type='table' AND name='JOBs'""")
        if not cur.fetchall():
            cur.execute("""
            create table PATHs
               (Id integer primary key,
                JOBid int, Path text, Type text,
                unique (Path))""")
            cur.execute("""
            create table JOBs
               (Id integer primary key,
                Parameters text,
                Launch_time text,
                Finish_time text,
                Type text,
                Parameters_md5 text,
                unique (Parameters_md5))""")
        cur.execute("""SELECT name FROM sqlite_master WHERE
                       type='table' AND name='NORMALIZE_OUTPUTs'""")
        if not cur.fetchall():
            cur.execute("""
            create table NORMALIZE_OUTPUTs
               (Id integer primary key,
                JOBid int,
                Input int,
                N_columns int,
                N_filtered int,
                BAM_filter int,
                Cis_percentage_Raw real,
                Cis_percentage_Norm real,
                Slope_700kb_10Mb real,
                Resolution int,
                Normalization text,
                Factor int,
                unique (JOBid))""")
        try:
            parameters = digest_parameters(opts, get_md5=False)
            param_hash = digest_parameters(opts, get_md5=True )
            cur.execute("""
            insert into JOBs
            (Id  , Parameters, Launch_time, Finish_time, Type , Parameters_md5)
            values
            (NULL,       '%s',        '%s',        '%s', 'Normalize',           '%s')
            """ % (parameters,
                   time.strftime("%d/%m/%Y %H:%M:%S", launch_time),
                   time.strftime("%d/%m/%Y %H:%M:%S", finish_time), param_hash))
        except lite.IntegrityError:
            pass
        jobid = get_jobid(cur)
        add_path(cur, bias_file       , 'BIASES'     , jobid, opts.workdir)
        add_path(cur, bad_col_image   , 'FIGURE'     , jobid, opts.workdir)
        add_path(cur, inter_vs_gcoord , 'FIGURE'     , jobid, opts.workdir)
        if opts.bam:
            add_path(cur, path.realpath(opts.bam), 'EXT_2D_BAM' , jobid, opts.workdir)
        if opts.mappability:
            add_path(cur, path.realpath(opts.mappability), 'EXT_MAPPABILITY' , jobid, opts.workdir)
        if opts.fasta:
            add_path(cur, path.realpath(opts.fasta), 'EXT_FASTA' , jobid, opts.workdir)
        # get pathid of input
        cur.execute("select id from paths where path = '%s'" % (path.relpath(mreads, opts.workdir)))
        input_bed = cur.fetchall()[0][0]

        a2 = 0 if isnan(a2) else a2
        try:
            cur.execute("""
            insert into NORMALIZE_OUTPUTs
            (Id  , JOBid,     Input, N_columns,   N_filtered, BAM_filter, Cis_percentage_Raw, Cis_percentage_Norm, Slope_700kb_10Mb,   Resolution,      Normalization,      Factor)
            values
            (NULL,    %d,        %d,        %d,           %d,         %d,                 %f,                  %f,               %f,           %d,               '%s',          %f)
            """ % (jobid, input_bed,  ncolumns, nbad_columns, bam_filter,   100 * raw_cisprc,   100 * norm_cisprc,               a2,    opts.reso, opts.normalization, opts.factor))
        except lite.OperationalError:
            try:
                cur.execute("""
                insert into NORMALIZE_OUTPUTs
                (Id  , JOBid,     Input, N_columns,   N_filtered, BAM_filter,      Cis_percentage_Raw, Cis_percentage_Norm, Slope_700kb_10Mb,   Resolution,     Normalization,       Factor)
                values
                (NULL,    %d,        %d,        %d,           %d,         %d,                      %f,                  %f,               %f,           %d,               '%s',          %f)
                """ % (jobid, input_bed,  ncolumns, nbad_columns, bam_filter,        100 * raw_cisprc,   100 * norm_cisprc,               a2,    opts.reso, opts.normalization, opts.factor))
            except lite.OperationalError:
                print 'WARNING: Normalized table not written!'

        print_db(cur, 'PATHs')
        print_db(cur, 'JOBs')
        try:
            print_db(cur, 'FILTER_OUTPUTs')
            print_db(cur, 'INTERSECTION_OUTPUTs')
            print_db(cur, 'MAPPED_INPUTs')
            print_db(cur, 'MAPPED_OUTPUTs')
            print_db(cur, 'PARSED_OUTPUTs')
            print_db(cur, 'FILTER_OUTPUTs')
        except lite.OperationalError:
            pass
        print_db(cur, 'NORMALIZE_OUTPUTs')
    if 'tmpdb' in opts and opts.tmpdb:
        # copy back file
        copyfile(dbfile, path.join(opts.workdir, 'trace.db'))
        remove(dbfile)
    # release lock
    try:
        remove(path.join(opts.workdir, '__lock_db'))
    except OSError:
        pass
Example No. 19
0
def run(opts):
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts, extra=['quiet'])

    if opts.zrange:
        vmin = float(opts.zrange.split(',')[0])
        vmax = float(opts.zrange.split(',')[1])
    else:
        vmin = vmax = None

    if opts.figsize:
        opts.figsize = map(float, opts.figsize.split(','))
    else:
        opts.figsize = None

    clean = True  # change for debug

    if opts.bam:
        mreads = path.realpath(opts.bam)
        if not opts.biases and all(v != 'raw' for v in opts.normalizations):
            raise Exception('ERROR: external BAM input requires a path to '
                            'a biases file.')
        biases = opts.biases
    else:
        biases, mreads = load_parameters_fromdb(opts)
        mreads = path.join(opts.workdir, mreads)
        biases = path.join(opts.workdir, biases) if biases else None
    if opts.biases:
        biases = opts.biases

    coord1         = opts.coord1
    coord2         = opts.coord2

    if coord2 and not coord1:
        coord1, coord2 = coord2, coord1
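    # accepted coordinate formats: 'chrN' (whole chromosome) or
    # 'chrN:begin-end'; a bare chromosome name takes the ValueError branch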

    if not coord1:
        region1 = None
        start1  = None
        end1    = None
        region2 = None
        start2  = None
        end2    = None
    else:
        try:
            crm1, pos1   = coord1.split(':')
            start1, end1 = pos1.split('-')
            region1 = crm1
            start1  = int(start1)
            end1    = int(end1)
        except ValueError:
            region1 = coord1
            start1  = None
            end1    = None
        if coord2:
            try:
                crm2, pos2   = coord2.split(':')
                start2, end2 = pos2.split('-')
                region2 = crm2
                start2  = int(start2)
                end2    = int(end2)
            except ValueError:
                region2 = coord2
                start2  = None
                end2    = None
        else:
            region2 = None
            start2  = None
            end2    = None

    if opts.plot and not opts.force_plot:
        if opts.interactive:
            max_size = 1500**2
        else:
            max_size = 5000**2
    else:
        max_size = None

    outdir = path.join(opts.workdir, '05_sub-matrices')
    mkdir(outdir)
    tmpdir = path.join(opts.workdir, '05_sub-matrices',
                       '_tmp_sub-matrices_%s' % param_hash)
    mkdir(tmpdir)

    if not opts.quiet:
        if region1:
            stdout.write('\nExtraction of %s' % (region1))
            if start1:
                stdout.write(':%s-%s' % (start1, end1))
            else:
                stdout.write(' (full chromosome)')
            if region2:
                stdout.write(' intersection with %s' % (region2))
                if start2:
                    stdout.write(':%s-%s\n' % (start2, end2))
                else:
                    stdout.write(' (full chromosome)\n')
            else:
                stdout.write('\n')
        else:
            stdout.write('\nExtraction of full genome\n')

    out_files = {}
    out_plots = {}

    if opts.matrix or opts.plot:
        bamfile = AlignmentFile(mreads, 'rb')
        sections = OrderedDict(zip(bamfile.references, bamfile.lengths))
        total = 0
        section_pos = OrderedDict()
        for crm in sections:
            section_pos[crm] = (total, total + sections[crm])
            total += sections[crm]
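        # section_pos maps each chromosome to its (start, end) offsets in
        # cumulative, genome-wide coordinates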
        for norm in opts.normalizations:
            norm_string = ('RAW' if norm == 'raw' else 'NRM'
                           if norm == 'norm' else 'DEC')
            printime('Getting %s matrices' % norm)
            try:
                matrix, bads1, bads2, regions, name, bin_coords = get_matrix(
                    mreads, opts.reso,
                    load(open(biases)) if biases and norm != 'raw' else None,
                    normalization=norm,
                    region1=region1, start1=start1, end1=end1,
                    region2=region2, start2=start2, end2=end2,
                    tmpdir=tmpdir, ncpus=opts.cpus,
                    return_headers=True,
                    nchunks=opts.nchunks, verbose=not opts.quiet,
                    clean=clean, max_size=max_size)
            except NotImplementedError:
                if norm == "raw&decay":
                    warn('WARNING: raw&decay normalization not implemented '
                         'for matrices\n... skipping\n')
                    continue
                raise
            b1, e1, b2, e2 = bin_coords
            b1, e1 = 0, e1 - b1
            b2, e2 = 0, e2 - b2
            if opts.row_names:
                starts = [start1, start2]
                ends = [end1, end2]
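                # lazily yield one (chromosome, start, end) triplet per
                # matrix row, in bin order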
                row_names = ((reg, p + 1 , p + opts.reso) for r, reg in enumerate(regions)
                             for p in range(starts[r] if r < len(starts) and starts[r] else 0,
                                            ends[r] if r < len(ends) and ends[r] else sections[reg],
                                            opts.reso))
            if opts.matrix:
                printime(' - Writing: %s' % norm)
                fnam = '%s_%s_%s%s.mat' % (norm, name,
                                           nicer(opts.reso, sep=''),
                                           ('_' + param_hash))
                out_files[norm_string] = path.join(outdir, fnam)
                out = open(path.join(outdir, fnam), 'w')
                for reg in regions:
                    out.write('# CRM %s\t%d\n' % (reg, sections[reg]))
                if region2:
                    out.write('# BADROWS %s\n' % (','.join([str(b) for b in bads1])))
                    out.write('# BADCOLS %s\n' % (','.join([str(b) for b in bads2])))
                else:
                    out.write('# MASKED %s\n' % (','.join([str(b) for b in bads1])))
                if opts.row_names:
                    out.write('\n'.join('%s\t%d\t%d\t' % (row_names.next()) +
                                        '\t'.join(str(matrix.get((i, j), 0))
                                                  for i in xrange(b1, e1))
                                        for j in xrange(b2, e2)) + '\n')
                else:
                    out.write('\n'.join('\t'.join(str(matrix.get((i, j), 0))
                                                  for i in xrange(b1, e1))
                                        for j in xrange(b2, e2)) + '\n')
                out.close()
            if opts.plot:
                # transform matrix
                matrix = array([array([matrix.get((i, j), 0)
                                       for i in xrange(b1, e1)])
                                for j in xrange(b2, e2)])
                # mask filtered-out columns (bads1) and rows (bads2)
                m = zeros_like(matrix)
                for bad1 in bads1:
                    m[:, bad1] = 1
                for bad2 in bads2:
                    m[bad2, :] = 1
                matrix = ma.masked_array(matrix, m)
                printime(' - Plotting: %s' % norm)
                fnam = '%s_%s_%s%s%s.%s' % (
                    norm, name, nicer(opts.reso, sep=''),
                    ('_' + param_hash), '_tri' if opts.triangular else '',
                    opts.format)
                out_plots[norm_string] = path.join(outdir, fnam)
                pltbeg1 = 0 if start1 is None else start1
                pltend1 = sections[regions[0]] if end1 is None else end1
                pltbeg2 = 0 if start2 is None else start2
                pltend2 = sections[regions[-1]] if end2 is None else end2
                xlabel = '{}:{:,}-{:,}'.format(
                    regions[0], pltbeg1 if pltbeg1 else 1, pltend1)
                ylabel = '{}:{:,}-{:,}'.format(
                    regions[-1], pltbeg2 if pltbeg2 else 1, pltend2)
                section_pos = OrderedDict((k, section_pos[k])
                                          for k in section_pos if k in regions)
                ax1, _ = plot_HiC_matrix(
                    matrix, triangular=opts.triangular,
                    vmin=vmin, vmax=vmax, cmap=opts.cmap,
                    figsize=opts.figsize,
                    bad_color=opts.bad_color if norm != 'raw' else None)
                ax1.set_title('Region: %s, normalization: %s, resolution: %s' % (
                    name, norm, nicer(opts.reso)), y=1.05)
                _format_axes(ax1, start1, end1, start2, end2, opts.reso,
                             regions, section_pos, sections,
                             opts.xtick_rotation, triangular=False)
                if opts.interactive:
                    plt.show()
                    plt.close('all')
                else:
                    tadbit_savefig(path.join(outdir, fnam))
    if not opts.matrix and not opts.only_plot:
        printime('Getting and writing matrices')
        out_files.update(write_matrix(
            mreads, opts.reso,
            load(open(biases)) if biases else None,
            outdir, filter_exclude=opts.filter,
            normalizations=opts.normalizations,
            region1=region1, start1=start1, end1=end1,
            region2=region2, start2=start2, end2=end2,
            tmpdir=tmpdir, append_to_tar=None, ncpus=opts.cpus,
            nchunks=opts.nchunks, verbose=not opts.quiet,
            extra=param_hash, clean=clean))

    if clean:
        printime('Cleaning')
        system('rm -rf %s '% tmpdir)

    if not opts.interactive:
        printime('Saving to DB')
        finish_time = time.localtime()
        save_to_db(opts, launch_time, finish_time, out_files, out_plots)
Example No. 20
0
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)
    if opts.bam:
        mreads = path.realpath(opts.bam)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))

    filter_exclude = opts.filter

    outdir = path.join(opts.workdir, '04_normalization')
    mkdir(outdir)

    mappability = gc_content = n_rsites = None
    if opts.normalization == 'oneD':
        if not opts.fasta:
            raise Exception('ERROR: missing path to FASTA for oneD normalization')
        if not opts.renz:
            raise Exception('ERROR: missing restriction enzyme name for oneD normalization')
        if not opts.mappability:
            raise Exception('ERROR: missing path to mappability for oneD normalization')
        bamfile = AlignmentFile(mreads, 'rb')
        refs = bamfile.references
        bamfile.close()

        # get genome sequence ~1 min
        printime('  - parsing FASTA')
        genome = parse_fasta(opts.fasta, verbose=False)

        fas = set(genome.keys())
        bam = set(refs)
        if fas - bam:
            print 'WARNING: %d extra chromosomes in FASTA (removing them)' % (len(fas - bam))
            if len(fas - bam) <= 50:
                print '\n'.join([('  - ' + c) for c in (fas - bam)])
        if bam - fas:
            txt = ('\n'.join([('  - ' + c) for c in (bam - fas)])
                   if len(bam - fas) <= 50 else '')
            raise Exception('ERROR: %d extra chromosomes in BAM (remove them):\n%s\n' % (
                len(bam - fas), txt))
        refs = [crm for crm in refs if crm in genome]
        if len(refs) == 0:
            raise Exception("ERROR: chromosomes in FASTA different the ones"
                            " in BAM")

        # get mappability ~2 min
        printime('  - Parsing mappability')
        mappability = parse_mappability_bedGraph(
            opts.mappability, opts.reso,
            wanted_chrom=refs[0] if len(refs)==1 else None)
        # resize chromosomes to the expected number of bins
        for c in refs:
            if c not in mappability:
                mappability[c] = [float('nan')] * (len(genome[c]) / opts.reso + 1)
            if len(mappability[c]) < len(genome[c]) / opts.reso + 1:
                mappability[c] += [float('nan')] * (
                    (len(genome[c]) / opts.reso + 1) - len(mappability[c]))
        # concatenates
        mappability = reduce(lambda x, y: x + y,
                             (mappability.get(c, []) for c in refs))

        printime('  - Computing GC content per bin (removing Ns)')
        gc_content = get_gc_content(genome, opts.reso, chromosomes=refs,
                                    n_cpus=opts.cpus)
        # compute r_sites ~30 sec
        # TODO: read from DB
        printime('  - Computing number of RE sites per bin (+/- 200 bp)')
        n_rsites  = []
        re_site = RESTRICTION_ENZYMES[opts.renz].replace('|', '')
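        # count restriction sites per bin, padding the window by 200 bp on
        # each side so that sites close to bin borders are not missed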
        for crm in refs:
            for pos in xrange(200, len(genome[crm]) + 200, opts.reso):
                seq = genome[crm][pos-200:pos + opts.reso + 200]
                n_rsites.append(seq.count(re_site))

    biases, decay, badcol, raw_cisprc, norm_cisprc = read_bam(
        mreads, filter_exclude, opts.reso, min_count=opts.min_count, sigma=2,
        factor=1, outdir=outdir, extra_out=param_hash, ncpus=opts.cpus,
        normalization=opts.normalization, mappability=mappability,
        p_fit=opts.p_fit, cg_content=gc_content, n_rsites=n_rsites,
        min_perc=opts.min_perc, max_perc=opts.max_perc, seed=opts.seed,
        normalize_only=opts.normalize_only, max_njobs=opts.max_njobs,
        extra_bads=opts.badcols, biases_path=opts.biases_path)
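    # (read_bam returned, in order: the per-bin biases, the decay curve, the
    # filtered-out columns, and the raw and normalized cis percentages)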

    bad_col_image = path.join(outdir, 'filtered_bins_%s_%s.png' % (
        nicer(opts.reso).replace(' ', ''), param_hash))

    inter_vs_gcoord = path.join(opts.workdir, '04_normalization',
                                'interactions_vs_genomic-coords_%s_%s.png' % (
                                    opts.reso, param_hash))

    # get and plot decay
    if not opts.normalize_only:
        printime('  - Computing interaction decay vs genomic distance')
        (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
            decay, max_diff=10000, resolution=opts.reso, normalized=not opts.filter_only,
            savefig=inter_vs_gcoord)

        print ('    -> Decay slope 0.7-10 Mb\t%s' % a2)
    else:
        a2 = 0.

    printime('  - Saving biases and badcol columns')
    # biases
    bias_file = path.join(outdir, 'biases_%s_%s.pickle' % (
        nicer(opts.reso).replace(' ', ''), param_hash))
    out = open(bias_file, 'wb')  # pickle is binary; write in binary mode

    dump({'biases'    : biases,
          'decay'     : decay,
          'badcol'    : badcol,
          'resolution': opts.reso}, out, HIGHEST_PROTOCOL)
    out.close()

    finish_time = time.localtime()

    try:
        save_to_db(opts, bias_file, mreads, bad_col_image,
                   len(badcol), len(biases), raw_cisprc, norm_cisprc,
                   inter_vs_gcoord, a2, opts.filter,
                   launch_time, finish_time)
    except:
        # release lock anyway
        print_exc()
        try:
            remove(path.join(opts.workdir, '__lock_db'))
        except OSError:
            pass
        exit(1)
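For reference, reading the biases pickle back is the mirror of the dump
above; a minimal sketch, assuming the keys written in this example
(Python 2, like the rest of the code):

from cPickle import load

with open(bias_file, 'rb') as in_f:
    dat = load(in_f)
biases = dat['biases']    # per-bin normalization bias
decay  = dat['decay']     # interaction decay with genomic distance
badcol = dat['badcol']    # filtered-out (bad) columns
assert dat['resolution'] == opts.reso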
Example No. 21
0
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)
    if opts.bed:
        mreads = path.realpath(opts.bed)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))

    print 'loading', mreads
    hic_data = load_hic_data_from_reads(mreads, opts.reso)

    mkdir(path.join(opts.workdir, '04_normalization'))

    print 'Get poor bins...'
    try:
        hic_data.filter_columns(
            perc_zero=opts.perc_zeros,
            draw_hist=True,
            by_mean=not opts.fast_filter,
            savefig=path.join(
                opts.workdir, '04_normalization', 'bad_columns_%s_%d_%s.pdf' %
                (opts.reso, opts.perc_zeros, param_hash))
            if not opts.fast_filter else None)
    except ValueError:
        hic_data.filter_columns(
            perc_zero=100,
            draw_hist=True,
            by_mean=not opts.fast_filter,
            savefig=path.join(
                opts.workdir, '04_normalization', 'bad_columns_%s_%d_%s.pdf' %
                (opts.reso, opts.perc_zeros, param_hash))
            if not opts.fast_filter else None)

    # bad columns
    bad_columns_file = path.join(
        opts.workdir, '04_normalization',
        'bad_columns_%s_%d_%s.tsv' % (opts.reso, opts.perc_zeros, param_hash))
    out_bad = open(bad_columns_file, 'w')
    out_bad.write('\n'.join([str(i) for i in hic_data.bads.keys()]))
    out_bad.close()

    # Identify biases
    print 'Get biases using ICE...'
    hic_data.normalize_hic(silent=False,
                           max_dev=0.1,
                           iterations=0,
                           factor=opts.factor)

    print 'Getting cis/trans...'
    cis_trans_N_D = hic_data.cis_trans_ratio(normalized=True, diagonal=True)
    cis_trans_n_D = hic_data.cis_trans_ratio(normalized=False, diagonal=True)
    cis_trans_N_d = hic_data.cis_trans_ratio(normalized=True, diagonal=False)
    cis_trans_n_d = hic_data.cis_trans_ratio(normalized=False, diagonal=False)

    print 'Cis/Trans ratio of normalized matrix including the diagonal', cis_trans_N_D
    print 'Cis/Trans ratio of normalized matrix excluding the diagonal', cis_trans_N_d
    print 'Cis/Trans ratio of raw matrix including the diagonal', cis_trans_n_D
    print 'Cis/Trans ratio of raw matrix excluding the diagonal', cis_trans_n_d

    # Plot genomic distance vs interactions
    print 'Plot genomic distance vs interactions...'
    inter_vs_gcoord = path.join(
        opts.workdir, '04_normalization',
        'interactions_vs_genomic-coords_%s_%s.pdf' %
        (opts.reso, param_hash))
    (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
        hic_data,
        max_diff=10000,
        resolution=opts.reso,
        normalized=True,
        savefig=inter_vs_gcoord)

    print 'Decay slope 0.7-10 Mb\t%s' % a2

    # write biases
    bias_file = path.join(opts.workdir, '04_normalization',
                          'bias_%s_%s.tsv' % (opts.reso, param_hash))
    out_bias = open(bias_file, 'w')
    out_bias.write(
        '\n'.join(['%d\t%f' % (i, hic_data.bias[i])
                   for i in hic_data.bias]) + '\n')
    out_bias.close()

    # to feed the save_to_db function
    intra_dir_nrm_fig = intra_dir_nrm_txt = None
    inter_dir_nrm_fig = inter_dir_nrm_txt = None
    genom_map_nrm_fig = genom_map_nrm_txt = None
    intra_dir_raw_fig = intra_dir_raw_txt = None
    inter_dir_raw_fig = inter_dir_raw_txt = None
    genom_map_raw_fig = genom_map_raw_txt = None

    if "intra" in opts.keep:
        print "  Saving intra chromosomal raw and normalized matrices..."
        if opts.only_txt:
            intra_dir_nrm_fig = None
            intra_dir_raw_fig = None
        else:
            intra_dir_nrm_fig = path.join(
                opts.workdir, '04_normalization',
                'intra_chromosome_nrm_images_%s_%s' % (opts.reso, param_hash))
            intra_dir_raw_fig = path.join(
                opts.workdir, '04_normalization',
                'intra_chromosome_raw_images_%s_%s' % (opts.reso, param_hash))
        intra_dir_nrm_txt = path.join(
            opts.workdir, '04_normalization',
            'intra_chromosome_nrm_matrices_%s_%s' % (opts.reso, param_hash))
        intra_dir_raw_txt = path.join(
            opts.workdir, '04_normalization',
            'intra_chromosome_raw_matrices_%s_%s' % (opts.reso, param_hash))
        hic_map(hic_data,
                normalized=True,
                by_chrom='intra',
                cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=intra_dir_nrm_fig,
                savedata=intra_dir_nrm_txt)
        hic_map(hic_data,
                normalized=False,
                by_chrom='intra',
                cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=intra_dir_raw_fig,
                savedata=intra_dir_raw_txt)

    if "inter" in opts.keep:
        print "  Saving inter chromosomal raw and normalized matrices..."
        if opts.only_txt:
            inter_dir_nrm_fig = None
            inter_dir_raw_fig = None
        else:
            inter_dir_nrm_fig = path.join(
                opts.workdir, '04_normalization',
                'inter_chromosome_nrm_images_%s_%s' % (opts.reso, param_hash))
            inter_dir_raw_fig = path.join(
                opts.workdir, '04_normalization',
                'inter_chromosome_raw_images_%s_%s' % (opts.reso, param_hash))
        inter_dir_nrm_txt = path.join(
            opts.workdir, '04_normalization',
            'inter_chromosome_nrm_matrices_%s_%s' % (opts.reso, param_hash))
        inter_dir_raw_txt = path.join(
            opts.workdir, '04_normalization',
            'inter_chromosome_raw_matrices_%s_%s' % (opts.reso, param_hash))
        hic_map(hic_data,
                normalized=True,
                by_chrom='inter',
                cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=inter_dir_nrm_fig,
                savedata=inter_dir_nrm_txt)
        hic_map(hic_data,
                normalized=False,
                by_chrom='inter',
                cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=inter_dir_raw_fig,
                savedata=inter_dir_raw_txt)

    if "genome" in opts.keep:
        print "  Saving normalized genomic matrix..."
        if opts.only_txt:
            genom_map_nrm_fig = path.join(
                opts.workdir, '04_normalization',
                'genomic_maps_nrm_%s_%s.pdf' % (opts.reso, param_hash))
            genom_map_raw_fig = path.join(
                opts.workdir, '04_normalization',
                'genomic_maps_raw_%s_%s.pdf' % (opts.reso, param_hash))
        else:
            genom_map_nrm_fig = None
            genom_map_raw_fig = None
        genom_map_nrm_txt = path.join(
            opts.workdir, '04_normalization',
            'genomic_nrm_%s_%s.tsv' % (opts.reso, param_hash))
        genom_map_raw_txt = path.join(
            opts.workdir, '04_normalization',
            'genomic_raw_%s_%s.tsv' % (opts.reso, param_hash))
        hic_map(hic_data,
                normalized=True,
                cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=genom_map_nrm_fig,
                savedata=genom_map_nrm_txt)
        hic_map(hic_data,
                normalized=False,
                cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=genom_map_raw_fig,
                savedata=genom_map_raw_txt)

    finish_time = time.localtime()

    save_to_db(opts, cis_trans_N_D, cis_trans_N_d, cis_trans_n_D,
               cis_trans_n_d, a2, bad_columns_file, bias_file, inter_vs_gcoord,
               mreads, intra_dir_nrm_fig, intra_dir_nrm_txt, inter_dir_nrm_fig,
               inter_dir_nrm_txt, genom_map_nrm_fig, genom_map_nrm_txt,
               intra_dir_raw_fig, intra_dir_raw_txt, inter_dir_raw_fig,
               inter_dir_raw_txt, genom_map_raw_fig, genom_map_raw_txt,
               launch_time, finish_time)
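
The four cis/trans calls above differ only in two boolean flags; the same report can be produced with one loop over both flags (a sketch reusing the hic_data object and method from the example above):

for normalized in (True, False):
    for diagonal in (True, False):
        ratio = hic_data.cis_trans_ratio(normalized=normalized, diagonal=diagonal)
        print('Cis/Trans ratio of %s matrix %s the diagonal: %s' % (
            'normalized' if normalized else 'raw',
            'including' if diagonal else 'excluding', ratio))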
Example #22
def save_to_db(opts, counts, multis, f_names1, f_names2, out_file1, out_file2,
               launch_time, finish_time):
    con = lite.connect(path.join(opts.workdir, 'trace.db'))
    with con:
        cur = con.cursor()
        cur.execute("""SELECT name FROM sqlite_master WHERE
                       type='table' AND name='PARSED_OUTPUTs'""")
        if not cur.fetchall():
            cur.execute("""
        create table MAPPED_OUTPUTs
           (Id integer primary key,
            PATHid int,
            BEDid int,
            Uniquely_mapped int,
            unique (PATHid, BEDid))""")
            cur.execute("""
        create table PARSED_OUTPUTs
           (Id integer primary key,
            PATHid int,
            Total_interactions int,
            Multiples int,
            unique (PATHid))""")
        try:
            parameters = digest_parameters(opts, get_md5=False)
            param_hash = digest_parameters(opts, get_md5=True )
            cur.execute("""
    insert into JOBs
     (Id  , Parameters, Launch_time, Finish_time,    Type, Parameters_md5)
    values
     (NULL,       '%s',        '%s',        '%s', 'Parse',           '%s')
     """ % (parameters,
            time.strftime("%d/%m/%Y %H:%M:%S", launch_time),
            time.strftime("%d/%m/%Y %H:%M:%S", finish_time), param_hash))
        except lite.IntegrityError:
            pass
        jobid = get_jobid(cur)
        add_path(cur, out_file1, 'BED', jobid, opts.workdir)
        for genome in opts.genome:
            add_path(cur, genome, 'FASTA', jobid, opts.workdir)
        if out_file2:
            add_path(cur, out_file2, 'BED', jobid, opts.workdir)
        fnames = f_names1, f_names2
        outfiles = out_file1, out_file2
        for count in counts:
            try:
                sum_reads = 0
                for i, item in enumerate(counts[count]):
                    cur.execute("""
                    insert into MAPPED_OUTPUTs
                    (Id  , PATHid, BEDid, Uniquely_mapped)
                    values
                    (NULL,    %d,     %d,      %d)
                    """ % (get_path_id(cur, fnames[count][i], opts.workdir),
                           get_path_id(cur, outfiles[count], opts.workdir),
                           counts[count][item]))
                    sum_reads += counts[count][item]
            except lite.IntegrityError:
                print 'WARNING: already parsed (MAPPED_OUTPUTs)'
            try:
                cur.execute("""
                insert into PARSED_OUTPUTs
                (Id  , PATHid, Total_interactions, Multiples)
                values
                (NULL,     %d,      %d,        %d)
                """ % (get_path_id(cur, outfiles[count], opts.workdir),
                       sum_reads, multis[count]))
            except lite.IntegrityError:
                print 'WARNING: already parsed (PARSED_OUTPUTs)'
        print_db(cur, 'MAPPED_INPUTs')
        print_db(cur, 'PATHs')
        print_db(cur, 'MAPPED_OUTPUTs')
        print_db(cur, 'PARSED_OUTPUTs')
        print_db(cur, 'JOBs')
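
The inserts above splice values into the SQL text with % formatting; sqlite3 also accepts '?' placeholders, which avoids manual quoting. A sketch of the PARSED_OUTPUTs insert from above with parameter binding (the three variables are illustrative stand-ins for the values computed in the loop):

cur.execute("""insert into PARSED_OUTPUTs
               (Id, PATHid, Total_interactions, Multiples)
               values (NULL, ?, ?, ?)""",
            (path_id, sum_reads, n_multiples))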
Example #23
def save_to_db(opts, dangling_ends, ligated, fig_path, outfiles, launch_time,
               finish_time):
    """
    write little DB to keep track of processes and options
    """
    if 'tmpdb' in opts and opts.tmpdb:
        # check lock
        while path.exists(path.join(opts.workdir, '__lock_db')):
            time.sleep(0.5)
        # close lock
        open(path.join(opts.workdir, '__lock_db'), 'a').close()
        # tmp file
        dbfile = opts.tmpdb
        try:  # copy any existing DB (e.g. if read1 was already mapped)
            copyfile(path.join(opts.workdir, 'trace.db'), dbfile)
        except IOError:
            pass
    else:
        dbfile = path.join(opts.workdir, 'trace.db')
    con = lite.connect(dbfile)
    with con:
        # check if table exists
        cur = con.cursor()
        cur.execute("""SELECT name FROM sqlite_master WHERE
                       type='table' AND name='MAPPED_INPUTs'""")
        if not cur.fetchall():
            try:
                cur.execute("""
                create table PATHs
                (Id integer primary key,
                JOBid int, Path text, Type text,
                unique (Path))""")
            except lite.OperationalError:
                pass  # may happen when mapped files were cleaned
            cur.execute("""
            create table JOBs
               (Id integer primary key,
                Parameters text,
                Launch_time text,
                Finish_time text,
                Type text,
                Parameters_md5 text,
                unique (Parameters_md5))""")
            cur.execute("""
            create table MAPPED_INPUTs
               (Id integer primary key,
                PATHid int,
                Entries int,
                Trim text,
                Frag text,
                Read int,
                Enzyme text,
                Dangling_Ends text,
                Ligation_Sites text,
                WRKDIRid int,
                MAPPED_OUTPUTid int,
                INDEXid int,
                unique (PATHid,Entries,Read,Enzyme,WRKDIRid,MAPPED_OUTPUTid,INDEXid))"""
                        )

        try:
            parameters = digest_parameters(opts, get_md5=False)
            param_hash = digest_parameters(opts, get_md5=True)
            cur.execute("""
    insert into JOBs
     (Id  , Parameters, Launch_time, Finish_time, Type , Parameters_md5)
    values
     (NULL,       '%s',        '%s',        '%s', 'Map',           '%s')
     """ % (parameters, time.strftime("%d/%m/%Y %H:%M:%S", launch_time),
            time.strftime("%d/%m/%Y %H:%M:%S", finish_time), param_hash))
        except lite.IntegrityError:
            pass
        jobid = get_jobid(cur)
        add_path(cur, opts.workdir, 'WORKDIR', jobid)
        add_path(cur, opts.fastq, 'MAPPED_FASTQ', jobid, opts.workdir)
        add_path(cur, opts.index, 'INDEX', jobid, opts.workdir)
        add_path(cur, fig_path, 'FIGURE', jobid, opts.workdir)
        for i, (out, num) in enumerate(outfiles):
            try:
                window = opts.windows[i]
            except IndexError:
                window = opts.windows[-1]
            except TypeError:
                window = 'None'
            add_path(cur, out, '2D_BED' if opts.read == 0 else 'SAM/MAP',
                     jobid, opts.workdir)
            frag = ('none' if opts.iterative else 'fast_frag' if opts.read == 0
                    else 'frag' if i == len(outfiles) - 1 else 'full')
            try:
                cur.execute("""
    insert into MAPPED_INPUTs
     (Id  , PATHid, Entries, Trim, Frag, Read, Enzyme, Dangling_Ends, Ligation_Sites, WRKDIRid, MAPPED_OUTPUTid, INDEXid)
    values
     (NULL,      %d,     %d, '%s', '%s',   %d,   '%s',         '%s',          '%s',       %d,              %d,      %d)
     """ % (get_path_id(cur, opts.fastq, opts.workdir), num, window, frag,
                opts.read, '-'.join(map(str, opts.renz)), ' '.join(
                '%s:%.3f%%' % (r, dangling_ends.get(r, float('nan')))
                for r in opts.renz), ' '.join(
                    '%s:%.3f%%' % ('-'.join(r), ligated.get(r, float('nan')))
                    for r in ligated), get_path_id(cur, opts.workdir),
                get_path_id(cur, out, opts.workdir),
                get_path_id(cur, opts.index, opts.workdir)))
            except lite.IntegrityError:
                pass
        print_db(cur, 'MAPPED_INPUTs')
        print_db(cur, 'PATHs')
        print_db(cur, 'JOBs')
    if 'tmpdb' in opts and opts.tmpdb:
        # copy back file
        copyfile(dbfile, path.join(opts.workdir, 'trace.db'))
        remove(dbfile)
    # release lock
    try:
        remove(path.join(opts.workdir, '__lock_db'))
    except OSError:
        pass
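
The lock-file sequence above (wait for '__lock_db', create it, remove it when done) recurs in every save_to_db variant below; a sketch of the same pattern factored into a context manager (the helper name is illustrative):

import time
from contextlib import contextmanager
from os import path, remove

@contextmanager
def db_lock(workdir):
    lock_file = path.join(workdir, '__lock_db')
    while path.exists(lock_file):  # wait until other jobs release the lock
        time.sleep(0.5)
    open(lock_file, 'a').close()   # take the lock
    try:
        yield
    finally:
        try:
            remove(lock_file)      # always release, even on error
        except OSError:
            pass

# usage:
#   with db_lock(opts.workdir):
#       ...all trace.db reads and writes...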
Example #24
def save_to_db(opts, counts, multis, f_names1, f_names2, out_file1, out_file2,
               launch_time, finish_time):
    if 'tmpdb' in opts and opts.tmpdb:
        # check lock
        while path.exists(path.join(opts.workdir, '__lock_db')):
            time.sleep(0.5)
        # close lock
        open(path.join(opts.workdir, '__lock_db'), 'a').close()
        # tmp file
        dbfile = opts.tmpdb
        try:  # copy any existing DB (e.g. if read1 was already mapped)
            copyfile(path.join(opts.workdir, 'trace.db'), dbfile)
        except IOError:
            pass
    else:
        dbfile = path.join(opts.workdir, 'trace.db')
    con = lite.connect(dbfile)
    with con:
        cur = con.cursor()
        cur.execute("""SELECT name FROM sqlite_master WHERE
                       type='table' AND name='PARSED_OUTPUTs'""")
        if not cur.fetchall():
            cur.execute("""
        create table MAPPED_OUTPUTs
           (Id integer primary key,
            PATHid int,
            BEDid int,
            Uniquely_mapped int,
            unique (PATHid, BEDid))""")
            cur.execute("""
        create table PARSED_OUTPUTs
           (Id integer primary key,
            PATHid int,
            Total_interactions int,
            Multiples text,
            unique (PATHid))""")
        try:
            parameters = digest_parameters(opts, get_md5=False)
            param_hash = digest_parameters(opts, get_md5=True )
            cur.execute("""
    insert into JOBs
     (Id  , Parameters, Launch_time, Finish_time,    Type, Parameters_md5)
    values
     (NULL,       '%s',        '%s',        '%s', 'Parse',           '%s')
     """ % (parameters,
            time.strftime("%d/%m/%Y %H:%M:%S", launch_time),
            time.strftime("%d/%m/%Y %H:%M:%S", finish_time), param_hash))
        except lite.IntegrityError:
            pass
        jobid = get_jobid(cur)
        add_path(cur, out_file1, 'BED', jobid, opts.workdir)
        for genome in opts.genome:
            add_path(cur, genome, 'FASTA', jobid, opts.workdir)
        if out_file2:
            add_path(cur, out_file2, 'BED', jobid, opts.workdir)
        fnames = f_names1, f_names2
        outfiles = out_file1, out_file2
        for count in counts:
            try:
                sum_reads = 0
                for i, item in enumerate(counts[count]):
                    cur.execute("""
                    insert into MAPPED_OUTPUTs
                    (Id  , PATHid, BEDid, Uniquely_mapped)
                    values
                    (NULL,    %d,     %d,      %d)
                    """ % (get_path_id(cur, fnames[count][i], opts.workdir),
                           get_path_id(cur, outfiles[count], opts.workdir),
                           counts[count][item]))
                    sum_reads += counts[count][item]
            except lite.IntegrityError:
                print 'WARNING: already parsed (MAPPED_OUTPUTs)'
            try:
                cur.execute("""
                insert into PARSED_OUTPUTs
                (Id  , PATHid, Total_interactions, Multiples)
                values
                (NULL,     %d,      %d,        '%s')
                """ % (get_path_id(cur, outfiles[count], opts.workdir),
                       sum_reads, ','.join([':'.join(map(str, (n, multis[count][n])))
                                            for n in multis[count] if n])))
            except lite.IntegrityError:
                print 'WARNING: already parsed (PARSED_OUTPUTs)'

        print_db(cur, 'MAPPED_INPUTs')
        print_db(cur, 'PATHs')
        print_db(cur, 'MAPPED_OUTPUTs')
        print_db(cur, 'PARSED_OUTPUTs')
        print_db(cur, 'JOBs')
    if 'tmpdb' in opts and opts.tmpdb:
        # copy back file
        copyfile(dbfile, path.join(opts.workdir, 'trace.db'))
        remove(dbfile)
    # release lock
    try:
        remove(path.join(opts.workdir, '__lock_db'))
    except OSError:
        pass
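
Both save_to_db variants above copy trace.db to a temporary path before writing and copy it back afterwards; a sketch of that round trip as two helpers (names illustrative):

from shutil import copyfile
from os import path, remove

def checkout_db(workdir, tmpdb):
    """Copy the trace DB to temporary storage before heavy writes."""
    try:
        copyfile(path.join(workdir, 'trace.db'), tmpdb)
    except IOError:
        pass  # no DB yet, e.g. on the very first job
    return tmpdb

def checkin_db(workdir, tmpdb):
    """Copy the temporary DB back and drop the working copy."""
    copyfile(tmpdb, path.join(workdir, 'trace.db'))
    remove(tmpdb)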
Example #25
def save_to_db(opts, count, multiples, reads, mreads, n_valid_pairs, masked,
               outbam, hist_path, median, max_f, mad, launch_time,
               finish_time):
    if 'tmpdb' in opts and opts.tmpdb:
        # check lock
        while path.exists(path.join(opts.workdir, '__lock_db')):
            time.sleep(0.5)
        # close lock
        open(path.join(opts.workdir, '__lock_db'), 'a').close()
        # tmp file
        dbfile = opts.tmpdb
        try:  # copy any existing DB (e.g. if read1 was already mapped)
            copyfile(path.join(opts.workdir, 'trace.db'), dbfile)
        except IOError:
            pass
    else:
        dbfile = path.join(opts.workdir, 'trace.db')
    con = lite.connect(dbfile)
    with con:
        cur = con.cursor()
        cur.execute("""SELECT name FROM sqlite_master WHERE
                       type='table' AND name='INTERSECTION_OUTPUTs'""")
        if not cur.fetchall():
            cur.execute("""
        create table INTERSECTION_OUTPUTs
           (Id integer primary key,
            PATHid int,
            Total_interactions int,
            Multiple_interactions text,
            Median_fragment_length,
            MAD_fragment_length,
            Max_fragment_length,
            unique (PATHid))""")
            cur.execute("""
        create table FILTER_OUTPUTs
           (Id integer primary key,
            PATHid int,
            Name text,
            Count int,
            Applied text,
            JOBid int,
            unique (PATHid))""")
        try:
            parameters = digest_parameters(opts, get_md5=False)
            param_hash = digest_parameters(opts, get_md5=True)
            cur.execute("""
    insert into JOBs
     (Id  , Parameters, Launch_time, Finish_time,    Type, Parameters_md5)
    values
     (NULL,       '%s',        '%s',        '%s', 'Filter',           '%s')
     """ % (parameters, time.strftime("%d/%m/%Y %H:%M:%S", launch_time),
            time.strftime("%d/%m/%Y %H:%M:%S", finish_time), param_hash))
        except lite.IntegrityError:
            pass

        jobid = get_jobid(cur)

        add_path(cur, mreads, '2D_BED', jobid, opts.workdir)
        add_path(cur, outbam, 'HIC_BAM', jobid, opts.workdir)
        add_path(cur, outbam + '.bai', 'HIC_BAI', jobid, opts.workdir)
        add_path(cur, reads, '2D_BED', jobid, opts.workdir)
        add_path(cur, hist_path, 'FIGURE', jobid, opts.workdir)
        try:
            real_count = count
            for mult in multiples:
                real_count = real_count - multiples[mult] + multiples[mult] * (
                    (mult * (mult + 1)) // 2)
            cur.execute(
                """
            insert into INTERSECTION_OUTPUTs
            (Id  , PATHid, Total_interactions, Multiple_interactions, Median_fragment_length, MAD_fragment_length, Max_fragment_length)
            values
            (NULL,    %d,                  %d,                  '%s',                     %d,                  %d,                  %d)
            """ %
                (get_path_id(cur, mreads, opts.workdir), real_count, ' '.join(
                    ['%s:%d' % (k, multiples[k])
                     for k in sorted(multiples)]), median, mad, max_f))
        except lite.IntegrityError:
            print('WARNING: already filtered')
            if opts.force:
                cur.execute(
                    'delete from INTERSECTION_OUTPUTs where PATHid = %d' %
                    (get_path_id(cur, mreads, opts.workdir)))
                cur.execute("""
                insert into INTERSECTION_OUTPUTs
                (Id  , PATHid, Total_interactions, Multiple_interactions, Median_fragment_length, MAD_fragment_length, Max_fragment_length)
                values
                (NULL,    %d,                  %d,                  '%s',                     %d,                  %d,                  %d)
                """ % (get_path_id(cur, mreads, opts.workdir), count, ' '.join(
                    ['%s:%d' % (k, multiples[k])
                     for k in sorted(multiples)]), median, mad, max_f))
        for nf, f in enumerate(masked, 1):
            try:
                add_path(cur, masked[f]['fnam'], 'FILTER', jobid, opts.workdir)
            except KeyError:
                continue
            try:
                cur.execute("""
            insert into FILTER_OUTPUTs
                (Id  , PATHid, Name, Count, Applied, JOBid)
            values
                (NULL,     %d, '%s',  '%s',    '%s',    %d)
                """ % (get_path_id(cur, masked[f]['fnam'], opts.workdir),
                       masked[f]['name'], masked[f]['reads'],
                       'True' if nf in opts.apply else 'False', jobid))
            except lite.IntegrityError:
                print('WARNING: already filtered')
                if opts.force:
                    cur.execute(
                        'delete from FILTER_OUTPUTs where PATHid = %d' %
                        (get_path_id(cur, masked[f]['fnam'], opts.workdir)))
                    cur.execute("""
                insert into FILTER_OUTPUTs
                    (Id  , PATHid, Name, Count, Applied, JOBid)
                values
                    (NULL,     %d, '%s',  '%s',    '%s',    %d)
                    """ % (get_path_id(cur, masked[f]['fnam'], opts.workdir),
                           masked[f]['name'], masked[f]['reads'],
                           'True' if nf in opts.apply else 'False', jobid))
        try:
            cur.execute("""
        insert into FILTER_OUTPUTs
            (Id  , PATHid, Name, Count, Applied, JOBid)
        values
            (NULL,     %d, '%s',  '%s',    '%s',    %d)
            """ % (get_path_id(cur, mreads, opts.workdir), 'valid-pairs',
                   n_valid_pairs, '', jobid))
        except lite.IntegrityError:
            print('WARNING: already filtered')
            if opts.force:
                cur.execute('delete from FILTER_OUTPUTs where PATHid = %d' %
                            (get_path_id(cur, mreads, opts.workdir)))
                cur.execute("""
                insert into FILTER_OUTPUTs
                (Id  , PATHid, Name, Count, Applied, JOBid)
                values
                (NULL,     %d, '%s',  '%s',    '%s',    %d)
                """ % (get_path_id(cur, mreads, opts.workdir), 'valid-pairs',
                       n_valid_pairs, '', jobid))
        print_db(cur, 'PATHs')
        if not opts.fast_fragment:
            print_db(cur, 'MAPPED_OUTPUTs')
            print_db(cur, 'PARSED_OUTPUTs')
        print_db(cur, 'JOBs')
        print_db(cur, 'INTERSECTION_OUTPUTs')
        print_db(cur, 'FILTER_OUTPUTs')
    if 'tmpdb' in opts and opts.tmpdb:
        # copy back file
        copyfile(dbfile, path.join(opts.workdir, 'trace.db'))
        remove(dbfile)
    # release lock
    try:
        remove(path.join(opts.workdir, '__lock_db'))
    except OSError:
        pass
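
The real_count loop above converts multi-contact entries into pairwise interactions: an entry with multiplicity m (m '|||'-separated extra fragments) spans m + 1 fragments, hence m * (m + 1) / 2 fragment pairs, so each such entry is counted that many times instead of once. A worked sketch (numbers illustrative):

def pairwise_contacts(count, multiples):
    real_count = count
    for mult in multiples:
        # replace the single count by mult * (mult + 1) // 2 pairwise contacts
        real_count = real_count - multiples[mult] + multiples[mult] * (
            (mult * (mult + 1)) // 2)
    return real_count

# 100 entries, 10 of them with multiplicity 2 (3 fragments => 3 pairs each):
# pairwise_contacts(100, {2: 10}) == 100 - 10 + 10 * 3 == 120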
Example #26
def run(opts):
    check_options(opts)

    launch_time = time.localtime()

    reads = [1] if opts.read == 1 else [2] if opts.read == 2 else [1, 2]
    f_names1, f_names2, renz = load_parameters_fromdb(opts, reads, opts.jobids)

    renz = renz.split('-')

    opts.workdir = path.abspath(opts.workdir)

    name = path.split(opts.workdir)[-1]

    param_hash = digest_parameters(opts)

    outdir = '02_parsed_reads'

    mkdir(path.join(opts.workdir, outdir))

    if not opts.read:
        out_file1 = path.join(opts.workdir, outdir, '%s_r1_%s.tsv' % (name, param_hash))
        out_file2 = path.join(opts.workdir, outdir, '%s_r2_%s.tsv' % (name, param_hash))
    elif opts.read == 1:
        out_file1 = path.join(opts.workdir, outdir, '%s_r1_%s.tsv' % (name, param_hash))
        out_file2 = None
        f_names2  = None
    elif opts.read == 2:
        out_file2 = None
        f_names1  = f_names2
        f_names2  = None
        out_file1 = path.join(opts.workdir, outdir, '%s_r2_%s.tsv' % (name, param_hash))

    logging.info('parsing genomic sequence')
    try:
        # allows the use of cPickle genome to make it faster
        genome = load(open(opts.genome[0]))
    except UnpicklingError:
        genome = parse_fasta(opts.genome, chr_regexp=opts.filter_chrom)

    if not opts.skip:
        logging.info('parsing reads in %s project', name)
        counts, multis = parse_map(f_names1, f_names2, out_file1=out_file1,
                                   out_file2=out_file2, re_name=renz, verbose=True,
                                   genome_seq=genome, compress=opts.compress_input)
    else:
        counts = {}
        counts[0] = {}
        fhandler = open(out_file1)
        for line in fhandler:
            if line.startswith('# MAPPED '):
                _, _, item, value = line.split()
                counts[0][item] = int(value)
            elif not line.startswith('#'):
                break
        multis = {}
        multis[0] = {}
        for line in fhandler:
            if '|||' in line:
                try:
                    multis[0][line.count('|||')] += 1
                except KeyError:
                    multis[0][line.count('|||')] = 1
        if out_file2:
            counts[1] = {}
            fhandler = open(out_file2)
            for line in fhandler:
                if line.startswith('# MAPPED '):
                    _, _, item, value = line.split()
                    counts[1][item] = int(value)
                elif not line.startswith('#'):
                    break
            multis[1] = {}
            for line in fhandler:
                if '|||' in line:
                    try:
                        multis[1][line.count('|||')] += 1
                    except KeyError:
                        multis[1][line.count('|||')] = 1

    # write machine log
    while path.exists(path.join(opts.workdir, '__lock_log')):
        time.sleep(0.5)
    open(path.join(opts.workdir, '__lock_log'), 'a').close()
    with open(path.join(opts.workdir, 'trace.log'), "a") as mlog:
        for read in counts:
            for item in counts[read]:
                mlog.write('# PARSED READ%s PATH\t%d\t%s\n' % (
                    read, counts[read][item],
                    out_file1 if read == 0 else out_file2))
    # release lock
    try:
        remove(path.join(opts.workdir, '__lock_log'))
    except OSError:
        pass

    finish_time = time.localtime()

    # save all job information to sqlite DB
    save_to_db(opts, counts, multis, f_names1, f_names2, out_file1, out_file2,
               launch_time, finish_time)
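
The skip branch above tallies multi-mapped entries by counting '|||' separators per line; the same histogram can be written with collections.Counter (a sketch, not the project's code):

from collections import Counter

def count_multis(fhandler):
    """Histogram of lines by their number of '|||' separators."""
    return Counter(line.count('|||') for line in fhandler if '|||' in line)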
Example #27
def save_to_db(opts, cmp_result, tad_result, reso, inputs,
               launch_time, finish_time):
    if 'tmp' in opts and opts.tmp:
        # check lock
        while path.exists(path.join(opts.workdir, '__lock_db')):
            sleep(0.5)
        # close lock
        open(path.join(opts.workdir, '__lock_db'), 'a').close()
        # tmp file
        dbfile = opts.tmp
        copyfile(path.join(opts.workdir, 'trace.db'), dbfile)
    else:
        dbfile = path.join(opts.workdir, 'trace.db')
    con = lite.connect(dbfile)
    with con:
        cur = con.cursor()
        cur.execute("""SELECT name FROM sqlite_master WHERE
                       type='table' AND name='SEGMENT_OUTPUTs'""")
        if not cur.fetchall():
            cur.execute("""
            create table SEGMENT_OUTPUTs
               (Id integer primary key,
                JOBid int,
                Inputs text,
                TADs int,
                Compartments int,
                Chromosome text,
                Resolution int)""")
        try:
            parameters = digest_parameters(opts, get_md5=False)
            param_hash = digest_parameters(opts, get_md5=True )
            cur.execute("""
            insert into JOBs
            (Id  , Parameters, Launch_time, Finish_time, Type , Parameters_md5)
            values
            (NULL,       '%s',        '%s',        '%s', 'Segment',           '%s')
            """ % (parameters,
                   time.strftime("%d/%m/%Y %H:%M:%S", launch_time),
                   time.strftime("%d/%m/%Y %H:%M:%S", finish_time), param_hash))
        except lite.IntegrityError:
            pass
        jobid = get_jobid(cur)
        for crm in max(cmp_result.keys(), tad_result.keys(),
                       key=lambda x: len(x)):
            if crm in cmp_result:
                add_path(cur, cmp_result[crm]['path'], 'COMPARTMENT',
                         jobid, opts.workdir)
            if crm in tad_result:
                add_path(cur, tad_result[crm]['path'], 'TAD', jobid, opts.workdir)
            cur.execute("""
            insert into SEGMENT_OUTPUTs
            (Id  , JOBid, Inputs, TADs, Compartments, Chromosome, Resolution)
            values
            (NULL,    %d,   '%s',   %d,           %d,       '%s',         %d)
            """ % (jobid,
                   ','.join([str(i) for i in inputs]),
                   tad_result[crm]['num'] if crm in tad_result else 0,
                   cmp_result[crm]['num'] if crm in cmp_result else 0,
                   crm,
                   reso))
            print_db(cur, 'PATHs')
            print_db(cur, 'JOBs')
            print_db(cur, 'SEGMENT_OUTPUTs')
    if 'tmp' in opts and opts.tmp:
        # copy back file
        copyfile(dbfile, path.join(opts.workdir, 'trace.db'))
        remove(dbfile)
        # release lock
        remove(path.join(opts.workdir, '__lock_db'))
Example #28
def save_to_db(opts, cmp_result, tad_result, reso, inputs,
               richA_stats, firsts, param_hash,
               launch_time, finish_time):
    if 'tmpdb' in opts and opts.tmpdb:
        # check lock
        while path.exists(path.join(opts.workdir, '__lock_db')):
            time.sleep(0.5)
        # close lock
        open(path.join(opts.workdir, '__lock_db'), 'a').close()
        # tmp file
        dbfile = opts.tmpdb
        copyfile(path.join(opts.workdir, 'trace.db'), dbfile)
    else:
        dbfile = path.join(opts.workdir, 'trace.db')
    con = lite.connect(dbfile)
    with con:
        cur = con.cursor()
        cur.execute("""SELECT name FROM sqlite_master WHERE
                       type='table' AND name='JOBs'""")
        if not cur.fetchall():
            cur.execute("""
            create table PATHs
               (Id integer primary key,
                JOBid int, Path text, Type text,
                unique (Path))""")
            cur.execute("""
            create table JOBs
               (Id integer primary key,
                Parameters text,
                Launch_time text,
                Finish_time text,
                Type text,
                Parameters_md5 text,
                unique (Parameters_md5))""")
        cur.execute("""SELECT name FROM sqlite_master WHERE
                       type='table' AND name='SEGMENT_OUTPUTs'""")
        if not cur.fetchall():
            cur.execute("""
            create table SEGMENT_OUTPUTs
               (Id integer primary key,
                JOBid int,
                Inputs text,
                TADs int,
                Compartments int,
                richA_corr real,
                EV_index int,
                EValue real,
                Chromosome text,
                Resolution int)""")
        try:
            parameters = digest_parameters(opts, get_md5=False, extra=['fasta'])
            cur.execute("""
            insert into JOBs
            (Id  , Parameters, Launch_time, Finish_time, Type , Parameters_md5)
            values
            (NULL,       '%s',        '%s',        '%s', 'Segment',       '%s')
            """ % (parameters,
                   time.strftime("%d/%m/%Y %H:%M:%S", launch_time),
                   time.strftime("%d/%m/%Y %H:%M:%S", finish_time), param_hash))
        except lite.IntegrityError:
            pass
        jobid = get_jobid(cur)
        for ncrm, crm in enumerate(max(cmp_result.keys(), tad_result.keys(), key=len)):
            if crm in cmp_result:
                add_path(cur, cmp_result[crm]['path_cmprt1'], 'COMPARTMENT',
                         jobid, opts.workdir)
                add_path(cur, cmp_result[crm]['path_cmprt2'], 'COMPARTMENT',
                         jobid, opts.workdir)
                add_path(cur, cmp_result[crm]['image_cmprt'], 'FIGURE',
                         jobid, opts.workdir)
                if opts.savecorr:
                    add_path(cur, cmp_result[crm]['path_cormat'],
                             'CROSS_CORR_MAT', jobid, opts.workdir)
            if crm in tad_result:
                add_path(cur, tad_result[crm]['path'], 'TAD', jobid, opts.workdir)
            if opts.rich_in_A:
                add_path(cur, opts.rich_in_A, 'BED', jobid, opts.workdir)

            if crm in firsts:
                evalue = firsts[crm][0][(opts.ev_index[ncrm] - 1) if opts.ev_index else 0]
                eindex = opts.ev_index[ncrm] if opts.ev_index else 1
            else:
                evalue = 'NULL'
                eindex = 'NULL'
            try:
                cur.execute("""
                insert into SEGMENT_OUTPUTs
                (Id  , JOBid, Inputs, TADs, Compartments, richA_corr, EV_index, EValue, Chromosome, Resolution)
                values
                (NULL,    %d,   '%s',   %s,           %s,         %s,       %s,     %s,       '%s',         %d)
                """ % (jobid,
                       ','.join([str(i) for i in inputs]),
                       tad_result[crm]['num'] if crm in tad_result else 'NULL',
                       cmp_result[crm]['num'] if crm in cmp_result else 'NULL',
                       (richA_stats[crm] if crm in richA_stats
                        and richA_stats[crm] is not None else 'NULL'),
                       eindex, evalue, crm, reso))
            except lite.OperationalError:  # TODO: remove this
                print_exc()
                try:
                    cur.execute("alter table SEGMENT_OUTPUTs add column 'richA_corr' 'real'")
                except:
                    pass
                try:
                    cur.execute("alter table SEGMENT_OUTPUTs add column 'EValue' 'real'")
                except:
                    pass
                try:
                    cur.execute("alter table SEGMENT_OUTPUTs add column 'EV_index', 'int'")
                except:
                    pass
                cur.execute("""
                insert into SEGMENT_OUTPUTs
                (Id  , JOBid, Inputs, TADs, Compartments, richA_corr, EV_index, EValue, Chromosome, Resolution)
                values
                (NULL,    %d,   '%s',   %d,           %d,         %s,       %s,     %s,       '%s',         %d)
                """ % (jobid,
                       ','.join([str(i) for i in inputs]),
                       tad_result[crm]['num'] if crm in tad_result else 0,
                       cmp_result[crm]['num'] if crm in cmp_result else 0,
                       (richA_stats[crm] if crm in richA_stats
                        and richA_stats[crm] is not None else 'NULL'),
                       eindex, evalue, crm, reso))
        print_db(cur, 'PATHs')
        print_db(cur, 'JOBs')
        print_db(cur, 'SEGMENT_OUTPUTs')
    if 'tmpdb' in opts and opts.tmpdb:
        # copy back file
        copyfile(dbfile, path.join(opts.workdir, 'trace.db'))
        remove(dbfile)
        # release lock
        remove(path.join(opts.workdir, '__lock_db'))
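
The OperationalError fallback above retrofits missing columns with ALTER TABLE wrapped in bare try/except blocks; sqlite's PRAGMA table_info allows testing for a column first. A sketch (helper name illustrative):

def ensure_column(cur, table, column, coltype):
    """Add a column to an existing table only if it is missing."""
    cur.execute('PRAGMA table_info(%s)' % table)
    existing = [row[1] for row in cur.fetchall()]  # row[1] holds the column name
    if column not in existing:
        cur.execute('alter table %s add column %s %s' % (table, column, coltype))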
Example #29
def save_to_db(opts, counts, multis, f_names1, f_names2, out_file1, out_file2,
               launch_time, finish_time):
    if 'tmpdb' in opts and opts.tmpdb:
        # check lock
        while path.exists(path.join(opts.workdir, '__lock_db')):
            time.sleep(0.5)
        # close lock
        open(path.join(opts.workdir, '__lock_db'), 'a').close()
        # tmp file
        dbfile = opts.tmpdb
        try:  # copy any existing DB (e.g. if read1 was already mapped)
            copyfile(path.join(opts.workdir, 'trace.db'), dbfile)
        except IOError:
            pass
    else:
        dbfile = path.join(opts.workdir, 'trace.db')
    con = lite.connect(dbfile)
    with con:
        cur = con.cursor()
        cur.execute("""SELECT name FROM sqlite_master WHERE
                       type='table' AND name='PARSED_OUTPUTs'""")
        if not cur.fetchall():
            cur.execute("""
        create table MAPPED_OUTPUTs
           (Id integer primary key,
            PATHid int,
            BEDid int,
            Uniquely_mapped int,
            unique (PATHid, BEDid))""")
            cur.execute("""
        create table PARSED_OUTPUTs
           (Id integer primary key,
            PATHid int,
            Total_interactions int,
            Multiples text,
            unique (PATHid))""")
        cur.execute("""SELECT name FROM sqlite_master WHERE
                       type='table' AND name='JOBs'""")
        if not cur.fetchall():
            cur.execute("""
            create table PATHs
               (Id integer primary key,
                JOBid int, Path text, Type text,
                unique (Path))""")
            cur.execute("""
            create table JOBs
               (Id integer primary key,
                Parameters text,
                Launch_time text,
                Finish_time text,
                Type text,
                Parameters_md5 text,
                unique (Parameters_md5))""")
        try:
            parameters = digest_parameters(opts, get_md5=False)
            param_hash = digest_parameters(opts, get_md5=True)
            cur.execute("""
    insert into JOBs
     (Id  , Parameters, Launch_time, Finish_time,    Type, Parameters_md5)
    values
     (NULL,       '%s',        '%s',        '%s', 'Parse',           '%s')
     """ % (parameters, time.strftime("%d/%m/%Y %H:%M:%S", launch_time),
            time.strftime("%d/%m/%Y %H:%M:%S", finish_time), param_hash))
        except lite.IntegrityError:
            pass
        jobid = get_jobid(cur)
        add_path(cur, out_file1, 'BED', jobid, opts.workdir)
        for genome in opts.genome:
            add_path(cur, genome, 'FASTA', jobid, opts.workdir)
        if out_file2:
            add_path(cur, out_file2, 'BED', jobid, opts.workdir)
        fnames = f_names1, f_names2
        outfiles = out_file1, out_file2
        for count in counts:
            try:
                sum_reads = 0
                for i, item in enumerate(counts[count]):
                    add_path(cur, fnames[count][i], 'MAPPED_FASTQ', jobid,
                             opts.workdir)
                    cur.execute("""
                    insert into MAPPED_OUTPUTs
                    (Id  , PATHid, BEDid, Uniquely_mapped)
                    values
                    (NULL,    %d,     %d,      %d)
                    """ % (get_path_id(cur, fnames[count][i], opts.workdir),
                           get_path_id(cur, outfiles[count],
                                       opts.workdir), counts[count][item]))
                    sum_reads += counts[count][item]
            except lite.IntegrityError:
                print('WARNING: already parsed (MAPPED_OUTPUTs)')
            try:
                cur.execute("""
                insert into PARSED_OUTPUTs
                (Id  , PATHid, Total_interactions, Multiples)
                values
                (NULL,     %d,      %d,        '%s')
                """ % (get_path_id(
                    cur, outfiles[count], opts.workdir), sum_reads, ','.join([
                        ':'.join(map(str, (n, multis[count][n])))
                        for n in multis[count] if n
                    ])))
            except lite.IntegrityError:
                print('WARNING: already parsed (PARSED_OUTPUTs)')

        print_db(cur, 'PATHs')
        print_db(cur, 'MAPPED_OUTPUTs')
        print_db(cur, 'PARSED_OUTPUTs')
        print_db(cur, 'JOBs')
    if 'tmpdb' in opts and opts.tmpdb:
        # copy back file
        copyfile(dbfile, path.join(opts.workdir, 'trace.db'))
        remove(dbfile)
    # release lock
    try:
        remove(path.join(opts.workdir, '__lock_db'))
    except OSError:
        pass
Example #30
def run(opts):
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts, get_md5=True)

    if opts.nosql:
        biases = opts.biases
        mreads = opts.mreads
        inputs = []
    elif opts.biases or opts.mreads:
        if not opts.mreads:
            raise Exception('ERROR: also need to provide BAM file')
        if not opts.biases:
            raise Exception('ERROR: also need to provide biases file')
        biases = opts.biases
        mreads = opts.mreads
        inputs = ['NA', 'NA']
        mkdir(path.join(opts.workdir))
    else:
        biases, mreads, biases_id, mreads_id = load_parameters_fromdb(opts)
        inputs = [biases_id, mreads_id]
        # store path ids to be saved in database
        mreads = path.join(opts.workdir, mreads)
        biases = path.join(opts.workdir, biases)

    reso   = opts.reso

    mkdir(path.join(opts.workdir, '06_segmentation'))

    print 'loading %s \n    at resolution %s' % (mreads, nice(reso))
    region = None
    if opts.crms and len(opts.crms) == 1:
        region = opts.crms[0]
    hic_data = load_hic_data_from_bam(mreads, reso, ncpus=opts.cpus,
                                      region=region,
                                      biases=None if opts.all_bins else biases,
                                      filter_exclude=opts.filter)

    # compartments
    cmp_result = {}
    richA_stats = {}
    firsts = {}
    if not opts.only_tads:
        print 'Searching compartments'
        cmprt_dir = path.join(opts.workdir, '06_segmentation',
                              'compartments_%s' % (nice(reso)))
        mkdir(cmprt_dir)
        if opts.fasta:
            print '  - Computing GC content to label compartments'
            rich_in_A = get_gc_content(parse_fasta(opts.fasta, chr_filter=opts.crms), reso,
                                       chromosomes=opts.crms,
                                       by_chrom=True, n_cpus=opts.cpus)
        elif opts.rich_in_A:
            rich_in_A = opts.rich_in_A
        else:
            rich_in_A = None
        n_evs = opts.n_evs if opts.n_evs > 0 else 3
        firsts, richA_stats = hic_data.find_compartments(
            crms=opts.crms, savefig=cmprt_dir, verbose=True, suffix=param_hash,
            rich_in_A=rich_in_A, show_compartment_labels=rich_in_A is not None,
            savecorr=cmprt_dir if opts.savecorr else None,
            max_ev=n_evs,
            ev_index=opts.ev_index,
            vmin=None if opts.fix_corr_scale else 'auto',
            vmax=None if opts.fix_corr_scale else 'auto')

        for ncrm, crm in enumerate(opts.crms or hic_data.chromosomes):
            if not crm in firsts:
                continue
            ev_file = open(path.join(
                cmprt_dir, '%s_EigVect%d_%s.tsv' % (
                    crm, opts.ev_index[ncrm] if opts.ev_index else 1,
                    param_hash)), 'w')
            ev_file.write('# %s\n' % ('\t'.join(
                'EV_%d (%.4f)' % (i, v)
                for i, v in enumerate(firsts[crm][0], 1))))
            ev_file.write('\n'.join(['\t'.join([str(v) for v in vs])
                                     for vs in zip(*firsts[crm][1])]))
            ev_file.close()

        for ncrm, crm in enumerate(opts.crms or hic_data.chromosomes):
            cmprt_file1 = path.join(cmprt_dir, '%s_%s.tsv' % (crm, param_hash))
            cmprt_file2 = path.join(cmprt_dir, '%s_EigVect%d_%s.tsv' % (
                crm, opts.ev_index[ncrm] if opts.ev_index else 1, param_hash))
            cmprt_image = path.join(cmprt_dir, '%s_EV%d_%s.%s' % (
                crm, opts.ev_index[ncrm] if opts.ev_index else 1,
                param_hash, opts.format))
            if opts.savecorr:
                cormat_file = path.join(cmprt_dir, '%s_corr-matrix%s.tsv' %
                                       (crm, param_hash))
            else:
                cormat_file = None
            hic_data.write_compartments(cmprt_file1, chroms=[crm])
            cmp_result[crm] = {'path_cmprt1': cmprt_file1,
                               'path_cmprt2': cmprt_file2,
                               'path_cormat': cormat_file,
                               'image_cmprt': cmprt_image,
                               'num' : len(hic_data.compartments[crm])}

    # TADs
    tad_result = {}
    if not opts.only_compartments:
        print 'Searching TADs'
        tad_dir = path.join(opts.workdir, '06_segmentation',
                             'tads_%s' % (nice(reso)))
        mkdir(tad_dir)
        for crm in hic_data.chromosomes:
            if opts.crms and not crm in opts.crms:
                continue
            print '  - %s' % crm
            matrix = hic_data.get_matrix(focus=crm)
            beg, end = hic_data.section_pos[crm]
            size = len(matrix)
            if size < 10:
                print "     Chromosome too short (%d bins), skipping..." % size
                continue
            # transform bad column in chromosome referential
            if hic_data.bads:
                to_rm = tuple([1 if i in hic_data.bads else 0 for i in xrange(beg, end)])
            else:
                to_rm = None
            # maximum size of a TAD
            max_tad_size = (size - 1) if opts.max_tad_size is None else opts.max_tad_size
            result = tadbit([matrix], remove=to_rm,
                            n_cpus=opts.cpus, verbose=opts.verbose,
                            max_tad_size=max_tad_size,
                            no_heuristic=False)

            # use normalization to compute height on TADs called
            if opts.all_bins:
                if opts.nosql:
                    biases = load(open(biases))
                else:
                    biases = load(open(path.join(opts.workdir, biases)))
                hic_data.bads = biases['badcol']
                hic_data.bias = biases['biases']
            tads = load_tad_height(result, size, beg, end, hic_data)
            table = ''
            table += '%s\t%s\t%s\t%s\t%s\n' % ('#', 'start', 'end', 'score', 'density')
            for tad in tads:
                table += '%s\t%s\t%s\t%s%s\n' % (
                    tad, int(tads[tad]['start'] + 1), int(tads[tad]['end'] + 1),
                    abs(tads[tad]['score']), '\t%s' % (round(
                        float(tads[tad]['height']), 3)))
            out_tad = path.join(tad_dir, '%s_%s.tsv' % (crm, param_hash))
            out = open(out_tad, 'w')
            out.write(table)
            out.close()
            tad_result[crm] = {'path' : out_tad,
                               'num': len(tads)}

    finish_time = time.localtime()

    if not opts.nosql:
        try:
            save_to_db(opts, cmp_result, tad_result, reso, inputs,
                       richA_stats, firsts, param_hash,
                       launch_time, finish_time)
        except:
            # release lock anyway
            print_exc()
            try:
                remove(path.join(opts.workdir, '__lock_db'))
            except OSError:
                pass
            exit(1)
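
The to_rm tuple built above re-indexes genome-wide bad columns into chromosome-local coordinates before calling tadbit(). A worked sketch (values illustrative):

# chromosome occupying genome-wide bins 4..8 (beg=4, end=9), bad bins 5 and 7
beg, end = 4, 9
bads = {5: True, 7: True}
to_rm = tuple(1 if i in bads else 0 for i in range(beg, end))
# to_rm == (0, 1, 0, 1, 0): the bad bins sit at local positions 1 and 3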
Example #31
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    fname1, fname2 = load_parameters_fromdb(opts)

    param_hash = digest_parameters(opts)

    reads = path.join(opts.workdir, '03_filtered_reads',
                      'all_r1-r2_intersection_%s.tsv' % param_hash)
    mreads = path.join(opts.workdir, '03_filtered_reads',
                       'valid_r1-r2_intersection_%s.tsv' % param_hash)

    if not opts.resume:
        mkdir(path.join(opts.workdir, '03_filtered_reads'))

        # compute the intersection of the two read ends
        print 'Getting intersection between read 1 and read 2'
        count, multiples = get_intersection(fname1, fname2, reads)

        # compute insert size
        print 'Get insert size...'
        hist_path = path.join(opts.workdir,
                              'histogram_fragment_sizes_%s.pdf' % param_hash)
        median, max_f, mad = insert_sizes(reads,
                                          nreads=1000000,
                                          stats=('median', 'first_decay',
                                                 'MAD'),
                                          savefig=hist_path)

        print '  - median insert size =', median
        print '  - median absolute deviation (MAD) of insert size =', mad
        print '  - max insert size (when a gap in continuity of > 10 bp is found in fragment lengths) =', max_f

        max_mole = max_f  # pseudo DEs
        min_dist = max_f + mad  # random breaks
        print(
            '   Using the maximum continuous fragment size '
            '(%d bp) to check '
            'for pseudo-dangling ends') % max_mole
        print(
            '   Using maximum continuous fragment size plus the MAD '
            '(%d bp) to check for random breaks') % min_dist

        print "identify pairs to filter..."
        masked = filter_reads(reads,
                              max_molecule_length=max_mole,
                              over_represented=0.001,
                              max_frag_size=100000,
                              min_frag_size=50,
                              re_proximity=5,
                              min_dist_to_re=min_dist,
                              fast=True)

    n_valid_pairs = apply_filter(reads, mreads, masked, filters=opts.apply)

    finish_time = time.localtime()
    print median, max_f, mad
    # save all job information to sqlite DB
    save_to_db(opts, count, multiples, reads, mreads, n_valid_pairs, masked,
               hist_path, median, max_f, mad, launch_time, finish_time)
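
The two filtering thresholds above come straight from the insert-size statistics: max_mole (pseudo dangling-ends) is the maximum continuous fragment size, and min_dist (random breaks) adds the MAD on top. A worked sketch with illustrative numbers:

median, max_f, mad = 350, 900, 120  # stand-ins for the values insert_sizes() returns
max_mole = max_f        # reads ending closer than this to a RE site: possible pseudo dangling-ends
min_dist = max_f + mad  # reads farther than this from any RE site: possible random breaks
# => max_mole = 900 bp, min_dist = 1020 bp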
Example #32
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)
    if opts.bam:
        mreads = path.realpath(opts.bam)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))

    filter_exclude = opts.filter

    outdir = path.join(opts.workdir, '04_normalization')
    mkdir(outdir)

    mappability = gc_content = n_rsites = None
    if opts.normalization == 'oneD':
        if not opts.fasta:
            raise Exception('ERROR: missing path to FASTA for oneD normalization')
        if not opts.renz:
            raise Exception('ERROR: missing restriction enzyme name for oneD normalization')
        if not opts.mappability:
            raise Exception('ERROR: missing path to mappability for oneD normalization')
        bamfile = AlignmentFile(mreads, 'rb')
        refs = bamfile.references
        bamfile.close()

        # get genome sequence ~1 min
        printime('  - parsing FASTA')
        genome = parse_fasta(opts.fasta, verbose=False)

        fas = set(genome.keys())
        bam = set(refs)
        if fas - bam:
            print('WARNING: %d extra chromosomes in FASTA (removing them)' % (len(fas - bam)))
            if len(fas - bam) <= 50:
                print('\n'.join([('  - ' + c) for c in (fas - bam)]))
        if bam - fas:
            txt = ('\n'.join([('  - ' + c) for c in (bam - fas)])
                   if len(bam - fas) <= 50 else '')
            raise Exception('ERROR: %d extra chromosomes in BAM (remove them):\n%s\n' % (
                len(bam - fas), txt))
        refs = [crm for crm in refs if crm in genome]
        if len(refs) == 0:
            raise Exception("ERROR: chromosomes in FASTA different the ones"
                            " in BAM")

        # get mappability ~2 min
        printime('  - Parsing mappability')
        mappability = parse_mappability_bedGraph(
            opts.mappability, opts.reso,
            wanted_chrom=refs[0] if len(refs)==1 else None)
        # resize chromosomes
        for c in refs:
            if not c in mappability:
                mappability[c] = [float('nan')] * (len(genome[c]) // opts.reso + 1)
            if len(mappability[c]) < len(genome[c]) // opts.reso + 1:
                mappability[c] += [float('nan')] * (
                    (len(genome[c]) // opts.reso + 1) - len(mappability[c]))
        # concatenates
        mappability = reduce(lambda x, y: x + y,
                             (mappability.get(c, []) for c in refs))

        printime('  - Computing GC content per bin (removing Ns)')
        gc_content = get_gc_content(genome, opts.reso, chromosomes=refs,
                                    n_cpus=opts.cpus)
        # pad mappability at the end if the size is close to gc_content
        if len(mappability)<len(gc_content) and len(mappability)/len(gc_content) > 0.95:
            mappability += [float('nan')] * (len(gc_content)-len(mappability))

        # compute r_sites ~30 sec
        # TODO: read from DB
        printime('  - Computing number of RE sites per bin (+/- 200 bp)')
        n_rsites  = []
        re_site = RESTRICTION_ENZYMES[opts.renz].replace('|', '')
        for crm in refs:
            for pos in range(200, len(genome[crm]) + 200, opts.reso):
                seq = genome[crm][pos-200:pos + opts.reso + 200]
                n_rsites.append(seq.count(re_site))

        ## CHECK TO BE REMOVED
        # out = open('tmp_mappability.txt', 'w')
        # i = 0
        # for crm in refs:
        #     for pos in xrange(len(genome[crm]) / opts.reso + 1):
        #         out.write('%s\t%d\t%d\t%f\n' % (crm, pos * opts.reso, pos * opts.reso + opts.reso, mappability[i]))
        #         i += 1
        # out.close()
    biases, decay, badcol, raw_cisprc, norm_cisprc = read_bam(
        mreads, filter_exclude, opts.reso, min_count=opts.min_count, sigma=2,
        factor=1, outdir=outdir, extra_out=param_hash, ncpus=opts.cpus,
        normalization=opts.normalization, mappability=mappability,
        p_fit=opts.p_fit, cg_content=gc_content, n_rsites=n_rsites,
        seed=opts.seed,
        normalize_only=opts.normalize_only, max_njobs=opts.max_njobs,
        extra_bads=opts.badcols, biases_path=opts.biases_path, 
        cis_limit=opts.cis_limit, trans_limit=opts.trans_limit, 
        min_ratio=opts.ratio_limit, fast_filter=opts.fast_filter)

    inter_vs_gcoord = path.join(opts.workdir, '04_normalization',
                                'interactions_vs_genomic-coords_%s_%s.png' % (
                                    opts.reso, param_hash))

    # get and plot decay
    if not opts.normalize_only:
        printime('  - Computing interaction decay vs genomic distance')
        (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
            decay, max_diff=10000, resolution=opts.reso, normalized=not opts.filter_only,
            savefig=inter_vs_gcoord)

        print('    -> Decay slope 0.7-10 Mb\t%s' % a2)
    else:
        a2 = 0.

    printime('  - Saving biases and bad columns')
    # biases
    bias_file = path.join(outdir, 'biases_%s_%s.pickle' % (
        nicer(opts.reso).replace(' ', ''), param_hash))
    out = open(bias_file, 'wb')

    dump({'biases'    : biases,
          'decay'     : decay,
          'badcol'    : badcol,
          'resolution': opts.reso}, out, HIGHEST_PROTOCOL)
    out.close()

    finish_time = time.localtime()

    try:
        save_to_db(opts, bias_file, mreads, len(badcol),
                   len(biases), raw_cisprc, norm_cisprc,
                   inter_vs_gcoord, a2, opts.filter,
                   launch_time, finish_time)
    except Exception:
        # release lock anyway
        print_exc()
        try:
            remove(path.join(opts.workdir, '__lock_db'))
        except OSError:
            pass
        exit(1)
Example No. 33
0
def save_to_db(opts, counts, multis, f_names1, f_names2, out_file1, out_file2,
               launch_time, finish_time):
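    """
    Record the outcome of the parsing step in trace.db: one 'Parse' JOBs row,
    one MAPPED_OUTPUTs row per mapped input file, and one PARSED_OUTPUTs row
    per read end with the total and multiple interaction counts.
    """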
    con = lite.connect(path.join(opts.workdir, 'trace.db'))
    with con:
        cur = con.cursor()
        cur.execute("""SELECT name FROM sqlite_master WHERE
                       type='table' AND name='PARSED_OUTPUTs'""")
        if not cur.fetchall():
            cur.execute("""
        create table MAPPED_OUTPUTs
           (Id integer primary key,
            PATHid int,
            BEDid int,
            Uniquely_mapped int,
            unique (PATHid, BEDid))""")
            cur.execute("""
        create table PARSED_OUTPUTs
           (Id integer primary key,
            PATHid int,
            Total_interactions int,
            Multiples int,
            unique (PATHid))""")
        try:
            parameters = digest_parameters(opts, get_md5=False)
            param_hash = digest_parameters(opts, get_md5=True)
            cur.execute("""
    insert into JOBs
     (Id  , Parameters, Launch_time, Finish_time,    Type, Parameters_md5)
    values
     (NULL,       '%s',        '%s',        '%s', 'Parse',           '%s')
     """ % (parameters, time.strftime("%d/%m/%Y %H:%M:%S", launch_time),
            time.strftime("%d/%m/%Y %H:%M:%S", finish_time), param_hash))
        except lite.IntegrityError:
            pass
        jobid = get_jobid(cur)
        add_path(cur, out_file1, 'BED', jobid, opts.workdir)
        for genome in opts.genome:
            add_path(cur, genome, 'FASTA', jobid, opts.workdir)
        if out_file2:
            add_path(cur, out_file2, 'BED', jobid, opts.workdir)
        fnames = f_names1, f_names2
        outfiles = out_file1, out_file2
        for count in counts:
            try:
                sum_reads = 0
                for i, item in enumerate(counts[count]):
                    cur.execute("""
                    insert into MAPPED_OUTPUTs
                    (Id  , PATHid, BEDid, Uniquely_mapped)
                    values
                    (NULL,    %d,     %d,      %d)
                    """ % (get_path_id(cur, fnames[count][i], opts.workdir),
                           get_path_id(cur, outfiles[count],
                                       opts.workdir), counts[count][item]))
                    sum_reads += counts[count][item]
            except lite.IntegrityError:
                print 'WARNING: already parsed (MAPPED_OUTPUTs)'
            try:
                cur.execute("""
                insert into PARSED_OUTPUTs
                (Id  , PATHid, Total_interactions, Multiples)
                values
                (NULL,     %d,      %d,        %d)
                """ % (get_path_id(cur, outfiles[count],
                                   opts.workdir), sum_reads, multis[count]))
            except lite.IntegrityError:
                print 'WARNING: already parsed (PARSED_OUTPUTs)'
        print_db(cur, 'MAPPED_INPUTs')
        print_db(cur, 'PATHs')
        print_db(cur, 'MAPPED_OUTPUTs')
        print_db(cur, 'PARSED_OUTPUTs')
        print_db(cur, 'JOBs')
Example No. 34
0
def run(opts):
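    """
    Merge runner: locate the two input BAMs (directly or through each work
    directory's trace DB), optionally compare them (distance-decay SCC,
    eigenvector correlation, reproducibility score), merge them with
    samtools, index the result and record everything in trace.db.
    """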
    check_options(opts)
    samtools = which(opts.samtools)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)

    reso1 = reso2 = None
    if opts.bam1:
        mreads1 = path.realpath(opts.bam1)
        biases1 = opts.biases1
    else:
        biases1, mreads1, reso1 = load_parameters_fromdb(
            opts.workdir1, opts.jobid1, opts, opts.tmpdb1)
        mreads1 = path.join(opts.workdir1, mreads1)
        try:
            biases1 = path.join(opts.workdir1, biases1)
        except AttributeError:
            biases1 = None
        except TypeError:  # Py3
            biases1 = None

    if opts.bam2:
        mreads2 = path.realpath(opts.bam2)
        biases2 = opts.biases2
    else:
        biases2, mreads2, reso2 = load_parameters_fromdb(
            opts.workdir2, opts.jobid2, opts, opts.tmpdb2)
        mreads2 = path.join(opts.workdir2, mreads2)
        try:
            biases2 = path.join(opts.workdir2, biases2)
        except AttributeError:
            biases2 = None
        except TypeError:  # Py3
            biases2 = None

    filter_exclude = opts.filter

    if reso1 != reso2:
        raise Exception('ERROR: differing resolutions between experiments to '
                        'be merged')

    mkdir(path.join(opts.workdir, '00_merge'))

    if not opts.skip_comparison:
        printime('  - loading first sample %s' % (mreads1))
        hic_data1 = load_hic_data_from_bam(mreads1, opts.reso, biases=biases1,
                                           tmpdir=path.join(opts.workdir, '00_merge'),
                                           ncpus=opts.cpus,
                                           filter_exclude=filter_exclude)

        printime('  - loading second sample %s' % (mreads2))
        hic_data2 = load_hic_data_from_bam(mreads2, opts.reso, biases=biases2,
                                           tmpdir=path.join(opts.workdir, '00_merge'),
                                           ncpus=opts.cpus,
                                           filter_exclude=filter_exclude)

        if opts.workdir1 and opts.workdir2:
            masked1 = {'valid-pairs': {'count': 0}}
            masked2 = {'valid-pairs': {'count': 0}}
        else:
            masked1 = {'valid-pairs': {'count': sum(hic_data1.values())}}
            masked2 = {'valid-pairs': {'count': sum(hic_data2.values())}}

        decay_corr_dat = path.join(opts.workdir, '00_merge', 'decay_corr_dat_%s_%s.txt' % (opts.reso, param_hash))
        decay_corr_fig = path.join(opts.workdir, '00_merge', 'decay_corr_dat_%s_%s.png' % (opts.reso, param_hash))
        eigen_corr_dat = path.join(opts.workdir, '00_merge', 'eigen_corr_dat_%s_%s.txt' % (opts.reso, param_hash))
        eigen_corr_fig = path.join(opts.workdir, '00_merge', 'eigen_corr_dat_%s_%s.png' % (opts.reso, param_hash))

        printime('  - comparing experiments')
        printime('    => correlation between equidistant loci')
        corr, _, scc, std, bads = correlate_matrices(
            hic_data1, hic_data2, normalized=opts.norm,
            remove_bad_columns=True, savefig=decay_corr_fig,
            savedata=decay_corr_dat, get_bads=True)
        print('         - correlation score (SCC): %.4f (+- %.7f)' % (scc, std))
        printime('    => correlation between eigenvectors')
        eig_corr = eig_correlate_matrices(hic_data1, hic_data2, normalized=opts.norm,
                                          remove_bad_columns=True, nvect=6,
                                          savefig=eigen_corr_fig,
                                          savedata=eigen_corr_dat)

        printime('    => reproducibility score')
        reprod = get_reproducibility(hic_data1, hic_data2, num_evec=20, normalized=opts.norm,
                                     verbose=False, remove_bad_columns=True)
        print('         - reproducibility score: %.4f' % (reprod))
        ncols = len(hic_data1)
    else:
        ncols = 0
        decay_corr_dat = 'None'
        decay_corr_fig = 'None'
        eigen_corr_dat = 'None'
        eigen_corr_fig = 'None'
        masked1 = {}
        masked2 = {}

        corr = eig_corr = scc = std = reprod = 0
        bads = {}

    # merge inputs
    mkdir(path.join(opts.workdir, '03_filtered_reads'))
    outbam = path.join(opts.workdir, '03_filtered_reads',
                       'intersection_%s.bam' % (param_hash))

    if not opts.skip_merge:
        printime('  - Merging experiments')
        system(samtools + ' merge -@ %d %s %s %s' % (opts.cpus, outbam, mreads1, mreads2))
        printime('  - Indexing new BAM file')
        # check samtools version number and modify command line
        version = LooseVersion([l.split()[1]
                                for l in Popen(samtools, stderr=PIPE,
                                               universal_newlines=True).communicate()[1].split('\n')
                                if 'Version' in l][0])
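        # 'samtools index' only understands the -@ (threads) option in newer
        # releases; the 1.3.1 cutoff below encodes that assumption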
        if version >= LooseVersion('1.3.1'):
            system(samtools + ' index -@ %d %s' % (opts.cpus, outbam))
        else:
            system(samtools + ' index %s' % (outbam))
    else:
        outbam = ''

    finish_time = time.localtime()
    save_to_db(opts, mreads1, mreads2, decay_corr_dat, decay_corr_fig,
                len(list(bads.keys())), ncols, scc, std, reprod,
                eigen_corr_dat, eigen_corr_fig, outbam, corr, eig_corr,
                biases1, biases2, masked1, masked2, launch_time, finish_time)
    printime('\nDone.')
Example No. 35
0
def save_to_db(opts, launch_time, finish_time, out_files, out_plots):
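    """
    Record a 'Bin' job in trace.db and register each output matrix
    (<name>_MATRIX) and figure (<name>_FIGURE) under PATHs, using the
    '__lock_db' file and an optional temporary DB copy to serialise access.
    """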
    if 'tmpdb' in opts and opts.tmpdb:
        # check lock
        while path.exists(path.join(opts.workdir, '__lock_db')):
            time.sleep(0.5)
        # close lock
        open(path.join(opts.workdir, '__lock_db'), 'a').close()
        # tmp file
        dbfile = opts.tmpdb
        try: # to copy in case read1 was already mapped for example
            copyfile(path.join(opts.workdir, 'trace.db'), dbfile)
        except IOError:
            pass
    else:
        dbfile = path.join(opts.workdir, 'trace.db')
    con = lite.connect(dbfile)
    with con:
        cur = con.cursor()
        try:
            parameters = digest_parameters(opts, get_md5=False, extra=['quiet'])
            param_hash = digest_parameters(opts, get_md5=True , extra=['quiet'])
            cur.execute("""
            insert into JOBs
            (Id  , Parameters, Launch_time, Finish_time, Type , Parameters_md5)
            values
            (NULL,       '%s',        '%s',        '%s', 'Bin',           '%s')
            """ % (parameters,
                   time.strftime("%d/%m/%Y %H:%M:%S", launch_time),
                   time.strftime("%d/%m/%Y %H:%M:%S", finish_time), param_hash))
        except lite.IntegrityError:
            pass
        except lite.OperationalError:
            try:
                cur.execute("""
                create table PATHs
                (Id integer primary key,
                JOBid int, Path text, Type text,
                unique (Path))""")
            except lite.OperationalError:
                pass  # may happen when mapped files were cleaned
            cur.execute("""
            create table JOBs
               (Id integer primary key,
                Parameters text,
                Launch_time text,
                Finish_time text,
                Type text,
                Parameters_md5 text,
                unique (Parameters_md5))""")
            cur.execute("""
            insert into JOBs
            (Id  , Parameters, Launch_time, Finish_time, Type , Parameters_md5)
            values
            (NULL,       '%s',        '%s',        '%s', 'Bin',           '%s')
            """ % (parameters,
                   time.strftime("%d/%m/%Y %H:%M:%S", launch_time),
                   time.strftime("%d/%m/%Y %H:%M:%S", finish_time), param_hash))
        jobid = get_jobid(cur)
        for fnam in out_files:
            add_path(cur, out_files[fnam], fnam + '_MATRIX', jobid, opts.workdir)
        for fnam in out_plots:
            add_path(cur, out_plots[fnam], fnam + '_FIGURE', jobid, opts.workdir)
        if not opts.quiet:
            print_db(cur, 'JOBs')
            print_db(cur, 'PATHs')
    if 'tmpdb' in opts and opts.tmpdb:
        # copy back file
        copyfile(dbfile, path.join(opts.workdir, 'trace.db'))
        remove(dbfile)
    # release lock
    try:
        remove(path.join(opts.workdir, '__lock_db'))
    except OSError:
        pass
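Every save_to_db variant in this listing repeats the same choreography around trace.db: wait on the '__lock_db' file, take it, optionally work on a temporary copy (--tmpdb), copy the result back, and release the lock. As a minimal sketch, not part of the TADbit API (locked_trace_db is an illustrative name), the same pattern could be factored into a context manager:

from contextlib import contextmanager
from os import path, remove
from shutil import copyfile
import time

@contextmanager
def locked_trace_db(workdir, tmpdb=None):
    # hedged sketch of the lock/copy/copy-back pattern used above
    lock = path.join(workdir, '__lock_db')
    trace = path.join(workdir, 'trace.db')
    while path.exists(lock):          # wait until concurrent writers finish
        time.sleep(0.5)
    open(lock, 'a').close()           # take the lock
    dbfile = tmpdb or trace
    if tmpdb:
        try:                          # work on a copy when --tmpdb is given
            copyfile(trace, dbfile)
        except IOError:
            pass                      # no trace.db yet: start from scratch
    try:
        yield dbfile
        if tmpdb:
            copyfile(dbfile, trace)   # publish results back on success
    finally:
        if tmpdb and path.exists(dbfile):
            remove(dbfile)
        try:
            remove(lock)              # always release the lock
        except OSError:
            pass

# usage (illustrative):
# with locked_trace_db(opts.workdir, getattr(opts, 'tmpdb', None)) as dbfile:
#     con = lite.connect(dbfile)
#     ...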
Example No. 36
0
def save_to_db(opts, bias_file, mreads, bad_col_image, nbad_columns, ncolumns,
               raw_cisprc, norm_cisprc, inter_vs_gcoord, a2, bam_filter,
               launch_time, finish_time):
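    """
    Record a 'Normalize' job: register the biases pickle, figures and any
    external inputs (BAM, mappability, FASTA) in PATHs, then write a
    NORMALIZE_OUTPUTs summary row (column counts, cis percentages, slope).
    """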
    if 'tmpdb' in opts and opts.tmpdb:
        # check lock
        while path.exists(path.join(opts.workdir, '__lock_db')):
            time.sleep(0.5)
        # close lock
        open(path.join(opts.workdir, '__lock_db'), 'a').close()
        # tmp file
        dbfile = opts.tmpdb
        try:  # to copy in case read1 was already mapped for example
            copyfile(path.join(opts.workdir, 'trace.db'), dbfile)
        except IOError:
            pass
    else:
        dbfile = path.join(opts.workdir, 'trace.db')
    con = lite.connect(dbfile)
    with con:
        cur = con.cursor()
        cur.execute("""SELECT name FROM sqlite_master WHERE
                       type='table' AND name='JOBs'""")
        if not cur.fetchall():
            cur.execute("""
            create table PATHs
               (Id integer primary key,
                JOBid int, Path text, Type text,
                unique (Path))""")
            cur.execute("""
            create table JOBs
               (Id integer primary key,
                Parameters text,
                Launch_time text,
                Finish_time text,
                Type text,
                Parameters_md5 text,
                unique (Parameters_md5))""")
        cur.execute("""SELECT name FROM sqlite_master WHERE
                       type='table' AND name='NORMALIZE_OUTPUTs'""")
        if not cur.fetchall():
            cur.execute("""
            create table NORMALIZE_OUTPUTs
               (Id integer primary key,
                JOBid int,
                Input int,
                N_columns int,
                N_filtered int,
                BAM_filter int,
                Cis_percentage_Raw real,
                Cis_percentage_Norm real,
                Slope_700kb_10Mb real,
                Resolution int,
                Normalization text,
                Factor int,
                unique (JOBid))""")
        try:
            parameters = digest_parameters(opts, get_md5=False)
            param_hash = digest_parameters(opts, get_md5=True)
            cur.execute(
                """
            insert into JOBs
            (Id  , Parameters, Launch_time, Finish_time, Type , Parameters_md5)
            values
            (NULL,       '%s',        '%s',        '%s', 'Normalize',           '%s')
            """ %
                (parameters, time.strftime("%d/%m/%Y %H:%M:%S", launch_time),
                 time.strftime("%d/%m/%Y %H:%M:%S", finish_time), param_hash))
        except lite.IntegrityError:
            pass
        jobid = get_jobid(cur)
        add_path(cur, bias_file, 'BIASES', jobid, opts.workdir)
        add_path(cur, bad_col_image, 'FIGURE', jobid, opts.workdir)
        add_path(cur, inter_vs_gcoord, 'FIGURE', jobid, opts.workdir)
        if opts.bam:
            add_path(cur, path.realpath(opts.bam), 'EXT_2D_BAM', jobid,
                     opts.workdir)
        if opts.mappability:
            add_path(cur, path.realpath(opts.mappability), 'EXT_MAPPABILITY',
                     jobid, opts.workdir)
        if opts.fasta:
            add_path(cur, path.realpath(opts.fasta), 'EXT_FASTA', jobid,
                     opts.workdir)
        # get pathid of input
        cur.execute("select id from paths where path = '%s'" %
                    (path.relpath(mreads, opts.workdir)))
        input_bed = cur.fetchall()[0][0]

        a2 = 0 if isnan(a2) else a2
        try:
            cur.execute("""
            insert into NORMALIZE_OUTPUTs
            (Id  , JOBid,     Input, N_columns,   N_filtered, BAM_filter, Cis_percentage_Raw, Cis_percentage_Norm, Slope_700kb_10Mb,   Resolution,      Normalization,      Factor)
            values
            (NULL,    %d,        %d,        %d,           %d,         %d,                 %f,                  %f,               %f,           %d,               '%s',          %f)
            """ % (jobid, input_bed, ncolumns, nbad_columns, bam_filter,
                   100 * raw_cisprc, 100 * norm_cisprc, a2, opts.reso,
                   opts.normalization, opts.factor))
        except lite.OperationalError:
            try:
                cur.execute("""
                insert into NORMALIZE_OUTPUTs
                (Id  , JOBid,     Input, N_columns,   N_filtered, BAM_filter,      Cis_percentage_Raw, Cis_percentage_Norm, Slope_700kb_10Mb,   Resolution,     Normalization,       Factor)
                values
                (NULL,    %d,        %d,        %d,           %d,         %d,                      %f,                  %f,               %f,           %d,               '%s',          %f)
                """ % (jobid, input_bed, ncolumns, nbad_columns, bam_filter,
                       100 * raw_cisprc, 100 * norm_cisprc, a2, opts.reso,
                       opts.normalization, opts.factor))
            except lite.OperationalError:
                print 'WARNING: Normalized table not written!!!'

        print_db(cur, 'PATHs')
        print_db(cur, 'JOBs')
        try:
            print_db(cur, 'FILTER_OUTPUTs')
            print_db(cur, 'INTERSECTION_OUTPUTs')
            print_db(cur, 'MAPPED_INPUTs')
            print_db(cur, 'MAPPED_OUTPUTs')
            print_db(cur, 'PARSED_OUTPUTs')
        except lite.OperationalError:
            pass
        print_db(cur, 'NORMALIZE_OUTPUTs')
    if 'tmpdb' in opts and opts.tmpdb:
        # copy back file
        copyfile(dbfile, path.join(opts.workdir, 'trace.db'))
        remove(dbfile)
    # release lock
    try:
        remove(path.join(opts.workdir, '__lock_db'))
    except OSError:
        pass
Example No. 37
0
def save_to_db(opts, mreads1, mreads2, decay_corr_dat, decay_corr_fig,
               nbad_columns, ncolumns, scc, std, reprod,
               eigen_corr_dat, eigen_corr_fig, outbed, corr, eig_corr,
               biases1, biases2, masked1, masked2, launch_time, finish_time):
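    """
    Record a 'Merge' job: paths for correlation data/figures and the merged
    BAM, MERGE_OUTPUTs and MERGE_STATs summary rows, and FILTER_OUTPUTs rows
    combining per-filter counts gathered from both source work directories.
    """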
    if 'tmpdb' in opts and opts.tmpdb:
        # check lock
        while path.exists(path.join(opts.workdir, '__lock_db')):
            time.sleep(0.5)
        # close lock
        open(path.join(opts.workdir, '__lock_db'), 'a').close()
        # tmp file
        dbfile = opts.tmpdb
        try: # to copy in case read1 was already mapped for example
            copyfile(path.join(opts.workdir, 'trace.db'), dbfile)
        except IOError:
            pass
    else:
        dbfile = path.join(opts.workdir, 'trace.db')
    con = lite.connect(dbfile)
    with con:
        cur = con.cursor()
        cur.execute("""SELECT name FROM sqlite_master WHERE
                       type='table' AND name='MERGE_OUTPUTs'""")
        if not cur.fetchall():
            cur.execute("""
            create table PATHs
               (Id integer primary key,
                JOBid int, Path text, Type text,
                unique (Path))""")
            cur.execute("""
            create table JOBs
               (Id integer primary key,
                Parameters text,
                Launch_time text,
                Finish_time text,
                Type text,
                Parameters_md5 text,
                unique (Parameters_md5))""")
            cur.execute("""
            create table FILTER_OUTPUTs
               (Id integer primary key,
                PATHid int,
                Name text,
                Count int,
                JOBid int,
                unique (PATHid))""")
            cur.execute("""
            create table MERGE_OUTPUTs
               (Id integer primary key,
                JOBid int,
                Wrkd1Path int,
                Wrkd2Path int,
                Bed1Path int,
                Bed2Path int,
                MergePath int,
                unique (JOBid))""")
            cur.execute("""
            create table MERGE_STATs
               (Id integer primary key,
                JOBid int,
                Inputs text,
                decay_corr text,
                eigen_corr text,
                reprod real,
                scc real,
                std_scc real,
                N_columns int,
                N_filtered int,
                Resolution int,
                bias1Path int,
                bias2Path int,
                unique (JOBid))""")
        try:
            parameters = digest_parameters(opts, get_md5=False)
            param_hash = digest_parameters(opts, get_md5=True )
            cur.execute("""
            insert into JOBs
            (Id  , Parameters, Launch_time, Finish_time, Type   , Parameters_md5)
            values
            (NULL,       '%s',        '%s',        '%s', 'Merge',           '%s')
            """ % (parameters,
                   time.strftime("%d/%m/%Y %H:%M:%S", launch_time),
                   time.strftime("%d/%m/%Y %H:%M:%S", finish_time), param_hash))
        except lite.IntegrityError:
            pass

        jobid = get_jobid(cur)
        add_path(cur, decay_corr_dat, 'CORR'      , jobid, opts.workdir)
        add_path(cur, decay_corr_fig, 'FIGURE'    , jobid, opts.workdir)
        add_path(cur, eigen_corr_dat, 'CORR'      , jobid, opts.workdir)
        add_path(cur, eigen_corr_fig, 'FIGURE'    , jobid, opts.workdir)

        add_path(cur, opts.workdir , 'WORKDIR'    , jobid)
        add_path(cur, opts.workdir1, 'WORKDIR1'   , jobid, opts.workdir)
        add_path(cur, opts.workdir2, 'WORKDIR2'   , jobid, opts.workdir)
        add_path(cur, mreads1      , 'EXT_HIC_BAM', jobid, opts.workdir)
        add_path(cur, mreads2      , 'EXT_HIC_BAM', jobid, opts.workdir)
        if not opts.skip_merge:
            add_path(cur, outbed   , 'HIC_BAM'    , jobid, opts.workdir)

        if opts.norm:
            add_path(cur, biases1      , 'BIASES'     , jobid, opts.workdir)
            add_path(cur, biases2      , 'BIASES'     , jobid, opts.workdir)

            biasid1 = get_path_id(cur, biases1, opts.workdir)
            biasid2 = get_path_id(cur, biases2, opts.workdir)
        else:
            biasid1 = 0
            biasid2 = 0

        cur.execute("select id from paths where path = '%s'" % (
            path.relpath(mreads1, opts.workdir)))
        bed1 = cur.fetchall()[0][0]
        if opts.workdir1:
            cur.execute("select id from paths where path = '%s'" % (
                path.relpath(opts.workdir1, opts.workdir)))
            w1path = cur.fetchall()[0][0]
        else:
            w1path = 0
        cur.execute("select id from paths where path = '%s'" % (
            path.relpath(mreads2, opts.workdir)))
        bed2 = cur.fetchall()[0][0]
        if opts.workdir2:
            cur.execute("select id from paths where path = '%s'" % (
                path.relpath(opts.workdir2, opts.workdir)))
            w2path = cur.fetchall()[0][0]
        else:
            w2path = 0
        if not opts.skip_merge:
            cur.execute("select id from paths where path = '%s'" % (
                path.relpath(outbed, opts.workdir)))
            outbedid = cur.fetchall()[0][0]
        if not opts.skip_comparison:
            decay_corr = '-'.join(['%.1f' % (v)
                                   for v in corr[:10:2]]).replace('0.', '.')
            eigen_corr = '-'.join(['%.2f' % (max(v))
                                   for v in eig_corr[:4]]).replace('0.', '.')
        else:
            decay_corr = eigen_corr = None
        if not opts.skip_merge:
            cur.execute("""
            insert into MERGE_OUTPUTs
            (Id  , JOBid, Wrkd1Path, Wrkd2Path, Bed1Path, Bed2Path, MergePath)
            values
            (NULL,    %d,        %d,        %d,       %d,       %d,        %d)
            """ % (jobid,    w1path,    w2path,     bed1,     bed2,  outbedid))

        if not opts.skip_comparison:
            cur.execute("""
            insert into MERGE_STATs
            (Id  , JOBid, N_columns,   N_filtered, Resolution, reprod, scc, std_scc, decay_corr, eigen_corr, bias1Path, bias2Path)
            values
            (NULL,    %d,        %d,           %d,         %d,     %f,  %f,      %f,       '%s',       '%s',        %d,        %d)
            """ % (jobid,  ncolumns, nbad_columns, opts.reso , reprod, scc,     std, decay_corr, eigen_corr,   biasid1,   biasid2))

        if opts.workdir1:
            if 'tmpdb' in opts and opts.tmpdb:
                # tmp file
                dbfile1 = opts.tmpdb1
                try: # to copy in case read1 was already mapped for example
                    copyfile(path.join(opts.workdir1, 'trace.db'), dbfile1)
                except IOError:
                    pass
            else:
                dbfile1 = path.join(opts.workdir1, 'trace.db')
            tmpcon = lite.connect(dbfile1)
            with tmpcon:
                tmpcur = tmpcon.cursor()
                tmpcur.execute("select Name, PATHid, Count from filter_outputs")
                for name, pathid, count in tmpcur.fetchall():
                    res = tmpcur.execute("select Path from PATHs where Id = %d" % (pathid))
                    tmppath = res.fetchall()[0][0]
                    masked1[name] = {'path': tmppath, 'count': count}
            if 'tmpdb' in opts and opts.tmpdb:
                remove(dbfile1)
        if opts.workdir2:
            if 'tmpdb' in opts and opts.tmpdb:
                # tmp file
                dbfile2 = opts.tmpdb2
                try: # to copy in case read2 was already mapped for example
                    copyfile(path.join(opts.workdir2, 'trace.db'), dbfile2)
                except IOError:
                    pass
            else:
                dbfile2 = path.join(opts.workdir2, 'trace.db')
            tmpcon = lite.connect(dbfile2)
            with tmpcon:
                tmpcur = tmpcon.cursor()
                tmpcur.execute("select Name, PATHid, Count from filter_outputs")
                for name, pathid, count in tmpcur.fetchall():
                    res = tmpcur.execute("select Path from PATHs where Id = %d" % (pathid))
                    tmppath = res.fetchall()[0][0]
                    masked2[name] = {'path': tmppath, 'count': count}
            if 'tmpdb' in opts and opts.tmpdb:
                remove(dbfile2)

        for f in masked1:
            if f != 'valid-pairs':
                outmask = path.join(opts.workdir, '03_filtered_reads',
                                    'all_r1-r2_intersection_%s.tsv_%s.tsv' % (
                                        param_hash, f))
                out = open(outmask, 'w')
                try:
                    fh = magic_open(path.join(opts.workdir1, masked1[f]['path']))
                except FileNotFoundError:
                    fh = magic_open(path.join(opts.workdir1, masked1[f]['path'] + '.gz'))
                for line in fh:
                    out.write(line)
                try:
                    fh = magic_open(path.join(opts.workdir2, masked2[f]['path']))
                except FileNotFoundError:
                    fh = magic_open(path.join(opts.workdir2, masked2[f]['path'] + '.gz'))
                for line in fh:
                    out.write(line)
                add_path(cur, outmask, 'FILTER', jobid, opts.workdir)
            else:
                if opts.skip_merge:
                    outmask = 'NA'
                else:
                    outmask = outbed
            try:
                path_id = get_path_id(cur, outmask, opts.workdir)
            except IndexError:
                path_id = -1
            cur.execute("""
            insert into FILTER_OUTPUTs
            (Id  , PATHid, Name, Count, JOBid)
            values
            (NULL,     %d, '%s',  '%s',    %d)
            """ % (path_id, f, masked1[f]['count'] + masked2[f]['count'], jobid))

        print_db(cur, 'PATHs')
        print_db(cur, 'JOBs')
        print_db(cur, 'MERGE_OUTPUTs')
        print_db(cur, 'MERGE_STATs')
        print_db(cur, 'FILTER_OUTPUTs')

    if 'tmpdb' in opts and opts.tmpdb:
        # copy back file
        copyfile(dbfile, path.join(opts.workdir, 'trace.db'))
        remove(dbfile)
    # release lock
    try:
        remove(path.join(opts.workdir, '__lock_db'))
    except OSError:
        pass
Example No. 38
0
def run(opts):
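    """
    Import runner: read a Hi-C matrix in 'text', 'matrix' or 'cooler' format
    (optionally restricted to one region), convert cooler weights into a
    biases pickle, build a HiC_data object and export it as a BAM file with
    a trace.db entry.
    """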
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts, extra=['quiet'])

    coord1 = opts.coord1

    if not coord1:
        region1 = None
        start1 = None
        end1 = None
    else:
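        # coordinates are expected as 'chrom:start-end'; a bare chromosome
        # name (no ':') fails the unpacking below and falls back to
        # whole-chromosome selection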
        try:
            crm1, pos1 = coord1.split(':')
            start1, end1 = pos1.split('-')
            region1 = crm1
            start1 = int(start1)
            end1 = int(end1)
        except ValueError:
            region1 = coord1
            start1 = None
            end1 = None

    printime('Importing hic in %s format' % opts.format)
    if opts.format == 'matrix' or opts.format == 'text':
        with gzopen(opts.input) as f_thing:
            masked, chroms_gen, crm, beg, _, _ = read_file_header(f_thing)
        if not chroms_gen or (region1 and region1 not in chroms_gen):
            raise Exception(
                '''ERROR: Chromosome size not included in import file.
                Please include the chromosome sizes of the data that you want
                to import in the header of the file. Example:
                # CRM chr1    249250621''')
    elif opts.format == 'cooler':
        if is_cooler(opts.input, opts.reso if opts.reso > 1 else None):
            chroms_gen = parse_header(opts.input,
                                      opts.reso if opts.reso > 1 else None)
            if not chroms_gen or (region1 and region1 not in chroms_gen):
                raise Exception(
                    'ERROR: Chromosome size not included in import file.')
        else:
            raise Exception('''ERROR: The input file is not a cooler''')

    chroms = OrderedDict(
        (crm, int(chroms_gen[crm] // opts.reso) + 1) for crm in chroms_gen)
    sections = []
    if not region1:
        size = 0
        for crm in chroms:
            size += chroms[crm]
            sections.extend([(crm, i) for i in range(chroms[crm])])
    elif not start1:
        size = chroms[region1]
        sections.extend([(region1, i) for i in range(size)])
    else:
        #size = (end1 - start1)//opts.reso
        size = chroms[region1]
        sections.extend([
            (region1, i)
            for i in range(start1 // opts.reso, (end1 // opts.reso))
        ])
    dict_sec = dict([(j, i) for i, j in enumerate(sections)])
    bias_file = None
    badcol = {}
    if opts.format == 'text':
        with gzopen(opts.input) as f_thing:
            matrix = abc_reader(f_thing, size,
                                start1 // opts.reso if start1 else None)
        size_mat = size
    elif opts.format == 'matrix':
        with gzopen(opts.input) as in_f:
            matrix, size_mat, _, masked, _ = autoreader(in_f)
        if size != size_mat:
            raise Exception('''ERROR: The size of the specified region is
                            different from the data in the matrix''')
    elif opts.format == 'cooler':
        matrix, weights, size, header = parse_cooler(
            opts.input,
            opts.reso if opts.reso > 1 else None,
            normalized=True,
            raw_values=True)
        masked = {}
        size_mat = size
        if len(set(weights)) > 1:
            printime('Transforming cooler weights to biases')
            outdir_norm = path.join(opts.workdir, '04_normalization')
            mkdir(outdir_norm)

            bias_file = path.join(
                outdir_norm, 'biases_%s_%s.pickle' %
                (nicer(opts.reso).replace(' ', ''), param_hash))
            out = open(bias_file, 'wb')
            badcol.update((i, True) for i, m in enumerate(weights) if m == 0)
            dump(
                {
                    'biases':
                    dict((k, b if b > 0 else float('nan'))
                         for k, b in enumerate(weights)),
                    'decay': {},
                    'badcol':
                    badcol,
                    'resolution':
                    opts.reso
                }, out, HIGHEST_PROTOCOL)
            out.close()

    hic = HiC_data(matrix,
                   size_mat,
                   dict_sec=dict_sec,
                   chromosomes=chroms,
                   masked=masked,
                   resolution=opts.reso)

    #from pytadbit.mapping.analyze import hic_map
    #hic_map(hic, normalized=False, focus='chr1', show=True, cmap='viridis')

    printime('Creating BAM file')
    outbam = path.join(opts.workdir, '03_filtered_reads',
                       'intersection_%s' % param_hash)

    total_counts = create_BAMhic(hic,
                                 opts.cpus,
                                 outbam,
                                 chroms_gen,
                                 opts.reso,
                                 samtools=opts.samtools)

    finish_time = time.localtime()
    # save all job information to sqlite DB
    save_to_db(opts, total_counts, size_mat, bias_file, len(badcol),
               outbam + '.bam', launch_time, finish_time)
Example No. 39
0
def save_to_db(opts, cis_trans_N_D, cis_trans_N_d, cis_trans_n_D,
               cis_trans_n_d, a2, bad_columns_file, bias_file, inter_vs_gcoord,
               mreads, nbad_columns, ncolumns, intra_dir_nrm_fig,
               intra_dir_nrm_txt, inter_dir_nrm_fig, inter_dir_nrm_txt,
               genom_map_nrm_fig, genom_map_nrm_txt, intra_dir_raw_fig,
               intra_dir_raw_txt, inter_dir_raw_fig, inter_dir_raw_txt,
               genom_map_raw_fig, genom_map_raw_txt, pickle_path, launch_time,
               finish_time):
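    """
    Record a 'Normalize' job whose NORMALIZE_OUTPUTs row stores normalized
    and raw cis/trans ratios plus the decay slope, and register every output
    matrix and figure path in PATHs.
    """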
    if 'tmpdb' in opts and opts.tmpdb:
        # check lock
        while path.exists(path.join(opts.workdir, '__lock_db')):
            time.sleep(0.5)
        # close lock
        open(path.join(opts.workdir, '__lock_db'), 'a').close()
        # tmp file
        dbfile = opts.tmpdb
        try:  # to copy in case read1 was already mapped for example
            copyfile(path.join(opts.workdir, 'trace.db'), dbfile)
        except IOError:
            pass
    else:
        dbfile = path.join(opts.workdir, 'trace.db')
    con = lite.connect(dbfile)
    with con:
        cur = con.cursor()
        cur.execute("""SELECT name FROM sqlite_master WHERE
                       type='table' AND name='NORMALIZE_OUTPUTs'""")
        if not cur.fetchall():
            cur.execute("""
            create table NORMALIZE_OUTPUTs
               (Id integer primary key,
                JOBid int,
                Input int,
                N_columns int,
                N_filtered int,
                CisTrans_nrm_all real,
                CisTrans_nrm_out real,
                CisTrans_raw_all real,
                CisTrans_raw_out real,
                Slope_700kb_10Mb real,
                Resolution int,
                Factor int,
                unique (JOBid))""")
        try:
            parameters = digest_parameters(opts, get_md5=False)
            param_hash = digest_parameters(opts, get_md5=True)
            cur.execute(
                """
            insert into JOBs
            (Id  , Parameters, Launch_time, Finish_time, Type , Parameters_md5)
            values
            (NULL,       '%s',        '%s',        '%s', 'Normalize',           '%s')
            """ %
                (parameters, time.strftime("%d/%m/%Y %H:%M:%S", launch_time),
                 time.strftime("%d/%m/%Y %H:%M:%S", finish_time), param_hash))
        except lite.IntegrityError:
            pass
        jobid = get_jobid(cur)
        add_path(cur, pickle_path, 'PICKLE', jobid, opts.workdir)
        add_path(cur, bad_columns_file, 'BAD_COLUMNS', jobid, opts.workdir)
        add_path(cur, bias_file, 'BIASES', jobid, opts.workdir)
        add_path(cur, inter_vs_gcoord, 'FIGURE', jobid, opts.workdir)
        add_path(cur, mreads, '2D_BED', jobid, opts.workdir)
        # get pathid of input
        cur.execute("select id from paths where path = '%s'" %
                    (path.relpath(mreads, opts.workdir)))
        input_bed = cur.fetchall()[0][0]
        if intra_dir_nrm_fig:
            add_path(cur, intra_dir_nrm_fig, 'FIGURES', jobid, opts.workdir)
        if intra_dir_nrm_txt:
            add_path(cur, intra_dir_nrm_txt, 'NRM_MATRICES', jobid,
                     opts.workdir)
        if inter_dir_nrm_fig:
            add_path(cur, inter_dir_nrm_fig, 'FIGURES', jobid, opts.workdir)
        if inter_dir_nrm_txt:
            add_path(cur, inter_dir_nrm_txt, 'NRM_MATRICES', jobid,
                     opts.workdir)
        if genom_map_nrm_fig:
            add_path(cur, genom_map_nrm_fig, 'FIGURE', jobid, opts.workdir)
        if genom_map_nrm_txt:
            add_path(cur, genom_map_nrm_txt, 'NRM_MATRIX', jobid, opts.workdir)
        if intra_dir_raw_fig:
            add_path(cur, intra_dir_raw_fig, 'FIGURES', jobid, opts.workdir)
        if intra_dir_raw_txt:
            add_path(cur, intra_dir_raw_txt, 'RAW_MATRICES', jobid,
                     opts.workdir)
        if inter_dir_raw_fig:
            add_path(cur, inter_dir_raw_fig, 'FIGURES', jobid, opts.workdir)
        if inter_dir_raw_txt:
            add_path(cur, inter_dir_raw_txt, 'RAW_MATRICES', jobid,
                     opts.workdir)
        if genom_map_raw_fig:
            add_path(cur, genom_map_raw_fig, 'FIGURE', jobid, opts.workdir)
        if genom_map_raw_txt:
            add_path(cur, genom_map_raw_txt, 'RAW_MATRIX', jobid, opts.workdir)

        try:
            cur.execute("""
            insert into NORMALIZE_OUTPUTs
            (Id  , JOBid,     Input, N_columns,   N_filtered, CisTrans_nrm_all,   CisTrans_nrm_out,   CisTrans_raw_all,   CisTrans_raw_out, Slope_700kb_10Mb,   Resolution,      Factor)
            values
            (NULL,    %d,        %d,        %d,           %d,               %f,                 %f,                 %f,                 %f,               %f,           %d,          %f)
            """ % (jobid, input_bed, ncolumns, nbad_columns, cis_trans_N_D,
                   cis_trans_N_d, cis_trans_n_D, cis_trans_n_d, a2, opts.reso,
                   opts.factor))
        except lite.OperationalError:
            try:
                cur.execute("""
                insert into NORMALIZE_OUTPUTs
                (Id  , JOBid,     Input, N_columns,   N_filtered,  CisTrans_raw_all,   CisTrans_raw_out, Slope_700kb_10Mb,   Resolution,      Factor)
                values
                (NULL,    %d,        %d,        %d,           %d,                %f,                 %f,               %f,           %d,          %f)
                """ % (jobid, input_bed, ncolumns, nbad_columns, cis_trans_n_D,
                       cis_trans_n_d, a2, opts.reso, opts.factor))
            except lite.OperationalError:
                print 'WARNING: Normalized table not written!!!'

        print_db(cur, 'PATHs')
        print_db(cur, 'JOBs')
        try:
            print_db(cur, 'INTERSECTION_OUTPUTs')
            print_db(cur, 'MAPPED_INPUTs')
            print_db(cur, 'MAPPED_OUTPUTs')
            print_db(cur, 'PARSED_OUTPUTs')
        except lite.OperationalError:
            pass
        print_db(cur, 'FILTER_OUTPUTs')
        print_db(cur, 'NORMALIZE_OUTPUTs')
    if 'tmpdb' in opts and opts.tmpdb:
        # copy back file
        copyfile(dbfile, path.join(opts.workdir, 'trace.db'))
        remove(dbfile)
    # release lock
    try:
        remove(path.join(opts.workdir, '__lock_db'))
    except OSError:
        pass
Example No. 40
0
def save_to_db(opts, outfiles, launch_time, finish_time):
    # write little DB to keep track of processes and options
    con = lite.connect(path.join(opts.workdir, 'trace.db'))
    with con:
        # check if table exists
        cur = con.cursor()
        cur.execute("""SELECT name FROM sqlite_master WHERE
                       type='table' AND name='MAPPED_INPUTs'""")
        if not cur.fetchall():
            cur.execute("""
            create table PATHs
               (Id integer primary key,
                JOBid int, Path text, Type text,
                unique (Path))""")
            cur.execute("""
            create table JOBs
               (Id integer primary key,
                Parameters text,
                Launch_time text,
                Finish_time text,
                Type text,
                Parameters_md5 text,
                unique (Parameters_md5))""")
            cur.execute("""
            create table MAPPED_INPUTs
               (Id integer primary key,
                PATHid int,
                Entries int,
                Trim text,
                Frag text,
                Read int,
                Enzyme text,
                WRKDIRid int,
                MAPPED_OUTPUTid int,
                INDEXid int,
                unique (PATHid,Entries,Read,Enzyme,WRKDIRid,MAPPED_OUTPUTid,INDEXid))""")

        try:
            parameters = digest_parameters(opts, get_md5=False)
            param_hash = digest_parameters(opts, get_md5=True)
            cur.execute("""
    insert into JOBs
     (Id  , Parameters, Launch_time, Finish_time, Type , Parameters_md5)
    values
     (NULL,       '%s',        '%s',        '%s', 'Map',           '%s')
     """ % (parameters,
            time.strftime("%d/%m/%Y %H:%M:%S", launch_time),
            time.strftime("%d/%m/%Y %H:%M:%S", finish_time), param_hash))
        except lite.IntegrityError:
            pass
        jobid = get_jobid(cur)
        add_path(cur, opts.workdir, 'WORKDIR', jobid)
        add_path(cur, opts.fastq, 'MAPPED_FASTQ', jobid, opts.workdir)
        add_path(cur, opts.index, 'INDEX', jobid, opts.workdir)
        for i, (out, num) in enumerate(outfiles):
            try:
                window = opts.windows[i]
            except IndexError:
                window = opts.windows[-1]
            except TypeError:
                window = 'None'
            add_path(cur, out, 'SAM/MAP', jobid, opts.workdir)
            frag = ('none' if opts.iterative else 'frag' if i==len(outfiles) - 1
                    else 'full')
            try:
                cur.execute("""
    insert into MAPPED_INPUTs
     (Id  , PATHid, Entries, Trim, Frag, Read, Enzyme, WRKDIRid, MAPPED_OUTPUTid, INDEXid)
    values
     (NULL,      %d,     %d, '%s', '%s',   %d,   '%s',       %d,    %d,      %d)
     """ % (get_path_id(cur, opts.fastq, opts.workdir), num, window, frag,
            opts.read, opts.renz, get_path_id(cur, opts.workdir),
            get_path_id(cur, out, opts.workdir),
            get_path_id(cur, opts.index, opts.workdir)))
            except lite.IntegrityError:
                pass
        print_db(cur, 'MAPPED_INPUTs')
        print_db(cur, 'PATHs')
        print_db(cur, 'JOBs')
Example No. 41
0
def run(opts):
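    """
    Older, Python 2 variant of the normalization runner: same overall flow
    as the version above, but the mappability bedGraph is parsed inline,
    bin by bin, rather than through parse_mappability_bedGraph.
    """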
    check_options(opts)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)
    if opts.bam:
        mreads = path.realpath(opts.bam)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))

    filter_exclude = opts.filter

    outdir = path.join(opts.workdir, '04_normalization')
    mkdir(outdir)

    mappability = gc_content = n_rsites = None
    if opts.normalization == 'oneD':
        if not opts.fasta:
            raise Exception(
                'ERROR: missing path to FASTA for oneD normalization')
        if not opts.renz:
            raise Exception(
                'ERROR: missing restriction enzyme name for oneD normalization'
            )
        if not opts.mappability:
            raise Exception(
                'ERROR: missing path to mappability for oneD normalization')
        bamfile = AlignmentFile(mreads, 'rb')
        refs = bamfile.references
        bamfile.close()

        # get genome sequence ~1 min
        printime('  - parsing FASTA')
        genome = parse_fasta(opts.fasta, verbose=False)

        fas = set(genome.keys())
        bam = set(refs)
        if fas - bam:
            print 'WARNING: %d extra chromosomes in FASTA (removing them)' % (
                len(fas - bam))
            if len(fas - bam) <= 50:
                print '\n'.join([('  - ' + c) for c in (fas - bam)])
        if bam - fas:
            txt = ('\n'.join([('  - ' + c)
                              for c in (bam -
                                        fas)]) if len(bam - fas) <= 50 else '')
            raise Exception(
                'ERROR: %d extra chromosomes in BAM (remove them):\n%s\n' %
                (len(bam - fas), txt))
        refs = [crm for crm in refs if crm in genome]
        if len(refs) == 0:
            raise Exception(
                "ERROR: chromosomes in FASTA differ from the ones in BAM")

        # get mappability ~2 min
        printime('  - Parsing mappability')
        fh = open(opts.mappability)
        mappability = dict((c, []) for c in refs)
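        # single pass over the bedGraph: for each opts.reso-sized bin,
        # accumulate the length-weighted mappability of overlapping
        # intervals, then divide by the bin size to get a mean per bin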
        line = fh.next()
        crmM, begM, endM, val = line.split()
        crm = crmM
        if crmM not in mappability:
            print('     skipping %s' % crmM)
            while crmM not in mappability:
                line = fh.next()
                crmM, begM, endM, val = line.split()
                crm = crmM
        while any(not mappability[c] for c in mappability):
            for begB in xrange(0, len(genome[crmM]), opts.reso):
                endB = begB + opts.reso
                tmp = 0
                try:
                    while True:
                        crmM, begM, endM, val = line.split()
                        if crm != crmM:
                            try:
                                while crmM not in refs:
                                    line = fh.next()
                                    crmM, _ = line.split('\t', 1)
                            except StopIteration:
                                pass
                            break
                        begM = int(begM)
                        endM = int(endM)
                        if endM > endB:
                            weight = endB - begM
                            if weight >= 0:
                                tmp += weight * float(val)
                            break
                        weight = endM - (begM if begM > begB else begB)
                        if weight < 0:
                            break
                        tmp += weight * float(val)
                        line = fh.next()
                except StopIteration:
                    pass
                mappability[crm].append(tmp / opts.reso)
                crm = crmM
        mappability = reduce(lambda x, y: x + y,
                             (mappability[c] for c in refs))

        printime('  - Computing GC content per bin (removing Ns)')
        gc_content = get_gc_content(genome,
                                    opts.reso,
                                    chromosomes=refs,
                                    n_cpus=opts.cpus)
        # compute r_sites ~30 sec
        # TODO: read from DB
        printime('  - Computing number of RE sites per bin (+/- 200 bp)')
        n_rsites = []
        re_site = RESTRICTION_ENZYMES[opts.renz].replace('|', '')
        for crm in refs:
            for pos in xrange(200, len(genome[crm]) + 200, opts.reso):
                seq = genome[crm][pos - 200:pos + opts.reso + 200]
                n_rsites.append(seq.count(re_site))

        ## CHECK TO BE REMOVED
        # out = open('tmp_mappability.txt', 'w')
        # i = 0
        # for crm in refs:
        #     for pos in xrange(len(genome[crm]) / opts.reso + 1):
        #         out.write('%s\t%d\t%d\t%f\n' % (crm, pos * opts.reso, pos * opts.reso + opts.reso, mappability[i]))
        #         i += 1
        # out.close()
    biases, decay, badcol, raw_cisprc, norm_cisprc = read_bam(
        mreads,
        filter_exclude,
        opts.reso,
        min_count=opts.min_count,
        sigma=2,
        factor=1,
        outdir=outdir,
        extra_out=param_hash,
        ncpus=opts.cpus,
        normalization=opts.normalization,
        mappability=mappability,
        cg_content=gc_content,
        n_rsites=n_rsites,
        min_perc=opts.min_perc,
        max_perc=opts.max_perc,
        normalize_only=opts.normalize_only,
        max_njobs=opts.max_njobs,
        extra_bads=opts.badcols)

    bad_col_image = path.join(
        outdir, 'filtered_bins_%s_%s.png' %
        (nicer(opts.reso).replace(' ', ''), param_hash))

    inter_vs_gcoord = path.join(
        opts.workdir, '04_normalization',
        'interactions_vs_genomic-coords_%s_%s.png' %
        (opts.reso, param_hash))

    # get and plot decay
    if not opts.normalize_only:
        printime('  - Computing interaction decay vs genomic distance')
        (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
            decay,
            max_diff=10000,
            resolution=opts.reso,
            normalized=not opts.filter_only,
            savefig=inter_vs_gcoord)

        print('    -> Decay slope 0.7-10 Mb\t%s' % a2)
    else:
        a2 = 0.

    printime('  - Saving biases and badcol columns')
    # biases
    bias_file = path.join(
        outdir, 'biases_%s_%s.pickle' %
        (nicer(opts.reso).replace(' ', ''), param_hash))
    out = open(bias_file, 'w')

    dump(
        {
            'biases': biases,
            'decay': decay,
            'badcol': badcol,
            'resolution': opts.reso
        }, out)
    out.close()

    finish_time = time.localtime()

    try:
        save_to_db(opts, bias_file, mreads, bad_col_image, len(badcol),
                   len(biases), raw_cisprc, norm_cisprc, inter_vs_gcoord, a2,
                   opts.filter, launch_time, finish_time)
    except Exception:
        # release lock anyway
        print_exc()
        try:
            remove(path.join(opts.workdir, '__lock_db'))
        except OSError:
            pass
        exit(1)
Example No. 42
0
def save_to_db(opts, count, multiples, reads, mreads, n_valid_pairs, masked,
               hist_path, median, max_f, mad, launch_time, finish_time):
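    """
    Record the read-filtering step: a 'Filter' JOBs row, an
    INTERSECTION_OUTPUTs row with interaction counts and fragment-size
    statistics, and one FILTER_OUTPUTs row per filter plus one for the
    valid pairs (rows are re-inserted when --force is given).
    """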
    con = lite.connect(path.join(opts.workdir, 'trace.db'))
    with con:
        cur = con.cursor()
        cur.execute("""SELECT name FROM sqlite_master WHERE
                       type='table' AND name='INTERSECTION_OUTPUTs'""")
        if not cur.fetchall():
            cur.execute("""
        create table INTERSECTION_OUTPUTs
           (Id integer primary key,
            PATHid int,
            Total_interactions int,
            Multiple_interactions text,
            Median_fragment_length,
            MAD_fragment_length,
            Max_fragment_length,
            unique (PATHid))""")
            cur.execute("""
        create table FILTER_OUTPUTs
           (Id integer primary key,
            PATHid int,
            Name text,
            Count int,
            JOBid int,
            unique (PATHid))""")
        try:
            parameters = digest_parameters(opts, get_md5=False)
            param_hash = digest_parameters(opts, get_md5=True )            
            cur.execute("""
    insert into JOBs
     (Id  , Parameters, Launch_time, Finish_time,    Type, Parameters_md5)
    values
     (NULL,       '%s',        '%s',        '%s', 'Filter',           '%s')
     """ % (parameters,
            time.strftime("%d/%m/%Y %H:%M:%S", launch_time),
            time.strftime("%d/%m/%Y %H:%M:%S", finish_time), param_hash))
        except lite.IntegrityError:
            pass

        jobid = get_jobid(cur)
        
        add_path(cur, mreads, '2D_BED', jobid, opts.workdir)
        add_path(cur,  reads, '2D_BED', jobid, opts.workdir)
        add_path(cur, hist_path, 'FIGURE', jobid, opts.workdir)
        try:
            cur.execute("""
            insert into INTERSECTION_OUTPUTs
            (Id  , PATHid, Total_interactions, Multiple_interactions, Median_fragment_length, MAD_fragment_length, Max_fragment_length)
            values
            (NULL,    %d,                  %d,                  '%s',                     %d,                  %d,                  %d)
            """ % (get_path_id(cur, mreads, opts.workdir),
                   count, ' '.join(['%s:%d' % (k, multiples[k])
                                    for k in sorted(multiples)]),
                   median, mad, max_f))
        except lite.IntegrityError:
            print 'WARNING: already filtered'
            if opts.force:
                cur.execute(
                    'delete from INTERSECTION_OUTPUTs where PATHid = %d' % (
                        get_path_id(cur, mreads, opts.workdir)))
                cur.execute("""
                insert into INTERSECTION_OUTPUTs
                (Id  , PATHid, Total_interactions, Multiple_interactions, Median_fragment_length, MAD_fragment_length, Max_fragment_length)
                values
                (NULL,    %d,                  %d,                  '%s',                     %d,                  %d,                  %d)
                """ % (get_path_id(cur, mreads, opts.workdir),
                       count, ' '.join(['%s:%d' % (k, multiples[k])
                                        for k in sorted(multiples)]),
                       median, mad, max_f))
        for f in masked:
            add_path(cur, masked[f]['fnam'], 'FILTER', jobid, opts.workdir)
            try:
                cur.execute("""
            insert into FILTER_OUTPUTs
            (Id  , PATHid, Name, Count, JOBid)
            values
            (NULL,    %d,     '%s',      '%s', %d)
                """ % (get_path_id(cur, masked[f]['fnam'], opts.workdir),
                       masked[f]['name'], masked[f]['reads'], jobid))
            except lite.IntegrityError:
                print 'WARNING: already filtered'
                if opts.force:
                    cur.execute(
                        'delete from FILTER_OUTPUTs where PATHid = %d' % (
                            get_path_id(cur, masked[f]['fnam'], opts.workdir)))
                    cur.execute("""
                insert into FILTER_OUTPUTs
                (Id  , PATHid, Name, Count, JOBid)
                values
                (NULL,    %d,     '%s',      '%s', %d)
                    """ % (get_path_id(cur, masked[f]['fnam'], opts.workdir),
                           masked[f]['name'], masked[f]['reads'], jobid))
        try:
            cur.execute("""
        insert into FILTER_OUTPUTs
        (Id  , PATHid, Name, Count, JOBid)
        values
        (NULL,    %d,     '%s',      '%s', %d)
            """ % (get_path_id(cur, mreads, opts.workdir),
                   'valid-pairs', n_valid_pairs, jobid))
        except lite.IntegrityError:
            print 'WARNING: already filtered'
            if opts.force:
                cur.execute(
                    'delete from FILTER_OUTPUTs where PATHid = %d' % (
                        get_path_id(cur, mreads, opts.workdir)))
                cur.execute("""
                insert into FILTER_OUTPUTs
                (Id  , PATHid, Name, Count, JOBid)
                values
                (NULL,    %d,     '%s',      '%s', %d)
                """ % (get_path_id(cur, mreads, opts.workdir),
                       'valid-pairs', n_valid_pairs, jobid))
        print_db(cur, 'MAPPED_INPUTs')
        print_db(cur, 'PATHs')
        print_db(cur, 'MAPPED_OUTPUTs')
        print_db(cur, 'PARSED_OUTPUTs')
        print_db(cur, 'JOBs')
        print_db(cur, 'INTERSECTION_OUTPUTs')
        print_db(cur, 'FILTER_OUTPUTs')
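The insert, warn, and retry-under-`--force` sequence above appears three times; a minimal sketch of a helper factoring out that pattern (the helper name and arguments are illustrative, not TADbit API; assumes `import sqlite3 as lite` as in the examples):

def insert_or_force(cur, insert_sql, delete_sql, force):
    # hypothetical helper: try the insert and, if the row already exists
    # (lite.IntegrityError), optionally delete it and retry under --force
    try:
        cur.execute(insert_sql)
    except lite.IntegrityError:
        print 'WARNING: already filtered'
        if force:
            cur.execute(delete_sql)
            cur.execute(insert_sql)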
Example No. 43
0
def save_to_db(opts, cis_trans_N_D, cis_trans_N_d, cis_trans_n_D,
               cis_trans_n_d, a2, bad_columns_file, bias_file, inter_vs_gcoord,
               mreads, intra_dir_nrm_fig, intra_dir_nrm_txt, inter_dir_nrm_fig,
               inter_dir_nrm_txt, genom_map_nrm_fig, genom_map_nrm_txt,
               intra_dir_raw_fig, intra_dir_raw_txt, inter_dir_raw_fig,
               inter_dir_raw_txt, genom_map_raw_fig, genom_map_raw_txt,
               launch_time, finish_time):
    con = lite.connect(path.join(opts.workdir, 'trace.db'))
    with con:
        cur = con.cursor()
        cur.execute("""SELECT name FROM sqlite_master WHERE
                       type='table' AND name='NORMALIZE_OUTPUTs'""")
        if not cur.fetchall():
            cur.execute("""
            create table NORMALIZE_OUTPUTs
               (Id integer primary key,
                JOBid int,
                Input int,
                CisTrans_nrm_all real,
                CisTrans_nrm_out real,
                CisTrans_raw_all real,
                CisTrans_raw_out real,
                Slope_700kb_10Mb real,
                Resolution int,
                Factor int,
                unique (JOBid))""")
        try:
            parameters = digest_parameters(opts, get_md5=False)
            param_hash = digest_parameters(opts, get_md5=True)
            cur.execute(
                """
            insert into JOBs
            (Id  , Parameters, Launch_time, Finish_time, Type , Parameters_md5)
            values
            (NULL,       '%s',        '%s',        '%s', 'Normalize',           '%s')
            """ %
                (parameters, time.strftime("%d/%m/%Y %H:%M:%S", launch_time),
                 time.strftime("%d/%m/%Y %H:%M:%S", finish_time), param_hash))
        except lite.IntegrityError:
            pass
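        # fetch the id of the normalization job just registered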
        jobid = get_jobid(cur)
        add_path(cur, bad_columns_file, 'BAD_COLUMNS', jobid, opts.workdir)
        add_path(cur, bias_file, 'BIASES', jobid, opts.workdir)
        add_path(cur, inter_vs_gcoord, 'FIGURE', jobid, opts.workdir)
        add_path(cur, mreads, '2D_BED', jobid, opts.workdir)
        # get pathid of input
        cur.execute("select id from paths where path = '%s'" %
                    (path.relpath(mreads, opts.workdir)))
        input_bed = cur.fetchall()[0][0]
        if intra_dir_nrm_fig:
            add_path(cur, intra_dir_nrm_fig, 'FIGURES', jobid, opts.workdir)
        if intra_dir_nrm_txt:
            add_path(cur, intra_dir_nrm_txt, 'NRM_MATRICES', jobid,
                     opts.workdir)
        if inter_dir_nrm_fig:
            add_path(cur, inter_dir_nrm_fig, 'FIGURES', jobid, opts.workdir)
        if inter_dir_nrm_txt:
            add_path(cur, inter_dir_nrm_txt, 'NRM_MATRICES', jobid,
                     opts.workdir)
        if genom_map_nrm_fig:
            add_path(cur, genom_map_nrm_fig, 'FIGURE', jobid, opts.workdir)
        if genom_map_nrm_txt:
            add_path(cur, genom_map_nrm_txt, 'NRM_MATRIX', jobid, opts.workdir)
        if intra_dir_raw_fig:
            add_path(cur, intra_dir_raw_fig, 'FIGURES', jobid, opts.workdir)
        if intra_dir_raw_txt:
            add_path(cur, intra_dir_raw_txt, 'RAW_MATRICES', jobid,
                     opts.workdir)
        if inter_dir_raw_fig:
            add_path(cur, inter_dir_raw_fig, 'FIGURES', jobid, opts.workdir)
        if inter_dir_raw_txt:
            add_path(cur, inter_dir_raw_txt, 'RAW_MATRICES', jobid,
                     opts.workdir)
        if genom_map_raw_fig:
            add_path(cur, genom_map_raw_fig, 'FIGURE', jobid, opts.workdir)
        if genom_map_raw_txt:
            add_path(cur, genom_map_raw_txt, 'RAW_MATRIX', jobid, opts.workdir)

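        # store the cis/trans ratios, the 700 kb-10 Mb decay slope, the
        # resolution and the normalization factor for this job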
        cur.execute("""
        insert into NORMALIZE_OUTPUTs
        (Id  , JOBid,     Input, CisTrans_nrm_all,   CisTrans_nrm_out,   CisTrans_raw_all,   CisTrans_raw_out, Slope_700kb_10Mb,   Resolution,      Factor)
        values
        (NULL,    %d,        %d,               %f,                 %f,                 %f,                 %f,               %f,           %d,          %f)
        """ % (jobid, input_bed, cis_trans_N_D, cis_trans_N_d, cis_trans_n_D,
               cis_trans_n_d, a2, opts.reso, opts.factor))
        print_db(cur, 'MAPPED_INPUTs')
        print_db(cur, 'PATHs')
        print_db(cur, 'MAPPED_OUTPUTs')
        print_db(cur, 'PARSED_OUTPUTs')
        print_db(cur, 'JOBs')
        print_db(cur, 'INTERSECTION_OUTPUTs')
        print_db(cur, 'FILTER_OUTPUTs')
        print_db(cur, 'NORMALIZE_OUTPUTs')
Example No. 44
0
def save_to_db(opts, cis_trans_N_D, cis_trans_N_d, cis_trans_n_D, cis_trans_n_d,
               a2, bad_columns_file, bias_file, inter_vs_gcoord, mreads,
               nbad_columns, ncolumns,
               intra_dir_nrm_fig, intra_dir_nrm_txt,
               inter_dir_nrm_fig, inter_dir_nrm_txt,
               genom_map_nrm_fig, genom_map_nrm_txt,
               intra_dir_raw_fig, intra_dir_raw_txt,
               inter_dir_raw_fig, inter_dir_raw_txt,
               genom_map_raw_fig, genom_map_raw_txt,
               pickle_path, launch_time, finish_time):
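    # when a temporary database is requested, serialize access to trace.db
    # through a lock file and operate on a local copy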
    if 'tmpdb' in opts and opts.tmpdb:
        # check lock
        while path.exists(path.join(opts.workdir, '__lock_db')):
            time.sleep(0.5)
        # close lock
        open(path.join(opts.workdir, '__lock_db'), 'a').close()
        # tmp file
        dbfile = opts.tmpdb
        try: # to copy in case read1 was already mapped for example
            copyfile(path.join(opts.workdir, 'trace.db'), dbfile)
        except IOError:
            pass
    else:
        dbfile = path.join(opts.workdir, 'trace.db')
    con = lite.connect(dbfile)
    with con:
        cur = con.cursor()
        cur.execute("""SELECT name FROM sqlite_master WHERE
                       type='table' AND name='NORMALIZE_OUTPUTs'""")
        if not cur.fetchall():
            cur.execute("""
            create table NORMALIZE_OUTPUTs
               (Id integer primary key,
                JOBid int,
                Input int,
                N_columns int,
                N_filtered int,
                CisTrans_nrm_all real,
                CisTrans_nrm_out real,
                CisTrans_raw_all real,
                CisTrans_raw_out real,
                Slope_700kb_10Mb real,
                Resolution int,
                Factor int,
                unique (JOBid))""")
        try:
            parameters = digest_parameters(opts, get_md5=False)
            param_hash = digest_parameters(opts, get_md5=True)
            cur.execute("""
            insert into JOBs
            (Id  , Parameters, Launch_time, Finish_time, Type , Parameters_md5)
            values
            (NULL,       '%s',        '%s',        '%s', 'Normalize',           '%s')
            """ % (parameters,
                   time.strftime("%d/%m/%Y %H:%M:%S", launch_time),
                   time.strftime("%d/%m/%Y %H:%M:%S", finish_time), param_hash))
        except lite.IntegrityError:
            pass
        jobid = get_jobid(cur)
        add_path(cur, pickle_path     , 'PICKLE'     , jobid, opts.workdir)
        add_path(cur, bad_columns_file, 'BAD_COLUMNS', jobid, opts.workdir)
        add_path(cur, bias_file       , 'BIASES'     , jobid, opts.workdir)
        add_path(cur, inter_vs_gcoord , 'FIGURE'     , jobid, opts.workdir)
        add_path(cur, mreads          , '2D_BED'     , jobid, opts.workdir)
        # get pathid of input
        cur.execute("select id from paths where path = '%s'" % (path.relpath(mreads, opts.workdir)))
        input_bed = cur.fetchall()[0][0]
        if intra_dir_nrm_fig:
            add_path(cur, intra_dir_nrm_fig, 'FIGURES', jobid, opts.workdir)
        if intra_dir_nrm_txt:
            add_path(cur, intra_dir_nrm_txt, 'NRM_MATRICES', jobid, opts.workdir)
        if inter_dir_nrm_fig:
            add_path(cur, inter_dir_nrm_fig, 'FIGURES', jobid, opts.workdir)
        if inter_dir_nrm_txt:
            add_path(cur, inter_dir_nrm_txt, 'NRM_MATRICES', jobid, opts.workdir)
        if genom_map_nrm_fig:
            add_path(cur, genom_map_nrm_fig, 'FIGURE', jobid, opts.workdir)
        if genom_map_nrm_txt:
            add_path(cur, genom_map_nrm_txt, 'NRM_MATRIX', jobid, opts.workdir)
        if intra_dir_raw_fig:
            add_path(cur, intra_dir_raw_fig, 'FIGURES', jobid, opts.workdir)
        if intra_dir_raw_txt:
            add_path(cur, intra_dir_raw_txt, 'RAW_MATRICES', jobid, opts.workdir)
        if inter_dir_raw_fig:
            add_path(cur, inter_dir_raw_fig, 'FIGURES', jobid, opts.workdir)
        if inter_dir_raw_txt:
            add_path(cur, inter_dir_raw_txt, 'RAW_MATRICES', jobid, opts.workdir)
        if genom_map_raw_fig:
            add_path(cur, genom_map_raw_fig, 'FIGURE', jobid, opts.workdir)
        if genom_map_raw_txt:
            add_path(cur, genom_map_raw_txt, 'RAW_MATRIX', jobid, opts.workdir)

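        # an OperationalError on the full insert most likely means the table
        # was created by an older version without the CisTrans_nrm columns,
        # so retry with the reduced column set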
        try:
            cur.execute("""
            insert into NORMALIZE_OUTPUTs
            (Id  , JOBid,     Input, N_columns,   N_filtered, CisTrans_nrm_all,   CisTrans_nrm_out,   CisTrans_raw_all,   CisTrans_raw_out, Slope_700kb_10Mb,   Resolution,      Factor)
            values
            (NULL,    %d,        %d,        %d,           %d,               %f,                 %f,                 %f,                 %f,               %f,           %d,          %f)
            """ % (jobid, input_bed,  ncolumns, nbad_columns,    cis_trans_N_D,      cis_trans_N_d,      cis_trans_n_D,      cis_trans_n_d,               a2,    opts.reso, opts.factor))
        except lite.OperationalError:
            try:
                cur.execute("""
                insert into NORMALIZE_OUTPUTs
                (Id  , JOBid,     Input, N_columns,   N_filtered,  CisTrans_raw_all,   CisTrans_raw_out, Slope_700kb_10Mb,   Resolution,      Factor)
                values
                (NULL,    %d,        %d,        %d,           %d,                %f,                 %f,               %f,           %d,          %f)
                """ % (jobid, input_bed,  ncolumns, nbad_columns,     cis_trans_n_D,      cis_trans_n_d,               a2,    opts.reso, opts.factor))
            except lite.OperationalError:
                print 'WARNING: normalized table not written!'
            
        print_db(cur, 'PATHs')
        print_db(cur, 'JOBs')
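        # these tables only exist when the corresponding pipeline steps were
        # run against this database, hence the OperationalError guard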
        try:
            print_db(cur, 'INTERSECTION_OUTPUTs')
            print_db(cur, 'MAPPED_INPUTs')
            print_db(cur, 'MAPPED_OUTPUTs')
            print_db(cur, 'PARSED_OUTPUTs')
        except lite.OperationalError:
            pass
        print_db(cur, 'FILTER_OUTPUTs')
        print_db(cur, 'NORMALIZE_OUTPUTs')
    if 'tmpdb' in opts and opts.tmpdb:
        # copy back file
        copyfile(dbfile, path.join(opts.workdir, 'trace.db'))
        remove(dbfile)
    # release lock
    try:
        remove(path.join(opts.workdir, '__lock_db'))
    except OSError:
        pass
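All inserts above build SQL by `%` string interpolation; sqlite3 can also bind values with `?` placeholders, which sidesteps quoting and escaping. A minimal sketch of the JOBs insert written that way (same table layout as above; the function name is illustrative, not TADbit API):

import sqlite3 as lite
import time

def register_job(dbfile, parameters, param_hash, launch_time, finish_time):
    # hypothetical helper: record a 'Normalize' job with bound parameters
    con = lite.connect(dbfile)
    with con:
        cur = con.cursor()
        try:
            cur.execute("""
            insert into JOBs
            (Id, Parameters, Launch_time, Finish_time, Type, Parameters_md5)
            values (NULL, ?, ?, ?, 'Normalize', ?)""",
                        (parameters,
                         time.strftime("%d/%m/%Y %H:%M:%S", launch_time),
                         time.strftime("%d/%m/%Y %H:%M:%S", finish_time),
                         param_hash))
        except lite.IntegrityError:
            pass  # a job with identical parameters is already registered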