Esempio n. 1
0
def merge_bams(bam1,
               bam2,
               output_bam,
               cpus=cpu_count(),
               samtools='samtools',
               verbose=True):
    """
    Merge two BAM files into one with samtools, then index the result.

    :param bam1: path to first file
    :param bam2: path to second file
    :param output_bam: path of the merged BAM file written by samtools
    :param cpus: number of threads passed to samtools with ``-@`` (the
       default is computed once, when the function is defined)
    :param samtools samtools: name of, or path to, the samtools binary
    :param True verbose: print progress messages

    :raises Exception: if the samtools binary can not be located
    """

    requested = samtools
    samtools = which(requested)
    if not samtools:
        # fail with a readable message instead of the TypeError raised when
        # None is concatenated with the command string below
        raise Exception('\n\nERROR: %s binary not found' % requested)
    # NOTE(review): paths are interpolated unquoted into a shell command, so
    # file names with spaces or shell metacharacters will break it.
    if verbose:
        print('  - Mergeing experiments')
    system(samtools + ' merge -@ %d %s %s %s' % (cpus, output_bam, bam1, bam2))
    if verbose:
        print('  - Indexing new BAM file')
    # the binary is run without arguments and its stderr is scanned for a
    # "Version" line; threaded indexing is only requested from 1.3.1 on
    usage = Popen(samtools, stderr=PIPE,
                  universal_newlines=True).communicate()[1]
    versions = [fields[1]
                for fields in (line.split() for line in usage.split('\n')
                               if 'Version' in line)
                if len(fields) > 1]
    threaded = bool(versions) and (LooseVersion(versions[0]) >=
                                   LooseVersion('1.3.1'))
    if threaded:
        system(samtools + ' index -@ %d %s' % (cpus, output_bam))
    else:
        # old or unidentified samtools: fall back to single-threaded indexing
        system(samtools + ' index %s' % (output_bam))
Esempio n. 2
0
def oneD(tmp_dir='.', form='tot ~ s(map) + s(cg) + s(res)', p_fit=None,
         seed=1, **kwargs):
    """
    Normalizes according to oneD normalization that takes into account the GC
    content, mappability and the number of restriction sites per bin.

    Vidal, E., le Dily, F., Quilez, J., Stadhouders, R., Cuartero, Y., Graf, T., Marti-Renom, Marc A., Beato, M., Filion, G. (2017).
    OneD: increasing reproducibility of Hi-C Samples with abnormal karyotypes.
    bioRxiv. http://doi.org/10.1101/148254

    The input columns are written to ``tot.csv`` inside *tmp_dir*, the
    external ``normalize_oneD.R`` script is run through ``Rscript`` and the
    result is read back from ``biases.csv`` in the same directory.

    :param . tmp_dir: directory where the intermediate CSV files are written
    :param form: string representing an R Formulae
    :param None p_fit: proportion of data to be used in fitting (for very
       large datasets). Number between 0 and 1
    :param 1 seed: values above 1 are forwarded to the R script, 1 is not
       forwarded, values below 1 are rejected
    :param kwargs: dictionary with keys present in the formula and values being
       lists of equal length ('tot' is mandatory, it defines the number of
       rows written).
       for example:
           oneD(tot=[1,2,3...],
                map=[1,2,3...],
                res=[1,2,3...],
                cg =[1,2,3...])

    :raises Exception: if the seed is below 1, if the R script can not be
       located, or if Rscript exits with a non-zero status

    :returns: biases to use to normalize the raw matrix of interactions, as
       loaded by ``genfromtxt``
    """
    # validate cheap inputs before writing anything to disk
    if seed < 1:
        raise Exception(('ERROR: seed number (currently: %d) should be an '
                         'interger greater than 1 (because of R)') % (seed))

    script_path = which('normalize_oneD.R')
    if not script_path:
        # otherwise None would reach Popen and fail with an obscure TypeError
        raise Exception('\n\nERROR: normalize_oneD.R script not found')
    proc_par = ["Rscript", "--vanilla", script_path]

    in_csv = path.join(tmp_dir, 'tot.csv')
    proc_par.append(in_csv)

    # one column per keyword (alphabetical order), one row per bin
    headers = sorted(kwargs.keys())
    with open(in_csv, 'w') as csvfile:
        csvfile.write(','.join(headers) + '\n')
        csvfile.write('\n'.join(','.join(str(kwargs[k][i]) for k in headers)
                                for i in range(len(kwargs['tot']))) + '\n')

    out_csv = path.join(tmp_dir, 'biases.csv')
    proc_par.append(out_csv)

    proc_par.append('"%s"' % (form))

    # NOTE(review): p_fit and seed are optional positional arguments; when
    # p_fit is unset but seed > 1 the seed lands in the slot p_fit would
    # occupy -- confirm how normalize_oneD.R reads its arguments.
    if p_fit:
        proc_par.append(str(p_fit))

    if seed > 1:
        proc_par.append(str(seed))

    proc = Popen(proc_par, stderr=PIPE, universal_newlines=True)
    err = proc.stderr.readlines()
    proc.stderr.close()
    status = proc.wait()  # reap the child and collect its exit status
    print('\n'.join(err))
    if status:
        # do not silently load a stale biases.csv left by a previous run
        raise Exception('ERROR: normalize_oneD.R failed (exit status %d)'
                        % status)

    biases_oneD = genfromtxt(out_csv, delimiter=',', dtype=float)

    return biases_oneD
Esempio n. 3
0
def oneD(tmp_dir='.', form='tot ~ s(map) + s(cg) + s(res)', p_fit=None,
         seed=1, **kwargs):
    """
    Normalizes according to oneD normalization that takes into account the GC
    content, mappability and the number of restriction sites per bin.

    Vidal, E., le Dily, F., Quilez, J., Stadhouders, R., Cuartero, Y., Graf, T., Marti-Renom, Marc A., Beato, M., Filion, G. (2017).
    OneD: increasing reproducibility of Hi-C Samples with abnormal karyotypes.
    bioRxiv. http://doi.org/10.1101/148254

    Writes the input columns to ``tot.csv`` in *tmp_dir*, runs the external
    ``normalize_oneD.R`` script with ``Rscript`` and loads ``biases.csv`` from
    the same directory. The code runs unchanged on Python 2 and Python 3.

    :param . tmp_dir: directory where the intermediate CSV files are written
    :param form: string representing an R Formulae
    :param None p_fit: proportion of data to be used in fitting (for very
       large datasets). Number between 0 and 1
    :param 1 seed: values above 1 are forwarded to the R script, 1 is not
       forwarded, values below 1 are rejected
    :param kwargs: dictionary with keys present in the formula and values being
       lists of equal length ('tot' is mandatory, it defines the number of
       rows written).
       for example:
           oneD(tot=[1,2,3...],
                map=[1,2,3...],
                res=[1,2,3...],
                cg =[1,2,3...])

    :raises Exception: if the seed is below 1, if the R script can not be
       located, or if Rscript exits with a non-zero status

    :returns: biases to use to normalize the raw matrix of interactions, as
       loaded by ``genfromtxt``
    """
    # reject a bad seed before any file is created
    if seed < 1:
        raise Exception(('ERROR: seed number (currently: %d) should be an '
                         'interger greater than 1 (because of R)') % (seed))

    script_path = which('normalize_oneD.R')
    if not script_path:
        # a missing script would otherwise only show up as a TypeError
        # raised from inside Popen
        raise Exception('\n\nERROR: normalize_oneD.R script not found')

    in_csv = path.join(tmp_dir, 'tot.csv')
    out_csv = path.join(tmp_dir, 'biases.csv')

    # columns follow the alphabetical order of the keywords
    headers = sorted(kwargs.keys())
    nrows = len(kwargs['tot'])
    with open(in_csv, 'w') as csvfile:
        csvfile.write(','.join(headers) + '\n')
        # range (not xrange) so the code also runs on Python 3
        csvfile.write('\n'.join(','.join(str(kwargs[k][i]) for k in headers)
                                for i in range(nrows)) + '\n')

    proc_par = ["Rscript", "--vanilla", script_path,
                in_csv, out_csv, '"%s"' % (form)]

    # NOTE(review): when p_fit is unset but seed > 1 the seed takes the
    # position p_fit would have on the command line -- confirm against the
    # argument parsing of normalize_oneD.R.
    if p_fit:
        proc_par.append(str(p_fit))
    if seed > 1:
        proc_par.append(str(seed))

    # universal_newlines=True yields text on both Python 2 and 3, so the
    # join below never mixes bytes and str
    proc = Popen(proc_par, stderr=PIPE, universal_newlines=True)
    err = proc.stderr.readlines()
    proc.stderr.close()
    status = proc.wait()
    print('\n'.join(err))
    if status:
        # a failed run must not be masked by an old biases.csv
        raise Exception('ERROR: normalize_oneD.R failed (exit status %d)'
                        % status)

    return genfromtxt(out_csv, delimiter=',', dtype=float)
Esempio n. 4
0
def _bowtie2_mapping(bowtie2_index_path,
                     fastq_path1,
                     out_map_path,
                     fastq_path2=None,
                     bowtie2_binary='bowtie2',
                     bowtie2_params=None,
                     **kwargs):
    """
    
    """
    bowtie2_index_path = os.path.abspath(
        os.path.expanduser(bowtie2_index_path))
    fastq_path1 = os.path.abspath(os.path.expanduser(fastq_path1))
    paired_map = False
    if fastq_path2:
        fastq_path2 = os.path.abspath(os.path.expanduser(fastq_path2))
        paired_map = True
    out_map_path = os.path.abspath(os.path.expanduser(out_map_path))
    nthreads = kwargs.get('nthreads', 8)

    # check that we have the GEM binary:
    bowtie2_binary = which(bowtie2_binary)
    if not bowtie2_binary:
        raise Exception('\n\nERROR: %s binary not found' % bowtie2_binary)

    # mapping
    print('TO %s' % bowtie2_binary, fastq_path1, fastq_path2)
    bowtie2_cmd = [
        bowtie2_binary, '-x', bowtie2_index_path, '-p',
        str(nthreads), '--reorder', '-k', '1', '-S', out_map_path
    ]

    if paired_map:
        bowtie2_cmd += ['-1', fastq_path1, '-2', fastq_path2]
    else:
        bowtie2_cmd += ['-U', fastq_path1]

    if bowtie2_params:
        if isinstance(bowtie2_params, dict):
            for bow_param in bowtie2_params:
                bowtie2_cmd.append('-' + bow_param)
                if bowtie2_params[bow_param]:
                    bowtie2_cmd.append(bowtie2_params[bow_param])
        elif isinstance(bowtie2_params, list):
            bowtie2_cmd += bowtie2_params
    elif bowtie2_binary == 'bowtie2':
        bowtie2_cmd.append('--very-sensitive')
    print(' '.join(bowtie2_cmd))
    try:
        # check_call(gem_cmd, stdout=PIPE, stderr=PIPE)
        out, err = Popen(bowtie2_cmd,
                         stdout=PIPE,
                         stderr=PIPE,
                         universal_newlines=True).communicate()
    except CalledProcessError as e:
        print(out)
        print(err)
        raise Exception(e.output)
Esempio n. 5
0
def _bowtie2_mapping(bowtie2_index_path, fastq_path1, out_map_path, fastq_path2 = None,
                     bowtie2_binary='bowtie2', bowtie2_params=None, **kwargs):
    """
    :param None focus: trims the sequence in the input FASTQ file according to a
       (start, end) position, or the name of a restriction enzyme. By default it
       uses the full sequence.
    """
    bowtie2_index_path= os.path.abspath(os.path.expanduser(bowtie2_index_path))
    fastq_path1       = os.path.abspath(os.path.expanduser(fastq_path1))
    paired_map = False
    if fastq_path2:
        fastq_path2       = os.path.abspath(os.path.expanduser(fastq_path2))
        paired_map = True
    out_map_path      = os.path.abspath(os.path.expanduser(out_map_path))
    nthreads          = kwargs.get('nthreads'            , 8)

    # check that we have the GEM binary:
    bowtie2_binary = which(bowtie2_binary)
    if not bowtie2_binary:
        raise Exception('\n\nERROR: bowtie2 binary not found')

    # mapping
    print 'TO BOWTIE2', fastq_path1, fastq_path2
    bowtie2_cmd = [
        bowtie2_binary, '-x', bowtie2_index_path,
        '-p', str(nthreads), '--reorder', '-S',
        out_map_path]

    if paired_map:
        bowtie2_cmd += ['-1',fastq_path1,'-2',fastq_path2]
    else:
        bowtie2_cmd += ['-U', fastq_path1]

    if bowtie2_params:
        for bow_param in bowtie2_params:
            bowtie2_cmd.append('-'+bow_param)
            if bowtie2_params[bow_param]:
                bowtie2_cmd.append(bowtie2_params[bow_param])
    else:
        bowtie2_cmd.append('--very-sensitive')
    print ' '.join(bowtie2_cmd)
    try:
        # check_call(gem_cmd, stdout=PIPE, stderr=PIPE)
        out, err = Popen(bowtie2_cmd, stdout=PIPE, stderr=PIPE).communicate()
    except CalledProcessError as e:
        print out
        print err
        raise Exception(e.output)
Esempio n. 6
0
def merge_bams(bam1, bam2, outbam, cpus = cpu_count(), samtools = 'samtools', verbose = True):
    """
    Merge two bam files with samtools into one, then index the result.

    Runs unchanged on Python 2 and Python 3.

    :param bam1: path to first file
    :param bam2: path to second file
    :param outbam: path of the merged BAM file written by samtools
    :param cpus: number of threads passed to samtools with ``-@`` (the
       default is computed once, when the function is defined)
    :param samtools samtools: name of, or path to, the samtools binary
    :param True verbose: print progress messages

    :raises Exception: if the samtools binary can not be located
    """

    requested = samtools
    samtools = which(requested)
    if not samtools:
        # a missing binary used to surface as "None + str" TypeError
        raise Exception('\n\nERROR: %s binary not found' % requested)
    # NOTE(review): paths are interpolated unquoted into a shell command, so
    # file names with spaces or shell metacharacters will break it.
    if verbose:
        print('  - Mergeing experiments')
    system(samtools + ' merge -@ %d %s %s %s' % (cpus, outbam, bam1, bam2))
    if verbose:
        print('  - Indexing new BAM file')
    # check samtools version number and modify command line: the binary is
    # run without arguments and its stderr scanned for a "Version" line
    # (universal_newlines keeps stderr as text on Python 3)
    usage = Popen(samtools, stderr=PIPE,
                  universal_newlines=True).communicate()[1]
    found = [line.split() for line in usage.split('\n') if 'Version' in line]
    found = [fields[1] for fields in found if len(fields) > 1]
    if found and LooseVersion(found[0]) >= LooseVersion('1.3.1'):
        system(samtools + ' index -@ %d %s' % (cpus, outbam))
    else:
        # old or unidentified samtools: single-threaded indexing
        system(samtools + ' index %s' % (outbam))
Esempio n. 7
0
def oneD(tmp_dir='.', form='tot ~ s(map) + s(cg) + s(res)', **kwargs):
    """
    Normalizes according to oneD normalization that takes into account the GC
    content, mappability and the number of restriction sites per bin.

    Vidal, E., le Dily, F., Quilez, J., Stadhouders, R., Cuartero, Y., Graf, T., Marti-Renom, Marc A., Beato, M., Filion, G. (2017).
    OneD: increasing reproducibility of Hi-C Samples with abnormal karyotypes.
    bioRxiv. http://doi.org/10.1101/148254

    Each of the 'tot', 'map', 'res' and 'cg' lists is written as a single-row
    CSV file in *tmp_dir*; the external ``normalize_oneD.R`` script is then
    run through ``Rscript`` and the biases are read from ``biases.csv``.

    :param . tmp_dir: directory where the intermediate CSV files are written
    :param form: string representing an R Formulae (accepted for
       compatibility, it is not forwarded to the R script by this
       implementation)
    :param kwargs: must provide the keys 'tot', 'map', 'res' and 'cg', the
       values being lists of equal length.
       for example:
           oneD(tot=[1,2,3...],
                map=[1,2,3...],
                res=[1,2,3...],
                cg =[1,2,3...])

    :raises Exception: if the R script can not be located or if Rscript exits
       with a non-zero status
    :raises KeyError: if one of the four expected keys is missing

    :returns: biases to use to normalize the raw matrix of interactions, as
       loaded by ``genfromtxt``
    """
    script_path = which('normalize_oneD.R')
    if not script_path:
        # otherwise None would reach subprocess and fail with a TypeError
        raise Exception('\n\nERROR: normalize_oneD.R script not found')
    proc_par = ["Rscript", "--vanilla", script_path]

    # the order below is the order of the arguments on the command line
    for column in ('tot', 'map', 'res', 'cg'):
        csvfile = path.join(tmp_dir, column + '.csv')
        proc_par.append(csvfile)
        with open(csvfile, "w") as output:
            writer = csv.writer(output, lineterminator='\n')
            writer.writerow(kwargs[column])

    out_csv = path.join(tmp_dir, 'biases.csv')
    proc_par.append(out_csv)

    status = subprocess.call(proc_par)
    if status:
        # do not silently load a stale biases.csv left by a previous run
        raise Exception('ERROR: normalize_oneD.R failed (exit status %d)'
                        % status)

    return genfromtxt(out_csv, delimiter=',', dtype=float)
Esempio n. 8
0
def _gem_mapping(gem_index_path, fastq_path, out_map_path,
                gem_binary='gem-mapper', **kwargs):
    """
    :param None focus: trims the sequence in the input FASTQ file according to a
       (start, end) position, or the name of a restriction enzyme. By default it
       uses the full sequence.
    :param 33 quality: set it to 'ignore' in order to speed-up the mapping
    """
    gem_index_path    = os.path.abspath(os.path.expanduser(gem_index_path))
    fastq_path        = os.path.abspath(os.path.expanduser(fastq_path))
    out_map_path      = os.path.abspath(os.path.expanduser(out_map_path))
    nthreads          = kwargs.get('nthreads'            , 8)
    max_edit_distance = kwargs.get('max_edit_distance'   , 0.04)
    mismatches        = kwargs.get('mismatches'          , 0.04)

    # check that we have the GEM binary:
    gem_binary = which(gem_binary)
    if not gem_binary:
        raise Exception('\n\nERROR: GEM binary not found, install it from:'
                        '\nhttps://sourceforge.net/projects/gemlibrary/files/gem-library/Binary%20pre-release%202/'
                        '\n - Download the GEM-binaries-Linux-x86_64-core_i3 if'
                        'have a recent computer, the '
                        'GEM-binaries-Linux-x86_64-core_2 otherwise\n - '
                        'Uncompress with "tar xjvf GEM-binaries-xxx.tbz2"\n - '
                        'Copy the binary gem-mapper to /usr/local/bin/ for '
                        'example (somewhere in your PATH).\n\nNOTE: GEM does '
                        'not provide any binary for MAC-OS.')

    # mapping
    print 'TO GEM', fastq_path
    kgt = kwargs.get
    gem_cmd = [
        gem_binary, '-I', gem_index_path,
        '-q'                        , kgt('q', 'offset-33'                    ),
        '-m'                        , kgt('m', str(max_edit_distance       )  ),
        '-s'                        , kgt('s', kgt('strata-after-best', '0')  ),
        '--allow-incomplete-strata' , kgt('allow-incomplete-strata', '0.00'   ),
        '--granularity'             , kgt('granularity', '10000'              ),
        '--max-decoded-matches'     , kgt('max-decoded-matches', kgt('d', '1')),
        '--min-decoded-strata'      , kgt('min-decoded-strata', kgt('D', '0') ),
        '--min-insert-size'         , kgt('min-insert-size', '0'              ),
        '--max-insert-size'         , kgt('max-insert-size', '0'              ),
        '--min-matched-bases'       , kgt('min-matched-bases', '0.8'          ),
        '--gem-quality-threshold'   , kgt('gem-quality-threshold', '26'       ),
        '--max-big-indel-length'    , kgt('max-big-indel-length', '15'        ),
        '--mismatch-alphabet'       , kgt('mismatch-alphabet', 'ACGT'         ),
        '-E'                        , kgt('E', '0.30'                         ),
        '--max-extendable-matches'  , kgt('max-extendable-matches', '20'      ),
        '--max-extensions-per-match', kgt('max-extensions-per-match', '1'     ),
        '-e'                        , kgt('e', str(mismatches)                ),
        '-T'                        , str(nthreads),
        '-i'                        , fastq_path,
        '-o', out_map_path.replace('.map', '')]

    if 'paired-end-alignment' in kwargs or 'p' in kwargs:
        gem_cmd.append('--paired-end-alignment')
    if 'map-both-ends' in kwargs or 'b' in kwargs:
        gem_cmd.append('--map-both-ends')
    if 'fast-mapping' in kwargs:
        gem_cmd.append('--fast-mapping')
    if 'unique-mapping' in kwargs:
        gem_cmd.append('--unique-mapping')
    if 'unique-pairing' in kwargs:
        gem_cmd.append('--unique-pairing')

    # check kwargs
    for kw in kwargs:
        if not kw in ['nthreads', 'max_edit_distance',
                      'mismatches', 'max_reads_per_chunk',
                      'out_files', 'temp_dir', 'skip', 'q', 'm', 's',
                      'strata-after-best', 'allow-incomplete-strata',
                      'granularity', 'max-decoded-matches',
                      'min-decoded-strata', 'min-insert-size',
                      'max-insert-size', 'min-matched-bases',
                      'gem-quality-threshold', 'max-big-indel-length',
                      'mismatch-alphabet', 'E', 'max-extendable-matches',
                      'max-extensions-per-match', 'e', 'paired-end-alignment',
                      'p', 'map-both-ends', 'fast-mapping', 'unique-mapping',
                      'unique-pairing', 'suffix']:
            warn('WARNING: %s not in usual keywords, misspelled?' % kw)

    print ' '.join(gem_cmd)
    try:
        # check_call(gem_cmd, stdout=PIPE, stderr=PIPE)
        out, err = Popen(gem_cmd, stdout=PIPE, stderr=PIPE).communicate()
    except CalledProcessError as e:
        print out
        print err
        raise Exception(e.output)
Esempio n. 9
0
def fast_fragment_mapping(mapper_index_path,
                          fastq_path1,
                          fastq_path2,
                          r_enz,
                          genome_seq,
                          out_map,
                          clean=True,
                          get_nread=False,
                          mapper_binary=None,
                          mapper_params=None,
                          samtools='samtools',
                          **kwargs):
    """
    Maps FASTQ reads to an indexed reference genome with the knowledge of
    the restriction enzyme used (fragment-based mapping).

    Both FASTQ files are preprocessed into *temp_dir*, mapped with GEM v3,
    name-sorted with samtools and parsed into *out_map*. With more than one
    thread the SAM file is split into chunks that are parsed in parallel and
    then merged pairwise.

    :param mapper_index_path: path to index file created from a reference genome
       using gem-index tool, bowtie2-build or hisat2-build
    :param fastq_path1: PATH to FASTQ file of read 1, either compressed or not.
    :param fastq_path2: PATH to FASTQ file of read 2, either compressed or not.
    :param r_enz: name of the restriction enzyme used in the experiment e.g.
       HindIII.
    :param genome_seq: a dictionary with chromosome names as keys and the
       genomic sequences as values
    :param out_map: path to outfile tab separated format containing mapped
       read information; its directory must already exist.
    :param True clean: remove intermediate files created in temp_dir
    :param False get_nread: returns a list with one (path, number of reads)
       tuple instead of the path alone
    :param None mapper_binary: path to the binary mapper ('gem-mapper' is
       searched when not given)
    :param None mapper_params: dictionary of extra parameters for the mapper,
       merged into the keyword arguments
    :param samtools samtools: path to samtools binary.
    :param 8 nthreads: number of threads to use for mapping (number of CPUs)
    :param temp_dir: important to change. Intermediate FASTQ files will be
       written there (defaults to the system temporary directory).
    :param '' suffix: text appended to the name of the intermediate map file
    :param 100000 frag_chunk: passed to ``map_re_sites``

    :raises Exception: if samtools or a GEM v3 binary can not be found, or
       if the directory of *out_map* does not exist

    :returns: outfile with the intersected read pairs
    """

    suffix = kwargs.get('suffix', '')
    suffix = ('_' * (suffix != '')) + suffix
    nthreads = kwargs.get('nthreads', 8)
    samtools = which(samtools)
    if not samtools:
        # samtools is needed unconditionally to sort the mapper output:
        # fail now rather than with a TypeError after the whole mapping
        raise Exception('\n\nERROR: samtools binary not found\n')
    # check out folder
    if not os.path.isdir(os.path.dirname(os.path.abspath(out_map))):
        raise Exception(
            '\n\nERROR: Path to store the output does not exist.\n')
    temp_dir = os.path.abspath(
        os.path.expanduser(kwargs.get('temp_dir', gettempdir())))
    gem_version = None
    # check that we have the GEM binary:
    gem_binary = mapper_binary or 'gem-mapper'
    gem_binary = which(gem_binary)
    if not gem_binary:
        raise Exception('\n\nERROR: GEM v3 binary not found, install it from:'
                        '\nhttps://github.com/smarco/gem3-mapper'
                        'Copy the binary gem-mapper to /usr/local/bin/ for '
                        'example (somewhere in your PATH).\n')
    try:
        out, _ = Popen([gem_binary, '--version'],
                       stdout=PIPE,
                       stderr=STDOUT,
                       universal_newlines=True).communicate()
        # the major version is read from the second character of the output
        gem_version = int(out[1])
    except (ValueError, IndexError):
        # unparsable (or too short) version output: not a GEM v3 binary
        gem_version = 2
        print('Falling to gem v2')
    if gem_version < 3:
        raise Exception('\n\nERROR: GEM v3 binary not found, install it from:'
                        '\nhttps://github.com/smarco/gem3-mapper'
                        'Copy the binary gem-mapper to /usr/local/bin/ for '
                        'example (somewhere in your PATH).\n')
    if mapper_params:
        kwargs.update(mapper_params)
    # create directories
    for rep in [temp_dir]:
        mkdir(rep)
    # check space
    # NOTE(review): the message reports Gb; presumably div=3 makes
    # get_free_space_mb return gigabytes -- confirm against the helper.
    fspace = int(get_free_space_mb(temp_dir, div=3))
    if fspace < 200:
        warn('WARNING: only %d Gb left on tmp_dir: %s\n' % (fspace, temp_dir))

    # iterative mapping
    base_name1 = os.path.split(fastq_path1)[-1].replace('.gz', '')
    base_name1 = '.'.join(base_name1.split('.')[:-1])

    curr_map1, _ = transform_fastq(fastq_path1,
                                   mkstemp(prefix=base_name1 + '_',
                                           dir=temp_dir)[1],
                                   fastq=is_fastq(fastq_path1),
                                   nthreads=nthreads,
                                   light_storage=True)

    base_name2 = os.path.split(fastq_path2)[-1].replace('.gz', '')
    base_name2 = '.'.join(base_name2.split('.')[:-1])

    # the format of read 2 is detected on its own file (it used to be
    # detected on read 1)
    curr_map2, count_fastq = transform_fastq(fastq_path2,
                                             mkstemp(prefix=base_name2 + '_',
                                                     dir=temp_dir)[1],
                                             fastq=is_fastq(fastq_path2),
                                             nthreads=nthreads,
                                             light_storage=True)

    out_map_path = curr_map1 + '_frag%s.map' % (suffix)

    print('Mapping fragments of remaining reads...')
    _gem_mapping(mapper_index_path,
                 curr_map1,
                 out_map_path,
                 fastq_path2=curr_map2,
                 r_enz=r_enz,
                 gem_binary=gem_binary,
                 gem_version=gem_version,
                 **kwargs)
    # clean
    if clean:
        print('   x removing GEM 3 input %s' % (curr_map1))
        os.system('rm -f %s' % (curr_map1))
        print('   x removing GEM 3 input %s' % (curr_map2))
        os.system('rm -f %s' % (curr_map2))

    #sort sam file (by read name, in place)
    os.system(samtools + ' sort -n -O SAM -@ %d -T %s -o %s %s' %
              (nthreads, out_map_path, out_map_path, out_map_path))
    genome_lengths = dict((crm, len(genome_seq[crm])) for crm in genome_seq)
    frag_chunk = kwargs.get('frag_chunk', 100000)
    frags = map_re_sites(r_enz, genome_seq, frag_chunk=frag_chunk)
    if samtools and nthreads > 1:
        print('Splitting sam file')
        # headers: every chunk file starts with a copy of the SAM header
        for i in range(nthreads):
            os.system(samtools + ' view -H -O SAM %s > "%s_%d"' %
                      (out_map_path, out_map_path, (i + 1)))
        chunk_lines = int(
            (count_fastq * 2.3) /
            nthreads)  # estimate lines in sam with reads and frags
        # a new chunk is only started when the read name changes, so all
        # the lines of one read stay in the same chunk
        os.system(samtools + ''' view -O SAM %s | awk -v n=%d -v FS="\\t" '
              BEGIN { part=0; line=n }       
              { if( line>=n && $1!=last_read ) {part++; line=1; print $0 >> "%s_"part } 
                else { print $0 >> "%s_"part; line++; } 
                last_read = $1;
              }'
        ''' % (out_map_path, chunk_lines, out_map_path, out_map_path))
        if clean:
            print('   x removing tmp mapped %s' % out_map_path)
            os.system('rm -f %s' % (out_map_path))
        print('Parsing results...')
        kwargs['nthreads'] = 1
        procs = []
        pool = mu.Pool(nthreads)
        for i in range(nthreads):
            frags_shared = copy.deepcopy(frags)
            procs.append(
                pool.apply_async(parse_gem_3c,
                                 args=('%s_%d' % (out_map_path, (i + 1)),
                                       '%s_parsed_%d' % (out_map_path,
                                                         (i + 1)),
                                       copy.deepcopy(genome_lengths),
                                       frags_shared, False, True),
                                 kwds=kwargs))
        pool.close()
        pool.join()
        results = [proc.get() for proc in procs if proc.get()]
        if clean:
            for i in range(nthreads):
                print('   x removing tmp mapped %s_%d' % (out_map_path,
                                                          (i + 1)))
                os.system('rm -f %s_%d' % (out_map_path, (i + 1)))

        #Final sort and merge
        nround = 0
        while len(results) > 1:
            nround += 1
            num_procs = min(nthreads, int(len(results) / 2))
            pool = mu.Pool(num_procs)
            procs = [
                pool.apply_async(merge_sort,
                                 (results.pop(0), results.pop(0),
                                  out_map_path + '_%d' % nround, i, True))
                for i in range(num_procs)
            ]
            pool.close()
            pool.join()
            # keep the chunk left unpaired by an odd count: rebinding
            # `results` to the merged files alone would drop its reads
            results = results + [proc.get() for proc in procs if proc.get()]

        with open(out_map, 'w') as map_out, \
                open(results[0], 'r') as tmp_reads_fh:
            for crm in genome_seq:
                map_out.write('# CRM %s\t%d\n' % (crm, len(genome_seq[crm])))
            for read_line in tmp_reads_fh:
                read = read_line.split('\t')
                # columns 1 and 8 of the temporary format are not written
                map_out.write('\t'.join([read[0]] + read[2:8] + read[9:]))
        if clean:
            print('   x removing tmp mapped %s' % results[0])
            os.system('rm -f %s' % (results[0]))

    else:
        print('Parsing result...')
        parse_gem_3c(out_map_path,
                     out_map,
                     genome_lengths,
                     frags,
                     verbose=False,
                     tmp_format=False,
                     **kwargs)

        # clean
        if clean:
            print('   x removing tmp mapped %s' % out_map_path)
            os.system('rm -f %s' % (out_map_path))

    if get_nread:
        return [(out_map, count_fastq)]
    return out_map
Esempio n. 10
0
def check_options(opts):
    """
    Validate and normalise, in place, the options of a mapping job.

    Resolves the GEM binary, checks the restriction enzyme name and the input
    paths, caps the number of CPUs, creates the working directory, sets up
    logging to ``process.log`` inside it, records the versions of the
    dependencies and parses the extra GEM parameters. Runs unchanged on
    Python 2 and Python 3.

    :param opts: object holding the options as attributes (``cfg``,
       ``gem_binary``, ``renz``, ``workdir``, ``skip``, ``cpus``, ``index``,
       ``fastq``, ``tmp``, ``read``, ``windows``, ``gem_param``); it is
       modified in place

    :raises Exception: if the GEM binary can not be located
    :raises KeyError: if the restriction enzyme is unknown
    :raises IOError: if the index or the FASTQ file does not exist
    :raises NotImplementedError: if an extra GEM parameter is not supported
    """
    if opts.cfg:
        get_options_from_cfg(opts.cfg, opts)

    opts.gem_binary = which(opts.gem_binary)
    if not opts.gem_binary:
        raise Exception('\n\nERROR: GEM binary not found, install it from:'
                        '\nhttps://sourceforge.net/projects/gemlibrary/files/gem-library/Binary%20pre-release%202/'
                        '\n - Download the GEM-binaries-Linux-x86_64-core_i3 if'
                        'have a recent computer, the '
                        'GEM-binaries-Linux-x86_64-core_2 otherwise\n - '
                        'Uncompress with "tar xjvf GEM-binaries-xxx.tbz2"\n - '
                        'Copy the binary gem-mapper to /usr/local/bin/ for '
                        'example (somewhere in your PATH).\n\nNOTE: GEM does '
                        'not provide any binary for MAC-OS.')

    # check RE name
    try:
        _ = RESTRICTION_ENZYMES[opts.renz]
    except KeyError:
        print ('\n\nERROR: restriction enzyme not found. Use one of:\n\n'
               + ' '.join(sorted(RESTRICTION_ENZYMES)) + '\n\n')
        raise KeyError()
    except AttributeError:
        # no `renz` attribute on opts: nothing to check
        pass

    # check skip: without a working directory there is no previous output
    # to reuse (the message used to state the opposite of the condition)
    if not path.exists(opts.workdir) and opts.skip:
        print ('WARNING: can not use output files, not found, '
               'not skipping...')
        opts.skip = False

    # number of cpus
    if opts.cpus == 0:
        opts.cpus = cpu_count()
    else:
        opts.cpus = min(opts.cpus, cpu_count())

    # check paths
    if not path.exists(opts.index):
        raise IOError('ERROR: index file not found at ' + opts.index)

    if not path.exists(opts.fastq):
        raise IOError('ERROR: FASTQ file not found at ' + opts.fastq)

    # create tmp directory
    if not opts.tmp:
        opts.tmp = opts.workdir + '_tmp_r%d' % opts.read

    try:
        opts.windows = [[int(i) for i in win.split(':')]
                        for win in opts.windows]
    except TypeError:
        # opts.windows is not iterable (e.g. None): leave it untouched
        pass

    mkdir(opts.workdir)
    # write log
    log_format = '[MAPPING {} READ{}]   %(message)s'.format(opts.fastq, opts.read)

    # reset logging
    logging.getLogger().handlers = []

    # 'a' is the append mode; 'aw' is rejected by Python 3's open()
    try:
        print('Writing log to ' + path.join(opts.workdir, 'process.log'))
        logging.basicConfig(level=logging.INFO,
                            format=log_format,
                            filename=path.join(opts.workdir, 'process.log'),
                            filemode='a')
    except IOError:
        logging.basicConfig(level=logging.DEBUG,
                            format=log_format,
                            filename=path.join(opts.workdir, 'process.log2'),
                            filemode='a')

    # to display log on stdout also
    logging.getLogger().addHandler(logging.StreamHandler())

    # write version log, only when its content changed; the whole text is
    # compared (a list of lines never equals the string that gets written)
    vlog_path = path.join(opts.workdir, 'TADbit_and_dependencies_versions.log')
    dependencies = get_dependencies_version()
    previous = None
    if path.exists(vlog_path):
        with open(vlog_path) as vlog:
            previous = vlog.read()
    if previous != dependencies:
        logging.info('Writing versions of TADbit and dependencies')
        with open(vlog_path, 'w') as vlog:
            vlog.write(dependencies)

    # check GEM mapper extra options ("name:value", split on the first
    # colon only so that values may themselves contain colons)
    if opts.gem_param:
        opts.gem_param = dict(o.split(':', 1) for o in opts.gem_param)
    else:
        opts.gem_param = {}
    gem_valid_option = {"granularity", "q", "quality-format",
                        "gem-quality-threshold", "mismatch-alphabet",
                        "m", "e", "min-matched-bases",
                        "max-big-indel-length", "s", "strata-after-best",
                        "fast-mapping", "unique-mapping", "d", "D",
                        "allow-incomplete-strata", "max-decoded-matches",
                        "min-decoded-strata", "p", "paired-end-alignment",
                        "b", "map-both-ends", "min-insert-size",
                        "max-insert-size", "E", "max-extendable-matches",
                        "max-extensions-per-match", "unique-pairing"}
    for k in opts.gem_param:
        if k not in gem_valid_option:
            raise NotImplementedError(('ERROR: option "%s" not a valid GEM option'
                                       'or not suported by this tool.') % k)
    # check if job already run using md5 digestion of parameters
    if already_run(opts):
        exit('WARNING: exact same job already computed, see JOBs table above')
Esempio n. 11
0
def check_options(opts):
    """
    Validate and normalize, in place, the options of a mapping job.

    The function resolves the mapper binary, detects the GEM version, checks
    the requirements of fast fragment mapping, the restriction enzyme names
    and the input paths, sets up logging inside the working directory, records
    the versions of TADbit and its dependencies, parses the extra mapper
    parameters and prepares the trace database.

    The process exits if the restriction enzyme search was requested
    (``opts.renz == ['CHECK']``) or if the exact same job was already computed.

    :param opts: namespace holding the options of the job; it is modified in
       place (mapper_binary, gem_version, renz, skip, cpus, windows,
       mapper_param and tmpdb may be rewritten)

    :raises Exception: if the mapper binary is not found, or if the
       requirements of fast fragment mapping are not met
    :raises KeyError: if a restriction enzyme name is unknown
    :raises IOError: if the GEM index or the FASTQ file is missing, or if the
       FASTQ file has a wrong format
    :raises NotImplementedError: if an extra option is not in the list of
       options accepted for GEM versions below 3
    """

    if not opts.mapper_binary:
        if opts.mapper == 'gem':
            opts.mapper_binary = 'gem-mapper'
        else:
            opts.mapper_binary = opts.mapper
    opts.mapper_binary = which(opts.mapper_binary)
    if not opts.mapper_binary:
        raise Exception(
            '\n\nERROR: Mapper binary not found, for GEM install it from:'
            '\nhttps://sourceforge.net/projects/gemlibrary/files/gem-library/Binary%20pre-release%202/'
            '\n - Download the GEM-binaries-Linux-x86_64-core_i3 if'
            'have a recent computer, the '
            'GEM-binaries-Linux-x86_64-core_2 otherwise\n - '
            'Uncompress with "tar xjvf GEM-binaries-xxx.tbz2"\n - '
            'Copy the binary gem-mapper to /usr/local/bin/ for '
            'example (somewhere in your PATH).\n\nNOTE: GEM does '
            'not provide any binary for MAC-OS.')

    opts.gem_version = 0
    if opts.mapper == 'gem':
        opts.gem_version = None
        try:
            out, _ = Popen([opts.mapper_binary, '--version'],
                           stdout=PIPE,
                           stderr=STDOUT,
                           universal_newlines=True).communicate()
            # the major version is read from the second character of the
            # output; anything that is not a digit there (or an output that
            # is too short) is treated as GEM v2
            opts.gem_version = int(out[1])
        except (ValueError, IndexError):
            opts.gem_version = 2
            print('Falling to gem v2')

    if opts.fast_fragment:
        if opts.gem_version < 3:
            raise Exception('ERROR: Fast fragment mapping needs GEM v3')
        if not opts.fastq2 or not path.exists(opts.fastq2):
            raise Exception(
                'ERROR: Fast fragment mapping needs both fastq files. '
                'Please specify --fastq2')
        if opts.read != 0:
            raise Exception(
                'ERROR: Fast fragment mapping needs to be specified with --read 0'
            )
        if not opts.genome:
            raise Exception('ERROR: Fast fragment mapping needs '
                            'the genome parameter.')
    # check RE name
    if opts.renz == ['CHECK']:
        print('\nSearching for most probable restriction enzyme in file: %s' %
              (opts.fastq))
        try:
            pat, enz, pv = identify_re(opts.fastq, nreads=100000)
            print(' -> Most probable digested site: %s (pv: %f)' % (pat, pv))
            print(' -> Enzymes matching: %s' % (', '.join(enz)))
        except ValueError:
            print(' -> Nothing found...')
        exit()
    for n, renz in enumerate(opts.renz):
        if renz == 'NONE':
            opts.renz[n] = None
            continue
        try:
            _ = RESTRICTION_ENZYMES[renz]
        except KeyError:
            print('\n\nERROR: restriction enzyme %s not found.' % (renz) +
                  'Use one of:\n\n' + ' '.join(sorted(RESTRICTION_ENZYMES)) +
                  '\n\n')
            raise KeyError()
        except AttributeError:
            pass

    # check skip
    if not path.exists(opts.workdir) and opts.skip:
        print('WARNING: can use output files, found, not skipping...')
        opts.skip = False

    # number of cpus (0 means "all of them"), never more than available
    if opts.cpus == 0:
        opts.cpus = cpu_count()
    else:
        opts.cpus = min(opts.cpus, cpu_count())

    # check paths
    if opts.mapper == 'gem' and not path.exists(opts.index):
        raise IOError('ERROR: index file not found at ' + opts.index)

    if not path.exists(opts.fastq):
        raise IOError('ERROR: FASTQ file not found at ' + opts.fastq)

    if not is_fastq(opts.fastq):
        raise IOError(
            ('ERROR: FASTQ file %s wrong format, check') % (opts.fastq))

    # windows given as "beg:end" strings are converted to lists of integers;
    # a non-iterable value (e.g. None) is left untouched
    try:
        opts.windows = [[int(i) for i in win.split(':')]
                        for win in opts.windows]
    except TypeError:
        pass

    mkdir(opts.workdir)
    # write log
    log_format = '[MAPPING {} READ{}]   %(message)s'.format(
        opts.fastq, opts.read)

    # reset logging
    logging.getLogger().handlers = []

    try:
        print('Writing log to ' + path.join(opts.workdir, 'process.log'))
        logging.basicConfig(level=logging.INFO,
                            format=log_format,
                            filename=path.join(opts.workdir, 'process.log'),
                            filemode='a+')
    except IOError:
        # fall back to a second log file if the first one cannot be opened
        logging.basicConfig(level=logging.DEBUG,
                            format=log_format,
                            filename=path.join(opts.workdir, 'process.log2'),
                            filemode='a+')

    # to display log on stdout also
    logging.getLogger().addHandler(logging.StreamHandler())

    # write version log, only if its content changed. The stored file is read
    # as a single string so that it can actually be compared with the string
    # that is written below.
    vlog_path = path.join(opts.workdir, 'TADbit_and_dependencies_versions.log')
    dependencies = get_dependencies_version()
    previous = None
    if path.exists(vlog_path):
        with open(vlog_path) as vlog:
            previous = vlog.read()
    if previous != dependencies:
        logging.info('Writing versions of TADbit and dependencies')
        with open(vlog_path, 'w') as vlog:
            vlog.write(dependencies)

    # check mapper extra options
    if opts.mapper_param:
        if len(opts.mapper_param) == 1 and '-' in opts.mapper_param[0]:
            # Single string surrounded by quotes
            opts.mapper_param = opts.mapper_param[0].split()
        else:
            opts.mapper_param = dict([o.split(':') for o in opts.mapper_param])
    else:
        opts.mapper_param = {}
    if opts.mapper == 'gem' and opts.gem_version < 3:
        gem_valid_option = set([
            "granularity", "q", "quality-format", "gem-quality-threshold",
            "mismatch-alphabet", "m", "e", "min-matched-bases",
            "max-big-indel-length", "s", "strata-after-best", "fast-mapping",
            "unique-mapping", "d", "D", "allow-incomplete-strata",
            "max-decoded-matches", "min-decoded-strata", "p",
            "paired-end-alignment", "b", "map-both-ends", "min-insert-size",
            "max-insert-size", "E", "max-extendable-matches",
            "max-extensions-per-match", "unique-pairing"
        ])
        for k in opts.mapper_param:
            if k not in gem_valid_option:
                raise NotImplementedError(
                    ('ERROR: option "%s" not a valid GEM option'
                     'or not suported by this tool.') % k)

    # create empty DB if don't exists
    dbpath = path.join(opts.workdir, 'trace.db')
    open(dbpath, 'a').close()

    # for lustre file system....
    if 'tmpdb' in opts and opts.tmpdb:
        dbdir = opts.tmpdb
        # tmp file with a random 10-letter suffix
        dbfile = 'trace_%s' % (''.join(
            [ascii_letters[int(random() * 52)] for _ in range(10)]))
        opts.tmpdb = path.join(dbdir, dbfile)
        try:
            copyfile(path.join(opts.workdir, 'trace.db'), opts.tmpdb)
        except IOError:
            pass

    # check if job already run using md5 digestion of parameters
    if already_run(opts):
        if 'tmpdb' in opts and opts.tmpdb:
            remove(path.join(dbdir, dbfile))
        exit('WARNING: exact same job already computed, see JOBs table above')
Esempio n. 12
0
def run(opts):
    """
    Merge two Hi-C experiments into a single indexed BAM file and, unless
    ``opts.skip_comparison`` is set, compare them first (correlation between
    equidistant loci, correlation between eigenvectors and reproducibility
    score). The paths and scores are finally stored with ``save_to_db``.

    :param opts: namespace holding the options of the job. Each experiment is
       given either directly as a BAM file (``opts.bam1`` / ``opts.bam2``,
       with ``opts.biases1`` / ``opts.biases2``) or through a previous job
       (``opts.workdir1`` / ``opts.jobid1``, ``opts.workdir2`` /
       ``opts.jobid2``) whose parameters are loaded from its database.

    :raises Exception: if the resolutions loaded for both experiments differ
    """
    check_options(opts)
    samtools = which(opts.samtools)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)

    reso1 = reso2 = None
    if opts.bam1:
        mreads1 = path.realpath(opts.bam1)
        biases1 = opts.biases1
    else:
        biases1, mreads1, reso1 = load_parameters_fromdb(
            opts.workdir1, opts.jobid1, opts, opts.tmpdb1)
        mreads1 = path.join(opts.workdir1, mreads1)
        try:
            biases1 = path.join(opts.workdir1, biases1)
        except AttributeError:
            # no biases stored for this job
            biases1 = None

    if opts.bam2:
        mreads2 = path.realpath(opts.bam2)
        biases2 = opts.biases2
    else:
        biases2, mreads2, reso2 = load_parameters_fromdb(
            opts.workdir2, opts.jobid2, opts, opts.tmpdb2)
        mreads2 = path.join(opts.workdir2, mreads2)
        try:
            biases2 = path.join(opts.workdir2, biases2)
        except AttributeError:
            # no biases stored for this job
            biases2 = None

    filter_exclude = opts.filter

    if reso1 != reso2:
        raise Exception('ERROR: differing resolutions between experiments to '
                        'be merged')

    merge_dir = path.join(opts.workdir, '00_merge')
    mkdir(merge_dir)

    if not opts.skip_comparison:
        printime('  - loading first sample %s' % (mreads1))
        hic_data1 = load_hic_data_from_bam(mreads1, opts.reso, biases=biases1,
                                           tmpdir=merge_dir,
                                           ncpus=opts.cpus,
                                           filter_exclude=filter_exclude)

        printime('  - loading second sample %s' % (mreads2))
        hic_data2 = load_hic_data_from_bam(mreads2, opts.reso, biases=biases2,
                                           tmpdir=merge_dir,
                                           ncpus=opts.cpus,
                                           filter_exclude=filter_exclude)
        decay_corr_dat = path.join(merge_dir, 'decay_corr_dat_%s_%s.txt' % (opts.reso, param_hash))
        decay_corr_fig = path.join(merge_dir, 'decay_corr_dat_%s_%s.png' % (opts.reso, param_hash))
        eigen_corr_dat = path.join(merge_dir, 'eigen_corr_dat_%s_%s.txt' % (opts.reso, param_hash))
        eigen_corr_fig = path.join(merge_dir, 'eigen_corr_dat_%s_%s.png' % (opts.reso, param_hash))

        printime('  - comparing experiments')
        printime('    => correlation between equidistant loci')
        corr, _, scc, std, bads = correlate_matrices(
            hic_data1, hic_data2, normalized=opts.norm,
            remove_bad_columns=True, savefig=decay_corr_fig,
            savedata=decay_corr_dat, get_bads=True)
        # single-argument print call: same output under Python 2 and 3
        print('         - correlation score (SCC): %.4f (+- %.7f)' % (scc, std))
        printime('    => correlation between eigenvectors')
        eig_corr = eig_correlate_matrices(hic_data1, hic_data2, normalized=opts.norm,
                                          remove_bad_columns=True, nvect=6,
                                          savefig=eigen_corr_fig,
                                          savedata=eigen_corr_dat)

        printime('    => reproducibility score')
        reprod = get_reproducibility(hic_data1, hic_data2, num_evec=20, normalized=opts.norm,
                                     verbose=False, remove_bad_columns=True)
        print('         - reproducibility score: %.4f' % (reprod))
        ncols = len(hic_data1)
    else:
        ncols = 0
        decay_corr_dat = 'None'
        decay_corr_fig = 'None'
        eigen_corr_dat = 'None'
        eigen_corr_fig = 'None'

        # every score passed to save_to_db needs a value, even when the
        # comparison is skipped
        corr = eig_corr = scc = std = reprod = 0
        bads = {}

    # merge inputs
    mkdir(path.join(opts.workdir, '03_filtered_reads'))
    outbam = path.join(opts.workdir, '03_filtered_reads',
                       'intersection_%s.bam' % (param_hash))

    printime('  - Mergeing experiments')
    system(samtools  + ' merge -@ %d %s %s %s' % (opts.cpus, outbam, mreads1, mreads2))
    printime('  - Indexing new BAM file')
    # check samtools version number and modify command line: the usage message
    # printed on stderr when samtools is called without arguments holds a
    # "Version" line (universal_newlines makes it text on any Python version)
    version = LooseVersion([l.split()[1]
                            for l in Popen(samtools, stderr=PIPE,
                                           universal_newlines=True).communicate()[1].split('\n')
                            if 'Version' in l][0])
    if version >= LooseVersion('1.3.1'):
        system(samtools  + ' index -@ %d %s' % (opts.cpus, outbam))
    else:
        system(samtools  + ' index %s' % (outbam))

    finish_time = time.localtime()
    save_to_db (opts, mreads1, mreads2, decay_corr_dat, decay_corr_fig,
                len(bads.keys()), ncols, scc, std, reprod,
                eigen_corr_dat, eigen_corr_fig, outbam, corr, eig_corr,
                biases1, biases2, launch_time, finish_time)
    printime('\nDone.')
Esempio n. 13
0
def gem_mapping(gem_index_path, fastq_path, out_map_path,
                gem_binary='gem-mapper', **kwargs):
    """
    Build the command line of the GEM mapper and run it.

    :param gem_index_path: path to the GEM index (passed with ``-I``)
    :param fastq_path: path to the input reads (passed with ``-i``)
    :param out_map_path: path to the output file; the '.map' extension is
       removed before being passed with ``-o``
    :param gem-mapper gem_binary: name of, or path to, the GEM binary
    :param 8 nthreads: number of threads (passed with ``-T``)
    :param 0.04 max_edit_distance: default value of the ``-m`` option
    :param 0.04 mismatches: default value of the ``-e`` option
    :param kwargs: any other GEM option, given with its long or short name
       (e.g. ``granularity``, ``d``, ``D``, ``E``...). Options without value
       (``paired-end-alignment``/``p``, ``map-both-ends``/``b``,
       ``fast-mapping``, ``unique-mapping``, ``unique-pairing``) are switched
       on by their mere presence. Unknown keywords only trigger a warning.

    :raises Exception: if the GEM binary is not found, or if the mapper exits
       with a non-zero status
    """
    gem_index_path    = os.path.abspath(os.path.expanduser(gem_index_path))
    fastq_path        = os.path.abspath(os.path.expanduser(fastq_path))
    out_map_path      = os.path.abspath(os.path.expanduser(out_map_path))
    nthreads          = kwargs.get('nthreads'            , 8)
    max_edit_distance = kwargs.get('max_edit_distance'   , 0.04)
    mismatches        = kwargs.get('mismatches'          , 0.04)

    # check that we have the GEM binary:
    gem_binary = which(gem_binary)
    if not gem_binary:
        raise Exception('\n\nERROR: GEM binary not found, install it from:'
                        '\nhttps://sourceforge.net/projects/gemlibrary/files/gem-library/Binary%20pre-release%202/'
                        '\n - Download the GEM-binaries-Linux-x86_64-core_i3 if'
                        'have a recent computer, the '
                        'GEM-binaries-Linux-x86_64-core_2 otherwise\n - '
                        'Uncompress with "tar xjvf GEM-binaries-xxx.tbz2"\n - '
                        'Copy the binary gem-mapper to /usr/local/bin/ for '
                        'example (somewhere in your PATH).\n\nNOTE: GEM does '
                        'not provide any binary for MAC-OS.')

    # mapping
    # single-argument print calls: same output under Python 2 and 3
    print('TO GEM %s' % (fastq_path))
    kgt = kwargs.get
    gem_cmd = [
        gem_binary, '-I', gem_index_path,
        '-q'                        , kgt('q', 'offset-33'                     ),
        '-m'                        , kgt('m', str(max_edit_distance          )),
        '-s'                        , kgt('s', kgt('strata-after-best', '0'   )),
        '--allow-incomplete-strata' , kgt('allow-incomplete-strata', '0.00'    ),
        '--granularity'             , kgt('granularity', '10000'               ),
        '--max-decoded-matches'     , kgt('max-decoded-matches', kgt('d', '1' )),
        '--min-decoded-strata'      , kgt('min-decoded-strata', kgt('D', '0'  )),
        '--min-insert-size'         , kgt('min-insert-size', '0'               ),
        '--max-insert-size'         , kgt('max-insert-size', '0'               ),
        '--min-matched-bases'       , kgt('min-matched-bases', '0.8'           ),
        '--gem-quality-threshold'   , kgt('gem-quality-threshold', '26'        ),
        '--max-big-indel-length'    , kgt('max-big-indel-length', '15'         ),
        '--mismatch-alphabet'       , kgt('mismatch-alphabet', 'ACGT'          ),
        '-E'                        , kgt('E', '0.30'                          ),
        '--max-extendable-matches'  , kgt('max-extendable-matches', '20'       ),
        '--max-extensions-per-match', kgt('max-extensions-per-match', '1'      ),
        '-e'                        , kgt('e', str(mismatches                 )),
        '-T'                        , str(nthreads),
        '-i'                        , fastq_path,
        '-o', out_map_path.replace('.map', '')]

    if 'paired-end-alignment' in kwargs or 'p' in kwargs:
        gem_cmd.append('--paired-end-alignment')
    if 'map-both-ends' in kwargs or 'b' in kwargs:
        gem_cmd.append('--map-both-ends')
    if 'fast-mapping' in kwargs:
        gem_cmd.append('--fast-mapping')
    if 'unique-mapping' in kwargs:
        gem_cmd.append('--unique-mapping')
    if 'unique-pairing' in kwargs:
        gem_cmd.append('--unique-pairing')

    # check kwargs: every keyword read above ('b', 'd' and 'D' included) is
    # known, anything else is probably a typo
    known_keywords = ('nthreads', 'max_edit_distance',
                      'mismatches', 'max_reads_per_chunk',
                      'out_files', 'temp_dir', 'skip', 'q', 'm', 's',
                      'strata-after-best', 'allow-incomplete-strata',
                      'granularity', 'max-decoded-matches', 'd',
                      'min-decoded-strata', 'D', 'min-insert-size',
                      'max-insert-size', 'min-matched-bases',
                      'gem-quality-threshold', 'max-big-indel-length',
                      'mismatch-alphabet', 'E', 'max-extendable-matches',
                      'max-extensions-per-match', 'e', 'paired-end-alignment',
                      'p', 'map-both-ends', 'b', 'fast-mapping',
                      'unique-mapping', 'unique-pairing', 'suffix')
    for kw in kwargs:
        if kw not in known_keywords:
            warn('WARNING: %s not in usual keywords, misspelled?' % kw)

    print(' '.join(gem_cmd))
    # Popen never raises CalledProcessError: a failure of the mapper can only
    # be detected through its exit status
    proc = Popen(gem_cmd, stdout=PIPE, stderr=PIPE, universal_newlines=True)
    out, err = proc.communicate()
    if proc.returncode:
        print(out)
        print(err)
        raise Exception('ERROR: GEM mapper failed (exit status %s):\n%s' % (
            proc.returncode, err))
Esempio n. 14
0
def full_mapping(mapper_index_path,
                 fastq_path,
                 out_map_dir,
                 mapper='gem',
                 r_enz=None,
                 frag_map=True,
                 min_seq_len=15,
                 windows=None,
                 add_site=True,
                 clean=False,
                 get_nread=False,
                 mapper_binary=None,
                 mapper_params=None,
                 **kwargs):
    """
    Maps FASTQ reads to an indexed reference genome. Mapping can be done either
    without knowledge of the restriction enzyme used, or for experiments
    performed without one, like Micro-C (iterative mapping), or using the
    ligation sites created from the digested ends (fragment-based mapping).

    :param mapper_index_path: path to index file created from a reference genome
       using gem-index tool or bowtie2-build
    :param fastq_path: PATH to FASTQ file, either compressed or not.
    :param out_map_dir: path to a directory where to store mapped reads in MAP
       format .
    :param gem mapper: mapper to use, one of 'gem', 'bowtie2' or 'hisat2'
    :param None r_enz: name of the restriction enzyme used in the experiment e.g.
       HindIII. This is optional if frag_map option is False
    :param True frag_map: two step mapper, first full length is mapped, then
       remaining, unmapped reads, are divided into restriction-enzyme fragments
       and each is mapped.
    :param True add_site: when splitting the sequence by ligated sites found,
       removes the ligation site, and put back the original RE site.
    :param 15 min_seq_len: minimum size of a fragment to map
    :param None windows: tuple of ranges for beginning and end of the
       mapping. This parameter allows to do classical iterative mapping, e.g.
         windows=((1,25),(1,30),(1,35),(1,40),(1,45),(1,50))
       A unique window can also be passed, for trimming, like this:
         windows=((1,101),)
    :param False clean: remove intermediate files created in temp_dir, and
       temp_dir itself at the end
    :param False get_nread: returns a list of tuples where each element
       contains a path and the number of reads processed
    :param None mapper_binary: path to the binary mapper (by default
       'gem-mapper' for GEM, the name of the mapper otherwise)
    :param None mapper_params: extra parameters for the mapper; when given as
       a dictionary they are also merged into kwargs
    :param 8 nthreads: (kwargs) number of threads to use for mapping
    :param temp_dir: (kwargs) important to change. Intermediate FASTQ files
       will be written there. Default is the system temporary directory.
    :param False skip: (kwargs) do not run the mapper nor the filtering
    :param suffix: (kwargs) string appended, after an underscore, to the name
       of the generated files
    :param kwargs: any other keyword (e.g. max_edit_distance, mismatches) is
       forwarded untouched to the mapper wrapper

    :returns: a list of paths to generated outfiles. To be passed to
       :func:`pytadbit.parsers.map_parser.parse_map`

    :raises Exception: if the GEM binary is not found, if the mapper is
       unknown, or if fragment-based mapping is requested without r_enz
    """

    def _remove(fpath):
        # same effect as `rm -f`, without going through a shell (safe with
        # spaces or special characters in the path)
        try:
            os.remove(fpath)
        except OSError:
            pass

    skip = kwargs.get('skip', False)
    suffix = kwargs.get('suffix', '')
    suffix = ('_' * (suffix != '')) + suffix
    nthreads = kwargs.get('nthreads', 8)
    outfiles = []
    temp_dir = os.path.abspath(
        os.path.expanduser(kwargs.get('temp_dir', gettempdir())))
    if mapper == 'gem':
        gem_version = None
        # check that we have the GEM binary:
        gem_binary = mapper_binary or 'gem-mapper'
        gem_binary = which(gem_binary)
        if not gem_binary:
            raise Exception(
                '\n\nERROR: GEM binary not found, install it from:'
                '\nhttps://sourceforge.net/projects/gemlibrary/files/gem-library/Binary%20pre-release%202/'
                '\n - Download the GEM-binaries-Linux-x86_64-core_i3 if'
                'have a recent computer, the '
                'GEM-binaries-Linux-x86_64-core_2 otherwise\n - '
                'Uncompress with "tar xjvf GEM-binaries-xxx.tbz2"\n - '
                'Copy the binary gem-mapper to /usr/local/bin/ for '
                'example (somewhere in your PATH).\n\nNOTE: GEM does '
                'not provide any binary for MAC-OS.')
        try:
            out, err = Popen([gem_binary, '--version'],
                             stdout=PIPE,
                             stderr=STDOUT,
                             universal_newlines=True).communicate()
            # the major version is read from the second character of the
            # output; anything that is not a digit there (or an output that
            # is too short) is treated as GEM v2
            gem_version = int(out[1])
        except (ValueError, IndexError):
            gem_version = 2
            print('Falling to gem v2')
    if mapper_params and isinstance(mapper_params, dict):
        kwargs.update(mapper_params)
    # create directories
    for rep in [temp_dir, out_map_dir]:
        mkdir(rep)
    # check space
    fspace = int(get_free_space_mb(temp_dir, div=3))
    if fspace < 200:
        warn('WARNING: only %d Gb left on tmp_dir: %s\n' % (fspace, temp_dir))

    # iterative mapping
    base_name = os.path.split(fastq_path)[-1].replace('.gz', '')
    base_name = '.'.join(base_name.split('.')[:-1])
    input_reads = fastq_path
    if windows is None:
        light_storage = True
        windows = (None, )
    elif isinstance(windows[0], int):
        # if windows starts at zero we do not need to store all the sequence
        # otherwise we need it because sequence can be trimmed two times
        # in fragment based mapping
        light_storage = not windows[0]
        windows = [tuple(windows)]
    else:
        # ensure that each element is a tuple, not a list
        windows = [tuple(win) for win in windows]
        # in this case we will need to keep the information about original
        # sequence at any point, light storage is thus not possible.
        light_storage = False
    for win in windows:
        # Prepare the FASTQ file and iterate over them
        curr_map, counter = transform_fastq(input_reads,
                                            mkstemp(prefix=base_name + '_',
                                                    dir=temp_dir)[1],
                                            fastq=is_fastq(input_reads),
                                            min_seq_len=min_seq_len,
                                            trim=win,
                                            skip=skip,
                                            nthreads=nthreads,
                                            light_storage=light_storage)
        # clean
        if input_reads != fastq_path and clean:
            print('   x removing original input %s' % input_reads)
            _remove(input_reads)
        # First mapping, full length
        if not win:
            beg, end = 1, 'end'
        else:
            beg, end = win
        # tag shared by the name of every file generated for this window
        win_tag = '%s-%s%s' % (beg, end, suffix)
        out_map_path = curr_map + '_full_%s.map' % (win_tag)
        filt_path = curr_map + '_filt_%s.map' % (win_tag)
        full_out = os.path.join(out_map_dir,
                                base_name + '_full_%s.map' % (win_tag))
        if win:
            print('Mapping reads in window %s-%s%s...' % (beg, end, suffix))
        else:
            print('Mapping full reads...', curr_map)

        if not skip:
            if mapper == 'gem':
                _gem_mapping(mapper_index_path,
                             curr_map,
                             out_map_path,
                             gem_binary=gem_binary,
                             gem_version=gem_version,
                             gem_params=mapper_params,
                             **kwargs)
                # parse map file to extract not uniquely mapped reads
                print('Parsing result...')
                if gem_version >= 3:
                    _sam_filter(out_map_path, curr_map, filt_path, full_out)
                else:
                    _gem_filter(out_map_path, filt_path, full_out)
            elif mapper == 'bowtie2' or mapper == 'hisat2':
                _bowtie2_mapping(mapper_index_path,
                                 curr_map,
                                 out_map_path,
                                 bowtie2_binary=(mapper_binary
                                                 if mapper_binary else mapper),
                                 bowtie2_params=mapper_params,
                                 **kwargs)
                # parse map file to extract not uniquely mapped reads
                print('Parsing result...')
                _sam_filter(out_map_path, curr_map, filt_path, full_out)
            else:
                raise Exception('ERROR: unknown mapper.')
            # clean
            if clean:
                print('   x removing %s input %s' % (mapper.upper(), curr_map))
                _remove(curr_map)
                print('   x removing map %s' % out_map_path)
                _remove(out_map_path)
            # for next round, we will use remaining unmapped reads
            input_reads = filt_path
        outfiles.append((full_out, counter))

    # map again splitting unmapped reads into RE fragments
    # (no need to trim this time)
    if frag_map:
        if not r_enz:
            raise Exception('ERROR: need enzyme name to fragment.')
        # from here on frag_map holds the path to the fragmented reads
        frag_map, counter = transform_fastq(input_reads,
                                            mkstemp(prefix=base_name + '_',
                                                    dir=temp_dir)[1],
                                            min_seq_len=min_seq_len,
                                            trim=win,
                                            fastq=False,
                                            r_enz=r_enz,
                                            add_site=add_site,
                                            skip=skip,
                                            nthreads=nthreads,
                                            light_storage=light_storage)
        # clean
        if clean:
            print('   x removing pre-%s input %s' %
                  (mapper.upper(), input_reads))
            _remove(input_reads)
        # the window used here is the last one of the iterative mapping
        if not win:
            beg, end = 1, 'end'
        else:
            beg, end = win
        win_tag = '%s-%s%s' % (beg, end, suffix)
        out_map_path = frag_map + '_frag_%s.map' % (win_tag)
        fail_path = curr_map + '_fail%s.map' % (suffix)
        frag_out = os.path.join(out_map_dir,
                                base_name + '_frag_%s.map' % (win_tag))
        if not skip:
            if mapper == 'gem':
                print('Mapping fragments of remaining reads...')
                _gem_mapping(mapper_index_path,
                             frag_map,
                             out_map_path,
                             gem_binary=gem_binary,
                             gem_version=gem_version,
                             **kwargs)
                print('Parsing result...')
                # check if output is sam format for gem3
                if gem_version >= 3:
                    _sam_filter(out_map_path, frag_map, fail_path, frag_out)
                else:
                    _gem_filter(out_map_path, fail_path, frag_out)
            elif mapper == 'bowtie2' or mapper == 'hisat2':
                print('Mapping fragments of remaining reads...')
                _bowtie2_mapping(mapper_index_path,
                                 frag_map,
                                 out_map_path,
                                 bowtie2_binary=(mapper_binary
                                                 if mapper_binary else mapper),
                                 bowtie2_params=mapper_params,
                                 **kwargs)
                print('Parsing result...')
                _sam_filter(out_map_path, frag_map, fail_path, frag_out)
            else:
                raise Exception('ERROR: unknown mapper.')
        # clean
        if clean:
            print('   x removing %s input %s' % (mapper.upper(), frag_map))
            _remove(frag_map)
            print('   x removing failed to map ' + fail_path)
            _remove(fail_path)
            print('   x removing tmp mapped %s' % out_map_path)
            _remove(out_map_path)
        outfiles.append((frag_out, counter))
    if clean:
        # NOTE(review): the whole temp_dir is removed, and it defaults to the
        # system temporary directory -- pass a dedicated temp_dir when using
        # clean=True
        from shutil import rmtree
        rmtree(temp_dir, ignore_errors=True)
    if get_nread:
        return outfiles
    return [out for out, _ in outfiles]
Esempio n. 15
0
def bed2D_to_BAMhic(infile,
                    valid,
                    ncpus,
                    outbam,
                    frmt,
                    masked=None,
                    samtools='samtools'):
    """
    Convert a TADbit 2D BED contacts file into a sorted and indexed BAM file.

    Function adapted from a script by Enrique Vidal to convert 2D beds into
    compressed BAM format.

    Gets the *_both_filled_map.tsv contacts from TADbit (and the corresponding
    filter files) and outputs a modified indexed BAM with the following fields:

       - read ID
       - filtering flag (see codes in header)
       - chromosome ID of the first pair of the contact
       - genomic position of the first pair of the contact
       - MAPQ set to 0
       - pseudo CIGAR with sequence length and info about current copy (P: first copy, S: second copy)
       - chromosome ID of the second pair of the contact
       - genomic position of the second pair of the contact
       - mapped length of the second pair of the contact
       - sequence is missing (*)
       - quality is missing (*)
       - TC tag indicating single (1) or multi contact (3 6 ... number being the number of times a given sequenced fragment is involved in a pairwise contact)
       - S1 and S2 tags are the strand orientation of the left and right read-end

    Each pair of contacts produces two lines in the output BAM

    :param infile: path to the contacts file. Its leading lines starting with
       '#' are read as chromosome descriptions (third field: name, fourth
       field: length) and become the @SQ lines of the header. A file with no
       contact line produces a BAM that only contains the header
    :param valid: if true every read is written with flag 0 and the filter
       files are not opened; otherwise the flag of a read is the sum of the
       codes of the filters whose current entry is its read ID
    :param ncpus: number of threads given to samtools (option -@)
    :param outbam: path of the output file, without the '.bam' extension
    :param frmt: 'mid' or 'long' to select the corresponding SAM line
       converter; any other value selects the short one
    :param None masked: passed as is to get_filters (only used when *valid*
       is false)
    :param 'samtools' samtools: name of, or path to, the samtools executable

    :raises Exception: if the samtools executable cannot be found
    """
    samtools = which(samtools)
    if not samtools:
        raise Exception('ERROR: samtools is needed to save a compressed '
                        'version of the results. Check '
                        'http://samtools.sourceforge.net/ \n')

    # define filter codes
    filter_keys = OrderedDict()
    for k in MASKED:
        filter_keys[MASKED[k]['name'].replace(' ', '-')] = 2**(k - 1)

    # check samtools version number and modify command line
    # (text mode, otherwise the usage message is bytes under Python 3)
    version = LooseVersion([
        l.split()[1]
        for l in Popen(samtools, stderr=PIPE,
                       universal_newlines=True).communicate()[1].split('\n')
        if 'Version' in l
    ][0])
    if version >= LooseVersion('1.3'):
        # recent versions take the full output file name through -o
        sort_out = '-o %s.bam' % (outbam)
    else:
        # older versions take an output prefix and add '.bam' themselves
        sort_out = outbam
    # the same resolved binary is used for both stages of the pipeline, as the
    # syntax of the sort command depends on the version that was just checked
    command = '%s view -Shb -@ %d - | %s sort -@ %d - %s' % (
        samtools, ncpus, samtools, ncpus, sort_out)

    if frmt == 'mid':
        map2sam = _map2sam_mid
    elif frmt == 'long':
        map2sam = _map2sam_long
    else:
        map2sam = _map2sam_short

    # header
    header = ["\t".join(("@HD", "VN:1.5", "SO:queryname"))]

    filter_line = filter_handler = None
    fhandler = open(infile)
    try:
        # chromosome lengths: keep track of where the first contact starts
        pos_fh = fhandler.tell()
        line = fhandler.readline()
        while line.startswith('#'):
            (_, _, cr, ln) = line.replace("\t", " ").strip().split(" ")
            header.append("\t".join(("@SQ", "SN:" + cr, "LN:" + ln)))
            pos_fh = fhandler.tell()
            line = fhandler.readline()

        # filter codes
        for i in filter_keys:
            header.append("\t".join(
                ("@CO", "filter:" + i, "flag:" + str(filter_keys[i]))))

        # tags
        header.extend("\t".join(fields) for fields in (
            ("@CO", "TC:i",
             "Number of time a sequenced fragment is involved in a pairwise "
             "contact"),
            ("@CO", "Each read is duplicated: once starting with the "
                    "left read-end, once with the right read-end"),
            ("@CO", " the order of RE sites and strands changes consequently "
                    "depending on which read-end comes first ("
                    "when right end is first: E3 E4 E1 E2)"),
            ("@CO", " CIGAR code contains the length of the "
                    "1st read-end mapped and 'P' or 'S' "
                    "if the copy is the first or the second"),
            ("@CO", "E1:i", "Position of the left RE site of 1st read-end"),
            ("@CO", "E2:i", "Position of the right RE site of 1st read-end"),
            ("@CO", "E3:i", "Position of the left RE site of 2nd read-end"),
            ("@CO", "E4:i", "Position of the right RE site of 2nd read-end"),
            ("@CO", "S1:i",
             "Strand of the 1st read-end (1: positive, 0: negative)"),
            ("@CO", "S2:i",
             "Strand of the 2nd read-end  (1: positive, 0: negative)"),
        ))

        # open and init filter files
        if not valid:
            filter_line, filter_handler = get_filters(infile, masked)

        # go back to the first line that is not part of the header
        fhandler.seek(pos_fh)

        proc = Popen(command, shell=True, stdin=PIPE,
                     universal_newlines=True)
        proc.stdin.write("\n".join(header) + "\n")

        for line in fhandler:
            flag = 0
            if not valid:
                # check if read matches any filter
                # NOTE(review): a filter only moves forward when its current
                # read ID is found, which presumably requires the filter files
                # to list read IDs in the same order as infile -- confirm
                rid = line.split("\t")[0]
                for i in filter_line:
                    if filter_line[i] == rid:
                        flag += filter_keys[i]
                        try:
                            filter_line[i] = next(filter_handler[i]).strip()
                        except StopIteration:
                            # this filter file is exhausted
                            pass
            # get output in sam format
            proc.stdin.write(map2sam(line, flag))
        proc.stdin.close()
        proc.wait()
    finally:
        # close file handlers, also when something failed on the way
        fhandler.close()
        if filter_handler:
            for i in filter_handler:
                filter_handler[i].close()

    # Index BAM
    _ = Popen(samtools + ' index %s.bam' % (outbam), shell=True).communicate()
Esempio n. 16
0
def generate_BAM(infile, valid, ncpus, outbam, frmt):
    """
    Convert a 2D BED contacts file into a sorted and indexed BAM file.

    The header of the BAM holds one @SQ line per chromosome described in the
    leading '#' lines of the input, and @CO lines listing the filter codes
    and the tags used. Each contact line is converted to SAM by one of the
    _map2sam_* functions and piped to 'samtools view | samtools sort'; the
    resulting file is then indexed.

    :param infile: path to the contacts file. Its leading lines starting with
       '#' are read as chromosome descriptions (third field: name, fourth
       field: length). A file with no contact line produces a BAM that only
       contains the header
    :param valid: if true every read is written with flag 0 and the filter
       files are not opened; otherwise the flag of a read is the sum of the
       codes of the filters whose current entry is its read ID
    :param ncpus: number of threads given to samtools (option -@)
    :param outbam: path of the output file, without the '.bam' extension
    :param frmt: 'mid' or 'long' to select the corresponding SAM line
       converter; any other value selects the short one

    :raises Exception: if the samtools executable cannot be found
    """
    samtools = which('samtools')
    if not samtools:
        raise Exception('ERROR: samtools is needed to save a compressed '
                        'version of the results. Check '
                        'http://samtools.sourceforge.net/ \n')

    # define filter codes: successive powers of two, in this order
    filter_keys = OrderedDict(
        (name, 2 ** power) for power, name in enumerate((
            'self-circle',
            'dangling-end',
            'error',
            'extra-dangling-end',
            'too-close-from-RES',
            'too-short',
            'too-large',
            'over-represented',
            'duplicated',
            'random-breaks',
            'trans-chromosomic')))

    # check samtools version number and modify command line
    # (text mode, otherwise the usage message is bytes under Python 3)
    version = LooseVersion([l.split()[1]
                            for l in Popen(samtools, stderr=PIPE,
                                           universal_newlines=True
                                           ).communicate()[1].split('\n')
                            if 'Version' in l][0])
    if version >= LooseVersion('1.3'):
        # recent versions take the full output file name through -o
        sort_out = '-o %s.bam' % (outbam)
    else:
        # older versions take an output prefix and add '.bam' themselves
        sort_out = outbam
    command = '%s view -Shb -@ %d - | %s sort -@ %d - %s' % (
        samtools, ncpus, samtools, ncpus, sort_out)

    if frmt == 'mid':
        map2sam = _map2sam_mid
    elif frmt == 'long':
        map2sam = _map2sam_long
    else:
        map2sam = _map2sam_short

    # header
    header = ["\t".join(("@HD", "VN:1.5", "SO:queryname"))]

    filter_line = filter_handler = None
    fhandler = open(infile)
    try:
        # chromosome lengths: keep track of where the first contact starts
        pos_fh = fhandler.tell()
        line = fhandler.readline()
        while line.startswith('#'):
            (_, _, cr, ln) = line.replace("\t", " ").strip().split(" ")
            header.append("\t".join(("@SQ", "SN:" + cr, "LN:" + ln)))
            pos_fh = fhandler.tell()
            line = fhandler.readline()

        # filter codes
        for i in filter_keys:
            header.append("\t".join(
                ("@CO", "filter:" + i, "flag:" + str(filter_keys[i]))))

        # tags
        header.extend("\t".join(fields) for fields in (
            ("@CO", "TC:i",
             "Number of time a sequenced fragment is involved in a pairwise "
             "contact"),
            ("@CO", "Each read is duplicated: once starting with the "
                    "left read-end, once with the right read-end"),
            ("@CO", " the order of RE sites and strands changes consequently "
                    "depending on which read-end comes first ("
                    "when right end is first: E3 E4 E1 E2)"),
            ("@CO", " CIGAR code contains the length of the "
                    "1st read-end mapped and 'P' or 'S' "
                    "if the copy is the first or the second"),
            ("@CO", "E1:i", "Position of the left RE site of 1st read-end"),
            ("@CO", "E2:i", "Position of the right RE site of 1st read-end"),
            ("@CO", "E3:i", "Position of the left RE site of 2nd read-end"),
            ("@CO", "E4:i", "Position of the right RE site of 2nd read-end"),
            ("@CO", "S1:i",
             "Strand of the 1st read-end (1: positive, 0: negative)"),
            ("@CO", "S2:i",
             "Strand of the 2nd read-end  (1: positive, 0: negative)"),
        ))

        # open and init filter files
        if not valid:
            filter_line, filter_handler = get_filters(infile)

        # go back to the first line that is not part of the header
        fhandler.seek(pos_fh)

        proc = Popen(command, shell=True, stdin=PIPE,
                     universal_newlines=True)
        proc.stdin.write("\n".join(header) + "\n")

        for line in fhandler:
            flag = 0
            if not valid:
                # check if read matches any filter
                # NOTE(review): a filter only moves forward when its current
                # read ID is found, which presumably requires the filter files
                # to list read IDs in the same order as infile -- confirm
                rid = line.split("\t")[0]
                for i in filter_line:
                    if filter_line[i] == rid:
                        flag += filter_keys[i]
                        try:
                            filter_line[i] = next(filter_handler[i]).strip()
                        except StopIteration:
                            # this filter file is exhausted
                            pass
            # get output in sam format
            proc.stdin.write(map2sam(line, flag))
        proc.stdin.close()
        proc.wait()
    finally:
        # close file handlers, also when something failed on the way
        fhandler.close()
        if filter_handler:
            for i in filter_handler:
                filter_handler[i].close()

    # Index BAM
    _ = Popen(samtools + ' index %s.bam' % (outbam), shell=True).communicate()
Esempio n. 17
0
def generate_BAM(infile, valid, ncpus, outbam, frmt):
    """
    Convert a 2D BED contacts file into a sorted and indexed BAM file.

    The header of the BAM holds one @SQ line per chromosome described in the
    leading '#' lines of the input, and @CO lines listing the filter codes
    and the tags used. Each contact line is converted to SAM by one of the
    _map2sam_* functions and piped to 'samtools view | samtools sort'; the
    resulting file is then indexed.

    :param infile: path to the contacts file. Its leading lines starting with
       '#' are read as chromosome descriptions (third field: name, fourth
       field: length). A file with no contact line produces a BAM that only
       contains the header
    :param valid: if true every read is written with flag 0 and the filter
       files are not opened; otherwise the flag of a read is the sum of the
       codes of the filters whose current entry is its read ID
    :param ncpus: number of threads given to samtools (option -@)
    :param outbam: path of the output file, without the '.bam' extension
    :param frmt: 'mid' or 'long' to select the corresponding SAM line
       converter; any other value selects the short one

    :raises Exception: if the samtools executable cannot be found
    """
    samtools = which('samtools')
    if not samtools:
        raise Exception('ERROR: samtools is needed to save a compressed '
                        'version of the results. Check '
                        'http://samtools.sourceforge.net/ \n')

    # define filter codes: successive powers of two, in this order
    filter_names = (
        'self-circle',
        'dangling-end',
        'error',
        'extra-dangling-end',
        'too-close-from-RES',
        'too-short',
        'too-large',
        'over-represented',
        'duplicated',
        'random-breaks',
        'trans-chromosomic',
    )
    filter_keys = OrderedDict()
    for power, name in enumerate(filter_names):
        filter_keys[name] = 2**power

    # check samtools version number and modify command line
    # (text mode, otherwise the usage message is bytes under Python 3)
    version = LooseVersion([
        l.split()[1]
        for l in Popen(samtools, stderr=PIPE,
                       universal_newlines=True).communicate()[1].split('\n')
        if 'Version' in l
    ][0])
    if version >= LooseVersion('1.3'):
        # recent versions take the full output file name through -o
        sort_out = '-o %s.bam' % (outbam)
    else:
        # older versions take an output prefix and add '.bam' themselves
        sort_out = outbam
    command = '%s view -Shb -@ %d - | %s sort -@ %d - %s' % (
        samtools, ncpus, samtools, ncpus, sort_out)

    if frmt == 'mid':
        map2sam = _map2sam_mid
    elif frmt == 'long':
        map2sam = _map2sam_long
    else:
        map2sam = _map2sam_short

    # header
    header = ["\t".join(("@HD", "VN:1.5", "SO:queryname"))]

    filter_line = filter_handler = None
    fhandler = open(infile)
    try:
        # chromosome lengths: keep track of where the first contact starts
        pos_fh = fhandler.tell()
        line = fhandler.readline()
        while line.startswith('#'):
            (_, _, cr, ln) = line.replace("\t", " ").strip().split(" ")
            header.append("\t".join(("@SQ", "SN:" + cr, "LN:" + ln)))
            pos_fh = fhandler.tell()
            line = fhandler.readline()

        # filter codes
        for i in filter_keys:
            header.append("\t".join(
                ("@CO", "filter:" + i, "flag:" + str(filter_keys[i]))))

        # tags
        header.extend("\t".join(fields) for fields in (
            ("@CO", "TC:i",
             "Number of time a sequenced fragment is involved in a pairwise "
             "contact"),
            ("@CO", "Each read is duplicated: once starting with the "
                    "left read-end, once with the right read-end"),
            ("@CO", " the order of RE sites and strands changes consequently "
                    "depending on which read-end comes first ("
                    "when right end is first: E3 E4 E1 E2)"),
            ("@CO", " CIGAR code contains the length of the "
                    "1st read-end mapped and 'P' or 'S' "
                    "if the copy is the first or the second"),
            ("@CO", "E1:i", "Position of the left RE site of 1st read-end"),
            ("@CO", "E2:i", "Position of the right RE site of 1st read-end"),
            ("@CO", "E3:i", "Position of the left RE site of 2nd read-end"),
            ("@CO", "E4:i", "Position of the right RE site of 2nd read-end"),
            ("@CO", "S1:i",
             "Strand of the 1st read-end (1: positive, 0: negative)"),
            ("@CO", "S2:i",
             "Strand of the 2nd read-end  (1: positive, 0: negative)"),
        ))

        # open and init filter files
        if not valid:
            filter_line, filter_handler = get_filters(infile)

        # go back to the first line that is not part of the header
        fhandler.seek(pos_fh)

        proc = Popen(command, shell=True, stdin=PIPE,
                     universal_newlines=True)
        proc.stdin.write("\n".join(header) + "\n")

        for line in fhandler:
            flag = 0
            if not valid:
                # check if read matches any filter
                # NOTE(review): a filter only moves forward when its current
                # read ID is found, which presumably requires the filter files
                # to list read IDs in the same order as infile -- confirm
                rid = line.split("\t")[0]
                for i in filter_line:
                    if filter_line[i] == rid:
                        flag += filter_keys[i]
                        try:
                            filter_line[i] = next(filter_handler[i]).strip()
                        except StopIteration:
                            # this filter file is exhausted
                            pass
            # get output in sam format
            proc.stdin.write(map2sam(line, flag))
        proc.stdin.close()
        proc.wait()
    finally:
        # close file handlers, also when something failed on the way
        fhandler.close()
        if filter_handler:
            for i in filter_handler:
                filter_handler[i].close()

    # Index BAM
    _ = Popen(samtools + ' index %s.bam' % (outbam), shell=True).communicate()
Esempio n. 18
0
def create_BAMhic(hic,
                  ncpus,
                  outbam,
                  chromosomes,
                  reso,
                  masked=None,
                  samtools='samtools'):
    """
    Write a binned interaction matrix as a sorted and indexed BAM file.

    Function adapted from a script by Enrique Vidal to convert 2D beds into
    compressed BAM format.

    For each cell of the upper triangle (diagonal included) of the matrix
    yielded row by row by hic.yield_matrix(), as many pseudo-reads as the
    integer value of the cell are generated. Each pseudo-read is converted
    with _map2sam_mid, using flag 0, and piped to
    'samtools view | samtools sort'; the resulting file is then indexed.

    The header of the BAM holds one @SQ line per entry of *chromosomes* and
    @CO lines listing the filter codes (taken from MASKED) and the tags used.

    :param hic: object providing a 'sections' mapping and a 'yield_matrix'
       method. The keys of 'sections' are indexed as (name, position), and
       sorted by their value to define the order of the rows/columns
       (presumably (chromosome, bin inside the chromosome) -- to be
       confirmed against the class of *hic*)
    :param ncpus: number of threads given to samtools (option -@)
    :param outbam: path of the output file, without the '.bam' extension
    :param chromosomes: mapping of chromosome names to their length, used to
       write the @SQ lines
    :param reso: resolution; a bin is placed at position 'bin * reso + 1'
    :param None masked: currently not used by this function
    :param 'samtools' samtools: name of, or path to, the samtools executable

    :raises Exception: if the samtools executable cannot be found

    :returns: the sum of the values of the cells visited (upper triangle,
       diagonal included)
    """
    samtools = which(samtools)
    if not samtools:
        raise Exception('ERROR: samtools is needed to save a compressed '
                        'version of the results. Check '
                        'http://samtools.sourceforge.net/ \n')

    # define filter codes
    filter_keys = OrderedDict()
    for k in MASKED:
        filter_keys[MASKED[k]['name'].replace(' ', '-')] = 2**(k - 1)

    # header
    header = ["\t".join(("@HD", "VN:1.5", "SO:queryname"))]

    # chromosome lengths
    for chrom in chromosomes:
        header.append("\t".join(
            ("@SQ", "SN:" + chrom, "LN:" + str(chromosomes[chrom]))))

    # filter codes
    for i in filter_keys:
        header.append("\t".join(
            ("@CO", "filter:" + i, "flag:" + str(filter_keys[i]))))

    # tags
    header.extend("\t".join(fields) for fields in (
        ("@CO", "TC:i",
         "Number of time a sequenced fragment is involved in a pairwise "
         "contact"),
        ("@CO", "Each read is duplicated: once starting with the "
                "left read-end, once with the right read-end"),
        ("@CO", " the order of RE sites and strands changes consequently "
                "depending on which read-end comes first ("
                "when right end is first: E3 E4 E1 E2)"),
        ("@CO", " CIGAR code contains the length of the "
                "1st read-end mapped and 'P' or 'S' "
                "if the copy is the first or the second"),
        ("@CO", "E1:i", "Position of the left RE site of 1st read-end"),
        ("@CO", "E2:i", "Position of the right RE site of 1st read-end"),
        ("@CO", "E3:i", "Position of the left RE site of 2nd read-end"),
        ("@CO", "E4:i", "Position of the right RE site of 2nd read-end"),
        ("@CO", "S1:i",
         "Strand of the 1st read-end (1: positive, 0: negative)"),
        ("@CO", "S2:i",
         "Strand of the 2nd read-end  (1: positive, 0: negative)"),
    ))

    # check samtools version number and modify command line
    version = LooseVersion([
        l.split()[1]
        for l in Popen(samtools, stderr=PIPE,
                       universal_newlines=True).communicate()[1].split('\n')
        if 'Version' in l
    ][0])
    if version >= LooseVersion('1.3'):
        # recent versions take the full output file name through -o
        sort_out = '-o %s.bam' % (outbam)
    else:
        # older versions take an output prefix and add '.bam' themselves
        sort_out = outbam

    # the same resolved binary is used for both stages of the pipeline, as the
    # syntax of the sort command depends on the version that was just checked
    proc = Popen(
        '%s view -Shb -@ %d - | %s sort -@ %d - %s' %
        (samtools, ncpus, samtools, ncpus, sort_out),
        shell=True,
        stdin=PIPE,
        universal_newlines=True)
    proc.stdin.write("\n".join(header) + "\n")
    map2sam = _map2sam_mid
    flag = 0  # no filter is applied to the pseudo-reads

    # name and starting position of each row/column of the matrix
    rownam = [(k[0], k[1] * reso + 1)
              for k in sorted(hic.sections, key=lambda x: hic.sections[x])]
    total_counts = 0
    iter_rows = hic.yield_matrix()
    for nrow, row in enumerate(rownam):
        line = next(iter_rows)
        # only the upper triangle is visited: start at the diagonal
        iter_cols = iter(line[nrow:])
        for ncol in range(nrow, len(rownam)):
            col = rownam[ncol]
            val = int(next(iter_cols))
            total_counts += val
            if not val:
                continue
            readid = '%s.%d.%s.%d' % (row[0], nrow, col[0], ncol)
            # one pseudo-read per interaction counted in the cell
            for nval in range(val):
                line_out = '%s.%d\t%s\t%d\t.\t1\t.\t.\t%s\t%d\t.\t1\t.\t.' % (
                    readid, nval, row[0], row[1], col[0], col[1])
                proc.stdin.write(map2sam(line_out, flag))
    proc.stdin.close()
    proc.wait()

    # Index BAM
    _ = Popen(samtools + ' index %s.bam' % (outbam),
              shell=True,
              universal_newlines=True).communicate()

    return total_counts
Esempio n. 19
0
def run(opts):
    """
    Compare two Hi-C experiments and merge their BAM files into a single one.

    Steps:
      1. locate the BAM file and the biases of each experiment, either from
         the options (opts.bam1 / opts.bam2) or from the database of the
         corresponding working directory
      2. unless opts.skip_comparison is set, load both experiments and
         compute the correlation between equidistant loci, the correlation
         between eigenvectors and the reproducibility score
      3. unless opts.skip_merge is set, merge both BAM files with samtools
         and index the result
      4. store the results through save_to_db

    :param opts: parsed command line options

    :raises Exception: if the resolutions retrieved for the two experiments
       differ
    """
    check_options(opts)
    samtools = which(opts.samtools)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)

    # resolutions stay None when the BAM files are given directly
    reso1 = reso2 = None
    if opts.bam1:
        mreads1 = path.realpath(opts.bam1)
        biases1 = opts.biases1
    else:
        biases1, mreads1, reso1 = load_parameters_fromdb(
            opts.workdir1, opts.jobid1, opts, opts.tmpdb1)
        mreads1 = path.join(opts.workdir1, mreads1)
        try:
            biases1 = path.join(opts.workdir1, biases1)
        except (AttributeError, TypeError):
            # biases1 cannot be joined to a path (TypeError under Python 3)
            biases1 = None

    if opts.bam2:
        mreads2 = path.realpath(opts.bam2)
        biases2 = opts.biases2
    else:
        biases2, mreads2, reso2 = load_parameters_fromdb(
            opts.workdir2, opts.jobid2, opts, opts.tmpdb2)
        mreads2 = path.join(opts.workdir2, mreads2)
        try:
            biases2 = path.join(opts.workdir2, biases2)
        except (AttributeError, TypeError):
            # biases2 cannot be joined to a path (TypeError under Python 3);
            # only the biases of the second experiment are reset here
            biases2 = None

    filter_exclude = opts.filter

    if reso1 != reso2:
        raise Exception('ERROR: differing resolutions between experiments to '
                        'be merged')

    merge_dir = path.join(opts.workdir, '00_merge')
    mkdir(merge_dir)

    if not opts.skip_comparison:
        printime('  - loading first sample %s' % (mreads1))
        hic_data1 = load_hic_data_from_bam(mreads1, opts.reso, biases=biases1,
                                           tmpdir=merge_dir,
                                           ncpus=opts.cpus,
                                           filter_exclude=filter_exclude)

        printime('  - loading second sample %s' % (mreads2))
        hic_data2 = load_hic_data_from_bam(mreads2, opts.reso, biases=biases2,
                                           tmpdir=merge_dir,
                                           ncpus=opts.cpus,
                                           filter_exclude=filter_exclude)

        if opts.workdir1 and opts.workdir2:
            masked1 = {'valid-pairs': {'count': 0}}
            masked2 = {'valid-pairs': {'count': 0}}
        else:
            masked1 = {'valid-pairs': {'count': sum(hic_data1.values())}}
            masked2 = {'valid-pairs': {'count': sum(hic_data2.values())}}

        # output files of the comparison
        suffix = '%s_%s' % (opts.reso, param_hash)
        decay_corr_dat = path.join(merge_dir,
                                   'decay_corr_dat_%s.txt' % (suffix))
        decay_corr_fig = path.join(merge_dir,
                                   'decay_corr_dat_%s.png' % (suffix))
        eigen_corr_dat = path.join(merge_dir,
                                   'eigen_corr_dat_%s.txt' % (suffix))
        eigen_corr_fig = path.join(merge_dir,
                                   'eigen_corr_dat_%s.png' % (suffix))

        printime('  - comparing experiments')
        printime('    => correlation between equidistant loci')
        corr, _, scc, std, bads = correlate_matrices(
            hic_data1, hic_data2, normalized=opts.norm,
            remove_bad_columns=True, savefig=decay_corr_fig,
            savedata=decay_corr_dat, get_bads=True)
        print('         - correlation score (SCC): %.4f (+- %.7f)' % (scc, std))
        printime('    => correlation between eigenvectors')
        eig_corr = eig_correlate_matrices(hic_data1, hic_data2,
                                          normalized=opts.norm,
                                          remove_bad_columns=True, nvect=6,
                                          savefig=eigen_corr_fig,
                                          savedata=eigen_corr_dat)

        printime('    => reproducibility score')
        reprod = get_reproducibility(hic_data1, hic_data2, num_evec=20,
                                     normalized=opts.norm,
                                     verbose=False, remove_bad_columns=True)
        print('         - reproducibility score: %.4f' % (reprod))
        ncols = len(hic_data1)
    else:
        # placeholders stored in the database when nothing is compared
        ncols = 0
        decay_corr_dat = 'None'
        decay_corr_fig = 'None'
        eigen_corr_dat = 'None'
        eigen_corr_fig = 'None'
        masked1 = {}
        masked2 = {}

        corr = eig_corr = scc = std = reprod = 0
        bads = {}

    # merge inputs
    mkdir(path.join(opts.workdir, '03_filtered_reads'))

    if not opts.skip_merge:
        outbam = path.join(opts.workdir, '03_filtered_reads',
                           'intersection_%s.bam' % (param_hash))
        printime('  - Mergeing experiments')
        system(samtools + ' merge -@ %d %s %s %s' % (
            opts.cpus, outbam, mreads1, mreads2))
        printime('  - Indexing new BAM file')
        # check samtools version number and modify command line
        version = LooseVersion([l.split()[1]
                                for l in Popen(samtools, stderr=PIPE,
                                               universal_newlines=True
                                               ).communicate()[1].split('\n')
                                if 'Version' in l][0])
        if version >= LooseVersion('1.3.1'):
            # threads are only passed to index from this version on
            system(samtools + ' index -@ %d %s' % (opts.cpus, outbam))
        else:
            system(samtools + ' index %s' % (outbam))
    else:
        outbam = ''

    finish_time = time.localtime()
    save_to_db(opts, mreads1, mreads2, decay_corr_dat, decay_corr_fig,
               len(bads), ncols, scc, std, reprod,
               eigen_corr_dat, eigen_corr_fig, outbam, corr, eig_corr,
               biases1, biases2, masked1, masked2, launch_time, finish_time)
    printime('\nDone.')
Esempio n. 20
0
def bed2D_to_BAMhic(infile, valid, ncpus, outbam, frmt, masked=None, samtools='samtools'):
    """
    Convert a TADbit 2D BED contacts file into a sorted and indexed BAM file.

    Function adapted from a script by Enrique Vidal to convert 2D beds into
    compressed BAM format.

    Gets the *_both_filled_map.tsv contacts from TADbit (and the corresponding
    filter files) and outputs a modified indexed BAM with the following fields:

       - read ID
       - filtering flag (see codes in header)
       - chromosome ID of the first pair of the contact
       - genomic position of the first pair of the contact
       - MAPQ set to 0
       - pseudo CIGAR with sequence length and info about current copy (P: first copy, S: second copy)
       - chromosome ID of the second pair of the contact
       - genomic position of the second pair of the contact
       - mapped length of the second pair of the contact
       - sequence is missing (*)
       - quality is missing (*)
       - TC tag indicating single (1) or multi contact (3 6 ... number being the number of times a given sequenced fragment is involved in a pairwise contact)
       - S1 and S2 tags are the strand orientation of the left and right read-end

    Each pair of contacts produces two lines in the output BAM

    :param infile: path to the contacts file. Its leading lines starting with
       '#' are read as chromosome descriptions (third field: name, fourth
       field: length) and become the @SQ lines of the header. A file with no
       contact line produces a BAM that only contains the header
    :param valid: if true every read is written with flag 0 and the filter
       files are not opened; otherwise the flag of a read is the sum of the
       codes of the filters whose current entry is its read ID
    :param ncpus: number of threads given to samtools (option -@)
    :param outbam: path of the output file, without the '.bam' extension
    :param frmt: 'mid' or 'long' to select the corresponding SAM line
       converter; any other value selects the short one
    :param None masked: passed as is to get_filters (only used when *valid*
       is false)
    :param 'samtools' samtools: name of, or path to, the samtools executable

    :raises Exception: if the samtools executable cannot be found
    """
    samtools = which(samtools)
    if not samtools:
        raise Exception('ERROR: samtools is needed to save a compressed '
                        'version of the results. Check '
                        'http://samtools.sourceforge.net/ \n')

    # define filter codes
    filter_keys = OrderedDict()
    for k in MASKED:
        filter_keys[MASKED[k]['name'].replace(' ', '-')] = 2 ** (k - 1)

    # check samtools version number and modify command line
    # (text mode, otherwise the usage message is bytes under Python 3)
    version = LooseVersion([l.split()[1]
                            for l in Popen(samtools, stderr=PIPE,
                                           universal_newlines=True
                                           ).communicate()[1].split('\n')
                            if 'Version' in l][0])
    if version >= LooseVersion('1.3'):
        # recent versions take the full output file name through -o
        sort_out = '-o %s.bam' % (outbam)
    else:
        # older versions take an output prefix and add '.bam' themselves
        sort_out = outbam
    # the same resolved binary is used for both stages of the pipeline, as the
    # syntax of the sort command depends on the version that was just checked
    command = '%s view -Shb -@ %d - | %s sort -@ %d - %s' % (
        samtools, ncpus, samtools, ncpus, sort_out)

    if frmt == 'mid':
        map2sam = _map2sam_mid
    elif frmt == 'long':
        map2sam = _map2sam_long
    else:
        map2sam = _map2sam_short

    # header
    header = ["\t".join(("@HD", "VN:1.5", "SO:queryname"))]

    filter_line = filter_handler = None
    fhandler = open(infile)
    try:
        # chromosome lengths: keep track of where the first contact starts
        pos_fh = fhandler.tell()
        line = fhandler.readline()
        while line.startswith('#'):
            (_, _, cr, ln) = line.replace("\t", " ").strip().split(" ")
            header.append("\t".join(("@SQ", "SN:" + cr, "LN:" + ln)))
            pos_fh = fhandler.tell()
            line = fhandler.readline()

        # filter codes
        for i in filter_keys:
            header.append("\t".join(
                ("@CO", "filter:" + i, "flag:" + str(filter_keys[i]))))

        # tags
        header.extend("\t".join(fields) for fields in (
            ("@CO", "TC:i",
             "Number of time a sequenced fragment is involved in a pairwise "
             "contact"),
            ("@CO", "Each read is duplicated: once starting with the "
                    "left read-end, once with the right read-end"),
            ("@CO", " the order of RE sites and strands changes consequently "
                    "depending on which read-end comes first ("
                    "when right end is first: E3 E4 E1 E2)"),
            ("@CO", " CIGAR code contains the length of the "
                    "1st read-end mapped and 'P' or 'S' "
                    "if the copy is the first or the second"),
            ("@CO", "E1:i", "Position of the left RE site of 1st read-end"),
            ("@CO", "E2:i", "Position of the right RE site of 1st read-end"),
            ("@CO", "E3:i", "Position of the left RE site of 2nd read-end"),
            ("@CO", "E4:i", "Position of the right RE site of 2nd read-end"),
            ("@CO", "S1:i",
             "Strand of the 1st read-end (1: positive, 0: negative)"),
            ("@CO", "S2:i",
             "Strand of the 2nd read-end  (1: positive, 0: negative)"),
        ))

        # open and init filter files
        if not valid:
            filter_line, filter_handler = get_filters(infile, masked)

        # go back to the first line that is not part of the header
        fhandler.seek(pos_fh)

        proc = Popen(command, shell=True, stdin=PIPE,
                     universal_newlines=True)
        proc.stdin.write("\n".join(header) + "\n")

        for line in fhandler:
            flag = 0
            if not valid:
                # check if read matches any filter
                # NOTE(review): a filter only moves forward when its current
                # read ID is found, which presumably requires the filter files
                # to list read IDs in the same order as infile -- confirm
                rid = line.split("\t")[0]
                for i in filter_line:
                    if filter_line[i] == rid:
                        flag += filter_keys[i]
                        try:
                            filter_line[i] = next(filter_handler[i]).strip()
                        except StopIteration:
                            # this filter file is exhausted
                            pass
            # get output in sam format
            proc.stdin.write(map2sam(line, flag))
        proc.stdin.close()
        proc.wait()
    finally:
        # close file handlers, also when something failed on the way
        fhandler.close()
        if filter_handler:
            for i in filter_handler:
                filter_handler[i].close()

    # Index BAM
    _ = Popen(samtools + ' index %s.bam' % (outbam), shell=True).communicate()