Esempio n. 1
0
def main():
    opts = get_options()
    iterative_mapping()

    ## PARSE FASTA
    genome = parse_fasta(opts.fasta if len(opts.fasta) <= 1 else opts.fasta[0],
                         chr_names=opts.chr_name, verbose=True)
Esempio n. 2
0
def main():
    opts = get_options()
    iterative_mapping()

    ## PARSE FASTA
    genome = parse_fasta(opts.fasta if len(opts.fasta) <= 1 else opts.fasta[0],
                         chr_names=opts.chr_name,
                         verbose=True)
Esempio n. 3
0
def make_matrices(left_reads_fastq, right_reads_fastq, reads_fastq, genome_fasta, genome_index, \
                  output_directory, output_prefix, enzyme, res, chromosomes, threads_number, \
                  clean_tmp, tmp_dir):

    print 'Begin to process reads.'

    left_reads = ''
    right_reads = ''
    if reads_fastq != '': # left and right reads are stored in one file
        range_start_left, range_stop_left, \
        range_start_right, range_stop_right = calc_left_right_ranges(reads_fastq)
        print 'Reads:                     ', reads_fastq
        left_reads = reads_fastq
        right_reads = reads_fastq
    else: # left and right reads are stored separately
        range_start_left, range_stop_left, \
        range_start_right, range_stop_right = calc_range(left_reads_fastq)
        print 'Left reads:                ', left_reads_fastq
        print 'Right reads:               ', right_reads_fastq
        print 'Output prefix:             ', output_prefix
        left_reads = left_reads_fastq
        right_reads = right_reads_fastq

    print 'Reference genome FASTA:    ', genome_fasta
    print 'Reference genome GEM index:', genome_index
    print 'Output directory:          ', output_directory
    print 'Temp directory:            ', tmp_dir
    print 'Enzyme:                    ', enzyme
    print 'Resolution:                ', res, 'bp'
    print 'Number of threads:         ', threads_number
    print 'Start pos for left reads:  ', range_start_left
    print 'Stop pos for left reads:   ', range_stop_left
    print 'Start pos for right reads: ', range_start_right
    print 'Stop pos for right reads:  ', range_stop_right
    stdout.flush()

    # map left reads to reference genome
    out_sam_left_name = splitext(basename(left_reads))[0] + '_left.sam'
    out_sam_left_path = join(output_directory, out_sam_left_name)
    print 'Iterative mapping of left reads (using ' + str(threads_number) + ' threads)...'
    stdout.flush()
    sams_left = iterative_mapping(genome_index, left_reads, out_sam_left_path, \
                                  range_start_left, range_stop_left, nthreads=threads_number,
                                  temp_dir=tmp_dir)
    print 'Done.'
    stdout.flush()

    # map right reads to reference genome
    out_sam_right_name = splitext(basename(right_reads))[0] + '_right.sam'
    out_sam_right_path = join(output_directory, out_sam_right_name)
    print 'Iterative mapping of right reads (using ' + str(threads_number) + ' threads)...'
    stdout.flush()
    sams_right = iterative_mapping(genome_index, right_reads, out_sam_right_path, \
                                   range_start_right, range_stop_right, nthreads=threads_number,
                                   temp_dir=tmp_dir)
    print 'Done.'
    stdout.flush()

    # load reference genome sequence
    print 'Load reference genome sequence...'
    stdout.flush()
    chroms = chromosomes[:]
    genome_seq = parse_fasta(genome_fasta, chr_names=chroms)
    print 'Done.'
    stdout.flush()

    # create files with information about every left and right read 
    # and about their placement with respect to restriction sites
    tsv_left_name = splitext(basename(left_reads))[0] + '_left.tsv'
    tsv_left = join(output_directory, tsv_left_name)
    tsv_right_name = splitext(basename(right_reads))[0] + '_right.tsv'
    tsv_right = join(output_directory, tsv_right_name)
    print 'Get information about restriction sites and reads placement...'
    stdout.flush()
    parse_sam(sams_left, sams_right, tsv_left, tsv_right, genome_seq, enzyme, \
              verbose=True, ncpus=8)
    print 'Done.'
    stdout.flush()

    # create file with both left and right reads that uniquelly mapped to reference genome
    if reads_fastq != '': # left and right reads are stored in one file
        common_reads_prefix = splitext(basename(reads_fastq))[0]
    else: # left and right reads are stored separately
        common_reads_prefix = output_prefix
    uniq_reads_name = common_reads_prefix + '_both_map_uniq.tsv'
    uniq_reads = join(output_directory, uniq_reads_name)
    print 'Merge info about left and right reads in one file...'
    stdout.flush()
    get_intersection(tsv_left, tsv_right, uniq_reads, verbose=True)
    print 'Done.'
    stdout.flush()

    # find read IDs that are filtered by default TADbit filters
    print 'Mask reads...'
    stdout.flush()
    # debug
    print "uniq_reads =", uniq_reads
    masked = filter_reads(uniq_reads)
    print 'Done.'
    stdout.flush()

    # apply all filters (exclude reads that were filtered)
    print 'Filter masked reads...'
    stdout.flush()
    filtered_reads_name = common_reads_prefix + '_filtered.tsv'
    filtered_reads = join(output_directory, filtered_reads_name)
    apply_filter(uniq_reads, filtered_reads, masked)
    print 'Done.'
    stdout.flush()

    # create matrices (one matrix per chromosome)
    print 'Create Hi-C maps (one per chromosome)...'
    stdout.flush()
    hic_map(filtered_reads, resolution=res, by_chrom='intra', savedata=output_directory)
    print 'Done.'
    stdout.flush()
    print 'Add resolution (' + str(resolution) + ') to matrix filenames...'
    stdout.flush()
    add_resolution(chromosomes, resolution, output_directory)
    print 'Done.'
    stdout.flush()
    print 'Add headers to matrix files...'
    stdout.flush()
    add_headers(chromosomes, resolution, output_directory)
    print 'Done.'
    stdout.flush()
    if clean_tmp: # Remove all SAM and TSV files from the output directory
        print 'Remove SAM and TSV files from the output directory.'
        stdout.flush()
        map(os.remove, glob.glob(out_sam_left_path + '*'))
        map(os.remove, glob.glob(out_sam_right_path + '*'))
        map(os.remove, glob.glob(join(output_directory, '*.tsv')))
        print 'Done.'
        stdout.flush()
Esempio n. 4
0
# print 'read 2'
# mapping(gem_index_path, fastq, out_map_dir2, 'HindIII',
#         temp_dir=temp_dir2, windows=((100, 200),))

from pytadbit.mapping.mapper import iterative_mapping  
from pytadbit.mapping.full_mapper import full_mapping 
from pytadbit.parsers.map_parser import parse_map
from pytadbit.parsers.sam_parser import parse_sam
from pytadbit.parsers.genome_parser import parse_fasta
from pytadbit.mapping.mapper import get_intersection
from pytadbit.mapping.filter import filter_reads, apply_filter

if mapper == 1:
    print 'read 1'
    outfiles1 = iterative_mapping(gem_index_path, fastq, out_map_dir1,
                                  r_beg1, [e + 2 for e in r_end1],
                                  temp_dir=temp_dir1)
    print 'read 2'
    outfiles2 = iterative_mapping(gem_index_path, fastq, out_map_dir2,
                                  r_beg2, [e + 2 for e in r_end2],
                                  temp_dir=temp_dir2)
    parse_thing = parse_sam
elif mapper == 2:
    print 'read 1'
    outfiles1 = full_mapping(gem_index_path, fastq, out_map_dir1, 'HindIII',
                             temp_dir=temp_dir1, frag_map=False,
                             windows=(zip(*(r_beg1, r_end1))))
    print 'read 2'
    outfiles2 = full_mapping(gem_index_path, fastq, out_map_dir2, 'HindIII',
                             temp_dir=temp_dir2, frag_map=False,
                             windows=(zip(*(r_beg2, r_end2))))
Esempio n. 5
0
rep = 'SRR_test'
INFILE = INFILE % rep

OUTPATH = PATH + rep + '_' + str(chunk) + '/'

chr_names = ['2L', '2R', '3L', '3R', '4', 'X']
genome_seq = parse_fasta([PATH + 'dmel_reference/chr%s.fa' % crm for crm in chr_names], chr_names)

frags = map_re_sites('HindIII', genome_seq, verbose=True)

sams1 = iterative_mapping(
            gem_index_path       = PATH + 'dmel_reference/dm3.genome.gem',
            fastq_path           = INFILE,
            out_sam_path         = OUTPATH + '%s_r1.txt' % rep,
            temp_dir             = PATH + 'tmp_dir/',
            range_start          = [10] * 5, # starts with a flag sequence
            range_stop           = range(30, 55, 5),
            nthreads             = 8,  # on intel corei7 CPUs 4 threads are as fast as
                                       # 8, but leave some room for you other applications
            max_reads_per_chunk  = chunk,
            single_end           = True)
print 'created thes SAM files:', sams1

sams2 = iterative_mapping(
            gem_index_path       = PATH + 'dmel_reference/dm3.genome.gem',
            fastq_path           = INFILE,
            out_sam_path         = OUTPATH + '%s_r2.txt' % rep,
            temp_dir             = PATH + 'tmp_dir/',
            range_start          = range(80, 55, -5), # starts with a flag sequence
            range_stop           = [100] * 5,
            nthreads             = 8,  # on intel corei7 CPUs 4 threads are as fast as
Esempio n. 6
0
def make_matrices(left_reads_fastq, right_reads_fastq, reads_fastq, genome_fasta, genome_index, \
                  output_directory, output_prefix, enzyme, res, chromosomes, threads_number, \
                  clean_tmp, tmp_dir):

    print 'Begin to process reads.'

    left_reads = ''
    right_reads = ''
    if reads_fastq != '':  # left and right reads are stored in one file
        range_start_left, range_stop_left, \
        range_start_right, range_stop_right = calc_left_right_ranges(reads_fastq)
        print 'Reads:                     ', reads_fastq
        left_reads = reads_fastq
        right_reads = reads_fastq
    else:  # left and right reads are stored separately
        range_start_left, range_stop_left, \
        range_start_right, range_stop_right = calc_range(left_reads_fastq)
        print 'Left reads:                ', left_reads_fastq
        print 'Right reads:               ', right_reads_fastq
        print 'Output prefix:             ', output_prefix
        left_reads = left_reads_fastq
        right_reads = right_reads_fastq

    print 'Reference genome FASTA:    ', genome_fasta
    print 'Reference genome GEM index:', genome_index
    print 'Output directory:          ', output_directory
    print 'Temp directory:            ', tmp_dir
    print 'Enzyme:                    ', enzyme
    print 'Resolution:                ', res, 'bp'
    print 'Number of threads:         ', threads_number
    print 'Start pos for left reads:  ', range_start_left
    print 'Stop pos for left reads:   ', range_stop_left
    print 'Start pos for right reads: ', range_start_right
    print 'Stop pos for right reads:  ', range_stop_right
    stdout.flush()

    # map left reads to reference genome
    out_sam_left_name = splitext(basename(left_reads))[0] + '_left.sam'
    out_sam_left_path = join(output_directory, out_sam_left_name)
    print 'Iterative mapping of left reads (using ' + str(
        threads_number) + ' threads)...'
    stdout.flush()
    sams_left = iterative_mapping(genome_index, left_reads, out_sam_left_path, \
                                  range_start_left, range_stop_left, nthreads=threads_number,
                                  temp_dir=tmp_dir)
    print 'Done.'
    stdout.flush()

    # map right reads to reference genome
    out_sam_right_name = splitext(basename(right_reads))[0] + '_right.sam'
    out_sam_right_path = join(output_directory, out_sam_right_name)
    print 'Iterative mapping of right reads (using ' + str(
        threads_number) + ' threads)...'
    stdout.flush()
    sams_right = iterative_mapping(genome_index, right_reads, out_sam_right_path, \
                                   range_start_right, range_stop_right, nthreads=threads_number,
                                   temp_dir=tmp_dir)
    print 'Done.'
    stdout.flush()

    # load reference genome sequence
    print 'Load reference genome sequence...'
    stdout.flush()
    chroms = chromosomes[:]
    genome_seq = parse_fasta(genome_fasta, chr_names=chroms)
    print 'Done.'
    stdout.flush()

    # create files with information about every left and right read
    # and about their placement with respect to restriction sites
    tsv_left_name = splitext(basename(left_reads))[0] + '_left.tsv'
    tsv_left = join(output_directory, tsv_left_name)
    tsv_right_name = splitext(basename(right_reads))[0] + '_right.tsv'
    tsv_right = join(output_directory, tsv_right_name)
    print 'Get information about restriction sites and reads placement...'
    stdout.flush()
    parse_sam(sams_left, sams_right, tsv_left, tsv_right, genome_seq, enzyme, \
              verbose=True, ncpus=8)
    print 'Done.'
    stdout.flush()

    # create file with both left and right reads that uniquelly mapped to reference genome
    if reads_fastq != '':  # left and right reads are stored in one file
        common_reads_prefix = splitext(basename(reads_fastq))[0]
    else:  # left and right reads are stored separately
        common_reads_prefix = output_prefix
    uniq_reads_name = common_reads_prefix + '_both_map_uniq.tsv'
    uniq_reads = join(output_directory, uniq_reads_name)
    print 'Merge info about left and right reads in one file...'
    stdout.flush()
    get_intersection(tsv_left, tsv_right, uniq_reads, verbose=True)
    print 'Done.'
    stdout.flush()

    # find read IDs that are filtered by default TADbit filters
    print 'Mask reads...'
    stdout.flush()
    masked = filter_reads(uniq_reads)
    print 'Done.'
    stdout.flush()

    # apply all filters (exclude reads that were filtered)
    print 'Filter masked reads...'
    stdout.flush()
    filtered_reads_name = common_reads_prefix + '_filtered.tsv'
    filtered_reads = join(output_directory, filtered_reads_name)
    apply_filter(uniq_reads, filtered_reads, masked)
    print 'Done.'
    stdout.flush()

    # create matrices (one matrix per chromosome)
    print 'Create Hi-C maps (one per chromosome)...'
    stdout.flush()
    hic_map(filtered_reads,
            resolution=res,
            by_chrom='intra',
            savedata=output_directory)
    print 'Done.'
    stdout.flush()
    print 'Add resolution (' + str(resolution) + ') to matrix filenames...'
    stdout.flush()
    add_resolution(chromosomes, resolution, output_directory)
    print 'Done.'
    stdout.flush()
    print 'Add headers to matrix files...'
    stdout.flush()
    add_headers(chromosomes, resolution, output_directory)
    print 'Done.'
    stdout.flush()
    if clean_tmp:  # Remove all SAM and TSV files from the output directory
        print 'Remove SAM and TSV files from the output directory.'
        stdout.flush()
        remove(out_sam_left_path + '*')
        remove(out_sam_right_path + '*')
        remove(join(output_directory, '*.tsv'))
        print 'Done.'
        stdout.flush()