Ejemplo n.º 1
0
    def test_18_filter_reads(self):
        if CHKTIME:
            t0 = time()
        for ali in ['map', 'sam']:
            seed(1)
            if 13436 == int(random()*100000):
                same_seed = True
                genome = generate_random_ali(ali)
                genome_bis = parse_fasta('test.fa~', verbose=False)
                self.assertEqual(genome, genome_bis)
            else:
                same_seed = False
                genome = parse_fasta('test.fa~')
            # PARSE SAM
            if ali == 'map':
                from pytadbit.parsers.map_parser import parse_map as parser
            else:
                try:
                    from pytadbit.parsers.sam_parser import parse_sam as parser
                except ImportError:
                    print 'ERROR: PYSAM not found, skipping test\n'
                    continue

            parser(['test_read1.%s~' % (ali)], ['test_read2.%s~' % (ali)],
                   './lala1-%s~' % (ali), './lala2-%s~' % (ali), genome,
                   re_name='DPNII', mapper='GEM')

            # GET INTERSECTION
            from pytadbit.mapping.mapper import get_intersection
            get_intersection('lala1-%s~' % (ali), 'lala2-%s~' % (ali),
                             'lala-%s~' % (ali))
            # FILTER
            masked = filter_reads('lala-%s~' % (ali), verbose=False,
                                  fast=(ali=='map'))
            self.assertEqual(masked[1]['reads'], 1000)
            self.assertEqual(masked[2]['reads'], 1000)
            self.assertEqual(masked[3]['reads'], 1000)
            self.assertEqual(masked[4]['reads'], 1000)
            if same_seed:
                self.assertEqual(masked[5]['reads'], 1125)
                self.assertEqual(masked[6]['reads'], 2328)
                self.assertEqual(masked[7]['reads'], 0)
                self.assertEqual(masked[8]['reads'], 94)
                self.assertEqual(masked[10]['reads'], 1)
            else:
                self.assertTrue (masked[5]['reads'] > 1000)
            self.assertEqual(masked[9]['reads'], 1000)
        apply_filter('lala-map~', 'lala-map-filt~', masked, filters=[1],
                     reverse=True, verbose=False)
        self.assertEqual(len([True for l in open('lala-map-filt~')
                              if not l.startswith('#')]), 1000)
        d = plot_iterative_mapping('lala1-map~', 'lala2-map~')
        self.assertEqual(d[0][1], 6000)

        if CHKTIME:
            self.assertEqual(True, True)
            print '18', time() - t0
Ejemplo n.º 2
0
def main():

    fastq          = '/scratch/db/FASTQs/hsap/dixon_2012/dixon-2012_200bp.fastq'
    fastq          = 'short_dixon-2012_200bp.fastq'
    # fastq        = '/scratch/test/sample_dataset/FASTQs/sample_hsap_HindIII.fastq'
    gem_index_path = '/scratch/db/index_files/Homo_sapiens-79/Homo_sapiens.gem'
    out_map_dir1   = '/home/fransua/Box/tadbits/tadbit/_pytadbit/mapping/read1/'
    out_map_dir2   = '/home/fransua/Box/tadbits/tadbit/_pytadbit/mapping/read2/'
    temp_dir1      = '/home/fransua/Box/tadbits/tadbit/_pytadbit/mapping/tmp1/'
    temp_dir2      = '/home/fransua/Box/tadbits/tadbit/_pytadbit/mapping/tmp2/'

    print 'read 1'
    outfiles1 = full_mapping(gem_index_path, fastq, out_map_dir1, 'HindIII',
                             temp_dir=temp_dir1, windows=((1,100),), add_site=True)
    print 'read 2'
    outfiles2 = full_mapping(gem_index_path, fastq, out_map_dir2, 'HindIII',
                             temp_dir=temp_dir2, windows=((101, 200),), add_site=True)
    # print 'read 1'
    # outfiles1 = mapping(gem_index_path, fastq, out_map_dir1, 'HindIII',
    #                     temp_dir=temp_dir1,
    #                     windows=(zip(*([0] * len(range(25, 105, 5)),
    #                                    range(25,105,5)))))
    # print 'read 2'
    # outfiles2 = mapping(gem_index_path, fastq, out_map_dir2, 'HindIII',
    #                     temp_dir=temp_dir2,
    #                     windows=(zip(*([100] * len(range(125, 205, 5)),
    #                                            range(125,205,5)))))
    
    print outfiles1
    print 'xcmvnkljnv'
    print outfiles2
    
    from pytadbit.parsers.map_parser import parse_map
    from pytadbit.parsers.genome_parser import parse_fasta
    from pytadbit.mapping.mapper import get_intersection
    from pytadbit.mapping.filter import filter_reads, apply_filter
    
    read1, read2 = 'read1.tsv', 'read2.tsv',
    parse_map(outfiles1, outfiles2, out_file1=read1, out_file2=read2,
              genome_seq=parse_fasta('/scratch/db/index_files/Homo_sapiens-79/Homo_sapiens.fa'),
              re_name='HindIII', verbose=True)

    reads = 'both_reads.tsv'
    get_intersection(read1, read2, reads)

    masked = filter_reads(reads)
    freads = 'filtered_reads.tsv'
    apply_filter(reads, freads, masked)
Ejemplo n.º 3
0
def main():

    fastq          = '/scratch/db/FASTQs/hsap/dixon_2012/dixon-2012_200bp.fastq'
    fastq          = 'short_dixon-2012_200bp.fastq'
    # fastq        = '/scratch/test/sample_dataset/FASTQs/sample_hsap_HindIII.fastq'
    gem_index_path = '/scratch/db/index_files/Homo_sapiens-79/Homo_sapiens.gem'
    out_map_dir1   = '/home/fransua/Box/tadbits/tadbit/_pytadbit/mapping/read1/'
    out_map_dir2   = '/home/fransua/Box/tadbits/tadbit/_pytadbit/mapping/read2/'
    temp_dir1      = '/home/fransua/Box/tadbits/tadbit/_pytadbit/mapping/tmp1/'
    temp_dir2      = '/home/fransua/Box/tadbits/tadbit/_pytadbit/mapping/tmp2/'

    print 'read 1'
    outfiles1 = full_mapping(gem_index_path, fastq, out_map_dir1, 'HindIII',
                             temp_dir=temp_dir1, windows=((1,100),), add_site=True)
    print 'read 2'
    outfiles2 = full_mapping(gem_index_path, fastq, out_map_dir2, 'HindIII',
                             temp_dir=temp_dir2, windows=((101, 200),), add_site=True)
    # print 'read 1'
    # outfiles1 = mapping(gem_index_path, fastq, out_map_dir1, 'HindIII',
    #                     temp_dir=temp_dir1,
    #                     windows=(zip(*([0] * len(range(25, 105, 5)),
    #                                    range(25,105,5)))))
    # print 'read 2'
    # outfiles2 = mapping(gem_index_path, fastq, out_map_dir2, 'HindIII',
    #                     temp_dir=temp_dir2,
    #                     windows=(zip(*([100] * len(range(125, 205, 5)),
    #                                            range(125,205,5)))))
    
    print outfiles1
    print 'xcmvnkljnv'
    print outfiles2
    
    from pytadbit.parsers.map_parser import parse_map
    from pytadbit.parsers.genome_parser import parse_fasta
    from pytadbit.mapping.mapper import get_intersection
    from pytadbit.mapping.filter import filter_reads, apply_filter
    
    read1, read2 = 'read1.tsv', 'read2.tsv',
    parse_map(outfiles1, outfiles2, out_file1=read1, out_file2=read2,
              genome_seq=parse_fasta('/scratch/db/index_files/Homo_sapiens-79/Homo_sapiens.fa'),
              re_name='HindIII', verbose=True)

    reads = 'both_reads.tsv'
    get_intersection(read1, read2, reads)

    masked = filter_reads(reads)
    freads = 'filtered_reads.tsv'
    apply_filter(reads, freads, masked)
Ejemplo n.º 4
0
        }
        out1.write(read.format(**read1))
        out2.write(read.format(**read2))
    i += 1

out1.close()
out2.close()

# PARSE SAM
if ali == 'map':
    from pytadbit.parsers.map_parser import parse_map as parser
else:
    from pytadbit.parsers.sam_parser import parse_sam as parser

parser(['test_read1.%s~' % (ali)], ['test_read2.%s~' % (ali)],
       './lala1-%s~' % (ali),
       './lala2-%s~' % (ali),
       genome,
       re_name='DPNII',
       mapper='GEM')

# GET INTERSECTION
from pytadbit.mapping.mapper import get_intersection

get_intersection('lala1-%s~' % (ali), 'lala2-%s~' % (ali), 'lala-%s~' % (ali))

# FILTER
from pytadbit.mapping.filter import filter_reads

masked = filter_reads('lala-%s~' % (ali))
Ejemplo n.º 5
0
    else:
        read2 = {'crm': crm1, 'pos': pos1, 'flag': flags[sd1], 'id': 'lala05.1%011d' % (i)}
        read1 = {'crm': crm2, 'pos': pos2, 'flag': flags[sd2], 'id': 'lala05.1%011d' % (i)}
        out1.write(read.format(**read1))
        out2.write(read.format(**read2))
    i += 1

out1.close()
out2.close()

# PARSE SAM
if ali == 'map':
    from pytadbit.parsers.map_parser import parse_map as parser
else:
    from pytadbit.parsers.sam_parser import parse_sam as parser

parser(['test_read1.%s~' % (ali)], ['test_read2.%s~' % (ali)],
       './lala1-%s~' % (ali), './lala2-%s~' % (ali), genome,
       re_name='DPNII', mapper='GEM')

# GET INTERSECTION
from pytadbit.mapping.mapper import get_intersection

get_intersection('lala1-%s~' % (ali), 'lala2-%s~' % (ali), 'lala-%s~' % (ali))

# FILTER
from pytadbit.mapping.filter import filter_reads

masked = filter_reads('lala-%s~' % (ali))

Ejemplo n.º 6
0
def make_matrices(left_reads_fastq, right_reads_fastq, reads_fastq, genome_fasta, genome_index, \
                  output_directory, output_prefix, enzyme, res, chromosomes, threads_number, \
                  clean_tmp, tmp_dir):

    print 'Begin to process reads.'

    left_reads = ''
    right_reads = ''
    if reads_fastq != '': # left and right reads are stored in one file
        range_start_left, range_stop_left, \
        range_start_right, range_stop_right = calc_left_right_ranges(reads_fastq)
        print 'Reads:                     ', reads_fastq
        left_reads = reads_fastq
        right_reads = reads_fastq
    else: # left and right reads are stored separately
        range_start_left, range_stop_left, \
        range_start_right, range_stop_right = calc_range(left_reads_fastq)
        print 'Left reads:                ', left_reads_fastq
        print 'Right reads:               ', right_reads_fastq
        print 'Output prefix:             ', output_prefix
        left_reads = left_reads_fastq
        right_reads = right_reads_fastq

    print 'Reference genome FASTA:    ', genome_fasta
    print 'Reference genome GEM index:', genome_index
    print 'Output directory:          ', output_directory
    print 'Temp directory:            ', tmp_dir
    print 'Enzyme:                    ', enzyme
    print 'Resolution:                ', res, 'bp'
    print 'Number of threads:         ', threads_number
    print 'Start pos for left reads:  ', range_start_left
    print 'Stop pos for left reads:   ', range_stop_left
    print 'Start pos for right reads: ', range_start_right
    print 'Stop pos for right reads:  ', range_stop_right
    stdout.flush()

    # map left reads to reference genome
    out_sam_left_name = splitext(basename(left_reads))[0] + '_left.sam'
    out_sam_left_path = join(output_directory, out_sam_left_name)
    print 'Iterative mapping of left reads (using ' + str(threads_number) + ' threads)...'
    stdout.flush()
    sams_left = iterative_mapping(genome_index, left_reads, out_sam_left_path, \
                                  range_start_left, range_stop_left, nthreads=threads_number,
                                  temp_dir=tmp_dir)
    print 'Done.'
    stdout.flush()

    # map right reads to reference genome
    out_sam_right_name = splitext(basename(right_reads))[0] + '_right.sam'
    out_sam_right_path = join(output_directory, out_sam_right_name)
    print 'Iterative mapping of right reads (using ' + str(threads_number) + ' threads)...'
    stdout.flush()
    sams_right = iterative_mapping(genome_index, right_reads, out_sam_right_path, \
                                   range_start_right, range_stop_right, nthreads=threads_number,
                                   temp_dir=tmp_dir)
    print 'Done.'
    stdout.flush()

    # load reference genome sequence
    print 'Load reference genome sequence...'
    stdout.flush()
    chroms = chromosomes[:]
    genome_seq = parse_fasta(genome_fasta, chr_names=chroms)
    print 'Done.'
    stdout.flush()

    # create files with information about every left and right read 
    # and about their placement with respect to restriction sites
    tsv_left_name = splitext(basename(left_reads))[0] + '_left.tsv'
    tsv_left = join(output_directory, tsv_left_name)
    tsv_right_name = splitext(basename(right_reads))[0] + '_right.tsv'
    tsv_right = join(output_directory, tsv_right_name)
    print 'Get information about restriction sites and reads placement...'
    stdout.flush()
    parse_sam(sams_left, sams_right, tsv_left, tsv_right, genome_seq, enzyme, \
              verbose=True, ncpus=8)
    print 'Done.'
    stdout.flush()

    # create file with both left and right reads that uniquelly mapped to reference genome
    if reads_fastq != '': # left and right reads are stored in one file
        common_reads_prefix = splitext(basename(reads_fastq))[0]
    else: # left and right reads are stored separately
        common_reads_prefix = output_prefix
    uniq_reads_name = common_reads_prefix + '_both_map_uniq.tsv'
    uniq_reads = join(output_directory, uniq_reads_name)
    print 'Merge info about left and right reads in one file...'
    stdout.flush()
    get_intersection(tsv_left, tsv_right, uniq_reads, verbose=True)
    print 'Done.'
    stdout.flush()

    # find read IDs that are filtered by default TADbit filters
    print 'Mask reads...'
    stdout.flush()
    # debug
    print "uniq_reads =", uniq_reads
    masked = filter_reads(uniq_reads)
    print 'Done.'
    stdout.flush()

    # apply all filters (exclude reads that were filtered)
    print 'Filter masked reads...'
    stdout.flush()
    filtered_reads_name = common_reads_prefix + '_filtered.tsv'
    filtered_reads = join(output_directory, filtered_reads_name)
    apply_filter(uniq_reads, filtered_reads, masked)
    print 'Done.'
    stdout.flush()

    # create matrices (one matrix per chromosome)
    print 'Create Hi-C maps (one per chromosome)...'
    stdout.flush()
    hic_map(filtered_reads, resolution=res, by_chrom='intra', savedata=output_directory)
    print 'Done.'
    stdout.flush()
    print 'Add resolution (' + str(resolution) + ') to matrix filenames...'
    stdout.flush()
    add_resolution(chromosomes, resolution, output_directory)
    print 'Done.'
    stdout.flush()
    print 'Add headers to matrix files...'
    stdout.flush()
    add_headers(chromosomes, resolution, output_directory)
    print 'Done.'
    stdout.flush()
    if clean_tmp: # Remove all SAM and TSV files from the output directory
        print 'Remove SAM and TSV files from the output directory.'
        stdout.flush()
        map(os.remove, glob.glob(out_sam_left_path + '*'))
        map(os.remove, glob.glob(out_sam_right_path + '*'))
        map(os.remove, glob.glob(join(output_directory, '*.tsv')))
        print 'Done.'
        stdout.flush()
Ejemplo n.º 7
0
    def test_18_filter_reads(self):
        if CHKTIME:
            t0 = time()
        for ali in ['map', 'sam']:
            seed(1)
            if 13436 == int(random() * 100000):
                same_seed = True
                genome = generate_random_ali(ali)
                genome_bis = parse_fasta('test.fa~', verbose=False)
                self.assertEqual(genome, genome_bis)
            else:
                same_seed = False
                genome = parse_fasta('test.fa~')
            # PARSE SAM
            if ali == 'map':
                from pytadbit.parsers.map_parser import parse_map as parser
            else:
                try:
                    from pytadbit.parsers.sam_parser import parse_sam as parser
                except ImportError:
                    print 'ERROR: PYSAM not found, skipping test\n'
                    continue

            parser(['test_read1.%s~' % (ali)], ['test_read2.%s~' % (ali)],
                   './lala1-%s~' % (ali),
                   './lala2-%s~' % (ali),
                   genome,
                   re_name='DPNII',
                   mapper='GEM')

            # GET INTERSECTION
            from pytadbit.mapping.mapper import get_intersection
            get_intersection('lala1-%s~' % (ali), 'lala2-%s~' % (ali),
                             'lala-%s~' % (ali))
            # FILTER
            masked = filter_reads('lala-%s~' % (ali),
                                  verbose=False,
                                  fast=(ali == 'map'))
            self.assertEqual(masked[1]['reads'], 1000)
            self.assertEqual(masked[2]['reads'], 1000)
            self.assertEqual(masked[3]['reads'], 1000)
            self.assertEqual(masked[4]['reads'], 1000)
            if same_seed:
                self.assertEqual(masked[5]['reads'], 1125)
                self.assertEqual(masked[6]['reads'], 2328)
                self.assertEqual(masked[7]['reads'], 0)
                self.assertEqual(masked[8]['reads'], 94)
                self.assertEqual(masked[10]['reads'], 1)
            else:
                self.assertTrue(masked[5]['reads'] > 1000)
            self.assertEqual(masked[9]['reads'], 1000)
        apply_filter('lala-map~',
                     'lala-map-filt~',
                     masked,
                     filters=[1],
                     reverse=True)
        self.assertEqual(
            len([
                True for l in open('lala-map-filt~') if not l.startswith('#')
            ]), 1000)
        d = plot_iterative_mapping('lala1-map~', 'lala2-map~')
        self.assertEqual(d[0][1], 6000)

        if CHKTIME:
            self.assertEqual(True, True)
            print '18', time() - t0
Ejemplo n.º 8
0
    outfiles1 = full_mapping(gem_index_path, fastq, out_map_dir1, 'HindIII',
                             temp_dir=temp_dir1, frag_map=False,
                             windows=(zip(*(r_beg1, r_end1))))
    print 'read 2'
    outfiles2 = full_mapping(gem_index_path, fastq, out_map_dir2, 'HindIII',
                             temp_dir=temp_dir2, frag_map=False,
                             windows=(zip(*(r_beg2, r_end2))))
    parse_thing = parse_map
elif mapper == 3:
    print 'read 1'
    outfiles1 = full_mapping(gem_index_path, fastq, out_map_dir1, 'HindIII',
                             temp_dir=temp_dir1,
                             windows=(zip(*(r_beg1, r_end1))))
    print 'read 2'
    outfiles2 = full_mapping(gem_index_path, fastq, out_map_dir2, 'HindIII',
                             temp_dir=temp_dir2,
                             windows=(zip(*(r_beg2, r_end2))))
    parse_thing = parse_map

read1, read2 = 'read1.tsv_%s-%s' % (mapper, win), 'read2.tsv_%s-%s' % (mapper, win)
parse_thing(outfiles1, outfiles2, out_file1=read1, out_file2=read2,
            genome_seq=parse_fasta('/scratch/db/index_files/Homo_sapiens-79/Homo_sapiens.fa'),
            re_name='HindIII', verbose=True)

reads = 'both_reads.tsv_%s-%s' % (mapper, win)
get_intersection(read1, read2, reads)

masked = filter_reads(reads)
freads = 'filtered_reads.tsv_%s-%s' % (mapper, win)
apply_filter(reads, freads, masked)
Ejemplo n.º 9
0
                                       # 8, but leave some room for you other applications
            max_reads_per_chunk  = chunk,
            single_end           = True)

sams1 = [OUTPATH + fnam for fnam in os.listdir(OUTPATH) if fnam.rsplit('.', 2)[0].endswith('_r1.txt')]
sams2 = [OUTPATH + fnam for fnam in os.listdir(OUTPATH) if fnam.rsplit('.', 2)[0].endswith('_r2.txt')]

print 'created thes SAM files:', sams2

parse_sam(sams1, sams2, frags, 
          OUTPATH + 'reads1_%s.tsv' % rep, OUTPATH + 'reads2_%s.tsv' % rep,
          genome_seq, 'HindIII', verbose=True)

reads1 = OUTPATH + 'reads1_%s.tsv' % rep
reads2 = OUTPATH + 'reads2_%s.tsv' % rep
reads  = OUTPATH + 'reads12_%s.tsv' % rep

get_intersection(reads1, reads2, reads, verbose=True)

from pytadbit.mapping.analyze import hic_map
hic_map(reads, genome_seq, resolution=100000, savedata='lala')

from pytadbit.mapping.analyze import plot_genomic_distribution

plot_genomic_distribution(reads, resolution=50000, genome_seq=genome_seq) # because I know it

#  690691 SRR_test_10000000/reads1_SRR_test.tsv
#  690691 SRR_test_100000/reads1_SRR_test.tsv
# 1242927 SRR_test_200000/reads1_SRR_test.tsv
# 1035866 SRR_test_500000/reads1_SRR_test.tsv
Ejemplo n.º 10
0
def make_matrices(left_reads_fastq, right_reads_fastq, reads_fastq, genome_fasta, genome_index, \
                  output_directory, output_prefix, enzyme, res, chromosomes, threads_number, \
                  clean_tmp, tmp_dir):

    print 'Begin to process reads.'

    left_reads = ''
    right_reads = ''
    if reads_fastq != '':  # left and right reads are stored in one file
        range_start_left, range_stop_left, \
        range_start_right, range_stop_right = calc_left_right_ranges(reads_fastq)
        print 'Reads:                     ', reads_fastq
        left_reads = reads_fastq
        right_reads = reads_fastq
    else:  # left and right reads are stored separately
        range_start_left, range_stop_left, \
        range_start_right, range_stop_right = calc_range(left_reads_fastq)
        print 'Left reads:                ', left_reads_fastq
        print 'Right reads:               ', right_reads_fastq
        print 'Output prefix:             ', output_prefix
        left_reads = left_reads_fastq
        right_reads = right_reads_fastq

    print 'Reference genome FASTA:    ', genome_fasta
    print 'Reference genome GEM index:', genome_index
    print 'Output directory:          ', output_directory
    print 'Temp directory:            ', tmp_dir
    print 'Enzyme:                    ', enzyme
    print 'Resolution:                ', res, 'bp'
    print 'Number of threads:         ', threads_number
    print 'Start pos for left reads:  ', range_start_left
    print 'Stop pos for left reads:   ', range_stop_left
    print 'Start pos for right reads: ', range_start_right
    print 'Stop pos for right reads:  ', range_stop_right
    stdout.flush()

    # map left reads to reference genome
    out_sam_left_name = splitext(basename(left_reads))[0] + '_left.sam'
    out_sam_left_path = join(output_directory, out_sam_left_name)
    print 'Iterative mapping of left reads (using ' + str(
        threads_number) + ' threads)...'
    stdout.flush()
    sams_left = iterative_mapping(genome_index, left_reads, out_sam_left_path, \
                                  range_start_left, range_stop_left, nthreads=threads_number,
                                  temp_dir=tmp_dir)
    print 'Done.'
    stdout.flush()

    # map right reads to reference genome
    out_sam_right_name = splitext(basename(right_reads))[0] + '_right.sam'
    out_sam_right_path = join(output_directory, out_sam_right_name)
    print 'Iterative mapping of right reads (using ' + str(
        threads_number) + ' threads)...'
    stdout.flush()
    sams_right = iterative_mapping(genome_index, right_reads, out_sam_right_path, \
                                   range_start_right, range_stop_right, nthreads=threads_number,
                                   temp_dir=tmp_dir)
    print 'Done.'
    stdout.flush()

    # load reference genome sequence
    print 'Load reference genome sequence...'
    stdout.flush()
    chroms = chromosomes[:]
    genome_seq = parse_fasta(genome_fasta, chr_names=chroms)
    print 'Done.'
    stdout.flush()

    # create files with information about every left and right read
    # and about their placement with respect to restriction sites
    tsv_left_name = splitext(basename(left_reads))[0] + '_left.tsv'
    tsv_left = join(output_directory, tsv_left_name)
    tsv_right_name = splitext(basename(right_reads))[0] + '_right.tsv'
    tsv_right = join(output_directory, tsv_right_name)
    print 'Get information about restriction sites and reads placement...'
    stdout.flush()
    parse_sam(sams_left, sams_right, tsv_left, tsv_right, genome_seq, enzyme, \
              verbose=True, ncpus=8)
    print 'Done.'
    stdout.flush()

    # create file with both left and right reads that uniquelly mapped to reference genome
    if reads_fastq != '':  # left and right reads are stored in one file
        common_reads_prefix = splitext(basename(reads_fastq))[0]
    else:  # left and right reads are stored separately
        common_reads_prefix = output_prefix
    uniq_reads_name = common_reads_prefix + '_both_map_uniq.tsv'
    uniq_reads = join(output_directory, uniq_reads_name)
    print 'Merge info about left and right reads in one file...'
    stdout.flush()
    get_intersection(tsv_left, tsv_right, uniq_reads, verbose=True)
    print 'Done.'
    stdout.flush()

    # find read IDs that are filtered by default TADbit filters
    print 'Mask reads...'
    stdout.flush()
    masked = filter_reads(uniq_reads)
    print 'Done.'
    stdout.flush()

    # apply all filters (exclude reads that were filtered)
    print 'Filter masked reads...'
    stdout.flush()
    filtered_reads_name = common_reads_prefix + '_filtered.tsv'
    filtered_reads = join(output_directory, filtered_reads_name)
    apply_filter(uniq_reads, filtered_reads, masked)
    print 'Done.'
    stdout.flush()

    # create matrices (one matrix per chromosome)
    print 'Create Hi-C maps (one per chromosome)...'
    stdout.flush()
    hic_map(filtered_reads,
            resolution=res,
            by_chrom='intra',
            savedata=output_directory)
    print 'Done.'
    stdout.flush()
    print 'Add resolution (' + str(resolution) + ') to matrix filenames...'
    stdout.flush()
    add_resolution(chromosomes, resolution, output_directory)
    print 'Done.'
    stdout.flush()
    print 'Add headers to matrix files...'
    stdout.flush()
    add_headers(chromosomes, resolution, output_directory)
    print 'Done.'
    stdout.flush()
    if clean_tmp:  # Remove all SAM and TSV files from the output directory
        print 'Remove SAM and TSV files from the output directory.'
        stdout.flush()
        remove(out_sam_left_path + '*')
        remove(out_sam_right_path + '*')
        remove(join(output_directory, '*.tsv'))
        print 'Done.'
        stdout.flush()
Ejemplo n.º 11
0
            break
    read1 = {'crm': crm, 'pos': pos1, 'flag': sd1, 'id': 'lala04.%012d' % (i)}
    read2 = {'crm': crm, 'pos': pos2, 'flag': sd2, 'id': 'lala04.%012d' % (i)}
    out1.write(read_str.format(**read1))
    out2.write(read_str.format(**read2))
    # TOO CLOSE FROM RE
    

out1.close()
out2.close()

# PARSE SAM
from pytadbit.parsers.sam_parser import parse_sam

parse_sam(['test_read1.sam~'], ['test_read2.sam~'],
          'lala1~', 'lala2~', genome, re_name='DPNII', mapper='GEM')

# GET INTERSECTION
from pytadbit.mapping.mapper import get_intersection

get_intersection('lala1~', 'lala2~', 'lala~')

# FILTER
from pytadbit.mapping.filter import filter_reads

masked = filter_reads('lala~')