def filterReads(self, conservative=True):
        """
        Filter the reads to remove duplicates and experimental abnormalities
        Requires 4 CPU
        """

        reads = self.parsed_reads_dir + '/both_map.tsv'
        filt_reads = self.parsed_reads_dir + '/filtered_map.tsv'

        masked = filter_reads(reads,
                              max_molecule_length=610,
                              min_dist_to_re=915,
                              over_represented=0.005,
                              max_frag_size=100000,
                              min_frag_size=100,
                              re_proximity=4)

        if conservative:
            # Ignore filter 5, which the docs describe as not very helpful
            apply_filter(reads,
                         filt_reads,
                         masked,
                         filters=[1, 2, 3, 4, 6, 7, 8, 9, 10])
        else:
            # Less conservative option
            apply_filter(reads, filt_reads, masked, filters=[1, 2, 3, 9, 10])
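
For reference, filter_reads returns a dict keyed by filter index, where each
entry records the filter's name and the number of reads it flagged (the same
structure is indexed as masked[k]['name'] and masked[k]['reads'] further down
this page). A minimal sketch for inspecting the mask before choosing which
filter indices to pass to apply_filter; the input path is a placeholder:

from pytadbit.mapping.filter import filter_reads

masked = filter_reads('both_map.tsv')  # placeholder input file
# each entry: filter index -> {'name': <filter name>, 'reads': <reads flagged>}
for idx in sorted(masked):
    print('%2d  %-25s %d' % (idx, masked[idx]['name'], masked[idx]['reads']))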
Example #2
    def test_18_filter_reads(self):
        if ONLY and ONLY != '18':
            return
        if CHKTIME:
            t0 = time()
        for ali in ['map', 'sam']:
            seed(1)
            if 13436 == int(random()*100000):
                same_seed = True
                genome = generate_random_ali(ali)
                genome_bis = parse_fasta('test.fa~', verbose=False)
                self.assertEqual(genome, genome_bis)
            else:
                same_seed = False
                genome = parse_fasta('test.fa~')
            # PARSE SAM
            if ali == 'map':
                from pytadbit.parsers.map_parser import parse_map as parser
            else:
                try:
                    from pytadbit.parsers.sam_parser import parse_sam as parser
                except ImportError:
                    print 'ERROR: PYSAM not found, skipping test\n'
                    continue

            parser(['test_read1.%s~' % (ali)], ['test_read2.%s~' % (ali)],
                   './lala1-%s~' % (ali), './lala2-%s~' % (ali), genome,
                   re_name='DPNII', mapper='GEM')

            # GET INTERSECTION
            from pytadbit.mapping import get_intersection
            get_intersection('lala1-%s~' % (ali), 'lala2-%s~' % (ali),
                             'lala-%s~' % (ali))
            # FILTER
            masked = filter_reads('lala-%s~' % (ali), verbose=False,
                                  fast=(ali=='map'))
            self.assertEqual(masked[1]['reads'], 1000)
            self.assertEqual(masked[2]['reads'], 1000)
            self.assertEqual(masked[3]['reads'], 1000)
            self.assertEqual(masked[4]['reads'], 1000)
            if same_seed:
                self.assertEqual(masked[5]['reads'], 1110)
                self.assertEqual(masked[6]['reads'], 2332)
                self.assertEqual(masked[7]['reads'], 0)
                self.assertEqual(masked[8]['reads'], 141)
                self.assertEqual(masked[10]['reads'], 1)
            else:
                self.assertTrue(masked[5]['reads'] > 1000)
            self.assertEqual(masked[9]['reads'], 1000)
        apply_filter('lala-map~', 'lala-map-filt~', masked, filters=[1],
                     reverse=True, verbose=False)
        self.assertEqual(len([True for l in open('lala-map-filt~')
                              if not l.startswith('#')]), 1000)
        d = plot_iterative_mapping('lala1-map~', 'lala2-map~')
        self.assertEqual(d[0][1], 6000)

        if CHKTIME:
            self.assertEqual(True, True)
            print '18', time() - t0
Example #3
    def test_18_filter_reads(self):
        if ONLY and "18" not in ONLY:
            return
        if CHKTIME:
            t0 = time()
        for ali in ["map", "sam"]:
            seed(1)
            if 13436 == int(random()*100000):
                same_seed = True
                genome = generate_random_ali(ali)
                genome_bis = parse_fasta("test.fa~", verbose=False)
                self.assertEqual(genome, genome_bis)
            else:
                same_seed = False
                genome = parse_fasta("test.fa~")
            # PARSE SAM
            if ali == "map":
                from pytadbit.parsers.map_parser import parse_map as parser
            else:
                try:
                    from pytadbit.parsers.sam_parser import parse_sam as parser
                except ImportError:
                    print "ERROR: PYSAM not found, skipping test\n"
                    continue

            parser(["test_read1.%s~" % (ali)], ["test_read2.%s~" % (ali)],
                   "./lala1-%s~" % (ali), "./lala2-%s~" % (ali), genome,
                   re_name="DPNII", mapper="GEM")

            # GET INTERSECTION
            from pytadbit.mapping import get_intersection
            get_intersection("lala1-%s~" % (ali), "lala2-%s~" % (ali),
                             "lala-%s~" % (ali))
            # FILTER
            masked = filter_reads("lala-%s~" % (ali), verbose=False,
                                  fast=(ali=="map"))
            self.assertEqual(masked[1]["reads"], 1000)
            self.assertEqual(masked[2]["reads"], 1000)
            self.assertEqual(masked[3]["reads"], 1000)
            self.assertEqual(masked[4]["reads"], 1000)
            if same_seed:
                self.assertEqual(masked[5]["reads"], 1110)
                self.assertEqual(masked[6]["reads"], 2332)
                self.assertEqual(masked[7]["reads"], 0)
                self.assertEqual(masked[8]["reads"], 141)
                self.assertEqual(masked[10]["reads"], 1)
            else:
                self.assertTrue(masked[5]["reads"] > 1000)
            self.assertEqual(masked[9]["reads"], 1000)
        apply_filter("lala-map~", "lala-map-filt~", masked, filters=[1],
                     reverse=True, verbose=False)
        self.assertEqual(len([True for l in open("lala-map-filt~")
                              if not l.startswith("#")]), 1000)
        d = plot_iterative_mapping("lala1-map~", "lala2-map~")
        self.assertEqual(d[0][1], 6000)

        if CHKTIME:
            self.assertEqual(True, True)
            print "18", time() - t0
Example #5
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    fname1, fname2 = load_parameters_fromdb(opts)

    param_hash = digest_parameters(opts)

    reads = path.join(opts.workdir, '03_filtered_reads',
                      'all_r1-r2_intersection_%s.tsv' % param_hash)
    mreads = path.join(opts.workdir, '03_filtered_reads',
                       'valid_r1-r2_intersection_%s.tsv' % param_hash)

    if not opts.resume:
        mkdir(path.join(opts.workdir, '03_filtered_reads'))

        # compute the intersection of the two read ends
        print 'Getting intersection between read 1 and read 2'
        count, multiples = get_intersection(fname1, fname2, reads)

        # compute insert size
        print 'Get insert size...'
        hist_path = path.join(opts.workdir,
                              'histogram_fragment_sizes_%s.pdf' % param_hash)
        median, max_f, mad = insert_sizes(
            reads, nreads=1000000, stats=('median', 'first_decay', 'MAD'),
            savefig=hist_path)
        
        print '  - median insert size =', median
        print '  - double median absolute deviation of insert size =', mad
        print '  - max insert size (when a gap in continuity of > 10 bp is found in fragment lengths) =', max_f
    
        max_mole = max_f # pseudo DEs
        min_dist = max_f + mad # random breaks
        print ('   Using the maximum continuous fragment size '
               '(%d bp) to check '
               'for pseudo-dangling ends') % max_mole
        print ('   Using maximum continuous fragment size plus the MAD '
               '(%d bp) to check for random breaks') % min_dist
    
        print "identify pairs to filter..."
        masked = filter_reads(reads, max_molecule_length=max_mole,
                              over_represented=opts.over_represented,
                              max_frag_size=opts.max_frag_size,
                              min_frag_size=opts.min_frag_size,
                              re_proximity=opts.re_proximity,
                              min_dist_to_re=min_dist, fast=True)

    n_valid_pairs = apply_filter(reads, mreads, masked,
                                 filters=opts.apply)

    finish_time = time.localtime()
    print median, max_f, mad
    # save all job information to sqlite DB
    save_to_db(opts, count, multiples, reads, mreads, n_valid_pairs, masked,
               hist_path, median, max_f, mad, launch_time, finish_time)
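
To make the threshold arithmetic above concrete: max_f (the largest fragment
size before a gap of more than 10 bp appears in the fragment-length
distribution) is reused as max_molecule_length to flag pseudo-dangling ends,
and max_f plus the MAD becomes min_dist_to_re to flag random breaks. A worked
example with illustrative numbers only:

median, max_f, mad = 300, 500, 40   # illustrative values, not real output

max_mole = max_f        # max_molecule_length for pseudo-dangling ends: 500 bp
min_dist = max_f + mad  # min_dist_to_re for random breaks: 540 bp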
Example #6
def main():

    fastq          = '/scratch/db/FASTQs/hsap/dixon_2012/dixon-2012_200bp.fastq'
    fastq          = 'short_dixon-2012_200bp.fastq'
    # fastq        = '/scratch/test/sample_dataset/FASTQs/sample_hsap_HindIII.fastq'
    gem_index_path = '/scratch/db/index_files/Homo_sapiens-79/Homo_sapiens.gem'
    out_map_dir1   = '/home/fransua/Box/tadbits/tadbit/_pytadbit/mapping/read1/'
    out_map_dir2   = '/home/fransua/Box/tadbits/tadbit/_pytadbit/mapping/read2/'
    temp_dir1      = '/home/fransua/Box/tadbits/tadbit/_pytadbit/mapping/tmp1/'
    temp_dir2      = '/home/fransua/Box/tadbits/tadbit/_pytadbit/mapping/tmp2/'

    print 'read 1'
    outfiles1 = full_mapping(gem_index_path, fastq, out_map_dir1, 'HindIII',
                             temp_dir=temp_dir1, windows=((1,100),), add_site=True)
    print 'read 2'
    outfiles2 = full_mapping(gem_index_path, fastq, out_map_dir2, 'HindIII',
                             temp_dir=temp_dir2, windows=((101, 200),), add_site=True)
    # print 'read 1'
    # outfiles1 = mapping(gem_index_path, fastq, out_map_dir1, 'HindIII',
    #                     temp_dir=temp_dir1,
    #                     windows=(zip(*([0] * len(range(25, 105, 5)),
    #                                    range(25,105,5)))))
    # print 'read 2'
    # outfiles2 = mapping(gem_index_path, fastq, out_map_dir2, 'HindIII',
    #                     temp_dir=temp_dir2,
    #                     windows=(zip(*([100] * len(range(125, 205, 5)),
    #                                            range(125,205,5)))))
    
    print 'mapped files, read 1:'
    print outfiles1
    print 'mapped files, read 2:'
    print outfiles2
    
    from pytadbit.parsers.map_parser import parse_map
    from pytadbit.parsers.genome_parser import parse_fasta
    from pytadbit.mapping.mapper import get_intersection
    from pytadbit.mapping.filter import filter_reads, apply_filter
    
    read1, read2 = 'read1.tsv', 'read2.tsv'
    parse_map(outfiles1, outfiles2, out_file1=read1, out_file2=read2,
              genome_seq=parse_fasta('/scratch/db/index_files/Homo_sapiens-79/Homo_sapiens.fa'),
              re_name='HindIII', verbose=True)

    reads = 'both_reads.tsv'
    get_intersection(read1, read2, reads)

    masked = filter_reads(reads)
    freads = 'filtered_reads.tsv'
    apply_filter(reads, freads, masked)
Example #8
            break
    read1 = {'crm': crm, 'pos': pos1, 'flag': sd1, 'id': 'lala04.%012d' % (i)}
    read2 = {'crm': crm, 'pos': pos2, 'flag': sd2, 'id': 'lala04.%012d' % (i)}
    out1.write(read_str.format(**read1))
    out2.write(read_str.format(**read2))
    # TOO CLOSE FROM RE
    

out1.close()
out2.close()

# PARSE SAM
from pytadbit.parsers.sam_parser import parse_sam

parse_sam(['test_read1.sam~'], ['test_read2.sam~'],
          'lala1~', 'lala2~', genome, re_name='DPNII', mapper='GEM')

# GET INTERSECTION
from pytadbit.mapping.mapper import get_intersection

get_intersection('lala1~', 'lala2~', 'lala~')

# FILTER
from pytadbit.mapping.filter import filter_reads

masked = filter_reads('lala~')




Example #9
        }
        out1.write(read.format(**read1))
        out2.write(read.format(**read2))
    i += 1

out1.close()
out2.close()

# PARSE SAM
if ali == 'map':
    from pytadbit.parsers.map_parser import parse_map as parser
else:
    from pytadbit.parsers.sam_parser import parse_sam as parser

parser(['test_read1.%s~' % (ali)], ['test_read2.%s~' % (ali)],
       './lala1-%s~' % (ali),
       './lala2-%s~' % (ali),
       genome,
       re_name='DPNII',
       mapper='GEM')

# GET INTERSECTION
from pytadbit.mapping.mapper import get_intersection

get_intersection('lala1-%s~' % (ali), 'lala2-%s~' % (ali), 'lala-%s~' % (ali))

# FILTER
from pytadbit.mapping.filter import filter_reads

masked = filter_reads('lala-%s~' % (ali))
Example #10
    else:
        read2 = {'crm': crm1, 'pos': pos1, 'flag': flags[sd1], 'id': 'lala05.1%011d' % (i)}
        read1 = {'crm': crm2, 'pos': pos2, 'flag': flags[sd2], 'id': 'lala05.1%011d' % (i)}
        out1.write(read.format(**read1))
        out2.write(read.format(**read2))
    i += 1

out1.close()
out2.close()

# PARSE SAM
if ali == 'map':
    from pytadbit.parsers.map_parser import parse_map as parser
else:
    from pytadbit.parsers.sam_parser import parse_sam as parser

parser(['test_read1.%s~' % (ali)], ['test_read2.%s~' % (ali)],
       './lala1-%s~' % (ali), './lala2-%s~' % (ali), genome,
       re_name='DPNII', mapper='GEM')

# GET INTERSECTION
from pytadbit.mapping.mapper import get_intersection

get_intersection('lala1-%s~' % (ali), 'lala2-%s~' % (ali), 'lala-%s~' % (ali))

# FILTER
from pytadbit.mapping.filter import filter_reads

masked = filter_reads('lala-%s~' % (ali))

Example #11
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    fname1, fname2 = load_parameters_fromdb(opts)

    param_hash = digest_parameters(opts)

    reads = path.join(opts.workdir, '03_filtered_reads',
                      'all_r1-r2_intersection_%s.tsv' % param_hash)
    mreads = path.join(opts.workdir, '03_filtered_reads',
                       'valid_r1-r2_intersection_%s.tsv' % param_hash)

    if not opts.resume:
        mkdir(path.join(opts.workdir, '03_filtered_reads'))

        # compute the intersection of the two read ends
        print 'Getting intersection between read 1 and read 2'
        count, multiples = get_intersection(fname1, fname2, reads)

        # compute insert size
        print 'Get insert size...'
        hist_path = path.join(opts.workdir,
                              'histogram_fragment_sizes_%s.pdf' % param_hash)
        median, max_f, mad = insert_sizes(reads,
                                          nreads=1000000,
                                          stats=('median', 'first_decay',
                                                 'MAD'),
                                          savefig=hist_path)

        print '  - median insert size =', median
        print '  - double median absolute deviation of insert size =', mad
        print '  - max insert size (when a gap in continuity of > 10 bp is found in fragment lengths) =', max_f

        max_mole = max_f  # pseudo DEs
        min_dist = max_f + mad  # random breaks
        print(
            '   Using the maximum continuous fragment size '
            '(%d bp) to check '
            'for pseudo-dangling ends') % max_mole
        print(
            '   Using maximum continuous fragment size plus the MAD '
            '(%d bp) to check for random breaks') % min_dist

        print "identify pairs to filter..."
        masked = filter_reads(reads,
                              max_molecule_length=max_mole,
                              over_represented=0.001,
                              max_frag_size=100000,
                              min_frag_size=50,
                              re_proximity=5,
                              min_dist_to_re=min_dist,
                              fast=True)

    n_valid_pairs = apply_filter(reads, mreads, masked, filters=opts.apply)

    finish_time = time.localtime()
    print median, max_f, mad
    # save all job information to sqlite DB
    save_to_db(opts, count, multiples, reads, mreads, n_valid_pairs, masked,
               hist_path, median, max_f, mad, launch_time, finish_time)
Example #12
    def test_18_filter_reads(self):
        if ONLY and ONLY != "18":
            return
        if CHKTIME:
            t0 = time()
        for ali in ["map", "sam"]:
            seed(1)
            if 13436 == int(random() * 100000):
                same_seed = True
                genome = generate_random_ali(ali)
                genome_bis = parse_fasta("test.fa~", verbose=False)
                self.assertEqual(genome, genome_bis)
            else:
                same_seed = False
                genome = parse_fasta("test.fa~")
            # PARSE SAM
            if ali == "map":
                from pytadbit.parsers.map_parser import parse_map as parser
            else:
                try:
                    from pytadbit.parsers.sam_parser import parse_sam as parser
                except ImportError:
                    print "ERROR: PYSAM not found, skipping test\n"
                    continue

            parser(
                ["test_read1.%s~" % (ali)],
                ["test_read2.%s~" % (ali)],
                "./lala1-%s~" % (ali),
                "./lala2-%s~" % (ali),
                genome,
                re_name="DPNII",
                mapper="GEM",
            )

            # GET INTERSECTION
            from pytadbit.mapping import get_intersection

            get_intersection("lala1-%s~" % (ali), "lala2-%s~" % (ali), "lala-%s~" % (ali))
            # FILTER
            masked = filter_reads("lala-%s~" % (ali), verbose=False, fast=(ali == "map"))
            self.assertEqual(masked[1]["reads"], 1000)
            self.assertEqual(masked[2]["reads"], 1000)
            self.assertEqual(masked[3]["reads"], 1000)
            self.assertEqual(masked[4]["reads"], 1000)
            if same_seed:
                self.assertEqual(masked[5]["reads"], 1110)
                self.assertEqual(masked[6]["reads"], 2332)
                self.assertEqual(masked[7]["reads"], 0)
                self.assertEqual(masked[8]["reads"], 141)
                self.assertEqual(masked[10]["reads"], 1)
            else:
                self.assertTrue(masked[5]["reads"] > 1000)
            self.assertEqual(masked[9]["reads"], 1000)
        apply_filter("lala-map~", "lala-map-filt~", masked, filters=[1], reverse=True, verbose=False)
        self.assertEqual(len([True for l in open("lala-map-filt~") if not l.startswith("#")]), 1000)
        d = plot_iterative_mapping("lala1-map~", "lala2-map~")
        self.assertEqual(d[0][1], 6000)

        if CHKTIME:
            self.assertEqual(True, True)
            print "18", time() - t0
Example #13
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    fname1, fname2 = load_parameters_fromdb(opts)

    param_hash = digest_parameters(opts)

    reads = path.join(opts.workdir, '03_filtered_reads',
                      'all_r1-r2_intersection_%s.tsv' % param_hash)
    mreads = path.join(opts.workdir, '03_filtered_reads',
                       'valid_r1-r2_intersection_%s.tsv' % param_hash)

    if not opts.resume:
        mkdir(path.join(opts.workdir, '03_filtered_reads'))

        if opts.fast_fragment:
            reads = fname1
            counts_multis = [
                '#' in line.split('\t')[0] for line in open(reads)
            ]
            count = len(counts_multis)
            multiples = {}
            multiples[1] = sum(
                [count_mult for count_mult in counts_multis if count_mult])
            del counts_multis
        else:
            # compute the intersection of the two read ends
            print('Getting intersection between read 1 and read 2')
            count, multiples = get_intersection(fname1,
                                                fname2,
                                                reads,
                                                compress=opts.compress_input)

        # compute insert size
        print('Get insert size...')
        hist_path = path.join(opts.workdir,
                              'histogram_fragment_sizes_%s.pdf' % param_hash)
        try:
            median, max_f, mad = fragment_size(reads,
                                               nreads=1000000,
                                               stats=('median', 'first_decay',
                                                      'MAD'),
                                               savefig=hist_path)
        except ZeroDivisionError:
            warn('WARNING: cannot compute fragment length, too few '
                 'dangling-ends. Setting median length to 400 nt.')
            median = max_f = mad = 0
        if median < 50:
            warn('WARNING: fragment length too short ({}). '
                 'Setting median length to 400 nt.'.format(median))
            median, max_f, mad = 400, 100, 40
        if opts.median:
            median = opts.median
        if opts.max_f:
            max_f = opts.max_f
        if opts.mad:
            mad = opts.mad

        print('  - median insert size =', median)
        print('  - median absolute deviation of insert size =', mad)
        print(
            '  - max insert size (when a gap in continuity of > 10 bp is found in fragment lengths) =',
            max_f)

        max_mole = max_f  # pseudo DEs
        min_dist = max_f + mad  # random breaks
        print('   Using the maximum continuous fragment size '
              '(%d bp) to check '
              'for pseudo-dangling ends' % max_mole)
        print('   Using maximum continuous fragment size plus the MAD '
              '(%d bp) to check for random breaks' % min_dist)

        print("identify pairs to filter...")
        masked = filter_reads(reads,
                              max_molecule_length=max_mole,
                              over_represented=opts.over_represented,
                              max_frag_size=opts.max_frag_size,
                              min_frag_size=opts.min_frag_size,
                              re_proximity=opts.re_proximity,
                              strict_duplicates=opts.strict_duplicates,
                              min_dist_to_re=min_dist,
                              fast=True)

    n_valid_pairs = apply_filter(reads, mreads, masked, filters=opts.apply)

    outbam = path.join(opts.workdir, '03_filtered_reads',
                       'intersection_%s' % param_hash)

    if opts.valid:
        infile = mreads
    else:
        infile = reads
    bed2D_to_BAMhic(infile,
                    opts.valid,
                    opts.cpus,
                    outbam,
                    opts.format,
                    masked,
                    samtools=opts.samtools)

    finish_time = time.localtime()
    print(median, max_f, mad)
    # save all job information to sqlite DB
    save_to_db(opts, count, multiples, reads, mreads, n_valid_pairs, masked,
               outbam + '.bam', hist_path, median, max_f, mad, launch_time,
               finish_time)
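
The run() above reads many attributes off opts; for orientation, a minimal
stand-in namespace carrying every attribute this function actually touches
(all values below are placeholders, not recommended settings):

from argparse import Namespace

opts = Namespace(workdir='out/', resume=False, fast_fragment=False,
                 compress_input=False, median=None, max_f=None, mad=None,
                 over_represented=0.005, max_frag_size=100000,
                 min_frag_size=50, re_proximity=5, strict_duplicates=False,
                 apply=[1, 2, 3, 4, 6, 7, 8, 9, 10], valid=True, cpus=8,
                 format='mid', samtools='samtools')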
Example #14
def make_matrices(left_reads_fastq, right_reads_fastq, reads_fastq, genome_fasta, genome_index, \
                  output_directory, output_prefix, enzyme, res, chromosomes, threads_number, \
                  clean_tmp, tmp_dir):

    print 'Begin to process reads.'

    left_reads = ''
    right_reads = ''
    if reads_fastq != '':  # left and right reads are stored in one file
        range_start_left, range_stop_left, \
        range_start_right, range_stop_right = calc_left_right_ranges(reads_fastq)
        print 'Reads:                     ', reads_fastq
        left_reads = reads_fastq
        right_reads = reads_fastq
    else:  # left and right reads are stored separately
        range_start_left, range_stop_left, \
        range_start_right, range_stop_right = calc_range(left_reads_fastq)
        print 'Left reads:                ', left_reads_fastq
        print 'Right reads:               ', right_reads_fastq
        print 'Output prefix:             ', output_prefix
        left_reads = left_reads_fastq
        right_reads = right_reads_fastq

    print 'Reference genome FASTA:    ', genome_fasta
    print 'Reference genome GEM index:', genome_index
    print 'Output directory:          ', output_directory
    print 'Temp directory:            ', tmp_dir
    print 'Enzyme:                    ', enzyme
    print 'Resolution:                ', res, 'bp'
    print 'Number of threads:         ', threads_number
    print 'Start pos for left reads:  ', range_start_left
    print 'Stop pos for left reads:   ', range_stop_left
    print 'Start pos for right reads: ', range_start_right
    print 'Stop pos for right reads:  ', range_stop_right
    stdout.flush()

    # map left reads to reference genome
    out_sam_left_name = splitext(basename(left_reads))[0] + '_left.sam'
    out_sam_left_path = join(output_directory, out_sam_left_name)
    print 'Iterative mapping of left reads (using ' + str(
        threads_number) + ' threads)...'
    stdout.flush()
    sams_left = iterative_mapping(genome_index, left_reads, out_sam_left_path, \
                                  range_start_left, range_stop_left, nthreads=threads_number,
                                  temp_dir=tmp_dir)
    print 'Done.'
    stdout.flush()

    # map right reads to reference genome
    out_sam_right_name = splitext(basename(right_reads))[0] + '_right.sam'
    out_sam_right_path = join(output_directory, out_sam_right_name)
    print 'Iterative mapping of right reads (using ' + str(
        threads_number) + ' threads)...'
    stdout.flush()
    sams_right = iterative_mapping(genome_index, right_reads, out_sam_right_path, \
                                   range_start_right, range_stop_right, nthreads=threads_number,
                                   temp_dir=tmp_dir)
    print 'Done.'
    stdout.flush()

    # load reference genome sequence
    print 'Load reference genome sequence...'
    stdout.flush()
    chroms = chromosomes[:]
    genome_seq = parse_fasta(genome_fasta, chr_names=chroms)
    print 'Done.'
    stdout.flush()

    # create files with information about every left and right read
    # and about their placement with respect to restriction sites
    tsv_left_name = splitext(basename(left_reads))[0] + '_left.tsv'
    tsv_left = join(output_directory, tsv_left_name)
    tsv_right_name = splitext(basename(right_reads))[0] + '_right.tsv'
    tsv_right = join(output_directory, tsv_right_name)
    print 'Get information about restriction sites and reads placement...'
    stdout.flush()
    parse_sam(sams_left, sams_right, tsv_left, tsv_right, genome_seq, enzyme, \
              verbose=True, ncpus=8)
    print 'Done.'
    stdout.flush()

    # create file with both left and right reads that uniquely mapped to the reference genome
    if reads_fastq != '':  # left and right reads are stored in one file
        common_reads_prefix = splitext(basename(reads_fastq))[0]
    else:  # left and right reads are stored separately
        common_reads_prefix = output_prefix
    uniq_reads_name = common_reads_prefix + '_both_map_uniq.tsv'
    uniq_reads = join(output_directory, uniq_reads_name)
    print 'Merge info about left and right reads in one file...'
    stdout.flush()
    get_intersection(tsv_left, tsv_right, uniq_reads, verbose=True)
    print 'Done.'
    stdout.flush()

    # find read IDs that are filtered by default TADbit filters
    print 'Mask reads...'
    stdout.flush()
    masked = filter_reads(uniq_reads)
    print 'Done.'
    stdout.flush()

    # apply all filters (exclude reads that were filtered)
    print 'Filter masked reads...'
    stdout.flush()
    filtered_reads_name = common_reads_prefix + '_filtered.tsv'
    filtered_reads = join(output_directory, filtered_reads_name)
    apply_filter(uniq_reads, filtered_reads, masked)
    print 'Done.'
    stdout.flush()

    # create matrices (one matrix per chromosome)
    print 'Create Hi-C maps (one per chromosome)...'
    stdout.flush()
    hic_map(filtered_reads,
            resolution=res,
            by_chrom='intra',
            savedata=output_directory)
    print 'Done.'
    stdout.flush()
    print 'Add resolution (' + str(res) + ') to matrix filenames...'
    stdout.flush()
    add_resolution(chromosomes, res, output_directory)
    print 'Done.'
    stdout.flush()
    print 'Add headers to matrix files...'
    stdout.flush()
    add_headers(chromosomes, res, output_directory)
    print 'Done.'
    stdout.flush()
    if clean_tmp:  # Remove all SAM and TSV files from the output directory
        print 'Remove SAM and TSV files from the output directory.'
        stdout.flush()
        # os.remove cannot expand wildcards; glob the patterns first
        for tmp_file in glob.glob(out_sam_left_path + '*'):
            remove(tmp_file)
        for tmp_file in glob.glob(out_sam_right_path + '*'):
            remove(tmp_file)
        for tmp_file in glob.glob(join(output_directory, '*.tsv')):
            remove(tmp_file)
        print 'Done.'
        stdout.flush()
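
For orientation, a hypothetical invocation of make_matrices with placeholder
paths, matching the signature above (reads_fastq is left empty because the
two read ends are given as separate files):

make_matrices(left_reads_fastq='reads_1.fastq', right_reads_fastq='reads_2.fastq',
              reads_fastq='', genome_fasta='genome.fa', genome_index='genome.gem',
              output_directory='out/', output_prefix='sample1', enzyme='HindIII',
              res=100000, chromosomes=['chr1', 'chr2'], threads_number=8,
              clean_tmp=True, tmp_dir='tmp/')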
Example #15
def make_matrices(left_reads_fastq, right_reads_fastq, reads_fastq, genome_fasta, genome_index, \
                  output_directory, output_prefix, enzyme, res, chromosomes, threads_number, \
                  clean_tmp, tmp_dir):

    print 'Begin to process reads.'

    left_reads = ''
    right_reads = ''
    if reads_fastq != '': # left and right reads are stored in one file
        range_start_left, range_stop_left, \
        range_start_right, range_stop_right = calc_left_right_ranges(reads_fastq)
        print 'Reads:                     ', reads_fastq
        left_reads = reads_fastq
        right_reads = reads_fastq
    else: # left and right reads are stored separately
        range_start_left, range_stop_left, \
        range_start_right, range_stop_right = calc_range(left_reads_fastq)
        print 'Left reads:                ', left_reads_fastq
        print 'Right reads:               ', right_reads_fastq
        print 'Output prefix:             ', output_prefix
        left_reads = left_reads_fastq
        right_reads = right_reads_fastq

    print 'Reference genome FASTA:    ', genome_fasta
    print 'Reference genome GEM index:', genome_index
    print 'Output directory:          ', output_directory
    print 'Temp directory:            ', tmp_dir
    print 'Enzyme:                    ', enzyme
    print 'Resolution:                ', res, 'bp'
    print 'Number of threads:         ', threads_number
    print 'Start pos for left reads:  ', range_start_left
    print 'Stop pos for left reads:   ', range_stop_left
    print 'Start pos for right reads: ', range_start_right
    print 'Stop pos for right reads:  ', range_stop_right
    stdout.flush()

    # map left reads to reference genome
    out_sam_left_name = splitext(basename(left_reads))[0] + '_left.sam'
    out_sam_left_path = join(output_directory, out_sam_left_name)
    print 'Iterative mapping of left reads (using ' + str(threads_number) + ' threads)...'
    stdout.flush()
    sams_left = iterative_mapping(genome_index, left_reads, out_sam_left_path, \
                                  range_start_left, range_stop_left, nthreads=threads_number,
                                  temp_dir=tmp_dir)
    print 'Done.'
    stdout.flush()

    # map right reads to reference genome
    out_sam_right_name = splitext(basename(right_reads))[0] + '_right.sam'
    out_sam_right_path = join(output_directory, out_sam_right_name)
    print 'Iterative mapping of right reads (using ' + str(threads_number) + ' threads)...'
    stdout.flush()
    sams_right = iterative_mapping(genome_index, right_reads, out_sam_right_path, \
                                   range_start_right, range_stop_right, nthreads=threads_number,
                                   temp_dir=tmp_dir)
    print 'Done.'
    stdout.flush()

    # load reference genome sequence
    print 'Load reference genome sequence...'
    stdout.flush()
    chroms = chromosomes[:]
    genome_seq = parse_fasta(genome_fasta, chr_names=chroms)
    print 'Done.'
    stdout.flush()

    # create files with information about every left and right read 
    # and about their placement with respect to restriction sites
    tsv_left_name = splitext(basename(left_reads))[0] + '_left.tsv'
    tsv_left = join(output_directory, tsv_left_name)
    tsv_right_name = splitext(basename(right_reads))[0] + '_right.tsv'
    tsv_right = join(output_directory, tsv_right_name)
    print 'Get information about restriction sites and reads placement...'
    stdout.flush()
    parse_sam(sams_left, sams_right, tsv_left, tsv_right, genome_seq, enzyme, \
              verbose=True, ncpus=8)
    print 'Done.'
    stdout.flush()

    # create file with both left and right reads that uniquely mapped to the reference genome
    if reads_fastq != '': # left and right reads are stored in one file
        common_reads_prefix = splitext(basename(reads_fastq))[0]
    else: # left and right reads are stored separately
        common_reads_prefix = output_prefix
    uniq_reads_name = common_reads_prefix + '_both_map_uniq.tsv'
    uniq_reads = join(output_directory, uniq_reads_name)
    print 'Merge info about left and right reads in one file...'
    stdout.flush()
    get_intersection(tsv_left, tsv_right, uniq_reads, verbose=True)
    print 'Done.'
    stdout.flush()

    # find read IDs that are filtered by default TADbit filters
    print 'Mask reads...'
    stdout.flush()
    # debug
    print "uniq_reads =", uniq_reads
    masked = filter_reads(uniq_reads)
    print 'Done.'
    stdout.flush()

    # apply all filters (exclude reads that were filtered)
    print 'Filter masked reads...'
    stdout.flush()
    filtered_reads_name = common_reads_prefix + '_filtered.tsv'
    filtered_reads = join(output_directory, filtered_reads_name)
    apply_filter(uniq_reads, filtered_reads, masked)
    print 'Done.'
    stdout.flush()

    # create matrices (one matrix per chromosome)
    print 'Create Hi-C maps (one per chromosome)...'
    stdout.flush()
    hic_map(filtered_reads, resolution=res, by_chrom='intra', savedata=output_directory)
    print 'Done.'
    stdout.flush()
    print 'Add resolution (' + str(res) + ') to matrix filenames...'
    stdout.flush()
    add_resolution(chromosomes, res, output_directory)
    print 'Done.'
    stdout.flush()
    print 'Add headers to matrix files...'
    stdout.flush()
    add_headers(chromosomes, res, output_directory)
    print 'Done.'
    stdout.flush()
    if clean_tmp: # Remove all SAM and TSV files from the output directory
        print 'Remove SAM and TSV files from the output directory.'
        stdout.flush()
        map(os.remove, glob.glob(out_sam_left_path + '*'))
        map(os.remove, glob.glob(out_sam_right_path + '*'))
        map(os.remove, glob.glob(join(output_directory, '*.tsv')))
        print 'Done.'
        stdout.flush()
Example #16
    outfiles1 = full_mapping(gem_index_path, fastq, out_map_dir1, 'HindIII',
                             temp_dir=temp_dir1, frag_map=False,
                             windows=(zip(*(r_beg1, r_end1))))
    print 'read 2'
    outfiles2 = full_mapping(gem_index_path, fastq, out_map_dir2, 'HindIII',
                             temp_dir=temp_dir2, frag_map=False,
                             windows=(zip(*(r_beg2, r_end2))))
    parse_thing = parse_map
elif mapper == 3:
    print 'read 1'
    outfiles1 = full_mapping(gem_index_path, fastq, out_map_dir1, 'HindIII',
                             temp_dir=temp_dir1,
                             windows=(zip(*(r_beg1, r_end1))))
    print 'read 2'
    outfiles2 = full_mapping(gem_index_path, fastq, out_map_dir2, 'HindIII',
                             temp_dir=temp_dir2,
                             windows=(zip(*(r_beg2, r_end2))))
    parse_thing = parse_map

read1, read2 = 'read1.tsv_%s-%s' % (mapper, win), 'read2.tsv_%s-%s' % (mapper, win)
parse_thing(outfiles1, outfiles2, out_file1=read1, out_file2=read2,
            genome_seq=parse_fasta('/scratch/db/index_files/Homo_sapiens-79/Homo_sapiens.fa'),
            re_name='HindIII', verbose=True)

reads = 'both_reads.tsv_%s-%s' % (mapper, win)
get_intersection(read1, read2, reads)

masked = filter_reads(reads)
freads = 'filtered_reads.tsv_%s-%s' % (mapper, win)
apply_filter(reads, freads, masked)
Example #17
DANGLING = sys.argv[3]
SELF_CIRCLE = sys.argv[4]
SUMMARY_EXCLUDED = sys.argv[5]
max_molecule_length = int(sys.argv[6])
over_represented = float(sys.argv[7])
min_frag_size = int(sys.argv[8])
max_frag_size = int(sys.argv[9])
re_proximity = int(sys.argv[10])
both_reads_mapped = int(sys.argv[11])

# Count number of reads excluded by each filter (note that a read can be included in more than one filter!)
infile = glob.glob('%s/*_both_map.tsv' % PROCESSED)[0]
pair_id = infile.split("/")[-1].replace("_both_map.tsv", "")
masked = filter_reads(infile, max_molecule_length=max_molecule_length,
                                    over_represented=over_represented,
                                    min_frag_size=min_frag_size,
                                    max_frag_size=max_frag_size,
                                    re_proximity=re_proximity,
                                    verbose=False)
filters_applied_numeric = [1, 2, 3, 4, 9, 10]
is_applied = []
my_columns = ['filter_index', 'exclusion', 'reads_number', 'reads_fraction']
excluded = pd.DataFrame(columns=my_columns)
for k in xrange(1, len(masked) + 1):
    df = pd.DataFrame([k, masked[k]['name'], masked[k]['reads']]).transpose()
    df.columns = my_columns[:-1]
    # float() avoids Python 2 integer division (which would floor to 0)
    df['reads_fraction'] = df['reads_number'] / float(both_reads_mapped)
    excluded = pd.concat([excluded, df])
    if k in filters_applied_numeric:
        is_applied.append(1)
    else:
        is_applied.append(0)
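
The loop above fills excluded and is_applied in lock-step (one entry per
filter index), so a natural continuation, assuming the truncated snippet
proceeds this way, is to attach the flag column and write the summary table;
the output path is hypothetical:

excluded['is_applied'] = is_applied  # hypothetical continuation of the snippet
excluded.to_csv('%s_filters_summary.tsv' % pair_id, sep='\t', index=False)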