def filterReads(self, conservative=True):
    """
    Filter the reads to remove duplicates and experimental abnormalities.

    Requires 4 CPUs.
    """
    reads = self.parsed_reads_dir + '/both_map.tsv'
    filt_reads = self.parsed_reads_dir + '/filtered_map.tsv'
    masked = filter_reads(reads, max_molecule_length=610, min_dist_to_re=915,
                          over_represented=0.005, max_frag_size=100000,
                          min_frag_size=100, re_proximity=4)
    if conservative:
        # apply all filters except filter 5, which the docs describe as not
        # very helpful
        apply_filter(reads, filt_reads, masked,
                     filters=[1, 2, 3, 4, 6, 7, 8, 9, 10])
    else:
        # less conservative option
        apply_filter(reads, filt_reads, masked, filters=[1, 2, 3, 9, 10])
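# A small hedged sketch of how the filter lists above can be inspected:
# filter_reads() returns a dict keyed by filter number, where each entry
# carries a human-readable name and a read count (masked[k]['name'],
# masked[k]['reads'], as used in the tests below). report_filters is a
# hypothetical helper, not part of the TADbit API; the default `applied`
# tuple mirrors the conservative choice above.
def report_filters(masked, applied=(1, 2, 3, 4, 6, 7, 8, 9, 10)):
    """Print one line per filter: index, name, reads flagged, applied or not."""
    for k in sorted(masked):
        print '%2d %-20s %8d %s' % (k, masked[k]['name'], masked[k]['reads'],
                                    ('applied' if k in applied
                                     else 'skipped'))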
def test_18_filter_reads(self):
    if ONLY and ONLY != '18':
        return
    if CHKTIME:
        t0 = time()
    for ali in ['map', 'sam']:
        seed(1)
        if 13436 == int(random() * 100000):
            same_seed = True
            genome = generate_random_ali(ali)
            genome_bis = parse_fasta('test.fa~', verbose=False)
            self.assertEqual(genome, genome_bis)
        else:
            same_seed = False
            genome = parse_fasta('test.fa~')
        # PARSE SAM
        if ali == 'map':
            from pytadbit.parsers.map_parser import parse_map as parser
        else:
            try:
                from pytadbit.parsers.sam_parser import parse_sam as parser
            except ImportError:
                print 'ERROR: PYSAM not found, skipping test\n'
                continue
        parser(['test_read1.%s~' % (ali)], ['test_read2.%s~' % (ali)],
               './lala1-%s~' % (ali), './lala2-%s~' % (ali), genome,
               re_name='DPNII', mapper='GEM')
        # GET INTERSECTION
        from pytadbit.mapping import get_intersection
        get_intersection('lala1-%s~' % (ali), 'lala2-%s~' % (ali),
                         'lala-%s~' % (ali))
        # FILTER
        masked = filter_reads('lala-%s~' % (ali), verbose=False,
                              fast=(ali == 'map'))
        self.assertEqual(masked[1]['reads'], 1000)
        self.assertEqual(masked[2]['reads'], 1000)
        self.assertEqual(masked[3]['reads'], 1000)
        self.assertEqual(masked[4]['reads'], 1000)
        if same_seed:
            self.assertEqual(masked[5]['reads'], 1110)
            self.assertEqual(masked[6]['reads'], 2332)
            self.assertEqual(masked[7]['reads'], 0)
            self.assertEqual(masked[8]['reads'], 141)
            self.assertEqual(masked[10]['reads'], 1)
        else:
            self.assertTrue(masked[5]['reads'] > 1000)
        self.assertEqual(masked[9]['reads'], 1000)
        apply_filter('lala-map~', 'lala-map-filt~', masked, filters=[1],
                     reverse=True, verbose=False)
        self.assertEqual(len([True for l in open('lala-map-filt~')
                              if not l.startswith('#')]), 1000)
        d = plot_iterative_mapping('lala1-map~', 'lala2-map~')
        self.assertEqual(d[0][1], 6000)
    if CHKTIME:
        self.assertEqual(True, True)
        print '18', time() - t0
def test_18_filter_reads(self):
    if ONLY and "18" not in ONLY:
        return
    if CHKTIME:
        t0 = time()
    for ali in ["map", "sam"]:
        seed(1)
        if 13436 == int(random() * 100000):
            same_seed = True
            genome = generate_random_ali(ali)
            genome_bis = parse_fasta("test.fa~", verbose=False)
            self.assertEqual(genome, genome_bis)
        else:
            same_seed = False
            genome = parse_fasta("test.fa~")
        # PARSE SAM
        if ali == "map":
            from pytadbit.parsers.map_parser import parse_map as parser
        else:
            try:
                from pytadbit.parsers.sam_parser import parse_sam as parser
            except ImportError:
                print "ERROR: PYSAM not found, skipping test\n"
                continue
        parser(["test_read1.%s~" % (ali)], ["test_read2.%s~" % (ali)],
               "./lala1-%s~" % (ali), "./lala2-%s~" % (ali), genome,
               re_name="DPNII", mapper="GEM")
        # GET INTERSECTION
        from pytadbit.mapping import get_intersection
        get_intersection("lala1-%s~" % (ali), "lala2-%s~" % (ali),
                         "lala-%s~" % (ali))
        # FILTER
        masked = filter_reads("lala-%s~" % (ali), verbose=False,
                              fast=(ali == "map"))
        self.assertEqual(masked[1]["reads"], 1000)
        self.assertEqual(masked[2]["reads"], 1000)
        self.assertEqual(masked[3]["reads"], 1000)
        self.assertEqual(masked[4]["reads"], 1000)
        if same_seed:
            self.assertEqual(masked[5]["reads"], 1110)
            self.assertEqual(masked[6]["reads"], 2332)
            self.assertEqual(masked[7]["reads"], 0)
            self.assertEqual(masked[8]["reads"], 141)
            self.assertEqual(masked[10]["reads"], 1)
        else:
            self.assertTrue(masked[5]["reads"] > 1000)
        self.assertEqual(masked[9]["reads"], 1000)
        apply_filter("lala-map~", "lala-map-filt~", masked, filters=[1],
                     reverse=True, verbose=False)
        self.assertEqual(len([True for l in open("lala-map-filt~")
                              if not l.startswith("#")]), 1000)
        d = plot_iterative_mapping("lala1-map~", "lala2-map~")
        self.assertEqual(d[0][1], 6000)
    if CHKTIME:
        self.assertEqual(True, True)
        print "18", time() - t0
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    fname1, fname2 = load_parameters_fromdb(opts)
    param_hash = digest_parameters(opts)

    reads = path.join(opts.workdir, '03_filtered_reads',
                      'all_r1-r2_intersection_%s.tsv' % param_hash)
    mreads = path.join(opts.workdir, '03_filtered_reads',
                       'valid_r1-r2_intersection_%s.tsv' % param_hash)

    if not opts.resume:
        mkdir(path.join(opts.workdir, '03_filtered_reads'))

    # compute the intersection of the two read ends
    print 'Getting intersection between read 1 and read 2'
    count, multiples = get_intersection(fname1, fname2, reads)

    # compute insert size
    print 'Get insert size...'
    hist_path = path.join(opts.workdir,
                          'histogram_fragment_sizes_%s.pdf' % param_hash)
    median, max_f, mad = insert_sizes(
        reads, nreads=1000000, stats=('median', 'first_decay', 'MAD'),
        savefig=hist_path)

    print ' - median insert size =', median
    print ' - double median absolute deviation of insert size =', mad
    print ' - max insert size (when a gap in continuity of > 10 bp is found in fragment lengths) =', max_f

    max_mole = max_f        # pseudo DEs
    min_dist = max_f + mad  # random breaks
    print (' Using the maximum continuous fragment size '
           '(%d bp) to check for pseudo-dangling ends') % max_mole
    print (' Using maximum continuous fragment size plus the MAD '
           '(%d bp) to check for random breaks') % min_dist

    print 'Identify pairs to filter...'
    masked = filter_reads(reads, max_molecule_length=max_mole,
                          over_represented=opts.over_represented,
                          max_frag_size=opts.max_frag_size,
                          min_frag_size=opts.min_frag_size,
                          re_proximity=opts.re_proximity,
                          min_dist_to_re=min_dist, fast=True)
    n_valid_pairs = apply_filter(reads, mreads, masked, filters=opts.apply)

    finish_time = time.localtime()
    print median, max_f, mad

    # save all job information to sqlite DB
    save_to_db(opts, count, multiples, reads, mreads, n_valid_pairs, masked,
               hist_path, median, max_f, mad, launch_time, finish_time)
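# A worked example of the two cutoffs derived above, with assumed numbers
# (median/max_f/mad are normally estimated by insert_sizes() from the data):
# pairs are checked as pseudo-dangling-ends against the maximum continuous
# fragment size, and as random breaks against that size plus the MAD.
median, max_f, mad = 400, 900, 40   # illustrative values only
max_mole = max_f                    # pseudo-dangling-end cutoff: 900 bp
min_dist = max_f + mad              # random-break cutoff: 940 bp
print 'max_molecule_length = %d bp, min_dist_to_re = %d bp' % (max_mole,
                                                               min_dist)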
def main():
    fastq = '/scratch/db/FASTQs/hsap/dixon_2012/dixon-2012_200bp.fastq'
    # override the full dataset above with a short test file
    fastq = 'short_dixon-2012_200bp.fastq'
    # fastq = '/scratch/test/sample_dataset/FASTQs/sample_hsap_HindIII.fastq'
    gem_index_path = '/scratch/db/index_files/Homo_sapiens-79/Homo_sapiens.gem'
    out_map_dir1 = '/home/fransua/Box/tadbits/tadbit/_pytadbit/mapping/read1/'
    out_map_dir2 = '/home/fransua/Box/tadbits/tadbit/_pytadbit/mapping/read2/'
    temp_dir1 = '/home/fransua/Box/tadbits/tadbit/_pytadbit/mapping/tmp1/'
    temp_dir2 = '/home/fransua/Box/tadbits/tadbit/_pytadbit/mapping/tmp2/'

    print 'read 1'
    outfiles1 = full_mapping(gem_index_path, fastq, out_map_dir1, 'HindIII',
                             temp_dir=temp_dir1, windows=((1, 100),),
                             add_site=True)
    print 'read 2'
    outfiles2 = full_mapping(gem_index_path, fastq, out_map_dir2, 'HindIII',
                             temp_dir=temp_dir2, windows=((101, 200),),
                             add_site=True)
    # iterative-mapping alternative:
    # print 'read 1'
    # outfiles1 = mapping(gem_index_path, fastq, out_map_dir1, 'HindIII',
    #                     temp_dir=temp_dir1,
    #                     windows=zip(*([0] * len(range(25, 105, 5)),
    #                                   range(25, 105, 5))))
    # print 'read 2'
    # outfiles2 = mapping(gem_index_path, fastq, out_map_dir2, 'HindIII',
    #                     temp_dir=temp_dir2,
    #                     windows=zip(*([100] * len(range(125, 205, 5)),
    #                                   range(125, 205, 5))))
    print outfiles1
    print outfiles2

    from pytadbit.parsers.map_parser import parse_map
    from pytadbit.parsers.genome_parser import parse_fasta
    from pytadbit.mapping.mapper import get_intersection
    from pytadbit.mapping.filter import filter_reads, apply_filter

    read1, read2 = 'read1.tsv', 'read2.tsv'
    parse_map(outfiles1, outfiles2, out_file1=read1, out_file2=read2,
              genome_seq=parse_fasta('/scratch/db/index_files/'
                                     'Homo_sapiens-79/Homo_sapiens.fa'),
              re_name='HindIII', verbose=True)
    reads = 'both_reads.tsv'
    get_intersection(read1, read2, reads)
    masked = filter_reads(reads)
    freads = 'filtered_reads.tsv'
    apply_filter(reads, freads, masked)
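# A note on the `windows` argument used above, as a sketch (check the TADbit
# docs for the exact indexing convention): each (start, end) tuple selects the
# slice of every read to map, so ((1, 100),) and ((101, 200),) split the
# 200 bp concatenated reads into their two ends. The commented-out iterative
# variant instead maps growing slices of each end, remapping reads that
# stayed unmapped at shorter lengths:
iter_windows1 = zip([0] * len(range(25, 105, 5)), range(25, 105, 5))
print iter_windows1   # [(0, 25), (0, 30), ..., (0, 100)]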
            break
        read1 = {'crm': crm, 'pos': pos1, 'flag': sd1,
                 'id': 'lala04.%012d' % (i)}
        read2 = {'crm': crm, 'pos': pos2, 'flag': sd2,
                 'id': 'lala04.%012d' % (i)}
        out1.write(read_str.format(**read1))
        out2.write(read_str.format(**read2))
    # TOO CLOSE FROM RE
    out1.close()
    out2.close()

    # PARSE SAM
    from pytadbit.parsers.sam_parser import parse_sam
    parse_sam(['test_read1.sam~'], ['test_read2.sam~'], 'lala1~', 'lala2~',
              genome, re_name='DPNII', mapper='GEM')

    # GET INTERSECTION
    from pytadbit.mapping.mapper import get_intersection
    get_intersection('lala1~', 'lala2~', 'lala~')

    # FILTER
    from pytadbit.mapping.filter import filter_reads
    masked = filter_reads('lala~')
            }
        out1.write(read.format(**read1))
        out2.write(read.format(**read2))
        i += 1
    out1.close()
    out2.close()

    # PARSE SAM
    if ali == 'map':
        from pytadbit.parsers.map_parser import parse_map as parser
    else:
        from pytadbit.parsers.sam_parser import parse_sam as parser
    parser(['test_read1.%s~' % (ali)], ['test_read2.%s~' % (ali)],
           './lala1-%s~' % (ali), './lala2-%s~' % (ali), genome,
           re_name='DPNII', mapper='GEM')

    # GET INTERSECTION
    from pytadbit.mapping.mapper import get_intersection
    get_intersection('lala1-%s~' % (ali), 'lala2-%s~' % (ali),
                     'lala-%s~' % (ali))

    # FILTER
    from pytadbit.mapping.filter import filter_reads
    masked = filter_reads('lala-%s~' % (ali))
        else:
            read2 = {'crm': crm1, 'pos': pos1, 'flag': flags[sd1],
                     'id': 'lala05.1%011d' % (i)}
            read1 = {'crm': crm2, 'pos': pos2, 'flag': flags[sd2],
                     'id': 'lala05.1%011d' % (i)}
        out1.write(read.format(**read1))
        out2.write(read.format(**read2))
        i += 1
    out1.close()
    out2.close()

    # PARSE SAM
    if ali == 'map':
        from pytadbit.parsers.map_parser import parse_map as parser
    else:
        from pytadbit.parsers.sam_parser import parse_sam as parser
    parser(['test_read1.%s~' % (ali)], ['test_read2.%s~' % (ali)],
           './lala1-%s~' % (ali), './lala2-%s~' % (ali), genome,
           re_name='DPNII', mapper='GEM')

    # GET INTERSECTION
    from pytadbit.mapping.mapper import get_intersection
    get_intersection('lala1-%s~' % (ali), 'lala2-%s~' % (ali),
                     'lala-%s~' % (ali))

    # FILTER
    from pytadbit.mapping.filter import filter_reads
    masked = filter_reads('lala-%s~' % (ali))
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    fname1, fname2 = load_parameters_fromdb(opts)
    param_hash = digest_parameters(opts)

    reads = path.join(opts.workdir, '03_filtered_reads',
                      'all_r1-r2_intersection_%s.tsv' % param_hash)
    mreads = path.join(opts.workdir, '03_filtered_reads',
                       'valid_r1-r2_intersection_%s.tsv' % param_hash)

    if not opts.resume:
        mkdir(path.join(opts.workdir, '03_filtered_reads'))

    # compute the intersection of the two read ends
    print 'Getting intersection between read 1 and read 2'
    count, multiples = get_intersection(fname1, fname2, reads)

    # compute insert size
    print 'Get insert size...'
    hist_path = path.join(opts.workdir,
                          'histogram_fragment_sizes_%s.pdf' % param_hash)
    median, max_f, mad = insert_sizes(reads, nreads=1000000,
                                      stats=('median', 'first_decay', 'MAD'),
                                      savefig=hist_path)

    print ' - median insert size =', median
    print ' - double median absolute deviation of insert size =', mad
    print ' - max insert size (when a gap in continuity of > 10 bp is found in fragment lengths) =', max_f

    max_mole = max_f        # pseudo DEs
    min_dist = max_f + mad  # random breaks
    print (' Using the maximum continuous fragment size '
           '(%d bp) to check for pseudo-dangling ends') % max_mole
    print (' Using maximum continuous fragment size plus the MAD '
           '(%d bp) to check for random breaks') % min_dist

    print 'Identify pairs to filter...'
    masked = filter_reads(reads, max_molecule_length=max_mole,
                          over_represented=0.001, max_frag_size=100000,
                          min_frag_size=50, re_proximity=5,
                          min_dist_to_re=min_dist, fast=True)
    n_valid_pairs = apply_filter(reads, mreads, masked, filters=opts.apply)

    finish_time = time.localtime()
    print median, max_f, mad

    # save all job information to sqlite DB
    save_to_db(opts, count, multiples, reads, mreads, n_valid_pairs, masked,
               hist_path, median, max_f, mad, launch_time, finish_time)
def test_18_filter_reads(self):
    if ONLY and ONLY != "18":
        return
    if CHKTIME:
        t0 = time()
    for ali in ["map", "sam"]:
        seed(1)
        if 13436 == int(random() * 100000):
            same_seed = True
            genome = generate_random_ali(ali)
            genome_bis = parse_fasta("test.fa~", verbose=False)
            self.assertEqual(genome, genome_bis)
        else:
            same_seed = False
            genome = parse_fasta("test.fa~")
        # PARSE SAM
        if ali == "map":
            from pytadbit.parsers.map_parser import parse_map as parser
        else:
            try:
                from pytadbit.parsers.sam_parser import parse_sam as parser
            except ImportError:
                print "ERROR: PYSAM not found, skipping test\n"
                continue
        parser(
            ["test_read1.%s~" % (ali)],
            ["test_read2.%s~" % (ali)],
            "./lala1-%s~" % (ali),
            "./lala2-%s~" % (ali),
            genome,
            re_name="DPNII",
            mapper="GEM",
        )
        # GET INTERSECTION
        from pytadbit.mapping import get_intersection
        get_intersection("lala1-%s~" % (ali), "lala2-%s~" % (ali),
                         "lala-%s~" % (ali))
        # FILTER
        masked = filter_reads("lala-%s~" % (ali), verbose=False,
                              fast=(ali == "map"))
        self.assertEqual(masked[1]["reads"], 1000)
        self.assertEqual(masked[2]["reads"], 1000)
        self.assertEqual(masked[3]["reads"], 1000)
        self.assertEqual(masked[4]["reads"], 1000)
        if same_seed:
            self.assertEqual(masked[5]["reads"], 1110)
            self.assertEqual(masked[6]["reads"], 2332)
            self.assertEqual(masked[7]["reads"], 0)
            self.assertEqual(masked[8]["reads"], 141)
            self.assertEqual(masked[10]["reads"], 1)
        else:
            self.assertTrue(masked[5]["reads"] > 1000)
        self.assertEqual(masked[9]["reads"], 1000)
        apply_filter("lala-map~", "lala-map-filt~", masked, filters=[1],
                     reverse=True, verbose=False)
        self.assertEqual(len([True for l in open("lala-map-filt~")
                              if not l.startswith("#")]), 1000)
        d = plot_iterative_mapping("lala1-map~", "lala2-map~")
        self.assertEqual(d[0][1], 6000)
    if CHKTIME:
        self.assertEqual(True, True)
        print "18", time() - t0
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    fname1, fname2 = load_parameters_fromdb(opts)
    param_hash = digest_parameters(opts)

    reads = path.join(opts.workdir, '03_filtered_reads',
                      'all_r1-r2_intersection_%s.tsv' % param_hash)
    mreads = path.join(opts.workdir, '03_filtered_reads',
                       'valid_r1-r2_intersection_%s.tsv' % param_hash)

    if not opts.resume:
        mkdir(path.join(opts.workdir, '03_filtered_reads'))

    if opts.fast_fragment:
        reads = fname1
        counts_multis = ['#' in line.split('\t')[0] for line in open(reads)]
        count = len(counts_multis)
        multiples = {}
        multiples[1] = sum(count_mult for count_mult in counts_multis
                           if count_mult)
        del counts_multis
    else:
        # compute the intersection of the two read ends
        print('Getting intersection between read 1 and read 2')
        count, multiples = get_intersection(fname1, fname2, reads,
                                            compress=opts.compress_input)

    # compute insert size
    print('Get insert size...')
    hist_path = path.join(opts.workdir,
                          'histogram_fragment_sizes_%s.pdf' % param_hash)
    try:
        median, max_f, mad = fragment_size(
            reads, nreads=1000000, stats=('median', 'first_decay', 'MAD'),
            savefig=hist_path)
    except ZeroDivisionError:
        warn('WARNING: cannot compute fragment length, too few '
             'dangling-ends. Setting median length to 400 nt.')
        median = max_f = mad = 0
    if median < 50:
        warn('WARNING: fragment length too short ({}). '
             'Setting median length to 400 nt.'.format(median))
        median, max_f, mad = 400, 100, 40
    if opts.median:
        median = opts.median
    if opts.max_f:
        max_f = opts.max_f
    if opts.mad:
        mad = opts.mad

    print(' - median insert size =', median)
    print(' - median absolute deviation of insert size =', mad)
    print(' - max insert size (when a gap in continuity of > 10 bp is found '
          'in fragment lengths) =', max_f)

    max_mole = max_f        # pseudo DEs
    min_dist = max_f + mad  # random breaks
    print('   Using the maximum continuous fragment size '
          '(%d bp) to check for pseudo-dangling ends' % max_mole)
    print('   Using maximum continuous fragment size plus the MAD '
          '(%d bp) to check for random breaks' % min_dist)

    print('Identify pairs to filter...')
    masked = filter_reads(reads, max_molecule_length=max_mole,
                          over_represented=opts.over_represented,
                          max_frag_size=opts.max_frag_size,
                          min_frag_size=opts.min_frag_size,
                          re_proximity=opts.re_proximity,
                          strict_duplicates=opts.strict_duplicates,
                          min_dist_to_re=min_dist, fast=True)
    n_valid_pairs = apply_filter(reads, mreads, masked, filters=opts.apply)

    outbam = path.join(opts.workdir, '03_filtered_reads',
                       'intersection_%s' % param_hash)
    if opts.valid:
        infile = mreads
    else:
        infile = reads
    bed2D_to_BAMhic(infile, opts.valid, opts.cpus, outbam, opts.format,
                    masked, samtools=opts.samtools)

    finish_time = time.localtime()
    print(median, max_f, mad)

    # save all job information to sqlite DB
    save_to_db(opts, count, multiples, reads, mreads, n_valid_pairs, masked,
               outbam + '.bam', hist_path, median, max_f, mad,
               launch_time, finish_time)
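# The fast_fragment branch above materialises one boolean per line just to
# count reads and multi-contact entries. A streaming sketch with the same
# result and constant memory (count_reads_and_multiples is a hypothetical
# helper, assuming the same '#'-in-first-column convention marks multiples):
def count_reads_and_multiples(fnam):
    count = 0
    multiples = {1: 0}
    with open(fnam) as handler:
        for line in handler:
            count += 1
            if '#' in line.split('\t', 1)[0]:
                multiples[1] += 1
    return count, multiples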
def make_matrices(left_reads_fastq, right_reads_fastq, reads_fastq,
                  genome_fasta, genome_index, output_directory, output_prefix,
                  enzyme, res, chromosomes, threads_number, clean_tmp,
                  tmp_dir):
    print 'Begin to process reads.'
    left_reads = ''
    right_reads = ''
    if reads_fastq != '':
        # left and right reads are stored in one file
        range_start_left, range_stop_left, \
            range_start_right, range_stop_right = \
            calc_left_right_ranges(reads_fastq)
        print 'Reads:                     ', reads_fastq
        left_reads = reads_fastq
        right_reads = reads_fastq
    else:
        # left and right reads are stored separately
        range_start_left, range_stop_left, \
            range_start_right, range_stop_right = calc_range(left_reads_fastq)
        print 'Left reads:                ', left_reads_fastq
        print 'Right reads:               ', right_reads_fastq
        print 'Output prefix:             ', output_prefix
        left_reads = left_reads_fastq
        right_reads = right_reads_fastq
    print 'Reference genome FASTA:    ', genome_fasta
    print 'Reference genome GEM index:', genome_index
    print 'Output directory:          ', output_directory
    print 'Temp directory:            ', tmp_dir
    print 'Enzyme:                    ', enzyme
    print 'Resolution:                ', res, 'bp'
    print 'Number of threads:         ', threads_number
    print 'Start pos for left reads:  ', range_start_left
    print 'Stop pos for left reads:   ', range_stop_left
    print 'Start pos for right reads: ', range_start_right
    print 'Stop pos for right reads:  ', range_stop_right
    stdout.flush()

    # map left reads to reference genome
    out_sam_left_name = splitext(basename(left_reads))[0] + '_left.sam'
    out_sam_left_path = join(output_directory, out_sam_left_name)
    print 'Iterative mapping of left reads (using ' + str(threads_number) + \
        ' threads)...'
    stdout.flush()
    sams_left = iterative_mapping(genome_index, left_reads, out_sam_left_path,
                                  range_start_left, range_stop_left,
                                  nthreads=threads_number, temp_dir=tmp_dir)
    print 'Done.'
    stdout.flush()

    # map right reads to reference genome
    out_sam_right_name = splitext(basename(right_reads))[0] + '_right.sam'
    out_sam_right_path = join(output_directory, out_sam_right_name)
    print 'Iterative mapping of right reads (using ' + str(threads_number) + \
        ' threads)...'
    stdout.flush()
    sams_right = iterative_mapping(genome_index, right_reads,
                                   out_sam_right_path, range_start_right,
                                   range_stop_right, nthreads=threads_number,
                                   temp_dir=tmp_dir)
    print 'Done.'
    stdout.flush()

    # load reference genome sequence
    print 'Load reference genome sequence...'
    stdout.flush()
    chroms = chromosomes[:]
    genome_seq = parse_fasta(genome_fasta, chr_names=chroms)
    print 'Done.'
    stdout.flush()

    # create files with information about every left and right read
    # and about their placement with respect to restriction sites
    tsv_left_name = splitext(basename(left_reads))[0] + '_left.tsv'
    tsv_left = join(output_directory, tsv_left_name)
    tsv_right_name = splitext(basename(right_reads))[0] + '_right.tsv'
    tsv_right = join(output_directory, tsv_right_name)
    print 'Get information about restriction sites and reads placement...'
    stdout.flush()
    parse_sam(sams_left, sams_right, tsv_left, tsv_right, genome_seq, enzyme,
              verbose=True, ncpus=8)
    print 'Done.'
    stdout.flush()

    # create file with both left and right reads that uniquely mapped to the
    # reference genome
    if reads_fastq != '':
        # left and right reads are stored in one file
        common_reads_prefix = splitext(basename(reads_fastq))[0]
    else:
        # left and right reads are stored separately
        common_reads_prefix = output_prefix
    uniq_reads_name = common_reads_prefix + '_both_map_uniq.tsv'
    uniq_reads = join(output_directory, uniq_reads_name)
    print 'Merge info about left and right reads in one file...'
    stdout.flush()
    get_intersection(tsv_left, tsv_right, uniq_reads, verbose=True)
    print 'Done.'
    stdout.flush()

    # find read IDs that are filtered by default TADbit filters
    print 'Mask reads...'
    stdout.flush()
    masked = filter_reads(uniq_reads)
    print 'Done.'
    stdout.flush()

    # apply all filters (exclude reads that were filtered)
    print 'Filter masked reads...'
    stdout.flush()
    filtered_reads_name = common_reads_prefix + '_filtered.tsv'
    filtered_reads = join(output_directory, filtered_reads_name)
    apply_filter(uniq_reads, filtered_reads, masked)
    print 'Done.'
    stdout.flush()

    # create matrices (one matrix per chromosome)
    print 'Create Hi-C maps (one per chromosome)...'
    stdout.flush()
    hic_map(filtered_reads, resolution=res, by_chrom='intra',
            savedata=output_directory)
    print 'Done.'
    stdout.flush()

    print 'Add resolution (' + str(res) + ') to matrix filenames...'
    stdout.flush()
    add_resolution(chromosomes, res, output_directory)
    print 'Done.'
    stdout.flush()

    print 'Add headers to matrix files...'
    stdout.flush()
    add_headers(chromosomes, res, output_directory)
    print 'Done.'
    stdout.flush()

    if clean_tmp:
        # remove all SAM and TSV files from the output directory
        print 'Remove SAM and TSV files from the output directory.'
        stdout.flush()
        # os.remove does not expand shell wildcards: glob each pattern first
        from glob import glob
        for tmp_file in glob(out_sam_left_path + '*'):
            remove(tmp_file)
        for tmp_file in glob(out_sam_right_path + '*'):
            remove(tmp_file)
        for tmp_file in glob(join(output_directory, '*.tsv')):
            remove(tmp_file)
        print 'Done.'
        stdout.flush()
def make_matrices(left_reads_fastq, right_reads_fastq, reads_fastq,
                  genome_fasta, genome_index, output_directory, output_prefix,
                  enzyme, res, chromosomes, threads_number, clean_tmp,
                  tmp_dir):
    print 'Begin to process reads.'
    left_reads = ''
    right_reads = ''
    if reads_fastq != '':
        # left and right reads are stored in one file
        range_start_left, range_stop_left, \
            range_start_right, range_stop_right = \
            calc_left_right_ranges(reads_fastq)
        print 'Reads:                     ', reads_fastq
        left_reads = reads_fastq
        right_reads = reads_fastq
    else:
        # left and right reads are stored separately
        range_start_left, range_stop_left, \
            range_start_right, range_stop_right = calc_range(left_reads_fastq)
        print 'Left reads:                ', left_reads_fastq
        print 'Right reads:               ', right_reads_fastq
        print 'Output prefix:             ', output_prefix
        left_reads = left_reads_fastq
        right_reads = right_reads_fastq
    print 'Reference genome FASTA:    ', genome_fasta
    print 'Reference genome GEM index:', genome_index
    print 'Output directory:          ', output_directory
    print 'Temp directory:            ', tmp_dir
    print 'Enzyme:                    ', enzyme
    print 'Resolution:                ', res, 'bp'
    print 'Number of threads:         ', threads_number
    print 'Start pos for left reads:  ', range_start_left
    print 'Stop pos for left reads:   ', range_stop_left
    print 'Start pos for right reads: ', range_start_right
    print 'Stop pos for right reads:  ', range_stop_right
    stdout.flush()

    # map left reads to reference genome
    out_sam_left_name = splitext(basename(left_reads))[0] + '_left.sam'
    out_sam_left_path = join(output_directory, out_sam_left_name)
    print 'Iterative mapping of left reads (using ' + str(threads_number) + \
        ' threads)...'
    stdout.flush()
    sams_left = iterative_mapping(genome_index, left_reads, out_sam_left_path,
                                  range_start_left, range_stop_left,
                                  nthreads=threads_number, temp_dir=tmp_dir)
    print 'Done.'
    stdout.flush()

    # map right reads to reference genome
    out_sam_right_name = splitext(basename(right_reads))[0] + '_right.sam'
    out_sam_right_path = join(output_directory, out_sam_right_name)
    print 'Iterative mapping of right reads (using ' + str(threads_number) + \
        ' threads)...'
    stdout.flush()
    sams_right = iterative_mapping(genome_index, right_reads,
                                   out_sam_right_path, range_start_right,
                                   range_stop_right, nthreads=threads_number,
                                   temp_dir=tmp_dir)
    print 'Done.'
    stdout.flush()

    # load reference genome sequence
    print 'Load reference genome sequence...'
    stdout.flush()
    chroms = chromosomes[:]
    genome_seq = parse_fasta(genome_fasta, chr_names=chroms)
    print 'Done.'
    stdout.flush()

    # create files with information about every left and right read
    # and about their placement with respect to restriction sites
    tsv_left_name = splitext(basename(left_reads))[0] + '_left.tsv'
    tsv_left = join(output_directory, tsv_left_name)
    tsv_right_name = splitext(basename(right_reads))[0] + '_right.tsv'
    tsv_right = join(output_directory, tsv_right_name)
    print 'Get information about restriction sites and reads placement...'
    stdout.flush()
    parse_sam(sams_left, sams_right, tsv_left, tsv_right, genome_seq, enzyme,
              verbose=True, ncpus=8)
    print 'Done.'
    stdout.flush()

    # create file with both left and right reads that uniquely mapped to the
    # reference genome
    if reads_fastq != '':
        # left and right reads are stored in one file
        common_reads_prefix = splitext(basename(reads_fastq))[0]
    else:
        # left and right reads are stored separately
        common_reads_prefix = output_prefix
    uniq_reads_name = common_reads_prefix + '_both_map_uniq.tsv'
    uniq_reads = join(output_directory, uniq_reads_name)
    print 'Merge info about left and right reads in one file...'
    stdout.flush()
    get_intersection(tsv_left, tsv_right, uniq_reads, verbose=True)
    print 'Done.'
    stdout.flush()

    # find read IDs that are filtered by default TADbit filters
    print 'Mask reads...'
    stdout.flush()
    # debug
    print "uniq_reads =", uniq_reads
    masked = filter_reads(uniq_reads)
    print 'Done.'
    stdout.flush()

    # apply all filters (exclude reads that were filtered)
    print 'Filter masked reads...'
    stdout.flush()
    filtered_reads_name = common_reads_prefix + '_filtered.tsv'
    filtered_reads = join(output_directory, filtered_reads_name)
    apply_filter(uniq_reads, filtered_reads, masked)
    print 'Done.'
    stdout.flush()

    # create matrices (one matrix per chromosome)
    print 'Create Hi-C maps (one per chromosome)...'
    stdout.flush()
    hic_map(filtered_reads, resolution=res, by_chrom='intra',
            savedata=output_directory)
    print 'Done.'
    stdout.flush()

    print 'Add resolution (' + str(res) + ') to matrix filenames...'
    stdout.flush()
    add_resolution(chromosomes, res, output_directory)
    print 'Done.'
    stdout.flush()

    print 'Add headers to matrix files...'
    stdout.flush()
    add_headers(chromosomes, res, output_directory)
    print 'Done.'
    stdout.flush()

    if clean_tmp:
        # remove all SAM and TSV files from the output directory
        print 'Remove SAM and TSV files from the output directory.'
        stdout.flush()
        map(os.remove, glob.glob(out_sam_left_path + '*'))
        map(os.remove, glob.glob(out_sam_right_path + '*'))
        map(os.remove, glob.glob(join(output_directory, '*.tsv')))
        print 'Done.'
        stdout.flush()
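# Both make_matrices() variants end with the same cleanup idiom; os.remove
# does not expand shell wildcards, so patterns have to be globbed first. A
# small helper (hypothetical name, not part of the original scripts) makes
# that explicit:
import glob
import os

def remove_by_pattern(pattern):
    """Remove every file matching a shell-style wildcard pattern."""
    for fname in glob.glob(pattern):
        os.remove(fname)

# e.g. remove_by_pattern(out_sam_left_path + '*')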
        outfiles1 = full_mapping(gem_index_path, fastq, out_map_dir1,
                                 'HindIII', temp_dir=temp_dir1, frag_map=False,
                                 windows=zip(*(r_beg1, r_end1)))
        print 'read 2'
        outfiles2 = full_mapping(gem_index_path, fastq, out_map_dir2,
                                 'HindIII', temp_dir=temp_dir2, frag_map=False,
                                 windows=zip(*(r_beg2, r_end2)))
        parse_thing = parse_map
    elif mapper == 3:
        print 'read 1'
        outfiles1 = full_mapping(gem_index_path, fastq, out_map_dir1,
                                 'HindIII', temp_dir=temp_dir1,
                                 windows=zip(*(r_beg1, r_end1)))
        print 'read 2'
        outfiles2 = full_mapping(gem_index_path, fastq, out_map_dir2,
                                 'HindIII', temp_dir=temp_dir2,
                                 windows=zip(*(r_beg2, r_end2)))
        parse_thing = parse_map

    read1, read2 = ('read1.tsv_%s-%s' % (mapper, win),
                    'read2.tsv_%s-%s' % (mapper, win))
    parse_thing(outfiles1, outfiles2, out_file1=read1, out_file2=read2,
                genome_seq=parse_fasta('/scratch/db/index_files/'
                                       'Homo_sapiens-79/Homo_sapiens.fa'),
                re_name='HindIII', verbose=True)
    reads = 'both_reads.tsv_%s-%s' % (mapper, win)
    get_intersection(read1, read2, reads)
    masked = filter_reads(reads)
    freads = 'filtered_reads.tsv_%s-%s' % (mapper, win)
    apply_filter(reads, freads, masked)
DANGLING = sys.argv[3]
SELF_CIRCLE = sys.argv[4]
SUMMARY_EXCLUDED = sys.argv[5]
max_molecule_length = int(sys.argv[6])
over_represented = float(sys.argv[7])
min_frag_size = int(sys.argv[8])
max_frag_size = int(sys.argv[9])
re_proximity = int(sys.argv[10])
both_reads_mapped = int(sys.argv[11])

# count the number of reads excluded by each filter
# (note that a read can be flagged by more than one filter!)
infile = glob.glob('%s/*_both_map.tsv' % PROCESSED)[0]
pair_id = infile.split("/")[-1].replace("_both_map.tsv", "")
masked = filter_reads(infile, max_molecule_length=max_molecule_length,
                      over_represented=over_represented,
                      min_frag_size=min_frag_size,
                      max_frag_size=max_frag_size,
                      re_proximity=re_proximity, verbose=False)

filters_applied_numeric = [1, 2, 3, 4, 9, 10]
is_applied = []
my_columns = ['filter_index', 'exclusion', 'reads_number', 'reads_fraction']
excluded = pd.DataFrame(columns=my_columns)
for k in xrange(1, len(masked) + 1):
    df = pd.DataFrame([k, masked[k]['name'], masked[k]['reads']]).transpose()
    df.columns = my_columns[:-1]
    # use float() to avoid Python 2 integer division truncating the fraction
    df['reads_fraction'] = df['reads_number'] / float(both_reads_mapped)
    excluded = pd.concat([excluded, df])
    if k in filters_applied_numeric:
        is_applied.append(1)
    else:
        is_applied.append(0)
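# The excerpt stops after the loop; a hedged sketch of one plausible
# continuation (the column name and output target below are assumptions, not
# the original code): attach the 0/1 applied flags and write the per-filter
# summary table to the SUMMARY_EXCLUDED path taken from argv above.
excluded['is_applied'] = is_applied
excluded.to_csv(SUMMARY_EXCLUDED, sep='\t', index=False)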