def _filter_over_represented(fnam, over_represented, output):
    """
    Write to a file the IDs of the read pairs that involve an over-represented
    RE fragment.

    A fragment is considered over-represented when its count, as returned by
    ``count_re_fragments``, is strictly above the count found at the
    (1 - over_represented) quantile of all fragment counts.

    :param fnam: path to the tab-separated file with the pairs of reads
       (13 columns per line; leading lines starting with '#' are skipped)
    :param over_represented: fraction of the most frequently detected
       fragments whose reads are flagged
    :param output: prefix of the output path, read IDs are written to
       ``<output>_over-represented.tsv``

    :returns: dictionary with the filter number (8) as key and, as value, a
       dictionary holding the filter 'name', the number of flagged 'reads' and
       the path ('fnam') of the file where their IDs were written
    """
    frag_count = count_re_fragments(fnam)
    counts = sorted(frag_count.values())
    cut = int((1 - over_represented) * len(counts) + 0.5)
    # use cut-1 because it represents the length of the list; with no
    # fragment at all nothing can be over-represented
    cut = counts[cut - 1] if counts else 0

    masked = {8: {'name': 'over-represented', 'reads': 0}}
    outfil = {}
    try:
        for k in masked:
            masked[k]['fnam'] = (output + '_' +
                                 masked[k]['name'].replace(' ', '_') + '.tsv')
            outfil[k] = open(masked[k]['fnam'], 'w')
        with open(fnam) as fhandler:
            in_header = True
            for line in fhandler:
                # only the leading block of '#' lines is a header
                if in_header and line.startswith('#'):
                    continue
                in_header = False
                (read, cr1, _, _, _, rs1, _,
                 cr2, _, _, _, rs2, _) = line.split('\t')
                # fragment counts are looked up with the raw text fields
                if (frag_count.get((cr1, rs1), 0) > cut or
                        frag_count.get((cr2, rs2), 0) > cut):
                    masked[8]['reads'] += 1
                    outfil[8].write(read + '\n')
    finally:
        # close the output files even if parsing failed half-way
        for handle in outfil.values():
            handle.close()
    return masked
def _filter_over_represented(fnam, over_represented, output):
    """
    Write to a file the IDs of the read pairs that involve an over-represented
    RE fragment.

    A fragment is considered over-represented when its count, as returned by
    ``count_re_fragments``, is strictly above the count found at the
    (1 - over_represented) quantile of all fragment counts.

    :param fnam: path to the tab-separated file with the pairs of reads
       (13 columns per line; leading lines starting with '#' are skipped)
    :param over_represented: fraction of the most frequently detected
       fragments whose reads are flagged
    :param output: prefix of the output path, read IDs are written to
       ``<output>_over-represented.tsv``

    :returns: dictionary with the filter number (8) as key and, as value, a
       dictionary holding the filter 'name', the number of flagged 'reads' and
       the path ('fnam') of the file where their IDs were written
    """
    frag_count = count_re_fragments(fnam)
    counts = sorted(frag_count.values())
    cut = int((1 - over_represented) * len(counts) + 0.5)
    # use cut-1 because it represents the length of the list; with no
    # fragment at all nothing can be over-represented
    cut = counts[cut - 1] if counts else 0

    masked = {8: {"name": "over-represented", "reads": 0}}
    outfil = {}
    try:
        for k in masked:
            masked[k]["fnam"] = (
                output + "_" + masked[k]["name"].replace(" ", "_") + ".tsv"
            )
            outfil[k] = open(masked[k]["fnam"], "w")
        with open(fnam) as fhandler:
            in_header = True
            for line in fhandler:
                # only the leading block of '#' lines is a header
                if in_header and line.startswith("#"):
                    continue
                in_header = False
                (read, cr1, _, _, _, rs1, _,
                 cr2, _, _, _, rs2, _) = line.split("\t")
                # fragment counts are looked up with the raw text fields
                if (frag_count.get((cr1, rs1), 0) > cut
                        or frag_count.get((cr2, rs2), 0) > cut):
                    masked[8]["reads"] += 1
                    outfil[8].write(read + "\n")
    finally:
        # close the output files even if parsing failed half-way
        for handle in outfil.values():
            handle.close()
    return masked
def filter_reads_OLD(fnam, max_molecule_length=500, over_represented=0.005,
                     max_frag_size=100000, min_frag_size=100, re_proximity=5,
                     verbose=True, savedata=None, min_dist_to_re=750):
    """
    Apply different filters on pair of reads:
       1- self-circle        : reads are coming from a single RE fragment and
          point to the outside (----<===---===>---)
       2- dangling-end       : reads are coming from a single RE fragment and
          point to the inside (----===>---<===---)
       3- error              : reads are coming from a single RE fragment and
          point in the same direction
       4- extra dangling-end : reads are coming from different RE fragment but
          are close enough (< max_molecule length) and point to the inside
       5- too close from RES : semi-dangling-end filter, start position of one
          of the read is too close (5 bp by default) from RE cutting site.
       6- too short          : remove reads coming from small restriction less
          than 100 bp (default) because they are comparable to the read length
       7- too large          : remove reads coming from large restriction
          fragments (default: 100 Kb, P < 10-5 to occur in a randomized genome)
          as they likely represent poorly assembled or repeated regions
       8- over-represented   : reads coming from the top 0.5% most frequently
          detected restriction fragments, they may be prone to PCR artifacts or
          represent fragile regions of the genome or genome assembly errors
       9- duplicated         : the combination of the start positions of the
          reads is repeated -> PCR artifact (only keep one copy)
       10- random breaks     : start position of one of the read is too far (
          more than min_dist_to_re) from RE cutting site. Non-canonical
          enzyme activity or random physical breakage of the chromatin.

    :param fnam: path to file containing the pair of reads in tsv format, file
       generated by :func:`pytadbit.mapping.mapper.get_intersection`
    :param 500 max_molecule_length: facing reads that are within
       max_molecule_length, will be classified as 'extra dangling-ends'
    :param 0.005 over_represented: to remove the very top fragment containing
       more reads
    :param 100000 max_frag_size: maximum fragment size allowed (fragments should
       not span over several bins)
    :param 100 min_frag_size: remove fragment that are too short (shorter than
       the sequenced read length)
    :param 5 re_proximity: should be adjusted according to RE site, to filter
       semi-dangling-ends
    :param True verbose: print a summary table of the number of reads caught
       by each filter
    :param 750 min_dist_to_re: minimum distance the start of a read should be
       from a RE site (usually 1.5 times the insert size). Applied in filter 10
    :param None savedata: PATH where to write the number of reads retained by
       each filter

    :return: dictionary with, as keys, the kind of filter applied, and as values
       a set of read IDs to be removed

    *Note: Filtering is not exclusive, one read can be filtered several times.*
    """
    masked = {1 : {'name': 'self-circle'       , 'reads': set()},
              2 : {'name': 'dangling-end'      , 'reads': set()},
              3 : {'name': 'error'             , 'reads': set()},
              4 : {'name': 'extra dangling-end', 'reads': set()},
              5 : {'name': 'too close from RES', 'reads': set()},
              6 : {'name': 'too short'         , 'reads': set()},
              7 : {'name': 'too large'         , 'reads': set()},
              8 : {'name': 'over-represented'  , 'reads': set()},
              9 : {'name': 'duplicated'        , 'reads': set()},
              10: {'name': 'random breaks'     , 'reads': set()}}
    uniq_check = set()
    frag_count = count_re_fragments(fnam)
    counts = sorted(frag_count.values())
    cut = int((1 - over_represented) * len(counts) + 0.5)
    # use cut-1 because it represents the length of the list; with no
    # fragment at all nothing can be over-represented
    cut = counts[cut - 1] if counts else 0

    total = 0
    bads = 0
    with open(fnam) as fhandler:
        in_header = True
        for line in fhandler:
            # only the leading block of '#' lines is a header
            if in_header and line.startswith('#'):
                continue
            in_header = False
            total += 1
            (read, cr1, pos1, sd1, _, res1, re1,
             cr2, pos2, sd2, _, res2, re2) = line.split('\t')
            (ps1, ps2, sd1, sd2,
             re1, rs1, re2, rs2) = map(int, (pos1, pos2, sd1, sd2,
                                             re1, res1, re2, res2))
            bad = False
            if cr1 == cr2:
                if re1 == re2:
                    if sd1 != sd2:
                        if (ps2 > ps1) == sd2:
                            # ----<===---===>---  self-circles
                            masked[1]['reads'].add(read)
                        else:
                            # ----===>---<===---  dangling-ends
                            masked[2]['reads'].add(read)
                    else:
                        # --===>--===>-- or --<===--<===-- or same errors
                        masked[3]['reads'].add(read)
                    bad = True
                elif (abs(ps1 - ps2) < max_molecule_length
                      and sd2 != sd1 and (ps2 > ps1) != sd2):
                    # different fragments but facing and very close
                    masked[4]['reads'].add(read)
                    bad = True
            # distance from each read start to both ends of its fragment
            diff11 = re1 - ps1
            diff12 = ps1 - rs1
            diff21 = re2 - ps2
            diff22 = ps2 - rs2
            if ((diff11 < re_proximity) or (diff12 < re_proximity) or
                    (diff21 < re_proximity) or (diff22 < re_proximity)):
                masked[5]['reads'].add(read)
                bad = True
            if (((diff11 > min_dist_to_re) and (diff12 > min_dist_to_re)) or
                    ((diff21 > min_dist_to_re) and (diff22 > min_dist_to_re))):
                masked[10]['reads'].add(read)
                bad = True
            dif1 = re1 - rs1
            dif2 = re2 - rs2
            if (dif1 < min_frag_size) or (dif2 < min_frag_size):
                masked[6]['reads'].add(read)
                bad = True
            if (dif1 > max_frag_size) or (dif2 > max_frag_size):
                masked[7]['reads'].add(read)
                bad = True
            # fragment counts are looked up with the raw text fields
            if (frag_count.get((cr1, res1), 0) > cut or
                    frag_count.get((cr2, res2), 0) > cut):
                masked[8]['reads'].add(read)
                bad = True
            # keep chromosome and position apart: concatenating them makes
            # e.g. 'chr1' + '23' collide with 'chr12' + '3'
            uniq_key = tuple(sorted(((cr1, pos1), (cr2, pos2))))
            if uniq_key in uniq_check:
                masked[9]['reads'].add(read)
                bad = True
            else:
                uniq_check.add(uniq_key)
            bads += bad

    if savedata:
        with open(savedata, 'w') as out:
            out.write('TOTAL\t%d\n' % total)
            for k in sorted(masked):
                out.write('%s\t%d\n' % (masked[k]['name'],
                                        len(masked[k]['reads'])))
            out.write('Valid pairs\t%d\n' % (total - bads))
    if verbose:
        # avoid dividing by zero when the file holds no read pair
        denom = float(total) if total else 1.0
        print('Filtered reads (and percentage of total):\n')
        print(' %-25s : %12d (100.00%%)' % ('TOTAL mapped', total))
        print(' ' + '-' * 53)
        for k in sorted(masked):
            nreads = len(masked[k]['reads'])
            print(' %2d- %-25s : %12d (%6.2f%%)' % (
                k, masked[k]['name'], nreads, nreads / denom * 100))
        print('\n %-25s : %12d (%6.2f%%)' % (
            'Valid-pairs', total - bads, (total - bads) / denom * 100))
    return masked
def filter_reads(fnam, max_molecule_length=500, over_represented=0.005,
                 max_frag_size=100000, min_frag_size=100, re_proximity=5,
                 verbose=True):
    """
    Apply different filters on pair of reads (in order of application):
       1- self-circle        : reads are coming from a single RE fragment and
          point to the outside (----<===---===>---)
       2- dangling-end       : reads are coming from a single RE fragment and
          point to the inside (----===>---<===---)
       3- error              : reads are coming from a single RE fragment and
          point in the same direction
       4- extra dangling-end : reads are coming from different RE fragment but
          are close enough (< max_molecule length) and point to the inside
       5- too close from RE  : start position of one of the read is too close (
          5 bp by default) from RE cutting site. Non-canonical enzyme activity
          or random physical breakage of the chromatin.
       6- too short          : remove reads coming from small restriction less
          than 100 bp (default) because they are comparable to the read length
       7- too large          : remove reads coming from large restriction
          fragments (default: 100 Kb, P < 10-5 to occur in a randomized genome)
          as they likely represent poorly assembled or repeated regions
       8- over-represented   : reads coming from the top 0.5% most frequently
          detected restriction fragments, they may be prone to PCR artifacts or
          represent fragile regions of the genome or genome assembly errors
       9- duplicated         : the combination of the start positions of the
          reads is repeated -> PCR artifact (only keep one copy)

    Filters are exclusive here: a read pair is stored under the first filter
    that catches it and is not tested against the following ones.

    :param fnam: path to file containing the pair of reads in tsv format, file
       generated by :func:`pytadbit.mapping.mapper.get_intersection`
    :param 500 max_molecule_length: facing reads that are within
       max_molecule_length, will be classified as 'extra dangling-ends'
    :param 0.005 over_represented: fraction of the most frequently detected
       fragments whose reads are flagged by filter 8
    :param 100000 max_frag_size: fragments larger than this are caught by
       filter 7
    :param 100 min_frag_size: fragments shorter than this are caught by
       filter 6
    :param 5 re_proximity: reads starting closer than this to either end of
       their fragment are caught by filter 5
    :param True verbose: print the number of reads caught by each filter

    :return: dictionary with, as keys, the kind of filter applied, and as values
       a set of read IDs to be removed
    """
    masked = {1: {'name': 'self-circle'       , 'reads': set()},
              2: {'name': 'dangling-end'      , 'reads': set()},
              3: {'name': 'error'             , 'reads': set()},
              4: {'name': 'extra dangling-end', 'reads': set()},
              5: {'name': 'too close from RE' , 'reads': set()},
              6: {'name': 'too short'         , 'reads': set()},
              7: {'name': 'too large'         , 'reads': set()},
              8: {'name': 'over-represented'  , 'reads': set()},
              9: {'name': 'duplicated'        , 'reads': set()}}
    uniq_check = set()
    frag_count = count_re_fragments(fnam)
    counts = sorted(frag_count.values())
    cut = int((1 - over_represented) * len(counts) + 0.5)
    # use cut-1 because it represents the length of the list (indexing with
    # cut itself runs past the end when cut equals the number of fragments);
    # with no fragment at all nothing can be over-represented
    cut = counts[cut - 1] if counts else 0

    with open(fnam) as fhandler:
        in_header = True
        for line in fhandler:
            # only the leading block of '#' lines is a header
            if in_header and line.startswith('#'):
                continue
            in_header = False
            (read, cr1, pos1, sd1, _, res1, re1,
             cr2, pos2, sd2, _, res2, re2) = line.strip().split('\t')
            (ps1, ps2, sd1, sd2,
             re1, rs1, re2, rs2) = map(int, (pos1, pos2, sd1, sd2,
                                             re1, res1, re2, res2))
            if cr1 == cr2:
                if re1 == re2:
                    if sd1 != sd2:
                        if (ps2 > ps1) == sd2:
                            # ----<===---===>---  self-circles
                            masked[1]['reads'].add(read)
                        else:
                            # ----===>---<===---  dangling-ends
                            masked[2]['reads'].add(read)
                    else:
                        # --===>--===>-- or --<===--<===-- or same errors
                        masked[3]['reads'].add(read)
                    continue
                # parentheses matter: 'ps2 > ps1 != sd2' would be evaluated
                # as the chain '(ps2 > ps1) and (ps1 != sd2)'
                if (abs(ps1 - ps2) < max_molecule_length
                        and sd2 != sd1 and (ps2 > ps1) != sd2):
                    # different fragments but facing and very close
                    masked[4]['reads'].add(read)
                    continue
            if ((abs(re1 - ps1) < re_proximity) or
                    (abs(rs1 - ps1) < re_proximity) or
                    (abs(re2 - ps2) < re_proximity) or
                    (abs(rs2 - ps2) < re_proximity)):
                masked[5]['reads'].add(read)
            elif ((re1 - rs1) < min_frag_size) or ((re2 - rs2) < min_frag_size):
                masked[6]['reads'].add(read)
            elif ((re1 - rs1) > max_frag_size) or ((re2 - rs2) > max_frag_size):
                masked[7]['reads'].add(read)
            # NOTE(review): fragment counts are looked up with the raw text
            # fields, as filter_reads_OLD does; this assumes
            # count_re_fragments keys on the unconverted strings -- confirm
            elif (frag_count.get((cr1, res1), 0) > cut or
                  frag_count.get((cr2, res2), 0) > cut):
                masked[8]['reads'].add(read)
            else:
                # keep chromosome and position apart: concatenating them
                # makes e.g. 'chr1' + '23' collide with 'chr12' + '3'
                uniq_key = tuple(sorted(((cr1, pos1), (cr2, pos2))))
                if uniq_key in uniq_check:
                    masked[9]['reads'].add(read)
                else:
                    uniq_check.add(uniq_key)

    if verbose:
        for k in sorted(masked):
            print('%d- %-25s : %d' % (k, masked[k]['name'],
                                      len(masked[k]['reads'])))
    return masked
def filter_reads_OLD(fnam, max_molecule_length=500, over_represented=0.005,
                     max_frag_size=100000, min_frag_size=100, re_proximity=5,
                     verbose=True, savedata=None, min_dist_to_re=750):
    """
    Apply different filters on pair of reads:
       1- self-circle        : reads are coming from a single RE fragment and
          point to the outside (----<===---===>---)
       2- dangling-end       : reads are coming from a single RE fragment and
          point to the inside (----===>---<===---)
       3- error              : reads are coming from a single RE fragment and
          point in the same direction
       4- extra dangling-end : reads are coming from different RE fragment but
          are close enough (< max_molecule length) and point to the inside
       5- too close from RES : semi-dangling-end filter, start position of one
          of the read is too close (5 bp by default) from RE cutting site.
       6- too short          : remove reads coming from small restriction less
          than 100 bp (default) because they are comparable to the read length
       7- too large          : remove reads coming from large restriction
          fragments (default: 100 Kb, P < 10-5 to occur in a randomized genome)
          as they likely represent poorly assembled or repeated regions
       8- over-represented   : reads coming from the top 0.5% most frequently
          detected restriction fragments, they may be prone to PCR artifacts or
          represent fragile regions of the genome or genome assembly errors
       9- duplicated         : the combination of the start positions of the
          reads is repeated -> PCR artifact (only keep one copy)
       10- random breaks     : start position of one of the read is too far (
          more than min_dist_to_re) from RE cutting site. Non-canonical
          enzyme activity or random physical breakage of the chromatin.

    :param fnam: path to file containing the pair of reads in tsv format, file
       generated by :func:`pytadbit.mapping.mapper.get_intersection`
    :param 500 max_molecule_length: facing reads that are within
       max_molecule_length, will be classified as 'extra dangling-ends'
    :param 0.005 over_represented: to remove the very top fragment containing
       more reads
    :param 100000 max_frag_size: maximum fragment size allowed (fragments should
       not span over several bins)
    :param 100 min_frag_size: remove fragment that are too short (shorter than
       the sequenced read length)
    :param 5 re_proximity: should be adjusted according to RE site, to filter
       semi-dangling-ends
    :param True verbose: print a summary table of the number of reads caught
       by each filter
    :param 750 min_dist_to_re: minimum distance the start of a read should be
       from a RE site (usually 1.5 times the insert size). Applied in filter 10
    :param None savedata: PATH where to write the number of reads retained by
       each filter

    :return: dictionary with, as keys, the kind of filter applied, and as values
       a set of read IDs to be removed

    *Note: Filtering is not exclusive, one read can be filtered several times.*
    """
    masked = {
        1: {'name': 'self-circle', 'reads': set()},
        2: {'name': 'dangling-end', 'reads': set()},
        3: {'name': 'error', 'reads': set()},
        4: {'name': 'extra dangling-end', 'reads': set()},
        5: {'name': 'too close from RES', 'reads': set()},
        6: {'name': 'too short', 'reads': set()},
        7: {'name': 'too large', 'reads': set()},
        8: {'name': 'over-represented', 'reads': set()},
        9: {'name': 'duplicated', 'reads': set()},
        10: {'name': 'random breaks', 'reads': set()},
    }
    uniq_check = set()
    frag_count = count_re_fragments(fnam)
    counts = sorted(frag_count.values())
    cut = int((1 - over_represented) * len(counts) + 0.5)
    # use cut-1 because it represents the length of the list; with no
    # fragment at all nothing can be over-represented
    cut = counts[cut - 1] if counts else 0

    total = 0
    bads = 0
    with open(fnam) as fhandler:
        in_header = True
        for line in fhandler:
            # only the leading block of '#' lines is a header
            if in_header and line.startswith('#'):
                continue
            in_header = False
            total += 1
            (read, cr1, pos1, sd1, _, res1, re1,
             cr2, pos2, sd2, _, res2, re2) = line.split('\t')
            (ps1, ps2, sd1, sd2,
             re1, rs1, re2, rs2) = map(int, (pos1, pos2, sd1, sd2,
                                             re1, res1, re2, res2))
            bad = False
            if cr1 == cr2:
                if re1 == re2:
                    if sd1 != sd2:
                        if (ps2 > ps1) == sd2:
                            # ----<===---===>---  self-circles
                            masked[1]['reads'].add(read)
                        else:
                            # ----===>---<===---  dangling-ends
                            masked[2]['reads'].add(read)
                    else:
                        # --===>--===>-- or --<===--<===-- or same errors
                        masked[3]['reads'].add(read)
                    bad = True
                elif (abs(ps1 - ps2) < max_molecule_length
                      and sd2 != sd1 and (ps2 > ps1) != sd2):
                    # different fragments but facing and very close
                    masked[4]['reads'].add(read)
                    bad = True
            # distance from each read start to both ends of its fragment
            diff11 = re1 - ps1
            diff12 = ps1 - rs1
            diff21 = re2 - ps2
            diff22 = ps2 - rs2
            if ((diff11 < re_proximity) or (diff12 < re_proximity)
                    or (diff21 < re_proximity) or (diff22 < re_proximity)):
                masked[5]['reads'].add(read)
                bad = True
            if (((diff11 > min_dist_to_re) and (diff12 > min_dist_to_re))
                    or ((diff21 > min_dist_to_re)
                        and (diff22 > min_dist_to_re))):
                masked[10]['reads'].add(read)
                bad = True
            dif1 = re1 - rs1
            dif2 = re2 - rs2
            if (dif1 < min_frag_size) or (dif2 < min_frag_size):
                masked[6]['reads'].add(read)
                bad = True
            if (dif1 > max_frag_size) or (dif2 > max_frag_size):
                masked[7]['reads'].add(read)
                bad = True
            # fragment counts are looked up with the raw text fields
            if (frag_count.get((cr1, res1), 0) > cut
                    or frag_count.get((cr2, res2), 0) > cut):
                masked[8]['reads'].add(read)
                bad = True
            # keep chromosome and position apart: concatenating them makes
            # e.g. 'chr1' + '23' collide with 'chr12' + '3'
            uniq_key = tuple(sorted(((cr1, pos1), (cr2, pos2))))
            if uniq_key in uniq_check:
                masked[9]['reads'].add(read)
                bad = True
            else:
                uniq_check.add(uniq_key)
            bads += bad

    if savedata:
        with open(savedata, 'w') as out:
            out.write('TOTAL\t%d\n' % total)
            for k in sorted(masked):
                out.write('%s\t%d\n' % (masked[k]['name'],
                                        len(masked[k]['reads'])))
            out.write('Valid pairs\t%d\n' % (total - bads))
    if verbose:
        # avoid dividing by zero when the file holds no read pair
        denom = float(total) if total else 1.0
        print('Filtered reads (and percentage of total):\n')
        print(' %-25s : %12d (100.00%%)' % ('TOTAL mapped', total))
        print(' ' + '-' * 53)
        for k in sorted(masked):
            nreads = len(masked[k]['reads'])
            print(' %2d- %-25s : %12d (%6.2f%%)' % (
                k, masked[k]['name'], nreads, nreads / denom * 100))
        print('\n %-25s : %12d (%6.2f%%)' % (
            'Valid-pairs', total - bads, (total - bads) / denom * 100))
    return masked