Example #1
0
def _filter_over_represented(fnam, over_represented, output):
    """Mask reads mapping to over-represented restriction fragments.

    Scans the read-pair TSV file *fnam* and writes to
    ``output + '_over-represented.tsv'`` the IDs of reads for which either
    end falls on one of the most frequently observed RE fragments (the top
    *over_represented* fraction, e.g. 0.005 for the top 0.5%).

    :param fnam: path to the TSV file of read pairs (header lines start
       with '#'); fields 2/6 and 8/12 hold chromosome and RE-fragment
       start of each read end
    :param over_represented: fraction (0-1) of the most frequent fragments
       whose reads are masked
    :param output: prefix used to build the output file name

    :returns: dict mapping filter id 8 to its name, output path ('fnam')
       and the number of reads masked ('reads')
    """
    # t0 = time()
    # number of reads observed per restriction fragment
    frag_count = count_re_fragments(fnam)
    num_frags = len(frag_count)
    cut = int((1 - over_represented) * num_frags + 0.5)
    # use cut-1 because it represents the length of the list
    cut = sorted([frag_count[crm] for crm in frag_count])[cut - 1]
    masked = {8 : {'name': 'over-represented'  , 'reads': 0}}
    outfil = {}
    for k in masked:
        masked[k]['fnam'] = output + '_' + masked[k]['name'].replace(' ', '_') + '.tsv'
        outfil[k] = open(masked[k]['fnam'], 'w')
    fhandler = open(fnam)
    try:
        line = fhandler.next()
        # skip comment/header lines
        while line.startswith('#'):
            line = fhandler.next()
        try:
            while True:
                read, cr1,  _, _, _, rs1, _, cr2, _, _, _, rs2, _ = line.split('\t')
                # mask the read if either end sits on a too-frequent fragment
                if (frag_count.get((cr1, rs1), 0) > cut or
                      frag_count.get((cr2, rs2), 0) > cut):
                    masked[8]['reads'] += 1
                    outfil[8].write(read + '\n')
                line = fhandler.next()
        except StopIteration:
            pass  # end of input file
    finally:
        fhandler.close()  # was leaked in the original implementation
    # print 'done 3', time() - t0
    # 'fnam' was already set above; only the handles need closing here
    for k in masked:
        outfil[k].close()
    return masked
Example #2
0
def _filter_over_represented(fnam, over_represented, output):
    """Mask reads mapping to over-represented restriction fragments.

    Scans the read-pair TSV file *fnam* and writes to
    ``output + '_over-represented.tsv'`` the IDs of reads for which either
    end falls on one of the most frequently observed RE fragments (the top
    *over_represented* fraction, e.g. 0.005 for the top 0.5%).

    :param fnam: path to the TSV file of read pairs (header lines start
       with '#'); fields 2/6 and 8/12 hold chromosome and RE-fragment
       start of each read end
    :param over_represented: fraction (0-1) of the most frequent fragments
       whose reads are masked
    :param output: prefix used to build the output file name

    :returns: dict mapping filter id 8 to its name, output path ("fnam")
       and the number of reads masked ("reads")
    """
    # t0 = time()
    # number of reads observed per restriction fragment
    frag_count = count_re_fragments(fnam)
    num_frags = len(frag_count)
    cut = int((1 - over_represented) * num_frags + 0.5)
    # use cut-1 because it represents the length of the list
    cut = sorted([frag_count[crm] for crm in frag_count])[cut - 1]
    masked = {8: {"name": "over-represented", "reads": 0}}
    outfil = {}
    for k in masked:
        masked[k]["fnam"] = output + "_" + masked[k]["name"].replace(" ", "_") + ".tsv"
        outfil[k] = open(masked[k]["fnam"], "w")
    fhandler = open(fnam)
    try:
        line = fhandler.next()
        # skip comment/header lines
        while line.startswith("#"):
            line = fhandler.next()
        try:
            while True:
                read, cr1, _, _, _, rs1, _, cr2, _, _, _, rs2, _ = line.split("\t")
                # mask the read if either end sits on a too-frequent fragment
                if frag_count.get((cr1, rs1), 0) > cut or frag_count.get((cr2, rs2), 0) > cut:
                    masked[8]["reads"] += 1
                    outfil[8].write(read + "\n")
                line = fhandler.next()
        except StopIteration:
            pass  # end of input file
    finally:
        fhandler.close()  # was leaked in the original implementation
    # "fnam" was already set above; only the handles need closing here
    for k in masked:
        outfil[k].close()
    return masked
Example #3
0
def filter_reads_OLD(fnam, max_molecule_length=500, over_represented=0.005,
                 max_frag_size=100000, min_frag_size=100, re_proximity=5,
                 verbose=True, savedata=None, min_dist_to_re=750):
    """
    Apply different filters on pair of reads:

       1- self-circle        : reads are comming from a single RE fragment and
          point to the outside (----<===---===>---)
       2- dangling-end       : reads are comming from a single RE fragment and
          point to the inside (----===>---<===---)
       3- error              : reads are comming from a single RE fragment and
          point in the same direction
       4- extra dangling-end : reads are comming from different RE fragment but
          are close enough (< max_molecule length) and point to the inside
       5- too close from RES : semi-dangling-end filter, start position of one
          of the read is too close (5 bp by default) from RE cutting site.
       6- too short          : remove reads comming from small restriction less
          than 100 bp (default) because they are comparable to the read length
       7- too large          : remove reads comming from large restriction
          fragments (default: 100 Kb, P < 10-5 to occur in a randomized genome)
          as they likely represent poorly assembled or repeated regions
       8- over-represented   : reads coming from the top 0.5% most frequently
          detected restriction fragments, they may be prone to PCR artifacts or
          represent fragile regions of the genome or genome assembly errors
       9- duplicated         : the combination of the start positions of the
          reads is repeated -> PCR artifact (only keep one copy)
       10- random breaks     : start position of one of the read is too far (
          more than min_dist_to_re) from RE cutting site. Non-canonical
          enzyme activity or random physical breakage of the chromatin.
    
    :param fnam: path to file containing the pair of reads in tsv format, file
       generated by :func:`pytadbit.mapping.mapper.get_intersection`
    :param 500 max_molecule_length: facing reads that are within
       max_molecule_length, will be classified as 'extra dangling-ends'
    :param 0.005 over_represented: to remove the very top fragment containing
       more reads
    :param 100000 max_frag_size: maximum fragment size allowed (fragments should
       not span over several bins)
    :param 100 min_frag_size: remove fragment that are too short (shorter than
       the sequenced read length)
    :param 5 re_proximity: should be adjusted according to RE site, to filter
       semi-dangling-ends
    :param 750 min_dist_to_re: minimum distance the start of a read should be
       from a RE site (usually 1.5 times the insert size). Applied in filter 10
    :param None savedata: PATH where to write the number of reads retained by
       each filter

    :return: dicitonary with, as keys, the kind of filter applied, and as values
       a set of read IDs to be removed

    *Note: Filtering is not exclusive, one read can be filtered several times.*
    """
    # filter number -> human-readable name and set of masked read IDs;
    # a given read may land in several sets (filters are not exclusive)
    masked = {1 : {'name': 'self-circle'       , 'reads': set()}, 
              2 : {'name': 'dangling-end'      , 'reads': set()},
              3 : {'name': 'error'             , 'reads': set()},
              4 : {'name': 'extra dangling-end', 'reads': set()},
              5 : {'name': 'too close from RES', 'reads': set()},
              6 : {'name': 'too short'         , 'reads': set()},
              7 : {'name': 'too large'         , 'reads': set()},
              8 : {'name': 'over-represented'  , 'reads': set()},
              9 : {'name': 'duplicated'        , 'reads': set()},
              10: {'name': 'random breaks'     , 'reads': set()}}
    # start-position keys already seen, for duplicate detection (filter 9)
    uniq_check = set()
    # uniq_check = {}
    # number of reads observed per restriction fragment
    frag_count = count_re_fragments(fnam)
    num_frags = len(frag_count)
    # count of the fragment sitting at the (1 - over_represented) quantile
    cut = int((1 - over_represented) * num_frags + 0.5)
    # use cut-1 because it represents the length of the list
    cut = sorted([frag_count[crm] for crm in frag_count])[cut - 1]

    total = 1   # total number of read pairs processed
    bads  = 0   # number of pairs caught by at least one filter
    fhandler = open(fnam)
    line = fhandler.next()
    # skip comment/header lines
    while line.startswith('#'):
        line = fhandler.next()
    try:
        while True:
            # one pair per line: read id then, for each end, chromosome,
            # position, strand, a skipped field, RE-fragment start and end
            (read,
             cr1, pos1, sd1, _, res1, re1,
             cr2, pos2, sd2, _, res2, re2) = line.split('\t')
            # res1/res2 keep the *string* fragment starts (used as
            # frag_count keys below); rs1/rs2 hold their int values
            (ps1, ps2, sd1, sd2,
             re1, rs1, re2, rs2) = map(int, (pos1, pos2, sd1, sd2,
                                             re1, res1, re2, res2))
            bad = False
            if cr1 == cr2:
                if re1 == re2:
                    # both ends map to the same RE fragment
                    if sd1 != sd2:
                        if (ps2 > ps1) == sd2:
                            # ----<===---===>---                   self-circles
                            masked[1]["reads"].add(read)
                            bad = True
                        else:
                            # ----===>---<===---                   dangling-ends
                            masked[2]["reads"].add(read)
                            bad = True
                    else:
                        # --===>--===>-- or --<===--<===-- or same errors
                        masked[3]["reads"].add(read)
                        bad = True
                elif (abs(ps1 - ps2) < max_molecule_length
                      and sd2 != sd1
                      and (ps2 > ps1) != sd2):
                    # different fragments but facing and very close
                    masked[4]["reads"].add(read)
                    bad = True
            # distances from each read start to its fragment boundaries
            diff11 = re1 - ps1
            diff12 = ps1 - rs1
            diff21 = re2 - ps2
            diff22 = ps2 - rs2
            # filter 5: a read start is too close to a RE cutting site
            if ((diff11 < re_proximity) or
                (diff12 < re_proximity) or 
                (diff21 < re_proximity) or
                (diff22 < re_proximity)):
                masked[5]["reads"].add(read)
                bad = True
            # filter 10: a read start is too far from both fragment ends
            if (((diff11 > min_dist_to_re) and
                 (diff12 > min_dist_to_re)) or 
                ((diff21 > min_dist_to_re) and
                 (diff22 > min_dist_to_re))):
                masked[10]["reads"].add(read)
                bad = True
            # filters 6 and 7: fragment length out of bounds
            dif1 = re1 - rs1
            dif2 = re2 - rs2
            if (dif1 < min_frag_size) or (dif2 < min_frag_size):
                masked[6]["reads"].add(read)
                bad = True
            if (dif1 > max_frag_size) or (dif2 > max_frag_size):
                masked[7]["reads"].add(read)
                bad = True
            # filter 8: either end sits on an over-represented fragment
            if (frag_count.get((cr1, res1), 0) > cut or
                  frag_count.get((cr2, res2), 0) > cut):
                masked[8]["reads"].add(read)
                bad = True
            # filter 9: this pair of start positions was already seen (PCR dup)
            # NOTE(review): keys built by string concatenation, so
            # ('chr1', '23') collides with ('chr12', '3') — confirm intended
            uniq_key = tuple(sorted((cr1 + pos1, cr2 + pos2)))
            if uniq_key in uniq_check:
                masked[9]["reads"].add(read)
                bad = True
            else:
                uniq_check.add(uniq_key)
            bads += bad  # bool counts as 1 when the pair was filtered
            line = fhandler.next()
            total += 1
    except StopIteration:
        # reached end of input file
        pass
    fhandler.close()
    if savedata:
        # write the per-filter counts to a tab-separated summary file
        out = open(savedata, 'w')
        out.write('TOTAL\t%d\n' % total)
        for k in xrange(1, len(masked) + 1):
            out.write('%s\t%d\n' % (masked[k]['name'], len(masked[k]['reads'])))
        out.write('Valid pairs\t%d\n' % (total - bads))
        out.close()
    if verbose:
        print 'Filtered reads (and percentage of total):\n'
        print '     %-25s : %12d (100.00%%)' % ('TOTAL mapped', total)
        print '  ' + '-' * 53
        for k in xrange(1, len(masked) + 1):
            print '  %2d- %-25s : %12d (%6.2f%%)' %(
                k, masked[k]['name'], len(masked[k]['reads']),
                float(len(masked[k]['reads'])) / total * 100)
        print '\n     %-25s : %12d (%6.2f%%)' %(
            'Valid-pairs', total - bads, float(total - bads) / (
                total) * 100)
    del(uniq_check)
    return masked
Example #4
0
def filter_reads(fnam, max_molecule_length=500,
                 over_represented=0.005, max_frag_size=100000,
                 min_frag_size=100, re_proximity=5, verbose=True):
    """
    Apply different filters on pair of reads (in order of application):
       1- self-circle        : reads are comming from a single RE fragment and
          point to the outside (----<===---===>---)
       2- dangling-end       : reads are comming from a single RE fragment and
          point to the inside (----===>---<===---)
       3- extra dangling-end : reads are comming from different RE fragment but
          are close enough (< max_molecule length) and point to the inside
       4- error              : reads are comming from a single RE fragment and
          point in the same direction
       5- too close from RE  : start position of one of the read is too close (
          5 bp by default) from RE cutting site. Non-canonical enzyme activity
          or random physical breakage of the chromatin.
       6- too short          : remove reads comming from small restriction less
          than 100 bp (default) because they are comparable to the read length
       7- too large          : remove reads comming from large restriction
          fragments (default: 100 Kb, P < 10-5 to occur in a randomized genome)
          as they likely represent poorly assembled or repeated regions
       8- over-represented   : reads coming from the top 0.5% most frequently
          detected restriction fragments, they may be prone to PCR artifacts or
          represent fragile regions of the genome or genome assembly errors
       9- duplicated         : the combination of the start positions of the
          reads is repeated -> PCR artifact (only keep one copy)
    
    :param fnam: path to file containing the pair of reads in tsv format, file
       generated by :func:`pytadbit.mapping.mapper.get_intersection`
    :param 500 max_molecule_length: facing reads that are within
       max_molecule_length, will be classified as 'extra dangling-ends'
    :param 0.005 over_represented:
    :param 100000 max_frag_size:
    :param 100 min_frag_size:
    :param 5 re_proximity:

    :return: dicitonary with, as keys, the kind of filter applied, and as values
       a set of read IDs to be removed
    """
    masked = {1: {'name': 'self-circle'       , 'reads': set()}, 
              2: {'name': 'dangling-end'      , 'reads': set()},
              3: {'name': 'error'             , 'reads': set()},
              4: {'name': 'extra dangling-end', 'reads': set()},
              5: {'name': 'too close from RE' , 'reads': set()},
              6: {'name': 'too short'         , 'reads': set()},
              7: {'name': 'too large'         , 'reads': set()},
              8: {'name': 'over-represented'  , 'reads': set()},
              9: {'name': 'duplicated'        , 'reads': set()}}
    uniq_check = set()
    # uniq_check = {}
    frag_count = count_re_fragments(fnam)
    num_frags = len(frag_count)
    cut = int((1 - over_represented) * num_frags + 0.5)
    cut = sorted([frag_count[crm] for crm in frag_count])[cut]

    fhandler = open(fnam)
    line = fhandler.next()
    while line.startswith('#'):
        line = fhandler.next()
    while True:
        (read,
         cr1, pos1, sd1, _, rs1, re1,
         cr2, pos2, sd2, _, rs2, re2) = line.strip().split('\t')
        (ps1, ps2, sd1, sd2,
         re1, rs1, re2, rs2) = map(int, (pos1, pos2, sd1, sd2,
                                         re1, rs1, re2, rs2))
        if cr1 == cr2:
            if re1 == re2:
                if sd1 != sd2:
                    if (ps2 > ps1) == sd2:
                        # ----<===---===>---                       self-circles
                        masked[1]["reads"].add(read)
                    else:
                        # ----===>---<===---                       dangling-ends
                        masked[2]["reads"].add(read)
                else:
                    # --===>--===>-- or --<===--<===-- or same     errors
                    masked[3]["reads"].add(read)
                try:
                    line = fhandler.next()
                except StopIteration:
                    break
                continue
            elif (abs(ps1 - ps2) < max_molecule_length
                  and sd2 != sd1
                  and ps2 > ps1 != sd2):
                # different fragments but facing and very close
                masked[4]["reads"].add(read)
                try:
                    line = fhandler.next()
                except StopIteration:
                    break
                continue
        if ((abs(re1 - ps1) < re_proximity) or
            (abs(rs1 - ps1) < re_proximity) or 
            (abs(re2 - ps2) < re_proximity) or
            (abs(rs2 - ps2) < re_proximity)):
            masked[5]["reads"].add(read)
        elif ((re1 - rs1) < min_frag_size) or ((re2 - rs2) < min_frag_size) :
            masked[6]["reads"].add(read)
        elif ((re1 - rs1) > max_frag_size) or ((re2 - rs2) > max_frag_size):
            masked[7]["reads"].add(read)
        elif (frag_count.get((cr1, rs1), 0) > cut or
              frag_count.get((cr2, rs2), 0) > cut):
            masked[8]["reads"].add(read)
        else:
            uniq_key = tuple(sorted((cr1 + pos1, cr2 + pos2)))
            if uniq_key in uniq_check:
                masked[9]["reads"].add(read)
                # in case we want to forget about all reads (not keeping one)
                # if not uniq_check[uniq_key] in masked[5]["reads"]:
                #     masked[5]["reads"].add(uniq_check[uniq_key])
                #     continue
            else:
                # uniq_check[uniq_key] = read
                uniq_check.add(uniq_key)
        try:
            line = fhandler.next()
        except StopIteration:
            break
    fhandler.close()
    del(uniq_check)
    if verbose:
        for k in xrange(1, len(masked) + 1):
            print '%d- %-25s : %d' %(k, masked[k]['name'], len(masked[k]['reads']))
    return masked
Example #5
0
def filter_reads_OLD(fnam,
                     max_molecule_length=500,
                     over_represented=0.005,
                     max_frag_size=100000,
                     min_frag_size=100,
                     re_proximity=5,
                     verbose=True,
                     savedata=None,
                     min_dist_to_re=750):
    """
    Apply different filters on pair of reads:

       1- self-circle        : reads are comming from a single RE fragment and
          point to the outside (----<===---===>---)
       2- dangling-end       : reads are comming from a single RE fragment and
          point to the inside (----===>---<===---)
       3- error              : reads are comming from a single RE fragment and
          point in the same direction
       4- extra dangling-end : reads are comming from different RE fragment but
          are close enough (< max_molecule length) and point to the inside
       5- too close from RES : semi-dangling-end filter, start position of one
          of the read is too close (5 bp by default) from RE cutting site.
       6- too short          : remove reads comming from small restriction less
          than 100 bp (default) because they are comparable to the read length
       7- too large          : remove reads comming from large restriction
          fragments (default: 100 Kb, P < 10-5 to occur in a randomized genome)
          as they likely represent poorly assembled or repeated regions
       8- over-represented   : reads coming from the top 0.5% most frequently
          detected restriction fragments, they may be prone to PCR artifacts or
          represent fragile regions of the genome or genome assembly errors
       9- duplicated         : the combination of the start positions of the
          reads is repeated -> PCR artifact (only keep one copy)
       10- random breaks     : start position of one of the read is too far (
          more than min_dist_to_re) from RE cutting site. Non-canonical
          enzyme activity or random physical breakage of the chromatin.
    
    :param fnam: path to file containing the pair of reads in tsv format, file
       generated by :func:`pytadbit.mapping.mapper.get_intersection`
    :param 500 max_molecule_length: facing reads that are within
       max_molecule_length, will be classified as 'extra dangling-ends'
    :param 0.005 over_represented: to remove the very top fragment containing
       more reads
    :param 100000 max_frag_size: maximum fragment size allowed (fragments should
       not span over several bins)
    :param 100 min_frag_size: remove fragment that are too short (shorter than
       the sequenced read length)
    :param 5 re_proximity: should be adjusted according to RE site, to filter
       semi-dangling-ends
    :param 750 min_dist_to_re: minimum distance the start of a read should be
       from a RE site (usually 1.5 times the insert size). Applied in filter 10
    :param None savedata: PATH where to write the number of reads retained by
       each filter

    :return: dicitonary with, as keys, the kind of filter applied, and as values
       a set of read IDs to be removed

    *Note: Filtering is not exclusive, one read can be filtered several times.*
    """
    # filter number -> human-readable name and set of masked read IDs;
    # a given read may land in several sets (filters are not exclusive)
    masked = {
        1: {
            'name': 'self-circle',
            'reads': set()
        },
        2: {
            'name': 'dangling-end',
            'reads': set()
        },
        3: {
            'name': 'error',
            'reads': set()
        },
        4: {
            'name': 'extra dangling-end',
            'reads': set()
        },
        5: {
            'name': 'too close from RES',
            'reads': set()
        },
        6: {
            'name': 'too short',
            'reads': set()
        },
        7: {
            'name': 'too large',
            'reads': set()
        },
        8: {
            'name': 'over-represented',
            'reads': set()
        },
        9: {
            'name': 'duplicated',
            'reads': set()
        },
        10: {
            'name': 'random breaks',
            'reads': set()
        }
    }
    # start-position keys already seen, for duplicate detection (filter 9)
    uniq_check = set()
    # uniq_check = {}
    # number of reads observed per restriction fragment
    frag_count = count_re_fragments(fnam)
    num_frags = len(frag_count)
    # count of the fragment sitting at the (1 - over_represented) quantile
    cut = int((1 - over_represented) * num_frags + 0.5)
    # use cut-1 because it represents the length of the list
    cut = sorted([frag_count[crm] for crm in frag_count])[cut - 1]

    total = 1  # total number of read pairs processed
    bads = 0   # number of pairs caught by at least one filter
    fhandler = open(fnam)
    line = fhandler.next()
    # skip comment/header lines
    while line.startswith('#'):
        line = fhandler.next()
    try:
        while True:
            # one pair per line: read id then, for each end, chromosome,
            # position, strand, a skipped field, RE-fragment start and end
            (read, cr1, pos1, sd1, _, res1, re1, cr2, pos2, sd2, _, res2,
             re2) = line.split('\t')
            # res1/res2 keep the *string* fragment starts (used as
            # frag_count keys below); rs1/rs2 hold their int values
            (ps1, ps2, sd1, sd2, re1, rs1, re2,
             rs2) = map(int, (pos1, pos2, sd1, sd2, re1, res1, re2, res2))
            bad = False
            if cr1 == cr2:
                if re1 == re2:
                    # both ends map to the same RE fragment
                    if sd1 != sd2:
                        if (ps2 > ps1) == sd2:
                            # ----<===---===>---                   self-circles
                            masked[1]["reads"].add(read)
                            bad = True
                        else:
                            # ----===>---<===---                   dangling-ends
                            masked[2]["reads"].add(read)
                            bad = True
                    else:
                        # --===>--===>-- or --<===--<===-- or same errors
                        masked[3]["reads"].add(read)
                        bad = True
                elif (abs(ps1 - ps2) < max_molecule_length and sd2 != sd1
                      and (ps2 > ps1) != sd2):
                    # different fragments but facing and very close
                    masked[4]["reads"].add(read)
                    bad = True
            # distances from each read start to its fragment boundaries
            diff11 = re1 - ps1
            diff12 = ps1 - rs1
            diff21 = re2 - ps2
            diff22 = ps2 - rs2
            # filter 5: a read start is too close to a RE cutting site
            if ((diff11 < re_proximity) or (diff12 < re_proximity)
                    or (diff21 < re_proximity) or (diff22 < re_proximity)):
                masked[5]["reads"].add(read)
                bad = True
            # filter 10: a read start is too far from both fragment ends
            if (((diff11 > min_dist_to_re) and (diff12 > min_dist_to_re)) or
                ((diff21 > min_dist_to_re) and (diff22 > min_dist_to_re))):
                masked[10]["reads"].add(read)
                bad = True
            # filters 6 and 7: fragment length out of bounds
            dif1 = re1 - rs1
            dif2 = re2 - rs2
            if (dif1 < min_frag_size) or (dif2 < min_frag_size):
                masked[6]["reads"].add(read)
                bad = True
            if (dif1 > max_frag_size) or (dif2 > max_frag_size):
                masked[7]["reads"].add(read)
                bad = True
            # filter 8: either end sits on an over-represented fragment
            if (frag_count.get((cr1, res1), 0) > cut or frag_count.get(
                (cr2, res2), 0) > cut):
                masked[8]["reads"].add(read)
                bad = True
            # filter 9: this pair of start positions was already seen (PCR dup)
            # NOTE(review): keys built by string concatenation, so
            # ('chr1', '23') collides with ('chr12', '3') — confirm intended
            uniq_key = tuple(sorted((cr1 + pos1, cr2 + pos2)))
            if uniq_key in uniq_check:
                masked[9]["reads"].add(read)
                bad = True
            else:
                uniq_check.add(uniq_key)
            bads += bad  # bool counts as 1 when the pair was filtered
            line = fhandler.next()
            total += 1
    except StopIteration:
        # reached end of input file
        pass
    fhandler.close()
    if savedata:
        # write the per-filter counts to a tab-separated summary file
        out = open(savedata, 'w')
        out.write('TOTAL\t%d\n' % total)
        for k in xrange(1, len(masked) + 1):
            out.write('%s\t%d\n' %
                      (masked[k]['name'], len(masked[k]['reads'])))
        out.write('Valid pairs\t%d\n' % (total - bads))
        out.close()
    if verbose:
        print 'Filtered reads (and percentage of total):\n'
        print '     %-25s : %12d (100.00%%)' % ('TOTAL mapped', total)
        print '  ' + '-' * 53
        for k in xrange(1, len(masked) + 1):
            print '  %2d- %-25s : %12d (%6.2f%%)' % (
                k, masked[k]['name'], len(masked[k]['reads']),
                float(len(masked[k]['reads'])) / total * 100)
        print '\n     %-25s : %12d (%6.2f%%)' % ('Valid-pairs', total - bads,
                                                 float(total - bads) /
                                                 (total) * 100)
    del (uniq_check)
    return masked