def filter_mapped_reads(sample,
                        fragment,
                        PCR=1,
                        maxreads=-1,
                        VERBOSE=0,
                        n_cycles=600,
                        max_mismatches=100,
                        match_len_min=30,
                        trim_bad_cigars=3,
                        summary=True):
    '''Filter the mapped read pairs of one sample and fragment.

    Every read pair found in the existing per-sequencing-run BAM files is
    classified by filter_read_pair. Pairs classified as 'unmapped',
    'unpaired', 'mutator', 'bad_cigar' or 'tiny' are written to a
    "*_trashed.bam" file, any other pair is written to the filtered BAM file.

    Parameters:
       sample: object providing .patient, .name and .samples_seq (whose
               index lists the sequencing sample names)
       fragment: fragment name, used to locate reference and BAM files
       PCR: PCR type, forwarded to the filename helpers
       maxreads: total budget of read pairs, shared evenly among the input
                 files; a negative value means no limit
       VERBOSE: verbosity level (>= 1 prints progress and counts, >= 2 also
                lists the input files)
       n_cycles: sets the size (n_cycles + 1) of the distance histograms
                 handed to filter_read_pair
       max_mismatches, match_len_min, trim_bad_cigars: forwarded unchanged
                 to filter_read_pair
       summary: if True, append the pair counts to the summary file

    Returns:
       None. If no input BAM file exists, a warning is printed and nothing
       is written.
    '''
    pname = sample.patient
    samplename_pat = sample.name
    samplenames_seq = sample.samples_seq.index.tolist()

    # NOTE: single-argument print(...) calls give the same output under the
    # Python 2 print statement and the Python 3 print function.
    if VERBOSE >= 1:
        print(' '.join(['Filtering reads:', str(pname), str(samplename_pat),
                        str(fragment), str(PCR)]))

    reffilename = get_initial_reference_filename(pname, fragment)
    refseq = SeqIO.read(reffilename, 'fasta')
    ref = np.array(refseq)

    outfilename = get_mapped_filtered_filename(pname,
                                               samplename_pat,
                                               fragment,
                                               type='bam',
                                               PCR=PCR,
                                               decontaminated=False)
    trashfilename = outfilename[:-4] + '_trashed.bam'

    candidate_filenames = [
        get_mapped_to_initial_filename(pname,
                                       samplename_pat,
                                       samplename,
                                       fragment,
                                       type='bam',
                                       PCR=PCR)
        for samplename in samplenames_seq
    ]
    # Only existing BAM files are used, so no SAM -> BAM conversion is ever
    # needed further down.
    infilenames = [fn for fn in candidate_filenames if os.path.isfile(fn)]
    if not infilenames:
        print('WARNING: No mapped files found: ' +
              ', '.join([pname, samplename_pat, fragment,
                         str(PCR)]))
        return

    # Take reads evenly distributed across sequencing repetitions. Integer
    # division is explicit so the per-file budget stays an int; any negative
    # budget keeps meaning "no limit".
    if maxreads >= 0:
        maxreads_per_file = maxreads // len(infilenames)
    else:
        maxreads_per_file = -1

    if VERBOSE >= 2:
        # With several files the list starts on its own line
        separator = ' \n' if len(infilenames) >= 2 else ' '
        print('Input mapped filenames:' + separator + '\n'.join(infilenames))

    # Pair types that go to the trash file, with their running counts
    n_trashed = {
        'unmapped': 0,
        'unpaired': 0,
        'mutator': 0,
        'bad_cigar': 0,
        'tiny': 0,
    }
    n_good = 0
    n_total = 0

    # Histograms are handed to filter_read_pair (presumably filled there);
    # they are not used again in this function.
    binsize = 200
    hist_distance_from_consensus = np.zeros(n_cycles + 1, int)
    hist_dist_along = np.zeros((len(ref) // binsize + 1, n_cycles + 1),
                               int)

    # Use first file as template for the new bamfiles
    with pysam.Samfile(infilenames[0], 'rb') as bamfile:
        with pysam.Samfile(outfilename, 'wb', template=bamfile) as outfile,\
             pysam.Samfile(trashfilename, 'wb', template=bamfile) as trashfile:

            # Iterate over input files, the first is already open
            for ifile, infilename in enumerate(infilenames):
                if ifile == 0:
                    # Reuse the template handle; the outer "with" closes it
                    readfile = bamfile
                else:
                    readfile = pysam.Samfile(infilename, 'rb')

                try:
                    for irp, reads in enumerate(pair_generator(readfile)):
                        if irp == maxreads_per_file:
                            break

                        pair_type = filter_read_pair(
                            reads,
                            ref,
                            hist_distance_from_consensus,
                            hist_dist_along,
                            binsize,
                            max_mismatches=max_mismatches,
                            match_len_min=match_len_min,
                            trim_bad_cigars=trim_bad_cigars,
                            VERBOSE=VERBOSE)

                        # Count every classified pair, across all files
                        n_total += 1

                        if pair_type in n_trashed:
                            n_trashed[pair_type] += 1
                            destination = trashfile
                        else:
                            n_good += 1
                            destination = outfile

                        for read in reads:
                            destination.write(read)

                finally:
                    if readfile is not bamfile:
                        readfile.close()

    n_unmapped = n_trashed['unmapped']
    n_unpaired = n_trashed['unpaired']
    n_mutator = n_trashed['mutator']
    n_badcigar = n_trashed['bad_cigar']
    n_tiny = n_trashed['tiny']

    if VERBOSE >= 1:
        print('Read pairs: ')
        print('Good: ' + str(n_good))
        print('Unmapped: ' + str(n_unmapped))
        print('Unpaired: ' + str(n_unpaired))
        print('Many-mutations: ' + str(n_mutator))
        print('Bad CIGARs: ' + str(n_badcigar))
        print('Tiny: ' + str(n_tiny))
        print('')

    if summary:
        sfn = get_filter_mapped_init_summary_filename(pname,
                                                      samplename_pat,
                                                      fragment,
                                                      PCR=PCR)
        # Append: the file may already hold text written by the caller
        with open(sfn, 'a') as f:
            f.write('Filter results: pname ' + pname + ', ' + samplename_pat +
                    ', ' + fragment + '\n')
            f.write('Total:\t\t\t' + str(n_total) + '\n')
            f.write('Good:\t\t\t' + str(n_good) + '\n')
            f.write('Unmapped:\t\t' + str(n_unmapped) + '\n')
            f.write('Unpaired:\t\t' + str(n_unpaired) + '\n')
            f.write('Many-mutations:\t\t' + str(n_mutator) + '\n')
            f.write('Bad CIGARs:\t\t' + str(n_badcigar) + '\n')
            f.write('Tiny:\t\t\t' + str(n_tiny) + '\n')
def filter_mapped_reads(sample, fragment,
                        PCR=1,
                        maxreads=-1,
                        VERBOSE=0,
                        n_cycles=600,
                        max_mismatches=100,
                        match_len_min=30,
                        trim_bad_cigars=3,
                        summary=True):
    '''Filter the mapped read pairs of one sample and fragment.

    Every read pair found in the existing per-sequencing-run BAM files is
    classified by filter_read_pair. Pairs classified as 'unmapped',
    'unpaired', 'mutator', 'bad_cigar' or 'tiny' are written to a
    "*_trashed.bam" file, any other pair is written to the filtered BAM file.

    Parameters:
       sample: object providing .patient, .name and .samples_seq (whose
               index lists the sequencing sample names)
       fragment: fragment name, used to locate reference and BAM files
       PCR: PCR type, forwarded to the filename helpers
       maxreads: total budget of read pairs, shared evenly among the input
                 files; a negative value means no limit
       VERBOSE: verbosity level (>= 1 prints progress and counts, >= 2 also
                lists the input files)
       n_cycles: sets the size (n_cycles + 1) of the distance histograms
                 handed to filter_read_pair
       max_mismatches, match_len_min, trim_bad_cigars: forwarded unchanged
                 to filter_read_pair
       summary: if True, append the pair counts to the summary file

    Returns:
       None. If no input BAM file exists, a warning is printed and nothing
       is written.
    '''
    pname = sample.patient
    samplename_pat = sample.name
    samplenames_seq = sample.samples_seq.index.tolist()

    # NOTE: single-argument print(...) calls give the same output under the
    # Python 2 print statement and the Python 3 print function.
    if VERBOSE >= 1:
        print(' '.join(['Filtering reads:', str(pname), str(samplename_pat),
                        str(fragment), str(PCR)]))

    reffilename = get_initial_reference_filename(pname, fragment)
    refseq = SeqIO.read(reffilename, 'fasta')
    ref = np.array(refseq)

    outfilename = get_mapped_filtered_filename(pname, samplename_pat, fragment,
                                               type='bam', PCR=PCR,
                                               decontaminated=False)
    trashfilename = outfilename[:-4]+'_trashed.bam'

    candidate_filenames = [get_mapped_to_initial_filename(pname, samplename_pat,
                                                          samplename, fragment,
                                                          type='bam', PCR=PCR)
                           for samplename in samplenames_seq]
    # Only existing BAM files are used, so no SAM -> BAM conversion is ever
    # needed further down.
    infilenames = [fn for fn in candidate_filenames if os.path.isfile(fn)]
    if not infilenames:
        print ('WARNING: No mapped files found: '+', '.join([pname, samplename_pat,
                                                              fragment, str(PCR)]))
        return

    # Take reads evenly distributed across sequencing repetitions. Integer
    # division is explicit so the per-file budget stays an int; any negative
    # budget keeps meaning "no limit".
    if maxreads >= 0:
        maxreads_per_file = maxreads // len(infilenames)
    else:
        maxreads_per_file = -1

    if VERBOSE >= 2:
        # With several files the list starts on its own line
        separator = ' \n' if len(infilenames) >= 2 else ' '
        print('Input mapped filenames:' + separator + '\n'.join(infilenames))

    # Pair types that go to the trash file, with their running counts
    n_trashed = {'unmapped': 0,
                 'unpaired': 0,
                 'mutator': 0,
                 'bad_cigar': 0,
                 'tiny': 0}
    n_good = 0
    n_total = 0

    # Histograms are handed to filter_read_pair (presumably filled there);
    # they are not used again in this function.
    binsize = 200
    hist_distance_from_consensus = np.zeros(n_cycles + 1, int)
    hist_dist_along = np.zeros((len(ref) // binsize + 1, n_cycles + 1), int)

    # Use first file as template for the new bamfiles
    with pysam.Samfile(infilenames[0], 'rb') as bamfile:
        with pysam.Samfile(outfilename, 'wb', template=bamfile) as outfile,\
             pysam.Samfile(trashfilename, 'wb', template=bamfile) as trashfile:

            # Iterate over input files, the first is already open
            for ifile, infilename in enumerate(infilenames):
                if ifile == 0:
                    # Reuse the template handle; the outer "with" closes it
                    readfile = bamfile
                else:
                    readfile = pysam.Samfile(infilename, 'rb')

                try:
                    for irp, reads in enumerate(pair_generator(readfile)):
                        if irp == maxreads_per_file:
                            break

                        pair_type = filter_read_pair(reads, ref,
                                                     hist_distance_from_consensus,
                                                     hist_dist_along,
                                                     binsize,
                                                     max_mismatches=max_mismatches,
                                                     match_len_min=match_len_min,
                                                     trim_bad_cigars=trim_bad_cigars,
                                                     VERBOSE=VERBOSE)

                        # Count every classified pair, across all files
                        n_total += 1

                        if pair_type in n_trashed:
                            n_trashed[pair_type] += 1
                            destination = trashfile
                        else:
                            n_good += 1
                            destination = outfile

                        for read in reads:
                            destination.write(read)

                finally:
                    if readfile is not bamfile:
                        readfile.close()

    n_unmapped = n_trashed['unmapped']
    n_unpaired = n_trashed['unpaired']
    n_mutator = n_trashed['mutator']
    n_badcigar = n_trashed['bad_cigar']
    n_tiny = n_trashed['tiny']

    if VERBOSE >= 1:
        print('Read pairs: ')
        print('Good: ' + str(n_good))
        print('Unmapped: ' + str(n_unmapped))
        print('Unpaired: ' + str(n_unpaired))
        print('Many-mutations: ' + str(n_mutator))
        print('Bad CIGARs: ' + str(n_badcigar))
        print('Tiny: ' + str(n_tiny))
        print('')

    if summary:
        sfn = get_filter_mapped_init_summary_filename(pname, samplename_pat, fragment, PCR=PCR)
        # Append: the file may already hold text written by the caller
        with open(sfn, 'a') as f:
            f.write('Filter results: pname '+pname+', '+samplename_pat+', '+fragment+'\n')
            f.write('Total:\t\t\t'+str(n_total)+'\n')
            f.write('Good:\t\t\t'+str(n_good)+'\n')
            f.write('Unmapped:\t\t'+str(n_unmapped)+'\n')
            f.write('Unpaired:\t\t'+str(n_unpaired)+'\n')
            f.write('Many-mutations:\t\t'+str(n_mutator)+'\n')
            f.write('Bad CIGARs:\t\t'+str(n_badcigar)+'\n')
            f.write('Tiny:\t\t\t'+str(n_tiny)+'\n')
        pname = sample_pat.patient
        PCR = int(PCR)

        for fragment in fragments:
            if submit:
                fork_self(samplename_pat,
                          fragment,
                          VERBOSE=VERBOSE,
                          n_pairs=n_pairs,
                          PCR=PCR,
                          summary=summary)
                continue

            if summary:
                sfn = get_filter_mapped_init_summary_filename(pname,
                                                              samplename_pat,
                                                              fragment,
                                                              PCR=PCR)
                with open(sfn, 'w') as f:
                    f.write('Call: python filter_mapped_reads.py'+\
                            ' --samples '+samplename_pat+\
                            ' --fragments '+fragment+\
                            ' --verbose '+str(VERBOSE))
                    if n_pairs != -1:
                        f.write(' --maxreads ' + str(n_pairs))
                    f.write('\n')

            filter_mapped_reads(sample_pat,
                                fragment,
                                PCR=PCR,
                                VERBOSE=VERBOSE,
                                maxreads=n_pairs,
        samples_seq_group = samples_seq.loc[samples_seq.index.isin(samplenames_seq)]
        sample_pat.samples_seq = samples_seq_group
        pname = sample_pat.patient
        PCR = int(PCR)

        for fragment in fragments:
            if submit:
                fork_self(samplename_pat, fragment,
                          VERBOSE=VERBOSE,
                          n_pairs=n_pairs,
                          PCR=PCR,
                          summary=summary)
                continue

            if summary:
                sfn = get_filter_mapped_init_summary_filename(pname, samplename_pat, fragment, PCR=PCR)
                with open(sfn, 'w') as f:
                    f.write('Call: python filter_mapped_reads.py'+\
                            ' --samples '+samplename_pat+\
                            ' --fragments '+fragment+\
                            ' --verbose '+str(VERBOSE))
                    if n_pairs != -1:
                        f.write(' --maxreads '+str(n_pairs))
                    f.write('\n')

            filter_mapped_reads(sample_pat, fragment,
                                PCR=PCR,
                                VERBOSE=VERBOSE, maxreads=n_pairs,
                                summary=summary)