Ejemplo n.º 1
0
 def get_mapped_filtered_filename(self, samplename, fragment, PCR=1):
     '''Return the filename(s) of the mapped and filtered reads of one sample.

     Thin wrapper around
     hivwholeseq.patients.filenames.get_mapped_filtered_filename that
     supplies self.patient as the patient argument.

     Parameters:
        samplename: sample name, forwarded unchanged
        fragment: fragment identifier, forwarded unchanged
        PCR: forwarded as the PCR keyword argument (default 1)

     Returns:
        Whatever the module-level function returns.
     '''
     # Local import under an alias, so the method name is not reused inside
     # its own body
     from hivwholeseq.patients.filenames import \
             get_mapped_filtered_filename as _filename_for
     filename = _filename_for(self.patient, samplename, fragment, PCR=PCR)
     return filename
Ejemplo n.º 2
0
 def get_mapped_filtered_filename(self, fragment, PCR=1, **kwargs):
     '''Return the filename(s) of the mapped and filtered reads of this sample.

     Thin wrapper around
     hivwholeseq.patients.filenames.get_mapped_filtered_filename that
     supplies self.patient and self.name as patient and sample arguments.

     Parameters:
        fragment: fragment identifier, forwarded unchanged
        PCR: forwarded as the PCR keyword argument (default 1)
        **kwargs: any further keyword arguments, forwarded unchanged

     Returns:
        Whatever the module-level function returns.
     '''
     # Local import under an alias, so the method name is not reused inside
     # its own body
     from hivwholeseq.patients.filenames import \
             get_mapped_filtered_filename as _filename_for
     filename = _filename_for(self.patient, self.name, fragment,
                              PCR=PCR, **kwargs)
     return filename
Ejemplo n.º 3
0
# Modules
import os
import argparse
import datetime
from hivwholeseq.patients.patients import load_samples_sequenced, Patient, \
        SamplePat
from hivwholeseq.patients.filenames import get_mapped_filtered_filename
from hivwholeseq.utils.generic import modification_date
from hivwholeseq.store.check_patients import (
    pretty_print_info, pretty_print_info_genomewide)


# Globals
# Character widths used with str.format when lining up report columns
# (presumably title column and status cells -- confirm against the callers)
title_len = 15
cell_len = 7


def get_decontaminated_filename(*args, **kwargs):
    '''Return the filename of the decontaminated mapped and filtered reads.

    Forwards all positional and keyword arguments to
    get_mapped_filtered_filename with decontaminated=True added. Passing
    decontaminated explicitly raises TypeError (duplicate keyword), exactly
    as with the previous lambda.
    '''
    return get_mapped_filtered_filename(*args, decontaminated=True, **kwargs)



# Script
if __name__ == '__main__':


    # Parse input args
    # --samples: one or more sample names (nargs='+'); the flag is optional,
    #            so args.samples is None when it is not given
    # --verbose: integer verbosity level, 0 by default
    parser = argparse.ArgumentParser(description='Check patient samples')
    parser.add_argument('--samples', nargs='+',
                        help='Samples to analyze')
    parser.add_argument('--verbose', type=int, default=0,
                        help='Verbosity level [0-3]')

    # Reads sys.argv; argparse exits the program on invalid arguments
    args = parser.parse_args()
Ejemplo n.º 4
0
def check_pipeline_patient(p, VERBOSE=0):
    '''Check patient pipeline'''
    from hivwholeseq.utils.exceptions import PipelineError

    def print_info_summary(p):
        n_samples = len(p.samples)
        p.discard_nonsequenced_samples()
        n_samples_seq = len(p.samples)
        print p.name, '# samples:', str(n_samples) + ' (' + str(
            n_samples_seq) + ' sequenced)'
        title = 'Samples'
        line = ('{:<' + str(title_len) + '}').format(title + ':')
        line = line + ' '.join(p.samples.index.tolist())
        print line

        fn = p.folder
        title = 'Folder'
        line = ('{:<' + str(title_len) + '}').format(title + ':')
        if os.path.isdir(fn):
            status = 'OK'
        else:
            status = 'MISS'
        line = line + ('{:<' + str(cell_len) + '}').format(status)
        print line

        if status != 'OK':
            print ''
            raise PipelineError('Missing patient folder')

        return status

    def print_info_references(p):
        '''Print info on references'''
        title = 'References'
        line = ('{:<' + str(title_len) + '}').format(title + ':')
        stati = []
        for fragment in ('F' + str(i + 1) for i in xrange(6)):
            fn = p.get_reference_filename(fragment)
            if os.path.isfile(fn):
                status = 'OK'
                p.mod_dates[('reference', fragment)] = modification_date(fn)
            else:
                status = 'MISS'
            stati.append(status)
            line = line + fragment + ': ' + (
                '{:>' + str(cell_len - len(fragment) - 1) +
                '}').format(status) + '  '
        print line

        if frozenset(stati) != frozenset(['OK']):
            print ''
            raise PipelineError('Amplicon reference failed!')

        title = 'Genome ref'
        line = ('{:<' + str(title_len) + '}').format(title + ':')
        fn = p.get_reference_filename('genomewide', 'fasta')
        if os.path.isfile(fn):
            status = 'OK'
            p.mod_dates[('reference', 'genomewide')] = modification_date(fn)
        else:
            status = 'MISS'
        line = line + ('{:<' + str(cell_len) + '}').format(status)
        print line

        if status != 'OK':
            print ''
            raise PipelineError('Genomewide reference failed!')

        check_reference_overlap(p)

        title = 'Annotated'
        line = ('{:<' + str(title_len) + '}').format(title + ':')
        fn = p.get_reference_filename('genomewide', 'gb')
        if os.path.isfile(fn):
            md = modification_date(fn)
            if md >= p.mod_dates[('reference', 'genomewide')]:
                status = 'OK'
            else:
                status = 'OLD'
        else:
            status = 'MISS'
        line = line + ('{:<' + str(cell_len) + '}').format(status)
        print line
        if status != 'OK':
            print ''
            raise PipelineError('Annotated reference failed!')

    p.mod_dates = {}

    print_info_summary(p)

    print_info_references(p)

    from hivwholeseq.patients.filenames import get_mapped_filtered_filename
    print_info(
        p, 'Map + filter', 'filter',
        lambda pn, sn, fr: get_mapped_filtered_filename(
            pn, sn, fr, decontaminated=False), 'reference')

    print_info(
        p, 'Decontaminate', 'decontaminate',
        lambda pn, sn, fr: get_mapped_filtered_filename(
            pn, sn, fr, decontaminated=True), 'filter')

    print_info(p, 'Consensus', 'consensus', 'get_consensus_filename',
               'decontaminate')

    print_info_genomewide(p, 'Cons genomewide', 'consensus',
                          'get_consensus_filename')

    print_info(p, 'Allele counts', 'allele counts',
               'get_allele_counts_filename', 'decontaminate')

    print_info(p, 'Allele cocounts', 'allele cocounts',
               'get_allele_cocounts_filename', 'decontaminate')

    print_info_genomewide(p,
                          'Allele counts genomewide',
                          'allele counts',
                          'get_allele_counts_filename',
                          require_all=False)

    print_info_patient(p, 'Maps to HXB2', 'reference',
                       'get_map_coordinates_reference_filename', 'reference')

    print ''
Ejemplo n.º 5
0
'''
# Modules
import os
import argparse
import datetime
from hivwholeseq.patients.patients import load_samples_sequenced, Patient, \
        SamplePat
from hivwholeseq.patients.filenames import get_mapped_filtered_filename
from hivwholeseq.utils.generic import modification_date
from hivwholeseq.store.check_patients import (pretty_print_info,
                                              pretty_print_info_genomewide)

# Globals
# Character widths used with str.format when lining up report columns
# (title column and status cells, respectively)
title_len = 15
cell_len = 7


def get_decontaminated_filename(*args, **kwargs):
    '''Return the filename of the decontaminated mapped and filtered reads.

    Forwards all positional and keyword arguments to
    get_mapped_filtered_filename with decontaminated=True added. Passing
    decontaminated explicitly raises TypeError (duplicate keyword), exactly
    as with the previous lambda.
    '''
    return get_mapped_filtered_filename(*args, decontaminated=True, **kwargs)

# Script
if __name__ == '__main__':

    # Parse input args
    # --samples: one or more sample names (nargs='+'); the flag is optional,
    #            so args.samples is None when it is not given
    # --verbose: integer verbosity level, 0 by default
    parser = argparse.ArgumentParser(description='Check patient samples')
    parser.add_argument('--samples', nargs='+', help='Samples to analyze')
    parser.add_argument('--verbose',
                        type=int,
                        default=0,
                        help='Verbosity level [0-3]')

    # Reads sys.argv; argparse exits the program on invalid arguments
    args = parser.parse_args()
    # Unpack the parsed options into plain names for the rest of the script
    VERBOSE = args.verbose
    samplenames = args.samples
def check_pipeline_patient(p, VERBOSE=0):
    '''Check patient pipeline'''
    from hivwholeseq.utils.exceptions import PipelineError

    def print_info_summary(p):
        n_samples = len(p.samples)
        p.discard_nonsequenced_samples()
        n_samples_seq = len(p.samples)
        print p.name, '# samples:', str(n_samples)+ ' ('+str(n_samples_seq)+' sequenced)'
        title = 'Samples'
        line = ('{:<'+str(title_len)+'}').format(title+':')
        line = line+' '.join(p.samples.index.tolist())
        print line

        fn = p.folder
        title = 'Folder'
        line = ('{:<'+str(title_len)+'}').format(title+':')
        if os.path.isdir(fn):
            status = 'OK'
        else:
            status = 'MISS'
        line = line + ('{:<'+str(cell_len)+'}').format(status)
        print line

        if status != 'OK':
            print ''
            raise PipelineError('Missing patient folder')

        return status

    def print_info_references(p):
        '''Print info on references'''
        title = 'References'
        line = ('{:<'+str(title_len)+'}').format(title+':')
        stati = []
        for fragment in ('F'+str(i+1) for i in xrange(6)):
            fn = p.get_reference_filename(fragment)
            if os.path.isfile(fn):
                status = 'OK'
                p.mod_dates[('reference', fragment)] = modification_date(fn)
            else:
                status = 'MISS'
            stati.append(status)
            line = line + fragment + ': ' + ('{:>'+str(cell_len - len(fragment) - 1)+'}').format(status) + '  '
        print line
    
        if frozenset(stati) != frozenset(['OK']):
            print ''
            raise PipelineError('Amplicon reference failed!')
    
        title = 'Genome ref'
        line = ('{:<'+str(title_len)+'}').format(title+':')
        fn = p.get_reference_filename('genomewide', 'fasta')
        if os.path.isfile(fn):
            status = 'OK'
            p.mod_dates[('reference', 'genomewide')] = modification_date(fn)
        else:
            status = 'MISS'
        line = line + ('{:<'+str(cell_len)+'}').format(status)
        print line
    
        if status != 'OK':
            print ''
            raise PipelineError('Genomewide reference failed!')
    
        check_reference_overlap(p)

        title = 'Annotated'
        line = ('{:<'+str(title_len)+'}').format(title+':')
        fn = p.get_reference_filename('genomewide', 'gb')
        if os.path.isfile(fn):
            md = modification_date(fn)
            if md >= p.mod_dates[('reference', 'genomewide')]:
                status = 'OK'
            else:
                status = 'OLD'
        else:
            status = 'MISS'
        line = line + ('{:<'+str(cell_len)+'}').format(status)
        print line
        if status != 'OK':
            print ''
            raise PipelineError('Annotated reference failed!')


    p.mod_dates = {}
    
    print_info_summary(p)

    print_info_references(p)

    from hivwholeseq.patients.filenames import get_mapped_filtered_filename
    print_info(p, 'Map + filter', 'filter',
               lambda pn, sn, fr: get_mapped_filtered_filename(pn, sn, fr, decontaminated=False),
               'reference')

    print_info(p, 'Decontaminate', 'decontaminate',
               lambda pn, sn, fr: get_mapped_filtered_filename(pn, sn, fr, decontaminated=True),
               'filter')

    print_info(p, 'Consensus', 'consensus',
               'get_consensus_filename',
               'decontaminate')

    print_info_genomewide(p, 'Cons genomewide', 'consensus',
                          'get_consensus_filename')

    print_info(p, 'Allele counts', 'allele counts',
               'get_allele_counts_filename',
               'decontaminate')

    print_info(p, 'Allele cocounts', 'allele cocounts',
               'get_allele_cocounts_filename',
               'decontaminate')

    print_info_genomewide(p, 'Allele counts genomewide', 'allele counts',
                          'get_allele_counts_filename',
                          require_all=False)

    print_info_patient(p, 'Maps to HXB2', 'reference',
                       'get_map_coordinates_reference_filename',
                       'reference')


    print ''
Ejemplo n.º 7
0
def filter_mapped_reads(sample,
                        fragment,
                        PCR=1,
                        maxreads=-1,
                        VERBOSE=0,
                        n_cycles=600,
                        max_mismatches=100,
                        match_len_min=30,
                        trim_bad_cigars=3,
                        summary=True):
    '''Filter the reads to good chunks'''
    pname = sample.patient
    samplename_pat = sample.name
    samplenames_seq = sample.samples_seq.index.tolist()

    if VERBOSE >= 1:
        print 'Filtering reads:', pname, samplename_pat, fragment, PCR

    reffilename = get_initial_reference_filename(pname, fragment)
    refseq = SeqIO.read(reffilename, 'fasta')
    ref = np.array(refseq)

    outfilename = get_mapped_filtered_filename(pname,
                                               samplename_pat,
                                               fragment,
                                               type='bam',
                                               PCR=PCR,
                                               decontaminated=False)
    trashfilename = outfilename[:-4] + '_trashed.bam'

    infilenames = [
        get_mapped_to_initial_filename(pname,
                                       samplename_pat,
                                       samplename,
                                       fragment,
                                       type='bam',
                                       PCR=PCR)
        for samplename in samplenames_seq
    ]
    infilenames = filter(os.path.isfile, infilenames)
    if not len(infilenames):
        print('WARNING: No mapped files found: ' +
              ', '.join([pname, samplename_pat, fragment,
                         str(PCR)]))
        return

    # Take reads evenly distributed across sequencing repetitions
    maxreads /= len(infilenames)

    if VERBOSE >= 2:
        print 'Input mapped filenames:',
        if len(infilenames) >= 2:
            print ''
        print '\n'.join(infilenames)

    # Use first file as template for the new bamfile
    infilename = infilenames[0]
    if not os.path.isfile(infilename):
        convert_sam_to_bam(infilename)

    with pysam.Samfile(infilename, 'rb') as bamfile:
        with pysam.Samfile(outfilename, 'wb', template=bamfile) as outfile,\
             pysam.Samfile(trashfilename, 'wb', template=bamfile) as trashfile:

            n_good = 0
            n_wrongname = 0
            n_unmapped = 0
            n_unpaired = 0
            n_mutator = 0
            n_badcigar = 0
            n_tiny = 0
            binsize = 200
            hist_distance_from_consensus = np.zeros(n_cycles + 1, int)
            hist_dist_along = np.zeros((len(ref) // binsize + 1, n_cycles + 1),
                                       int)

            # Iterate over input files, the first is already open
            for infilename in infilenames:

                if infilename != infilename[0]:
                    file_open = lambda: pysam.Samfile(infilename, 'rb')
                    file_close = lambda f: f.close()

                    if not os.path.isfile(infilename):
                        convert_sam_to_bam(infilename)

                else:
                    file_open = lambda: bamfile
                    file_close = lambda f: None

                try:
                    bamfile = file_open()

                    for irp, reads in enumerate(pair_generator(bamfile)):
                        if irp == maxreads:
                            break

                        pair_type = filter_read_pair(
                            reads,
                            ref,
                            hist_distance_from_consensus,
                            hist_dist_along,
                            binsize,
                            max_mismatches=max_mismatches,
                            match_len_min=match_len_min,
                            trim_bad_cigars=trim_bad_cigars,
                            VERBOSE=VERBOSE)

                        if pair_type == 'unmapped':
                            n_unmapped += 1
                            map(trashfile.write, reads)

                        elif pair_type == 'unpaired':
                            n_unpaired += 1
                            map(trashfile.write, reads)

                        elif pair_type == 'mutator':
                            n_mutator += 1
                            map(trashfile.write, reads)

                        elif pair_type == 'bad_cigar':
                            n_badcigar += 1
                            map(trashfile.write, reads)

                        elif pair_type == 'tiny':
                            n_tiny += 1
                            map(trashfile.write, reads)

                        else:
                            n_good += 1
                            map(outfile.write, reads)

                finally:
                    file_close(bamfile)

    if VERBOSE >= 1:
        print 'Read pairs: '
        print 'Good:', n_good
        print 'Unmapped:', n_unmapped
        print 'Unpaired:', n_unpaired
        print 'Many-mutations:', n_mutator
        print 'Bad CIGARs:', n_badcigar
        print 'Tiny:', n_tiny
        print

    if summary:
        sfn = get_filter_mapped_init_summary_filename(pname,
                                                      samplename_pat,
                                                      fragment,
                                                      PCR=PCR)
        with open(sfn, 'a') as f:
            f.write('Filter results: pname ' + pname + ', ' + samplename_pat +
                    ', ' + fragment + '\n')
            f.write('Total:\t\t\t' + str(irp + 1) + '\n')
            f.write('Good:\t\t\t' + str(n_good) + '\n')
            f.write('Unmapped:\t\t' + str(n_unmapped) + '\n')
            f.write('Unpaired:\t\t' + str(n_unpaired) + '\n')
            f.write('Many-mutations:\t\t' + str(n_mutator) + '\n')
            f.write('Bad CIGARs:\t\t' + str(n_badcigar) + '\n')
            f.write('Tiny:\t\t\t' + str(n_tiny) + '\n')
Ejemplo n.º 8
0
 def get_mapped_filtered_filename(self, samplename, fragment, PCR=1):
     '''Return the filename(s) of the mapped and filtered reads of one sample.

     Thin wrapper around
     hivwholeseq.patients.filenames.get_mapped_filtered_filename that
     supplies self.patient as the patient argument.

     Parameters:
        samplename: sample name, forwarded unchanged
        fragment: fragment identifier, forwarded unchanged
        PCR: forwarded as the PCR keyword argument (default 1)

     Returns:
        Whatever the module-level function returns.
     '''
     # Local import under an alias, so the method name is not reused inside
     # its own body
     from hivwholeseq.patients.filenames import \
             get_mapped_filtered_filename as _filename_for
     filename = _filename_for(self.patient,
                              samplename,
                              fragment,
                              PCR=PCR)
     return filename
Ejemplo n.º 9
0
def filter_mapped_reads(sample, fragment,
                        PCR=1,
                        maxreads=-1,
                        VERBOSE=0,
                        n_cycles=600,
                        max_mismatches=100,
                        match_len_min=30,
                        trim_bad_cigars=3,
                        summary=True):
    '''Filter the reads to good chunks'''
    pname = sample.patient
    samplename_pat = sample.name
    samplenames_seq = sample.samples_seq.index.tolist()

    if VERBOSE >= 1:
        print 'Filtering reads:', pname, samplename_pat, fragment, PCR

    reffilename = get_initial_reference_filename(pname, fragment)
    refseq = SeqIO.read(reffilename, 'fasta')
    ref = np.array(refseq)

    outfilename = get_mapped_filtered_filename(pname, samplename_pat, fragment,
                                               type='bam', PCR=PCR,
                                               decontaminated=False)
    trashfilename = outfilename[:-4]+'_trashed.bam'

    infilenames = [get_mapped_to_initial_filename(pname, samplename_pat,
                                                 samplename, fragment,
                                                 type='bam', PCR=PCR)
                   for samplename in samplenames_seq]
    infilenames = filter(os.path.isfile, infilenames)
    if not len(infilenames):
        print ('WARNING: No mapped files found: '+', '.join([pname, samplename_pat,
                                                              fragment, str(PCR)]))
        return

    # Take reads evenly distributed across sequencing repetitions
    maxreads /= len(infilenames)

    if VERBOSE >= 2:
        print 'Input mapped filenames:',
        if len(infilenames) >= 2:
            print ''
        print '\n'.join(infilenames)

    # Use first file as template for the new bamfile
    infilename = infilenames[0]
    if not os.path.isfile(infilename):
        convert_sam_to_bam(infilename)
 
    with pysam.Samfile(infilename, 'rb') as bamfile:
        with pysam.Samfile(outfilename, 'wb', template=bamfile) as outfile,\
             pysam.Samfile(trashfilename, 'wb', template=bamfile) as trashfile:
 
            n_good = 0
            n_wrongname = 0
            n_unmapped = 0
            n_unpaired = 0
            n_mutator = 0
            n_badcigar = 0
            n_tiny = 0
            binsize = 200
            hist_distance_from_consensus = np.zeros(n_cycles + 1, int)
            hist_dist_along = np.zeros((len(ref) // binsize + 1, n_cycles + 1), int)

            # Iterate over input files, the first is already open
            for infilename in infilenames:

                if infilename != infilename[0]:
                    file_open = lambda: pysam.Samfile(infilename, 'rb')
                    file_close = lambda f: f.close()

                    if not os.path.isfile(infilename):
                        convert_sam_to_bam(infilename)

                else:
                    file_open = lambda: bamfile
                    file_close = lambda f: None

                try:
                    bamfile = file_open()
    
                    for irp, reads in enumerate(pair_generator(bamfile)):
                        if irp == maxreads:
                            break

                        pair_type = filter_read_pair(reads, ref,
                                                     hist_distance_from_consensus,
                                                     hist_dist_along,
                                                     binsize,
                                                     max_mismatches=max_mismatches,
                                                     match_len_min=match_len_min,
                                                     trim_bad_cigars=trim_bad_cigars,
                                                     VERBOSE=VERBOSE)
                    
                        if pair_type == 'unmapped':
                            n_unmapped += 1
                            map(trashfile.write, reads)

                        elif pair_type == 'unpaired':
                            n_unpaired += 1
                            map(trashfile.write, reads)

                        elif pair_type == 'mutator':
                            n_mutator += 1
                            map(trashfile.write, reads)

                        elif pair_type == 'bad_cigar':
                            n_badcigar += 1
                            map(trashfile.write, reads)

                        elif pair_type == 'tiny':
                            n_tiny += 1
                            map(trashfile.write, reads)

                        else:
                            n_good += 1
                            map(outfile.write, reads)

                finally:
                    file_close(bamfile)

    if VERBOSE >= 1:
        print 'Read pairs: '
        print 'Good:', n_good
        print 'Unmapped:', n_unmapped
        print 'Unpaired:', n_unpaired
        print 'Many-mutations:', n_mutator
        print 'Bad CIGARs:', n_badcigar
        print 'Tiny:', n_tiny
        print

    if summary:
        sfn = get_filter_mapped_init_summary_filename(pname, samplename_pat, fragment, PCR=PCR)
        with open(sfn, 'a') as f:
            f.write('Filter results: pname '+pname+', '+samplename_pat+', '+fragment+'\n')
            f.write('Total:\t\t\t'+str(irp + 1)+'\n')
            f.write('Good:\t\t\t'+str(n_good)+'\n')
            f.write('Unmapped:\t\t'+str(n_unmapped)+'\n')
            f.write('Unpaired:\t\t'+str(n_unpaired)+'\n')
            f.write('Many-mutations:\t\t'+str(n_mutator)+'\n')
            f.write('Bad CIGARs:\t\t'+str(n_badcigar)+'\n')
            f.write('Tiny:\t\t\t'+str(n_tiny)+'\n')