Example #1
def get_mapped_filenames(self, fragment, PCR=1):
    '''Get filename(s) of mapped and filtered reads'''
    samples_seq = self.samples_seq
    fns = [
        get_mapped_to_initial_filename(self.patient,
                                       self.name,
                                       samplename,
                                       fragment,
                                       PCR=PCR)
        for samplename, sample in samples_seq.iterrows()
    ]
    return fns
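A hedged usage sketch: this is a method on a patient-sample class, so the sample object below is a hypothetical stand-in exposing patient, name, and a pandas DataFrame samples_seq indexed by sequencing-sample name.

# Hypothetical usage (Python 2, matching the codebase): 'sample' is assumed
# to be a patient-sample object with .patient, .name and .samples_seq.
fns = sample.get_mapped_filenames('F1', PCR=1)
for fn in fns:
    print fn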
Example #2
def get_allele_frequency_trajectories(pname,
                                      samples,
                                      fragment,
                                      qual_min=30,
                                      VERBOSE=0):
    '''Scan the reads of all samples and write to a single file'''
    if VERBOSE >= 1:
        print 'Getting allele frequency trajectories:', pname, fragment

    from hivwholeseq.patients.filenames import get_initial_reference_filename, \
            get_mapped_to_initial_filename, get_allele_frequency_trajectories_filename, \
            get_allele_count_trajectories_filename
    from hivwholeseq.utils.one_site_statistics import get_allele_counts_insertions_from_file, \
            get_allele_counts_insertions_from_file_unfiltered, \
            filter_nus
    from Bio import SeqIO
    import numpy as np
    # 'alpha' (the nucleotide alphabet array) is assumed from the module
    # scope of the original file.

    refseq = SeqIO.read(get_initial_reference_filename(pname, fragment),
                        'fasta')

    # Prepare output data structures
    cos_traj = np.zeros((len(samples), len(alpha), len(refseq)), int)
    nus_traj = np.zeros((len(samples), len(alpha), len(refseq)))

    for it, sample in enumerate(samples):
        if VERBOSE >= 2:
            print pname, it, sample

        input_filename = get_mapped_to_initial_filename(pname,
                                                        sample,
                                                        fragment,
                                                        type='bam')
        (counts, inserts) = get_allele_counts_insertions_from_file_unfiltered(
            input_filename, len(refseq), qual_min=qual_min, VERBOSE=VERBOSE)
        # Take the total counts, blending in the read types
        cou = counts.sum(axis=0)
        cos_traj[it] = cou

        # Take the filtered frequencies, blending in the read types
        nu = filter_nus(counts)
        nus_traj[it] = nu

    #FIXME: test, etc.

    return (cos_traj, nus_traj)
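A minimal, hypothetical invocation; the patient name and sample list below are placeholders for entries from the patient tables.

pname = 'p1'                      # assumed patient name
samples = ['sample1', 'sample2']  # assumed patient-sample names
counts, freqs = get_allele_frequency_trajectories(pname, samples, 'F1',
                                                  qual_min=30, VERBOSE=1)
# counts: (n samples, len(alpha), reference length) integer array
# freqs:  filtered allele frequencies on the same grid
print counts.shape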
Example #3
def filter_mapped_reads(sample,
                        fragment,
                        PCR=1,
                        maxreads=-1,
                        VERBOSE=0,
                        n_cycles=600,
                        max_mismatches=100,
                        match_len_min=30,
                        trim_bad_cigars=3,
                        summary=True):
    '''Filter the reads to good chunks'''
    pname = sample.patient
    samplename_pat = sample.name
    samplenames_seq = sample.samples_seq.index.tolist()

    if VERBOSE >= 1:
        print 'Filtering reads:', pname, samplename_pat, fragment, PCR

    reffilename = get_initial_reference_filename(pname, fragment)
    refseq = SeqIO.read(reffilename, 'fasta')
    ref = np.array(refseq)

    outfilename = get_mapped_filtered_filename(pname,
                                               samplename_pat,
                                               fragment,
                                               type='bam',
                                               PCR=PCR,
                                               decontaminated=False)
    trashfilename = outfilename[:-4] + '_trashed.bam'

    infilenames = [
        get_mapped_to_initial_filename(pname,
                                       samplename_pat,
                                       samplename,
                                       fragment,
                                       type='bam',
                                       PCR=PCR)
        for samplename in samplenames_seq
    ]
    infilenames = filter(os.path.isfile, infilenames)
    if not len(infilenames):
        print('WARNING: No mapped files found: ' +
              ', '.join([pname, samplename_pat, fragment,
                         str(PCR)]))
        return

    # Take reads evenly distributed across sequencing repetitions
    maxreads /= len(infilenames)

    if VERBOSE >= 2:
        print 'Input mapped filenames:',
        if len(infilenames) >= 2:
            print ''
        print '\n'.join(infilenames)

    # Use first file as template for the new bamfile
    infilename = infilenames[0]
    if not os.path.isfile(infilename):
        convert_sam_to_bam(infilename)

    with pysam.Samfile(infilename, 'rb') as bamfile:
        with pysam.Samfile(outfilename, 'wb', template=bamfile) as outfile,\
             pysam.Samfile(trashfilename, 'wb', template=bamfile) as trashfile:

            n_good = 0
            n_wrongname = 0
            n_unmapped = 0
            n_unpaired = 0
            n_mutator = 0
            n_badcigar = 0
            n_tiny = 0
            binsize = 200
            hist_distance_from_consensus = np.zeros(n_cycles + 1, int)
            hist_dist_along = np.zeros((len(ref) // binsize + 1, n_cycles + 1),
                                       int)

            # Iterate over input files, the first is already open
            for infilename in infilenames:

                if infilename != infilenames[0]:
                    file_open = lambda: pysam.Samfile(infilename, 'rb')
                    file_close = lambda f: f.close()

                    if not os.path.isfile(infilename):
                        convert_sam_to_bam(infilename)

                else:
                    file_open = lambda: bamfile
                    file_close = lambda f: None

                try:
                    bamfile = file_open()

                    for irp, reads in enumerate(pair_generator(bamfile)):
                        if irp == maxreads:
                            break

                        pair_type = filter_read_pair(
                            reads,
                            ref,
                            hist_distance_from_consensus,
                            hist_dist_along,
                            binsize,
                            max_mismatches=max_mismatches,
                            match_len_min=match_len_min,
                            trim_bad_cigars=trim_bad_cigars,
                            VERBOSE=VERBOSE)

                        if pair_type == 'unmapped':
                            n_unmapped += 1
                            map(trashfile.write, reads)

                        elif pair_type == 'unpaired':
                            n_unpaired += 1
                            map(trashfile.write, reads)

                        elif pair_type == 'mutator':
                            n_mutator += 1
                            map(trashfile.write, reads)

                        elif pair_type == 'bad_cigar':
                            n_badcigar += 1
                            map(trashfile.write, reads)

                        elif pair_type == 'tiny':
                            n_tiny += 1
                            map(trashfile.write, reads)

                        else:
                            n_good += 1
                            map(outfile.write, reads)

                finally:
                    file_close(bamfile)

    if VERBOSE >= 1:
        print 'Read pairs: '
        print 'Good:', n_good
        print 'Unmapped:', n_unmapped
        print 'Unpaired:', n_unpaired
        print 'Many-mutations:', n_mutator
        print 'Bad CIGARs:', n_badcigar
        print 'Tiny:', n_tiny
        print

    if summary:
        sfn = get_filter_mapped_init_summary_filename(pname,
                                                      samplename_pat,
                                                      fragment,
                                                      PCR=PCR)
        with open(sfn, 'a') as f:
            f.write('Filter results: pname ' + pname + ', ' + samplename_pat +
                    ', ' + fragment + '\n')
            f.write('Total:\t\t\t' + str(irp + 1) + '\n')
            f.write('Good:\t\t\t' + str(n_good) + '\n')
            f.write('Unmapped:\t\t' + str(n_unmapped) + '\n')
            f.write('Unpaired:\t\t' + str(n_unpaired) + '\n')
            f.write('Many-mutations:\t\t' + str(n_mutator) + '\n')
            f.write('Bad CIGARs:\t\t' + str(n_badcigar) + '\n')
            f.write('Tiny:\t\t\t' + str(n_tiny) + '\n')
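The loop above depends on pair_generator, a hivwholeseq mapping utility that yields read pairs from a name-sorted BAM file. A minimal sketch of the assumed behavior (not the library's actual implementation):

def pair_generator_sketch(bamfile):
    '''Yield consecutive mates from a name-sorted BAM file, two at a time'''
    it = iter(bamfile)
    while True:
        try:
            read1 = it.next()
            read2 = it.next()
        except StopIteration:
            break
        yield (read1, read2)

This is why the mapping functions below sort the merged BAM by read name before filtering.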
Example #4
def map_stampy_multithread(sample, fragment, VERBOSE=0, threads=2, summary=True,
                           filtered=True):
    '''Map using stampy, multithread (via cluster requests, queueing race conditions possible)'''
    import hivwholeseq
    JOBDIR = hivwholeseq.__path__[0].rstrip('/')+'/'
    JOBLOGOUT = JOBDIR+'logout/'
    JOBLOGERR = JOBDIR+'logerr/'
    cluster_time = ['23:59:59', '0:59:59']
    vmem = '8G'

    pname = sample.patient
    samplename = sample.name
    seq_run = sample['run']
    data_folder = MiSeq_runs[seq_run]['folder']
    adaID = sample['adaID']

    if VERBOSE:
        print 'Map via stampy: '+pname+' '+samplename+' '+fragment

    if summary:
        summary_filename = get_map_initial_summary_filename(pname, samplename, fragment)

    # Specific fragment (e.g. F5 --> F5bi)
    frag_spec = filter(lambda x: fragment in x, sample['fragments'])
    if not len(frag_spec):
        raise ValueError(pname+', '+samplename+': fragment '+fragment+' not found.')
    frag_spec = frag_spec[0]

    input_filename = get_input_filename(data_folder, adaID, frag_spec, type='bam')

    # Submit map scripts in parallel to the cluster
    jobs_done = np.zeros(threads, bool)
    job_IDs = np.zeros(threads, 'S30')
    for j in xrange(threads):
    
        output_filename = get_mapped_to_initial_filename(pname, samplename,
                                                         fragment,
                                                         type='sam', part=(j+1))
        # Map
        call_list = ['qsub','-cwd',
                     '-b', 'y',
                     '-S', '/bin/bash',
                     '-o', JOBLOGOUT,
                     '-e', JOBLOGERR,
                     '-N', 'm '+samplename+fragment+' p'+str(j+1),
                     '-l', 'h_rt='+cluster_time[threads >= 10],
                     '-l', 'h_vmem='+vmem,
                     stampy_bin,
                     '--overwrite',
                     '-g', get_initial_index_filename(pname, fragment, ext=False),
                     '-h', get_initial_hash_filename(pname, fragment, ext=False),
                     '-o', output_filename,
                     '--processpart='+str(j+1)+'/'+str(threads),
                     '--substitutionrate='+subsrate,
                     '--gapopen', stampy_gapopen,
                     '--gapextend', stampy_gapextend]
        if stampy_sensitive:
            call_list.append('--sensitive')
        call_list = call_list + ['-M', input_filename]

        call_list = map(str, call_list)
        if VERBOSE >= 2:
            print ' '.join(call_list)
        job_ID = sp.check_output(call_list)
        job_ID = job_ID.split()[2]
        job_IDs[j] = job_ID

    # Monitor output
    output_file_parts = [get_mapped_to_initial_filename(pname, samplename,
                                                        fragment,
                                                        type='bam', part=(j+1))
                         for j in xrange(threads)]
    time_wait = 10 # secs
    while not jobs_done.all():

        # Sleep some time
        time.sleep(time_wait)

        # Get the output of qstat to check the status of jobs
        qstat_output = sp.check_output(['qstat'])
        qstat_output = qstat_output.split('\n')[:-1] # The last is an empty line
        if len(qstat_output) < 3:
            jobs_done[:] = True
            break
        else:
            qstat_output = [line.split()[0] for line in qstat_output[2:]]

        time_wait = 10 # secs
        for j in xrange(threads):
            if jobs_done[j]:
                continue

            if job_IDs[j] not in qstat_output:
                # Convert to BAM for merging
                if VERBOSE >= 1:
                    print 'Convert mapped reads to BAM for merging: sample '+\
                           samplename+', part '+str(j+1)+ ' of '+ \
                           str(threads)
                convert_sam_to_bam(output_file_parts[j])
                # We do not need to wait if we did the conversion (it takes
                # longer than some secs)
                time_wait = 0
                jobs_done[j] = True

    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Stampy mapped ('+str(threads)+' threads).\n')

    # Concatenate output files
    output_filename = get_mapped_to_initial_filename(pname, samplename,
                                                     fragment,
                                                     type='bam', unsorted=True)
    if VERBOSE >= 1:
        print 'Concatenate premapped reads: sample '+samplename
    pysam.cat('-o', output_filename, *output_file_parts)
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('BAM files concatenated (unsorted).\n')

    # Sort the file by read names (so that pair_generator sees mates together)
    output_filename_sorted = get_mapped_to_initial_filename(pname, samplename,
                                                            fragment,
                                                            type='bam')
    # NOTE: we exclude the extension and the option -f because of a bug in samtools
    if VERBOSE >= 1:
        print 'Sort mapped reads: sample '+samplename
    pysam.sort('-n', output_filename, output_filename_sorted[:-4])
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Joint BAM file sorted.\n')

    # Reheader the file without BAM -> SAM -> BAM
    if VERBOSE >= 1:
        print 'Reheader mapped reads: sample '+samplename
    header_filename = get_mapped_to_initial_filename(pname, samplename,
                                                     fragment,
                                                     type='sam', part=1)
    pysam.reheader(header_filename, output_filename_sorted)
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Joint BAM file reheaded.\n')

    if VERBOSE >= 1:
        print 'Remove temporary files: sample '+samplename
    remove_mapped_init_tempfiles(pname, samplename, fragment, VERBOSE=VERBOSE)
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Temp mapping files removed.\n')
            f.write('\n')
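The monitoring loop above is a generic Grid Engine pattern: parse the job ID out of the qsub confirmation line, then poll qstat until that ID disappears from the queue. A standalone sketch with a placeholder command:

import subprocess as sp
import time

# qsub prints e.g.: Your job 123456 ("sleep") has been submitted
job_ID = sp.check_output(['qsub', '-b', 'y', '/bin/sleep', '60']).split()[2]
while True:
    # Drop the two qstat header lines and the trailing empty line
    qstat_lines = sp.check_output(['qstat']).split('\n')[2:-1]
    if job_ID not in [line.split()[0] for line in qstat_lines]:
        break
    time.sleep(10)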
Example #5
def map_stampy_singlethread(sample, fragment, VERBOSE=0, n_pairs=-1,
                            summary=True, only_chunk=None, filtered=True):
    '''Map using stampy, single thread (no cluster queueing race conditions)'''
    pname = sample.patient
    samplename_pat = sample['patient sample']
    samplename = sample.name
    seq_run = sample['seq run']
    data_folder = sample.sequencing_run['folder']
    adaID = sample['adapter']
    PCR = int(sample.PCR)

    if VERBOSE:
        print 'Map via stampy (single thread): '+samplename+' '+fragment

    if summary:
        summary_filename = get_map_initial_summary_filename(pname, samplename_pat, 
                                                            samplename, fragment,
                                                            PCR=PCR)

    # Specific fragment (e.g. F5 --> F5bi)
    frag_spec = filter(lambda x: fragment in x, sample.regions_complete)
    if not len(frag_spec):
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Failed (specific fragment for '+fragment+' not found).\n')

        raise ValueError(samplename+': fragment '+fragment+' not found.')
    else:
        frag_spec = frag_spec[0]

    input_filename = get_input_filename(data_folder, adaID, frag_spec, type='bam',
                                        only_chunk=only_chunk, filtered=filtered)

    # NOTE: we introduced fragment nomenclature late, e.g. F3a. Check for that
    if not os.path.isfile(input_filename):
        if fragment == 'F3':
            input_filename = input_filename.replace('F3a', 'F3')

    # Check existence of the input file, because stampy creates output anyway
    if not os.path.isfile(input_filename):
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Failed (input file for mapping not found).\n')

        raise ValueError(samplename+', fragment '+fragment+': input file not found.')

    # Extract subsample of reads if requested
    if n_pairs > 0:
        from hivwholeseq.utils.mapping import extract_mapped_reads_subsample
        input_filename_sub = get_mapped_to_initial_filename(pname, samplename_pat,
                                                            samplename, fragment,
                                                            PCR=PCR,
                                                            type='bam')[:-4]+\
                '_unmapped.bam'
        n_written = extract_mapped_reads_subsample(input_filename,
                                                   input_filename_sub,
                                                   n_pairs, VERBOSE=VERBOSE)

    # Get output filename
    output_filename = get_mapped_to_initial_filename(pname, samplename_pat, 
                                                     samplename, fragment,
                                                     PCR=PCR,
                                                     type='sam', only_chunk=only_chunk)

    # Map
    call_list = [stampy_bin,
                 '-g', get_initial_index_filename(pname, fragment, ext=False),
                 '-h', get_initial_hash_filename(pname, fragment, ext=False),
                 '-o', output_filename,
                 '--overwrite',
                 '--substitutionrate='+subsrate,
                 '--gapopen', stampy_gapopen,
                 '--gapextend', stampy_gapextend]
    if stampy_sensitive:
        call_list.append('--sensitive')

    if n_pairs > 0:
        call_list = call_list + ['-M', input_filename_sub]
    else:
        call_list = call_list + ['-M', input_filename]
    call_list = map(str, call_list)
    if VERBOSE >= 2:
        print ' '.join(call_list)
    sp.call(call_list)

    output_filename_bam = get_mapped_to_initial_filename(pname, samplename_pat,
                                                         samplename, fragment,
                                                         type='bam',
                                                         PCR=PCR,
                                                         only_chunk=only_chunk)
    convert_sam_to_bam(output_filename_bam)

    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Stampy mapped (single thread).\n')

    if only_chunk is None:
        if VERBOSE >= 1:
            print 'Remove temporary files: sample '+samplename
        remove_mapped_init_tempfiles(pname, samplename_pat,
                                     samplename, fragment,
                                     PCR=PCR,
                                     VERBOSE=VERBOSE, only_chunk=only_chunk)

    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Temp mapping files removed.\n')
            f.write('\n')

    if n_pairs > 0:
        os.remove(input_filename_sub)
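A hypothetical invocation; the sample object is assumed to be a sequencing-sample record carrying the fields read above ('patient sample', 'seq run', 'adapter', PCR) plus a sequencing_run entry.

# Map 1000 read pairs of fragment F1 for one sequencing sample (assumed object)
map_stampy_singlethread(sample, 'F1', VERBOSE=1, n_pairs=1000, summary=True)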